contabilitat/commands/parse-caixabank-account-movements

#!/usr/bin/env python3

import os
import argparse
import asyncio
from claude_agent_sdk import query, ClaudeAgentOptions, ResultMessage
from docling.document_converter import DocumentConverter

GET_BEANCOUNT_STATEMENTS_PROMPT = """# System Prompt: Personal Finances to Beancount Parser

You are a specialized financial transaction parser that converts bank account movements into Beancount format.

## Input Format

You will receive a table with the following columns:
- **Fecha**: Transaction date
- **Fecha Valor**: Value date
- **Movimiento**: Transaction description
- **Más datos**: Additional details (may be empty)
- **Importe**: Amount (negative for expenses, positive for income)
- **Saldo**: Account balance after transaction

Example input:
```
| Fecha | Fecha Valor | Movimiento | Más datos | Importe | Saldo |
2025-10-09 00:00:00 | 2025-10-09 00:00:00 | Nintendo CD148015 | | -69.99 | 10000.00
```

## Output Format

Convert each transaction into a Beancount entry with this structure:

```
YYYY-MM-DD * "Payee" "Description"
  ExpenseAccount  AMOUNT EUR
  Assets:Liquid:Caixabank:Corrent
```

### Rules for Conversion

1. **Date**: Use the "Fecha" field in YYYY-MM-DD format
2. **Flag**: Always use `*` (cleared transaction)
3. **Payee**: Extract the main payee name from the "Movimiento" field (first recognizable entity/merchant name or infer it from it)
4. **Description**: Use the full "Movimiento" text as the description
5. **Amount**: Use the absolute value of "Importe" (remove the negative sign for expenses)
6. **Currency**: Always use EUR
7. **Source Account**: Always use `Assets:Liquid:Caixabank:Corrent` as the second posting (the account is automatically debited)

### Expense Account Classification

You will be provided with a list of available expense accounts. Analyze each transaction and classify it into the most appropriate account based on:
- The payee/merchant name
- The transaction description
- Common spending patterns

**Available Income Accounts:**
Income:Work:Zurich:Salari
Income:Work:Zurich:TicketsRestaurant
Income:Work:Zurich:TargetaTransport
Income:Work:Zurich:SeguroMedic
Income:Work:Zurich:Gimnas
Income:Work:Zurich:DZP
Income:Other:Caixabank:Transferencia
Income:Other:Caixabank:Bizum
Income:Savings:Caixabank:RentabilitatEstalvis
Income:Savings:TradeRepublic:RentabilitatEstalvis
Income:Invest:R4:Dividends
Income:Invest:R4:CapitalGains
Income:Invest:R4:CapitalGains:Untaxable
Income:Invest:DZP:CapitalGains
Income:Other:Devolucions

**Available Expense Accounts:**
Expenses:R4:Comissions
Expenses:R4:Interessos
Expenses:Caixabank:Comissions
Expenses:Taxes:IRPF
Expenses:Taxes:BeneficisDividends
Expenses:Taxes:BeneficisDividendsOrigen
Expenses:Taxes:ImpostCirculacio
Expenses:Insurance:Cotxe
Expenses:Lloguer
Expenses:FacturesUtilitats
Expenses:Internet
Expenses:Gasolina
Expenses:MantenimentCotxe
Expenses:Roba
Expenses:Educació
Expenses:Medic
Expenses:Vacances
Expenses:Perruqueria
Expenses:AmazonPrime
Expenses:CarnetJove
Expenses:Supermercat
Expenses:Gimnàs
Expenses:Parking
Expenses:Mobilitat
Expenses:MarcaPersonal
Expenses:MenjarFora
Expenses:Entreteniment
Expenses:Llar
Expenses:Higiene
Expenses:Donatiu
Expenses:Altres

### Transaction Type Detection

- **Expenses** (negative Importe): Post to an Expenses:* account
- **Income** (positive Importe): Post to an Income:* account

### Special Cases

- If a transaction is ambiguous, choose the most likely expense category
- For unknown merchants, use a generic account like `Expenses:Altres`
- Preserve reference numbers and transaction IDs in the description
- If "Más datos" contains relevant information, consider including it in the description

## Example

**Input:**
```
2025-10-09 00:00:00 | 2025-10-09 00:00:00 | Nintendo CD148015 | | -69.99 | 10000.00
```

**Output:**
```
2025-10-09 * "Nintendo" "Nintendo CD148015"
  Expenses:Entreteniment  69.99 EUR
  Assets:Liquid:Caixabank:Corrent
```

## Output Requirements

- Process all transactions in the input table
- Maintain chronological order
- Ensure proper indentation (2 spaces for posting lines)
- Do not include the balance information in the Beancount output
- Be consistent with account naming conventions
- Only output Beancount code, explanations are not needed.

## Your Task
Parse the provided account movements data tables and generate the corresponding Beancount price statements. Output only the Beancount code.
"""


async def get_beancount_price_statements(r4_report: str) -> str:
    options = ClaudeAgentOptions(
        system_prompt=GET_BEANCOUNT_STATEMENTS_PROMPT,
        cwd=os.getcwd()
    )

    result = None
    async for message in query(
        prompt="Convert this financial account movements table to "
        f"beancount price statements:\n{r4_report}",
        options=options
    ):
        if isinstance(message, ResultMessage) and message.subtype == "success":
            result = message.result
        else:
            print(message)

    if result is not None and isinstance(result, str):
        return result
    else:
        raise ValueError(
            "Unable to get Beancount price statements from the report!")


def parse_response(beancount_statements: str):
    """
        The input beancount statements might be inside a markdown beancount
        code block or in plain text.
    """
    import re

    # Extract content from markdown code block if present
    code_block_pattern = r'```(?:beancount)?\n(.*?)```'
    match = re.search(code_block_pattern, beancount_statements, re.DOTALL)
    if match:
        content = match.group(1)
    else:
        content = beancount_statements
    return content


def extract_last_balances_by_month(markdown_report: str) -> dict:
    """
        Extract the last transaction date and balance for each month from the markdown table.
        Returns a dict with keys as (year, month) tuples and values as (date, balance) tuples.
    """
    import re
    from collections import defaultdict

    lines = markdown_report.split('\n')

    transactions_by_month = {}

    for line in lines:
        if '|' not in line:
            continue

        parts = line.split('|')
        if len(parts) < 7:
            continue

        fecha_col = parts[1].strip()
        saldo_col = parts[6].strip()

        date_match = re.search(r'(\d{4})-(\d{2})-(\d{2})', fecha_col)
        if date_match and saldo_col:
            try:
                balance = saldo_col.replace(',', '').replace(' ', '')
                float(balance)

                year = date_match.group(1)
                month = date_match.group(2)
                date = f"{year}-{month}-{date_match.group(3)}"
                month_key = (year, month)

                if month_key not in transactions_by_month or date > transactions_by_month[month_key][0]:
                    transactions_by_month[month_key] = (date, balance)
            except ValueError:
                continue

    return transactions_by_month


def save_statements(
        beancount_statements: str,
        last_balances_by_month: dict
):
    """
        The statements are saved in beancount files in
        ledger/transactions/YYYY/MM.beancount.
        Statements are sorted chronologically and split by month if they
        span multiple months.
        A balance assertion is added at the end of each month's file.
    """
    import re
    from pathlib import Path
    from collections import defaultdict

    if not beancount_statements.strip():
        print("Warning: No valid statements to save")
        return

    lines = beancount_statements.strip().split('\n')

    transactions = []
    current_transaction = []

    for line in lines:
        if re.match(r'^\d{4}-\d{2}-\d{2}', line):
            if current_transaction:
                transactions.append('\n'.join(current_transaction))
            current_transaction = [line]
        elif current_transaction:
            current_transaction.append(line)

    if current_transaction:
        transactions.append('\n'.join(current_transaction))

    transactions.sort(key=lambda t: re.match(
        r'^(\d{4}-\d{2}-\d{2})', t).group(1))

    transactions_by_month = defaultdict(list)
    for transaction in transactions:
        date_match = re.match(r'^(\d{4})-(\d{2})-\d{2}', transaction)
        if date_match:
            year = date_match.group(1)
            month = date_match.group(2)
            key = (year, month)
            transactions_by_month[key].append(transaction)

    for (year, month), month_transactions in sorted(transactions_by_month.items()):
        output_dir = Path(f"ledger/transactions/{year}")
        output_dir.mkdir(parents=True, exist_ok=True)

        output_file = output_dir / f"{month}.beancount"

        existing_content = ""
        if output_file.exists():
            with open(output_file, 'r') as f:
                existing_content = f.read()

        with open(output_file, 'w') as f:
            if existing_content:
                f.write(existing_content)
                if not existing_content.endswith('\n'):
                    f.write('\n')
            f.write('\n'.join(month_transactions))
            f.write('\n')

            month_key = (year, month)
            if month_key in last_balances_by_month:
                last_date, last_balance = last_balances_by_month[month_key]
                if last_date and last_balance:
                    f.write(f'\n{last_date} balance Assets:Liquid:Caixabank:Corrent  {
                            last_balance} EUR\n')

        print(f"Saved statements to {output_file}")


def filter_markdown_by_date(markdown_report: str, from_date: str) -> str:
    """
        Filter markdown table to only include rows with dates >= from_date.
    """
    import re
    from datetime import datetime

    if not from_date:
        return markdown_report

    try:
        filter_date = datetime.strptime(from_date, "%Y-%m-%d")
    except ValueError:
        print(f"Warning: Invalid date format '{from_date}'. Expected YYYY-MM-DD. Ignoring filter.")
        return markdown_report

    lines = markdown_report.split('\n')
    filtered_lines = []

    for line in lines:
        if '|' not in line:
            filtered_lines.append(line)
            continue

        parts = line.split('|')
        if len(parts) < 7:
            filtered_lines.append(line)
            continue

        fecha_col = parts[1].strip()
        date_match = re.search(r'(\d{4})-(\d{2})-(\d{2})', fecha_col)

        if date_match:
            line_date = datetime.strptime(f"{date_match.group(1)}-{date_match.group(2)}-{date_match.group(3)}", "%Y-%m-%d")
            if line_date >= filter_date:
                filtered_lines.append(line)
        else:
            filtered_lines.append(line)

    return '\n'.join(filtered_lines)


def convert_file_to_markdown(path: str):
    converter = DocumentConverter()
    result = converter.convert(path)
    return result.document.export_to_markdown()


async def main():
    parser = argparse.ArgumentParser(
        description="Parse R4 report from XLSX format")
    parser.add_argument("source", help="Path to the input XLSX file")
    parser.add_argument("--from", dest="from_date", help="Filter transactions from this date (YYYY-MM-DD)")
    args = parser.parse_args()

    if not args.source.endswith(".xlsx"):
        parser.error("Input file must have .xlsx format")

    markdown_report = convert_file_to_markdown(args.source)

    if args.from_date:
        markdown_report = filter_markdown_by_date(markdown_report, args.from_date)

    beancount_statements = await get_beancount_price_statements(
        markdown_report
    )
    print(f"Final result: \n{beancount_statements}")

    clean_beancount_statements = parse_response(beancount_statements)
    last_balances_by_month = extract_last_balances_by_month(markdown_report)
    save_statements(clean_beancount_statements, last_balances_by_month)


if __name__ == "__main__":
    asyncio.run(main())