contabilitat/commands/parse-caixabank-account-movements

#!/usr/bin/env python3

import os
import argparse
import asyncio
from claude_agent_sdk import query, ClaudeAgentOptions, ResultMessage
from docling.document_converter import DocumentConverter

# System prompt sent to the agent. It describes the layout of the account
# movements table, the Beancount transaction layout expected back, and the
# income/expense accounts available for classification.
GET_BEANCOUNT_STATEMENTS_PROMPT = """# System Prompt: Personal Finances to Beancount Parser

You are a specialized financial transaction parser that converts bank account movements into Beancount format.

## Input Format

You will receive a table with the following columns:
- **Fecha**: Transaction date
- **Fecha Valor**: Value date
- **Movimiento**: Transaction description
- **Más datos**: Additional details (may be empty)
- **Importe**: Amount (negative for expenses, positive for income)
- **Saldo**: Account balance after transaction

Example input:
```
| Fecha | Fecha Valor | Movimiento | Más datos | Importe | Saldo |
2025-10-09 00:00:00 | 2025-10-09 00:00:00 | Nintendo CD148015 | | -69.99 | 10000.00
```

## Output Format

Convert each transaction into a Beancount entry with this structure:

```
YYYY-MM-DD * "Payee" "Description"
  ExpenseAccount  AMOUNT EUR
  Assets:Liquid:Caixabank:Corrent
```

### Rules for Conversion

1. **Date**: Use the "Fecha" field in YYYY-MM-DD format
2. **Flag**: Always use `*` (cleared transaction)
3. **Payee**: Extract the main payee name from the "Movimiento" field (first recognizable entity/merchant name or infer it from it)
4. **Description**: Use the full "Movimiento" text as the description
5. **Amount**: For expenses (negative "Importe"), post the absolute value of "Importe" to the Expenses account. For income (positive "Importe"), post the amount with a negative sign to the Income account, so that the automatically balanced `Assets:Liquid:Caixabank:Corrent` posting increases
6. **Currency**: Always use EUR
7. **Source Account**: Always use `Assets:Liquid:Caixabank:Corrent` as the second posting, without an amount (Beancount balances it automatically)

### Expense Account Classification

You are provided with lists of the available income and expense accounts. Analyze each transaction and classify it into the most appropriate account based on:
- The payee/merchant name
- The transaction description
- Common spending patterns

**Available Income Accounts:**
Income:Work:Zurich:Salari
Income:Work:Zurich:TicketsRestaurant
Income:Work:Zurich:TargetaTransport
Income:Work:Zurich:SeguroMedic
Income:Work:Zurich:Gimnas
Income:Work:Zurich:DZP
Income:Other:Caixabank:Transferencia
Income:Other:Caixabank:Bizum
Income:Savings:Caixabank:RentabilitatEstalvis
Income:Savings:TradeRepublic:RentabilitatEstalvis
Income:Invest:R4:Dividends
Income:Invest:R4:CapitalGains
Income:Invest:R4:CapitalGains:Untaxable
Income:Invest:DZP:CapitalGains
Income:Other:Devolucions

**Available Expense Accounts:**
Expenses:R4:Comissions
Expenses:R4:Interessos
Expenses:Caixabank:Comissions
Expenses:Taxes:IRPF
Expenses:Taxes:BeneficisDividends
Expenses:Taxes:BeneficisDividendsOrigen
Expenses:Taxes:ImpostCirculacio
Expenses:Insurance:Cotxe
Expenses:Lloguer
Expenses:FacturesUtilitats
Expenses:Internet
Expenses:Gasolina
Expenses:MantenimentCotxe
Expenses:Roba
Expenses:Educació
Expenses:Medic
Expenses:Vacances
Expenses:Perruqueria
Expenses:AmazonPrime
Expenses:CarnetJove
Expenses:Supermercat
Expenses:Gimnàs
Expenses:Parking
Expenses:Mobilitat
Expenses:MarcaPersonal
Expenses:MenjarFora
Expenses:Entreteniment
Expenses:Llar
Expenses:Higiene
Expenses:Donatiu
Expenses:Altres

### Transaction Type Detection

- **Expenses** (negative Importe): Post to an Expenses:* account with a positive amount
- **Income** (positive Importe): Post to an Income:* account with a negative amount

### Special Cases

- If a transaction is ambiguous, choose the most likely expense category
- For unknown merchants, use a generic account like `Expenses:Altres`
- Preserve reference numbers and transaction IDs in the description
- If "Más datos" contains relevant information, consider including it in the description

## Example

**Input:**
```
2025-10-09 00:00:00 | 2025-10-09 00:00:00 | Nintendo CD148015 | | -69.99 | 10000.00
```

**Output:**
```
2025-10-09 * "Nintendo" "Nintendo CD148015"
  Expenses:Entreteniment  69.99 EUR
  Assets:Liquid:Caixabank:Corrent
```

## Output Requirements

- Process all transactions in the input table
- Maintain chronological order
- Ensure proper indentation (2 spaces for posting lines)
- Do not include the balance information in the Beancount output
- Be consistent with account naming conventions
- Only output Beancount code, explanations are not needed.

## Your Task
Parse the provided account movements data tables and generate the corresponding Beancount transactions. Output only the Beancount code.
"""


async def get_beancount_price_statements(r4_report: str) -> str:
    """Ask the agent to convert an account movements table into Beancount.

    NOTE(review): the function and parameter names mention "price statements"
    and an "R4 report"; they look inherited from a sibling script and are kept
    unchanged so existing callers keep working.

    Args:
        r4_report: Text of the account movements table; it is appended
            verbatim to the user prompt.

    Returns:
        The ``result`` of the ``ResultMessage`` whose subtype is "success".

    Raises:
        ValueError: If no successful result holding a string was received.
    """
    options = ClaudeAgentOptions(
        system_prompt=GET_BEANCOUNT_STATEMENTS_PROMPT,
        cwd=os.getcwd(),
    )

    # The replacement field is kept on a single line: a line break inside the
    # braces of a single-quoted f-string is a SyntaxError before Python 3.12.
    prompt = (
        "Convert this financial account movements table to "
        f"beancount transactions:\n{r4_report}"
    )

    result = None
    async for message in query(prompt=prompt, options=options):
        if isinstance(message, ResultMessage) and message.subtype == "success":
            result = message.result
        else:
            # Every other message is echoed so the run can be followed live.
            print(message)

    # isinstance() is already False for None, so one check covers both cases.
    if isinstance(result, str):
        return result
    raise ValueError(
        "Unable to get Beancount transactions from the report!")


def parse_response(beancount_statements: str) -> str:
    """Return the Beancount code contained in the model's answer.

    The answer may be plain text or wrapped in one or more Markdown fenced
    code blocks. Fences with any language tag (or none) are recognised, and
    when several blocks are present their contents are joined with a blank
    line so no statement is dropped.

    Args:
        beancount_statements: Raw text returned by the model.

    Returns:
        The content of the fenced block(s), or the input unchanged when it
        contains no fenced block.
    """
    import re

    # Opening fence, optional language tag up to the end of that line, then
    # lazily everything up to the next closing fence.
    fenced_blocks = re.findall(
        r'```[^\n`]*\n(.*?)```', beancount_statements, re.DOTALL)
    if not fenced_blocks:
        return beancount_statements
    return "\n".join(fenced_blocks)


def save_statements(beancount_statements: str) -> None:
    """Append the statements to ledger/transactions/YYYY/MM.beancount.

    The year and month come from the first line of the input that starts with
    a YYYY-MM-DD date, so a leading comment line does not prevent saving. All
    statements go to that single file, even if later ones carry another
    month. The directory and file are created when missing; otherwise the
    text is appended, followed by a newline.

    Nothing is written (a message is printed instead) when the input is blank
    or contains no dated line.

    Args:
        beancount_statements: Beancount text to store.
    """
    import re
    from pathlib import Path

    stripped = beancount_statements.strip()
    if not stripped:
        print("Warning: No valid statements to save")
        return

    # MULTILINE lets ^ match at every line start, so the first dated line is
    # found even when it is not the very first line.
    date_match = re.search(r'^(\d{4})-(\d{2})-\d{2}', stripped, re.MULTILINE)
    if not date_match:
        first_line = stripped.split('\n')[0]
        print(
            f"Error: Could not extract date from first statement: {first_line}")
        return

    year, month = date_match.groups()

    # Create directory structure if it doesn't exist
    output_dir = Path("ledger/transactions") / year
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{month}.beancount"

    # Explicit UTF-8: account names contain non-ASCII characters and the
    # platform default encoding is not UTF-8 everywhere.
    with open(output_file, 'a', encoding='utf-8') as f:
        f.write(beancount_statements)
        f.write('\n')

    print(f"Saved statements to {output_file}")


def convert_file_to_markdown(path: str):
    """Convert the file at ``path`` and return its document as Markdown.

    A fresh ``DocumentConverter`` is created on every call; the converted
    document is exported with ``export_to_markdown()``.
    """
    conversion = DocumentConverter().convert(path)
    document = conversion.document
    return document.export_to_markdown()


async def main():
    """Command-line entry point.

    Reads the path of an XLSX file from the command line, converts it to
    Markdown, asks the agent for the Beancount statements, prints the raw
    answer, strips any Markdown fences and saves the result to the ledger.
    Exits through ``parser.error`` when the path does not end in ".xlsx".
    """
    parser = argparse.ArgumentParser(
        description="Parse Caixabank account movements from XLSX format")
    parser.add_argument("source", help="Path to the input XLSX file")
    args = parser.parse_args()

    # Compare case-insensitively so files named e.g. "MOVEMENTS.XLSX" are
    # accepted as well.
    if not args.source.lower().endswith(".xlsx"):
        parser.error("Input file must have .xlsx format")

    markdown_report = convert_file_to_markdown(args.source)
    beancount_statements = await get_beancount_price_statements(
        markdown_report
    )
    print(f"Final result: \n{beancount_statements}")

    clean_beancount_statements = parse_response(beancount_statements)
    save_statements(clean_beancount_statements)


# Run the async entry point only when the file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())