contabilitat/commands/parse-r4-report

#!/usr/bin/env python3
import argparse
import asyncio
from claude_agent_sdk import query, ClaudeAgentOptions, ResultMessage
from docling.document_converter import DocumentConverter

GET_BEANCOUNT_STATEMENTS_PROMPT = """# Financial Data to Beancount Converter

You are an expert in converting markdown financial data tables to beancount price statements.

## Task
Parse markdown tables containing financial portfolio data and convert them to Beancount price statements.

## Input Format
You will receive markdown tables with the following sections:
- **ETFs**: Contains ETF holdings with descriptions, markets, currency, shares, and valuations
- **Fondos de Inversión** (Investment Funds): Contains fund holdings with descriptions, manager, currency, units, and valuations
- **Saldo en EUR**: Contains cash balances (should be ignored for price statements)

The date appears in the format `DD/MM/YYYY HH:MM:SS` near the top of each section.

## Commodity Mapping
Map the Spanish fund/ETF descriptions to the following commodity codes:

- "VANGUARD GLOBAL STOCK INDEX (EUR) ACC" → VANGL
- "VANGUARD GB SMALL-CAP IDX \"INV\" (EUR)" → VANSMCAP
- "ISHARES EDGE MSCI WORLD VALUE" → IWVL
- "X MSCI WORLD QUALITY ETF 1C" → XDEQ
- "RENTA 4 RENTA FIJA EURO, FI CLASE A" → R4RF
- "VANGUARD EM MK ST IX \"INV\" (EUR)" → VANEMMK

## Output Format
Generate Beancount price statements in this format:
YYYY-MM-DD price COMMODITY PRICE CURRENCY

## Rules
1. Extract the date from the table (format: DD/MM/YYYY) and convert to ISO format (YYYY-MM-DD)
2. For each ETF or Fund entry:
   - Match the "Descripción" field to the commodity mapping
   - Extract the price from "Precio actual" (ETFs) or "Valor liquidativo" (Funds)
   - Use the currency specified in the "Divisa" column
3. Ignore the "Saldo en EUR" (cash balance) section - do not create price statements for cash
4. Only output Beancount statements, no explanations or additional text
5. If a description doesn't match any commodity in the mapping, skip it or ask for clarification

## Example
Given this input data dated 04/10/2025 for "VANGUARD GLOBAL STOCK INDEX (EUR) ACC" that costs 54.12 EUR per participation:

Output:
2025-10-04 price VANGL 54.12 EUR

## Your Task
Parse the provided financial data tables and generate the corresponding Beancount price statements. Output only the Beancount code.
"""


def convert_file_to_markdown(path: str):
    converter = DocumentConverter()
    result = converter.convert(path)
    return result.document.export_to_markdown()


async def get_beancount_price_statements(r4_report: str) -> str:
    options = ClaudeAgentOptions(
        system_prompt=GET_BEANCOUNT_STATEMENTS_PROMPT,
        cwd="/home/roger/Projectes/contabilitat-2024/"
    )

    result = None
    async for message in query(
        prompt=f"Convert this financial statement to beancount price statements:\n{
            r4_report}",
        options=options
    ):
        if isinstance(message, ResultMessage) and message.subtype == "success":
            result = message.result
        else:
            print(message)

    if result is not None and isinstance(result, str):
        return result
    else:
        raise ValueError(
            "Unable to get Beancount price statements from the report!")


def parse_price_statements(beancount_statements: str):
    """
        The input beancount statements might be inside a markdown beancount code block
        or in plain text. After parsing them, we check that each line meets the
        format YYYY-MM-DD price COMMODITY PRICE CURRENCY.
        All lines that don't meet the criteria are dropped and are logged as errors.
    """
    import re

    # Extract content from markdown code block if present
    code_block_pattern = r'```(?:beancount)?\n(.*?)```'
    match = re.search(code_block_pattern, beancount_statements, re.DOTALL)
    if match:
        content = match.group(1)
    else:
        content = beancount_statements

    # Pattern for valid beancount price statement: YYYY-MM-DD price COMMODITY PRICE CURRENCY
    price_pattern = r'^\d{4}-\d{2}-\d{2}\s+price\s+[A-Z0-9]+\s+[\d.]+\s+[A-Z]+$'

    valid_statements = []
    for line in content.strip().split('\n'):
        line = line.strip()
        if not line:
            continue
        if re.match(price_pattern, line):
            valid_statements.append(line)
        else:
            print(f"Warning: Dropping invalid line: {line}")

    return '\n'.join(valid_statements)


def save_price_statements(beancount_statements: str):
    """
        The statements are saved in a beancount file in ledger/trading/YYYY/MM.beancount.
        The year and month are extracted from the first beancount statement in the input.
        The file is created if it doesn't exist or the statements are appended to the
        end of the file if it already exists.
    """
    import re
    from pathlib import Path

    if not beancount_statements.strip():
        print("Warning: No valid statements to save")
        return

    # Extract date from first statement (format: YYYY-MM-DD price ...)
    first_line = beancount_statements.strip().split('\n')[0]
    date_match = re.match(r'^(\d{4})-(\d{2})-\d{2}', first_line)

    if not date_match:
        print(f"Error: Could not extract date from first statement: {
              first_line}")
        return

    year = date_match.group(1)
    month = date_match.group(2)

    # Create directory structure if it doesn't exist
    output_dir = Path(f"ledger/trading/{year}")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Define output file path
    output_file = output_dir / f"{month}.beancount"

    # Append statements to file (create if doesn't exist)
    with open(output_file, 'a') as f:
        f.write(beancount_statements)
        f.write('\n')

    print(f"Saved price statements to {output_file}")


async def main():
    parser = argparse.ArgumentParser(
        description="Parse R4 report from XLSX format")
    parser.add_argument("source", help="Path to the input XLSX file")
    args = parser.parse_args()

    if not args.source.endswith(".xlsx"):
        parser.error("Input file must have .xlsx format")

    markdown_report = convert_file_to_markdown(args.source)
    beancount_statements = await get_beancount_price_statements(markdown_report)
    print(f"Final result: \n{beancount_statements}")

    clean_beancount_statements = parse_price_statements(beancount_statements)
    save_price_statements(clean_beancount_statements)


if __name__ == "__main__":
    asyncio.run(main())
parse-r4-report script to parse r4 investment prices 2025-10-04 17:51:17 +02:00			`#!/usr/bin/env python3`
			`import argparse`
			`import asyncio`
			`from claude_agent_sdk import query, ClaudeAgentOptions, ResultMessage`
			`from docling.document_converter import DocumentConverter`

			`GET_BEANCOUNT_STATEMENTS_PROMPT = """# Financial Data to Beancount Converter`

			`You are an expert in converting markdown financial data tables to beancount price statements.`

			`## Task`
			`Parse markdown tables containing financial portfolio data and convert them to Beancount price statements.`

			`## Input Format`
			`You will receive markdown tables with the following sections:`
			`- ETFs: Contains ETF holdings with descriptions, markets, currency, shares, and valuations`
			`- Fondos de Inversión (Investment Funds): Contains fund holdings with descriptions, manager, currency, units, and valuations`
			`- Saldo en EUR: Contains cash balances (should be ignored for price statements)`

			The date appears in the format `DD/MM/YYYY HH:MM:SS` near the top of each section.

			`## Commodity Mapping`
			`Map the Spanish fund/ETF descriptions to the following commodity codes:`

			`- "VANGUARD GLOBAL STOCK INDEX (EUR) ACC" → VANGL`
			`- "VANGUARD GB SMALL-CAP IDX \"INV\" (EUR)" → VANSMCAP`
			`- "ISHARES EDGE MSCI WORLD VALUE" → IWVL`
			`- "X MSCI WORLD QUALITY ETF 1C" → XDEQ`
			`- "RENTA 4 RENTA FIJA EURO, FI CLASE A" → R4RF`
			`- "VANGUARD EM MK ST IX \"INV\" (EUR)" → VANEMMK`

			`## Output Format`
			`Generate Beancount price statements in this format:`
			`YYYY-MM-DD price COMMODITY PRICE CURRENCY`

			`## Rules`
			`1. Extract the date from the table (format: DD/MM/YYYY) and convert to ISO format (YYYY-MM-DD)`
			`2. For each ETF or Fund entry:`
			`- Match the "Descripción" field to the commodity mapping`
			`- Extract the price from "Precio actual" (ETFs) or "Valor liquidativo" (Funds)`
			`- Use the currency specified in the "Divisa" column`
			`3. Ignore the "Saldo en EUR" (cash balance) section - do not create price statements for cash`
			`4. Only output Beancount statements, no explanations or additional text`
			`5. If a description doesn't match any commodity in the mapping, skip it or ask for clarification`

			`## Example`
			`Given this input data dated 04/10/2025 for "VANGUARD GLOBAL STOCK INDEX (EUR) ACC" that costs 54.12 EUR per participation:`

			`Output:`
			`2025-10-04 price VANGL 54.12 EUR`

			`## Your Task`
			`Parse the provided financial data tables and generate the corresponding Beancount price statements. Output only the Beancount code.`
			`"""`


			`def convert_file_to_markdown(path: str):`
			`converter = DocumentConverter()`
			`result = converter.convert(path)`
			`return result.document.export_to_markdown()`


			`async def get_beancount_price_statements(r4_report: str) -> str:`
			`options = ClaudeAgentOptions(`
			`system_prompt=GET_BEANCOUNT_STATEMENTS_PROMPT,`
			`cwd="/home/roger/Projectes/contabilitat-2024/"`
			`)`

			`result = None`
			`async for message in query(`
			`prompt=f"Convert this financial statement to beancount price statements:\n{`
			`r4_report}",`
			`options=options`
			`):`
			`if isinstance(message, ResultMessage) and message.subtype == "success":`
			`result = message.result`
			`else:`
			`print(message)`

			`if result is not None and isinstance(result, str):`
			`return result`
			`else:`
			`raise ValueError(`
			`"Unable to get Beancount price statements from the report!")`


			`def parse_price_statements(beancount_statements: str):`
			`"""`
			`The input beancount statements might be inside a markdown beancount code block`
			`or in plain text. After parsing them, we check that each line meets the`
			`format YYYY-MM-DD price COMMODITY PRICE CURRENCY.`
			`All lines that don't meet the criteria are dropped and are logged as errors.`
			`"""`
			`import re`

			`# Extract content from markdown code block if present`
			code_block_pattern = r'```(?:beancount)?\n(.*?)```'
			`match = re.search(code_block_pattern, beancount_statements, re.DOTALL)`
			`if match:`
			`content = match.group(1)`
			`else:`
			`content = beancount_statements`

			`# Pattern for valid beancount price statement: YYYY-MM-DD price COMMODITY PRICE CURRENCY`
			`price_pattern = r'^\d{4}-\d{2}-\d{2}\s+price\s+[A-Z0-9]+\s+[\d.]+\s+[A-Z]+$'`

			`valid_statements = []`
			`for line in content.strip().split('\n'):`
			`line = line.strip()`
			`if not line:`
			`continue`
			`if re.match(price_pattern, line):`
			`valid_statements.append(line)`
			`else:`
			`print(f"Warning: Dropping invalid line: {line}")`

			`return '\n'.join(valid_statements)`


			`def save_price_statements(beancount_statements: str):`
			`"""`
			`The statements are saved in a beancount file in ledger/trading/YYYY/MM.beancount.`
			`The year and month are extracted from the first beancount statement in the input.`
			`The file is created if it doesn't exist or the statements are appended to the`
			`end of the file if it already exists.`
			`"""`
			`import re`
			`from pathlib import Path`

			`if not beancount_statements.strip():`
			`print("Warning: No valid statements to save")`
			`return`

			`# Extract date from first statement (format: YYYY-MM-DD price ...)`
			`first_line = beancount_statements.strip().split('\n')[0]`
			`date_match = re.match(r'^(\d{4})-(\d{2})-\d{2}', first_line)`

			`if not date_match:`
			`print(f"Error: Could not extract date from first statement: {`
			`first_line}")`
			`return`

			`year = date_match.group(1)`
			`month = date_match.group(2)`

			`# Create directory structure if it doesn't exist`
			`output_dir = Path(f"ledger/trading/{year}")`
			`output_dir.mkdir(parents=True, exist_ok=True)`

			`# Define output file path`
			`output_file = output_dir / f"{month}.beancount"`

			`# Append statements to file (create if doesn't exist)`
			`with open(output_file, 'a') as f:`
			`f.write(beancount_statements)`
			`f.write('\n')`

			`print(f"Saved price statements to {output_file}")`


			`async def main():`
			`parser = argparse.ArgumentParser(`
			`description="Parse R4 report from XLSX format")`
			`parser.add_argument("source", help="Path to the input XLSX file")`
			`args = parser.parse_args()`

			`if not args.source.endswith(".xlsx"):`
			`parser.error("Input file must have .xlsx format")`

			`markdown_report = convert_file_to_markdown(args.source)`
			`beancount_statements = await get_beancount_price_statements(markdown_report)`
			`print(f"Final result: \n{beancount_statements}")`

			`clean_beancount_statements = parse_price_statements(beancount_statements)`
			`save_price_statements(clean_beancount_statements)`


			`if __name__ == "__main__":`
			`asyncio.run(main())`