parse-r4-report script to parse r4 investment prices

2025-10-04 17:51:17 +02:00
parent f24d549415
commit 9b9027b670
5 changed files with 1905 additions and 17 deletions
--- a/commands/parse-r4-report
+++ b/commands/parse-r4-report
@@ -0,0 +1,179 @@
 #!/usr/bin/env python3
 import argparse
 import asyncio
 from claude_agent_sdk import query, ClaudeAgentOptions, ResultMessage
 from docling.document_converter import DocumentConverter
 GET_BEANCOUNT_STATEMENTS_PROMPT = """# Financial Data to Beancount Converter
 You are an expert in converting markdown financial data tables to beancount price statements.
 ## Task
 Parse markdown tables containing financial portfolio data and convert them to Beancount price statements.
 ## Input Format
 You will receive markdown tables with the following sections:
 - **ETFs**: Contains ETF holdings with descriptions, markets, currency, shares, and valuations
 - **Fondos de Inversión** (Investment Funds): Contains fund holdings with descriptions, manager, currency, units, and valuations
 - **Saldo en EUR**: Contains cash balances (should be ignored for price statements)
 The date appears in the format `DD/MM/YYYY HH:MM:SS` near the top of each section.
 ## Commodity Mapping
 Map the Spanish fund/ETF descriptions to the following commodity codes:
 - "VANGUARD GLOBAL STOCK INDEX (EUR) ACC" → VANGL
 - "VANGUARD GB SMALL-CAP IDX \"INV\" (EUR)" → VANSMCAP
 - "ISHARES EDGE MSCI WORLD VALUE" → IWVL
 - "X MSCI WORLD QUALITY ETF 1C" → XDEQ
 - "RENTA 4 RENTA FIJA EURO, FI CLASE A" → R4RF
 - "VANGUARD EM MK ST IX \"INV\" (EUR)" → VANEMMK
 ## Output Format
 Generate Beancount price statements in this format:
 YYYY-MM-DD price COMMODITY PRICE CURRENCY
 ## Rules
 1. Extract the date from the table (format: DD/MM/YYYY) and convert to ISO format (YYYY-MM-DD)
 2. For each ETF or Fund entry:
   - Match the "Descripción" field to the commodity mapping
   - Extract the price from "Precio actual" (ETFs) or "Valor liquidativo" (Funds)
   - Use the currency specified in the "Divisa" column
 3. Ignore the "Saldo en EUR" (cash balance) section - do not create price statements for cash
 4. Only output Beancount statements, no explanations or additional text
 5. If a description doesn't match any commodity in the mapping, skip it or ask for clarification
 ## Example
 Given this input data dated 04/10/2025 for "VANGUARD GLOBAL STOCK INDEX (EUR) ACC" that costs 54.12 EUR per participation:
 Output:
 2025-10-04 price VANGL 54.12 EUR
 ## Your Task
 Parse the provided financial data tables and generate the corresponding Beancount price statements. Output only the Beancount code.
 """
 def convert_file_to_markdown(path: str):
    converter = DocumentConverter()
    result = converter.convert(path)
    return result.document.export_to_markdown()
 async def get_beancount_price_statements(r4_report: str) -> str:
    options = ClaudeAgentOptions(
        system_prompt=GET_BEANCOUNT_STATEMENTS_PROMPT,
        cwd="/home/roger/Projectes/contabilitat-2024/"
    )
    result = None
    async for message in query(
        prompt=f"Convert this financial statement to beancount price statements:\n{
            r4_report}",
        options=options
    ):
        if isinstance(message, ResultMessage) and message.subtype == "success":
            result = message.result
        else:
            print(message)
    if result is not None and isinstance(result, str):
        return result
    else:
        raise ValueError(
            "Unable to get Beancount price statements from the report!")
 def parse_price_statements(beancount_statements: str):
    """
        The input beancount statements might be inside a markdown beancount code block
        or in plain text. After parsing them, we check that each line meets the
        format YYYY-MM-DD price COMMODITY PRICE CURRENCY.
        All lines that don't meet the criteria are dropped and are logged as errors.
    """
    import re
    # Extract content from markdown code block if present
    code_block_pattern = r'```(?:beancount)?\n(.*?)```'
    match = re.search(code_block_pattern, beancount_statements, re.DOTALL)
    if match:
        content = match.group(1)
    else:
        content = beancount_statements
    # Pattern for valid beancount price statement: YYYY-MM-DD price COMMODITY PRICE CURRENCY
    price_pattern = r'^\d{4}-\d{2}-\d{2}\s+price\s+[A-Z0-9]+\s+[\d.]+\s+[A-Z]+$'
    valid_statements = []
    for line in content.strip().split('\n'):
        line = line.strip()
        if not line:
            continue
        if re.match(price_pattern, line):
            valid_statements.append(line)
        else:
            print(f"Warning: Dropping invalid line: {line}")
    return '\n'.join(valid_statements)
 def save_price_statements(beancount_statements: str):
    """
        The statements are saved in a beancount file in ledger/trading/YYYY/MM.beancount.
        The year and month are extracted from the first beancount statement in the input.
        The file is created if it doesn't exist or the statements are appended to the
        end of the file if it already exists.
    """
    import re
    from pathlib import Path
    if not beancount_statements.strip():
        print("Warning: No valid statements to save")
        return
    # Extract date from first statement (format: YYYY-MM-DD price ...)
    first_line = beancount_statements.strip().split('\n')[0]
    date_match = re.match(r'^(\d{4})-(\d{2})-\d{2}', first_line)
    if not date_match:
        print(f"Error: Could not extract date from first statement: {
              first_line}")
        return
    year = date_match.group(1)
    month = date_match.group(2)
    # Create directory structure if it doesn't exist
    output_dir = Path(f"ledger/trading/{year}")
    output_dir.mkdir(parents=True, exist_ok=True)
    # Define output file path
    output_file = output_dir / f"{month}.beancount"
    # Append statements to file (create if doesn't exist)
    with open(output_file, 'a') as f:
        f.write(beancount_statements)
        f.write('\n')
    print(f"Saved price statements to {output_file}")
 async def main():
    parser = argparse.ArgumentParser(
        description="Parse R4 report from XLSX format")
    parser.add_argument("source", help="Path to the input XLSX file")
    args = parser.parse_args()
    if not args.source.endswith(".xlsx"):
        parser.error("Input file must have .xlsx format")
    markdown_report = convert_file_to_markdown(args.source)
    beancount_statements = await get_beancount_price_statements(markdown_report)
    print(f"Final result: \n{beancount_statements}")
    clean_beancount_statements = parse_price_statements(beancount_statements)
    save_price_statements(clean_beancount_statements)
 if __name__ == "__main__":
    asyncio.run(main())
--- a/ledger/trading/2025/10.beancount
+++ b/ledger/trading/2025/10.beancount
@@ -0,0 +1,6 @@
 2025-10-04 price IWVL 47.49 EUR
 2025-10-04 price XDEQ 67.88 EUR
 2025-10-04 price VANGL 54.12 EUR
 2025-10-04 price R4RF 15.58 EUR
 2025-10-04 price VANSMCAP 371.37 EUR
 2025-10-04 price VANEMMK 240.14 EUR
--- a/ledger/trading/index.beancount
+++ b/ledger/trading/index.beancount
@@ -1,2 +1,2 @@
-include "2024/*"
+include "*/*"
-include "prices.beancount"
+include "prices.beancount"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,16 +7,18 @@ requires-python = ">=3.13"
 dependencies = [
    "babel==2.13.1",
    "beancount==2.3.6",
-    "beautifulsoup4==4.12.2",
+    "beautifulsoup4==4.12.3",
    "blinker==1.7.0",
    "bottle==0.12.25",
    "budget-report==0.4",
    "cachetools==5.3.2",
-    "certifi==2023.11.17",
+    "certifi==2024.7.4",
    "chardet==5.2.0",
    "charset-normalizer==3.3.2",
    "cheroot==10.0.0",
    "claude-agent-sdk>=0.1.0",
    "click==8.1.7",
    "docling==2.55.1",
    "fava==1.26.2",
    "fava-plugins==1.0",
    "flask==3.0.0",
@@ -48,7 +50,7 @@ dependencies = [
    "python-dateutil==2.8.2",
    "python-magic==0.4.27",
    "pytz==2023.3.post1",
-    "requests==2.31.0",
+    "requests==2.32.2",
    "rsa==4.9",
    "simplejson==3.19.2",
    "six==1.16.0",
--- a/uv.lock
+++ b/uv.lock
`@@ -1,2 +1,2 @@`
	`include "2024/*"`	`include "/"`
	`include "prices.beancount"`	`include "prices.beancount"`