#!/usr/bin/env python3 import argparse import asyncio from claude_agent_sdk import query, ClaudeAgentOptions, ResultMessage from docling.document_converter import DocumentConverter GET_BEANCOUNT_STATEMENTS_PROMPT = """# Financial Data to Beancount Converter You are an expert in converting markdown financial data tables to beancount price statements. ## Task Parse markdown tables containing financial portfolio data and convert them to Beancount price statements. ## Input Format You will receive markdown tables with the following sections: - **ETFs**: Contains ETF holdings with descriptions, markets, currency, shares, and valuations - **Fondos de Inversión** (Investment Funds): Contains fund holdings with descriptions, manager, currency, units, and valuations - **Saldo en EUR**: Contains cash balances (should be ignored for price statements) The date appears in the format `DD/MM/YYYY HH:MM:SS` near the top of each section. ## Commodity Mapping Map the Spanish fund/ETF descriptions to the following commodity codes: - "VANGUARD GLOBAL STOCK INDEX (EUR) ACC" → VANGL - "VANGUARD GB SMALL-CAP IDX \"INV\" (EUR)" → VANSMCAP - "ISHARES EDGE MSCI WORLD VALUE" → IWVL - "X MSCI WORLD QUALITY ETF 1C" → XDEQ - "RENTA 4 RENTA FIJA EURO, FI CLASE A" → R4RF - "VANGUARD EM MK ST IX \"INV\" (EUR)" → VANEMMK ## Output Format Generate Beancount price statements in this format: YYYY-MM-DD price COMMODITY PRICE CURRENCY ## Rules 1. Extract the date from the table (format: DD/MM/YYYY) and convert to ISO format (YYYY-MM-DD) 2. For each ETF or Fund entry: - Match the "Descripción" field to the commodity mapping - Extract the price from "Precio actual" (ETFs) or "Valor liquidativo" (Funds) - Use the currency specified in the "Divisa" column 3. Ignore the "Saldo en EUR" (cash balance) section - do not create price statements for cash 4. Only output Beancount statements, no explanations or additional text 5. If a description doesn't match any commodity in the mapping, skip it or ask for clarification ## Example Given this input data dated 04/10/2025 for "VANGUARD GLOBAL STOCK INDEX (EUR) ACC" that costs 54.12 EUR per participation: Output: 2025-10-04 price VANGL 54.12 EUR ## Your Task Parse the provided financial data tables and generate the corresponding Beancount price statements. Output only the Beancount code. """ def convert_file_to_markdown(path: str): converter = DocumentConverter() result = converter.convert(path) return result.document.export_to_markdown() async def get_beancount_price_statements(r4_report: str) -> str: options = ClaudeAgentOptions( system_prompt=GET_BEANCOUNT_STATEMENTS_PROMPT, cwd="/home/roger/Projectes/contabilitat-2024/" ) result = None async for message in query( prompt=f"Convert this financial statement to beancount price statements:\n{ r4_report}", options=options ): if isinstance(message, ResultMessage) and message.subtype == "success": result = message.result else: print(message) if result is not None and isinstance(result, str): return result else: raise ValueError( "Unable to get Beancount price statements from the report!") def parse_price_statements(beancount_statements: str): """ The input beancount statements might be inside a markdown beancount code block or in plain text. After parsing them, we check that each line meets the format YYYY-MM-DD price COMMODITY PRICE CURRENCY. All lines that don't meet the criteria are dropped and are logged as errors. """ import re # Extract content from markdown code block if present code_block_pattern = r'```(?:beancount)?\n(.*?)```' match = re.search(code_block_pattern, beancount_statements, re.DOTALL) if match: content = match.group(1) else: content = beancount_statements # Pattern for valid beancount price statement: YYYY-MM-DD price COMMODITY PRICE CURRENCY price_pattern = r'^\d{4}-\d{2}-\d{2}\s+price\s+[A-Z0-9]+\s+[\d.]+\s+[A-Z]+$' valid_statements = [] for line in content.strip().split('\n'): line = line.strip() if not line: continue if re.match(price_pattern, line): valid_statements.append(line) else: print(f"Warning: Dropping invalid line: {line}") return '\n'.join(valid_statements) def save_price_statements(beancount_statements: str): """ The statements are saved in a beancount file in ledger/trading/YYYY/MM.beancount. The year and month are extracted from the first beancount statement in the input. The file is created if it doesn't exist or the statements are appended to the end of the file if it already exists. """ import re from pathlib import Path if not beancount_statements.strip(): print("Warning: No valid statements to save") return # Extract date from first statement (format: YYYY-MM-DD price ...) first_line = beancount_statements.strip().split('\n')[0] date_match = re.match(r'^(\d{4})-(\d{2})-\d{2}', first_line) if not date_match: print(f"Error: Could not extract date from first statement: { first_line}") return year = date_match.group(1) month = date_match.group(2) # Create directory structure if it doesn't exist output_dir = Path(f"ledger/trading/{year}") output_dir.mkdir(parents=True, exist_ok=True) # Define output file path output_file = output_dir / f"{month}.beancount" # Append statements to file (create if doesn't exist) with open(output_file, 'a') as f: f.write(beancount_statements) f.write('\n') print(f"Saved price statements to {output_file}") async def main(): parser = argparse.ArgumentParser( description="Parse R4 report from XLSX format") parser.add_argument("source", help="Path to the input XLSX file") args = parser.parse_args() if not args.source.endswith(".xlsx"): parser.error("Input file must have .xlsx format") markdown_report = convert_file_to_markdown(args.source) beancount_statements = await get_beancount_price_statements(markdown_report) print(f"Final result: \n{beancount_statements}") clean_beancount_statements = parse_price_statements(beancount_statements) save_price_statements(clean_beancount_statements) if __name__ == "__main__": asyncio.run(main())