parse-r4-report script to parse r4 investment prices
This commit is contained in:
179
commands/parse-r4-report
Normal file
179
commands/parse-r4-report
Normal file
@@ -0,0 +1,179 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
from claude_agent_sdk import query, ClaudeAgentOptions, ResultMessage
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
|
# System prompt handed to the Claude agent. It describes the layout of the
# R4 report once it has been converted to Markdown, the description ->
# commodity mapping, and the exact one-line output format that
# parse_price_statements() validates afterwards.
#
# Rule 5 deliberately tells the model to skip unknown descriptions without
# asking anything: the agent runs as a single non-interactive query, so a
# clarification question could never be answered and would only produce
# lines that the parser drops (it also contradicted rule 4).
GET_BEANCOUNT_STATEMENTS_PROMPT = """# Financial Data to Beancount Converter

You are an expert in converting markdown financial data tables to beancount price statements.

## Task
Parse markdown tables containing financial portfolio data and convert them to Beancount price statements.

## Input Format
You will receive markdown tables with the following sections:
- **ETFs**: Contains ETF holdings with descriptions, markets, currency, shares, and valuations
- **Fondos de Inversión** (Investment Funds): Contains fund holdings with descriptions, manager, currency, units, and valuations
- **Saldo en EUR**: Contains cash balances (should be ignored for price statements)

The date appears in the format `DD/MM/YYYY HH:MM:SS` near the top of each section.

## Commodity Mapping
Map the Spanish fund/ETF descriptions to the following commodity codes:

- "VANGUARD GLOBAL STOCK INDEX (EUR) ACC" → VANGL
- "VANGUARD GB SMALL-CAP IDX \"INV\" (EUR)" → VANSMCAP
- "ISHARES EDGE MSCI WORLD VALUE" → IWVL
- "X MSCI WORLD QUALITY ETF 1C" → XDEQ
- "RENTA 4 RENTA FIJA EURO, FI CLASE A" → R4RF
- "VANGUARD EM MK ST IX \"INV\" (EUR)" → VANEMMK

## Output Format
Generate Beancount price statements in this format:
YYYY-MM-DD price COMMODITY PRICE CURRENCY

## Rules
1. Extract the date from the table (format: DD/MM/YYYY) and convert to ISO format (YYYY-MM-DD)
2. For each ETF or Fund entry:
   - Match the "Descripción" field to the commodity mapping
   - Extract the price from "Precio actual" (ETFs) or "Valor liquidativo" (Funds)
   - Use the currency specified in the "Divisa" column
3. Ignore the "Saldo en EUR" (cash balance) section - do not create price statements for cash
4. Only output Beancount statements, no explanations or additional text
5. If a description doesn't match any commodity in the mapping, skip it silently - never ask questions or add commentary

## Example
Given this input data dated 04/10/2025 for "VANGUARD GLOBAL STOCK INDEX (EUR) ACC" that costs 54.12 EUR per participation:

Output:
2025-10-04 price VANGL 54.12 EUR

## Your Task
Parse the provided financial data tables and generate the corresponding Beancount price statements. Output only the Beancount code.
"""
|
||||||
|
|
||||||
|
|
||||||
|
def convert_file_to_markdown(path: str):
    """Convert the document at *path* and return its Markdown export.

    A fresh ``DocumentConverter`` is created for each call; the converted
    document is exported with ``export_to_markdown()``.
    """
    document = DocumentConverter().convert(path).document
    return document.export_to_markdown()
|
||||||
|
|
||||||
|
|
||||||
|
async def get_beancount_price_statements(
    r4_report: str,
    *,
    cwd: str = "/home/roger/Projectes/contabilitat-2024/",
) -> str:
    """Ask the Claude agent to turn a Markdown R4 report into price statements.

    Args:
        r4_report: The report as Markdown text; it is appended verbatim to
            the user prompt.
        cwd: Working directory passed to ``ClaudeAgentOptions``. The default
            is the path that used to be hard-coded here, so existing callers
            behave exactly as before; pass another directory to run the
            script from a different checkout or machine.

    Returns:
        The ``result`` of the last message that is a ``ResultMessage`` with
        subtype ``"success"``.

    Raises:
        ValueError: If no successful result carrying a string was received.
    """
    options = ClaudeAgentOptions(
        system_prompt=GET_BEANCOUNT_STATEMENTS_PROMPT,
        cwd=cwd,
    )

    result = None
    async for message in query(
        prompt=(
            "Convert this financial statement to beancount price statements:\n"
            f"{r4_report}"
        ),
        options=options,
    ):
        if isinstance(message, ResultMessage) and message.subtype == "success":
            result = message.result
        else:
            # Everything that is not the final successful result is echoed
            # so the progress of the agent stays visible on the console.
            print(message)

    # isinstance() already rejects None, so no separate None check is needed.
    if not isinstance(result, str):
        raise ValueError(
            "Unable to get Beancount price statements from the report!")
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def parse_price_statements(beancount_statements: str):
    """Extract the valid Beancount price statements from the agent's answer.

    The answer may be plain text or may wrap the statements in one or more
    Markdown fenced code blocks (with or without a language tag). When at
    least one fenced block is present, only the content of the fenced blocks
    is considered; otherwise the whole text is.

    A line is kept only if it has the shape
    ``YYYY-MM-DD price COMMODITY PRICE CURRENCY``, its date is a real
    calendar date, and its price is a plain decimal number (digits with an
    optional fractional part). Every other non-empty line is dropped and
    reported on stdout as a warning.

    Args:
        beancount_statements: Raw text returned by the agent.

    Returns:
        The surviving statements joined with newlines (no trailing newline);
        an empty string when nothing is valid.
    """
    import re
    from datetime import date

    def is_calendar_date(text: str) -> bool:
        # The regex only checks the digit layout; this rejects dates such
        # as 2025-13-45 that would make the ledger fail to load.
        try:
            date.fromisoformat(text)
        except ValueError:
            return False
        return True

    # Collect every fenced block, not just the first one, so statements
    # split across several blocks are not silently lost.
    fenced_blocks = re.findall(
        r'```[^\n`]*\n(.*?)```', beancount_statements, re.DOTALL)
    if fenced_blocks:
        content = '\n'.join(fenced_blocks)
    else:
        content = beancount_statements

    # YYYY-MM-DD price COMMODITY PRICE CURRENCY. The number must start with
    # a digit and contain at most one decimal point, which rules out
    # malformed values such as "1.2.3" or "." that "[\d.]+" used to accept.
    price_pattern = re.compile(
        r'(\d{4}-\d{2}-\d{2})\s+price\s+[A-Z0-9]+\s+\d+(?:\.\d*)?\s+[A-Z]+')

    valid_statements = []
    for raw_line in content.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        match = price_pattern.fullmatch(line)
        if match is not None and is_calendar_date(match.group(1)):
            valid_statements.append(line)
        else:
            print(f"Warning: Dropping invalid line: {line}")

    return '\n'.join(valid_statements)
|
||||||
|
|
||||||
|
|
||||||
|
def save_price_statements(beancount_statements: str):
    """Append price statements to ``ledger/trading/YYYY/MM.beancount``.

    The year and month are taken from the date at the start of the first
    statement; all statements go to that single file. The path is relative
    to the current working directory. Missing directories and the file
    itself are created on demand; otherwise the statements are appended.

    Nothing is written (a message is printed instead) when the input is
    blank or when the first line does not start with a ``YYYY-MM-DD`` date.

    Args:
        beancount_statements: Newline-separated price statements.
    """
    import re
    from pathlib import Path

    statements = beancount_statements.strip()
    if not statements:
        print("Warning: No valid statements to save")
        return

    # Extract date from first statement (format: YYYY-MM-DD price ...)
    first_line = statements.split('\n', 1)[0]
    date_match = re.match(r'^(\d{4})-(\d{2})-\d{2}', first_line)
    if not date_match:
        print(
            f"Error: Could not extract date from first statement: {first_line}")
        return

    year, month = date_match.groups()

    # Create directory structure if it doesn't exist
    output_dir = Path(f"ledger/trading/{year}")
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{month}.beancount"

    # If the existing file does not end with a newline, a plain append would
    # glue the first new statement onto its last line and corrupt both.
    try:
        existing = output_file.read_bytes()
    except FileNotFoundError:
        existing = b""
    needs_separator = bool(existing) and not existing.endswith(b"\n")

    # Append statements to file (create if doesn't exist)
    with open(output_file, 'a', encoding='utf-8') as f:
        if needs_separator:
            f.write('\n')
        f.write(statements)
        f.write('\n')

    print(f"Saved price statements to {output_file}")
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Command-line entry point: XLSX report -> Beancount price file.

    Parses the single positional ``source`` argument, converts the file to
    Markdown, asks the agent for price statements, prints the raw answer,
    filters it down to valid statements and saves them.

    Exits through ``parser.error`` (usage message, non-zero status) when the
    path does not end in ``.xlsx`` or does not point to an existing file.
    """
    from pathlib import Path

    parser = argparse.ArgumentParser(
        description="Parse R4 report from XLSX format")
    parser.add_argument("source", help="Path to the input XLSX file")
    args = parser.parse_args()

    # Compare case-insensitively so "REPORT.XLSX" is accepted as well.
    if not args.source.lower().endswith(".xlsx"):
        parser.error("Input file must have .xlsx format")
    # Fail early with a usage error instead of a converter traceback.
    if not Path(args.source).is_file():
        parser.error(f"Input file not found: {args.source}")

    markdown_report = convert_file_to_markdown(args.source)
    beancount_statements = await get_beancount_price_statements(markdown_report)
    print(f"Final result: \n{beancount_statements}")

    clean_beancount_statements = parse_price_statements(beancount_statements)
    save_price_statements(clean_beancount_statements)


if __name__ == "__main__":
    asyncio.run(main())
|
||||||
6
ledger/trading/2025/10.beancount
Normal file
6
ledger/trading/2025/10.beancount
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
2025-10-04 price IWVL 47.49 EUR
|
||||||
|
2025-10-04 price XDEQ 67.88 EUR
|
||||||
|
2025-10-04 price VANGL 54.12 EUR
|
||||||
|
2025-10-04 price R4RF 15.58 EUR
|
||||||
|
2025-10-04 price VANSMCAP 371.37 EUR
|
||||||
|
2025-10-04 price VANEMMK 240.14 EUR
|
||||||
@@ -1,2 +1,2 @@
|
|||||||
include "2024/*"
|
include "*/*"
|
||||||
include "prices.beancount"
|
include "prices.beancount"
|
||||||
|
|||||||
@@ -7,16 +7,18 @@ requires-python = ">=3.13"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"babel==2.13.1",
|
"babel==2.13.1",
|
||||||
"beancount==2.3.6",
|
"beancount==2.3.6",
|
||||||
"beautifulsoup4==4.12.2",
|
"beautifulsoup4==4.12.3",
|
||||||
"blinker==1.7.0",
|
"blinker==1.7.0",
|
||||||
"bottle==0.12.25",
|
"bottle==0.12.25",
|
||||||
"budget-report==0.4",
|
"budget-report==0.4",
|
||||||
"cachetools==5.3.2",
|
"cachetools==5.3.2",
|
||||||
"certifi==2023.11.17",
|
"certifi==2024.7.4",
|
||||||
"chardet==5.2.0",
|
"chardet==5.2.0",
|
||||||
"charset-normalizer==3.3.2",
|
"charset-normalizer==3.3.2",
|
||||||
"cheroot==10.0.0",
|
"cheroot==10.0.0",
|
||||||
|
"claude-agent-sdk>=0.1.0",
|
||||||
"click==8.1.7",
|
"click==8.1.7",
|
||||||
|
"docling==2.55.1",
|
||||||
"fava==1.26.2",
|
"fava==1.26.2",
|
||||||
"fava-plugins==1.0",
|
"fava-plugins==1.0",
|
||||||
"flask==3.0.0",
|
"flask==3.0.0",
|
||||||
@@ -48,7 +50,7 @@ dependencies = [
|
|||||||
"python-dateutil==2.8.2",
|
"python-dateutil==2.8.2",
|
||||||
"python-magic==0.4.27",
|
"python-magic==0.4.27",
|
||||||
"pytz==2023.3.post1",
|
"pytz==2023.3.post1",
|
||||||
"requests==2.31.0",
|
"requests==2.32.2",
|
||||||
"rsa==4.9",
|
"rsa==4.9",
|
||||||
"simplejson==3.19.2",
|
"simplejson==3.19.2",
|
||||||
"six==1.16.0",
|
"six==1.16.0",
|
||||||
|
|||||||
Reference in New Issue
Block a user