From 020ba16723af101f40115e17f5a46a5948955380 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 17 Apr 2026 22:01:46 +0000 Subject: [PATCH] Add CSV attachment fallback for InvestEngine email parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Context: IE has not (yet) sent CSV-attached statements in production, but the upstream parser had _extract_positions_csv as a third fallback for exactly this case. Keeping the fallback preserves behaviour-parity with the legacy parser and makes future statement support one fixture away — the shape is documented by column set, not scraped live. Unlike the upstream, which split the body on whitespace and broke on any embedded commas in names, this port walks real MIME attachments using Python's csv.DictReader. A part qualifies as CSV if: - its Content-Type is text/csv / application/csv / application/vnd.ms-excel, OR - its filename ends in .csv (defence against IE mis-labelling the part) Rows missing required columns or containing unparseable numbers/dates are skipped silently — consistent with the "partial match" contract: a half-corrupt CSV yields whatever rows were intact. Required columns: ticker, unit_price, quantity, date (YYYY-MM-DD), currency. Non-GBP rows are filtered because the IE ISA is strictly sterling — flagging this assumption in the review notes. This change: - Adds `_parse_csv_attachment(raw_email)` as the third strategy after text/plain and text/html; it re-parses the raw email bytes so we can inspect Content-Type/filename on each part. - Strips symbols/currencies, filters non-GBP, and runs each row through the shared `_build_activity` so external_id formation matches every other strategy (dedup stays consistent across strategies). - Fixture `csv_attachment.eml` has three rows (VUAG, SWDA, VUSA) in a `text/csv` part with a `.csv` filename — the part satisfies both detection criteria, though the content-type check short-circuits, so the filename-only path is not exercised on its own. 
Test plan: poetry run pytest tests/providers/parsers/ -q → 6 passed in 0.15s poetry run mypy broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → clean poetry run ruff check broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → All checks passed! poetry run yapf --diff → clean (no diff) Manual verification: load csv_attachment.eml, call parse_invest_engine_email, assert 3 activities each with symbol in {VUAG,SWDA,VUSA}, currency=GBP, notes containing "csv". --- .../providers/parsers/invest_engine.py | 83 ++++++++++++++++++- .../fixtures/invest_engine/csv_attachment.eml | 22 +++++ tests/providers/parsers/test_invest_engine.py | 21 +++++ 3 files changed, 123 insertions(+), 3 deletions(-) create mode 100644 tests/fixtures/invest_engine/csv_attachment.eml diff --git a/broker_sync/providers/parsers/invest_engine.py b/broker_sync/providers/parsers/invest_engine.py index 7c8a494..2ab4a34 100644 --- a/broker_sync/providers/parsers/invest_engine.py +++ b/broker_sync/providers/parsers/invest_engine.py @@ -16,11 +16,13 @@ Every parse strategy produces canonical `Activity` objects with: from __future__ import annotations +import csv import email import hashlib +import io import re from datetime import datetime -from decimal import Decimal +from decimal import Decimal, InvalidOperation from email.message import Message from bs4 import BeautifulSoup @@ -48,8 +50,9 @@ _DATE_RE = re.compile( def parse_invest_engine_email(raw_email: bytes) -> list[Activity]: """Parse an IE trade confirmation email into Activity records. - Tries RFC 2822 body lines first, then HTML tables. Returns an empty - list when nothing matches — never raises on malformed input. + Tries RFC 2822 body lines first, then HTML tables, then a CSV + attachment. Returns an empty list when nothing matches — never + raises on malformed input. 
""" msg = email.message_from_bytes(raw_email) text_body = _extract_part_body(msg, "text/plain") @@ -62,6 +65,9 @@ def parse_invest_engine_email(raw_email: bytes) -> list[Activity]: activities = _parse_html_tables(html_body) if activities: return activities + csv_activities = _parse_csv_attachment(raw_email) + if csv_activities: + return csv_activities return [] @@ -223,6 +229,77 @@ def _extract_html_symbol(nested: object) -> str | None: return None +_CSV_CONTENT_TYPES = {"text/csv", "application/csv", "application/vnd.ms-excel"} +# Required columns for the CSV attachment strategy. IE has not (yet) sent +# CSV-attached statements in production — the column set here mirrors the +# upstream _extract_positions_csv contract (ticker, buy_price, num_shares, +# buy_date, currency) with modern names. +_CSV_COLUMNS = {"ticker", "unit_price", "quantity", "date", "currency"} + + +def _parse_csv_attachment(raw_email: bytes) -> list[Activity]: + """Parse a CSV attachment from the email into Activity records. + + Walks every MIME part, picks the first one with a CSV-ish content + type OR a `.csv` filename, and iterates its rows. Rows missing a + required column or with an unparseable number/date are skipped. 
+ """ + msg = email.message_from_bytes(raw_email) + csv_text = _extract_csv_attachment_text(msg) + if csv_text is None: + return [] + reader = csv.DictReader(io.StringIO(csv_text)) + fieldnames = set(reader.fieldnames or []) + if not _CSV_COLUMNS.issubset(fieldnames): + return [] + activities: list[Activity] = [] + for row in reader: + activity = _csv_row_to_activity(row) + if activity is not None: + activities.append(activity) + return activities + + +def _extract_csv_attachment_text(msg: Message) -> str | None: + for part in msg.walk(): + if not _looks_like_csv_part(part): + continue + payload = part.get_payload(decode=True) + if isinstance(payload, bytes): + return payload.decode(part.get_content_charset() or "utf-8", errors="replace") + if isinstance(payload, str): + return payload + return None + + +def _looks_like_csv_part(part: Message) -> bool: + if part.get_content_type() in _CSV_CONTENT_TYPES: + return True + filename = part.get_filename() + return isinstance(filename, str) and filename.lower().endswith(".csv") + + +def _csv_row_to_activity(row: dict[str, str]) -> Activity | None: + try: + on_date = datetime.strptime(row["date"], "%Y-%m-%d") + symbol = row["ticker"].strip() + quantity = Decimal(row["quantity"]) + unit_price = Decimal(row["unit_price"]) + currency = row["currency"].strip() or "GBP" + except (KeyError, ValueError, InvalidOperation): + return None + if not symbol or currency != "GBP": + return None + return _build_activity( + on_date=on_date, + symbol=symbol, + quantity=quantity, + unit_price=unit_price, + strategy="csv", + matched=f"{symbol},{unit_price},{quantity},{row['date']}", + ) + + def _build_activity( *, on_date: datetime, diff --git a/tests/fixtures/invest_engine/csv_attachment.eml b/tests/fixtures/invest_engine/csv_attachment.eml new file mode 100644 index 0000000..b247c00 --- /dev/null +++ b/tests/fixtures/invest_engine/csv_attachment.eml @@ -0,0 +1,22 @@ +From: InvestEngine +To: viktorbarzin@example.com +Subject: Your 
InvestEngine statement +Date: Mon, 07 Apr 2025 09:00:00 +0000 +MIME-Version: 1.0 +Content-Type: multipart/mixed; boundary="----=_MIXED_1" + +------=_MIXED_1 +Content-Type: text/plain; charset=UTF-8 + +Your monthly statement is attached as a CSV. + +------=_MIXED_1 +Content-Type: text/csv; charset=UTF-8; name="statement.csv" +Content-Disposition: attachment; filename="statement.csv" + +ticker,unit_price,quantity,date,currency +VUAG,63.21,12.5,2025-04-02,GBP +SWDA,86.40,4.75,2025-04-03,GBP +VUSA,90.10,1.0,2025-04-04,GBP + +------=_MIXED_1-- diff --git a/tests/providers/parsers/test_invest_engine.py b/tests/providers/parsers/test_invest_engine.py index d397bda..8ef81d3 100644 --- a/tests/providers/parsers/test_invest_engine.py +++ b/tests/providers/parsers/test_invest_engine.py @@ -68,3 +68,24 @@ def test_html_notes_record_html_strategy() -> None: a = parse_invest_engine_email(_load("html_two_orders.eml"))[0] assert a.notes is not None assert "html" in a.notes + + +# -- CSV attachment body -- + + +def test_csv_attachment_parses_all_rows() -> None: + activities = parse_invest_engine_email(_load("csv_attachment.eml")) + assert len(activities) == 3 + by_symbol = {a.symbol: a for a in activities} + assert by_symbol["VUAG"].quantity == Decimal("12.5") + assert by_symbol["VUAG"].unit_price == Decimal("63.21") + assert by_symbol["VUAG"].date == datetime(2025, 4, 2) + assert by_symbol["SWDA"].quantity == Decimal("4.75") + assert by_symbol["VUSA"].date == datetime(2025, 4, 4) + for a in activities: + assert a.activity_type is ActivityType.BUY + assert a.currency == "GBP" + assert a.account_id == "invest-engine-primary" + assert a.account_type is AccountType.ISA + assert a.notes is not None + assert "csv" in a.notes