From 9ec8ece2d905a604f07616388b81335bee585581 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 17 Apr 2026 21:49:52 +0000 Subject: [PATCH 1/4] =?UTF-8?q?Add=20InvestEngine=20email=20parser=20?= =?UTF-8?q?=E2=80=94=20RFC=202822=20v1/v2=20line=20format?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Context: The old finance/ app had a 324-line IE message parser with four line-based variants (v1/v2/v3/v4) plus an HTML strategy and a CSV fallback. Port into broker-sync so we can consume IE trade confirmation emails as a backup to the live HTTP client (Phase 2b) while IE's public API remains Bearer-only. The upstream parser emits storage.model.Position; we emit canonical Activity with the broker-sync invariants: account_id="invest-engine-primary" (sink remaps to Wealthfolio UUID), account_type=ISA, currency=GBP, and external_id="invest-engine:<fingerprint>" where the fingerprint is a SHA-256 of (date|symbol|quantity|unit_price) — deterministic so repeat imports of the same email dedup at the sync-record layer. This change: - Top-level `parse_invest_engine_email(raw_email: bytes) -> list[Activity]` extracts the text/plain body from an RFC 2822 message and dispatches to the line-based parser. - `_parse_rfc2822_lines(body)` tries the v2 layout first (newer IE format where `Date: DD Month` is on line 2 and the year on line 3), then the v1 layout (where the day alone is on line 2 and `Month YYYY` on line 3). v3 and v4 variants are re-added in a follow-up if we find fixtures where they matter — initial fixture coverage hits v2. - Drops the upstream `_ticker_post_processing` VUAG→VUAG.L hack. Wealthfolio's /import/check endpoint resolves exchange suffixes; the Trading212 provider also emits suffix-free tickers (e.g. `VUAG`), so staying consistent avoids double-mapping. - Notes field records the parse-strategy tag ("rfc2822-v2") plus the matched line for debugging. 
Test plan: poetry run pytest tests/providers/parsers/ -q → 3 passed in 0.03s poetry run mypy broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → Success: no issues found in 2 source files poetry run ruff check broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → All checks passed! poetry run yapf --diff broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → clean (no diff) Manual verification: load the fixture email, call the parser, inspect the returned Activity has symbol=VUAG, quantity=59.539562, unit_price=60.46, date=2023-01-17, external_id starts with invest-engine:. --- .../providers/parsers/invest_engine.py | 150 ++++++++++++++++++ .../invest_engine/rfc2822_v2_single_buy.eml | 15 ++ tests/providers/parsers/__init__.py | 0 tests/providers/parsers/test_invest_engine.py | 44 +++++ 4 files changed, 209 insertions(+) create mode 100644 broker_sync/providers/parsers/invest_engine.py create mode 100644 tests/fixtures/invest_engine/rfc2822_v2_single_buy.eml create mode 100644 tests/providers/parsers/__init__.py create mode 100644 tests/providers/parsers/test_invest_engine.py diff --git a/broker_sync/providers/parsers/invest_engine.py b/broker_sync/providers/parsers/invest_engine.py new file mode 100644 index 0000000..6750d8c --- /dev/null +++ b/broker_sync/providers/parsers/invest_engine.py @@ -0,0 +1,150 @@ +"""InvestEngine email parser. + +IE mails the user after each trade batch. The body shape varies — over +the years IE has sent trade confirmations as plain-text RFC 2822 +messages, multipart HTML emails with a summary table, and (for older +statements) CSV attachments. This module tries the three strategies in +order and returns the first that yields at least one Activity. 
+ +Every parse strategy produces canonical `Activity` objects with: +- `account_id = "invest-engine-primary"` (sink remaps to Wealthfolio UUID) +- `account_type = AccountType.ISA` (Viktor's IE account is an ISA) +- `currency = "GBP"` +- `external_id = f"invest-engine:{fingerprint}"` where fingerprint hashes + (date, symbol, quantity, unit_price) for deterministic dedup. +""" + +from __future__ import annotations + +import email +import hashlib +from datetime import datetime +from decimal import Decimal +from email.message import Message + +from broker_sync.models import AccountType, Activity, ActivityType + +_ACCOUNT_ID = "invest-engine-primary" +_CURRENCY_SIGN = "£" + + +def parse_invest_engine_email(raw_email: bytes) -> list[Activity]: + """Parse an IE trade confirmation email into Activity records. + + Returns an empty list when none of the three strategies match — never + raises on malformed input. + """ + msg = email.message_from_bytes(raw_email) + body = _extract_text_body(msg) + if body is None: + return [] + return _parse_rfc2822_lines(body) + + +def _extract_text_body(msg: Message) -> str | None: + """Return the text/plain body of an email, or None if absent.""" + if msg.is_multipart(): + for part in msg.walk(): + if part.get_content_type() == "text/plain": + payload = part.get_payload(decode=True) + if isinstance(payload, bytes): + return payload.decode(part.get_content_charset() or "utf-8", errors="replace") + return None + payload = msg.get_payload(decode=True) + if isinstance(payload, bytes): + return payload.decode(msg.get_content_charset() or "utf-8", errors="replace") + if isinstance(payload, str): + return payload + return None + + +def _parse_rfc2822_lines(body: str) -> list[Activity]: + """Try each line-based body format (v1/v2) and return matches. + + Corresponds to `_extract_position_v1` and `_extract_position_v2` in + the upstream parser. Returns a one-element list on success, `[]` + otherwise. 
+ """ + for parser in (_try_v2, _try_v1): + result = parser(body) + if result is not None: + return [result] + return [] + + +def _try_v2(body: str) -> Activity | None: + """Parse body with v2 layout: `Date: DD Month` on line 2, year on line 3.""" + lines = body.splitlines() + if len(lines) < 6: + return None + try: + day_str, month = lines[2].split()[-2:] + year = lines[3].split()[0] + on_date = datetime.strptime(f"{day_str}-{month}-{year}", "%d-%B-%Y") + symbol = lines[4].split(":")[1].split()[0].strip() + unit_price = Decimal(lines[4].split(_CURRENCY_SIGN)[1].split()[0]) + quantity = Decimal(lines[4].split("Bought")[1].split()[0]) + except (ValueError, IndexError): + return None + return _build_activity( + on_date=on_date, + symbol=symbol, + quantity=quantity, + unit_price=unit_price, + strategy="rfc2822-v2", + matched=lines[4], + ) + + +def _try_v1(body: str) -> Activity | None: + """Parse body with v1 layout: `Date: DD` on line 2, `Month YYYY` on line 3.""" + lines = body.splitlines() + if len(lines) < 6: + return None + try: + day = int(lines[2].split("Date: ")[1]) + month, year = (lines[3].split(" ")[0]).split() + on_date = datetime.strptime(f"{day}-{month}-{year}", "%d-%B-%Y") + symbol = lines[4].split(":")[1].split()[0].strip() + quantity = Decimal(lines[4].split("Bought")[1].split()[0]) + price_str = lines[4].split("Bought")[1].split("@")[1].split()[0].split(_CURRENCY_SIGN)[1] + unit_price = Decimal(price_str) + except (ValueError, IndexError): + return None + return _build_activity( + on_date=on_date, + symbol=symbol, + quantity=quantity, + unit_price=unit_price, + strategy="rfc2822-v1", + matched=lines[4], + ) + + +def _build_activity( + *, + on_date: datetime, + symbol: str, + quantity: Decimal, + unit_price: Decimal, + strategy: str, + matched: str, +) -> Activity: + fingerprint = _fingerprint(on_date, symbol, quantity, unit_price) + return Activity( + external_id=f"invest-engine:{fingerprint}", + account_id=_ACCOUNT_ID, + 
account_type=AccountType.ISA, + date=on_date, + activity_type=ActivityType.BUY, + currency="GBP", + symbol=symbol, + quantity=quantity, + unit_price=unit_price, + notes=f"[{strategy}] {matched.strip()}", + ) + + +def _fingerprint(date: datetime, symbol: str, quantity: Decimal, unit_price: Decimal) -> str: + key = f"{date.isoformat()}|{symbol}|{quantity}|{unit_price}" + return hashlib.sha256(key.encode("utf-8")).hexdigest()[:16] diff --git a/tests/fixtures/invest_engine/rfc2822_v2_single_buy.eml b/tests/fixtures/invest_engine/rfc2822_v2_single_buy.eml new file mode 100644 index 0000000..d06afa0 --- /dev/null +++ b/tests/fixtures/invest_engine/rfc2822_v2_single_buy.eml @@ -0,0 +1,15 @@ +From: InvestEngine +To: viktorbarzin@example.com +Subject: Your portfolio has been updated +Date: Tue, 17 Jan 2023 14:48:00 +0000 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + + We've executed your orders and your +portfolio has been updated Client name: Redacted Trading +venue: London Stock Exchange Type: Market Order(s) Date: 17 January +2023 Here's a summary of the trades we've made for you +Vanguard S&P 500: VUAG Bought 59.539562 @ £60.46 per share Total: +£3600.00 ISIN: IE00BFMXXD54, Order ID: 199510/2163746, Traded at +2:48pm GMT/UTC Take me to my updated portfolio diff --git a/tests/providers/parsers/__init__.py b/tests/providers/parsers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/providers/parsers/test_invest_engine.py b/tests/providers/parsers/test_invest_engine.py new file mode 100644 index 0000000..8e04633 --- /dev/null +++ b/tests/providers/parsers/test_invest_engine.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from datetime import datetime +from decimal import Decimal +from pathlib import Path + +from broker_sync.models import AccountType, ActivityType +from broker_sync.providers.parsers.invest_engine import parse_invest_engine_email + +_FIXTURES = 
Path(__file__).parent.parent.parent / "fixtures" / "invest_engine" + + +def _load(name: str) -> bytes: + return (_FIXTURES / name).read_bytes() + + +# -- RFC 2822 body (v2-style, single BUY) -- + + +def test_rfc2822_single_buy_parses_to_one_activity() -> None: + activities = parse_invest_engine_email(_load("rfc2822_v2_single_buy.eml")) + assert len(activities) == 1 + a = activities[0] + assert a.activity_type is ActivityType.BUY + assert a.symbol == "VUAG" + assert a.quantity == Decimal("59.539562") + assert a.unit_price == Decimal("60.46") + assert a.currency == "GBP" + assert a.date == datetime(2023, 1, 17) + assert a.account_id == "invest-engine-primary" + assert a.account_type is AccountType.ISA + + +def test_rfc2822_external_id_is_deterministic() -> None: + a1 = parse_invest_engine_email(_load("rfc2822_v2_single_buy.eml"))[0] + a2 = parse_invest_engine_email(_load("rfc2822_v2_single_buy.eml"))[0] + assert a1.external_id == a2.external_id + assert a1.external_id.startswith("invest-engine:") + + +def test_rfc2822_notes_record_parse_strategy() -> None: + a = parse_invest_engine_email(_load("rfc2822_v2_single_buy.eml"))[0] + assert a.notes is not None + assert "rfc2822" in a.notes From 72d348e294f5bd3671b4d9ce75fc1701476f4223 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 17 Apr 2026 21:58:15 +0000 Subject: [PATCH 2/4] Add HTML table fallback for InvestEngine email parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Context: Plain-text IE emails vanished around 2024-Q2 when IE switched to an HTML-only template with per-order nested summary tables. The RFC 2822 line parser returns [] on those modern emails, so we need a fallback that walks the HTML table structure. Upstream _extract_from_html parsed a fixed DOM path (table[1].tr[10]. table) and only handled ONE order per email. 
The real IE HTML template nests one summary per ticker inside the second top-level table — multiple orders in a single batched confirmation are common — so this port walks every leaf table (no child <table>
) and interprets each one as an independent trade summary. Structural (non-leaf) tables are skipped to avoid double-counting via get_text(). This change: - `_parse_html_tables(body)` extracts the date once from the full text then walks leaf tables looking for "Bought N @ £P" rows. - `_try_html_summary_table` parses one leaf; returns None on structural tables or missing ticker/qty/price — so a partial email yields only its intact orders (the "2 orders, 1 parseable → 1 returned" invariant works by construction without raising). - `parse_invest_engine_email` now falls through text/plain → text/html in the multipart message, picking the first strategy that returns activities. Order matters: text/plain wins when both succeed because the RFC 2822 strategy is the more constrained grammar. - Regexes are module-level constants so they compile once per process. Fixture `html_two_orders.eml` is a minimal-but-realistic multipart email with two nested summary tables (VUAG + SWDA), no personal data beyond tickers/qty/price. Test plan: poetry run pytest tests/providers/parsers/ -q → 5 passed in 0.16s poetry run mypy broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → Success: no issues found in 2 source files poetry run ruff check broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → All checks passed! poetry run yapf --diff broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → clean (no diff) Manual verification: load html_two_orders.eml, call parse_invest_engine_email, assert len == 2 with both expected tickers (VUAG, SWDA) and numbers, dates set to 2026-04-01. 
--- .../providers/parsers/invest_engine.py | 132 ++++++++++++++++-- .../invest_engine/html_two_orders.eml | 55 ++++++++ tests/providers/parsers/test_invest_engine.py | 26 ++++ 3 files changed, 198 insertions(+), 15 deletions(-) create mode 100644 tests/fixtures/invest_engine/html_two_orders.eml diff --git a/broker_sync/providers/parsers/invest_engine.py b/broker_sync/providers/parsers/invest_engine.py index 6750d8c..7c8a494 100644 --- a/broker_sync/providers/parsers/invest_engine.py +++ b/broker_sync/providers/parsers/invest_engine.py @@ -18,41 +18,69 @@ from __future__ import annotations import email import hashlib +import re from datetime import datetime from decimal import Decimal from email.message import Message +from bs4 import BeautifulSoup + from broker_sync.models import AccountType, Activity, ActivityType _ACCOUNT_ID = "invest-engine-primary" _CURRENCY_SIGN = "£" +# HTML trade summary rows have the shape "Bought @ £ per share". +_BOUGHT_RE = re.compile( + r"Bought\s+([0-9]+(?:\.[0-9]+)?)\s*@\s*" + re.escape(_CURRENCY_SIGN) + r"([0-9]+(?:\.[0-9]+)?)", + re.IGNORECASE, +) +# Ticker lines look like "Vanguard S&P 500: VUAG" — we want the last +# all-caps token after the colon. +_TICKER_RE = re.compile(r":\s*([A-Z][A-Z0-9]{1,9})\s*$") +# Date rows contain "Date: DD Month YYYY". +_DATE_RE = re.compile( + r"Date:\s*([0-9]{1,2})\s+([A-Za-z]+)\s+([0-9]{4})", + re.IGNORECASE, +) + def parse_invest_engine_email(raw_email: bytes) -> list[Activity]: """Parse an IE trade confirmation email into Activity records. - Returns an empty list when none of the three strategies match — never - raises on malformed input. + Tries RFC 2822 body lines first, then HTML tables. Returns an empty + list when nothing matches — never raises on malformed input. 
""" msg = email.message_from_bytes(raw_email) - body = _extract_text_body(msg) - if body is None: - return [] - return _parse_rfc2822_lines(body) + text_body = _extract_part_body(msg, "text/plain") + if text_body is not None: + activities = _parse_rfc2822_lines(text_body) + if activities: + return activities + html_body = _extract_part_body(msg, "text/html") + if html_body is not None: + activities = _parse_html_tables(html_body) + if activities: + return activities + return [] -def _extract_text_body(msg: Message) -> str | None: - """Return the text/plain body of an email, or None if absent.""" +def _extract_part_body(msg: Message, content_type: str) -> str | None: + """Return the first sub-part of the given content type, or None.""" if msg.is_multipart(): for part in msg.walk(): - if part.get_content_type() == "text/plain": - payload = part.get_payload(decode=True) - if isinstance(payload, bytes): - return payload.decode(part.get_content_charset() or "utf-8", errors="replace") + if part.get_content_type() == content_type: + return _decode_payload(part) return None - payload = msg.get_payload(decode=True) + if msg.get_content_type() == content_type: + return _decode_payload(msg) + return None + + +def _decode_payload(part: Message) -> str | None: + payload = part.get_payload(decode=True) if isinstance(payload, bytes): - return payload.decode(msg.get_content_charset() or "utf-8", errors="replace") + return payload.decode(part.get_content_charset() or "utf-8", errors="replace") if isinstance(payload, str): return payload return None @@ -63,7 +91,8 @@ def _parse_rfc2822_lines(body: str) -> list[Activity]: Corresponds to `_extract_position_v1` and `_extract_position_v2` in the upstream parser. Returns a one-element list on success, `[]` - otherwise. + otherwise. v3/v4 are not ported — no surviving fixtures exist and + the HTML fallback covers newer formats. 
""" for parser in (_try_v2, _try_v1): result = parser(body) @@ -121,6 +150,79 @@ def _try_v1(body: str) -> Activity | None: ) +def _parse_html_tables(body: str) -> list[Activity]: + """Parse an HTML body with per-order nested summary tables. + + Walks every leaf
(a table with no child tables); each leaf + carries one trade summary (ticker, bought line, total, ISIN + order + id). Tables that don't contain the expected shape are skipped, so a + partially corrupted email yields only its intact orders. + """ + soup = BeautifulSoup(body, "html.parser") + on_date = _extract_html_date(soup) + if on_date is None: + return [] + activities: list[Activity] = [] + for table in soup.find_all("table"): + if table.find("table") is not None: + continue + activity = _try_html_summary_table(table, on_date) + if activity is not None: + activities.append(activity) + return activities + + +def _extract_html_date(soup: BeautifulSoup) -> datetime | None: + match = _DATE_RE.search(soup.get_text(" ", strip=True)) + if match is None: + return None + day, month, year = match.groups() + try: + return datetime.strptime(f"{day}-{month}-{year}", "%d-%B-%Y") + except ValueError: + return None + + +def _try_html_summary_table(nested: object, on_date: datetime) -> Activity | None: + """Interpret a leaf
as a single trade summary. + + Returns None if the table is structural (no "Bought N @ £P" row) or + any required field is missing. + """ + get_text = getattr(nested, "get_text", None) + if get_text is None: + return None + text = get_text(" ", strip=True) + bought = _BOUGHT_RE.search(text) + if bought is None: + return None + symbol = _extract_html_symbol(nested) + if symbol is None: + return None + quantity = Decimal(bought.group(1)) + unit_price = Decimal(bought.group(2)) + return _build_activity( + on_date=on_date, + symbol=symbol, + quantity=quantity, + unit_price=unit_price, + strategy="html", + matched=text[:200], + ) + + +def _extract_html_symbol(nested: object) -> str | None: + find_all = getattr(nested, "find_all", None) + if find_all is None: + return None + for cell in find_all("td"): + cell_text = cell.get_text(" ", strip=True) + m = _TICKER_RE.search(cell_text) + if m is not None: + return m.group(1) + return None + + def _build_activity( *, on_date: datetime, diff --git a/tests/fixtures/invest_engine/html_two_orders.eml b/tests/fixtures/invest_engine/html_two_orders.eml new file mode 100644 index 0000000..b360b14 --- /dev/null +++ b/tests/fixtures/invest_engine/html_two_orders.eml @@ -0,0 +1,55 @@ +From: InvestEngine +To: viktorbarzin@example.com +Subject: Your portfolio has been updated +Date: Wed, 01 Apr 2026 09:15:00 +0000 +MIME-Version: 1.0 +Content-Type: multipart/alternative; boundary="----=_Part_1" + +------=_Part_1 +Content-Type: text/plain; charset=UTF-8 + +(HTML-only view — your client does not render HTML emails.) + +------=_Part_1 +Content-Type: text/html; charset=UTF-8 + +InvestEngine +
Header logo
+ + + + + + + + + + + + + + + + + + + + +
Client name: Redacted
Trading venue: London Stock Exchange
Type: Market Order(s)
Here's a summary of the trades we've made for you
abcd Date: 01 April 2026
filler
filler
filler
filler
filler
+ + + + + +
Vanguard S&P 500: VUAG
Bought 10.5 @ £62.10 per share
Total: £652.05
ISIN: IE00BFMXXD54, Order ID: 300000/4000001, Traded at 9:05am GMT
+
+ + + + + +
iShares Core MSCI World: SWDA
Bought 2.25 @ £85.40 per share
Total: £192.15
ISIN: IE00B4L5Y983, Order ID: 300000/4000002, Traded at 9:06am GMT
+
+ + +------=_Part_1-- diff --git a/tests/providers/parsers/test_invest_engine.py b/tests/providers/parsers/test_invest_engine.py index 8e04633..d397bda 100644 --- a/tests/providers/parsers/test_invest_engine.py +++ b/tests/providers/parsers/test_invest_engine.py @@ -42,3 +42,29 @@ def test_rfc2822_notes_record_parse_strategy() -> None: a = parse_invest_engine_email(_load("rfc2822_v2_single_buy.eml"))[0] assert a.notes is not None assert "rfc2822" in a.notes + + +# -- HTML table body (multipart/alternative, two orders) -- + + +def test_html_body_parses_both_orders() -> None: + activities = parse_invest_engine_email(_load("html_two_orders.eml")) + assert len(activities) == 2 + a, b = activities + assert a.symbol == "VUAG" + assert a.quantity == Decimal("10.5") + assert a.unit_price == Decimal("62.10") + assert a.date == datetime(2026, 4, 1) + assert a.account_id == "invest-engine-primary" + assert a.account_type is AccountType.ISA + assert a.activity_type is ActivityType.BUY + assert b.symbol == "SWDA" + assert b.quantity == Decimal("2.25") + assert b.unit_price == Decimal("85.40") + assert b.date == datetime(2026, 4, 1) + + +def test_html_notes_record_html_strategy() -> None: + a = parse_invest_engine_email(_load("html_two_orders.eml"))[0] + assert a.notes is not None + assert "html" in a.notes From 020ba16723af101f40115e17f5a46a5948955380 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 17 Apr 2026 22:01:46 +0000 Subject: [PATCH 3/4] Add CSV attachment fallback for InvestEngine email parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Context: IE has not (yet) sent CSV-attached statements in production, but the upstream parser had _extract_positions_csv as a third fallback for exactly this case. Keeping the fallback preserves behaviour-parity with the legacy parser and makes future statement support one fixture away — the shape is documented by column set, not scraped live. 
Unlike the upstream which split the body on whitespace and broke on any embedded commas in names, this port walks real MIME attachments using Python's csv.DictReader. A part qualifies as CSV if: - its Content-Type is text/csv / application/csv / application/vnd.ms-excel, OR - its filename ends in .csv (defence against IE mis-labelling the part) Rows missing required columns or containing unparseable numbers/dates are skipped silently — consistent with the "partial match" contract: a half-corrupt CSV yields whatever rows were intact. Required columns: ticker, unit_price, quantity, date (YYYY-MM-DD), currency. Non-GBP rows are filtered because the IE ISA is strictly sterling — flagging this assumption in the review notes. This change: - Adds `_parse_csv_attachment(raw_email)` as the third strategy after text/plain and text/html; it re-parses the raw email bytes so we can inspect Content-Type/filename on each part. - Flags symbols/currencies, filters non-GBP, and runs each row through the shared `_build_activity` so external_id formation matches every other strategy (dedup stays consistent across strategies). - Fixture `csv_attachment.eml` has three rows (VUAG, SWDA, VUSA) in a `text/csv` part with a `.csv` filename — covers both detection paths. Test plan: poetry run pytest tests/providers/parsers/ -q → 6 passed in 0.15s poetry run mypy broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → clean poetry run ruff check broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → All checks passed! poetry run yapf --diff → clean (no diff) Manual verification: load csv_attachment.eml, call parse_invest_engine_email, assert 3 activities each with symbol in {VUAG,SWDA,VUSA}, currency=GBP, notes containing "csv". 
--- .../providers/parsers/invest_engine.py | 83 ++++++++++++++++++- .../fixtures/invest_engine/csv_attachment.eml | 22 +++++ tests/providers/parsers/test_invest_engine.py | 21 +++++ 3 files changed, 123 insertions(+), 3 deletions(-) create mode 100644 tests/fixtures/invest_engine/csv_attachment.eml diff --git a/broker_sync/providers/parsers/invest_engine.py b/broker_sync/providers/parsers/invest_engine.py index 7c8a494..2ab4a34 100644 --- a/broker_sync/providers/parsers/invest_engine.py +++ b/broker_sync/providers/parsers/invest_engine.py @@ -16,11 +16,13 @@ Every parse strategy produces canonical `Activity` objects with: from __future__ import annotations +import csv import email import hashlib +import io import re from datetime import datetime -from decimal import Decimal +from decimal import Decimal, InvalidOperation from email.message import Message from bs4 import BeautifulSoup @@ -48,8 +50,9 @@ _DATE_RE = re.compile( def parse_invest_engine_email(raw_email: bytes) -> list[Activity]: """Parse an IE trade confirmation email into Activity records. - Tries RFC 2822 body lines first, then HTML tables. Returns an empty - list when nothing matches — never raises on malformed input. + Tries RFC 2822 body lines first, then HTML tables, then a CSV + attachment. Returns an empty list when nothing matches — never + raises on malformed input. """ msg = email.message_from_bytes(raw_email) text_body = _extract_part_body(msg, "text/plain") @@ -62,6 +65,9 @@ def parse_invest_engine_email(raw_email: bytes) -> list[Activity]: activities = _parse_html_tables(html_body) if activities: return activities + csv_activities = _parse_csv_attachment(raw_email) + if csv_activities: + return csv_activities return [] @@ -223,6 +229,77 @@ def _extract_html_symbol(nested: object) -> str | None: return None +_CSV_CONTENT_TYPES = {"text/csv", "application/csv", "application/vnd.ms-excel"} +# Required columns for the CSV attachment strategy. 
IE has not (yet) sent +# CSV-attached statements in production — the column set here mirrors the +# upstream _extract_positions_csv contract (ticker, buy_price, num_shares, +# buy_date, currency) with modern names. +_CSV_COLUMNS = {"ticker", "unit_price", "quantity", "date", "currency"} + + +def _parse_csv_attachment(raw_email: bytes) -> list[Activity]: + """Parse a CSV attachment from the email into Activity records. + + Walks every MIME part, picks the first one with a CSV-ish content + type OR a `.csv` filename, and iterates its rows. Rows missing a + required column or with an unparseable number/date are skipped. + """ + msg = email.message_from_bytes(raw_email) + csv_text = _extract_csv_attachment_text(msg) + if csv_text is None: + return [] + reader = csv.DictReader(io.StringIO(csv_text)) + fieldnames = set(reader.fieldnames or []) + if not _CSV_COLUMNS.issubset(fieldnames): + return [] + activities: list[Activity] = [] + for row in reader: + activity = _csv_row_to_activity(row) + if activity is not None: + activities.append(activity) + return activities + + +def _extract_csv_attachment_text(msg: Message) -> str | None: + for part in msg.walk(): + if not _looks_like_csv_part(part): + continue + payload = part.get_payload(decode=True) + if isinstance(payload, bytes): + return payload.decode(part.get_content_charset() or "utf-8", errors="replace") + if isinstance(payload, str): + return payload + return None + + +def _looks_like_csv_part(part: Message) -> bool: + if part.get_content_type() in _CSV_CONTENT_TYPES: + return True + filename = part.get_filename() + return isinstance(filename, str) and filename.lower().endswith(".csv") + + +def _csv_row_to_activity(row: dict[str, str]) -> Activity | None: + try: + on_date = datetime.strptime(row["date"], "%Y-%m-%d") + symbol = row["ticker"].strip() + quantity = Decimal(row["quantity"]) + unit_price = Decimal(row["unit_price"]) + currency = row["currency"].strip() or "GBP" + except (KeyError, ValueError, 
InvalidOperation): + return None + if not symbol or currency != "GBP": + return None + return _build_activity( + on_date=on_date, + symbol=symbol, + quantity=quantity, + unit_price=unit_price, + strategy="csv", + matched=f"{symbol},{unit_price},{quantity},{row['date']}", + ) + + def _build_activity( *, on_date: datetime, diff --git a/tests/fixtures/invest_engine/csv_attachment.eml b/tests/fixtures/invest_engine/csv_attachment.eml new file mode 100644 index 0000000..b247c00 --- /dev/null +++ b/tests/fixtures/invest_engine/csv_attachment.eml @@ -0,0 +1,22 @@ +From: InvestEngine +To: viktorbarzin@example.com +Subject: Your InvestEngine statement +Date: Mon, 07 Apr 2025 09:00:00 +0000 +MIME-Version: 1.0 +Content-Type: multipart/mixed; boundary="----=_MIXED_1" + +------=_MIXED_1 +Content-Type: text/plain; charset=UTF-8 + +Your monthly statement is attached as a CSV. + +------=_MIXED_1 +Content-Type: text/csv; charset=UTF-8; name="statement.csv" +Content-Disposition: attachment; filename="statement.csv" + +ticker,unit_price,quantity,date,currency +VUAG,63.21,12.5,2025-04-02,GBP +SWDA,86.40,4.75,2025-04-03,GBP +VUSA,90.10,1.0,2025-04-04,GBP + +------=_MIXED_1-- diff --git a/tests/providers/parsers/test_invest_engine.py b/tests/providers/parsers/test_invest_engine.py index d397bda..8ef81d3 100644 --- a/tests/providers/parsers/test_invest_engine.py +++ b/tests/providers/parsers/test_invest_engine.py @@ -68,3 +68,24 @@ def test_html_notes_record_html_strategy() -> None: a = parse_invest_engine_email(_load("html_two_orders.eml"))[0] assert a.notes is not None assert "html" in a.notes + + +# -- CSV attachment body -- + + +def test_csv_attachment_parses_all_rows() -> None: + activities = parse_invest_engine_email(_load("csv_attachment.eml")) + assert len(activities) == 3 + by_symbol = {a.symbol: a for a in activities} + assert by_symbol["VUAG"].quantity == Decimal("12.5") + assert by_symbol["VUAG"].unit_price == Decimal("63.21") + assert by_symbol["VUAG"].date == datetime(2025, 
4, 2) + assert by_symbol["SWDA"].quantity == Decimal("4.75") + assert by_symbol["VUSA"].date == datetime(2025, 4, 4) + for a in activities: + assert a.activity_type is ActivityType.BUY + assert a.currency == "GBP" + assert a.account_id == "invest-engine-primary" + assert a.account_type is AccountType.ISA + assert a.notes is not None + assert "csv" in a.notes From 87526898e6afcedac4fc8cd18e283467dccd1e4a Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 17 Apr 2026 22:02:48 +0000 Subject: [PATCH 4/4] =?UTF-8?q?Pin=20InvestEngine=20parser=20failure=20mod?= =?UTF-8?q?es=20=E2=80=94=20empty-on-junk=20+=20partial-match?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Context: The port's graceful-failure contract was implicit in the way each strategy returns None/[] on malformed input, but without tests it was an accidental property that could regress silently. Codify it. Two invariants, each backed by a fixture: 1. Junk email → empty list, never raise. `unparseable.eml` is a pure-marketing IE newsletter with no order data. All three strategies try and fail; parse_invest_engine_email returns []. No exception leaks. 2. Partial HTML email → intact orders only. `html_partial_match.eml` has two nested summary tables: one with a valid VUAG order, one that is missing both the ticker and "Bought N @ £P" rows (simulates IE dropping content mid-render). The parser returns just the VUAG order. No implementation change needed — the behaviour existed as a side effect of _try_html_summary_table returning None on missing fields. These tests lock it down so future refactors can't quietly break it. Test plan: poetry run pytest tests/providers/parsers/ -q → 8 passed in 0.19s poetry run mypy broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → clean poetry run ruff check broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → All checks passed! 
poetry run yapf --diff → clean (no diff) Manual verification: - Load unparseable.eml → parse returns []. - Load html_partial_match.eml → parse returns exactly 1 activity (VUAG). --- .../invest_engine/html_partial_match.eml | 40 +++++++++++++++++++ tests/fixtures/invest_engine/unparseable.eml | 15 +++++++ tests/providers/parsers/test_invest_engine.py | 17 ++++++++ 3 files changed, 72 insertions(+) create mode 100644 tests/fixtures/invest_engine/html_partial_match.eml create mode 100644 tests/fixtures/invest_engine/unparseable.eml diff --git a/tests/fixtures/invest_engine/html_partial_match.eml b/tests/fixtures/invest_engine/html_partial_match.eml new file mode 100644 index 0000000..fc41aa1 --- /dev/null +++ b/tests/fixtures/invest_engine/html_partial_match.eml @@ -0,0 +1,40 @@ +From: InvestEngine +To: viktorbarzin@example.com +Subject: Your portfolio has been updated +Date: Wed, 15 Apr 2026 11:00:00 +0000 +MIME-Version: 1.0 +Content-Type: multipart/alternative; boundary="----=_Part_PM" + +------=_Part_PM +Content-Type: text/plain; charset=UTF-8 + +(HTML-only view — your client does not render HTML emails.) + +------=_Part_PM +Content-Type: text/html; charset=UTF-8 + + +
Logo
+ + + + + + + + +
Date: 15 April 2026
+ + + + +
Vanguard S&P 500: VUAG
Bought 3.0 @ £61.25 per share
Total: £183.75
+
+ + + +
Some broken order with no ticker and no bought line
(Malformed — IE dropped a row mid-render)
+
+ + +------=_Part_PM-- diff --git a/tests/fixtures/invest_engine/unparseable.eml b/tests/fixtures/invest_engine/unparseable.eml new file mode 100644 index 0000000..933f99a --- /dev/null +++ b/tests/fixtures/invest_engine/unparseable.eml @@ -0,0 +1,15 @@ +From: InvestEngine +To: viktorbarzin@example.com +Subject: InvestEngine newsletter +Date: Thu, 10 Apr 2025 12:00:00 +0000 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 + +Hi Viktor, + +This is a newsletter, not a trade confirmation. There is no structured +order data here — just marketing copy and a promo for a new feature we +are rolling out. Thanks for being a customer. + +Cheers, +The InvestEngine team diff --git a/tests/providers/parsers/test_invest_engine.py b/tests/providers/parsers/test_invest_engine.py index 8ef81d3..9c30889 100644 --- a/tests/providers/parsers/test_invest_engine.py +++ b/tests/providers/parsers/test_invest_engine.py @@ -89,3 +89,20 @@ def test_csv_attachment_parses_all_rows() -> None: assert a.account_type is AccountType.ISA assert a.notes is not None assert "csv" in a.notes + + +# -- graceful failure modes -- + + +def test_unparseable_email_returns_empty_list() -> None: + assert parse_invest_engine_email(_load("unparseable.eml")) == [] + + +def test_html_partial_match_returns_only_parseable_orders() -> None: + activities = parse_invest_engine_email(_load("html_partial_match.eml")) + assert len(activities) == 1 + a = activities[0] + assert a.symbol == "VUAG" + assert a.quantity == Decimal("3.0") + assert a.unit_price == Decimal("61.25") + assert a.date == datetime(2026, 4, 15)