From 72d348e294f5bd3671b4d9ce75fc1701476f4223 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 17 Apr 2026 21:58:15 +0000 Subject: [PATCH] Add HTML table fallback for InvestEngine email parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Context: Plain-text IE emails vanished around 2024-Q2 when IE switched to an HTML-only template with per-order nested summary tables. The RFC 2822 line parser returns [] on those modern emails, so we need a fallback that walks the HTML table structure. Upstream _extract_from_html parsed a fixed DOM path (table[1].tr[10].table) and only handled ONE order per email. The real IE HTML template nests one summary per ticker inside the second top-level table — multiple orders in a single batched confirmation are common — so this port walks every leaf table (no child <table>
) and interprets each one as an independent trade summary. Structural (non-leaf) tables are skipped to avoid double-counting via get_text(). This change: - `_parse_html_tables(body)` extracts the date once from the full text then walks leaf tables looking for "Bought N @ £P" rows. - `_try_html_summary_table` parses one leaf; returns None on structural tables or missing ticker/qty/price — so a partial email yields only its intact orders (the "2 orders, 1 parseable → 1 returned" invariant works by construction without raising). - `parse_invest_engine_email` now falls through text/plain → text/html in the multipart message, picking the first strategy that returns activities. Order matters: text/plain wins when both succeed because the RFC 2822 strategy is the more constrained grammar. - Regexes are module-level constants so they compile once per process. Fixture `html_two_orders.eml` is a minimal-but-realistic multipart email with two nested summary tables (VUAG + SWDA), no personal data beyond tickers/qty/price. Test plan: poetry run pytest tests/providers/parsers/ -q → 5 passed in 0.16s poetry run mypy broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → Success: no issues found in 2 source files poetry run ruff check broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → All checks passed! poetry run yapf --diff broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → clean (no diff) Manual verification: load html_two_orders.eml, call parse_invest_engine_email, assert len == 2 with both expected tickers (VUAG, SWDA) and numbers, dates set to 2026-04-01. 
--- .../providers/parsers/invest_engine.py | 132 ++++++++++++++++-- .../invest_engine/html_two_orders.eml | 55 ++++++++ tests/providers/parsers/test_invest_engine.py | 26 ++++ 3 files changed, 198 insertions(+), 15 deletions(-) create mode 100644 tests/fixtures/invest_engine/html_two_orders.eml diff --git a/broker_sync/providers/parsers/invest_engine.py b/broker_sync/providers/parsers/invest_engine.py index 6750d8c..7c8a494 100644 --- a/broker_sync/providers/parsers/invest_engine.py +++ b/broker_sync/providers/parsers/invest_engine.py @@ -18,41 +18,69 @@ from __future__ import annotations import email import hashlib +import re from datetime import datetime from decimal import Decimal from email.message import Message +from bs4 import BeautifulSoup + from broker_sync.models import AccountType, Activity, ActivityType _ACCOUNT_ID = "invest-engine-primary" _CURRENCY_SIGN = "£" +# HTML trade summary rows have the shape "Bought <qty> @ £<price> per share". +_BOUGHT_RE = re.compile( + r"Bought\s+([0-9]+(?:\.[0-9]+)?)\s*@\s*" + re.escape(_CURRENCY_SIGN) + r"([0-9]+(?:\.[0-9]+)?)", + re.IGNORECASE, +) +# Ticker lines look like "Vanguard S&P 500: VUAG" — we want the last +# all-caps token after the colon. +_TICKER_RE = re.compile(r":\s*([A-Z][A-Z0-9]{1,9})\s*$") +# Date rows contain "Date: DD Month YYYY". +_DATE_RE = re.compile( + r"Date:\s*([0-9]{1,2})\s+([A-Za-z]+)\s+([0-9]{4})", + re.IGNORECASE, +) + def parse_invest_engine_email(raw_email: bytes) -> list[Activity]: """Parse an IE trade confirmation email into Activity records. - Returns an empty list when none of the three strategies match — never - raises on malformed input. + Tries RFC 2822 body lines first, then HTML tables. Returns an empty + list when nothing matches — never raises on malformed input. 
""" msg = email.message_from_bytes(raw_email) - body = _extract_text_body(msg) - if body is None: - return [] - return _parse_rfc2822_lines(body) + text_body = _extract_part_body(msg, "text/plain") + if text_body is not None: + activities = _parse_rfc2822_lines(text_body) + if activities: + return activities + html_body = _extract_part_body(msg, "text/html") + if html_body is not None: + activities = _parse_html_tables(html_body) + if activities: + return activities + return [] -def _extract_text_body(msg: Message) -> str | None: - """Return the text/plain body of an email, or None if absent.""" +def _extract_part_body(msg: Message, content_type: str) -> str | None: + """Return the first sub-part of the given content type, or None.""" if msg.is_multipart(): for part in msg.walk(): - if part.get_content_type() == "text/plain": - payload = part.get_payload(decode=True) - if isinstance(payload, bytes): - return payload.decode(part.get_content_charset() or "utf-8", errors="replace") + if part.get_content_type() == content_type: + return _decode_payload(part) return None - payload = msg.get_payload(decode=True) + if msg.get_content_type() == content_type: + return _decode_payload(msg) + return None + + +def _decode_payload(part: Message) -> str | None: + payload = part.get_payload(decode=True) if isinstance(payload, bytes): - return payload.decode(msg.get_content_charset() or "utf-8", errors="replace") + return payload.decode(part.get_content_charset() or "utf-8", errors="replace") if isinstance(payload, str): return payload return None @@ -63,7 +91,8 @@ def _parse_rfc2822_lines(body: str) -> list[Activity]: Corresponds to `_extract_position_v1` and `_extract_position_v2` in the upstream parser. Returns a one-element list on success, `[]` - otherwise. + otherwise. v3/v4 are not ported — no surviving fixtures exist and + the HTML fallback covers newer formats. 
""" for parser in (_try_v2, _try_v1): result = parser(body) @@ -121,6 +150,79 @@ def _try_v1(body: str) -> Activity | None: ) +def _parse_html_tables(body: str) -> list[Activity]: + """Parse an HTML body with per-order nested summary tables. + + Walks every leaf
<table> (a table with no child tables); each leaf + carries one trade summary (ticker, bought line, total, ISIN + order + id). Tables that don't contain the expected shape are skipped, so a + partially corrupted email yields only its intact orders. + """ + soup = BeautifulSoup(body, "html.parser") + on_date = _extract_html_date(soup) + if on_date is None: + return [] + activities: list[Activity] = [] + for table in soup.find_all("table"): + if table.find("table") is not None: + continue + activity = _try_html_summary_table(table, on_date) + if activity is not None: + activities.append(activity) + return activities + + +def _extract_html_date(soup: BeautifulSoup) -> datetime | None: + match = _DATE_RE.search(soup.get_text(" ", strip=True)) + if match is None: + return None + day, month, year = match.groups() + try: + return datetime.strptime(f"{day}-{month}-{year}", "%d-%B-%Y") + except ValueError: + return None + + +def _try_html_summary_table(nested: object, on_date: datetime) -> Activity | None: + """Interpret a leaf <table>
as a single trade summary. + + Returns None if the table is structural (no "Bought N @ £P" row) or + any required field is missing. + """ + get_text = getattr(nested, "get_text", None) + if get_text is None: + return None + text = get_text(" ", strip=True) + bought = _BOUGHT_RE.search(text) + if bought is None: + return None + symbol = _extract_html_symbol(nested) + if symbol is None: + return None + quantity = Decimal(bought.group(1)) + unit_price = Decimal(bought.group(2)) + return _build_activity( + on_date=on_date, + symbol=symbol, + quantity=quantity, + unit_price=unit_price, + strategy="html", + matched=text[:200], + ) + + +def _extract_html_symbol(nested: object) -> str | None: + find_all = getattr(nested, "find_all", None) + if find_all is None: + return None + for cell in find_all("td"): + cell_text = cell.get_text(" ", strip=True) + m = _TICKER_RE.search(cell_text) + if m is not None: + return m.group(1) + return None + + def _build_activity( *, on_date: datetime, diff --git a/tests/fixtures/invest_engine/html_two_orders.eml b/tests/fixtures/invest_engine/html_two_orders.eml new file mode 100644 index 0000000..b360b14 --- /dev/null +++ b/tests/fixtures/invest_engine/html_two_orders.eml @@ -0,0 +1,55 @@ +From: InvestEngine +To: viktorbarzin@example.com +Subject: Your portfolio has been updated +Date: Wed, 01 Apr 2026 09:15:00 +0000 +MIME-Version: 1.0 +Content-Type: multipart/alternative; boundary="----=_Part_1" + +------=_Part_1 +Content-Type: text/plain; charset=UTF-8 + +(HTML-only view — your client does not render HTML emails.) + +------=_Part_1 +Content-Type: text/html; charset=UTF-8 + +InvestEngine +
Header logo
+ + + + + + + + + + + + + + + + + + + + +
Client name: Redacted
Trading venue: London Stock Exchange
Type: Market Order(s)
Here's a summary of the trades we've made for you
abcd Date: 01 April 2026
filler
filler
filler
filler
filler
+ + + + + +
Vanguard S&P 500: VUAG
Bought 10.5 @ £62.10 per share
Total: £652.05
ISIN: IE00BFMXXD54, Order ID: 300000/4000001, Traded at 9:05am GMT
+
+ + + + + +
iShares Core MSCI World: SWDA
Bought 2.25 @ £85.40 per share
Total: £192.15
ISIN: IE00B4L5Y983, Order ID: 300000/4000002, Traded at 9:06am GMT
+
+ + +------=_Part_1-- diff --git a/tests/providers/parsers/test_invest_engine.py b/tests/providers/parsers/test_invest_engine.py index 8e04633..d397bda 100644 --- a/tests/providers/parsers/test_invest_engine.py +++ b/tests/providers/parsers/test_invest_engine.py @@ -42,3 +42,29 @@ def test_rfc2822_notes_record_parse_strategy() -> None: a = parse_invest_engine_email(_load("rfc2822_v2_single_buy.eml"))[0] assert a.notes is not None assert "rfc2822" in a.notes + + +# -- HTML table body (multipart/alternative, two orders) -- + + +def test_html_body_parses_both_orders() -> None: + activities = parse_invest_engine_email(_load("html_two_orders.eml")) + assert len(activities) == 2 + a, b = activities + assert a.symbol == "VUAG" + assert a.quantity == Decimal("10.5") + assert a.unit_price == Decimal("62.10") + assert a.date == datetime(2026, 4, 1) + assert a.account_id == "invest-engine-primary" + assert a.account_type is AccountType.ISA + assert a.activity_type is ActivityType.BUY + assert b.symbol == "SWDA" + assert b.quantity == Decimal("2.25") + assert b.unit_price == Decimal("85.40") + assert b.date == datetime(2026, 4, 1) + + +def test_html_notes_record_html_strategy() -> None: + a = parse_invest_engine_email(_load("html_two_orders.eml"))[0] + assert a.notes is not None + assert "html" in a.notes