Add HTML table fallback for InvestEngine email parser

Context: Plain-text IE emails vanished around 2024-Q2 when IE switched to an HTML-only template with per-order nested summary tables. The RFC 2822 line parser returns [] on those modern emails, so we need a fallback that walks the HTML table structure.

Upstream `_extract_from_html` parsed a fixed DOM path (`table[1].tr[10].table`) and only handled ONE order per email. The real IE HTML template nests one summary <table> per ticker inside the second top-level table — multiple orders in a single batched confirmation are common — so this port walks every leaf table (a table with no child <table>) and interprets each one as an independent trade summary. Structural (non-leaf) tables are skipped to avoid double-counting via get_text().

This change:
- `_parse_html_tables(body)` extracts the date once from the full text, then walks leaf tables looking for "Bought N @ £P" rows.
- `_try_html_summary_table` parses one leaf; it returns None on structural tables or on a missing ticker/qty/price — so a partial email yields only its intact orders (the "2 orders, 1 parseable → 1 returned" invariant holds by construction, without raising).
- `parse_invest_engine_email` now falls through text/plain → text/html in the multipart message, picking the first strategy that returns activities. Order matters: text/plain wins when both succeed because the RFC 2822 strategy is the more constrained grammar.
- Regexes are module-level constants so they compile once per process.

Fixture `html_two_orders.eml` is a minimal-but-realistic multipart email with two nested summary tables (VUAG + SWDA), no personal data beyond tickers/qty/price.

Test plan:
  poetry run pytest tests/providers/parsers/ -q → 5 passed in 0.16s
  poetry run mypy broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → Success: no issues found in 2 source files
  poetry run ruff check broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → All checks passed!
  poetry run yapf --diff broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → clean (no diff)

Manual verification: load html_two_orders.eml, call parse_invest_engine_email, assert len == 2 with both expected tickers (VUAG, SWDA) and numbers, dates set to 2026-04-01.
2026-04-17 21:58:15 +00:00 · 2026-04-17 21:58:15 +00:00 · 72d348e294
commit 72d348e294
parent 9ec8ece2d9
3 changed files with 198 additions and 15 deletions
--- a/broker_sync/providers/parsers/invest_engine.py
+++ b/broker_sync/providers/parsers/invest_engine.py
@ -18,41 +18,69 @@ from __future__ import annotations

 import email
 import hashlib
+import re
 from datetime import datetime
 from decimal import Decimal
 from email.message import Message

+from bs4 import BeautifulSoup
+
 from broker_sync.models import AccountType, Activity, ActivityType

 _ACCOUNT_ID = "invest-engine-primary"
 _CURRENCY_SIGN = "£"

+# HTML trade summary rows have the shape "Bought <qty> @ £<price> per share".
+_BOUGHT_RE = re.compile(
+    r"Bought\s+([0-9]+(?:\.[0-9]+)?)\s*@\s*" + re.escape(_CURRENCY_SIGN) + r"([0-9]+(?:\.[0-9]+)?)",
+    re.IGNORECASE,
+)
+# Ticker lines look like "Vanguard S&P 500: VUAG" — we want the
+# upper-case token (letters/digits) that follows the colon and ends the cell.
+_TICKER_RE = re.compile(r":\s*([A-Z][A-Z0-9]{1,9})\s*$")
+# Date rows contain "Date: DD Month YYYY".
+_DATE_RE = re.compile(
+    r"Date:\s*([0-9]{1,2})\s+([A-Za-z]+)\s+([0-9]{4})",
+    re.IGNORECASE,
+)
+

 def parse_invest_engine_email(raw_email: bytes) -> list[Activity]:
    """Parse an IE trade confirmation email into Activity records.

-    Returns an empty list when none of the three strategies match — never
-    raises on malformed input.
+    Tries RFC 2822 body lines first, then HTML tables. Returns an empty
+    list when nothing matches — never raises on malformed input.
    """
    msg = email.message_from_bytes(raw_email)
-    body = _extract_text_body(msg)
-    if body is None:
-        return []
-    return _parse_rfc2822_lines(body)
+    text_body = _extract_part_body(msg, "text/plain")
+    if text_body is not None:
+        activities = _parse_rfc2822_lines(text_body)
+        if activities:
+            return activities
+    html_body = _extract_part_body(msg, "text/html")
+    if html_body is not None:
+        activities = _parse_html_tables(html_body)
+        if activities:
+            return activities
+    return []


-def _extract_text_body(msg: Message) -> str | None:
-    """Return the text/plain body of an email, or None if absent."""
+def _extract_part_body(msg: Message, content_type: str) -> str | None:
+    """Return the first sub-part of the given content type, or None."""
    if msg.is_multipart():
        for part in msg.walk():
-            if part.get_content_type() == "text/plain":
-                payload = part.get_payload(decode=True)
-                if isinstance(payload, bytes):
-                    return payload.decode(part.get_content_charset() or "utf-8", errors="replace")
+            if part.get_content_type() == content_type:
+                return _decode_payload(part)
        return None
-    payload = msg.get_payload(decode=True)
+    if msg.get_content_type() == content_type:
+        return _decode_payload(msg)
+    return None
+
+
+def _decode_payload(part: Message) -> str | None:
+    """Return the decoded payload of a message part as text, or None.
+
+    Bytes are decoded with the part's declared charset (UTF-8 when none
+    is declared), replacing undecodable bytes. A charset label Python
+    does not recognise falls back to UTF-8 instead of raising, so a
+    malformed Content-Type header cannot crash the parser. A payload
+    that is already a str is returned unchanged; anything else yields
+    None.
+    """
+    payload = part.get_payload(decode=True)
    if isinstance(payload, bytes):
-        return payload.decode(msg.get_content_charset() or "utf-8", errors="replace")
+        charset = part.get_content_charset() or "utf-8"
+        try:
+            return payload.decode(charset, errors="replace")
+        except LookupError:
+            # Unknown or misspelled charset label in the email headers.
+            return payload.decode("utf-8", errors="replace")
    if isinstance(payload, str):
        return payload
    return None
@ -63,7 +91,8 @@ def _parse_rfc2822_lines(body: str) -> list[Activity]:

    Corresponds to `_extract_position_v1` and `_extract_position_v2` in
    the upstream parser. Returns a one-element list on success, `[]`
-    otherwise.
+    otherwise. v3/v4 are not ported — no surviving fixtures exist and
+    the HTML fallback covers newer formats.
    """
    for parser in (_try_v2, _try_v1):
        result = parser(body)
@ -121,6 +150,79 @@ def _try_v1(body: str) -> Activity | None:
    )


+def _parse_html_tables(body: str) -> list[Activity]:
+    """Collect one Activity per trade-summary leaf table in an HTML body.
+
+    The trade date is looked up once for the whole document; when it
+    cannot be found nothing is returned. Every <table> that contains no
+    further <table> is then offered to `_try_html_summary_table`, and
+    tables it rejects (it returns None) are dropped silently, so one
+    unparseable order does not hide the others.
+    """
+    document = BeautifulSoup(body, "html.parser")
+    trade_date = _extract_html_date(document)
+    if trade_date is None:
+        return []
+    # Only leaf tables are candidates: an enclosing table's text would
+    # include the text of every order nested inside it.
+    leaf_tables = (t for t in document.find_all("table") if t.find("table") is None)
+    candidates = (_try_html_summary_table(leaf, trade_date) for leaf in leaf_tables)
+    return [activity for activity in candidates if activity is not None]
+
+
+def _extract_html_date(soup: BeautifulSoup) -> datetime | None:
+    """Return the trade date from the first "Date: DD Month YYYY" in the document.
+
+    The month may be spelled out ("April") or abbreviated ("Apr"); month
+    names are matched by `datetime.strptime`, i.e. against the current
+    locale. Returns None when no date row is present or the captured
+    text is not a real calendar date (e.g. "31 February 2026").
+    """
+    match = _DATE_RE.search(soup.get_text(" ", strip=True))
+    if match is None:
+        return None
+    day, month, year = match.groups()
+    # _DATE_RE accepts any alphabetic month token, so try the full name
+    # first and fall back to the abbreviated form.
+    for month_format in ("%B", "%b"):
+        try:
+            return datetime.strptime(f"{day}-{month}-{year}", f"%d-{month_format}-%Y")
+        except ValueError:
+            continue
+    return None
+
+
+def _try_html_summary_table(nested: object, on_date: datetime) -> Activity | None:
+    """Turn one leaf <table> into an Activity, or None if it is not a trade summary.
+
+    `nested` is duck-typed: it needs `get_text` here and is handed on to
+    `_extract_html_symbol` for the ticker. None is returned when the
+    object has no `get_text`, when its text has no "Bought N @ £P"
+    phrase, or when no ticker is found. The first 200 characters of the
+    table text are passed to `_build_activity` as `matched`, with the
+    strategy tag "html".
+    """
+    text_getter = getattr(nested, "get_text", None)
+    if text_getter is None:
+        return None
+    summary = text_getter(" ", strip=True)
+    trade = _BOUGHT_RE.search(summary)
+    if trade is None:
+        return None
+    ticker = _extract_html_symbol(nested)
+    if ticker is None:
+        return None
+    return _build_activity(
+        on_date=on_date,
+        symbol=ticker,
+        quantity=Decimal(trade.group(1)),
+        unit_price=Decimal(trade.group(2)),
+        strategy="html",
+        matched=summary[:200],
+    )
+
+
+def _extract_html_symbol(nested: object) -> str | None:
+    """Return the ticker from the first <td> whose text matches `_TICKER_RE`.
+
+    `nested` is duck-typed; an object without `find_all` yields None, as
+    does a table in which no cell matches.
+    """
+    cell_finder = getattr(nested, "find_all", None)
+    if cell_finder is None:
+        return None
+    hits = (_TICKER_RE.search(td.get_text(" ", strip=True)) for td in cell_finder("td"))
+    return next((hit.group(1) for hit in hits if hit is not None), None)
+
+
 def _build_activity(
    *,
    on_date: datetime,
--- a/tests/fixtures/invest_engine/html_two_orders.eml
+++ b/tests/fixtures/invest_engine/html_two_orders.eml
@ -0,0 +1,55 @@
+From: InvestEngine <no-reply@investengine.com>
+To: viktorbarzin@example.com
+Subject: Your portfolio has been updated
+Date: Wed, 01 Apr 2026 09:15:00 +0000
+MIME-Version: 1.0
+Content-Type: multipart/alternative; boundary="----=_Part_1"
+
+------=_Part_1
+Content-Type: text/plain; charset=UTF-8
+
+(HTML-only view — your client does not render HTML emails.)
+
+------=_Part_1
+Content-Type: text/html; charset=UTF-8
+
+<html><head><title>InvestEngine</title></head><body>
+<table><tr><td>Header logo</td></tr></table>
+<table>
+ <tr><td>Client name: Redacted</td></tr>
+ <tr><td>Trading venue: London Stock Exchange</td></tr>
+ <tr><td>Type: Market Order(s)</td></tr>
+ <tr><td>Here's a summary of the trades we've made for you</td></tr>
+ <tr>
+  <td>a</td><td>b</td><td>c</td><td>d</td>
+  <td> Date: 01 April 2026 </td>
+ </tr>
+ <tr><td>filler</td></tr>
+ <tr><td>filler</td></tr>
+ <tr><td>filler</td></tr>
+ <tr><td>filler</td></tr>
+ <tr><td>filler</td></tr>
+ <tr>
+  <td>
+   <table>
+    <tr><td>Vanguard S&amp;P 500: VUAG</td></tr>
+    <tr><td>Bought 10.5 @ &pound;62.10 per share</td></tr>
+    <tr><td>Total: &pound;652.05</td></tr>
+    <tr><td>ISIN: IE00BFMXXD54, Order ID: 300000/4000001, Traded at 9:05am GMT</td></tr>
+   </table>
+  </td>
+ </tr>
+ <tr>
+  <td>
+   <table>
+    <tr><td>iShares Core MSCI World: SWDA</td></tr>
+    <tr><td>Bought 2.25 @ &pound;85.40 per share</td></tr>
+    <tr><td>Total: &pound;192.15</td></tr>
+    <tr><td>ISIN: IE00B4L5Y983, Order ID: 300000/4000002, Traded at 9:06am GMT</td></tr>
+   </table>
+  </td>
+ </tr>
+</table>
+</body></html>
+
+------=_Part_1--
--- a/tests/providers/parsers/test_invest_engine.py
+++ b/tests/providers/parsers/test_invest_engine.py
@ -42,3 +42,29 @@ def test_rfc2822_notes_record_parse_strategy() -> None:
    a = parse_invest_engine_email(_load("rfc2822_v2_single_buy.eml"))[0]
    assert a.notes is not None
    assert "rfc2822" in a.notes
+
+
+# -- HTML table body (multipart/alternative, two orders) --
+
+
+def test_html_body_parses_both_orders() -> None:
+    """Both nested summary tables in the fixture become activities, in document order."""
+    parsed = parse_invest_engine_email(_load("html_two_orders.eml"))
+    assert len(parsed) == 2
+    first, second = parsed
+    # First leaf table in the fixture: VUAG.
+    assert first.symbol == "VUAG"
+    assert first.quantity == Decimal("10.5")
+    assert first.unit_price == Decimal("62.10")
+    assert first.date == datetime(2026, 4, 1)
+    assert first.account_id == "invest-engine-primary"
+    assert first.account_type is AccountType.ISA
+    assert first.activity_type is ActivityType.BUY
+    # Second leaf table: SWDA, sharing the email-wide trade date.
+    assert second.symbol == "SWDA"
+    assert second.quantity == Decimal("2.25")
+    assert second.unit_price == Decimal("85.40")
+    assert second.date == datetime(2026, 4, 1)
+
+
+def test_html_notes_record_html_strategy() -> None:
+    """Activities parsed from the HTML part mention "html" in their notes."""
+    parsed = parse_invest_engine_email(_load("html_two_orders.eml"))
+    first = parsed[0]
+    assert first.notes is not None
+    assert "html" in first.notes