Add HTML table fallback for InvestEngine email parser

Context: Plain-text IE emails vanished around 2024-Q2 when IE switched to
an HTML-only template with per-order nested summary tables. The RFC 2822
line parser returns [] on those modern emails, so we need a fallback
that walks the HTML table structure.

Upstream _extract_from_html parsed a fixed DOM path (table[1].tr[10].
table) and only handled ONE order per email. The real IE HTML template
nests one summary <table> per ticker inside the second top-level table —
multiple orders in a single batched confirmation are common — so this
port walks every leaf table (no child <table>) and interprets each one
as an independent trade summary. Structural (non-leaf) tables are
skipped to avoid double-counting via get_text().

This change:
- `_parse_html_tables(body)` extracts the date once from the full text
  then walks leaf tables looking for "Bought N @ £P" rows.
- `_try_html_summary_table` parses one leaf; returns None on structural
  tables or missing ticker/qty/price — so a partial email yields only
  its intact orders (the "2 orders, 1 parseable → 1 returned" invariant
  works by construction without raising).
- `parse_invest_engine_email` now falls through text/plain → text/html
  in the multipart message, picking the first strategy that returns
  activities. Order matters: text/plain wins when both succeed because
  the RFC 2822 strategy is the more constrained grammar.
- Regexes are module-level constants so they compile once per process.

Fixture `html_two_orders.eml` is a minimal-but-realistic multipart email
with two nested summary tables (VUAG + SWDA), no personal data beyond
tickers/qty/price.

Test plan:
  poetry run pytest tests/providers/parsers/ -q
  → 5 passed in 0.16s
  poetry run mypy broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py
  → Success: no issues found in 2 source files
  poetry run ruff check broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py
  → All checks passed!
  poetry run yapf --diff broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py
  → clean (no diff)

Manual verification: load html_two_orders.eml, call parse_invest_engine_email,
assert len == 2 with both expected tickers (VUAG, SWDA) and numbers,
dates set to 2026-04-01.
This commit is contained in:
Viktor Barzin 2026-04-17 21:58:15 +00:00
parent 9ec8ece2d9
commit 72d348e294
3 changed files with 198 additions and 15 deletions

View file

@ -18,41 +18,69 @@ from __future__ import annotations
import email
import hashlib
import re
from datetime import datetime
from decimal import Decimal
from email.message import Message
from bs4 import BeautifulSoup
from broker_sync.models import AccountType, Activity, ActivityType
_ACCOUNT_ID = "invest-engine-primary"
_CURRENCY_SIGN = "£"
# HTML trade summary rows have the shape "Bought <qty> @ £<price> per share".
_BOUGHT_RE = re.compile(
r"Bought\s+([0-9]+(?:\.[0-9]+)?)\s*@\s*" + re.escape(_CURRENCY_SIGN) + r"([0-9]+(?:\.[0-9]+)?)",
re.IGNORECASE,
)
# Ticker lines look like "Vanguard S&P 500: VUAG" — we want the last
# all-caps token after the colon.
_TICKER_RE = re.compile(r":\s*([A-Z][A-Z0-9]{1,9})\s*$")
# Date rows contain "Date: DD Month YYYY".
_DATE_RE = re.compile(
r"Date:\s*([0-9]{1,2})\s+([A-Za-z]+)\s+([0-9]{4})",
re.IGNORECASE,
)
def parse_invest_engine_email(raw_email: bytes) -> list[Activity]:
"""Parse an IE trade confirmation email into Activity records.
Returns an empty list when none of the three strategies match never
raises on malformed input.
Tries RFC 2822 body lines first, then HTML tables. Returns an empty
list when nothing matches never raises on malformed input.
"""
msg = email.message_from_bytes(raw_email)
body = _extract_text_body(msg)
if body is None:
return []
return _parse_rfc2822_lines(body)
text_body = _extract_part_body(msg, "text/plain")
if text_body is not None:
activities = _parse_rfc2822_lines(text_body)
if activities:
return activities
html_body = _extract_part_body(msg, "text/html")
if html_body is not None:
activities = _parse_html_tables(html_body)
if activities:
return activities
return []
def _extract_text_body(msg: Message) -> str | None:
"""Return the text/plain body of an email, or None if absent."""
def _extract_part_body(msg: Message, content_type: str) -> str | None:
"""Return the first sub-part of the given content type, or None."""
if msg.is_multipart():
for part in msg.walk():
if part.get_content_type() == "text/plain":
payload = part.get_payload(decode=True)
if isinstance(payload, bytes):
return payload.decode(part.get_content_charset() or "utf-8", errors="replace")
if part.get_content_type() == content_type:
return _decode_payload(part)
return None
payload = msg.get_payload(decode=True)
if msg.get_content_type() == content_type:
return _decode_payload(msg)
return None
def _decode_payload(part: Message) -> str | None:
payload = part.get_payload(decode=True)
if isinstance(payload, bytes):
return payload.decode(msg.get_content_charset() or "utf-8", errors="replace")
return payload.decode(part.get_content_charset() or "utf-8", errors="replace")
if isinstance(payload, str):
return payload
return None
@ -63,7 +91,8 @@ def _parse_rfc2822_lines(body: str) -> list[Activity]:
Corresponds to `_extract_position_v1` and `_extract_position_v2` in
the upstream parser. Returns a one-element list on success, `[]`
otherwise.
otherwise. v3/v4 are not ported no surviving fixtures exist and
the HTML fallback covers newer formats.
"""
for parser in (_try_v2, _try_v1):
result = parser(body)
@ -121,6 +150,79 @@ def _try_v1(body: str) -> Activity | None:
)
def _parse_html_tables(body: str) -> list[Activity]:
"""Parse an HTML body with per-order nested summary tables.
Walks every leaf <table> (a table with no child tables); each leaf
carries one trade summary (ticker, bought line, total, ISIN + order
id). Tables that don't contain the expected shape are skipped, so a
partially corrupted email yields only its intact orders.
"""
soup = BeautifulSoup(body, "html.parser")
on_date = _extract_html_date(soup)
if on_date is None:
return []
activities: list[Activity] = []
for table in soup.find_all("table"):
if table.find("table") is not None:
continue
activity = _try_html_summary_table(table, on_date)
if activity is not None:
activities.append(activity)
return activities
def _extract_html_date(soup: BeautifulSoup) -> datetime | None:
match = _DATE_RE.search(soup.get_text(" ", strip=True))
if match is None:
return None
day, month, year = match.groups()
try:
return datetime.strptime(f"{day}-{month}-{year}", "%d-%B-%Y")
except ValueError:
return None
def _try_html_summary_table(nested: object, on_date: datetime) -> Activity | None:
"""Interpret a leaf <table> as a single trade summary.
Returns None if the table is structural (no "Bought N @ £P" row) or
any required field is missing.
"""
get_text = getattr(nested, "get_text", None)
if get_text is None:
return None
text = get_text(" ", strip=True)
bought = _BOUGHT_RE.search(text)
if bought is None:
return None
symbol = _extract_html_symbol(nested)
if symbol is None:
return None
quantity = Decimal(bought.group(1))
unit_price = Decimal(bought.group(2))
return _build_activity(
on_date=on_date,
symbol=symbol,
quantity=quantity,
unit_price=unit_price,
strategy="html",
matched=text[:200],
)
def _extract_html_symbol(nested: object) -> str | None:
find_all = getattr(nested, "find_all", None)
if find_all is None:
return None
for cell in find_all("td"):
cell_text = cell.get_text(" ", strip=True)
m = _TICKER_RE.search(cell_text)
if m is not None:
return m.group(1)
return None
def _build_activity(
*,
on_date: datetime,

View file

@ -0,0 +1,55 @@
From: InvestEngine <no-reply@investengine.com>
To: viktorbarzin@example.com
Subject: Your portfolio has been updated
Date: Wed, 01 Apr 2026 09:15:00 +0000
MIME-Version: 1.0
Content-Type: multipart/alternative; boundary="----=_Part_1"
------=_Part_1
Content-Type: text/plain; charset=UTF-8
(HTML-only view — your client does not render HTML emails.)
------=_Part_1
Content-Type: text/html; charset=UTF-8
<html><head><title>InvestEngine</title></head><body>
<table><tr><td>Header logo</td></tr></table>
<table>
<tr><td>Client name: Redacted</td></tr>
<tr><td>Trading venue: London Stock Exchange</td></tr>
<tr><td>Type: Market Order(s)</td></tr>
<tr><td>Here's a summary of the trades we've made for you</td></tr>
<tr>
<td>a</td><td>b</td><td>c</td><td>d</td>
<td> Date: 01 April 2026 </td>
</tr>
<tr><td>filler</td></tr>
<tr><td>filler</td></tr>
<tr><td>filler</td></tr>
<tr><td>filler</td></tr>
<tr><td>filler</td></tr>
<tr>
<td>
<table>
<tr><td>Vanguard S&amp;P 500: VUAG</td></tr>
<tr><td>Bought 10.5 @ &pound;62.10 per share</td></tr>
<tr><td>Total: &pound;652.05</td></tr>
<tr><td>ISIN: IE00BFMXXD54, Order ID: 300000/4000001, Traded at 9:05am GMT</td></tr>
</table>
</td>
</tr>
<tr>
<td>
<table>
<tr><td>iShares Core MSCI World: SWDA</td></tr>
<tr><td>Bought 2.25 @ &pound;85.40 per share</td></tr>
<tr><td>Total: &pound;192.15</td></tr>
<tr><td>ISIN: IE00B4L5Y983, Order ID: 300000/4000002, Traded at 9:06am GMT</td></tr>
</table>
</td>
</tr>
</table>
</body></html>
------=_Part_1--

View file

@ -42,3 +42,29 @@ def test_rfc2822_notes_record_parse_strategy() -> None:
a = parse_invest_engine_email(_load("rfc2822_v2_single_buy.eml"))[0]
assert a.notes is not None
assert "rfc2822" in a.notes
# -- HTML table body (multipart/alternative, two orders) --
def test_html_body_parses_both_orders() -> None:
activities = parse_invest_engine_email(_load("html_two_orders.eml"))
assert len(activities) == 2
a, b = activities
assert a.symbol == "VUAG"
assert a.quantity == Decimal("10.5")
assert a.unit_price == Decimal("62.10")
assert a.date == datetime(2026, 4, 1)
assert a.account_id == "invest-engine-primary"
assert a.account_type is AccountType.ISA
assert a.activity_type is ActivityType.BUY
assert b.symbol == "SWDA"
assert b.quantity == Decimal("2.25")
assert b.unit_price == Decimal("85.40")
assert b.date == datetime(2026, 4, 1)
def test_html_notes_record_html_strategy() -> None:
a = parse_invest_engine_email(_load("html_two_orders.eml"))[0]
assert a.notes is not None
assert "html" in a.notes