Add HTML table fallback for InvestEngine email parser
Context: Plain-text IE emails vanished around 2024-Q2 when IE switched to an HTML-only template with per-order nested summary tables. The RFC 2822 line parser returns [] on those modern emails, so we need a fallback that walks the HTML table structure.

Upstream _extract_from_html parsed a fixed DOM path (table[1].tr[10].table) and only handled ONE order per email. The real IE HTML template nests one summary <table> per ticker inside the second top-level table — multiple orders in a single batched confirmation are common — so this port walks every leaf table (no child <table>) and interprets each one as an independent trade summary. Structural (non-leaf) tables are skipped to avoid double-counting via get_text().

This change:
- `_parse_html_tables(body)` extracts the date once from the full text, then walks leaf tables looking for "Bought N @ £P" rows.
- `_try_html_summary_table` parses one leaf; returns None on structural tables or missing ticker/qty/price — so a partial email yields only its intact orders (the "2 orders, 1 parseable → 1 returned" invariant works by construction without raising).
- `parse_invest_engine_email` now falls through text/plain → text/html in the multipart message, picking the first strategy that returns activities. Order matters: text/plain wins when both succeed because the RFC 2822 strategy is the more constrained grammar.
- Regexes are module-level constants so they compile once per process.

Fixture `html_two_orders.eml` is a minimal-but-realistic multipart email with two nested summary tables (VUAG + SWDA), no personal data beyond tickers/qty/price.

Test plan:
- poetry run pytest tests/providers/parsers/ -q → 5 passed in 0.16s
- poetry run mypy broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → Success: no issues found in 2 source files
- poetry run ruff check broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → All checks passed!
- poetry run yapf --diff broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → clean (no diff)

Manual verification: load html_two_orders.eml, call parse_invest_engine_email, assert len == 2 with both expected tickers (VUAG, SWDA) and numbers, dates set to 2026-04-01.
This commit is contained in:
parent
9ec8ece2d9
commit
72d348e294
3 changed files with 198 additions and 15 deletions
|
|
@ -18,41 +18,69 @@ from __future__ import annotations
|
|||
|
||||
import email
|
||||
import hashlib
|
||||
import re
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
from email.message import Message
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from broker_sync.models import AccountType, Activity, ActivityType
|
||||
|
||||
_ACCOUNT_ID = "invest-engine-primary"
_CURRENCY_SIGN = "£"

# A trade summary row reads "Bought <qty> @ £<price> per share".
# Group 1 is the quantity and group 2 the unit price; each is a run of
# ASCII digits with an optional fractional part.
_BOUGHT_RE = re.compile(
    r"Bought\s+([0-9]+(?:\.[0-9]+)?)\s*@\s*"
    + re.escape(_CURRENCY_SIGN)
    + r"([0-9]+(?:\.[0-9]+)?)",
    flags=re.IGNORECASE,
)

# A ticker cell reads like "Vanguard S&P 500: VUAG". Group 1 is the
# upper-case token (2-10 characters, starting with a letter) that follows
# the last colon and ends the cell text.
_TICKER_RE = re.compile(r":\s*([A-Z][A-Z0-9]{1,9})\s*$")

# A date cell contains "Date: DD Month YYYY". Groups are (day, month name,
# four-digit year); the month token is any run of ASCII letters.
_DATE_RE = re.compile(
    r"Date:\s*([0-9]{1,2})\s+([A-Za-z]+)\s+([0-9]{4})",
    flags=re.IGNORECASE,
)
|
||||
|
||||
|
||||
def parse_invest_engine_email(raw_email: bytes) -> list[Activity]:
    """Parse an IE trade confirmation email into Activity records.

    Two strategies are tried in order, and the first one that yields at
    least one activity wins:

    1. RFC 2822 body lines taken from the ``text/plain`` part.
    2. Per-order summary tables taken from the ``text/html`` part.

    ``text/plain`` is tried first, so it wins when both parts would parse.
    A part that is absent, or that parses to nothing, falls through to the
    next strategy.

    Returns an empty list when nothing matches. The parser is best-effort
    and is meant never to raise on malformed input.
    """
    msg = email.message_from_bytes(raw_email)
    # Order matters: earlier entries take precedence over later ones.
    strategies = (
        ("text/plain", _parse_rfc2822_lines),
        ("text/html", _parse_html_tables),
    )
    for content_type, parse in strategies:
        body = _extract_part_body(msg, content_type)
        if body is None:
            continue
        activities = parse(body)
        if activities:
            return activities
    return []
|
||||
|
||||
|
||||
def _extract_text_body(msg: Message) -> str | None:
    """Return the text/plain body of an email, or None if absent."""
    return _extract_part_body(msg, "text/plain")


def _extract_part_body(msg: Message, content_type: str) -> str | None:
    """Return the decoded body of the first part of ``content_type``.

    ``Message.walk()`` yields the message itself followed by every nested
    sub-part in document order, so one loop covers both multipart and
    single-part messages. Returns None when no part has that content type
    or when the matching part carries no decodable payload.
    """
    for part in msg.walk():
        if part.get_content_type() == content_type:
            return _decode_payload(part)
    return None


def _decode_payload(part: Message) -> str | None:
    """Decode one message part to text, or return None if it has no body.

    The transfer encoding (base64 / quoted-printable) is undone by
    ``get_payload(decode=True)``. The resulting bytes are decoded with the
    part's declared charset, defaulting to UTF-8, and undecodable bytes are
    replaced rather than raising.
    """
    payload = part.get_payload(decode=True)
    if isinstance(payload, bytes):
        charset = part.get_content_charset() or "utf-8"
        try:
            return payload.decode(charset, errors="replace")
        except LookupError:
            # The header names a charset Python has no codec for. Fall back
            # to UTF-8 so a bogus label cannot make the parser raise.
            return payload.decode("utf-8", errors="replace")
    if isinstance(payload, str):
        return payload
    return None
|
||||
|
|
@ -63,7 +91,8 @@ def _parse_rfc2822_lines(body: str) -> list[Activity]:
|
|||
|
||||
Corresponds to `_extract_position_v1` and `_extract_position_v2` in
|
||||
the upstream parser. Returns a one-element list on success, `[]`
|
||||
otherwise.
|
||||
otherwise. v3/v4 are not ported — no surviving fixtures exist and
|
||||
the HTML fallback covers newer formats.
|
||||
"""
|
||||
for parser in (_try_v2, _try_v1):
|
||||
result = parser(body)
|
||||
|
|
@ -121,6 +150,79 @@ def _try_v1(body: str) -> Activity | None:
|
|||
)
|
||||
|
||||
|
||||
def _parse_html_tables(body: str) -> list[Activity]:
    """Extract every trade from an HTML body made of nested summary tables.

    The trade date is read once from the whole document; without it no
    activity can be built, so an empty list is returned. Only leaf tables
    (those with no <table> descendant) are then examined, each treated as
    one candidate trade summary. Enclosing layout tables are ignored
    because their text includes the text of every table nested in them.
    Leaves that do not parse as a summary are dropped, so a partially
    corrupted email still yields its intact orders.
    """
    soup = BeautifulSoup(body, "html.parser")
    trade_date = _extract_html_date(soup)
    if trade_date is None:
        return []
    leaf_tables = (t for t in soup.find_all("table") if t.find("table") is None)
    candidates = (_try_html_summary_table(leaf, trade_date) for leaf in leaf_tables)
    return [activity for activity in candidates if activity is not None]
|
||||
|
||||
|
||||
def _extract_html_date(soup: BeautifulSoup) -> datetime | None:
    """Return the trade date found in the document text, or None.

    Looks for the first "Date: DD Month YYYY" occurrence in the
    whitespace-joined text of the whole document. The month may be spelled
    in full ("April") or abbreviated ("Apr"). The result is a naive
    datetime at midnight. None is returned when no date is present or the
    matched text is not a real calendar date.
    """
    match = _DATE_RE.search(soup.get_text(" ", strip=True))
    if match is None:
        return None
    day, month, year = match.groups()
    candidate = f"{day}-{month}-{year}"
    # _DATE_RE accepts any alphabetic month token, so try the full name
    # first and the abbreviated form second. Both directives follow the
    # process locale.
    for fmt in ("%d-%B-%Y", "%d-%b-%Y"):
        try:
            return datetime.strptime(candidate, fmt)
        except ValueError:
            continue
    return None
|
||||
|
||||
|
||||
def _try_html_summary_table(nested: object, on_date: datetime) -> Activity | None:
    """Build one Activity from a leaf <table>, or return None.

    ``nested`` is duck-typed: anything without a ``get_text`` method is
    rejected. None is also returned when the table text has no
    "Bought N @ £P" row (a structural table) or when no ticker cell can be
    found in it. The first 200 characters of the table text are passed
    along as the matched snippet.
    """
    text_getter = getattr(nested, "get_text", None)
    if text_getter is None:
        return None
    summary = text_getter(" ", strip=True)

    trade = _BOUGHT_RE.search(summary)
    if trade is None:
        return None

    ticker = _extract_html_symbol(nested)
    if ticker is None:
        return None

    qty_text, price_text = trade.groups()
    return _build_activity(
        on_date=on_date,
        symbol=ticker,
        quantity=Decimal(qty_text),
        unit_price=Decimal(price_text),
        strategy="html",
        matched=summary[:200],
    )
|
||||
|
||||
|
||||
def _extract_html_symbol(nested: object) -> str | None:
    """Return the ticker from the first <td> whose text ends in ": TICKER".

    ``nested`` is duck-typed: an object without ``find_all`` yields None,
    as does a table in which no cell matches the ticker pattern.
    """
    cell_finder = getattr(nested, "find_all", None)
    if cell_finder is None:
        return None
    # Lazy on purpose: cells are scanned in document order and the scan
    # stops at the first hit.
    hits = (
        _TICKER_RE.search(td.get_text(" ", strip=True))
        for td in cell_finder("td")
    )
    return next((hit.group(1) for hit in hits if hit is not None), None)
|
||||
|
||||
|
||||
def _build_activity(
|
||||
*,
|
||||
on_date: datetime,
|
||||
|
|
|
|||
55
tests/fixtures/invest_engine/html_two_orders.eml
vendored
Normal file
55
tests/fixtures/invest_engine/html_two_orders.eml
vendored
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
From: InvestEngine <no-reply@investengine.com>
|
||||
To: viktorbarzin@example.com
|
||||
Subject: Your portfolio has been updated
|
||||
Date: Wed, 01 Apr 2026 09:15:00 +0000
|
||||
MIME-Version: 1.0
|
||||
Content-Type: multipart/alternative; boundary="----=_Part_1"
|
||||
|
||||
------=_Part_1
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
|
||||
(HTML-only view — your client does not render HTML emails.)
|
||||
|
||||
------=_Part_1
|
||||
Content-Type: text/html; charset=UTF-8
|
||||
|
||||
<html><head><title>InvestEngine</title></head><body>
|
||||
<table><tr><td>Header logo</td></tr></table>
|
||||
<table>
|
||||
<tr><td>Client name: Redacted</td></tr>
|
||||
<tr><td>Trading venue: London Stock Exchange</td></tr>
|
||||
<tr><td>Type: Market Order(s)</td></tr>
|
||||
<tr><td>Here's a summary of the trades we've made for you</td></tr>
|
||||
<tr>
|
||||
<td>a</td><td>b</td><td>c</td><td>d</td>
|
||||
<td> Date: 01 April 2026 </td>
|
||||
</tr>
|
||||
<tr><td>filler</td></tr>
|
||||
<tr><td>filler</td></tr>
|
||||
<tr><td>filler</td></tr>
|
||||
<tr><td>filler</td></tr>
|
||||
<tr><td>filler</td></tr>
|
||||
<tr>
|
||||
<td>
|
||||
<table>
|
||||
<tr><td>Vanguard S&P 500: VUAG</td></tr>
|
||||
<tr><td>Bought 10.5 @ £62.10 per share</td></tr>
|
||||
<tr><td>Total: £652.05</td></tr>
|
||||
<tr><td>ISIN: IE00BFMXXD54, Order ID: 300000/4000001, Traded at 9:05am GMT</td></tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<table>
|
||||
<tr><td>iShares Core MSCI World: SWDA</td></tr>
|
||||
<tr><td>Bought 2.25 @ £85.40 per share</td></tr>
|
||||
<tr><td>Total: £192.15</td></tr>
|
||||
<tr><td>ISIN: IE00B4L5Y983, Order ID: 300000/4000002, Traded at 9:06am GMT</td></tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</body></html>
|
||||
|
||||
------=_Part_1--
|
||||
|
|
@ -42,3 +42,29 @@ def test_rfc2822_notes_record_parse_strategy() -> None:
|
|||
a = parse_invest_engine_email(_load("rfc2822_v2_single_buy.eml"))[0]
|
||||
assert a.notes is not None
|
||||
assert "rfc2822" in a.notes
|
||||
|
||||
|
||||
# -- HTML table body (multipart/alternative, two orders) --
|
||||
|
||||
|
||||
def test_html_body_parses_both_orders() -> None:
    """Both nested summary tables in the fixture become BUY activities."""
    activities = parse_invest_engine_email(_load("html_two_orders.eml"))
    assert len(activities) == 2
    vuag, swda = activities
    trade_day = datetime(2026, 4, 1)

    # First order: VUAG, including the account-level fields.
    assert vuag.symbol == "VUAG"
    assert (vuag.quantity, vuag.unit_price) == (Decimal("10.5"), Decimal("62.10"))
    assert vuag.date == trade_day
    assert vuag.account_id == "invest-engine-primary"
    assert vuag.account_type is AccountType.ISA
    assert vuag.activity_type is ActivityType.BUY

    # Second order: SWDA, sharing the email-wide trade date.
    assert swda.symbol == "SWDA"
    assert (swda.quantity, swda.unit_price) == (Decimal("2.25"), Decimal("85.40"))
    assert swda.date == trade_day
|
||||
|
||||
|
||||
def test_html_notes_record_html_strategy() -> None:
    """Activities parsed from the HTML part mention "html" in their notes."""
    first, *_ = parse_invest_engine_email(_load("html_two_orders.eml"))
    notes = first.notes
    assert notes is not None
    assert "html" in notes
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue