Merge ie-email-parser: HTML + CSV fallbacks + failure-mode tests
# Conflicts: # broker_sync/providers/parsers/invest_engine.py # tests/providers/parsers/test_invest_engine.py
This commit is contained in:
commit
1aa60ce348
6 changed files with 390 additions and 15 deletions
|
|
@ -16,43 +16,77 @@ Every parse strategy produces canonical `Activity` objects with:
|
|||
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import email
|
||||
import hashlib
|
||||
import io
|
||||
import re
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from email.message import Message
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from broker_sync.models import AccountType, Activity, ActivityType
|
||||
|
||||
_ACCOUNT_ID = "invest-engine-primary"
|
||||
_CURRENCY_SIGN = "£"
|
||||
|
||||
# HTML trade summary rows have the shape "Bought <qty> @ £<price> per share".
|
||||
_BOUGHT_RE = re.compile(
|
||||
r"Bought\s+([0-9]+(?:\.[0-9]+)?)\s*@\s*" + re.escape(_CURRENCY_SIGN) + r"([0-9]+(?:\.[0-9]+)?)",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
# Ticker lines look like "Vanguard S&P 500: VUAG" — we want the last
|
||||
# all-caps token after the colon.
|
||||
_TICKER_RE = re.compile(r":\s*([A-Z][A-Z0-9]{1,9})\s*$")
|
||||
# Date rows contain "Date: DD Month YYYY".
|
||||
_DATE_RE = re.compile(
|
||||
r"Date:\s*([0-9]{1,2})\s+([A-Za-z]+)\s+([0-9]{4})",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
def parse_invest_engine_email(raw_email: bytes) -> list[Activity]:
    """Parse an IE trade confirmation email into Activity records.

    Tries RFC 2822 body lines first, then HTML tables, then a CSV
    attachment. The first strategy that yields at least one activity
    wins; later strategies are not consulted. Returns an empty list when
    nothing matches, and is meant not to raise on malformed input.
    """
    msg = email.message_from_bytes(raw_email)

    # Strategy 1: line-oriented text/plain body.
    text_body = _extract_part_body(msg, "text/plain")
    if text_body is not None:
        activities = _parse_rfc2822_lines(text_body)
        if activities:
            return activities

    # Strategy 2: per-order summary tables in the text/html body.
    html_body = _extract_part_body(msg, "text/html")
    if html_body is not None:
        activities = _parse_html_tables(html_body)
        if activities:
            return activities

    # Strategy 3: CSV attachment. The helper parses the raw bytes itself
    # and returns [] when no usable attachment is found, so its result is
    # also the overall "nothing matched" answer.
    return _parse_csv_attachment(raw_email)
|
||||
|
||||
|
||||
def _extract_text_body(msg: Message) -> str | None:
|
||||
"""Return the text/plain body of an email, or None if absent."""
|
||||
def _extract_part_body(msg: Message, content_type: str) -> str | None:
|
||||
"""Return the first sub-part of the given content type, or None."""
|
||||
if msg.is_multipart():
|
||||
for part in msg.walk():
|
||||
if part.get_content_type() == "text/plain":
|
||||
payload = part.get_payload(decode=True)
|
||||
if isinstance(payload, bytes):
|
||||
return payload.decode(part.get_content_charset() or "utf-8", errors="replace")
|
||||
if part.get_content_type() == content_type:
|
||||
return _decode_payload(part)
|
||||
return None
|
||||
payload = msg.get_payload(decode=True)
|
||||
if msg.get_content_type() == content_type:
|
||||
return _decode_payload(msg)
|
||||
return None
|
||||
|
||||
|
||||
def _decode_payload(part: Message) -> str | None:
|
||||
payload = part.get_payload(decode=True)
|
||||
if isinstance(payload, bytes):
|
||||
return payload.decode(msg.get_content_charset() or "utf-8", errors="replace")
|
||||
return payload.decode(part.get_content_charset() or "utf-8", errors="replace")
|
||||
if isinstance(payload, str):
|
||||
return payload
|
||||
return None
|
||||
|
|
@ -63,7 +97,8 @@ def _parse_rfc2822_lines(body: str) -> list[Activity]:
|
|||
|
||||
Corresponds to `_extract_position_v1` and `_extract_position_v2` in
|
||||
the upstream parser. Returns a one-element list on success, `[]`
|
||||
otherwise.
|
||||
otherwise. v3/v4 are not ported — no surviving fixtures exist and
|
||||
the HTML fallback covers newer formats.
|
||||
"""
|
||||
for parser in (_try_v2, _try_v1):
|
||||
result = parser(body)
|
||||
|
|
@ -121,6 +156,150 @@ def _try_v1(body: str) -> Activity | None:
|
|||
)
|
||||
|
||||
|
||||
def _parse_html_tables(body: str) -> list[Activity]:
    """Turn the per-order summary tables of an HTML body into activities.

    The trade date is read once from the whole document; without it no
    activities are produced. Only leaf tables (tables containing no
    nested <table>) are handed to the summary-table interpreter, and any
    table it rejects is skipped, so a partially corrupted email still
    yields its intact orders.
    """
    soup = BeautifulSoup(body, "html.parser")
    trade_date = _extract_html_date(soup)
    if trade_date is None:
        return []
    leaf_tables = (
        candidate
        for candidate in soup.find_all("table")
        if candidate.find("table") is None
    )
    interpreted = (
        _try_html_summary_table(leaf, trade_date) for leaf in leaf_tables
    )
    return [activity for activity in interpreted if activity is not None]
|
||||
|
||||
|
||||
def _extract_html_date(soup: BeautifulSoup) -> datetime | None:
    """Return the trade date found in the document text, or None.

    Scans the flattened document text for "Date: DD Month YYYY" and
    returns the first occurrence that parses as a calendar date. Both
    full ("April") and abbreviated ("Apr") month names are accepted,
    because the search pattern admits any alphabetic month token.
    """
    page_text = soup.get_text(" ", strip=True)
    for match in _DATE_RE.finditer(page_text):
        day, month, year = match.groups()
        candidate = f"{day}-{month}-{year}"
        for fmt in ("%d-%B-%Y", "%d-%b-%Y"):
            try:
                return datetime.strptime(candidate, fmt)
            except ValueError:
                # Wrong month spelling for this format, or an impossible
                # day; try the next format, then the next match.
                continue
    return None
|
||||
|
||||
|
||||
def _try_html_summary_table(nested: object, on_date: datetime) -> Activity | None:
|
||||
"""Interpret a leaf <table> as a single trade summary.
|
||||
|
||||
Returns None if the table is structural (no "Bought N @ £P" row) or
|
||||
any required field is missing.
|
||||
"""
|
||||
get_text = getattr(nested, "get_text", None)
|
||||
if get_text is None:
|
||||
return None
|
||||
text = get_text(" ", strip=True)
|
||||
bought = _BOUGHT_RE.search(text)
|
||||
if bought is None:
|
||||
return None
|
||||
symbol = _extract_html_symbol(nested)
|
||||
if symbol is None:
|
||||
return None
|
||||
quantity = Decimal(bought.group(1))
|
||||
unit_price = Decimal(bought.group(2))
|
||||
return _build_activity(
|
||||
on_date=on_date,
|
||||
symbol=symbol,
|
||||
quantity=quantity,
|
||||
unit_price=unit_price,
|
||||
strategy="html",
|
||||
matched=text[:200],
|
||||
)
|
||||
|
||||
|
||||
def _extract_html_symbol(nested: object) -> str | None:
|
||||
find_all = getattr(nested, "find_all", None)
|
||||
if find_all is None:
|
||||
return None
|
||||
for cell in find_all("td"):
|
||||
cell_text = cell.get_text(" ", strip=True)
|
||||
m = _TICKER_RE.search(cell_text)
|
||||
if m is not None:
|
||||
return m.group(1)
|
||||
return None
|
||||
|
||||
|
||||
_CSV_CONTENT_TYPES = {"text/csv", "application/csv", "application/vnd.ms-excel"}
|
||||
# Required columns for the CSV attachment strategy. IE has not (yet) sent
|
||||
# CSV-attached statements in production — the column set here mirrors the
|
||||
# upstream _extract_positions_csv contract (ticker, buy_price, num_shares,
|
||||
# buy_date, currency) with modern names.
|
||||
_CSV_COLUMNS = {"ticker", "unit_price", "quantity", "date", "currency"}
|
||||
|
||||
|
||||
def _parse_csv_attachment(raw_email: bytes) -> list[Activity]:
    """Parse a CSV attachment from the email into Activity records.

    Uses the text of the first CSV-looking MIME part that has a decodable
    payload. Returns [] when there is no such part or when the header row
    lacks any required column. Rows that cannot be converted are skipped.
    If the CSV reader itself fails part-way through, the rows converted
    up to that point are returned.
    """
    msg = email.message_from_bytes(raw_email)
    csv_text = _extract_csv_attachment_text(msg)
    if csv_text is None:
        return []

    # A UTF-8 byte-order mark survives decoding as U+FEFF and would
    # otherwise be glued onto the first header name, failing the column
    # check below for an otherwise valid file.
    reader = csv.DictReader(io.StringIO(csv_text.lstrip("\ufeff")))

    activities: list[Activity] = []
    try:
        # Reading `fieldnames` consumes the header row, so it sits inside
        # the try block together with the row iteration.
        if not _CSV_COLUMNS.issubset(reader.fieldnames or ()):
            return []
        for row in reader:
            activity = _csv_row_to_activity(row)
            if activity is not None:
                activities.append(activity)
    except csv.Error:
        # Structurally broken CSV (e.g. an over-long field): keep what
        # was converted so far instead of raising.
        return activities
    return activities
|
||||
|
||||
|
||||
def _extract_csv_attachment_text(msg: Message) -> str | None:
    """Return the text of the first CSV-looking part with a payload.

    Parts are visited in ``msg.walk()`` order. A CSV-looking part whose
    decoded payload is neither bytes nor str is passed over and the
    search continues. Bytes are decoded with the part's declared charset
    (UTF-8 when none is declared), replacing undecodable bytes.
    """
    for part in msg.walk():
        if not _looks_like_csv_part(part):
            continue
        payload = part.get_payload(decode=True)
        if isinstance(payload, bytes):
            charset = part.get_content_charset() or "utf-8"
            try:
                return payload.decode(charset, errors="replace")
            except LookupError:
                # Unknown charset label in the header: fall back to
                # UTF-8 rather than letting the lookup failure escape.
                return payload.decode("utf-8", errors="replace")
        if isinstance(payload, str):
            return payload
    return None
|
||||
|
||||
|
||||
def _looks_like_csv_part(part: Message) -> bool:
    """Tell whether a MIME part is CSV by content type or file name.

    A part qualifies when its content type is one of the known CSV types
    or, failing that, when it has a file name ending in ".csv"
    (compared case-insensitively).
    """
    if part.get_content_type() in _CSV_CONTENT_TYPES:
        return True
    attachment_name = part.get_filename()
    if not isinstance(attachment_name, str):
        return False
    return attachment_name.lower().endswith(".csv")
|
||||
|
||||
|
||||
def _csv_row_to_activity(row: dict[str, str]) -> Activity | None:
|
||||
try:
|
||||
on_date = datetime.strptime(row["date"], "%Y-%m-%d")
|
||||
symbol = row["ticker"].strip()
|
||||
quantity = Decimal(row["quantity"])
|
||||
unit_price = Decimal(row["unit_price"])
|
||||
currency = row["currency"].strip() or "GBP"
|
||||
except (KeyError, ValueError, InvalidOperation):
|
||||
return None
|
||||
if not symbol or currency != "GBP":
|
||||
return None
|
||||
return _build_activity(
|
||||
on_date=on_date,
|
||||
symbol=symbol,
|
||||
quantity=quantity,
|
||||
unit_price=unit_price,
|
||||
strategy="csv",
|
||||
matched=f"{symbol},{unit_price},{quantity},{row['date']}",
|
||||
)
|
||||
|
||||
|
||||
def _build_activity(
|
||||
*,
|
||||
on_date: datetime,
|
||||
|
|
|
|||
22
tests/fixtures/invest_engine/csv_attachment.eml
vendored
Normal file
22
tests/fixtures/invest_engine/csv_attachment.eml
vendored
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
From: InvestEngine <no-reply@investengine.com>
|
||||
To: viktorbarzin@example.com
|
||||
Subject: Your InvestEngine statement
|
||||
Date: Mon, 07 Apr 2025 09:00:00 +0000
|
||||
MIME-Version: 1.0
|
||||
Content-Type: multipart/mixed; boundary="----=_MIXED_1"
|
||||
|
||||
------=_MIXED_1
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
|
||||
Your monthly statement is attached as a CSV.
|
||||
|
||||
------=_MIXED_1
|
||||
Content-Type: text/csv; charset=UTF-8; name="statement.csv"
|
||||
Content-Disposition: attachment; filename="statement.csv"
|
||||
|
||||
ticker,unit_price,quantity,date,currency
|
||||
VUAG,63.21,12.5,2025-04-02,GBP
|
||||
SWDA,86.40,4.75,2025-04-03,GBP
|
||||
VUSA,90.10,1.0,2025-04-04,GBP
|
||||
|
||||
------=_MIXED_1--
|
||||
40
tests/fixtures/invest_engine/html_partial_match.eml
vendored
Normal file
40
tests/fixtures/invest_engine/html_partial_match.eml
vendored
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
From: InvestEngine <no-reply@investengine.com>
|
||||
To: viktorbarzin@example.com
|
||||
Subject: Your portfolio has been updated
|
||||
Date: Wed, 15 Apr 2026 11:00:00 +0000
|
||||
MIME-Version: 1.0
|
||||
Content-Type: multipart/alternative; boundary="----=_Part_PM"
|
||||
|
||||
------=_Part_PM
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
|
||||
(HTML-only view — your client does not render HTML emails.)
|
||||
|
||||
------=_Part_PM
|
||||
Content-Type: text/html; charset=UTF-8
|
||||
|
||||
<html><body>
|
||||
<table><tr><td>Logo</td></tr></table>
|
||||
<table>
|
||||
<tr><td> Date: 15 April 2026 </td></tr>
|
||||
<tr>
|
||||
<td>
|
||||
<table>
|
||||
<tr><td>Vanguard S&P 500: VUAG</td></tr>
|
||||
<tr><td>Bought 3.0 @ £61.25 per share</td></tr>
|
||||
<tr><td>Total: £183.75</td></tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<table>
|
||||
<tr><td>Some broken order with no ticker and no bought line</td></tr>
|
||||
<tr><td>(Malformed — IE dropped a row mid-render)</td></tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</body></html>
|
||||
|
||||
------=_Part_PM--
|
||||
55
tests/fixtures/invest_engine/html_two_orders.eml
vendored
Normal file
55
tests/fixtures/invest_engine/html_two_orders.eml
vendored
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
From: InvestEngine <no-reply@investengine.com>
|
||||
To: viktorbarzin@example.com
|
||||
Subject: Your portfolio has been updated
|
||||
Date: Wed, 01 Apr 2026 09:15:00 +0000
|
||||
MIME-Version: 1.0
|
||||
Content-Type: multipart/alternative; boundary="----=_Part_1"
|
||||
|
||||
------=_Part_1
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
|
||||
(HTML-only view — your client does not render HTML emails.)
|
||||
|
||||
------=_Part_1
|
||||
Content-Type: text/html; charset=UTF-8
|
||||
|
||||
<html><head><title>InvestEngine</title></head><body>
|
||||
<table><tr><td>Header logo</td></tr></table>
|
||||
<table>
|
||||
<tr><td>Client name: Redacted</td></tr>
|
||||
<tr><td>Trading venue: London Stock Exchange</td></tr>
|
||||
<tr><td>Type: Market Order(s)</td></tr>
|
||||
<tr><td>Here's a summary of the trades we've made for you</td></tr>
|
||||
<tr>
|
||||
<td>a</td><td>b</td><td>c</td><td>d</td>
|
||||
<td> Date: 01 April 2026 </td>
|
||||
</tr>
|
||||
<tr><td>filler</td></tr>
|
||||
<tr><td>filler</td></tr>
|
||||
<tr><td>filler</td></tr>
|
||||
<tr><td>filler</td></tr>
|
||||
<tr><td>filler</td></tr>
|
||||
<tr>
|
||||
<td>
|
||||
<table>
|
||||
<tr><td>Vanguard S&P 500: VUAG</td></tr>
|
||||
<tr><td>Bought 10.5 @ £62.10 per share</td></tr>
|
||||
<tr><td>Total: £652.05</td></tr>
|
||||
<tr><td>ISIN: IE00BFMXXD54, Order ID: 300000/4000001, Traded at 9:05am GMT</td></tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<table>
|
||||
<tr><td>iShares Core MSCI World: SWDA</td></tr>
|
||||
<tr><td>Bought 2.25 @ £85.40 per share</td></tr>
|
||||
<tr><td>Total: £192.15</td></tr>
|
||||
<tr><td>ISIN: IE00B4L5Y983, Order ID: 300000/4000002, Traded at 9:06am GMT</td></tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</body></html>
|
||||
|
||||
------=_Part_1--
|
||||
15
tests/fixtures/invest_engine/unparseable.eml
vendored
Normal file
15
tests/fixtures/invest_engine/unparseable.eml
vendored
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
From: InvestEngine <no-reply@investengine.com>
|
||||
To: viktorbarzin@example.com
|
||||
Subject: InvestEngine newsletter
|
||||
Date: Thu, 10 Apr 2025 12:00:00 +0000
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
|
||||
Hi Viktor,
|
||||
|
||||
This is a newsletter, not a trade confirmation. There is no structured
|
||||
order data here — just marketing copy and a promo for a new feature we
|
||||
are rolling out. Thanks for being a customer.
|
||||
|
||||
Cheers,
|
||||
The InvestEngine team
|
||||
|
|
@ -42,3 +42,67 @@ def test_rfc2822_notes_record_parse_strategy() -> None:
|
|||
a = parse_invest_engine_email(_load("rfc2822_v2_single_buy.eml"))[0]
|
||||
assert a.notes is not None
|
||||
assert "rfc2822" in a.notes
|
||||
|
||||
|
||||
# -- HTML table body (multipart/alternative, two orders) --
|
||||
|
||||
|
||||
def test_html_body_parses_both_orders() -> None:
    """Both order tables in the two-order HTML fixture are parsed."""
    parsed = parse_invest_engine_email(_load("html_two_orders.eml"))
    assert len(parsed) == 2
    first, second = parsed

    # First order: VUAG, checked in full including account metadata.
    assert first.symbol == "VUAG"
    assert first.quantity == Decimal("10.5")
    assert first.unit_price == Decimal("62.10")
    assert first.date == datetime(2026, 4, 1)
    assert first.account_id == "invest-engine-primary"
    assert first.account_type is AccountType.ISA
    assert first.activity_type is ActivityType.BUY

    # Second order: SWDA, sharing the document-level trade date.
    assert second.symbol == "SWDA"
    assert second.quantity == Decimal("2.25")
    assert second.unit_price == Decimal("85.40")
    assert second.date == datetime(2026, 4, 1)
|
||||
|
||||
|
||||
def test_html_notes_record_html_strategy() -> None:
    """Activities from the HTML path mention "html" in their notes."""
    parsed = parse_invest_engine_email(_load("html_two_orders.eml"))
    first = parsed[0]
    assert first.notes is not None
    assert "html" in first.notes
|
||||
|
||||
|
||||
# -- CSV attachment body --
|
||||
|
||||
|
||||
def test_csv_attachment_parses_all_rows() -> None:
    """Every row of the CSV attachment fixture becomes a GBP BUY."""
    parsed = parse_invest_engine_email(_load("csv_attachment.eml"))
    assert len(parsed) == 3

    lookup = {item.symbol: item for item in parsed}
    assert lookup["VUAG"].quantity == Decimal("12.5")
    assert lookup["VUAG"].unit_price == Decimal("63.21")
    assert lookup["VUAG"].date == datetime(2025, 4, 2)
    assert lookup["SWDA"].quantity == Decimal("4.75")
    assert lookup["VUSA"].date == datetime(2025, 4, 4)

    # Fields that must hold for every row regardless of symbol.
    for item in parsed:
        assert item.activity_type is ActivityType.BUY
        assert item.currency == "GBP"
        assert item.account_id == "invest-engine-primary"
        assert item.account_type is AccountType.ISA
        assert item.notes is not None
        assert "csv" in item.notes
|
||||
|
||||
|
||||
# -- graceful failure modes --
|
||||
|
||||
|
||||
def test_unparseable_email_returns_empty_list() -> None:
    """A plain newsletter with no order data parses to an empty list."""
    parsed = parse_invest_engine_email(_load("unparseable.eml"))
    assert parsed == []
|
||||
|
||||
|
||||
def test_html_partial_match_returns_only_parseable_orders() -> None:
    """Only the intact order of a half-broken HTML email is returned."""
    parsed = parse_invest_engine_email(_load("html_partial_match.eml"))
    assert len(parsed) == 1
    only = parsed[0]
    assert only.symbol == "VUAG"
    assert only.quantity == Decimal("3.0")
    assert only.unit_price == Decimal("61.25")
    assert only.date == datetime(2026, 4, 15)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue