Add CSV attachment fallback for InvestEngine email parser
Context: IE has not (yet) sent CSV-attached statements in production,
but the upstream parser had _extract_positions_csv as a third fallback
for exactly this case. Keeping the fallback preserves behaviour-parity
with the legacy parser and makes future statement support one fixture
away — the shape is documented by column set, not scraped live.
Unlike the upstream, which split the body on whitespace and broke on any
embedded commas in names, this port walks real MIME attachments using
Python's csv.DictReader. A part qualifies as CSV if:
- its Content-Type is text/csv / application/csv / application/vnd.ms-excel, OR
- its filename ends in .csv (defence against IE mis-labelling the part)
Rows missing required columns or containing unparseable numbers/dates
are skipped silently — consistent with the "partial match" contract:
a half-corrupt CSV yields whatever rows were intact. Required columns:
ticker, unit_price, quantity, date (YYYY-MM-DD), currency. Non-GBP
rows are filtered because the IE ISA is strictly sterling — flagging
this assumption in the review notes.
This change:
- Adds `_parse_csv_attachment(raw_email)` as the third strategy after
text/plain and text/html; it re-parses the raw email bytes so we can
inspect Content-Type/filename on each part.
- Strips whitespace from symbols/currencies, filters non-GBP, and runs each row through
the shared `_build_activity` so external_id formation matches every
other strategy (dedup stays consistent across strategies).
- Fixture `csv_attachment.eml` has three rows (VUAG, SWDA, VUSA) in a
`text/csv` part with a `.csv` filename — covers both detection paths.
Test plan:
poetry run pytest tests/providers/parsers/ -q → 6 passed in 0.15s
poetry run mypy broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → clean
poetry run ruff check broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → All checks passed!
poetry run yapf --diff → clean (no diff)
Manual verification: load csv_attachment.eml, call parse_invest_engine_email,
assert 3 activities each with symbol in {VUAG,SWDA,VUSA}, currency=GBP,
notes containing "csv".
This commit is contained in:
parent
72d348e294
commit
020ba16723
3 changed files with 123 additions and 3 deletions
|
|
@ -16,11 +16,13 @@ Every parse strategy produces canonical `Activity` objects with:
|
|||
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import email
|
||||
import hashlib
|
||||
import io
|
||||
import re
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from email.message import Message
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
|
@ -48,8 +50,9 @@ _DATE_RE = re.compile(
|
|||
def parse_invest_engine_email(raw_email: bytes) -> list[Activity]:
|
||||
"""Parse an IE trade confirmation email into Activity records.
|
||||
|
||||
Tries RFC 2822 body lines first, then HTML tables. Returns an empty
|
||||
list when nothing matches — never raises on malformed input.
|
||||
Tries RFC 2822 body lines first, then HTML tables, then a CSV
|
||||
attachment. Returns an empty list when nothing matches — never
|
||||
raises on malformed input.
|
||||
"""
|
||||
msg = email.message_from_bytes(raw_email)
|
||||
text_body = _extract_part_body(msg, "text/plain")
|
||||
|
|
@ -62,6 +65,9 @@ def parse_invest_engine_email(raw_email: bytes) -> list[Activity]:
|
|||
activities = _parse_html_tables(html_body)
|
||||
if activities:
|
||||
return activities
|
||||
csv_activities = _parse_csv_attachment(raw_email)
|
||||
if csv_activities:
|
||||
return csv_activities
|
||||
return []
|
||||
|
||||
|
||||
|
|
@ -223,6 +229,77 @@ def _extract_html_symbol(nested: object) -> str | None:
|
|||
return None
|
||||
|
||||
|
||||
# MIME content types that mark a part as a CSV attachment.
_CSV_CONTENT_TYPES = {
    "text/csv",
    "application/csv",
    "application/vnd.ms-excel",
}

# Header columns a CSV attachment must provide before any row is parsed.
# IE has not (yet) sent CSV-attached statements in production; this set
# is the upstream _extract_positions_csv contract (ticker, buy_price,
# num_shares, buy_date, currency) expressed with modern column names.
_CSV_COLUMNS = {
    "ticker",
    "unit_price",
    "quantity",
    "date",
    "currency",
}
|
||||
|
||||
|
||||
def _parse_csv_attachment(raw_email: bytes) -> list[Activity]:
    """Parse a CSV attachment from the email into Activity records.

    Re-parses the raw message, takes the text returned by
    `_extract_csv_attachment_text`, and reads it with `csv.DictReader`.
    The header must contain every name in `_CSV_COLUMNS`; otherwise the
    attachment is ignored. Each data row is converted by
    `_csv_row_to_activity`, and rows it rejects are skipped.

    Never raises on malformed CSV: if the csv module reports an error
    (for example an oversized field), the rows converted up to that point
    are returned, so a half-corrupt attachment still yields its intact rows.

    Args:
        raw_email: The complete RFC 2822 message as bytes.

    Returns:
        The activities built from the attachment, or an empty list when
        there is no CSV attachment, the header is incomplete, or no row
        could be converted.
    """
    msg = email.message_from_bytes(raw_email)
    csv_text = _extract_csv_attachment_text(msg)
    if csv_text is None:
        return []

    activities: list[Activity] = []
    try:
        # newline="" lets the csv module do its own line splitting, so CR-only
        # line endings and newlines inside quoted fields are handled correctly.
        reader = csv.DictReader(io.StringIO(csv_text, newline=""))
        # Accessing fieldnames reads the header row, which may itself raise.
        if not _CSV_COLUMNS.issubset(reader.fieldnames or ()):
            return []
        for row in reader:
            # DictReader fills cells missing from a short row with None;
            # such a row lacks a required column, so it is skipped here.
            if any(row.get(column) is None for column in _CSV_COLUMNS):
                continue
            activity = _csv_row_to_activity(row)
            if activity is not None:
                activities.append(activity)
    except csv.Error:
        # Structurally broken CSV: keep whatever was parsed before the error.
        return activities
    return activities
|
||||
|
||||
|
||||
def _extract_csv_attachment_text(msg: Message) -> str | None:
    """Return the text of the first CSV-looking MIME part that has a payload.

    Walks every part of `msg` and considers only those accepted by
    `_looks_like_csv_part`. A matching part with no decodable payload
    (`get_payload(decode=True)` returned neither bytes nor str) is passed
    over and the walk continues.

    Bytes are decoded with the part's declared charset, defaulting to
    UTF-8, and undecodable bytes are replaced rather than raising. If the
    declared charset is not a codec Python knows, UTF-8 is used instead.
    A leading byte-order mark is removed so it cannot become part of the
    first header name.

    Args:
        msg: The parsed email message to search.

    Returns:
        The attachment text, or None when no CSV-looking part yields one.
    """
    for part in msg.walk():
        if not _looks_like_csv_part(part):
            continue
        payload = part.get_payload(decode=True)
        if isinstance(payload, bytes):
            charset = part.get_content_charset() or "utf-8"
            try:
                text = payload.decode(charset, errors="replace")
            except LookupError:
                # The header names a charset with no registered codec;
                # fall back instead of letting the parser raise.
                text = payload.decode("utf-8", errors="replace")
        elif isinstance(payload, str):
            text = payload
        else:
            continue
        # A BOM would otherwise turn the first column name into
        # "\ufeffticker" and fail the required-columns check.
        if text.startswith("\ufeff"):
            text = text[1:]
        return text
    return None
|
||||
|
||||
|
||||
def _looks_like_csv_part(part: Message) -> bool:
    """Report whether a MIME part should be treated as a CSV attachment.

    A part qualifies when its content type is listed in
    `_CSV_CONTENT_TYPES`. Failing that, it qualifies when it carries a
    filename ending in `.csv`, compared case-insensitively.
    """
    if part.get_content_type() not in _CSV_CONTENT_TYPES:
        # Content type did not match; fall back to the filename check.
        attachment_name = part.get_filename()
        if not isinstance(attachment_name, str):
            return False
        return attachment_name.lower().endswith(".csv")
    return True
|
||||
|
||||
|
||||
def _csv_row_to_activity(row: dict[str, str]) -> Activity | None:
    """Convert one CSV row into an Activity, or None if the row is unusable.

    Reads the `date` (YYYY-MM-DD), `ticker`, `quantity`, `unit_price` and
    `currency` cells. A blank currency is treated as "GBP". The row is
    rejected, never raising, when:

    - a required key is absent or its value is not a string (DictReader
      uses None for cells missing from a short row),
    - the date or either number cannot be parsed,
    - a number parses but is not finite (NaN / Infinity),
    - the ticker is blank, or
    - the currency is anything other than exactly "GBP".

    The currency comparison is deliberately case-sensitive: upper-casing
    would fold pence-style codes such as "GBp" into pounds.

    Args:
        row: One record as produced by `csv.DictReader`.

    Returns:
        The Activity produced by `_build_activity` with strategy "csv",
        or None when the row is rejected.
    """
    try:
        on_date = datetime.strptime(row["date"], "%Y-%m-%d")
        symbol = row["ticker"].strip()
        quantity = Decimal(row["quantity"])
        unit_price = Decimal(row["unit_price"])
        currency = row["currency"].strip() or "GBP"
    except (KeyError, ValueError, TypeError, AttributeError, InvalidOperation):
        # TypeError/AttributeError cover None cells from truncated rows.
        return None
    # Decimal accepts "NaN" and "Infinity"; neither is a usable trade figure.
    if not (quantity.is_finite() and unit_price.is_finite()):
        return None
    if not symbol or currency != "GBP":
        return None
    return _build_activity(
        on_date=on_date,
        symbol=symbol,
        quantity=quantity,
        unit_price=unit_price,
        strategy="csv",
        matched=f"{symbol},{unit_price},{quantity},{row['date']}",
    )
|
||||
|
||||
|
||||
def _build_activity(
|
||||
*,
|
||||
on_date: datetime,
|
||||
|
|
|
|||
22
tests/fixtures/invest_engine/csv_attachment.eml
vendored
Normal file
22
tests/fixtures/invest_engine/csv_attachment.eml
vendored
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
From: InvestEngine <no-reply@investengine.com>
|
||||
To: viktorbarzin@example.com
|
||||
Subject: Your InvestEngine statement
|
||||
Date: Mon, 07 Apr 2025 09:00:00 +0000
|
||||
MIME-Version: 1.0
|
||||
Content-Type: multipart/mixed; boundary="----=_MIXED_1"
|
||||
|
||||
------=_MIXED_1
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
|
||||
Your monthly statement is attached as a CSV.
|
||||
|
||||
------=_MIXED_1
|
||||
Content-Type: text/csv; charset=UTF-8; name="statement.csv"
|
||||
Content-Disposition: attachment; filename="statement.csv"
|
||||
|
||||
ticker,unit_price,quantity,date,currency
|
||||
VUAG,63.21,12.5,2025-04-02,GBP
|
||||
SWDA,86.40,4.75,2025-04-03,GBP
|
||||
VUSA,90.10,1.0,2025-04-04,GBP
|
||||
|
||||
------=_MIXED_1--
|
||||
|
|
@ -68,3 +68,24 @@ def test_html_notes_record_html_strategy() -> None:
|
|||
a = parse_invest_engine_email(_load("html_two_orders.eml"))[0]
|
||||
assert a.notes is not None
|
||||
assert "html" in a.notes
|
||||
|
||||
|
||||
# -- CSV attachment body --
|
||||
|
||||
|
||||
def test_csv_attachment_parses_all_rows() -> None:
    """The CSV fixture yields three GBP ISA buys tagged with the csv strategy."""
    parsed = parse_invest_engine_email(_load("csv_attachment.eml"))
    assert len(parsed) == 3

    # Index by ticker so each row can be checked independently of row order.
    lookup = {activity.symbol: activity for activity in parsed}
    vuag = lookup["VUAG"]
    assert vuag.quantity == Decimal("12.5")
    assert vuag.unit_price == Decimal("63.21")
    assert vuag.date == datetime(2025, 4, 2)
    assert lookup["SWDA"].quantity == Decimal("4.75")
    assert lookup["VUSA"].date == datetime(2025, 4, 4)

    # Fields shared by every activity the CSV strategy produces.
    for activity in parsed:
        assert activity.activity_type is ActivityType.BUY
        assert activity.currency == "GBP"
        assert activity.account_id == "invest-engine-primary"
        assert activity.account_type is AccountType.ISA
        assert activity.notes is not None
        assert "csv" in activity.notes
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue