Add InvestEngine email parser — RFC 2822 v1/v2 line format
Context: The old finance/ app had a 324-line IE message parser with four
line-based variants (v1/v2/v3/v4) plus an HTML strategy and a CSV
fallback. Port into broker-sync so we can consume IE trade confirmation
emails as a backup to the live HTTP client (Phase 2b) while IE's public
API remains Bearer-only.
The upstream parser emits storage.model.Position; we emit canonical
Activity with the broker-sync invariants: account_id="invest-engine-primary"
(sink remaps to Wealthfolio UUID), account_type=ISA, currency=GBP, and
external_id="invest-engine:<fingerprint>" where the fingerprint is a
SHA-256 of (date|symbol|quantity|unit_price) — deterministic so repeat
imports of the same email dedup at the sync-record layer.
This change:
- Top-level `parse_invest_engine_email(raw_email: bytes) -> list[Activity]`
extracts the text/plain body from an RFC 2822 message and dispatches to
the line-based parser.
- `_parse_rfc2822_lines(body)` tries the v2 layout first (newer IE format
where `Date: DD Month` is on line 2 and the year on line 3), then the
v1 layout (where the day alone is on line 2 and `Month YYYY` on line 3).
v3 and v4 variants will be re-added in a follow-up if we find fixtures
where they matter — initial fixture coverage hits v2.
- Drops the upstream `_ticker_post_processing` VUAG→VUAG.L hack.
Wealthfolio's /import/check endpoint resolves exchange suffixes; the
Trading212 provider also emits suffix-free tickers (e.g. `VUAG`), so
staying consistent avoids double-mapping.
- Notes field records the parse-strategy tag ("rfc2822-v2") plus the
matched line for debugging.
Test plan:
poetry run pytest tests/providers/parsers/ -q
→ 3 passed in 0.03s
poetry run mypy broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py
→ Success: no issues found in 2 source files
poetry run ruff check broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py
→ All checks passed!
poetry run yapf --diff broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py
→ clean (no diff)
Manual verification: load the fixture email, call the parser, and check that
the returned Activity has symbol=VUAG, quantity=59.539562, unit_price=60.46,
date=2023-01-17, and an external_id starting with invest-engine:.
2026-04-17 21:49:52 +00:00
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
from decimal import Decimal
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
from broker_sync.models import AccountType, ActivityType
|
|
|
|
|
from broker_sync.providers.parsers.invest_engine import parse_invest_engine_email
|
|
|
|
|
|
|
|
|
|
# Directory holding the .eml fixtures loaded by these tests: the third
# ancestor directory of this file, then fixtures/invest_engine.
_FIXTURES = Path(__file__).parent.parent.parent.joinpath("fixtures", "invest_engine")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _load(name: str) -> bytes:
    """Return the raw bytes of the fixture file *name* under ``_FIXTURES``."""
    return _FIXTURES.joinpath(name).read_bytes()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# -- RFC 2822 body (v2-style, single BUY) --
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_rfc2822_single_buy_parses_to_one_activity() -> None:
    """The v2-style single-BUY fixture parses to exactly one BUY activity."""
    raw_email = _load("rfc2822_v2_single_buy.eml")
    parsed = parse_invest_engine_email(raw_email)
    assert len(parsed) == 1

    buy = parsed[0]
    # Trade details taken from the email body.
    assert buy.activity_type is ActivityType.BUY
    assert buy.symbol == "VUAG"
    assert buy.quantity == Decimal("59.539562")
    assert buy.unit_price == Decimal("60.46")
    assert buy.currency == "GBP"
    assert buy.date == datetime(2023, 1, 17)
    # Account fields expected on every parsed activity.
    assert buy.account_id == "invest-engine-primary"
    assert buy.account_type is AccountType.ISA
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_rfc2822_external_id_is_deterministic() -> None:
    """Parsing the same fixture twice gives the same prefixed external_id."""
    fixture_name = "rfc2822_v2_single_buy.eml"
    # Two fully independent load + parse passes over the same email.
    first_pass = parse_invest_engine_email(_load(fixture_name))[0]
    second_pass = parse_invest_engine_email(_load(fixture_name))[0]

    assert first_pass.external_id == second_pass.external_id
    assert first_pass.external_id.startswith("invest-engine:")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_rfc2822_notes_record_parse_strategy() -> None:
    """Notes on an RFC 2822-parsed activity mention the "rfc2822" strategy."""
    parsed = parse_invest_engine_email(_load("rfc2822_v2_single_buy.eml"))
    activity = parsed[0]
    # Notes must be populated before the substring check makes sense.
    assert activity.notes is not None
    assert "rfc2822" in activity.notes
|
2026-04-17 21:58:15 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
# -- HTML table body (multipart/alternative, two orders) --
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_html_body_parses_both_orders() -> None:
    """The two-order HTML fixture yields both orders, in order, with details."""
    parsed = parse_invest_engine_email(_load("html_two_orders.eml"))
    assert len(parsed) == 2
    first_order, second_order = parsed

    # First order: VUAG, checked in full including the account fields.
    assert first_order.symbol == "VUAG"
    assert first_order.quantity == Decimal("10.5")
    assert first_order.unit_price == Decimal("62.10")
    assert first_order.date == datetime(2026, 4, 1)
    assert first_order.account_id == "invest-engine-primary"
    assert first_order.account_type is AccountType.ISA
    assert first_order.activity_type is ActivityType.BUY

    # Second order: SWDA, trade details only.
    assert second_order.symbol == "SWDA"
    assert second_order.quantity == Decimal("2.25")
    assert second_order.unit_price == Decimal("85.40")
    assert second_order.date == datetime(2026, 4, 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_html_notes_record_html_strategy() -> None:
    """Notes on an HTML-parsed activity mention the "html" strategy."""
    parsed = parse_invest_engine_email(_load("html_two_orders.eml"))
    activity = parsed[0]
    # Notes must be populated before the substring check makes sense.
    assert activity.notes is not None
    assert "html" in activity.notes
|
Add CSV attachment fallback for InvestEngine email parser
Context: IE has not (yet) sent CSV-attached statements in production,
but the upstream parser had _extract_positions_csv as a third fallback
for exactly this case. Keeping the fallback preserves behaviour-parity
with the legacy parser and makes future statement support one fixture
away — the shape is documented by column set, not scraped live.
Unlike the upstream which split the body on whitespace and broke on any
embedded commas in names, this port walks real MIME attachments using
Python's csv.DictReader. A part qualifies as CSV if:
- its Content-Type is text/csv / application/csv / application/vnd.ms-excel, OR
- its filename ends in .csv (defence against IE mis-labelling the part)
Rows missing required columns or containing unparseable numbers/dates
are skipped silently — consistent with the "partial match" contract:
a half-corrupt CSV yields whatever rows were intact. Required columns:
ticker, unit_price, quantity, date (YYYY-MM-DD), currency. Non-GBP
rows are filtered because the IE ISA is strictly sterling — flagging
this assumption in the review notes.
This change:
- Adds `_parse_csv_attachment(raw_email)` as the third strategy after
text/plain and text/html; it re-parses the raw email bytes so we can
inspect Content-Type/filename on each part.
- Flags symbols/currencies, filters non-GBP, and runs each row through
the shared `_build_activity` so external_id formation matches every
other strategy (dedup stays consistent across strategies).
- Fixture `csv_attachment.eml` has three rows (VUAG, SWDA, VUSA) in a
`text/csv` part with a `.csv` filename — covers both detection paths.
Test plan:
poetry run pytest tests/providers/parsers/ -q → 6 passed in 0.15s
poetry run mypy broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → clean
poetry run ruff check broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → All checks passed!
poetry run yapf --diff → clean (no diff)
Manual verification: load csv_attachment.eml, call parse_invest_engine_email,
assert 3 activities each with symbol in {VUAG,SWDA,VUSA}, currency=GBP,
notes containing "csv".
2026-04-17 22:01:46 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
# -- CSV attachment body --
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_csv_attachment_parses_all_rows() -> None:
    """The CSV-attachment fixture yields three GBP ISA BUY rows tagged "csv"."""
    rows = parse_invest_engine_email(_load("csv_attachment.eml"))
    assert len(rows) == 3

    # Index by ticker so the checks do not depend on row order.
    by_symbol = {}
    for row in rows:
        by_symbol[row.symbol] = row

    # Spot-check individual rows.
    assert by_symbol["VUAG"].quantity == Decimal("12.5")
    assert by_symbol["VUAG"].unit_price == Decimal("63.21")
    assert by_symbol["VUAG"].date == datetime(2025, 4, 2)
    assert by_symbol["SWDA"].quantity == Decimal("4.75")
    assert by_symbol["VUSA"].date == datetime(2025, 4, 4)

    # Fields that must hold for every row.
    for row in rows:
        assert row.activity_type is ActivityType.BUY
        assert row.currency == "GBP"
        assert row.account_id == "invest-engine-primary"
        assert row.account_type is AccountType.ISA
        assert row.notes is not None
        assert "csv" in row.notes
|
2026-04-17 22:02:48 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
# -- graceful failure modes --
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_unparseable_email_returns_empty_list() -> None:
    """The unparseable fixture produces an empty list."""
    parsed = parse_invest_engine_email(_load("unparseable.eml"))
    assert parsed == []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_html_partial_match_returns_only_parseable_orders() -> None:
    """The partial-match HTML fixture yields only its single VUAG order."""
    parsed = parse_invest_engine_email(_load("html_partial_match.eml"))
    assert len(parsed) == 1

    surviving_order = parsed[0]
    assert surviving_order.symbol == "VUAG"
    assert surviving_order.quantity == Decimal("3.0")
    assert surviving_order.unit_price == Decimal("61.25")
    assert surviving_order.date == datetime(2026, 4, 15)
|