broker-sync/broker_sync/providers/parsers/invest_engine.py
Viktor Barzin ea15b80111 Add InvestEngine email parser — RFC 2822 v1/v2 line format
Context: The old finance/ app had a 324-line IE message parser with four
line-based variants (v1/v2/v3/v4) plus an HTML strategy and a CSV
fallback. Port into broker-sync so we can consume IE trade confirmation
emails as a backup to the live HTTP client (Phase 2b) while IE's public
API remains Bearer-only.

The upstream parser emits storage.model.Position; we emit canonical
Activity with the broker-sync invariants: account_id="invest-engine-primary"
(sink remaps to Wealthfolio UUID), account_type=ISA, currency=GBP, and
external_id="invest-engine:<fingerprint>" where the fingerprint is a
SHA-256 of (date|symbol|quantity|unit_price) — deterministic so repeat
imports of the same email dedup at the sync-record layer.

This change:
- Top-level `parse_invest_engine_email(raw_email: bytes) -> list[Activity]`
  extracts the text/plain body from an RFC 2822 message and dispatches to
  the line-based parser.
- `_parse_rfc2822_lines(body)` tries the v2 layout first (newer IE format
  where `Date: DD Month` is on line 2 and the year on line 3), then the
  v1 layout (where the day alone is on line 2 and `Month YYYY` on line 3).
  v3 and v4 variants will be re-added in a follow-up if we find fixtures
  where they matter — the initial fixture only exercises v2.
- Drops the upstream `_ticker_post_processing` VUAG→VUAG.L hack.
  Wealthfolio's /import/check endpoint resolves exchange suffixes; the
  Trading212 provider also emits suffix-free tickers (e.g. `VUAG`), so
  staying consistent avoids double-mapping.
- Notes field records the parse-strategy tag ("rfc2822-v2") plus the
  matched line for debugging.

Test plan:
  poetry run pytest tests/providers/parsers/ -q
  → 3 passed in 0.03s
  poetry run mypy broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py
  → Success: no issues found in 2 source files
  poetry run ruff check broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py
  → All checks passed!
  poetry run yapf --diff broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py
  → clean (no diff)

Manual verification: load the fixture email, call the parser, and confirm
that the returned Activity has symbol=VUAG, quantity=59.539562,
unit_price=60.46, date=2023-01-17, and an external_id that starts with
invest-engine:.
2026-04-17 21:49:52 +00:00

150 lines
5 KiB
Python

"""InvestEngine email parser.
IE mails the user after each trade batch. The body shape varies — over
the years IE has sent trade confirmations as plain-text RFC 2822
messages, multipart HTML emails with a summary table, and (for older
statements) CSV attachments. Only the plain-text strategy is implemented
in this module so far: the line-based v2 layout is tried first, then v1,
and the first layout that matches yields a single Activity.
Every parse strategy produces canonical `Activity` objects with:
- `account_id = "invest-engine-primary"` (sink remaps to Wealthfolio UUID)
- `account_type = AccountType.ISA` (Viktor's IE account is an ISA)
- `currency = "GBP"`
- `external_id = f"invest-engine:{fingerprint}"` where fingerprint hashes
(date, symbol, quantity, unit_price) for deterministic dedup.
"""
from __future__ import annotations

import email
import hashlib
from datetime import datetime
from decimal import Decimal, InvalidOperation
from email.message import Message

from broker_sync.models import AccountType, Activity, ActivityType

_ACCOUNT_ID = "invest-engine-primary"
_CURRENCY_SIGN = "£"
def parse_invest_engine_email(raw_email: bytes) -> list[Activity]:
    """Turn the raw bytes of an IE trade confirmation email into Activities.

    The message is parsed with the stdlib ``email`` package, its text body
    is extracted, and that body is handed to the line-based parser.

    Returns an empty list when the message has no usable text body or when
    no line-based layout matches it.
    """
    text = _extract_text_body(email.message_from_bytes(raw_email))
    return [] if text is None else _parse_rfc2822_lines(text)
def _extract_text_body(msg: Message) -> str | None:
"""Return the text/plain body of an email, or None if absent."""
if msg.is_multipart():
for part in msg.walk():
if part.get_content_type() == "text/plain":
payload = part.get_payload(decode=True)
if isinstance(payload, bytes):
return payload.decode(part.get_content_charset() or "utf-8", errors="replace")
return None
payload = msg.get_payload(decode=True)
if isinstance(payload, bytes):
return payload.decode(msg.get_content_charset() or "utf-8", errors="replace")
if isinstance(payload, str):
return payload
return None
def _parse_rfc2822_lines(body: str) -> list[Activity]:
    """Run the line-based layout parsers and wrap the first hit in a list.

    The v2 layout is attempted before v1 (these mirror
    `_extract_position_v2` and `_extract_position_v1` in the upstream
    parser). A later parser is only invoked when every earlier one
    returned None.

    Returns a one-element list on success, `[]` when no layout matches.
    """
    # Generator keeps evaluation lazy: v1 is not attempted once v2 matches.
    attempts = (layout(body) for layout in (_try_v2, _try_v1))
    first_hit = next((hit for hit in attempts if hit is not None), None)
    if first_hit is None:
        return []
    return [first_hit]
def _try_v2(body: str) -> Activity | None:
    """Parse body with v2 layout: `Date: DD Month` on line 2, year on line 3.

    Line numbers are zero-based indices into ``body.splitlines()``. The
    trade line (index 4) supplies the symbol (first token after the first
    ``:``), the quantity (first token after ``Bought``) and the unit price
    (first token after the first currency sign).

    Returns None when the body has fewer than six lines or when any field
    is missing or malformed, including numeric text `Decimal` rejects.
    """
    lines = body.splitlines()
    if len(lines) < 6:
        return None
    trade_line = lines[4]
    try:
        day_str, month = lines[2].split()[-2:]
        year = lines[3].split()[0]
        on_date = datetime.strptime(f"{day_str}-{month}-{year}", "%d-%B-%Y")
        symbol = trade_line.split(":")[1].split()[0].strip()
        unit_price = Decimal(trade_line.split(_CURRENCY_SIGN)[1].split()[0])
        quantity = Decimal(trade_line.split("Bought")[1].split()[0])
    except (ValueError, IndexError, InvalidOperation):
        # Decimal reports unparseable text via InvalidOperation, which is an
        # ArithmeticError rather than a ValueError, so it must be listed
        # explicitly for a bad number to mean "layout did not match".
        return None
    return _build_activity(
        on_date=on_date,
        symbol=symbol,
        quantity=quantity,
        unit_price=unit_price,
        strategy="rfc2822-v2",
        matched=trade_line,
    )
def _try_v1(body: str) -> Activity | None:
    """Parse body with v1 layout: `Date: DD` on line 2, `Month YYYY` on line 3.

    Line numbers are zero-based indices into ``body.splitlines()``. The
    trade line (index 4) supplies the symbol (first token after the first
    ``:``), the quantity (first token after ``Bought``) and the unit price
    (the text following the currency sign in the first token after ``@``).

    Returns None when the body has fewer than six lines or when any field
    is missing or malformed, including numeric text `Decimal` rejects.
    """
    lines = body.splitlines()
    if len(lines) < 6:
        return None
    trade_line = lines[4]
    try:
        day = int(lines[2].split("Date: ")[1])
        # Take the first two whitespace-separated tokens as "Month YYYY".
        # Unpacking raises ValueError when fewer than two are present, and
        # strptime rejects anything that is not a real month name and year.
        month, year = lines[3].split()[:2]
        on_date = datetime.strptime(f"{day}-{month}-{year}", "%d-%B-%Y")
        symbol = trade_line.split(":")[1].split()[0].strip()
        after_bought = trade_line.split("Bought")[1]
        quantity = Decimal(after_bought.split()[0])
        price_str = after_bought.split("@")[1].split()[0].split(_CURRENCY_SIGN)[1]
        unit_price = Decimal(price_str)
    except (ValueError, IndexError, InvalidOperation):
        # Decimal reports unparseable text via InvalidOperation, which is an
        # ArithmeticError rather than a ValueError, so it must be listed
        # explicitly for a bad number to mean "layout did not match".
        return None
    return _build_activity(
        on_date=on_date,
        symbol=symbol,
        quantity=quantity,
        unit_price=unit_price,
        strategy="rfc2822-v1",
        matched=trade_line,
    )
def _build_activity(
    *,
    on_date: datetime,
    symbol: str,
    quantity: Decimal,
    unit_price: Decimal,
    strategy: str,
    matched: str,
) -> Activity:
    """Assemble the canonical BUY Activity for one parsed trade line.

    The external id is ``invest-engine:`` followed by the fingerprint of
    (date, symbol, quantity, unit price). The notes field records the
    parse-strategy tag in square brackets followed by the stripped source
    line that was matched. Account id, account type (ISA) and currency
    (GBP) are fixed for every Activity built here.
    """
    external_id = "invest-engine:" + _fingerprint(on_date, symbol, quantity, unit_price)
    note = f"[{strategy}] {matched.strip()}"
    return Activity(
        external_id=external_id,
        account_id=_ACCOUNT_ID,
        account_type=AccountType.ISA,
        date=on_date,
        activity_type=ActivityType.BUY,
        currency="GBP",
        symbol=symbol,
        quantity=quantity,
        unit_price=unit_price,
        notes=note,
    )
def _fingerprint(date: datetime, symbol: str, quantity: Decimal, unit_price: Decimal) -> str:
    """Return a 16-hex-character digest identifying one trade.

    The ISO-formatted date, symbol, quantity and unit price are joined with
    ``|``, UTF-8 encoded and hashed with SHA-256; the first 16 characters of
    the hex digest are returned. Decimals are rendered with ``str``, so
    values that differ only in trailing zeros produce different digests.
    """
    fields = (date.isoformat(), symbol, str(quantity), str(unit_price))
    digest = hashlib.sha256("|".join(fields).encode("utf-8"))
    return digest.hexdigest()[:16]