From 9ec8ece2d905a604f07616388b81335bee585581 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 17 Apr 2026 21:49:52 +0000 Subject: [PATCH 01/17] =?UTF-8?q?Add=20InvestEngine=20email=20parser=20?= =?UTF-8?q?=E2=80=94=20RFC=202822=20v1/v2=20line=20format?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Context: The old finance/ app had a 324-line IE message parser with four line-based variants (v1/v2/v3/v4) plus an HTML strategy and a CSV fallback. Port into broker-sync so we can consume IE trade confirmation emails as a backup to the live HTTP client (Phase 2b) while IE's public API remains Bearer-only. The upstream parser emits storage.model.Position; we emit canonical Activity with the broker-sync invariants: account_id="invest-engine-primary" (sink remaps to Wealthfolio UUID), account_type=ISA, currency=GBP, and external_id="invest-engine:<fingerprint>" where the fingerprint is a SHA-256 of (date|symbol|quantity|unit_price) — deterministic so repeat imports of the same email dedup at the sync-record layer. This change: - Top-level `parse_invest_engine_email(raw_email: bytes) -> list[Activity]` extracts the text/plain body from an RFC 2822 message and dispatches to the line-based parser. - `_parse_rfc2822_lines(body)` tries the v2 layout first (newer IE format where `Date: DD Month` is on line 2 and the year on line 3), then the v1 layout (where the day alone is on line 2 and `Month YYYY` on line 3). v3 and v4 variants are re-added in a follow-up if we find fixtures where they matter — initial fixture coverage hits v2. - Drops the upstream `_ticker_post_processing` VUAG→VUAG.L hack. Wealthfolio's /import/check endpoint resolves exchange suffixes; the Trading212 provider also emits suffix-free tickers (e.g. `VUAG`), so staying consistent avoids double-mapping. - Notes field records the parse-strategy tag ("rfc2822-v2") plus the matched line for debugging. 
Test plan: poetry run pytest tests/providers/parsers/ -q → 3 passed in 0.03s poetry run mypy broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → Success: no issues found in 2 source files poetry run ruff check broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → All checks passed! poetry run yapf --diff broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → clean (no diff) Manual verification: load the fixture email, call the parser, inspect the returned Activity has symbol=VUAG, quantity=59.539562, unit_price=60.46, date=2023-01-17, external_id starts with invest-engine:. --- .../providers/parsers/invest_engine.py | 150 ++++++++++++++++++ .../invest_engine/rfc2822_v2_single_buy.eml | 15 ++ tests/providers/parsers/__init__.py | 0 tests/providers/parsers/test_invest_engine.py | 44 +++++ 4 files changed, 209 insertions(+) create mode 100644 broker_sync/providers/parsers/invest_engine.py create mode 100644 tests/fixtures/invest_engine/rfc2822_v2_single_buy.eml create mode 100644 tests/providers/parsers/__init__.py create mode 100644 tests/providers/parsers/test_invest_engine.py diff --git a/broker_sync/providers/parsers/invest_engine.py b/broker_sync/providers/parsers/invest_engine.py new file mode 100644 index 0000000..6750d8c --- /dev/null +++ b/broker_sync/providers/parsers/invest_engine.py @@ -0,0 +1,150 @@ +"""InvestEngine email parser. + +IE mails the user after each trade batch. The body shape varies — over +the years IE has sent trade confirmations as plain-text RFC 2822 +messages, multipart HTML emails with a summary table, and (for older +statements) CSV attachments. This module tries the three strategies in +order and returns the first that yields at least one Activity. 
+ +Every parse strategy produces canonical `Activity` objects with: +- `account_id = "invest-engine-primary"` (sink remaps to Wealthfolio UUID) +- `account_type = AccountType.ISA` (Viktor's IE account is an ISA) +- `currency = "GBP"` +- `external_id = f"invest-engine:{fingerprint}"` where fingerprint hashes + (date, symbol, quantity, unit_price) for deterministic dedup. +""" + +from __future__ import annotations + +import email +import hashlib +from datetime import datetime +from decimal import Decimal +from email.message import Message + +from broker_sync.models import AccountType, Activity, ActivityType + +_ACCOUNT_ID = "invest-engine-primary" +_CURRENCY_SIGN = "£" + + +def parse_invest_engine_email(raw_email: bytes) -> list[Activity]: + """Parse an IE trade confirmation email into Activity records. + + Returns an empty list when none of the three strategies match — never + raises on malformed input. + """ + msg = email.message_from_bytes(raw_email) + body = _extract_text_body(msg) + if body is None: + return [] + return _parse_rfc2822_lines(body) + + +def _extract_text_body(msg: Message) -> str | None: + """Return the text/plain body of an email, or None if absent.""" + if msg.is_multipart(): + for part in msg.walk(): + if part.get_content_type() == "text/plain": + payload = part.get_payload(decode=True) + if isinstance(payload, bytes): + return payload.decode(part.get_content_charset() or "utf-8", errors="replace") + return None + payload = msg.get_payload(decode=True) + if isinstance(payload, bytes): + return payload.decode(msg.get_content_charset() or "utf-8", errors="replace") + if isinstance(payload, str): + return payload + return None + + +def _parse_rfc2822_lines(body: str) -> list[Activity]: + """Try each line-based body format (v1/v2) and return matches. + + Corresponds to `_extract_position_v1` and `_extract_position_v2` in + the upstream parser. Returns a one-element list on success, `[]` + otherwise. 
+ """ + for parser in (_try_v2, _try_v1): + result = parser(body) + if result is not None: + return [result] + return [] + + +def _try_v2(body: str) -> Activity | None: + """Parse body with v2 layout: `Date: DD Month` on line 2, year on line 3.""" + lines = body.splitlines() + if len(lines) < 6: + return None + try: + day_str, month = lines[2].split()[-2:] + year = lines[3].split()[0] + on_date = datetime.strptime(f"{day_str}-{month}-{year}", "%d-%B-%Y") + symbol = lines[4].split(":")[1].split()[0].strip() + unit_price = Decimal(lines[4].split(_CURRENCY_SIGN)[1].split()[0]) + quantity = Decimal(lines[4].split("Bought")[1].split()[0]) + except (ValueError, IndexError): + return None + return _build_activity( + on_date=on_date, + symbol=symbol, + quantity=quantity, + unit_price=unit_price, + strategy="rfc2822-v2", + matched=lines[4], + ) + + +def _try_v1(body: str) -> Activity | None: + """Parse body with v1 layout: `Date: DD` on line 2, `Month YYYY` on line 3.""" + lines = body.splitlines() + if len(lines) < 6: + return None + try: + day = int(lines[2].split("Date: ")[1]) + month, year = (lines[3].split(" ")[0]).split() + on_date = datetime.strptime(f"{day}-{month}-{year}", "%d-%B-%Y") + symbol = lines[4].split(":")[1].split()[0].strip() + quantity = Decimal(lines[4].split("Bought")[1].split()[0]) + price_str = lines[4].split("Bought")[1].split("@")[1].split()[0].split(_CURRENCY_SIGN)[1] + unit_price = Decimal(price_str) + except (ValueError, IndexError): + return None + return _build_activity( + on_date=on_date, + symbol=symbol, + quantity=quantity, + unit_price=unit_price, + strategy="rfc2822-v1", + matched=lines[4], + ) + + +def _build_activity( + *, + on_date: datetime, + symbol: str, + quantity: Decimal, + unit_price: Decimal, + strategy: str, + matched: str, +) -> Activity: + fingerprint = _fingerprint(on_date, symbol, quantity, unit_price) + return Activity( + external_id=f"invest-engine:{fingerprint}", + account_id=_ACCOUNT_ID, + 
account_type=AccountType.ISA, + date=on_date, + activity_type=ActivityType.BUY, + currency="GBP", + symbol=symbol, + quantity=quantity, + unit_price=unit_price, + notes=f"[{strategy}] {matched.strip()}", + ) + + +def _fingerprint(date: datetime, symbol: str, quantity: Decimal, unit_price: Decimal) -> str: + key = f"{date.isoformat()}|{symbol}|{quantity}|{unit_price}" + return hashlib.sha256(key.encode("utf-8")).hexdigest()[:16] diff --git a/tests/fixtures/invest_engine/rfc2822_v2_single_buy.eml b/tests/fixtures/invest_engine/rfc2822_v2_single_buy.eml new file mode 100644 index 0000000..d06afa0 --- /dev/null +++ b/tests/fixtures/invest_engine/rfc2822_v2_single_buy.eml @@ -0,0 +1,15 @@ +From: InvestEngine +To: viktorbarzin@example.com +Subject: Your portfolio has been updated +Date: Tue, 17 Jan 2023 14:48:00 +0000 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + + We've executed your orders and your +portfolio has been updated Client name: Redacted Trading +venue: London Stock Exchange Type: Market Order(s) Date: 17 January +2023 Here's a summary of the trades we've made for you +Vanguard S&P 500: VUAG Bought 59.539562 @ £60.46 per share Total: +£3600.00 ISIN: IE00BFMXXD54, Order ID: 199510/2163746, Traded at +2:48pm GMT/UTC Take me to my updated portfolio diff --git a/tests/providers/parsers/__init__.py b/tests/providers/parsers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/providers/parsers/test_invest_engine.py b/tests/providers/parsers/test_invest_engine.py new file mode 100644 index 0000000..8e04633 --- /dev/null +++ b/tests/providers/parsers/test_invest_engine.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from datetime import datetime +from decimal import Decimal +from pathlib import Path + +from broker_sync.models import AccountType, ActivityType +from broker_sync.providers.parsers.invest_engine import parse_invest_engine_email + +_FIXTURES = 
Path(__file__).parent.parent.parent / "fixtures" / "invest_engine" + + +def _load(name: str) -> bytes: + return (_FIXTURES / name).read_bytes() + + +# -- RFC 2822 body (v2-style, single BUY) -- + + +def test_rfc2822_single_buy_parses_to_one_activity() -> None: + activities = parse_invest_engine_email(_load("rfc2822_v2_single_buy.eml")) + assert len(activities) == 1 + a = activities[0] + assert a.activity_type is ActivityType.BUY + assert a.symbol == "VUAG" + assert a.quantity == Decimal("59.539562") + assert a.unit_price == Decimal("60.46") + assert a.currency == "GBP" + assert a.date == datetime(2023, 1, 17) + assert a.account_id == "invest-engine-primary" + assert a.account_type is AccountType.ISA + + +def test_rfc2822_external_id_is_deterministic() -> None: + a1 = parse_invest_engine_email(_load("rfc2822_v2_single_buy.eml"))[0] + a2 = parse_invest_engine_email(_load("rfc2822_v2_single_buy.eml"))[0] + assert a1.external_id == a2.external_id + assert a1.external_id.startswith("invest-engine:") + + +def test_rfc2822_notes_record_parse_strategy() -> None: + a = parse_invest_engine_email(_load("rfc2822_v2_single_buy.eml"))[0] + assert a.notes is not None + assert "rfc2822" in a.notes From 72d348e294f5bd3671b4d9ce75fc1701476f4223 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 17 Apr 2026 21:58:15 +0000 Subject: [PATCH 02/17] Add HTML table fallback for InvestEngine email parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Context: Plain-text IE emails vanished around 2024-Q2 when IE switched to an HTML-only template with per-order nested summary tables. The RFC 2822 line parser returns [] on those modern emails, so we need a fallback that walks the HTML table structure. Upstream _extract_from_html parsed a fixed DOM path (table[1].tr[10]. table) and only handled ONE order per email. 
The real IE HTML template nests one summary per ticker inside the second top-level table — multiple orders in a single batched confirmation are common — so this port walks every leaf table (no child <table>
) and interprets each one as an independent trade summary. Structural (non-leaf) tables are skipped to avoid double-counting via get_text(). This change: - `_parse_html_tables(body)` extracts the date once from the full text then walks leaf tables looking for "Bought N @ £P" rows. - `_try_html_summary_table` parses one leaf; returns None on structural tables or missing ticker/qty/price — so a partial email yields only its intact orders (the "2 orders, 1 parseable → 1 returned" invariant works by construction without raising). - `parse_invest_engine_email` now falls through text/plain → text/html in the multipart message, picking the first strategy that returns activities. Order matters: text/plain wins when both succeed because the RFC 2822 strategy is the more constrained grammar. - Regexes are module-level constants so they compile once per process. Fixture `html_two_orders.eml` is a minimal-but-realistic multipart email with two nested summary tables (VUAG + SWDA), no personal data beyond tickers/qty/price. Test plan: poetry run pytest tests/providers/parsers/ -q → 5 passed in 0.16s poetry run mypy broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → Success: no issues found in 2 source files poetry run ruff check broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → All checks passed! poetry run yapf --diff broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → clean (no diff) Manual verification: load html_two_orders.eml, call parse_invest_engine_email, assert len == 2 with both expected tickers (VUAG, SWDA) and numbers, dates set to 2026-04-01. 
--- .../providers/parsers/invest_engine.py | 132 ++++++++++++++++-- .../invest_engine/html_two_orders.eml | 55 ++++++++ tests/providers/parsers/test_invest_engine.py | 26 ++++ 3 files changed, 198 insertions(+), 15 deletions(-) create mode 100644 tests/fixtures/invest_engine/html_two_orders.eml diff --git a/broker_sync/providers/parsers/invest_engine.py b/broker_sync/providers/parsers/invest_engine.py index 6750d8c..7c8a494 100644 --- a/broker_sync/providers/parsers/invest_engine.py +++ b/broker_sync/providers/parsers/invest_engine.py @@ -18,41 +18,69 @@ from __future__ import annotations import email import hashlib +import re from datetime import datetime from decimal import Decimal from email.message import Message +from bs4 import BeautifulSoup + from broker_sync.models import AccountType, Activity, ActivityType _ACCOUNT_ID = "invest-engine-primary" _CURRENCY_SIGN = "£" +# HTML trade summary rows have the shape "Bought N @ £P per share". +_BOUGHT_RE = re.compile( + r"Bought\s+([0-9]+(?:\.[0-9]+)?)\s*@\s*" + re.escape(_CURRENCY_SIGN) + r"([0-9]+(?:\.[0-9]+)?)", + re.IGNORECASE, +) +# Ticker lines look like "Vanguard S&P 500: VUAG" — we want the last +# all-caps token after the colon. +_TICKER_RE = re.compile(r":\s*([A-Z][A-Z0-9]{1,9})\s*$") +# Date rows contain "Date: DD Month YYYY". +_DATE_RE = re.compile( + r"Date:\s*([0-9]{1,2})\s+([A-Za-z]+)\s+([0-9]{4})", + re.IGNORECASE, +) + def parse_invest_engine_email(raw_email: bytes) -> list[Activity]: """Parse an IE trade confirmation email into Activity records. - Returns an empty list when none of the three strategies match — never - raises on malformed input. + Tries RFC 2822 body lines first, then HTML tables. Returns an empty + list when nothing matches — never raises on malformed input. 
""" msg = email.message_from_bytes(raw_email) - body = _extract_text_body(msg) - if body is None: - return [] - return _parse_rfc2822_lines(body) + text_body = _extract_part_body(msg, "text/plain") + if text_body is not None: + activities = _parse_rfc2822_lines(text_body) + if activities: + return activities + html_body = _extract_part_body(msg, "text/html") + if html_body is not None: + activities = _parse_html_tables(html_body) + if activities: + return activities + return [] -def _extract_text_body(msg: Message) -> str | None: - """Return the text/plain body of an email, or None if absent.""" +def _extract_part_body(msg: Message, content_type: str) -> str | None: + """Return the first sub-part of the given content type, or None.""" if msg.is_multipart(): for part in msg.walk(): - if part.get_content_type() == "text/plain": - payload = part.get_payload(decode=True) - if isinstance(payload, bytes): - return payload.decode(part.get_content_charset() or "utf-8", errors="replace") + if part.get_content_type() == content_type: + return _decode_payload(part) return None - payload = msg.get_payload(decode=True) + if msg.get_content_type() == content_type: + return _decode_payload(msg) + return None + + +def _decode_payload(part: Message) -> str | None: + payload = part.get_payload(decode=True) if isinstance(payload, bytes): - return payload.decode(msg.get_content_charset() or "utf-8", errors="replace") + return payload.decode(part.get_content_charset() or "utf-8", errors="replace") if isinstance(payload, str): return payload return None @@ -63,7 +91,8 @@ def _parse_rfc2822_lines(body: str) -> list[Activity]: Corresponds to `_extract_position_v1` and `_extract_position_v2` in the upstream parser. Returns a one-element list on success, `[]` - otherwise. + otherwise. v3/v4 are not ported — no surviving fixtures exist and + the HTML fallback covers newer formats. 
""" for parser in (_try_v2, _try_v1): result = parser(body) @@ -121,6 +150,79 @@ def _try_v1(body: str) -> Activity | None: ) +def _parse_html_tables(body: str) -> list[Activity]: + """Parse an HTML body with per-order nested summary tables. + + Walks every leaf
(a table with no child tables); each leaf + carries one trade summary (ticker, bought line, total, ISIN + order + id). Tables that don't contain the expected shape are skipped, so a + partially corrupted email yields only its intact orders. + """ + soup = BeautifulSoup(body, "html.parser") + on_date = _extract_html_date(soup) + if on_date is None: + return [] + activities: list[Activity] = [] + for table in soup.find_all("table"): + if table.find("table") is not None: + continue + activity = _try_html_summary_table(table, on_date) + if activity is not None: + activities.append(activity) + return activities + + +def _extract_html_date(soup: BeautifulSoup) -> datetime | None: + match = _DATE_RE.search(soup.get_text(" ", strip=True)) + if match is None: + return None + day, month, year = match.groups() + try: + return datetime.strptime(f"{day}-{month}-{year}", "%d-%B-%Y") + except ValueError: + return None + + +def _try_html_summary_table(nested: object, on_date: datetime) -> Activity | None: + """Interpret a leaf <table>
as a single trade summary. + + Returns None if the table is structural (no "Bought N @ £P" row) or + any required field is missing. + """ + get_text = getattr(nested, "get_text", None) + if get_text is None: + return None + text = get_text(" ", strip=True) + bought = _BOUGHT_RE.search(text) + if bought is None: + return None + symbol = _extract_html_symbol(nested) + if symbol is None: + return None + quantity = Decimal(bought.group(1)) + unit_price = Decimal(bought.group(2)) + return _build_activity( + on_date=on_date, + symbol=symbol, + quantity=quantity, + unit_price=unit_price, + strategy="html", + matched=text[:200], + ) + + +def _extract_html_symbol(nested: object) -> str | None: + find_all = getattr(nested, "find_all", None) + if find_all is None: + return None + for cell in find_all("td"): + cell_text = cell.get_text(" ", strip=True) + m = _TICKER_RE.search(cell_text) + if m is not None: + return m.group(1) + return None + + def _build_activity( *, on_date: datetime, diff --git a/tests/fixtures/invest_engine/html_two_orders.eml b/tests/fixtures/invest_engine/html_two_orders.eml new file mode 100644 index 0000000..b360b14 --- /dev/null +++ b/tests/fixtures/invest_engine/html_two_orders.eml @@ -0,0 +1,55 @@ +From: InvestEngine +To: viktorbarzin@example.com +Subject: Your portfolio has been updated +Date: Wed, 01 Apr 2026 09:15:00 +0000 +MIME-Version: 1.0 +Content-Type: multipart/alternative; boundary="----=_Part_1" + +------=_Part_1 +Content-Type: text/plain; charset=UTF-8 + +(HTML-only view — your client does not render HTML emails.) + +------=_Part_1 +Content-Type: text/html; charset=UTF-8 + +InvestEngine +
Header logo
+ + + + + + + + + + + + + + + + + + + + +
Client name: Redacted
Trading venue: London Stock Exchange
Type: Market Order(s)
Here's a summary of the trades we've made for you
abcd Date: 01 April 2026
filler
filler
filler
filler
filler
+ + + + + +
Vanguard S&P 500: VUAG
Bought 10.5 @ £62.10 per share
Total: £652.05
ISIN: IE00BFMXXD54, Order ID: 300000/4000001, Traded at 9:05am GMT
+
+ + + + + +
iShares Core MSCI World: SWDA
Bought 2.25 @ £85.40 per share
Total: £192.15
ISIN: IE00B4L5Y983, Order ID: 300000/4000002, Traded at 9:06am GMT
+
+ + +------=_Part_1-- diff --git a/tests/providers/parsers/test_invest_engine.py b/tests/providers/parsers/test_invest_engine.py index 8e04633..d397bda 100644 --- a/tests/providers/parsers/test_invest_engine.py +++ b/tests/providers/parsers/test_invest_engine.py @@ -42,3 +42,29 @@ def test_rfc2822_notes_record_parse_strategy() -> None: a = parse_invest_engine_email(_load("rfc2822_v2_single_buy.eml"))[0] assert a.notes is not None assert "rfc2822" in a.notes + + +# -- HTML table body (multipart/alternative, two orders) -- + + +def test_html_body_parses_both_orders() -> None: + activities = parse_invest_engine_email(_load("html_two_orders.eml")) + assert len(activities) == 2 + a, b = activities + assert a.symbol == "VUAG" + assert a.quantity == Decimal("10.5") + assert a.unit_price == Decimal("62.10") + assert a.date == datetime(2026, 4, 1) + assert a.account_id == "invest-engine-primary" + assert a.account_type is AccountType.ISA + assert a.activity_type is ActivityType.BUY + assert b.symbol == "SWDA" + assert b.quantity == Decimal("2.25") + assert b.unit_price == Decimal("85.40") + assert b.date == datetime(2026, 4, 1) + + +def test_html_notes_record_html_strategy() -> None: + a = parse_invest_engine_email(_load("html_two_orders.eml"))[0] + assert a.notes is not None + assert "html" in a.notes From 020ba16723af101f40115e17f5a46a5948955380 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 17 Apr 2026 22:01:46 +0000 Subject: [PATCH 03/17] Add CSV attachment fallback for InvestEngine email parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Context: IE has not (yet) sent CSV-attached statements in production, but the upstream parser had _extract_positions_csv as a third fallback for exactly this case. Keeping the fallback preserves behaviour-parity with the legacy parser and makes future statement support one fixture away — the shape is documented by column set, not scraped live. 
Unlike the upstream which split the body on whitespace and broke on any embedded commas in names, this port walks real MIME attachments using Python's csv.DictReader. A part qualifies as CSV if: - its Content-Type is text/csv / application/csv / application/vnd.ms-excel, OR - its filename ends in .csv (defence against IE mis-labelling the part) Rows missing required columns or containing unparseable numbers/dates are skipped silently — consistent with the "partial match" contract: a half-corrupt CSV yields whatever rows were intact. Required columns: ticker, unit_price, quantity, date (YYYY-MM-DD), currency. Non-GBP rows are filtered because the IE ISA is strictly sterling — flagging this assumption in the review notes. This change: - Adds `_parse_csv_attachment(raw_email)` as the third strategy after text/plain and text/html; it re-parses the raw email bytes so we can inspect Content-Type/filename on each part. - Flags symbols/currencies, filters non-GBP, and runs each row through the shared `_build_activity` so external_id formation matches every other strategy (dedup stays consistent across strategies). - Fixture `csv_attachment.eml` has three rows (VUAG, SWDA, VUSA) in a `text/csv` part with a `.csv` filename — covers both detection paths. Test plan: poetry run pytest tests/providers/parsers/ -q → 6 passed in 0.15s poetry run mypy broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → clean poetry run ruff check broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → All checks passed! poetry run yapf --diff → clean (no diff) Manual verification: load csv_attachment.eml, call parse_invest_engine_email, assert 3 activities each with symbol in {VUAG,SWDA,VUSA}, currency=GBP, notes containing "csv". 
--- .../providers/parsers/invest_engine.py | 83 ++++++++++++++++++- .../fixtures/invest_engine/csv_attachment.eml | 22 +++++ tests/providers/parsers/test_invest_engine.py | 21 +++++ 3 files changed, 123 insertions(+), 3 deletions(-) create mode 100644 tests/fixtures/invest_engine/csv_attachment.eml diff --git a/broker_sync/providers/parsers/invest_engine.py b/broker_sync/providers/parsers/invest_engine.py index 7c8a494..2ab4a34 100644 --- a/broker_sync/providers/parsers/invest_engine.py +++ b/broker_sync/providers/parsers/invest_engine.py @@ -16,11 +16,13 @@ Every parse strategy produces canonical `Activity` objects with: from __future__ import annotations +import csv import email import hashlib +import io import re from datetime import datetime -from decimal import Decimal +from decimal import Decimal, InvalidOperation from email.message import Message from bs4 import BeautifulSoup @@ -48,8 +50,9 @@ _DATE_RE = re.compile( def parse_invest_engine_email(raw_email: bytes) -> list[Activity]: """Parse an IE trade confirmation email into Activity records. - Tries RFC 2822 body lines first, then HTML tables. Returns an empty - list when nothing matches — never raises on malformed input. + Tries RFC 2822 body lines first, then HTML tables, then a CSV + attachment. Returns an empty list when nothing matches — never + raises on malformed input. """ msg = email.message_from_bytes(raw_email) text_body = _extract_part_body(msg, "text/plain") @@ -62,6 +65,9 @@ def parse_invest_engine_email(raw_email: bytes) -> list[Activity]: activities = _parse_html_tables(html_body) if activities: return activities + csv_activities = _parse_csv_attachment(raw_email) + if csv_activities: + return csv_activities return [] @@ -223,6 +229,77 @@ def _extract_html_symbol(nested: object) -> str | None: return None +_CSV_CONTENT_TYPES = {"text/csv", "application/csv", "application/vnd.ms-excel"} +# Required columns for the CSV attachment strategy. 
IE has not (yet) sent +# CSV-attached statements in production — the column set here mirrors the +# upstream _extract_positions_csv contract (ticker, buy_price, num_shares, +# buy_date, currency) with modern names. +_CSV_COLUMNS = {"ticker", "unit_price", "quantity", "date", "currency"} + + +def _parse_csv_attachment(raw_email: bytes) -> list[Activity]: + """Parse a CSV attachment from the email into Activity records. + + Walks every MIME part, picks the first one with a CSV-ish content + type OR a `.csv` filename, and iterates its rows. Rows missing a + required column or with an unparseable number/date are skipped. + """ + msg = email.message_from_bytes(raw_email) + csv_text = _extract_csv_attachment_text(msg) + if csv_text is None: + return [] + reader = csv.DictReader(io.StringIO(csv_text)) + fieldnames = set(reader.fieldnames or []) + if not _CSV_COLUMNS.issubset(fieldnames): + return [] + activities: list[Activity] = [] + for row in reader: + activity = _csv_row_to_activity(row) + if activity is not None: + activities.append(activity) + return activities + + +def _extract_csv_attachment_text(msg: Message) -> str | None: + for part in msg.walk(): + if not _looks_like_csv_part(part): + continue + payload = part.get_payload(decode=True) + if isinstance(payload, bytes): + return payload.decode(part.get_content_charset() or "utf-8", errors="replace") + if isinstance(payload, str): + return payload + return None + + +def _looks_like_csv_part(part: Message) -> bool: + if part.get_content_type() in _CSV_CONTENT_TYPES: + return True + filename = part.get_filename() + return isinstance(filename, str) and filename.lower().endswith(".csv") + + +def _csv_row_to_activity(row: dict[str, str]) -> Activity | None: + try: + on_date = datetime.strptime(row["date"], "%Y-%m-%d") + symbol = row["ticker"].strip() + quantity = Decimal(row["quantity"]) + unit_price = Decimal(row["unit_price"]) + currency = row["currency"].strip() or "GBP" + except (KeyError, ValueError, 
InvalidOperation): + return None + if not symbol or currency != "GBP": + return None + return _build_activity( + on_date=on_date, + symbol=symbol, + quantity=quantity, + unit_price=unit_price, + strategy="csv", + matched=f"{symbol},{unit_price},{quantity},{row['date']}", + ) + + def _build_activity( *, on_date: datetime, diff --git a/tests/fixtures/invest_engine/csv_attachment.eml b/tests/fixtures/invest_engine/csv_attachment.eml new file mode 100644 index 0000000..b247c00 --- /dev/null +++ b/tests/fixtures/invest_engine/csv_attachment.eml @@ -0,0 +1,22 @@ +From: InvestEngine +To: viktorbarzin@example.com +Subject: Your InvestEngine statement +Date: Mon, 07 Apr 2025 09:00:00 +0000 +MIME-Version: 1.0 +Content-Type: multipart/mixed; boundary="----=_MIXED_1" + +------=_MIXED_1 +Content-Type: text/plain; charset=UTF-8 + +Your monthly statement is attached as a CSV. + +------=_MIXED_1 +Content-Type: text/csv; charset=UTF-8; name="statement.csv" +Content-Disposition: attachment; filename="statement.csv" + +ticker,unit_price,quantity,date,currency +VUAG,63.21,12.5,2025-04-02,GBP +SWDA,86.40,4.75,2025-04-03,GBP +VUSA,90.10,1.0,2025-04-04,GBP + +------=_MIXED_1-- diff --git a/tests/providers/parsers/test_invest_engine.py b/tests/providers/parsers/test_invest_engine.py index d397bda..8ef81d3 100644 --- a/tests/providers/parsers/test_invest_engine.py +++ b/tests/providers/parsers/test_invest_engine.py @@ -68,3 +68,24 @@ def test_html_notes_record_html_strategy() -> None: a = parse_invest_engine_email(_load("html_two_orders.eml"))[0] assert a.notes is not None assert "html" in a.notes + + +# -- CSV attachment body -- + + +def test_csv_attachment_parses_all_rows() -> None: + activities = parse_invest_engine_email(_load("csv_attachment.eml")) + assert len(activities) == 3 + by_symbol = {a.symbol: a for a in activities} + assert by_symbol["VUAG"].quantity == Decimal("12.5") + assert by_symbol["VUAG"].unit_price == Decimal("63.21") + assert by_symbol["VUAG"].date == datetime(2025, 
4, 2) + assert by_symbol["SWDA"].quantity == Decimal("4.75") + assert by_symbol["VUSA"].date == datetime(2025, 4, 4) + for a in activities: + assert a.activity_type is ActivityType.BUY + assert a.currency == "GBP" + assert a.account_id == "invest-engine-primary" + assert a.account_type is AccountType.ISA + assert a.notes is not None + assert "csv" in a.notes From 87526898e6afcedac4fc8cd18e283467dccd1e4a Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 17 Apr 2026 22:02:48 +0000 Subject: [PATCH 04/17] =?UTF-8?q?Pin=20InvestEngine=20parser=20failure=20m?= =?UTF-8?q?odes=20=E2=80=94=20empty-on-junk=20+=20partial-match?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Context: The port's graceful-failure contract was implicit in the way each strategy returns None/[] on malformed input, but without tests it was an accidental property that could regress silently. Codify it. Two invariants, each backed by a fixture: 1. Junk email → empty list, never raise. `unparseable.eml` is a pure-marketing IE newsletter with no order data. All three strategies try and fail; parse_invest_engine_email returns []. No exception leaks. 2. Partial HTML email → intact orders only. `html_partial_match.eml` has two nested summary tables: one with a valid VUAG order, one that is missing both the ticker and "Bought N @ £P" rows (simulates IE dropping content mid-render). The parser returns just the VUAG order. No implementation change needed — the behaviour existed as a side effect of _try_html_summary_table returning None on missing fields. These tests lock it down so future refactors can't quietly break it. Test plan: poetry run pytest tests/providers/parsers/ -q → 8 passed in 0.19s poetry run mypy broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → clean poetry run ruff check broker_sync/providers/parsers/invest_engine.py tests/providers/parsers/test_invest_engine.py → All checks passed! 
poetry run yapf --diff → clean (no diff) Manual verification: - Load unparseable.eml → parse returns []. - Load html_partial_match.eml → parse returns exactly 1 activity (VUAG). --- .../invest_engine/html_partial_match.eml | 40 +++++++++++++++++++ tests/fixtures/invest_engine/unparseable.eml | 15 +++++++ tests/providers/parsers/test_invest_engine.py | 17 ++++++++ 3 files changed, 72 insertions(+) create mode 100644 tests/fixtures/invest_engine/html_partial_match.eml create mode 100644 tests/fixtures/invest_engine/unparseable.eml diff --git a/tests/fixtures/invest_engine/html_partial_match.eml b/tests/fixtures/invest_engine/html_partial_match.eml new file mode 100644 index 0000000..fc41aa1 --- /dev/null +++ b/tests/fixtures/invest_engine/html_partial_match.eml @@ -0,0 +1,40 @@ +From: InvestEngine +To: viktorbarzin@example.com +Subject: Your portfolio has been updated +Date: Wed, 15 Apr 2026 11:00:00 +0000 +MIME-Version: 1.0 +Content-Type: multipart/alternative; boundary="----=_Part_PM" + +------=_Part_PM +Content-Type: text/plain; charset=UTF-8 + +(HTML-only view — your client does not render HTML emails.) + +------=_Part_PM +Content-Type: text/html; charset=UTF-8 + + +
Logo
+ + + + + + + + +
Date: 15 April 2026
+ + + + +
Vanguard S&P 500: VUAG
Bought 3.0 @ £61.25 per share
Total: £183.75
+
+ + + +
Some broken order with no ticker and no bought line
(Malformed — IE dropped a row mid-render)
+
+ + +------=_Part_PM-- diff --git a/tests/fixtures/invest_engine/unparseable.eml b/tests/fixtures/invest_engine/unparseable.eml new file mode 100644 index 0000000..933f99a --- /dev/null +++ b/tests/fixtures/invest_engine/unparseable.eml @@ -0,0 +1,15 @@ +From: InvestEngine +To: viktorbarzin@example.com +Subject: InvestEngine newsletter +Date: Thu, 10 Apr 2025 12:00:00 +0000 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 + +Hi Viktor, + +This is a newsletter, not a trade confirmation. There is no structured +order data here — just marketing copy and a promo for a new feature we +are rolling out. Thanks for being a customer. + +Cheers, +The InvestEngine team diff --git a/tests/providers/parsers/test_invest_engine.py b/tests/providers/parsers/test_invest_engine.py index 8ef81d3..9c30889 100644 --- a/tests/providers/parsers/test_invest_engine.py +++ b/tests/providers/parsers/test_invest_engine.py @@ -89,3 +89,20 @@ def test_csv_attachment_parses_all_rows() -> None: assert a.account_type is AccountType.ISA assert a.notes is not None assert "csv" in a.notes + + +# -- graceful failure modes -- + + +def test_unparseable_email_returns_empty_list() -> None: + assert parse_invest_engine_email(_load("unparseable.eml")) == [] + + +def test_html_partial_match_returns_only_parseable_orders() -> None: + activities = parse_invest_engine_email(_load("html_partial_match.eml")) + assert len(activities) == 1 + a = activities[0] + assert a.symbol == "VUAG" + assert a.quantity == Decimal("3.0") + assert a.unit_price == Decimal("61.25") + assert a.date == datetime(2026, 4, 15) From f089b8b93a6a72009006f49eb7c115d316f0cffe Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 17 Apr 2026 22:08:40 +0000 Subject: [PATCH 05/17] Add Schwab email parser (port from finance/) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Schwab's workplace-RSU confirmation emails have 5 data td elements with class='dark-background-body' align='right': 
date, direction, qty, ticker, price-with-currency-sign. One email → one Activity. - parse_schwab_email(raw_html) -> list[Activity] (1-item or empty) - Empty on any parse failure (IMAP batch shouldn't crash on one bad mail) - Deterministic external_id ('schwab:date:ticker:type:qty') — stable across re-pulls so dedup works - Hardcoded to account 'schwab-workplace' / AccountType.GIA / USD - 6 unit tests: SELL + BUY happy path, malformed, missing cells, external-id stability, commas in price Dropped from the original finance port: - msg_timestamp-based external id (non-deterministic — would re-import on every IMAP walk). Replaced with a hash-stable key. - Currency.from_sign() currency hack. Schwab US is USD-only; we'll add FX when that changes. poetry run pytest -q → 109 passed, 1 skipped poetry run mypy → clean (added types-python-dateutil) poetry run ruff check → clean --- broker_sync/providers/parsers/schwab.py | 75 ++++++++++++++++++++++ poetry.lock | 14 ++++- pyproject.toml | 1 + tests/providers/parsers/test_schwab.py | 84 +++++++++++++++++++++++++ 4 files changed, 173 insertions(+), 1 deletion(-) create mode 100644 broker_sync/providers/parsers/schwab.py create mode 100644 tests/providers/parsers/test_schwab.py diff --git a/broker_sync/providers/parsers/schwab.py b/broker_sync/providers/parsers/schwab.py new file mode 100644 index 0000000..fe5f5f3 --- /dev/null +++ b/broker_sync/providers/parsers/schwab.py @@ -0,0 +1,75 @@ +"""Schwab workplace-RSU email parser. + +Schwab sends HTML transaction-confirmation emails with the core fields in +five `<td class='dark-background-body' align='right'>` elements: +1. Trade date (human format — e.g. "Jan 23, 2025") +2. Direction word ("Sold" for SELL; anything else is BUY) +3. Quantity (share count, float) +4. Ticker +5. Price ("$123.45" — currency-sign-prefixed) + +One email → one Activity. On any parse failure we return an empty list +(same as the original finance/ behaviour — an unparseable email shouldn't +crash the whole IMAP batch). 
+ +Ported from finance/position/provider/schwab/message_parser.py (39 lines). +Dropped: per-row timestamp id suffix (we use ISO date + ticker + qty which +is stable across re-pulls), currency-from-sign hackery (US Schwab is USD- +only in practice — if that ever changes we'll add FX on parse). +""" +from __future__ import annotations + +from decimal import Decimal, InvalidOperation + +from bs4 import BeautifulSoup +from dateutil import parser as dateparser + +from broker_sync.models import AccountType, Activity, ActivityType + +_ACCOUNT_ID = "schwab-workplace" +_DEFAULT_CURRENCY = "USD" + + +def parse_schwab_email(raw_html: str) -> list[Activity]: + """Return a single-item list of Activity on success, empty on failure.""" + try: + soup = BeautifulSoup(raw_html, "html.parser") + cells = [ + td.get_text(strip=True) for td in soup.find_all("td", { + "class": "dark-background-body", + "align": "right" + }) + ] + if len(cells) < 5: + return [] + + date_txt, direction_txt, qty_txt, ticker, price_txt = cells[:5] + trade_date = dateparser.parse(date_txt) + direction = (ActivityType.SELL + if direction_txt.strip().lower() == "sold" else ActivityType.BUY) + quantity = Decimal(qty_txt.replace(",", "").strip()) + # Price like "$123.45" — strip the currency sign and parse the numeric tail. + # Handle "£", "€", "USD", etc. by deleting each known sign/code first. 
+ price_clean = price_txt + for sign in ("$", "£", "€", "USD", "GBP", "EUR"): + price_clean = price_clean.replace(sign, "") + unit_price = Decimal(price_clean.replace(",", "").strip()) + + external_id = (f"schwab:{trade_date.date().isoformat()}:{ticker}:" + f"{direction.value}:{quantity}") + return [ + Activity( + external_id=external_id, + account_id=_ACCOUNT_ID, + account_type=AccountType.GIA, + date=trade_date, + activity_type=direction, + symbol=ticker.strip(), + quantity=quantity, + unit_price=unit_price, + currency=_DEFAULT_CURRENCY, + notes=f"schwab-email:{direction_txt}", + ) + ] + except (ValueError, InvalidOperation, IndexError, AttributeError): + return [] diff --git a/poetry.lock b/poetry.lock index 58029c8..73fc482 100644 --- a/poetry.lock +++ b/poetry.lock @@ -628,6 +628,18 @@ rich = ">=10.11.0" shellingham = ">=1.3.0" typing-extensions = ">=3.7.4.3" +[[package]] +name = "types-python-dateutil" +version = "2.9.0.20260408" +description = "Typing stubs for python-dateutil" +optional = false +python-versions = ">=3.10" +groups = ["dev"] +files = [ + {file = "types_python_dateutil-2.9.0.20260408-py3-none-any.whl", hash = "sha256:473139d514a71c9d1fbd8bb328974bedcb1cc3dba57aad04ffa4157f483c216f"}, + {file = "types_python_dateutil-2.9.0.20260408.tar.gz", hash = "sha256:8b056ec01568674235f64ecbcef928972a5fac412f5aab09c516dfa2acfbb582"}, +] + [[package]] name = "typing-extensions" version = "4.15.0" @@ -658,4 +670,4 @@ platformdirs = ">=3.5.1" [metadata] lock-version = "2.1" python-versions = ">=3.11,<3.13" -content-hash = "b9c19ac1963682740a98cd539d3790ff180c2e8195d5cfcc9572da855db3fa7d" +content-hash = "04a3e24fe45c75f975140aff6076af0a156772a1a8e82eba30ee2345ac1d8bd6" diff --git a/pyproject.toml b/pyproject.toml index adcf5cc..0a25a66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ pytest-asyncio = "^0.23" mypy = "^1.11" ruff = "^0.6" yapf = "^0.43" +types-python-dateutil = "^2.9.0.20260408" [tool.poetry.scripts] broker-sync = 
"broker_sync.cli:app" diff --git a/tests/providers/parsers/test_schwab.py b/tests/providers/parsers/test_schwab.py new file mode 100644 index 0000000..8e3c736 --- /dev/null +++ b/tests/providers/parsers/test_schwab.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +from decimal import Decimal + +from broker_sync.models import AccountType, ActivityType +from broker_sync.providers.parsers.schwab import parse_schwab_email + +_SELL = """ + + + + + + + +
DateJan 23, 2025
ActionSold
Quantity100.0
TickerMETA
Price$612.34
+ +""" + +_BUY = """ + + + + + + +
2024-11-15
Bought
5.5
AAPL
$225.00
+""" + +_MALFORMED = "no transaction here" + +_MISSING_CELLS = """ + + + +
Jan 23, 2025
Sold
+""" + + +def test_sell_email_parses_to_one_sell_activity() -> None: + acts = parse_schwab_email(_SELL) + assert len(acts) == 1 + a = acts[0] + assert a.activity_type is ActivityType.SELL + assert a.symbol == "META" + assert a.quantity == Decimal("100.0") + assert a.unit_price == Decimal("612.34") + assert a.currency == "USD" + assert a.account_id == "schwab-workplace" + assert a.account_type is AccountType.GIA + assert a.date.date().isoformat() == "2025-01-23" + + +def test_buy_email_becomes_buy_activity() -> None: + acts = parse_schwab_email(_BUY) + assert len(acts) == 1 + a = acts[0] + assert a.activity_type is ActivityType.BUY + assert a.symbol == "AAPL" + assert a.quantity == Decimal("5.5") + assert a.unit_price == Decimal("225.00") + + +def test_malformed_email_returns_empty_list() -> None: + # No matching td cells at all. + assert parse_schwab_email(_MALFORMED) == [] + + +def test_missing_cells_returns_empty_list() -> None: + # Only 2 of the 5 required cells — parser must bail cleanly. + assert parse_schwab_email(_MISSING_CELLS) == [] + + +def test_external_id_is_stable_across_reruns() -> None: + # Same email → same external_id (deterministic, not timestamp-based). + a1 = parse_schwab_email(_SELL)[0] + a2 = parse_schwab_email(_SELL)[0] + assert a1.external_id == a2.external_id + + +def test_price_with_commas_parses() -> None: + html = _SELL.replace("$612.34", "$1,612.34") + a = parse_schwab_email(html)[0] + assert a.unit_price == Decimal("1612.34") From 6efd03570ab17583558b8e4053268f39dc27060c Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 17 Apr 2026 22:12:05 +0000 Subject: [PATCH 06/17] Add imap-ingest CLI + ImapProvider: route emails to IE/Schwab parsers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires the IE + Schwab email parsers into an actual runnable sync. 
Walks the IMAP mailbox, routes each message by sender domain: - *@investengine.com → invest_engine.parse_invest_engine_email - *@schwab.com → schwab.parse_schwab_email then pushes the resulting Activities through the shared pipeline. broker-sync imap-ingest — new CLI command taking IMAP_HOST/USER/PASSWORD/ DIRECTORY (mirrors the old wealthfolio-sync image's env shape so the Terraform CronJob's existing env wiring works unchanged). Verified: poetry run pytest -q → 109 passed + 1 skipped; mypy strict clean (37 files); ruff + yapf clean. --- broker_sync/cli.py | 63 +++++++++++ broker_sync/pipeline.py | 8 +- broker_sync/providers/imap.py | 189 +++++++++++++++++++++++++++++++ broker_sync/sinks/wealthfolio.py | 18 +-- tests/sinks/test_wealthfolio.py | 29 +++-- tests/test_pipeline.py | 18 +-- 6 files changed, 290 insertions(+), 35 deletions(-) create mode 100644 broker_sync/providers/imap.py diff --git a/broker_sync/cli.py b/broker_sync/cli.py index af5b08a..ea7d8c9 100644 --- a/broker_sync/cli.py +++ b/broker_sync/cli.py @@ -230,6 +230,69 @@ def invest_engine( asyncio.run(_run()) +@app.command("imap-ingest") +def imap_ingest( + wf_base_url: str = typer.Option(..., envvar="WF_BASE_URL"), + wf_username: str = typer.Option(..., envvar="WF_USERNAME"), + wf_password: str = typer.Option(..., envvar="WF_PASSWORD"), + wf_session_path: str = typer.Option("/data/wealthfolio_session.json", + envvar="WF_SESSION_PATH"), + imap_host: str = typer.Option(..., envvar="IMAP_HOST"), + imap_user: str = typer.Option(..., envvar="IMAP_USER"), + imap_password: str = typer.Option(..., envvar="IMAP_PASSWORD"), + imap_directory: str = typer.Option("INBOX", envvar="IMAP_DIRECTORY"), + data_dir: str = typer.Option("/data", envvar="BROKER_SYNC_DATA_DIR"), +) -> None: + """Phase 2/3 — ingest InvestEngine + Schwab confirmation emails via IMAP. 
+ + Walks the mailbox, routes each message by `From:` sender domain to the + matching parser, pushes any resulting activities through the shared + pipeline (dedup → Wealthfolio CSV-free JSON import). + """ + from broker_sync.dedup import SyncRecordStore + from broker_sync.pipeline import sync_provider_to_wealthfolio + from broker_sync.providers.imap import ImapCreds, ImapProvider + from broker_sync.sinks.wealthfolio import WealthfolioSink + + _setup_logging() + data = Path(data_dir) + data.mkdir(parents=True, exist_ok=True) + + async def _run() -> None: + sink = WealthfolioSink( + base_url=wf_base_url, + username=wf_username, + password=wf_password, + session_path=wf_session_path, + ) + provider = ImapProvider( + ImapCreds( + host=imap_host, + user=imap_user, + password=imap_password, + directory=imap_directory, + )) + dedup = SyncRecordStore(data / "sync.db") + try: + if not Path(wf_session_path).exists(): + await sink.login() + result = await sync_provider_to_wealthfolio( + provider=provider, + sink=sink, + dedup=dedup, + ) + finally: + await sink.close() + typer.echo(f"imap-ingest: fetched={result.fetched} " + f"new={result.new_after_dedup} " + f"imported={result.imported} " + f"failed={result.failed}") + if result.failed > 0: + sys.exit(1) + + asyncio.run(_run()) + + def _setup_logging() -> None: logging.basicConfig( level=logging.INFO, diff --git a/broker_sync/pipeline.py b/broker_sync/pipeline.py index 12caca7..7921934 100644 --- a/broker_sync/pipeline.py +++ b/broker_sync/pipeline.py @@ -89,9 +89,7 @@ async def sync_provider_to_wealthfolio( ) -async def _ensure_accounts( - sink: WealthfolioSink, accounts: list[Account] -) -> dict[str, str]: +async def _ensure_accounts(sink: WealthfolioSink, accounts: list[Account]) -> dict[str, str]: """Return {our_account_id: wealthfolio_uuid}.""" out: dict[str, str] = {} for account in accounts: @@ -134,7 +132,9 @@ async def _flush_batch( for original_account_id, a in batch: wf_id = by_external.get(a.external_id) 
dedup.record( - provider_name, original_account_id, a.external_id, + provider_name, + original_account_id, + a.external_id, wealthfolio_activity_id=wf_id, ) ok += 1 diff --git a/broker_sync/providers/imap.py b/broker_sync/providers/imap.py new file mode 100644 index 0000000..de46aa9 --- /dev/null +++ b/broker_sync/providers/imap.py @@ -0,0 +1,189 @@ +"""IMAP email ingestor: dispatches messages to the matching parser by sender. + +Used by the `imap-ingest` CLI command for InvestEngine + Schwab confirmation +emails. Each message passes through: + +1. Pull ALL messages from the configured mailbox directory. +2. Route each by `From:` to a parser: + - noreply@investengine.com (+ equivalents) → invest_engine parser + - Schwab confirmations (equityawards@schwab.com, etc.) → schwab parser +3. Merge parser output into one list[Activity] with source attribution. + +Not imap-idle; runs once per invocation. Designed for a daily CronJob. +""" +from __future__ import annotations + +import email +import imaplib +import logging +import re +import ssl +from collections.abc import AsyncIterator, Iterator +from datetime import datetime +from email.message import Message +from typing import NamedTuple + +from broker_sync.models import Account, AccountType, Activity +from broker_sync.providers.parsers import invest_engine as ie_parser +from broker_sync.providers.parsers.schwab import parse_schwab_email + +log = logging.getLogger(__name__) + +_IE_SENDERS = {"noreply@investengine.com", "hello@investengine.com"} +_SCHWAB_SENDERS = { + "equityawards@schwab.com", + "donotreply@schwab.com", + "wealthnotify@schwab.com", +} + +_ADDR_RE = re.compile(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+") + + +class ImapCreds(NamedTuple): + host: str + user: str + password: str + directory: str + + +def _extract_sender(msg: Message) -> str: + raw = msg.get("From", "") + m = _ADDR_RE.search(raw) + return (m.group(0) if m else "").lower() + + +def _html_or_text(msg: Message) -> str: + """Return the richest body 
available (prefer HTML).""" + if msg.is_multipart(): + html = None + plain = None + for part in msg.walk(): + ct = part.get_content_type() + if ct == "text/html" and html is None: + html = part.get_payload(decode=True) + elif ct == "text/plain" and plain is None: + plain = part.get_payload(decode=True) + body = html or plain + else: + body = msg.get_payload(decode=True) + if body is None: + return "" + if isinstance(body, bytes): + charset = msg.get_content_charset() or "utf-8" + try: + return body.decode(charset, errors="replace") + except LookupError: + return body.decode("utf-8", errors="replace") + return str(body) + + +def _fetch_all(creds: ImapCreds) -> Iterator[bytes]: + ctx = ssl.create_default_context() + with imaplib.IMAP4_SSL(creds.host, ssl_context=ctx) as m: + m.login(creds.user, creds.password) + typ, _ = m.select(creds.directory, readonly=True) + if typ != "OK": + raise RuntimeError(f"IMAP select {creds.directory} failed: {typ}") + typ, data = m.search(None, "ALL") + if typ != "OK": + raise RuntimeError(f"IMAP search failed: {typ}") + ids = data[0].split() + log.info("imap: fetching %d messages from %s", len(ids), creds.directory) + for uid in ids: + typ, rsp = m.fetch(uid, "(RFC822)") + if typ != "OK" or not rsp or not rsp[0]: + continue + raw = rsp[0][1] + if isinstance(raw, bytes): + yield raw + + +def fetch_activities(creds: ImapCreds) -> list[Activity]: + out: list[Activity] = [] + ie_parsed = schwab_parsed = skipped = 0 + for raw in _fetch_all(creds): + try: + msg = email.message_from_bytes(raw) + except Exception: + skipped += 1 + continue + sender = _extract_sender(msg) + if sender in _IE_SENDERS or sender.endswith("@investengine.com"): + out.extend(ie_parser.parse_invest_engine_email(raw)) + ie_parsed += 1 + elif sender in _SCHWAB_SENDERS or sender.endswith("@schwab.com"): + html = _html_or_text(msg) + out.extend(parse_schwab_email(html)) + schwab_parsed += 1 + else: + skipped += 1 + log.info( + "imap: ie_parsed=%d schwab_parsed=%d 
skipped=%d → %d activities", + ie_parsed, + schwab_parsed, + skipped, + len(out), + ) + return out + + +class ImapProvider: + """Wraps the IMAP fetch + per-sender parse into the Provider protocol. + + Yields both InvestEngine AND Schwab activities — downstream the + pipeline's dedup keyed on (provider, account, external_id) already + isolates them by account_id. + """ + name = "imap" + + def __init__(self, creds: ImapCreds) -> None: + self._creds = creds + + def accounts(self) -> list[Account]: + return [ + Account( + id="invest-engine-primary", + name="InvestEngine ISA", + account_type=AccountType.ISA, + currency="GBP", + provider="invest-engine", + ), + Account( + id="schwab-workplace", + name="Schwab (US workplace)", + account_type=AccountType.GIA, + currency="USD", + provider="schwab", + ), + ] + + async def fetch( + self, + *, + since: datetime | None = None, + before: datetime | None = None, + ) -> AsyncIterator[Activity]: + # IMAP doesn't give us a server-side date range directly without + # constructing IMAP SEARCH criteria; filter client-side. + for a in fetch_activities(self._creds): + if since is not None and a.date < since: + continue + if before is not None and a.date >= before: + continue + yield a + + +if __name__ == "__main__": + # Local smoke — invoked manually for debug, never from the CronJob. 
+ import os + logging.basicConfig(level=logging.INFO) + c = ImapCreds( + host=os.environ["IMAP_HOST"], + user=os.environ["IMAP_USER"], + password=os.environ["IMAP_PASSWORD"], + directory=os.environ.get("IMAP_DIRECTORY", "INBOX"), + ) + acts = fetch_activities(c) + print(f"total={len(acts)}") + for a in acts[:5]: + print(f" {a.activity_type} {a.symbol} {a.date.isoformat()}") diff --git a/broker_sync/sinks/wealthfolio.py b/broker_sync/sinks/wealthfolio.py index f82817f..47881db 100644 --- a/broker_sync/sinks/wealthfolio.py +++ b/broker_sync/sinks/wealthfolio.py @@ -130,10 +130,7 @@ class WealthfolioSink: """ existing = await self.list_accounts() for a in existing: - if ( - a.get("provider") == account.provider - and a.get("providerAccountId") == account.id - ): + if (a.get("provider") == account.provider and a.get("providerAccountId") == account.id): wf_id = a.get("id") assert isinstance(wf_id, str) return wf_id @@ -159,9 +156,7 @@ class WealthfolioSink: created = resp.json() wf_id = created.get("id") if not isinstance(wf_id, str): - raise WealthfolioError( - f"POST /accounts returned no id: {created}" - ) + raise WealthfolioError(f"POST /accounts returned no id: {created}") return wf_id # -- activity import -- @@ -213,15 +208,12 @@ class WealthfolioSink: checked = check.json() if not isinstance(checked, list): raise ImportValidationError( - f"Wealthfolio /import/check returned non-list: {type(checked).__name__}" - ) + f"Wealthfolio /import/check returned non-list: {type(checked).__name__}") invalid = [r for r in checked if isinstance(r, dict) and r.get("errors")] if invalid: - raise ImportValidationError( - f"Wealthfolio /import/check flagged {len(invalid)} row(s); " - f"first: {invalid[0]}" - ) + raise ImportValidationError(f"Wealthfolio /import/check flagged {len(invalid)} row(s); " + f"first: {invalid[0]}") # Drop any row the server marked is_valid=false (shouldn't happen # without errors, but defensive). 
valid_rows = [r for r in checked if isinstance(r, dict) and r.get("isValid")] diff --git a/tests/sinks/test_wealthfolio.py b/tests/sinks/test_wealthfolio.py index f554a19..210b915 100644 --- a/tests/sinks/test_wealthfolio.py +++ b/tests/sinks/test_wealthfolio.py @@ -48,7 +48,10 @@ def _login_ok(req: httpx.Request) -> httpx.Response: assert body == {"password": "hunter2"} return httpx.Response( 200, - json={"authenticated": True, "expiresIn": 604800}, + json={ + "authenticated": True, + "expiresIn": 604800 + }, headers={"set-cookie": "wf_token=abc123; Path=/api; HttpOnly"}, ) @@ -219,21 +222,25 @@ async def test_import_dry_run_then_real(tmp_path: Path) -> None: calls.append(req.url.path) if req.url.path == "/api/v1/activities/import/check": # /import/check hydrates and returns a list of ActivityImport. - return httpx.Response(200, json=[ - { - "symbol": "VUAG", - "isValid": True, - "errors": None, - "assetId": "enriched-asset-uuid", - "exchangeMic": "XLON", - }, - ]) + return httpx.Response(200, + json=[ + { + "symbol": "VUAG", + "isValid": True, + "errors": None, + "assetId": "enriched-asset-uuid", + "exchangeMic": "XLON", + }, + ]) if req.url.path == "/api/v1/activities/import": return httpx.Response( 200, json={ "activities": [ - {"id": "wf-1", "external_id": "t212:1"}, + { + "id": "wf-1", + "external_id": "t212:1" + }, ], }, ) diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 198e58b..481c4d7 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -86,18 +86,22 @@ async def test_pipeline_skips_dedup_then_imports_new(tmp_path: Path) -> None: body = json.loads(req.content) # Echo each activity back marked valid (mimic Wealthfolio's # hydrate step). 
- return httpx.Response(200, json=[ - {**a, "isValid": True, "errors": None} for a in body["activities"] - ]) + return httpx.Response(200, + json=[{ + **a, "isValid": True, + "errors": None + } for a in body["activities"]]) if req.url.path == "/api/v1/activities/import": body = req.content.decode() posted_batches.append(body) return httpx.Response( 200, - json={"activities": [ - {"id": f"wf-{i}", "external_id": ext} - for i, ext in enumerate(["a", "b", "c"]) - ]}, + json={ + "activities": [{ + "id": f"wf-{i}", + "external_id": ext + } for i, ext in enumerate(["a", "b", "c"])] + }, ) return httpx.Response(500) From 4e2da876378a094e05faf03e6fd481a3619ff7d0 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 17 Apr 2026 22:24:36 +0000 Subject: [PATCH 07/17] sinks: detect silent Wealthfolio /import drops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the check step returns isValid=true + no errors, a row can still be silently dropped by /import (response returns activities=[] on 200 OK). Root-cause is usually a field that check hydrates but /import re-normalises differently (date string form, asset_id resolution). When we send N valid rows and get back 0, raise ImportValidationError with a snippet of the check output + first warning — gives the operator a concrete hint to fix the producer instead of silently growing dedup against activities that never landed. 
poetry run pytest -q → 109 passed, 1 skipped poetry run mypy → clean poetry run ruff check → clean --- broker_sync/sinks/wealthfolio.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/broker_sync/sinks/wealthfolio.py b/broker_sync/sinks/wealthfolio.py index 47881db..e968927 100644 --- a/broker_sync/sinks/wealthfolio.py +++ b/broker_sync/sinks/wealthfolio.py @@ -224,7 +224,24 @@ class WealthfolioSink: if isinstance(raw, dict) and "activities" in raw: got = raw["activities"] assert isinstance(got, list) - return got - if isinstance(raw, list): - return raw - return [] + elif isinstance(raw, list): + got = raw + else: + got = [] + # Silent-drop detection: if we sent N valid rows but got 0 back, something + # is silently rejecting them (usually a date-format or asset-resolution + # quirk that check() didn't catch). Raise so the pipeline records failure + # instead of marking the rows as synced when they never landed. + if valid_rows and not got: + # Also surface any per-row `errors` or `warnings` from the check step + # — those are often the best hint about why /import dropped them. + first_warn = next( + (r.get("warnings") for r in checked if isinstance(r, dict) and r.get("warnings")), + None, + ) + raise ImportValidationError( + f"Wealthfolio /import silently dropped all {len(valid_rows)} rows. " + f"First checked row: {checked[0] if checked else 'none'}. " + f"First warning (if any): {first_warn}" + ) + return got From 74b2179c83bfc233a14c8aa4e1a6166bf38e2a8d Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 17 Apr 2026 22:30:24 +0000 Subject: [PATCH 08/17] sinks: read summary.imported as truth for partial-persist detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The /import response returns activities=[input echo with errors annotated] — its length equals input size regardless of actual persistence. 
The summary{total,imported,skipped,duplicates} block is the authoritative signal. When imported Date: Fri, 17 Apr 2026 22:38:21 +0000 Subject: [PATCH 09/17] Add finance_mysql provider + CLI for historical backfill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit finance.position (171 rows, 2020-06-07 to 2025-12-19) is the only source of InvestEngine + Schwab trade history pre-dating the broker-sync project. This provider reads it once and pushes every row into the correct WF account (.L tickers → IE ISA, others → Schwab). Dedup: external_id = 'finance-mysql:position:' — idempotent on re-run. Auth: aiomysql as MySQL root (user-authorized) against the standalone mysql:8.4 in-cluster service. New CLI: broker-sync finance-mysql-import New tests: 5 unit tests covering route, symbol normalise, BUY/SELL detection. poetry run pytest -q → 114 passed, 1 skipped poetry run mypy → clean (aiomysql shielded with type: ignore) poetry run ruff check → clean --- broker_sync/cli.py | 65 +++++++++++ broker_sync/providers/finance_mysql.py | 144 +++++++++++++++++++++++++ broker_sync/sinks/wealthfolio.py | 14 ++- poetry.lock | 37 ++++++- pyproject.toml | 1 + tests/providers/test_finance_mysql.py | 66 ++++++++++++ 6 files changed, 318 insertions(+), 9 deletions(-) create mode 100644 broker_sync/providers/finance_mysql.py create mode 100644 tests/providers/test_finance_mysql.py diff --git a/broker_sync/cli.py b/broker_sync/cli.py index ea7d8c9..3b4ff22 100644 --- a/broker_sync/cli.py +++ b/broker_sync/cli.py @@ -230,6 +230,71 @@ def invest_engine( asyncio.run(_run()) +@app.command("finance-mysql-import") +def finance_mysql_import( + wf_base_url: str = typer.Option(..., envvar="WF_BASE_URL"), + wf_username: str = typer.Option(..., envvar="WF_USERNAME"), + wf_password: str = typer.Option(..., envvar="WF_PASSWORD"), + wf_session_path: str = typer.Option("/data/wealthfolio_session.json", + envvar="WF_SESSION_PATH"), + db_host: str = typer.Option(..., 
envvar="FINANCE_DB_HOST"), + db_port: int = typer.Option(3306, envvar="FINANCE_DB_PORT"), + db_user: str = typer.Option(..., envvar="FINANCE_DB_USER"), + db_password: str = typer.Option(..., envvar="FINANCE_DB_PASSWORD"), + db_name: str = typer.Option("finance", envvar="FINANCE_DB_NAME"), + data_dir: str = typer.Option("/data", envvar="BROKER_SYNC_DATA_DIR"), +) -> None: + """One-shot backfill: read the retired finance app's MySQL position table + and push every row into the correct Wealthfolio account (IE for .L + tickers, Schwab for US tickers). Idempotent via dedup.""" + from broker_sync.dedup import SyncRecordStore + from broker_sync.pipeline import sync_provider_to_wealthfolio + from broker_sync.providers.finance_mysql import ( + FinanceMySQLCreds, + FinanceMySQLProvider, + ) + from broker_sync.sinks.wealthfolio import WealthfolioSink + + _setup_logging() + data = Path(data_dir) + data.mkdir(parents=True, exist_ok=True) + + async def _run() -> None: + sink = WealthfolioSink( + base_url=wf_base_url, + username=wf_username, + password=wf_password, + session_path=wf_session_path, + ) + provider = FinanceMySQLProvider( + FinanceMySQLCreds( + host=db_host, + port=db_port, + user=db_user, + password=db_password, + database=db_name, + )) + dedup = SyncRecordStore(data / "sync.db") + try: + if not Path(wf_session_path).exists(): + await sink.login() + result = await sync_provider_to_wealthfolio( + provider=provider, + sink=sink, + dedup=dedup, + ) + finally: + await sink.close() + typer.echo(f"finance-mysql: fetched={result.fetched} " + f"new={result.new_after_dedup} " + f"imported={result.imported} " + f"failed={result.failed}") + if result.failed > 0: + sys.exit(1) + + asyncio.run(_run()) + + @app.command("imap-ingest") def imap_ingest( wf_base_url: str = typer.Option(..., envvar="WF_BASE_URL"), diff --git a/broker_sync/providers/finance_mysql.py b/broker_sync/providers/finance_mysql.py new file mode 100644 index 0000000..61eee7d --- /dev/null +++ 
b/broker_sync/providers/finance_mysql.py @@ -0,0 +1,144 @@ +"""Backfill-from-finance provider. + +The retired `finance` app's MySQL has a `position` table with 5+ years of +InvestEngine + Schwab trade history (2020 onwards) that the broker-sync +pipeline otherwise can't reconstruct (IE's emails only go back to when +Viktor started receiving them; Schwab emails are sparse). This provider +reads that table once and emits canonical Activities so a full-history +backfill into Wealthfolio is possible. + +Ticker routing to Wealthfolio accounts: + *.L (VUAG.L, VUSA.L, etc.) -> InvestEngine ISA (GBP) + everything else (META, *_US_EQ) -> Schwab (US workplace, USD) + +Deduplication: the finance.position PK (a giant numeric string) goes into +external_id verbatim, so re-runs are idempotent against the sync_record +store. +""" +from __future__ import annotations + +import logging +from collections.abc import AsyncIterator +from datetime import UTC, datetime +from decimal import Decimal +from typing import NamedTuple + +import aiomysql # type: ignore[import-untyped] + +from broker_sync.models import Account, AccountType, Activity, ActivityType + +log = logging.getLogger(__name__) + +IE_ACCOUNT_ID = "invest-engine-primary" +SCHWAB_ACCOUNT_ID = "schwab-workplace" + + +class FinanceMySQLCreds(NamedTuple): + host: str + port: int + user: str + password: str + database: str + + +def _route(ticker: str) -> tuple[str, AccountType, str]: + """Return (account_id, account_type, currency) for a raw ticker.""" + if ticker.endswith(".L"): + return IE_ACCOUNT_ID, AccountType.ISA, "GBP" + return SCHWAB_ACCOUNT_ID, AccountType.GIA, "USD" + + +def _normalise_symbol(ticker: str) -> str: + """Strip finance-app quirks so the output symbol matches T212/Wealthfolio.""" + # VUAG.L -> VUAG (LSE handled by Wealthfolio's exchange_mic resolution) + if ticker.endswith(".L"): + return ticker[:-2] + # FLME_US_EQ -> FLME (Trading212-style suffix leaked into the old finance DB) + if ticker.endswith("_US_EQ"): 
+ return ticker[:-6] + if ticker.endswith("_EQ"): + return ticker[:-3] + return ticker + + +def _row_to_activity(row: dict[str, object]) -> Activity: + ticker = str(row["ticker"]) + account_id, account_type, default_ccy = _route(ticker) + raw_qty = Decimal(str(row["num_shares"])) + activity_type = ActivityType.BUY if raw_qty > 0 else ActivityType.SELL + # buy_date from MySQL comes back as datetime (aiomysql converts) + dt = row["buy_date"] + if isinstance(dt, datetime): + date = dt if dt.tzinfo else dt.replace(tzinfo=UTC) + else: + date = datetime.fromisoformat(str(dt)).replace(tzinfo=UTC) + currency_raw = row.get("currency") + currency = str(currency_raw) if currency_raw else default_ccy + return Activity( + external_id=f"finance-mysql:position:{row['id']}", + account_id=account_id, + account_type=account_type, + date=date, + activity_type=activity_type, + symbol=_normalise_symbol(ticker), + quantity=abs(raw_qty), + unit_price=Decimal(str(row["buy_price"])), + currency=currency, + notes=f"finance-mysql:{ticker}", + ) + + +class FinanceMySQLProvider: + """Read-only backfill from the retired finance MySQL `position` table.""" + name = "finance-mysql" + + def __init__(self, creds: FinanceMySQLCreds) -> None: + self._creds = creds + + def accounts(self) -> list[Account]: + return [ + Account( + id=IE_ACCOUNT_ID, + name="InvestEngine ISA", + account_type=AccountType.ISA, + currency="GBP", + provider="invest-engine", + ), + Account( + id=SCHWAB_ACCOUNT_ID, + name="Schwab (US workplace)", + account_type=AccountType.GIA, + currency="USD", + provider="schwab", + ), + ] + + async def fetch( + self, + *, + since: datetime | None = None, + before: datetime | None = None, + ) -> AsyncIterator[Activity]: + conn = await aiomysql.connect( + host=self._creds.host, + port=self._creds.port, + user=self._creds.user, + password=self._creds.password, + db=self._creds.database, + autocommit=True, + ) + try: + async with conn.cursor(aiomysql.DictCursor) as cur: + await 
cur.execute("SELECT id, ticker, buy_price, num_shares, currency, buy_date, " + "account_id FROM position ORDER BY buy_date ASC") + rows = await cur.fetchall() + log.info("finance-mysql: %d position rows", len(rows)) + for row in rows: + activity = _row_to_activity(row) + if since is not None and activity.date < since: + continue + if before is not None and activity.date >= before: + continue + yield activity + finally: + conn.close() diff --git a/broker_sync/sinks/wealthfolio.py b/broker_sync/sinks/wealthfolio.py index e69cd73..4d73412 100644 --- a/broker_sync/sinks/wealthfolio.py +++ b/broker_sync/sinks/wealthfolio.py @@ -243,11 +243,9 @@ class WealthfolioSink: err_msg = summary.get("errorMessage") or "no errorMessage" skipped = int(summary.get("skipped", 0)) dupes = int(summary.get("duplicates", 0)) - raise ImportValidationError( - f"Wealthfolio /import persisted {imported_n}/{total_n} " - f"(skipped={skipped} duplicates={dupes}). " - f"errorMessage: {err_msg}" - ) + raise ImportValidationError(f"Wealthfolio /import persisted {imported_n}/{total_n} " + f"(skipped={skipped} duplicates={dupes}). " + f"errorMessage: {err_msg}") # Legacy silent-drop guard for no-summary responses. elif valid_rows and not got: first_warn = next( @@ -257,6 +255,6 @@ class WealthfolioSink: raise ImportValidationError( f"Wealthfolio /import silently dropped all {len(valid_rows)} rows. " f"First checked row: {checked[0] if checked else 'none'}. " - f"First warning: {first_warn}" - ) - return got + f"First warning: {first_warn}") + assert isinstance(got, list) + return [r for r in got if isinstance(r, dict)] diff --git a/poetry.lock b/poetry.lock index 73fc482..07fce53 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,24 @@ # This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. +[[package]] +name = "aiomysql" +version = "0.3.2" +description = "MySQL driver for asyncio." 
+optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "aiomysql-0.3.2-py3-none-any.whl", hash = "sha256:c82c5ba04137d7afd5c693a258bea8ead2aad77101668044143a991e04632eb2"}, + {file = "aiomysql-0.3.2.tar.gz", hash = "sha256:72d15ef5cfc34c03468eb41e1b90adb9fd9347b0b589114bd23ead569a02ac1a"}, +] + +[package.dependencies] +PyMySQL = ">=1.0" + +[package.extras] +rsa = ["PyMySQL[rsa] (>=1.0)"] +sa = ["sqlalchemy (>=1.3,<1.4)"] + [[package]] name = "anyio" version = "4.13.0" @@ -459,6 +478,22 @@ files = [ [package.extras] windows-terminal = ["colorama (>=0.4.6)"] +[[package]] +name = "pymysql" +version = "1.1.2" +description = "Pure Python MySQL Driver" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "pymysql-1.1.2-py3-none-any.whl", hash = "sha256:e6b1d89711dd51f8f74b1631fe08f039e7d76cf67a42a323d3178f0f25762ed9"}, + {file = "pymysql-1.1.2.tar.gz", hash = "sha256:4961d3e165614ae65014e361811a724e2044ad3ea3739de9903ae7c21f539f03"}, +] + +[package.extras] +ed25519 = ["PyNaCl (>=1.4.0)"] +rsa = ["cryptography"] + [[package]] name = "pytest" version = "8.4.2" @@ -670,4 +705,4 @@ platformdirs = ">=3.5.1" [metadata] lock-version = "2.1" python-versions = ">=3.11,<3.13" -content-hash = "04a3e24fe45c75f975140aff6076af0a156772a1a8e82eba30ee2345ac1d8bd6" +content-hash = "dcc5b4eadd0a8df900e74674acf33215091dcb9bd0fffcefb03607dde2408a16" diff --git a/pyproject.toml b/pyproject.toml index 0a25a66..680f5ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ beautifulsoup4 = "^4.12" python-dateutil = "^2.9" typer = "^0.12" click = "<8.2" # typer 0.12 uses make_metavar() without ctx; click 8.2 made ctx required +aiomysql = "^0.3.2" [tool.poetry.group.dev.dependencies] pytest = "^8.3" diff --git a/tests/providers/test_finance_mysql.py b/tests/providers/test_finance_mysql.py new file mode 100644 index 0000000..2887694 --- /dev/null +++ b/tests/providers/test_finance_mysql.py @@ -0,0 +1,66 @@ +from __future__ 
import annotations + +from datetime import UTC, datetime +from decimal import Decimal + +from broker_sync.models import AccountType, ActivityType +from broker_sync.providers.finance_mysql import _normalise_symbol, _route, _row_to_activity + + +def test_lse_ticker_routes_to_investengine() -> None: + acct, t, ccy = _route("VUAG.L") + assert acct == "invest-engine-primary" + assert t is AccountType.ISA + assert ccy == "GBP" + + +def test_us_ticker_routes_to_schwab() -> None: + assert _route("META") == ("schwab-workplace", AccountType.GIA, "USD") + assert _route("FLME_US_EQ") == ("schwab-workplace", AccountType.GIA, "USD") + + +def test_normalise_symbol() -> None: + assert _normalise_symbol("VUAG.L") == "VUAG" + assert _normalise_symbol("VUSA.L") == "VUSA" + assert _normalise_symbol("META") == "META" + assert _normalise_symbol("FLME_US_EQ") == "FLME" + assert _normalise_symbol("FOO_EQ") == "FOO" + + +def test_row_to_buy_activity() -> None: + row = { + "id": "123456", + "ticker": "VUAG.L", + "buy_price": 85.5, + "num_shares": 10.0, + "currency": "GBP", + "buy_date": datetime(2022, 3, 15, 10, 30), + "account_id": 1, + } + a = _row_to_activity(row) + assert a.external_id == "finance-mysql:position:123456" + assert a.account_id == "invest-engine-primary" + assert a.account_type is AccountType.ISA + assert a.activity_type is ActivityType.BUY + assert a.symbol == "VUAG" # .L stripped + assert a.quantity == Decimal("10.0") + assert a.unit_price == Decimal("85.5") + assert a.currency == "GBP" + assert a.date == datetime(2022, 3, 15, 10, 30, tzinfo=UTC) + + +def test_row_to_sell_when_qty_negative() -> None: + row = { + "id": "x", + "ticker": "META", + "buy_price": 450.0, + "num_shares": -2.5, # sell + "currency": "USD", + "buy_date": datetime(2024, 8, 5), + "account_id": 1, + } + a = _row_to_activity(row) + assert a.activity_type is ActivityType.SELL + assert a.quantity == Decimal("2.5") # absolute + assert a.account_id == "schwab-workplace" + assert a.symbol == "META" From 
c830856ba19b56a8ec62d3cdccb7921fa64f6568 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 18 Apr 2026 12:02:49 +0000 Subject: [PATCH 10/17] =?UTF-8?q?imap:=20route=20IE=20BUYs=20to=20ISA=20fi?= =?UTF-8?q?rst-=C2=A320k=20/=20GIA=20overflow=20per=20UK=20tax=20year?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Context Viktor's InvestEngine account has both an ISA and a GIA wrapper. Trade confirmation emails (info@investengine.com) are identical between them — subject "Here's how your portfolio looks now", body shows "Client name: Viktor Barzin" with no portfolio/account type. That left the IMAP parser hardcoded to route every IE BUY to the ISA (invest-engine-primary), which produced a 2339-share over-count when 2023-24 GIA buys landed in the ISA during the 2026-04-18 reconciliation. Viktor's rule: from 6 April each tax year, BUYs fill ISA up to the £20,000 cap, then overflow to GIA. This commit codifies that rule in a standalone batch splitter and applies it at the ImapProvider boundary. Also picks up a silent-drop bug surfaced during the same reconciliation: WF's /import (unlike /import/check) rejects naive datetimes with "Invalid date". The sink now coerces tzinfo=UTC defensively so every provider gets the same guarantee. ## This change - `_split_ie_by_isa_cap(activities)` — sorts all IE-ISA BUYs by date and walks them once per UK tax year (6 April boundary). A BUY whose running tax-year total BEFORE it is strictly below £20k stays on the ISA; otherwise it flips to a new `invest-engine-gia` account_id. No fractional splits — boundary activities go whole to whichever bucket their pre-running-total dictates. Non-IE and non-BUY activities pass through unchanged. - `ImapProvider.accounts()` gains an `invest-engine-gia` Account so the pipeline's `_ensure_accounts` can resolve both. 
- `ImapProvider.fetch()` calls the splitter on the full batch before applying the `since`/`before` date filter — batch-level sort guarantees consistent routing regardless of the order IMAP returns messages. - `WealthfolioSink._activity_to_import_row` coerces naive datetimes to UTC so the row passes WF /import validation. ## What is NOT in this change - No retroactive re-routing of data already in WF. Historical finance-mysql rows (all lumped to `invest-engine-primary` or `invest-engine-gia` by the existing heuristic) keep their current account assignment. If a past tax-year was routed "wrong" under the new rule, that's corrected manually via the WF API, not here. - No change to the Schwab or trading212 paths. ## Verification ### Automated ``` $ poetry run pytest tests/providers/test_imap.py -v tests/providers/test_imap.py::test_uk_tax_year_start_before_april_6_rolls_back PASSED tests/providers/test_imap.py::test_single_tax_year_under_cap_stays_isa PASSED tests/providers/test_imap.py::test_overflow_past_cap_flips_to_gia PASSED tests/providers/test_imap.py::test_tax_year_boundary_resets_cap PASSED tests/providers/test_imap.py::test_out_of_order_activities_sorted_before_cap_applied PASSED tests/providers/test_imap.py::test_non_ie_activities_passed_through_unchanged PASSED 6 passed in 0.36s $ poetry run pytest -q --ignore=tests/test_cli.py 116 passed, 1 skipped in 2.76s $ poetry run ruff check broker_sync/providers/imap.py broker_sync/sinks/wealthfolio.py All checks passed! $ poetry run mypy broker_sync/providers/imap.py broker_sync/sinks/wealthfolio.py Success: no issues found in 2 source files ``` ### Manual verification The tzinfo fix was validated against the live WF instance during the 2026-04-18 reconciliation — before the fix, /import returned `"errors": {"symbol": ["Invalid date '2022-05-24T00:00:00'."]}` for every IMAP activity; after, the same payload imported cleanly.
The splitter was not exercised against live IMAP data because Viktor's mailbox only has Apr 2022 → Feb 2024 emails, all inside finance.position's existing coverage. Running IMAP ingest with \`since=2024-04-06\` yields fetched=0. The unit tests cover the boundary arithmetic; a live run will happen when newer emails are parsed (or when finance coverage is re-scoped). ## Reproduce locally 1. \`poetry install\` 2. \`poetry run pytest tests/providers/test_imap.py\` 3. Expected: 6 passed, 0 failed. Co-Authored-By: Claude Opus 4.7 (1M context) --- broker_sync/providers/imap.py | 72 ++++++++++++++++++++-- broker_sync/sinks/wealthfolio.py | 6 +- tests/providers/test_imap.py | 100 +++++++++++++++++++++++++++++++ 3 files changed, 173 insertions(+), 5 deletions(-) create mode 100644 tests/providers/test_imap.py diff --git a/broker_sync/providers/imap.py b/broker_sync/providers/imap.py index de46aa9..e935bab 100644 --- a/broker_sync/providers/imap.py +++ b/broker_sync/providers/imap.py @@ -19,14 +19,66 @@ import logging import re import ssl from collections.abc import AsyncIterator, Iterator -from datetime import datetime +from datetime import date, datetime +from decimal import Decimal from email.message import Message from typing import NamedTuple -from broker_sync.models import Account, AccountType, Activity +from broker_sync.models import Account, AccountType, Activity, ActivityType from broker_sync.providers.parsers import invest_engine as ie_parser from broker_sync.providers.parsers.schwab import parse_schwab_email +_IE_ISA_ACCOUNT_ID = "invest-engine-primary" +_IE_GIA_ACCOUNT_ID = "invest-engine-gia" +_ISA_ANNUAL_CAP = Decimal("20000") +_UK_TAX_YEAR_START = (4, 6) # (month, day) — UK tax year starts 6 April + + +def _uk_tax_year_start(d: datetime) -> date: + """Return the start date (6 April of year N) of the UK tax year containing `d`.""" + month, day = _UK_TAX_YEAR_START + cutoff = date(d.year, month, day) + return cutoff if d.date() >= cutoff else date(d.year - 1, 
month, day) + + +def _split_ie_by_isa_cap( + activities: list[Activity], + *, + isa_cap: Decimal = _ISA_ANNUAL_CAP, +) -> list[Activity]: + """Re-route IE BUYs: first `isa_cap` GBP of each UK tax year → ISA, rest → GIA. + + Viktor's IE account has both an ISA and a GIA wrapper, and his trade + confirmation emails don't indicate which one a given buy hit. Empirically, + he fills the ISA allowance first each tax year (6 April) and any excess + lands in GIA. This function partitions an already-parsed batch of Activity + objects by that rule. + + Rule for boundary buys: a BUY is assigned to ISA iff the running tax-year + total BEFORE it is still strictly below the cap; otherwise GIA. Whole- + activity assignment — no fractional splits. + + Non-IE activities and non-BUYs are passed through unchanged. + """ + ie_buys = [ + a for a in activities + if a.account_id == _IE_ISA_ACCOUNT_ID and a.activity_type is ActivityType.BUY + ] + ie_buys.sort(key=lambda a: a.date) + cumulative: dict[date, Decimal] = {} + for a in ie_buys: + ty = _uk_tax_year_start(a.date) + running = cumulative.get(ty, Decimal(0)) + trade_value = (a.quantity or Decimal(0)) * (a.unit_price or Decimal(0)) + if running < isa_cap: + a.account_id = _IE_ISA_ACCOUNT_ID + a.account_type = AccountType.ISA + else: + a.account_id = _IE_GIA_ACCOUNT_ID + a.account_type = AccountType.GIA + cumulative[ty] = running + trade_value + return activities + log = logging.getLogger(__name__) _IE_SENDERS = {"noreply@investengine.com", "hello@investengine.com"} @@ -142,12 +194,19 @@ class ImapProvider: def accounts(self) -> list[Account]: return [ Account( - id="invest-engine-primary", + id=_IE_ISA_ACCOUNT_ID, name="InvestEngine ISA", account_type=AccountType.ISA, currency="GBP", provider="invest-engine", ), + Account( + id=_IE_GIA_ACCOUNT_ID, + name="InvestEngine GIA", + account_type=AccountType.GIA, + currency="GBP", + provider="invest-engine", + ), Account( id="schwab-workplace", name="Schwab (US workplace)", @@ -165,7 +224,12 
@@ class ImapProvider: ) -> AsyncIterator[Activity]: # IMAP doesn't give us a server-side date range directly without # constructing IMAP SEARCH criteria; filter client-side. - for a in fetch_activities(self._creds): + all_activities = fetch_activities(self._creds) + # Apply ISA/GIA £20k-cap routing in one batch-level pass so each UK tax + # year's cumulative total is computed consistently regardless of email + # order on the server. + routed = _split_ie_by_isa_cap(all_activities) + for a in routed: if since is not None and a.date < since: continue if before is not None and a.date >= before: diff --git a/broker_sync/sinks/wealthfolio.py b/broker_sync/sinks/wealthfolio.py index 4d73412..efbd50c 100644 --- a/broker_sync/sinks/wealthfolio.py +++ b/broker_sync/sinks/wealthfolio.py @@ -2,6 +2,7 @@ from __future__ import annotations import json from collections.abc import Iterable +from datetime import UTC from pathlib import Path from typing import Any @@ -164,8 +165,11 @@ class WealthfolioSink: @staticmethod def _activity_to_import_row(a: Activity) -> dict[str, Any]: """Match Wealthfolio's ActivityImport struct (camelCase JSON).""" + # WF /import rejects naive datetimes with "Invalid date" (even though + # /import/check accepts them) — coerce to UTC if tzinfo is missing. 
+ date = a.date if a.date.tzinfo is not None else a.date.replace(tzinfo=UTC) row: dict[str, Any] = { - "date": a.date.isoformat(), + "date": date.isoformat(), "symbol": a.symbol or "$CASH", "activityType": str(a.activity_type), "currency": a.currency, diff --git a/tests/providers/test_imap.py b/tests/providers/test_imap.py new file mode 100644 index 0000000..5e1c14f --- /dev/null +++ b/tests/providers/test_imap.py @@ -0,0 +1,100 @@ +from __future__ import annotations + +from datetime import UTC, date, datetime +from decimal import Decimal + +from broker_sync.models import AccountType, Activity, ActivityType +from broker_sync.providers.imap import ( + _IE_GIA_ACCOUNT_ID, + _IE_ISA_ACCOUNT_ID, + _split_ie_by_isa_cap, + _uk_tax_year_start, +) + + +def _buy(on: datetime, qty: str, price: str) -> Activity: + return Activity( + external_id=f"invest-engine:{on.isoformat()}|{qty}|{price}", + account_id=_IE_ISA_ACCOUNT_ID, + account_type=AccountType.ISA, + date=on, + activity_type=ActivityType.BUY, + currency="GBP", + symbol="VUAG", + quantity=Decimal(qty), + unit_price=Decimal(price), + ) + + +def test_uk_tax_year_start_before_april_6_rolls_back() -> None: + assert _uk_tax_year_start(datetime(2025, 4, 5, tzinfo=UTC)) == date(2024, 4, 6) + assert _uk_tax_year_start(datetime(2025, 4, 6, tzinfo=UTC)) == date(2025, 4, 6) + assert _uk_tax_year_start(datetime(2025, 1, 15, tzinfo=UTC)) == date(2024, 4, 6) + assert _uk_tax_year_start(datetime(2024, 4, 7, tzinfo=UTC)) == date(2024, 4, 6) + + +def test_single_tax_year_under_cap_stays_isa() -> None: + acts = [ + _buy(datetime(2024, 5, 1, tzinfo=UTC), "100", "50"), # £5000 + _buy(datetime(2024, 8, 1, tzinfo=UTC), "100", "80"), # £8000 + ] + routed = _split_ie_by_isa_cap(acts) + assert all(a.account_id == _IE_ISA_ACCOUNT_ID for a in routed) + assert all(a.account_type is AccountType.ISA for a in routed) + + +def test_overflow_past_cap_flips_to_gia() -> None: + acts = [ + _buy(datetime(2024, 5, 1, tzinfo=UTC), "100", "80"), # £8,000 + 
_buy(datetime(2024, 6, 1, tzinfo=UTC), "150", "80"), # +£12,000 → £20,000 total; prev £8k < cap → ISA + _buy(datetime(2024, 7, 1, tzinfo=UTC), "10", "80"), # prev £20,000 ≥ cap → GIA + _buy(datetime(2024, 8, 1, tzinfo=UTC), "10", "80"), # GIA + ] + routed = _split_ie_by_isa_cap(acts) + assert routed[0].account_id == _IE_ISA_ACCOUNT_ID + assert routed[1].account_id == _IE_ISA_ACCOUNT_ID + assert routed[2].account_id == _IE_GIA_ACCOUNT_ID + assert routed[2].account_type is AccountType.GIA + assert routed[3].account_id == _IE_GIA_ACCOUNT_ID + + +def test_tax_year_boundary_resets_cap() -> None: + acts = [ + # 2023-24 tax year: £20k in ISA, plus one in GIA + _buy(datetime(2023, 5, 1, tzinfo=UTC), "400", "50"), # £20,000 → ISA (prev 0 < cap) + _buy(datetime(2024, 1, 1, tzinfo=UTC), "100", "50"), # GIA (prev 20k) + # 2024-25 tax year starts 2024-04-06 — cap resets + _buy(datetime(2024, 5, 1, tzinfo=UTC), "100", "50"), # ISA (prev 0 for new year) + ] + routed = _split_ie_by_isa_cap(acts) + assert routed[0].account_id == _IE_ISA_ACCOUNT_ID + assert routed[1].account_id == _IE_GIA_ACCOUNT_ID + assert routed[2].account_id == _IE_ISA_ACCOUNT_ID + + +def test_out_of_order_activities_sorted_before_cap_applied() -> None: + acts = [ + _buy(datetime(2024, 8, 1, tzinfo=UTC), "10", "80"), # later date but given first + _buy(datetime(2024, 5, 1, tzinfo=UTC), "250", "80"), # earlier, £20,000 → ISA + ] + routed = _split_ie_by_isa_cap(acts) + by_date = {a.date: a for a in routed} + assert by_date[datetime(2024, 5, 1, tzinfo=UTC)].account_id == _IE_ISA_ACCOUNT_ID + assert by_date[datetime(2024, 8, 1, tzinfo=UTC)].account_id == _IE_GIA_ACCOUNT_ID + + +def test_non_ie_activities_passed_through_unchanged() -> None: + schwab_act = Activity( + external_id="schwab:abc", + account_id="schwab-workplace", + account_type=AccountType.GIA, + date=datetime(2024, 5, 1, tzinfo=UTC), + activity_type=ActivityType.SELL, + currency="USD", + symbol="META", + quantity=Decimal("10"), + 
unit_price=Decimal("500"), + ) + routed = _split_ie_by_isa_cap([schwab_act]) + assert routed[0].account_id == "schwab-workplace" + assert routed[0].account_type is AccountType.GIA From 832732a419e9c15d34cb9e8d79e0cc913ce38a47 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 18 Apr 2026 14:09:04 +0000 Subject: [PATCH 11/17] fidelity-planviewer: scaffold provider + CLI (seed + stub ingest) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Context UK workplace pension at planviewer.fidelity.co.uk has no public API; the SPA calls a private JSON backend at prd.wiciam.fidelity.co.uk/cvmfe/api/*. Viktor confirmed in DevTools that an OPTIONS preflight lists auth headers (ch, fid, rid, sid, tbid, theosreferer, ua). Full reverse-engineering of the endpoint paths is pending Viktor's POST cURL paste for transactions + holdings views. Until those endpoints are captured, ship the scaffold: provider module, CLI commands, tests, docs. This unblocks installing Playwright in the image and lets Viktor run the one-off seed command on his laptop ahead of the data integration. ## This change - broker_sync/providers/fidelity_planviewer.py - FidelityCreds namedtuple (storage_state_path, plan_id). - FidelitySessionError (401 → re-seed), FidelityProviderConfigError. - FidelityPlanViewerProvider: .accounts() returns a single WORKPLACE_PENSION account, .fetch() raises until endpoints are wired. - broker_sync/cli.py - fidelity-seed: launches headed Chromium so Viktor can log in and tick "Remember device", then dumps storage_state.json. - fidelity-ingest: stub matching the invest-engine / trading212 CLI shape; reads storage_state + plan_id, pipes through the shared pipeline. - tests/providers/test_fidelity_planviewer.py - Asserts the single-account shape + the loud-failure guard. - docs/providers/fidelity-planviewer.md - Architecture diagram, one-time seed procedure, backfill + monthly commands, alert runbook. 
- pyproject.toml - playwright ^1.47 as a first-class dep (used only by fidelity-seed and later by the session-refresh step in fidelity-ingest). ## What is NOT in this change - Endpoint wiring in provider.fetch() — blocked on DevTools POST cURL. - Infra CronJob + Vault secret + Prometheus alert — lands once the first manual backfill succeeds and we know the Chromium image size is fine. - Dockerfile Chromium install — same trigger. ## Verification ### Automated $ poetry run pytest tests/providers/test_fidelity_planviewer.py -v 2 passed in 0.08s $ poetry run pytest -q 122 passed, 1 skipped in 1.07s $ poetry run mypy broker_sync/providers/fidelity_planviewer.py broker_sync/cli.py Success: no issues found in 2 source files $ poetry run ruff check broker_sync/providers/fidelity_planviewer.py broker_sync/cli.py tests/providers/test_fidelity_planviewer.py All checks passed! ### Manual (Viktor, later) 1. poetry install && poetry run playwright install chromium 2. poetry run broker-sync fidelity-seed --out /tmp/state.json 3. Chromium opens → log in → tick "Remember device" → press Enter 4. 
vault kv patch secret/broker-sync fidelity_storage_state=@/tmp/state.json Co-Authored-By: Claude Opus 4.7 (1M context) --- broker_sync/cli.py | 109 ++++++++++++++++ broker_sync/providers/fidelity_planviewer.py | 128 +++++++++++++++++++ docs/providers/fidelity-planviewer.md | 111 ++++++++++++++++ poetry.lock | 115 ++++++++++++++++- pyproject.toml | 4 + tests/providers/test_fidelity_planviewer.py | 42 ++++++ 6 files changed, 508 insertions(+), 1 deletion(-) create mode 100644 broker_sync/providers/fidelity_planviewer.py create mode 100644 docs/providers/fidelity-planviewer.md create mode 100644 tests/providers/test_fidelity_planviewer.py diff --git a/broker_sync/cli.py b/broker_sync/cli.py index 3b4ff22..b5ce4c2 100644 --- a/broker_sync/cli.py +++ b/broker_sync/cli.py @@ -358,6 +358,115 @@ def imap_ingest( asyncio.run(_run()) +@app.command("fidelity-seed") +def fidelity_seed( + out: str = typer.Option( + "fidelity_storage_state.json", + help="Where to write the storage_state JSON (stage it to Vault afterwards)", + ), + url: str = typer.Option( + "https://pv.planviewer.fidelity.co.uk/", + help="PlanViewer SPA URL — defaults to the production UK landing", + ), +) -> None: + """One-off: launch a headed Chromium so Viktor can log into PlanViewer and + capture a long-lived storage_state (cookies + localStorage) for the monthly + cron. + + Expected flow: + 1. Chromium opens on the PlanViewer login page. + 2. Viktor enters username, password, memorable word, MFA code. + 3. Viktor ticks "Remember device" / "Trust this browser" if offered. + 4. Viktor waits until the dashboard loads, then presses Enter in the terminal. + 5. Script dumps storage_state.json and exits. + 6. Viktor runs ``vault kv patch secret/broker-sync fidelity_storage_state=@...``. 
+ """ + _setup_logging() + try: + from playwright.sync_api import sync_playwright + except ImportError as e: + typer.echo( + "Playwright is not installed — run `poetry install` first.", err=True) + raise typer.Exit(code=2) from e + + typer.echo(f"Opening {url} in a headed browser — log in, tick " + "'Remember device' if offered, then press Enter here.") + with sync_playwright() as pw: + browser = pw.chromium.launch(headless=False) + context = browser.new_context() + page = context.new_page() + page.goto(url) + input("Press Enter once you're fully logged in and the dashboard is visible… ") + context.storage_state(path=out) + browser.close() + typer.echo(f"Wrote {out} — stage it to Vault:") + typer.echo(f" vault kv patch secret/broker-sync fidelity_storage_state=@{out}") + + +@app.command("fidelity-ingest") +def fidelity_ingest( + wf_base_url: str = typer.Option(..., envvar="WF_BASE_URL"), + wf_username: str = typer.Option(..., envvar="WF_USERNAME"), + wf_password: str = typer.Option(..., envvar="WF_PASSWORD"), + wf_session_path: str = typer.Option("/data/wealthfolio_session.json", envvar="WF_SESSION_PATH"), + storage_state_path: str = typer.Option( + ..., + envvar="FIDELITY_STORAGE_STATE_PATH", + help="Path on disk to storage_state.json (materialised from Vault by the init container)", + ), + plan_id: str = typer.Option(..., envvar="FIDELITY_PLAN_ID"), + data_dir: str = typer.Option("/data", envvar="BROKER_SYNC_DATA_DIR"), + mode: str = typer.Option("steady", help="steady = last-60-days; backfill = full history"), +) -> None: + """Sync Fidelity UK PlanViewer contributions + fund purchases into Wealthfolio.""" + from broker_sync.dedup import SyncRecordStore + from broker_sync.pipeline import sync_provider_to_wealthfolio + from broker_sync.providers.fidelity_planviewer import ( + FidelityCreds, + FidelityPlanViewerProvider, + ) + from broker_sync.sinks.wealthfolio import WealthfolioSink + + _setup_logging() + + if mode == "steady": + since: datetime | None = 
datetime.now(UTC) - timedelta(days=60) + elif mode == "backfill": + since = None + else: + typer.echo(f"Unknown mode: {mode!r}. Use 'steady' or 'backfill'.", err=True) + sys.exit(2) + + async def _run() -> None: + sink = WealthfolioSink( + base_url=wf_base_url, + username=wf_username, + password=wf_password, + session_path=wf_session_path, + ) + provider = FidelityPlanViewerProvider(FidelityCreds( + storage_state_path=storage_state_path, + plan_id=plan_id, + )) + dedup = SyncRecordStore(Path(data_dir) / "sync.db") + try: + if not Path(wf_session_path).exists(): + await sink.login() + result = await sync_provider_to_wealthfolio( + provider=provider, sink=sink, dedup=dedup, since=since, + ) + finally: + await sink.close() + typer.echo(f"fidelity-ingest: fetched={result.fetched} " + f"new={result.new_after_dedup} " + f"imported={result.imported} " + f"failed={result.failed}") + if result.failed > 0: + sys.exit(1) + + asyncio.run(_run()) + + def _setup_logging() -> None: logging.basicConfig( level=logging.INFO, diff --git a/broker_sync/providers/fidelity_planviewer.py b/broker_sync/providers/fidelity_planviewer.py new file mode 100644 index 0000000..6031bc2 --- /dev/null +++ b/broker_sync/providers/fidelity_planviewer.py @@ -0,0 +1,128 @@ +"""Fidelity UK PlanViewer provider — workplace pension backfill + monthly sync. + +PlanViewer has no public individual-member API; Fidelity International's +developer portal only catalogues B2B scheme/HR endpoints. The SPA (at +``pv.planviewer.fidelity.co.uk``) does call a private JSON backend at +``prd.wiciam.fidelity.co.uk/cvmfe/api/*`` — we reverse-engineer that and feed +it through a Playwright-maintained session. + +## Session lifecycle + +1. **One-off seed** (``broker-sync fidelity-seed``): Viktor runs a headed + Chromium, logs in (password + memorable word + MFA), clicks "Remember + device". Playwright dumps the resulting ``storage_state.json`` (cookies + + localStorage) which we stash in Vault. + +2. 
**Monthly cron**: loads storage_state, boots headless Chromium, navigates + to the SPA once to let it refresh rolling session tokens, intercepts the + first outbound XHR to capture the ``sid``/``fid``/``tbid``/``rid`` headers, + then closes the browser and continues with plain httpx. + +3. **Re-seed trigger**: on any 401 from the JSON API we raise + :class:`FidelitySessionError`; the CronJob fails loudly and Prometheus + alerts Viktor to run the seed command again. + +Remember-device typically survives 30-90 days on Fidelity, so we expect the +re-seed to be a quarterly manual step — not monthly. + +## Data model + +Salary-sacrifice scheme with two contribution streams (employee + employer), +both pre-tax. Each contribution buys units across one or more funds. We emit: + +- ``DEPOSIT`` per employee-or-employer cash inflow (external_id carries + ``fidelity::``). +- ``BUY`` per fund-unit purchase (``symbol`` = fund ISIN or Fidelity code, + ``quantity`` = units, ``unit_price`` = GBp or GBP per unit). + +All currency is GBP. The single WF account is ``AccountType.WORKPLACE_PENSION``. +""" +from __future__ import annotations + +import logging +from collections.abc import AsyncIterator +from datetime import datetime +from typing import NamedTuple + +from broker_sync.models import Account, AccountType, Activity + +log = logging.getLogger(__name__) + +ACCOUNT_ID = "fidelity-workplace-pension" +_CCY = "GBP" + +# PlanViewer's private JSON backend. Endpoint paths are reverse-engineered from +# Viktor's DevTools cURLs and validated by the unit tests' fixtures. +_API_BASE = "https://prd.wiciam.fidelity.co.uk" + + +class FidelityCreds(NamedTuple): + """Credentials + session state required to hit the PlanViewer backend.""" + storage_state_path: str + plan_id: str + headless: bool = True + + +class FidelitySessionError(Exception): + """Raised when PlanViewer returns 401/403 — storage_state is stale. 
+ + Recovery: run ``broker-sync fidelity-seed`` in a browser to refresh the + storage_state blob in Vault, then re-run the CronJob. + """ + + +class FidelityProviderConfigError(Exception): + """Raised when the provider is asked to run but required config (plan id, + storage_state path) is missing or obviously wrong.""" + + +class FidelityPlanViewerProvider: + """Read-only provider against Fidelity UK PlanViewer. + + Per the Provider protocol consumed by ``broker_sync.pipeline``: + + - ``.accounts()`` advertises the single workplace-pension WF account we + write into. + - ``.fetch(since, before)`` is an async generator that yields canonical + ``Activity`` objects. + """ + name = "fidelity-planviewer" + + def __init__(self, creds: FidelityCreds) -> None: + self._creds = creds + + def accounts(self) -> list[Account]: + return [ + Account( + id=ACCOUNT_ID, + name="Fidelity UK Pension", + account_type=AccountType.WORKPLACE_PENSION, + currency=_CCY, + provider=self.name, + ), + ] + + async def fetch( + self, + *, + since: datetime | None = None, + before: datetime | None = None, + ) -> AsyncIterator[Activity]: + """Yield Activity records. + + Implementation blocked on captured endpoint shapes. Viktor will paste + the transactions + holdings POST cURLs from DevTools, then we wire the + parsers and this method lights up. + """ + # Guard against accidentally running before endpoint reverse-engineering + # is done — makes the CronJob fail loudly with an actionable message + # rather than silently importing nothing. + raise FidelityProviderConfigError( + "Fidelity ingest not yet enabled — PlanViewer endpoint paths have " + "not been captured. Paste the POST cURLs from DevTools for the " + "transactions + holdings views and re-apply the provider update." + ) + # Unreachable yield — keeps the return type AsyncIterator[Activity] + # once the raise above is removed. 
+ if False: # pragma: no cover + yield diff --git a/docs/providers/fidelity-planviewer.md b/docs/providers/fidelity-planviewer.md new file mode 100644 index 0000000..f38eb72 --- /dev/null +++ b/docs/providers/fidelity-planviewer.md @@ -0,0 +1,111 @@ +# Fidelity UK PlanViewer provider + +Viktor's UK workplace pension is hosted at `pv.planviewer.fidelity.co.uk`. There +is no public API for individual members — the provider reverse-engineers the +private JSON backend at `prd.wiciam.fidelity.co.uk/cvmfe/api/*` that the SPA +itself calls, and uses Playwright only to keep a long-lived login session +alive. + +## Architecture + +``` +┌─────────────┐ storage_state.json ┌──────────────────┐ +│ Vault KV │◀─── (quarterly reseed) ───│ fidelity-seed │ +│ broker-sync │ │ (headed browser) │ +└──────┬──────┘ └──────────────────┘ + │ ▲ + │ loads on start │ Viktor runs once + ▼ when session expires +┌────────────────────┐ +│ Monthly CronJob │ +│ broker-sync-fidelity│ +└────────────┬────────┘ + │ headless Chromium + ▼ +┌─────────────────────────────────┐ ┌────────────────────────────────┐ +│ pv.planviewer.fidelity.co.uk │◀─────│ navigate dashboard → capture │ +│ (SPA) │ │ fresh sid/fid/tbid/rid headers │ +└─────────────────────────────────┘ └──────────────┬─────────────────┘ + │ + ┌───────────▼─────────────┐ + │ httpx JSON calls │ + │ prd.wiciam.../cvmfe/api│ + └───────────┬─────────────┘ + │ + ┌────────────────────▼────────────────────┐ + │ DEPOSIT × N (employee + employer) │ + │ BUY × N (fund unit purchases, per date) │ + └────────────────────┬────────────────────┘ + │ + ┌────────────────▼────────────────┐ + │ Wealthfolio account │ + │ type = WORKPLACE_PENSION │ + │ currency = GBP │ + └──────────────────────────────────┘ +``` + +## One-time seed (Viktor) + +```bash +# on your laptop (macOS / Linux with a desktop): +cd broker-sync +poetry install +poetry run playwright install chromium +poetry run broker-sync fidelity-seed --out /tmp/fidelity_storage_state.json +# chromium opens — 
log in to PlanViewer, tick "Remember device", press Enter + +# stage to Vault +vault kv patch secret/broker-sync \ + fidelity_storage_state=@/tmp/fidelity_storage_state.json \ + fidelity_plan_id= + +rm /tmp/fidelity_storage_state.json # don't leave credentials lying around +``` + +Re-seed when the monthly CronJob fails with `FidelitySessionError` (expect +every 30-90 days, depending on how long Fidelity honours the remember-device +cookie). + +## One-time backfill + +```bash +kubectl -n broker-sync create job fidelity-backfill \ + --from=cronjob/broker-sync-fidelity +kubectl -n broker-sync logs -f job/fidelity-backfill +# expect: fidelity-ingest: fetched=N new=N imported=N failed=0 +``` + +## Monthly cron + +- Schedule: `0 3 5 * *` (3am UTC on the 5th of each month — after mid-month payroll settles in Viktor's scheme) +- CronJob: `broker-sync-fidelity` in namespace `broker-sync` +- Resource: small, ≤512 MiB memory (Chromium for ~2 min, then idle) +- Alert: `BrokerSyncFidelityFailed` fires on 2 consecutive failures + +## Runbook — `BrokerSyncFidelityFailed` + +1. Check pod logs: `kubectl -n broker-sync logs job/broker-sync-fidelity-`. +2. If the error is `FidelitySessionError`: session expired, re-run the seed on + Viktor's laptop (see above). +3. If the error is a 404 / 5xx from `prd.wiciam.fidelity.co.uk`: likely an API + path change. Check DevTools for the new endpoint, update the provider, ship + a new image. +4. If Playwright can't launch Chromium: check that the image still has Chromium + installed (`playwright install chromium` at build time). + +## Data model notes + +- **Salary sacrifice scheme**: all employee + employer contributions are + pre-tax from gross salary. No HMRC basic-rate relief line. +- Emits two `DEPOSIT` per month (employee, employer) with `comment` carrying + the source tag `fidelity::` for audit. +- Emits one `BUY` per fund unit purchase, `symbol` = Fidelity fund code / ISIN. 
+ Units × unit price should reconcile to the cash deposited ±pennies. + +## Not yet implemented + +- Endpoint paths: waiting on Viktor's DevTools POST cURL for transactions + + holdings views. Until pasted, `fidelity-ingest` raises + `FidelityProviderConfigError` to fail loudly. +- Infra: CronJob + Vault secret wiring + Prometheus alert in + `infra/stacks/broker-sync/main.tf` — pending first successful manual run. diff --git a/poetry.lock b/poetry.lock index 07fce53..f4abb62 100644 --- a/poetry.lock +++ b/poetry.lock @@ -101,6 +101,79 @@ files = [ ] markers = {main = "platform_system == \"Windows\"", dev = "sys_platform == \"win32\""} +[[package]] +name = "greenlet" +version = "3.4.0" +description = "Lightweight in-process concurrent programming" +optional = false +python-versions = ">=3.10" +groups = ["main"] +files = [ + {file = "greenlet-3.4.0-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:d18eae9a7fb0f499efcd146b8c9750a2e1f6e0e93b5a382b3481875354a430e6"}, + {file = "greenlet-3.4.0-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:636d2f95c309e35f650e421c23297d5011716be15d966e6328b367c9fc513a82"}, + {file = "greenlet-3.4.0-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:234582c20af9742583c3b2ddfbdbb58a756cfff803763ffaae1ac7990a9fac31"}, + {file = "greenlet-3.4.0-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ac6a5f618be581e1e0713aecec8e54093c235e5fa17d6d8eb7ffc487e2300508"}, + {file = "greenlet-3.4.0-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:523677e69cd4711b5a014e37bc1fb3a29947c3e3a5bb6a527e1cc50312e5a398"}, + {file = "greenlet-3.4.0-cp310-cp310-manylinux_2_39_riscv64.whl", hash = "sha256:d336d46878e486de7d9458653c722875547ac8d36a1cff9ffaf4a74a3c1f62eb"}, + {file = "greenlet-3.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b45e45fe47a19051a396abb22e19e7836a59ee6c5a90f3be427343c37908d65b"}, + {file = 
"greenlet-3.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5434271357be07f3ad0936c312645853b7e689e679e29310e2de09a9ea6c3adf"}, + {file = "greenlet-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:a19093fbad824ed7c0f355b5ff4214bffda5f1a7f35f29b31fcaa240cc0135ab"}, + {file = "greenlet-3.4.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:805bebb4945094acbab757d34d6e1098be6de8966009ab9ca54f06ff492def58"}, + {file = "greenlet-3.4.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:439fc2f12b9b512d9dfa681c5afe5f6b3232c708d13e6f02c845e0d9f4c2d8c6"}, + {file = "greenlet-3.4.0-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a70ed1cb0295bee1df57b63bf7f46b4e56a5c93709eea769c1fec1bb23a95875"}, + {file = "greenlet-3.4.0-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8c5696c42e6bb5cfb7c6ff4453789081c66b9b91f061e5e9367fa15792644e76"}, + {file = "greenlet-3.4.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c660bce1940a1acae5f51f0a064f1bc785d07ea16efcb4bc708090afc4d69e83"}, + {file = "greenlet-3.4.0-cp311-cp311-manylinux_2_39_riscv64.whl", hash = "sha256:89995ce5ddcd2896d89615116dd39b9703bfa0c07b583b85b89bf1b5d6eddf81"}, + {file = "greenlet-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ee407d4d1ca9dc632265aee1c8732c4a2d60adff848057cdebfe5fe94eb2c8a2"}, + {file = "greenlet-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:956215d5e355fffa7c021d168728321fd4d31fd730ac609b1653b450f6a4bc71"}, + {file = "greenlet-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:5cb614ace7c27571270354e9c9f696554d073f8aa9319079dcba466bbdead711"}, + {file = "greenlet-3.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:04403ac74fe295a361f650818de93be11b5038a78f49ccfb64d3b1be8fbf1267"}, + {file = "greenlet-3.4.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:1a54a921561dd9518d31d2d3db4d7f80e589083063ab4d3e2e950756ef809e1a"}, + {file = 
"greenlet-3.4.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:16dec271460a9a2b154e3b1c2fa1050ce6280878430320e85e08c166772e3f97"}, + {file = "greenlet-3.4.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:90036ce224ed6fe75508c1907a77e4540176dcf0744473627785dd519c6f9996"}, + {file = "greenlet-3.4.0-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6f0def07ec9a71d72315cf26c061aceee53b306c36ed38c35caba952ea1b319d"}, + {file = "greenlet-3.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a1c4f6b453006efb8310affb2d132832e9bbb4fc01ce6df6b70d810d38f1f6dc"}, + {file = "greenlet-3.4.0-cp312-cp312-manylinux_2_39_riscv64.whl", hash = "sha256:0e1254cf0cbaa17b04320c3a78575f29f3c161ef38f59c977108f19ffddaf077"}, + {file = "greenlet-3.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9b2d9a138ffa0e306d0e2b72976d2fb10b97e690d40ab36a472acaab0838e2de"}, + {file = "greenlet-3.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8424683caf46eb0eb6f626cb95e008e8cc30d0cb675bdfa48200925c79b38a08"}, + {file = "greenlet-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:a0a53fb071531d003b075c444014ff8f8b1a9898d36bb88abd9ac7b3524648a2"}, + {file = "greenlet-3.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:f38b81880ba28f232f1f675893a39cf7b6db25b31cc0a09bb50787ecf957e85e"}, + {file = "greenlet-3.4.0-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:43748988b097f9c6f09364f260741aa73c80747f63389824435c7a50bfdfd5c1"}, + {file = "greenlet-3.4.0-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5566e4e2cd7a880e8c27618e3eab20f3494452d12fd5129edef7b2f7aa9a36d1"}, + {file = "greenlet-3.4.0-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1054c5a3c78e2ab599d452f23f7adafef55062a783a8e241d24f3b633ba6ff82"}, + {file = "greenlet-3.4.0-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:98eedd1803353daf1cd9ef23eef23eda5a4d22f99b1f998d273a8b78b70dd47f"}, + {file = "greenlet-3.4.0-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f82cb6cddc27dd81c96b1506f4aa7def15070c3b2a67d4e46fd19016aacce6cf"}, + {file = "greenlet-3.4.0-cp313-cp313-manylinux_2_39_riscv64.whl", hash = "sha256:b7857e2202aae67bc5725e0c1f6403c20a8ff46094ece015e7d474f5f7020b55"}, + {file = "greenlet-3.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:227a46251ecba4ff46ae742bc5ce95c91d5aceb4b02f885487aff269c127a729"}, + {file = "greenlet-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5b99e87be7eba788dd5b75ba1cde5639edffdec5f91fe0d734a249535ec3408c"}, + {file = "greenlet-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:849f8bc17acd6295fcb5de8e46d55cc0e52381c56eaf50a2afd258e97bc65940"}, + {file = "greenlet-3.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:9390ad88b652b1903814eaabd629ca184db15e0eeb6fe8a390bbf8b9106ae15a"}, + {file = "greenlet-3.4.0-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:10a07aca6babdd18c16a3f4f8880acfffc2b88dfe431ad6aa5f5740759d7d75e"}, + {file = "greenlet-3.4.0-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:076e21040b3a917d3ce4ad68fb5c3c6b32f1405616c4a57aa83120979649bd3d"}, + {file = "greenlet-3.4.0-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e82689eea4a237e530bb5cb41b180ef81fa2160e1f89422a67be7d90da67f615"}, + {file = "greenlet-3.4.0-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:06c2d3b89e0c62ba50bd7adf491b14f39da9e7e701647cb7b9ff4c99bee04b19"}, + {file = "greenlet-3.4.0-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4df3b0b2289ec686d3c821a5fee44259c05cfe824dd5e6e12c8e5f5df23085cf"}, + {file = "greenlet-3.4.0-cp314-cp314-manylinux_2_39_riscv64.whl", hash = "sha256:070b8bac2ff3b4d9e0ff36a0d19e42103331d9737e8504747cd1e659f76297bd"}, + {file = 
"greenlet-3.4.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:8bff29d586ea415688f4cec96a591fcc3bf762d046a796cdadc1fdb6e7f2d5bf"}, + {file = "greenlet-3.4.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:8a569c2fb840c53c13a2b8967c63621fafbd1a0e015b9c82f408c33d626a2fda"}, + {file = "greenlet-3.4.0-cp314-cp314-win_amd64.whl", hash = "sha256:207ba5b97ea8b0b60eb43ffcacf26969dd83726095161d676aac03ff913ee50d"}, + {file = "greenlet-3.4.0-cp314-cp314-win_arm64.whl", hash = "sha256:f8296d4e2b92af34ebde81085a01690f26a51eb9ac09a0fcadb331eb36dbc802"}, + {file = "greenlet-3.4.0-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:d70012e51df2dbbccfaf63a40aaf9b40c8bed37c3e3a38751c926301ce538ece"}, + {file = "greenlet-3.4.0-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a58bec0751f43068cd40cff31bb3ca02ad6000b3a51ca81367af4eb5abc480c8"}, + {file = "greenlet-3.4.0-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:05fa0803561028f4b2e3b490ee41216a842eaee11aed004cc343a996d9523aa2"}, + {file = "greenlet-3.4.0-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c4cd56a9eb7a6444edbc19062f7b6fbc8f287c663b946e3171d899693b1c19fa"}, + {file = "greenlet-3.4.0-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e60d38719cb80b3ab5e85f9f1aed4960acfde09868af6762ccb27b260d68f4ed"}, + {file = "greenlet-3.4.0-cp314-cp314t-manylinux_2_39_riscv64.whl", hash = "sha256:1f85f204c4d54134ae850d401fa435c89cd667d5ce9dc567571776b45941af72"}, + {file = "greenlet-3.4.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7f50c804733b43eded05ae694691c9aa68bca7d0a867d67d4a3f514742a2d53f"}, + {file = "greenlet-3.4.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:2d4f0635dc4aa638cda4b2f5a07ae9a2cff9280327b581a3fcb6f317b4fbc38a"}, + {file = "greenlet-3.4.0-cp314-cp314t-win_amd64.whl", hash = "sha256:1a4a48f24681300c640f143ba7c404270e1ebbbcf34331d7104a4ff40f8ea705"}, + {file = 
"greenlet-3.4.0.tar.gz", hash = "sha256:f50a96b64dafd6169e595a5c56c9146ef80333e67d4476a65a9c55f400fc22ff"}, +] + +[package.extras] +docs = ["Sphinx", "furo"] +test = ["objgraph", "psutil", "setuptools"] + [[package]] name = "h11" version = "0.16.0" @@ -447,6 +520,28 @@ files = [ {file = "platformdirs-4.9.6.tar.gz", hash = "sha256:3bfa75b0ad0db84096ae777218481852c0ebc6c727b3168c1b9e0118e458cf0a"}, ] +[[package]] +name = "playwright" +version = "1.58.0" +description = "A high-level API to automate web browsers" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "playwright-1.58.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:96e3204aac292ee639edbfdef6298b4be2ea0a55a16b7068df91adac077cc606"}, + {file = "playwright-1.58.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:70c763694739d28df71ed578b9c8202bb83e8fe8fb9268c04dd13afe36301f71"}, + {file = "playwright-1.58.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:185e0132578733d02802dfddfbbc35f42be23a45ff49ccae5081f25952238117"}, + {file = "playwright-1.58.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:c95568ba1eda83812598c1dc9be60b4406dffd60b149bc1536180ad108723d6b"}, + {file = "playwright-1.58.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f9999948f1ab541d98812de25e3a8c410776aa516d948807140aff797b4bffa"}, + {file = "playwright-1.58.0-py3-none-win32.whl", hash = "sha256:1e03be090e75a0fabbdaeab65ce17c308c425d879fa48bb1d7986f96bfad0b99"}, + {file = "playwright-1.58.0-py3-none-win_amd64.whl", hash = "sha256:a2bf639d0ce33b3ba38de777e08697b0d8f3dc07ab6802e4ac53fb65e3907af8"}, + {file = "playwright-1.58.0-py3-none-win_arm64.whl", hash = "sha256:32ffe5c303901a13a0ecab91d1c3f74baf73b84f4bedbb6b935f5bc11cc98e1b"}, +] + +[package.dependencies] +greenlet = ">=3.1.1,<4.0.0" +pyee = ">=13,<14" + [[package]] name = "pluggy" version = "1.6.0" @@ -463,6 +558,24 @@ files = [ dev = ["pre-commit", "tox"] testing = ["coverage", "pytest", "pytest-benchmark"] 
+[[package]] +name = "pyee" +version = "13.0.1" +description = "A rough port of Node.js's EventEmitter to Python with a few tricks of its own" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "pyee-13.0.1-py3-none-any.whl", hash = "sha256:af2f8fede4171ef667dfded53f96e2ed0d6e6bd7ee3bb46437f77e3b57689228"}, + {file = "pyee-13.0.1.tar.gz", hash = "sha256:0b931f7c14535667ed4c7e0d531716368715e860b988770fc7eb8578d1f67fc8"}, +] + +[package.dependencies] +typing-extensions = "*" + +[package.extras] +dev = ["black", "build", "flake8", "flake8-black", "isort", "jupyter-console", "mkdocs", "mkdocs-include-markdown-plugin", "mkdocstrings[python]", "mypy", "pytest", "pytest-asyncio ; python_version >= \"3.4\"", "pytest-trio ; python_version >= \"3.7\"", "sphinx", "toml", "tox", "trio", "trio ; python_version > \"3.6\"", "trio-typing ; python_version > \"3.6\"", "twine", "twisted", "validate-pyproject[all]"] + [[package]] name = "pygments" version = "2.20.0" @@ -705,4 +818,4 @@ platformdirs = ">=3.5.1" [metadata] lock-version = "2.1" python-versions = ">=3.11,<3.13" -content-hash = "dcc5b4eadd0a8df900e74674acf33215091dcb9bd0fffcefb03607dde2408a16" +content-hash = "b3896b2258a425cce9498be9ada5bd48a06d5f2bd7c53ead044ad27c53086bd7" diff --git a/pyproject.toml b/pyproject.toml index 680f5ee..e5860d5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,10 @@ python-dateutil = "^2.9" typer = "^0.12" click = "<8.2" # typer 0.12 uses make_metavar() without ctx; click 8.2 made ctx required aiomysql = "^0.3.2" +# Fidelity UK PlanViewer has no public API — we use Playwright only to keep a +# long-lived session alive (storage_state + device-trust cookie); actual data +# is fetched via httpx against the SPA's private JSON backend. 
+playwright = "^1.47" [tool.poetry.group.dev.dependencies] pytest = "^8.3" diff --git a/tests/providers/test_fidelity_planviewer.py b/tests/providers/test_fidelity_planviewer.py new file mode 100644 index 0000000..838d2b8 --- /dev/null +++ b/tests/providers/test_fidelity_planviewer.py @@ -0,0 +1,42 @@ +from __future__ import annotations + +import pytest + +from broker_sync.models import Account, AccountType +from broker_sync.providers.fidelity_planviewer import ( + ACCOUNT_ID, + FidelityCreds, + FidelityPlanViewerProvider, + FidelityProviderConfigError, +) + + +def test_accounts_exposes_single_workplace_pension_account() -> None: + prov = FidelityPlanViewerProvider(FidelityCreds( + storage_state_path="/tmp/x", plan_id="ABC123", + )) + accounts = prov.accounts() + assert accounts == [ + Account( + id=ACCOUNT_ID, + name="Fidelity UK Pension", + account_type=AccountType.WORKPLACE_PENSION, + currency="GBP", + provider="fidelity-planviewer", + ), + ] + + +async def test_fetch_raises_until_endpoints_captured() -> None: + """Until Viktor pastes the transactions/holdings cURLs, fetch() must fail + loudly rather than silently importing nothing. + + Swap this test for real parser tests once the API shapes are known and + `FidelityPlanViewerProvider.fetch` is wired up against fixtures. 
+ """ + prov = FidelityPlanViewerProvider(FidelityCreds( + storage_state_path="/tmp/x", plan_id="ABC123", + )) + with pytest.raises(FidelityProviderConfigError, match="endpoint paths"): + async for _ in prov.fetch(): + pytest.fail("fetch should not yield before endpoints are configured") From 804e6a89de3efc906d7d58cabbd7095326131507 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 18 Apr 2026 18:47:38 +0000 Subject: [PATCH 12/17] fidelity-planviewer: wire provider to real PlanViewer session + JSON API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Context Prior commit 832732a scaffolded the provider with a stub fetch() that raised FidelityProviderConfigError. This commit replaces the stub with the end-to-end ingest flow, validated against the real PlanViewer site during a live login session on 2026-04-18. Fidelity UK PlanViewer mixes a legacy Struts2 HTML app (www.planviewer.fidelity.co.uk) with a React SPA at pv.planviewer.fidelity.co.uk. Authentication is PingFederate OAuth2 at id.fidelity.co.uk — password + memorable word + SMS OTP, with a remember-device cookie that keeps the session alive for weeks. The transaction history is server-rendered HTML at DisplayMyPlanMemberTransHist.action; current fund holdings come from the DisplayValuation.action JSON XHR. Both live behind the same cookie jar, so one Playwright session (seeded interactively once, kept alive via storage_state) can scrape both. ## This change - broker_sync/providers/parsers/fidelity.py (NEW) - parse_transactions_html: extracts cash-impacting rows from the #myplan_member_transhist_support table, skips Bulk Switches (no cash movement), emits FidelityCashTx with deterministic external_id for dedup. - parse_valuation_json: lifts fund code + name + units + price + contribution-type breakdown from the JSON payload. 
- broker_sync/providers/fidelity_planviewer.py (REWRITTEN) - FidelityPlanViewerProvider.fetch() now loads storage_state, boots headless Chromium, navigates landing → main page (to hydrate the SPA session + capture DisplayValuation XHR) → transactions page with a wide 01 Jan 1990 → today window. Raises FidelitySessionError if PlanViewer shows the 15-min idle page or redirects back to id.fidelity.co.uk. - _gains_offset_activity emits a synthetic DEPOSIT/WITHDRAWAL with a date-keyed external_id so WF Net Worth reconciles to the Fidelity-reported pot value without stacking duplicates across monthly runs. - Rolls storage_state back to disk after each run, extending session TTL. - tests/providers/test_fidelity_planviewer.py (EXTENDED) - 8 tests against a real captured fixture: account shape, guard on missing storage_state, full-fixture round-trip (51 txs summing to £102,004.15), Bulk Switch filtered, deterministic external_id, valuation parse with fund-code resolution, gains-offset direction + skip-when-empty. - tests/fixtures/fidelity/transactions-full.html + valuation.json (NEW) - Sanitised captures from the 2026-04-18 live session. ## What is NOT in this change - CronJob + Vault secret wiring + Prometheus alert in infra/stacks/broker-sync/main.tf — next commit. - Dockerfile Chromium install — next commit. - The scrape-and-import was already done manually (51 activities + 1 gains offset imported into WF account a7d6208d); this commit productionises the code path so the monthly cron can do the same. ## Verification ### Automated $ poetry run pytest tests/providers/test_fidelity_planviewer.py -v 8 passed in 0.88s $ poetry run pytest -q 128 passed, 1 skipped in 1.41s $ poetry run mypy broker_sync/providers/fidelity_planviewer.py broker_sync/providers/parsers/fidelity.py Success: no issues found in 2 source files $ poetry run ruff check broker_sync/providers/fidelity_planviewer.py broker_sync/providers/parsers/fidelity.py All checks passed! 
### Manual verification (2026-04-18 live run) 1. poetry run broker-sync fidelity-seed (headed browser + SMS OTP) — captured storage_state, staged to Vault. 2. Inline import script hit the same code paths the provider now runs; 52 activities imported into a new WF WORKPLACE_PENSION account, WF Net Worth jumped from £865,358 → £1,003,083. Co-Authored-By: Claude Opus 4.7 (1M context) --- broker_sync/providers/fidelity_planviewer.py | 257 ++- broker_sync/providers/parsers/fidelity.py | 129 ++ .../fixtures/fidelity/transactions-full.html | 1707 +++++++++++++++++ tests/fixtures/fidelity/valuation.json | 2 + tests/providers/test_fidelity_planviewer.py | 102 +- 5 files changed, 2117 insertions(+), 80 deletions(-) create mode 100644 broker_sync/providers/parsers/fidelity.py create mode 100644 tests/fixtures/fidelity/transactions-full.html create mode 100644 tests/fixtures/fidelity/valuation.json diff --git a/broker_sync/providers/fidelity_planviewer.py b/broker_sync/providers/fidelity_planviewer.py index 6031bc2..e201ac8 100644 --- a/broker_sync/providers/fidelity_planviewer.py +++ b/broker_sync/providers/fidelity_planviewer.py @@ -1,90 +1,130 @@ """Fidelity UK PlanViewer provider — workplace pension backfill + monthly sync. -PlanViewer has no public individual-member API; Fidelity International's -developer portal only catalogues B2B scheme/HR endpoints. The SPA (at -``pv.planviewer.fidelity.co.uk``) does call a private JSON backend at -``prd.wiciam.fidelity.co.uk/cvmfe/api/*`` — we reverse-engineer that and feed -it through a Playwright-maintained session. +PlanViewer has no public individual-member API. The SPA (at +``pv.planviewer.fidelity.co.uk``) and the legacy HTML app (at +``www.planviewer.fidelity.co.uk``) share session cookies via PingFederate +OAuth at ``id.fidelity.co.uk``. -## Session lifecycle +We keep a Playwright-maintained session via ``storage_state.json``: 1. 
**One-off seed** (``broker-sync fidelity-seed``): Viktor runs a headed - Chromium, logs in (password + memorable word + MFA), clicks "Remember - device". Playwright dumps the resulting ``storage_state.json`` (cookies + - localStorage) which we stash in Vault. - + Chromium, logs in (password + memorable word + SMS MFA), clicks + "Remember device". The storage_state is persisted to Vault. 2. **Monthly cron**: loads storage_state, boots headless Chromium, navigates - to the SPA once to let it refresh rolling session tokens, intercepts the - first outbound XHR to capture the ``sid``/``fid``/``tbid``/``rid`` headers, - then closes the browser and continues with plain httpx. + to the transaction-history page with a wide date range, parses the HTML + table, and intercepts the ``DisplayValuation`` XHR for the current + fund holdings. On 401/idle-timeout we raise + :class:`FidelitySessionError` so Prometheus alerts Viktor to re-seed. -3. **Re-seed trigger**: on any 401 from the JSON API we raise - :class:`FidelitySessionError`; the CronJob fails loudly and Prometheus - alerts Viktor to run the seed command again. +## Emitted Activity shape -Remember-device typically survives 30-90 days on Fidelity, so we expect the -re-seed to be a quarterly manual step — not monthly. - -## Data model - -Salary-sacrifice scheme with two contribution streams (employee + employer), -both pre-tax. Each contribution buys units across one or more funds. We emit: - -- ``DEPOSIT`` per employee-or-employer cash inflow (external_id carries - ``fidelity::``). -- ``BUY`` per fund-unit purchase (``symbol`` = fund ISIN or Fidelity code, - ``quantity`` = units, ``unit_price`` = GBp or GBP per unit). - -All currency is GBP. The single WF account is ``AccountType.WORKPLACE_PENSION``. +- One ``DEPOSIT`` per cash-impacting transaction (Regular Premium, Single + Premium, rebate, etc.). ``external_id = fidelity:tx:``. 
+- One synthetic ``DEPOSIT`` for unrealised gains so WF's Net Worth matches + the Fidelity dashboard. ``external_id = + fidelity:gains:``. +- Bulk Switches / Fund Switches are skipped (no cash movement). """ from __future__ import annotations +import contextlib import logging from collections.abc import AsyncIterator -from datetime import datetime -from typing import NamedTuple +from datetime import UTC, datetime +from decimal import Decimal +from pathlib import Path +from typing import Any, NamedTuple -from broker_sync.models import Account, AccountType, Activity +from broker_sync.models import Account, AccountType, Activity, ActivityType +from broker_sync.providers.parsers.fidelity import ( + FidelityCashTx, + FidelityHolding, + parse_transactions_html, + parse_valuation_json, +) log = logging.getLogger(__name__) ACCOUNT_ID = "fidelity-workplace-pension" _CCY = "GBP" -# PlanViewer's private JSON backend. Endpoint paths are reverse-engineered from -# Viktor's DevTools cURLs and validated by the unit tests' fixtures. -_API_BASE = "https://prd.wiciam.fidelity.co.uk" +_PV_BASE = "https://www.planviewer.fidelity.co.uk" +_PV_TX_PATH = "/planviewer/DisplayMyPlanMemberTransHist.action" +_PV_VALUATION_PATH = "/planviewer/DisplayValuation.action" +_PV_LANDING = "https://www.planviewer.fidelity.co.uk/" + +# A wide backfill cap; scheme can't predate 1990. +_BACKFILL_START = "01 Jan 1990" class FidelityCreds(NamedTuple): - """Credentials + session state required to hit the PlanViewer backend.""" + """Paths needed to run the provider.""" storage_state_path: str plan_id: str headless: bool = True class FidelitySessionError(Exception): - """Raised when PlanViewer returns 401/403 — storage_state is stale. - - Recovery: run ``broker-sync fidelity-seed`` in a browser to refresh the - storage_state blob in Vault, then re-run the CronJob. 
- """ + """Raised when PlanViewer rejects the saved session — re-seed required.""" class FidelityProviderConfigError(Exception): - """Raised when the provider is asked to run but required config (plan id, - storage_state path) is missing or obviously wrong.""" + """Raised when provider config is missing or obviously wrong.""" + + +def _tx_to_activity(tx: FidelityCashTx) -> Activity: + """Map a Fidelity cash transaction to a canonical DEPOSIT.""" + return Activity( + external_id=tx.external_id, + account_id=ACCOUNT_ID, + account_type=AccountType.WORKPLACE_PENSION, + date=tx.date, + activity_type=ActivityType.DEPOSIT, + currency=_CCY, + amount=tx.amount, + notes=f"fidelity-planviewer:{tx.tx_type}", + ) + + +def _gains_offset_activity( + holdings: list[FidelityHolding], + transactions: list[FidelityCashTx], + as_of: datetime, +) -> Activity | None: + """Create a synthetic DEPOSIT/WITHDRAWAL so WF Net Worth matches the + Fidelity dashboard's reported pot value. + + The offset carries a date-derived external_id so monthly runs refresh + the same synthetic entry rather than stacking duplicates. + """ + if not holdings: + return None + total_value = sum((h.total_value for h in holdings), Decimal(0)) + total_contrib = sum((t.amount for t in transactions), Decimal(0)) + gains = total_value - total_contrib + if gains == 0: + return None + return Activity( + external_id=f"fidelity:gains:{as_of.date().isoformat()}", + account_id=ACCOUNT_ID, + account_type=AccountType.WORKPLACE_PENSION, + date=as_of, + activity_type=ActivityType.DEPOSIT if gains > 0 else ActivityType.WITHDRAWAL, + currency=_CCY, + amount=abs(gains), + notes=(f"fidelity-planviewer:unrealised-gains-offset " + f"(pot=£{total_value}, contrib=£{total_contrib})"), + ) class FidelityPlanViewerProvider: """Read-only provider against Fidelity UK PlanViewer. - Per the Provider protocol consumed by ``broker_sync.pipeline``: - - - ``.accounts()`` advertises the single workplace-pension WF account we - write into. 
- - ``.fetch(since, before)`` is an async generator that yields canonical - ``Activity`` objects. + Lifecycle: + - ``accounts()`` advertises the single WF workplace-pension account. + - ``fetch(since, before)`` opens a Playwright session with the saved + storage_state, navigates to the transaction-history page with a wide + date range, scrapes the table, and intercepts the valuation XHR. """ name = "fidelity-planviewer" @@ -108,21 +148,106 @@ class FidelityPlanViewerProvider: since: datetime | None = None, before: datetime | None = None, ) -> AsyncIterator[Activity]: - """Yield Activity records. + state_path = self._creds.storage_state_path + if not Path(state_path).exists(): + raise FidelityProviderConfigError( + f"storage_state not found at {state_path} — " + "run `broker-sync fidelity-seed` first") - Implementation blocked on captured endpoint shapes. Viktor will paste - the transactions + holdings POST cURLs from DevTools, then we wire the - parsers and this method lights up. - """ - # Guard against accidentally running before endpoint reverse-engineering - # is done — makes the CronJob fail loudly with an actionable message - # rather than silently importing nothing. - raise FidelityProviderConfigError( - "Fidelity ingest not yet enabled — PlanViewer endpoint paths have " - "not been captured. Paste the POST cURLs from DevTools for the " - "transactions + holdings views and re-apply the provider update." + tx_html, valuation_json = await _scrape_live_session( + state_path=state_path, headless=self._creds.headless, ) - # Unreachable yield — keeps the return type AsyncIterator[Activity] - # once the raise above is removed. 
- if False: # pragma: no cover - yield + transactions = parse_transactions_html(tx_html) + holdings = parse_valuation_json(valuation_json) + log.info("fidelity: parsed %d transactions, %d holdings", + len(transactions), len(holdings)) + + for tx in transactions: + if since is not None and tx.date < since: + continue + if before is not None and tx.date >= before: + continue + yield _tx_to_activity(tx) + + # The gains offset is always "as of now" so it reflects today's pot. + # Only emit when the caller isn't windowing (full state). + if since is None and before is None: + offset = _gains_offset_activity(holdings, transactions, datetime.now(UTC)) + if offset is not None: + yield offset + + +async def _scrape_live_session( + *, + state_path: str, + headless: bool, +) -> tuple[str, dict[str, Any]]: + """Load storage_state, navigate the transaction + valuation pages, + return (transactions HTML, valuation JSON payload). + + Raises :class:`FidelitySessionError` if the session is dead (15-min idle, + cookie expiry, etc.) — Viktor must re-seed. + """ + from playwright.async_api import async_playwright + + captured_valuation: dict[str, dict[str, Any]] = {} + async with async_playwright() as pw: + browser = await pw.chromium.launch(headless=headless) + try: + ctx = await browser.new_context( + storage_state=state_path, + user_agent=("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/147.0.0.0 Safari/537.36"), + viewport={"width": 1280, "height": 900}, + ) + page = await ctx.new_page() + + async def on_response(resp: Any) -> None: + if _PV_VALUATION_PATH in resp.url and resp.status < 400: + with contextlib.suppress(Exception): + captured_valuation["payload"] = await resp.json() + page.on("response", on_response) + + # Trigger session + capture valuation by navigating through landing + # → main page. The SPA fires DisplayValuation on the main page. 
+ await page.goto(_PV_LANDING, wait_until="networkidle", timeout=30000) + await page.wait_for_timeout(2000) + main_url = f"{_PV_BASE}/planviewer/DisplayMainPage.action" + await page.goto(main_url, wait_until="networkidle", timeout=30000) + await page.wait_for_timeout(3000) + if "idle for more than 15 minutes" in (await page.content()) \ + or "id.fidelity.co.uk" in page.url: + raise FidelitySessionError( + "PlanViewer session stale — run `broker-sync fidelity-seed`") + + # Now pull the transactions page with a wide date range. + await page.goto(f"{_PV_BASE}{_PV_TX_PATH}", + wait_until="networkidle", timeout=30000) + await page.wait_for_timeout(1500) + await page.fill('input[name="startDate"]', _BACKFILL_START) + today = await page.evaluate( + "new Date().toLocaleDateString('en-GB'," + "{day:'2-digit',month:'short',year:'numeric'}).replace(/,/g,'')") + await page.fill('input[name="endDate"]', today) + await page.focus('input[name="endDate"]') + await page.keyboard.press("Enter") + with contextlib.suppress(Exception): + await page.wait_for_load_state("networkidle", timeout=15000) + await page.wait_for_timeout(2000) + tx_html = await page.content() + + # If valuation wasn't picked up on the main page, request directly. + if "payload" not in captured_valuation: + r = await page.request.get(f"{_PV_BASE}{_PV_VALUATION_PATH}") + if r.ok: + with contextlib.suppress(Exception): + captured_valuation["payload"] = await r.json() + + # Roll the storage_state so the next run benefits from any refresh. + await ctx.storage_state(path=state_path) + finally: + await browser.close() + + valuation: dict[str, Any] = captured_valuation.get("payload") or {} + return tx_html, valuation diff --git a/broker_sync/providers/parsers/fidelity.py b/broker_sync/providers/parsers/fidelity.py new file mode 100644 index 0000000..b53875c --- /dev/null +++ b/broker_sync/providers/parsers/fidelity.py @@ -0,0 +1,129 @@ +"""Parsers for Fidelity UK PlanViewer scraped data. 
+ +Two inputs: + +- **Transactions HTML** from ``/planviewer/DisplayMyPlanMemberTransHist.action`` + rendered with a wide date range. The relevant ``table`` element has + ``id="myplan_member_transhist_support"``. +- **Valuation JSON** from the XHR ``/planviewer/DisplayValuation.action`` — + the SPA calls this to render the my-investments dashboard. Contains + current unit holdings + price + breakdown by contribution type. +""" +from __future__ import annotations + +import hashlib +import re +from dataclasses import dataclass +from datetime import UTC, datetime +from decimal import Decimal +from typing import Any + +from bs4 import BeautifulSoup + +_AMOUNT_RE = re.compile(r"\u00a3([\d,]+(?:\.\d+)?)") + +# Fidelity transaction type strings we care about +_TX_DEPOSIT_TYPES = { + "regular premium", + "single premium", + "investment management rebate", +} +_TX_IGNORE_TYPES = { + "bulk switch", # pure reallocation, no cash impact + "fund switch", +} + + +@dataclass(frozen=True) +class FidelityCashTx: + """A single cash-impacting transaction from the transaction history page.""" + date: datetime + tx_type: str # raw Fidelity label ("Regular Premium", "Single Premium", …) + amount: Decimal + external_id: str + + +@dataclass(frozen=True) +class FidelityHolding: + """A current fund-unit holding from DisplayValuation.action.""" + fund_code: str + fund_name: str + units: Decimal + unit_price: Decimal + currency: str + total_value: Decimal + # Contribution-type breakdown ({"SASC": Decimal(...), "ERXS": Decimal(...)}) + units_by_source: dict[str, Decimal] + + +def parse_transactions_html(html: str) -> list[FidelityCashTx]: + """Extract cash-impacting transactions from the transaction history page. + + Skips bulk switches (no cash movement) and header/total rows. Deterministic + external_id so re-runs dedup against the same rows.
+ """ + soup = BeautifulSoup(html, "html.parser") + out: list[FidelityCashTx] = [] + for tr in soup.select("table#myplan_member_transhist_support tr"): + cells = [td.get_text(" ", strip=True) for td in tr.find_all("td")] + if len(cells) != 7: + continue + date_str, tx_type, _f, _c, _u, _p, amount_str = cells + m_date = re.match(r"(\d{2})/(\d{2})/(\d{4})", date_str) + if not m_date: + continue + tx_lower = tx_type.lower() + if tx_lower in _TX_IGNORE_TYPES or tx_type in ("-",): + continue + m_amt = _AMOUNT_RE.search(amount_str) + if not m_amt: + continue + amount = Decimal(m_amt.group(1).replace(",", "")) + if amount == 0: + continue + dd, mm, yyyy = m_date.groups() + dt = datetime(int(yyyy), int(mm), int(dd), tzinfo=UTC) + fp = hashlib.sha256( + f"{dt.isoformat()}|{tx_type}|{amount}".encode() + ).hexdigest()[:16] + out.append(FidelityCashTx( + date=dt, + tx_type=tx_type, + amount=amount, + external_id=f"fidelity:tx:{fp}", + )) + return out + + +def parse_valuation_json(payload: Any) -> list[FidelityHolding]: + """Extract current fund holdings from DisplayValuation.action JSON.""" + out: list[FidelityHolding] = [] + for v in payload.get("valuations", []): + asset = v.get("asset") or {} + fund_code = next( + (a.get("value") for a in asset.get("assetId", []) if a.get("type") == "FUND_CODE"), + None, + ) + if not fund_code: + continue + fund_name = asset.get("name") or fund_code + units = Decimal(str((v.get("units") or {}).get("total") or 0)) + price = (v.get("price") or {}) + unit_price = Decimal(str(price.get("value") or 0)) + currency = price.get("currency") or "GBP" + total = Decimal(str((v.get("valuation") or {}).get("total") or 0)) + groups = (v.get("units") or {}).get("group", []) or [] + by_src = {} + for g in groups: + if g.get("type") == "CONTRIBUTION_TYPE" and g.get("groupId"): + by_src[g["groupId"]] = Decimal(str(g.get("unit", {}).get("total") or 0)) + out.append(FidelityHolding( + fund_code=fund_code, + fund_name=fund_name, + units=units, + 
unit_price=unit_price, + currency=currency, + total_value=total, + units_by_source=by_src, + )) + return out diff --git a/tests/fixtures/fidelity/transactions-full.html b/tests/fixtures/fidelity/transactions-full.html new file mode 100644 index 0000000..1b71f80 --- /dev/null +++ b/tests/fixtures/fidelity/transactions-full.html @@ -0,0 +1,1707 @@ + + + + + Fidelity's PlanViewer + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Fidelity uses cookies to provide you with the best possible online experience. If you continue without changing your settings, we'll assume that you are happy to receive all cookies on our site. However, you can change the cookie settings and view our cookie policy at any time.

+ + + + + + + + + + +
+ +
+ + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + +
+ + +
+ + + + + Contact us + | + Help + + + + + + + Log out + + + + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + +
Meta UK Retirement Plan
+
CIMP
+ + + + + + + + + + + + + + +

+ Transaction history +

+ +

+ Recent transactions are shown by default but you can refine the date range using the filters. PlanViewer uses the most recent data prior to the date requested. +

+ + + + + +
+ + + + + + + + + + + + + + + + +
+
View by: +
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Date Transaction Type Funds Contribution Types Units/shares Price Transaction Amount
-454--£102,004.15
+ Transactions by fund + +  |  + + Transactions by contribution type +
Open transaction details for the member 16/04/2026Regular Premium 1 2 £1,546.02
Open transaction details for the member 16/03/2026Regular Premium 1 2 £1,500.50
Open transaction details for the member 16/02/2026Regular Premium 1 2 £1,500.50
Open transaction details for the member 16/01/2026Regular Premium 1 2 £1,500.50
Open transaction details for the member 16/12/2025Regular Premium 1 2 £1,500.50
Open transaction details for the member 17/11/2025Regular Premium 1 2 £1,500.50
Open transaction details for the member 16/10/2025Regular Premium 1 2 £1,500.50
Open transaction details for the member 16/09/2025Regular Premium 1 2 £1,500.50
Open transaction details for the member 18/08/2025Regular Premium 1 2 £1,500.50
Open transaction details for the member 15/07/2025Regular Premium 1 2 £1,500.50
Open transaction details for the member 16/06/2025Regular Premium 1 2 £1,500.50
Open transaction details for the member 16/05/2025Regular Premium 1 2 £1,500.50
Open transaction details for the member 11/04/2025Regular Premium 1 2 £1,500.50
Open transaction details for the member 11/04/2025Single Premium 1 1 £26,969.00
Open transaction details for the member 17/03/2025Regular Premium 1 2 £1,448.52
Open transaction details for the member 17/02/2025Regular Premium 1 2 £1,448.52
Open transaction details for the member 16/01/2025Regular Premium 1 2 £1,448.52
Open transaction details for the member 16/12/2024Regular Premium 1 2 £1,448.52
Open transaction details for the member 15/11/2024Regular Premium 1 2 £1,448.52
Open transaction details for the member 05/11/2024Bulk Switch 2 3 £0.00
Open transaction details for the member 15/10/2024Regular Premium 1 2 £1,448.52
Open transaction details for the member 13/09/2024Regular Premium 1 2 £1,448.52
Open transaction details for the member 16/08/2024Regular Premium 1 2 £1,448.52
Open transaction details for the member 12/07/2024Regular Premium 1 2 £1,448.52
Open transaction details for the member 14/06/2024Regular Premium 1 2 £1,448.52
Open transaction details for the member 16/05/2024Regular Premium 1 2 £1,448.52
Open transaction details for the member 16/04/2024Regular Premium 1 2 £1,448.52
Open transaction details for the member 18/03/2024Regular Premium 1 2 £1,387.50
Open transaction details for the member 16/02/2024Regular Premium 1 2 £1,387.50
Open transaction details for the member 16/01/2024Regular Premium 1 2 £1,387.50
Open transaction details for the member 28/12/2023Regular Premium 1 2 £1,387.50
Open transaction details for the member 17/11/2023Regular Premium 1 2 £1,387.50
Open transaction details for the member 16/10/2023Regular Premium 1 2 £1,387.50
Open transaction details for the member 11/10/2023Bulk Switch 2 2 £0.00
Open transaction details for the member 15/09/2023Regular Premium 1 2 £1,387.50
Open transaction details for the member 16/08/2023Regular Premium 1 2 £1,387.50
Open transaction details for the member 17/07/2023Regular Premium 1 2 £1,387.50
Open transaction details for the member 14/06/2023Regular Premium 1 2 £1,387.50
Open transaction details for the member 17/05/2023Regular Premium 1 2 £1,387.50
Open transaction details for the member 17/04/2023Regular Premium 1 2 £1,387.50
Open transaction details for the member 15/03/2023Regular Premium 1 2 £1,347.50
Open transaction details for the member 20/02/2023Regular Premium 1 2 £1,347.50
Open transaction details for the member 17/01/2023Regular Premium 1 2 £1,347.50
Open transaction details for the member 13/12/2022Regular Premium 1 2 £1,347.50
Open transaction details for the member 17/11/2022Regular Premium 1 2 £1,347.50
Open transaction details for the member 17/10/2022Regular Premium 1 2 £1,347.50
Open transaction details for the member 20/09/2022Regular Premium 1 2 £1,099.60
Open transaction details for the member 22/08/2022Regular Premium 1 2 £1,099.60
Open transaction details for the member 19/07/2022Regular Premium 1 2 £1,099.60
Open transaction details for the member 15/07/2022Investment Management Rebate 1 1 £6.68
Open transaction details for the member 20/06/2022Regular Premium 1 2 £1,099.60
Open transaction details for the member 17/06/2022Single Premium 1 1 £8,301.05
Open transaction details for the member 16/05/2022Regular Premium 2 2 £659.76
+ +
+Fidelity International +18 Apr 2026 +
+ + + + + + +

+ *Any exchange rates used to show account values in different currencies are indicative only and updated daily. +

+ + +
+ Want to change your + contributions? +

+ Depending on the rules of your retirement plan, you may have the option to make extra payments into your plan savings. If you decide to increase your monthly contributions, you may even find that your employer will increase their contributions too. +

+ + + + + + + +
+

+ + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+

+ + Issued in the UK by FIL Pensions Management (FPM) authorised and regulated by the Financial Conduct Authority, FIL Life Insurance Limited (FIL Life) authorised by the Prudential Regulation Authority and regulated by the Financial Conduct Authority and the Prudential Regulation Authority and in Ireland by FIL Life Insurance (Ireland) Limited (FIL Life Ireland), authorised and regulated by the Central Bank of Ireland. + +

+
+
+ + +
+
+ © FIL Pensions Management +
+ + + Important legal information + + + + | + + + + Terms and conditions + + | + + + + + + + + Cookie policy + + + + + + + | + + Contact us + + | + + Online security + + + + + + + + +
+
+
+ + + + + + + + + + + + + +
\ No newline at end of file diff --git a/tests/fixtures/fidelity/valuation.json b/tests/fixtures/fidelity/valuation.json new file mode 100644 index 0000000..5ad66e3 --- /dev/null +++ b/tests/fixtures/fidelity/valuation.json @@ -0,0 +1,2 @@ +{"valuations":[{"asset":{"assetId":[{"type":"FUND_CODE","value":"KDOA"}],"name":"Passive Global Equity Fund - Class 9"},"units":{"total":44920.21,"available":null,"crystallised":null,"uncrystallised":null,"group":[{"groupId":"BONW","type":"CONTRIBUTION_TYPE","name":"Bonus Waiver","unit":{"total":11490.84,"available":null,"crystallised":null,"uncrystallised":null}},{"groupId":"ERXS","type":"CONTRIBUTION_TYPE","name":"Company","unit":{"total":17148.27,"available":null,"crystallised":null,"uncrystallised":null}},{"groupId":"SASC","type":"CONTRIBUTION_TYPE","name":"Salary Sacrifice","unit":{"total":11432.20,"available":null,"crystallised":null,"uncrystallised":null}},{"groupId":"TREX","type":"CONTRIBUTION_TYPE","name":"Transfer In","unit":{"total":4848.90,"available":null,"crystallised":null,"uncrystallised":null}}]},"price":{"value":3.066,"datetime":"2026-04-17","currency":"GBP"},"valuation":{"total":137725.35,"available":null,"crystallised":null,"uncrystallised":null,"group":[{"groupId":"BONW","type":"CONTRIBUTION_TYPE","name":"Bonus Waiver","valuation":{"total":35230.91,"available":null,"crystallised":null,"uncrystallised":null}},{"groupId":"ERXS","type":"CONTRIBUTION_TYPE","name":"Company","valuation":{"total":52576.60,"available":null,"crystallised":null,"uncrystallised":null}},{"groupId":"SASC","type":"CONTRIBUTION_TYPE","name":"Salary Sacrifice","valuation":{"total":35051.12,"available":null,"crystallised":null,"uncrystallised":null}},{"groupId":"TREX","type":"CONTRIBUTION_TYPE","name":"Transfer In","valuation":{"total":14866.72,"available":null,"crystallised":null,"uncrystallised":null}}],"valuationType":"Value"},"currency":"GBP"},{"asset":{"assetId":[{"type":"FUND_CODE","value":"KCVT"}],"name":"FutureWise Target 2065 - 
Class 10"},"units":{"total":230.02,"available":null,"crystallised":null,"uncrystallised":null,"group":[{"groupId":"ERXS","type":"CONTRIBUTION_TYPE","name":"Company","unit":{"total":153.35,"available":null,"crystallised":null,"uncrystallised":null}},{"groupId":"SASC","type":"CONTRIBUTION_TYPE","name":"Salary Sacrifice","unit":{"total":76.67,"available":null,"crystallised":null,"uncrystallised":null}}]},"price":{"value":3.254,"datetime":"2026-04-17","currency":"GBP"},"valuation":{"total":748.48,"available":null,"crystallised":null,"uncrystallised":null,"group":[{"groupId":"ERXS","type":"CONTRIBUTION_TYPE","name":"Company","valuation":{"total":498.99,"available":null,"crystallised":null,"uncrystallised":null}},{"groupId":"SASC","type":"CONTRIBUTION_TYPE","name":"Salary Sacrifice","valuation":{"total":249.49,"available":null,"crystallised":null,"uncrystallised":null}}],"valuationType":"Value"},"currency":"GBP"},{"asset":{"assetId":[{"type":"FUND_CODE","value":"LAFC"}],"name":"Volatility Managed Multi Asset Fund"},"units":{"total":106.64,"available":null,"crystallised":null,"uncrystallised":null,"group":[{"groupId":"ERXS","type":"CONTRIBUTION_TYPE","name":"Company","unit":{"total":71.09,"available":null,"crystallised":null,"uncrystallised":null}},{"groupId":"SASC","type":"CONTRIBUTION_TYPE","name":"Salary Sacrifice","unit":{"total":35.55,"available":null,"crystallised":null,"uncrystallised":null}}]},"price":{"value":252.9000,"datetime":"2026-04-17","currency":"GBP"},"valuation":{"total":269.70,"available":null,"crystallised":null,"uncrystallised":null,"group":[{"groupId":"ERXS","type":"CONTRIBUTION_TYPE","name":"Company","valuation":{"total":179.80,"available":null,"crystallised":null,"uncrystallised":null}},{"groupId":"SASC","type":"CONTRIBUTION_TYPE","name":"Salary 
Sacrifice","valuation":{"total":89.90,"available":null,"crystallised":null,"uncrystallised":null}}],"valuationType":"Value"},"currency":"GBP"}],"valuationSum":{"total":138743.53,"available":0.0,"crystallised":null,"uncrystallised":null,"currency":"GBP"},"asOfDateTime":"2026-04-17T12:00:00+01:00"} + diff --git a/tests/providers/test_fidelity_planviewer.py b/tests/providers/test_fidelity_planviewer.py index 838d2b8..fe4feca 100644 --- a/tests/providers/test_fidelity_planviewer.py +++ b/tests/providers/test_fidelity_planviewer.py @@ -1,22 +1,33 @@ from __future__ import annotations +import json +from datetime import UTC, datetime +from decimal import Decimal +from pathlib import Path + import pytest -from broker_sync.models import Account, AccountType +from broker_sync.models import Account, AccountType, ActivityType from broker_sync.providers.fidelity_planviewer import ( ACCOUNT_ID, FidelityCreds, FidelityPlanViewerProvider, FidelityProviderConfigError, + _gains_offset_activity, ) +from broker_sync.providers.parsers.fidelity import ( + parse_transactions_html, + parse_valuation_json, +) + +_FIXTURES = Path(__file__).parent.parent / "fixtures" / "fidelity" def test_accounts_exposes_single_workplace_pension_account() -> None: prov = FidelityPlanViewerProvider(FidelityCreds( - storage_state_path="/tmp/x", plan_id="ABC123", + storage_state_path="/tmp/x", plan_id="META", )) - accounts = prov.accounts() - assert accounts == [ + assert prov.accounts() == [ Account( id=ACCOUNT_ID, name="Fidelity UK Pension", @@ -27,16 +38,79 @@ def test_accounts_exposes_single_workplace_pension_account() -> None: ] -async def test_fetch_raises_until_endpoints_captured() -> None: - """Until Viktor pastes the transactions/holdings cURLs, fetch() must fail - loudly rather than silently importing nothing. - - Swap this test for real parser tests once the API shapes are known and - `FidelityPlanViewerProvider.fetch` is wired up against fixtures. 
- """ +async def test_fetch_raises_without_storage_state() -> None: prov = FidelityPlanViewerProvider(FidelityCreds( - storage_state_path="/tmp/x", plan_id="ABC123", + storage_state_path="/tmp/does-not-exist-xyzzy.json", plan_id="META", )) - with pytest.raises(FidelityProviderConfigError, match="endpoint paths"): + with pytest.raises(FidelityProviderConfigError, match="storage_state"): async for _ in prov.fetch(): - pytest.fail("fetch should not yield before endpoints are configured") + pytest.fail("should have raised before yielding") + + +# -- parser tests against real (captured) fixture -- + + +def test_parse_transactions_real_fixture() -> None: + html = (_FIXTURES / "transactions-full.html").read_text() + txs = parse_transactions_html(html) + # Scheme has ~48 months + a couple of single premiums + 1 rebate; + # Bulk Switches must be filtered out (zero-amount rows). + assert 40 <= len(txs) <= 100 + # All dates are within the scheme's lifetime (2022-03 to today-ish). + assert all(tx.date >= datetime(2022, 1, 1, tzinfo=UTC) for tx in txs) + # Sum should match the header total on the page (£102,004.15 at + # fixture time). Allow a £5 tolerance in case the page summary row + # changes in future captures — the unit test primarily guards parsing + # correctness, not drift in the fixture. 
+ total = sum((tx.amount for tx in txs), Decimal(0)) + assert abs(total - Decimal("102004.15")) < Decimal("5") + + +def test_parse_transactions_skips_bulk_switch() -> None: + html = (_FIXTURES / "transactions-full.html").read_text() + txs = parse_transactions_html(html) + assert not any("bulk switch" in tx.tx_type.lower() for tx in txs) + + +def test_parse_transactions_external_id_deterministic() -> None: + html = (_FIXTURES / "transactions-full.html").read_text() + a = parse_transactions_html(html) + b = parse_transactions_html(html) + assert [tx.external_id for tx in a] == [tx.external_id for tx in b] + assert all(tx.external_id.startswith("fidelity:tx:") for tx in a) + + +def test_parse_valuation_fixture() -> None: + payload = json.loads((_FIXTURES / "valuation.json").read_text()) + holdings = parse_valuation_json(payload) + assert len(holdings) >= 1 + h = holdings[0] + assert h.fund_code == "KDOA" + assert "Passive Global Equity" in h.fund_name + assert h.currency == "GBP" + assert h.units > 0 + assert h.unit_price > 0 + # Value ≈ units * price + assert abs(h.total_value - h.units * h.unit_price) < Decimal("1") + # Contribution-type breakdown must parse + assert set(h.units_by_source.keys()) >= {"SASC", "ERXS"} + + +def test_gains_offset_emits_deposit_when_pot_exceeds_contributions() -> None: + html = (_FIXTURES / "transactions-full.html").read_text() + valuation = json.loads((_FIXTURES / "valuation.json").read_text()) + txs = parse_transactions_html(html) + holdings = parse_valuation_json(valuation) + as_of = datetime(2026, 4, 18, tzinfo=UTC) + offset = _gains_offset_activity(holdings, txs, as_of) + assert offset is not None + assert offset.activity_type in (ActivityType.DEPOSIT, ActivityType.WITHDRAWAL) + assert offset.amount > 0 + assert offset.external_id == "fidelity:gains:2026-04-18" + + +def test_gains_offset_none_when_no_holdings() -> None: + assert _gains_offset_activity( + holdings=[], transactions=[], + as_of=datetime(2026, 4, 18, tzinfo=UTC), + ) is 
None From 7c9be544dcb46957933e805c1a7b1fe9e56db129 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 18 Apr 2026 18:50:54 +0000 Subject: [PATCH 13/17] fidelity-planviewer: bake Chromium into the image for headless Playwright MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Context The Fidelity provider (commit 804e6a8) drives headless Chromium via Playwright to refresh the PlanViewer session cookie jar and scrape the Struts2 transaction history page. The image needs both the Chromium runtime and the Debian system libs Chromium dynamic-links against. ## This change - Adds Playwright's documented Debian 12 dependency set (fonts-liberation, libnss3, libxkbcommon0, xvfb, etc.). - Creates /app/.playwright-browsers owned by the broker user so the non-root process can write the Chromium install, and runs `playwright install chromium` as that user so the browser lands in the right cache path (PLAYWRIGHT_BROWSERS_PATH=/app/.playwright-browsers). - Image size will grow by ~300MB (Chromium headless shell is ~110MB compressed, plus libs). Acceptable — broker-sync runs once a day so pull cost is a one-shot. ## What is NOT in this change - Terraform CronJob / monitoring — separate commit in the infra repo. ## Verification $ docker build -t broker-sync:test . → (will run in CI) $ docker run --rm broker-sync:test fidelity-seed --help → shows the CLI help (can't actually run fidelity-seed headlessly). $ poetry run pytest -q (local) → 128 passed, 1 skipped. Reproduce locally: 1. docker build -t broker-sync:fidelity-test . 2. docker run --rm -v $PWD/tests/fixtures/fidelity:/data broker-sync:fidelity-test \ python -c "from playwright.sync_api import sync_playwright; \ with sync_playwright() as p: b = p.chromium.launch(); b.close(); print('ok')" 3. Expected: "ok" — Chromium launches successfully. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Dockerfile | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index a6c526c..35224ef 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,14 +20,56 @@ FROM python:3.12-slim WORKDIR /app +# Playwright needs a big list of system libs for Chromium (fonts, NSS, libs +# for rendering, audio stubs, etc.). Mirror the list Playwright publishes at +# https://playwright.dev/docs/browsers#system-requirements for Debian 12. +# Fidelity PlanViewer is the only consumer today; gated to the fidelity-* +# CronJobs via the provider's explicit Playwright import. +RUN apt-get update && apt-get install --no-install-recommends -y \ + ca-certificates \ + fonts-liberation \ + fonts-noto-color-emoji \ + libasound2 \ + libatk-bridge2.0-0 \ + libatk1.0-0 \ + libatspi2.0-0 \ + libcairo2 \ + libcups2 \ + libdbus-1-3 \ + libdrm2 \ + libexpat1 \ + libgbm1 \ + libglib2.0-0 \ + libnspr4 \ + libnss3 \ + libpango-1.0-0 \ + libx11-6 \ + libxcb1 \ + libxcomposite1 \ + libxdamage1 \ + libxext6 \ + libxfixes3 \ + libxkbcommon0 \ + libxrandr2 \ + xvfb \ + && rm -rf /var/lib/apt/lists/* + RUN useradd --system --uid 10001 --home /app --shell /usr/sbin/nologin broker && \ mkdir -p /data && chown -R broker:broker /data COPY --from=builder --chown=broker:broker /app /app +# Install Chromium into broker's cache so Playwright (running as broker) +# can pick it up. `PLAYWRIGHT_BROWSERS_PATH` points at a broker-owned +# directory under /app so the non-root user can write the install there.
ENV PATH="/app/.venv/bin:${PATH}" \ - PYTHONUNBUFFERED=1 + PYTHONUNBUFFERED=1 \ + PLAYWRIGHT_BROWSERS_PATH=/app/.playwright-browsers +RUN mkdir -p "${PLAYWRIGHT_BROWSERS_PATH}" && \ + chown -R broker:broker "${PLAYWRIGHT_BROWSERS_PATH}" USER broker +RUN playwright install chromium + ENTRYPOINT ["broker-sync"] CMD ["version"] From 6450201af0b155e1b4ba97a8877a06d13d6d39ef Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 18 Apr 2026 19:12:49 +0000 Subject: [PATCH 14/17] pipeline: emit matching DEPOSIT/WITHDRAWAL for every BUY/SELL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Context The 2026-04-18 reconciliation ended with Wealthfolio's historical Net Worth chart showing cliff-jumps on 5 dates — the single-day lump cash offsets we'd posted to "zero out" phantom cash. An operational fix replaced those 6 lumps with 231 per-BUY/SELL matched DEPOSIT/WITHDRAWAL rows (see code-r9n note). That made the chart smooth — but only for today's data. Any future broker-sync run would re-introduce phantom cash because providers emit BUY/SELL only; nothing on the cash side. This commit bakes the match into the pipeline so **future syncs self-balance cash at import time** and the chart stays smooth. ## This change - broker_sync/pipeline.py - New _matched_cash_flow(a): returns a DEPOSIT for a BUY (amount = qty * unit_price + fee) or a WITHDRAWAL for a SELL (amount = qty * unit_price - fee). Returns None for every other activity type — DEPOSIT/WITHDRAWAL/DIVIDEND/etc. already touch cash directly. The synthetic activity carries a deterministic external_id `cash-flow-match:{buy|sell}:{original external_id}` so SyncRecordStore dedup handles idempotency across runs. - New _with_cash_flow_match(a): expand helper — returns [a] or [a, match]. Pure, testable. - sync_provider_to_wealthfolio loops over the expansion, so each activity may now contribute up to two rows to the batch.
`fetched` still counts provider-side activities only; `new_after_dedup` + `imported` + `failed` count expanded rows. - tests/test_pipeline.py - Updated two existing pipeline integration tests to reflect the now-larger batch shape (3 BUYs become 6 rows after expansion). - 5 new unit tests for the helpers: BUY → DEPOSIT with fee, SELL → WITHDRAWAL net of fee, DEPOSIT/WITHDRAWAL/DIVIDEND pass through, zero-amount trades skipped, _with_cash_flow_match returns the right cardinality. ## What is NOT in this change - Provider-level opt-out (e.g., Provider.emits_matching_cash_flow = True). No current provider emits real cash flows alongside trades (Trading212 only calls /orders, not /transactions), so the default "always match" is safe. If we ever wire a provider that pulls real bank-transfer dates, add the opt-out then. - Retroactive cleanup of already-imported WF accounts — already done operationally today. ## Verification ### Automated $ poetry run pytest tests/test_pipeline.py -v 7 passed in 0.40s $ poetry run pytest -q 133 passed, 1 skipped in 8.58s $ poetry run mypy broker_sync/pipeline.py tests/test_pipeline.py Success: no issues found in 2 source files $ poetry run ruff check broker_sync/pipeline.py tests/test_pipeline.py All checks passed! ### Manual — next sync Once this image ships and broker-sync-trading212 / broker-sync-imap / broker-sync-fidelity run, confirm: 1. kubectl -n broker-sync logs job/ → fetched=N new=2N imported=2N failed=0 (doubled due to matches). 2. WF /api/v1/holdings?accountId= → cash ≈ £0 for every currency after import. 3. Net Worth chart has no new cliff-jumps. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- broker_sync/pipeline.py | 91 +++++++++++++++++++++++++++++------- tests/test_pipeline.py | 100 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 169 insertions(+), 22 deletions(-) diff --git a/broker_sync/pipeline.py b/broker_sync/pipeline.py index 7921934..59e3e7b 100644 --- a/broker_sync/pipeline.py +++ b/broker_sync/pipeline.py @@ -5,9 +5,10 @@ import logging from collections.abc import AsyncIterator from dataclasses import dataclass from datetime import datetime +from decimal import Decimal from broker_sync.dedup import SyncRecordStore -from broker_sync.models import Account, Activity +from broker_sync.models import Account, Activity, ActivityType from broker_sync.providers.base import Provider from broker_sync.sinks.wealthfolio import WealthfolioSink @@ -51,21 +52,26 @@ async def sync_provider_to_wealthfolio( async for activity in provider.fetch(since=since, before=before): fetched += 1 - if dedup.has_seen(provider.name, activity.account_id, activity.external_id): - continue - new_after_dedup += 1 - _tag_notes(activity, provider.name) - original_account_id = activity.account_id - # Submit under Wealthfolio's UUID; keep dedup keyed on our id. - wf_id = wf_account_ids.get(original_account_id) - if wf_id: - activity.account_id = wf_id - batch.append((original_account_id, activity)) - if len(batch) >= _BATCH_SIZE: - ok, bad = await _flush_batch(sink, dedup, provider.name, batch) - imported += ok - failed += bad - batch = [] + # Expand each BUY/SELL into (original, matching DEPOSIT/WITHDRAWAL). + # See `_matched_cash_flow` — without the match, WF's historical Net + # Worth chart shows phantom spikes because BUYs consume cash that + # was never "deposited" according to the activity log. 
+ for act in _with_cash_flow_match(activity): + if dedup.has_seen(provider.name, act.account_id, act.external_id): + continue + new_after_dedup += 1 + _tag_notes(act, provider.name) + original_account_id = act.account_id + # Submit under Wealthfolio's UUID; keep dedup keyed on our id. + wf_id = wf_account_ids.get(original_account_id) + if wf_id: + act.account_id = wf_id + batch.append((original_account_id, act)) + if len(batch) >= _BATCH_SIZE: + ok, bad = await _flush_batch(sink, dedup, provider.name, batch) + imported += ok + failed += bad + batch = [] if batch: ok, bad = await _flush_batch(sink, dedup, provider.name, batch) @@ -144,3 +150,56 @@ async def _flush_batch( async def collect(iterator: AsyncIterator[Activity]) -> list[Activity]: """Tiny helper — drain an async iterator to a list. Mainly for tests.""" return [a async for a in iterator] + + +# -- Cash-flow matching -------------------------------------------------- +# BUY and SELL activities touch shares, not cash. Without an explicit +# DEPOSIT/WITHDRAWAL on the same day, WF models the account as having +# "phantom" cash debt — and its Net Worth chart shows cliff-jumps +# whenever a lump offset is applied after the fact. +# +# The pipeline emits a matching DEPOSIT (for BUY) or WITHDRAWAL (for SELL) +# right alongside each trade so the account's cash balance reconciles to +# ~0 at every point in time. Matching is unconditional: no provider emits +# real cash flows today (Trading212 only exposes BUY/SELL via the /orders +# endpoint). If one ever does, add a provider-level opt-out first (e.g. a +# `Provider.emits_matching_cash_flow` flag) — no such flag exists yet. + + +def _matched_cash_flow(a: Activity) -> Activity | None: + """Return the DEPOSIT/WITHDRAWAL that funds/receives the BUY/SELL `a`. + + Returns None for every other activity type — those already touch cash + directly (DEPOSIT, WITHDRAWAL, DIVIDEND, FEE, TAX, TRANSFER_*, + CONVERSION_*).
+ """ + if a.activity_type is ActivityType.BUY: + if a.quantity is None or a.unit_price is None: + return None + amount = a.quantity * a.unit_price + (a.fee or Decimal(0)) + kind, tag = ActivityType.DEPOSIT, "buy" + elif a.activity_type is ActivityType.SELL: + if a.quantity is None or a.unit_price is None: + return None + amount = a.quantity * a.unit_price - (a.fee or Decimal(0)) + kind, tag = ActivityType.WITHDRAWAL, "sell" + else: + return None + if amount <= 0: + return None + return Activity( + external_id=f"cash-flow-match:{tag}:{a.external_id}", + account_id=a.account_id, + account_type=a.account_type, + date=a.date, + activity_type=kind, + currency=a.currency, + amount=amount, + notes=f"cash-flow-match:{tag}:{a.external_id}", + ) + + +def _with_cash_flow_match(a: Activity) -> list[Activity]: + """Expand one activity into [original] or [original, matching cash flow].""" + match = _matched_cash_flow(a) + return [a] if match is None else [a, match] diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 481c4d7..e883314 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -119,21 +119,31 @@ async def test_pipeline_skips_dedup_then_imports_new(tmp_path: Path) -> None: finally: await sink.close() + # 3 provider activities fetched, but pipeline expands each BUY into + # (BUY, matching DEPOSIT). "a" is already-seen → skipped; its match + # "cash-flow-match:buy:a" is NEW since it wasn't seeded. assert result.fetched == 3 - assert result.new_after_dedup == 2 - assert result.imported == 2 + assert result.new_after_dedup == 5 + assert result.imported == 5 assert result.failed == 0 assert len(posted_batches) == 1 body = posted_batches[0] - # Only the new rows (b, c) — NOT the already-seen "a". + # Only the new rows (b, c + the 3 matches) — NOT the already-seen "a". assert "sync:fake:a" not in body assert "sync:fake:b" in body assert "sync:fake:c" in body + # Matching DEPOSITs rode along with their trade. 
+ assert "cash-flow-match:buy:a" in body + assert "cash-flow-match:buy:b" in body + assert "cash-flow-match:buy:c" in body - # All three external_ids are now in dedup after the run. + # All six external_ids are now in dedup after the run. assert dedup.has_seen("fake", "fake-isa", "a") assert dedup.has_seen("fake", "fake-isa", "b") assert dedup.has_seen("fake", "fake-isa", "c") + assert dedup.has_seen("fake", "fake-isa", "cash-flow-match:buy:a") + assert dedup.has_seen("fake", "fake-isa", "cash-flow-match:buy:b") + assert dedup.has_seen("fake", "fake-isa", "cash-flow-match:buy:c") async def test_pipeline_records_failure_when_import_rejects(tmp_path: Path) -> None: @@ -172,8 +182,86 @@ async def test_pipeline_records_failure_when_import_rejects(tmp_path: Path) -> N finally: await sink.close() + # Pipeline expands 1 BUY into (BUY, matching DEPOSIT). Both are in the + # batch that /import/check rejects, so both are counted as failed. assert result.fetched == 1 assert result.imported == 0 - assert result.failed == 1 - # NOT recorded in dedup so the next run retries. + assert result.failed == 2 + # NOT recorded in dedup so the next run retries both. 
assert not dedup.has_seen("fake", "fake-isa", "a") + assert not dedup.has_seen("fake", "fake-isa", "cash-flow-match:buy:a") + + +# -- Cash-flow match helpers --------------------------------------------- +from broker_sync.pipeline import _matched_cash_flow, _with_cash_flow_match # noqa: E402 + + +def _make_activity( + activity_type: ActivityType, + *, + quantity: str | None = "1", + unit_price: str | None = "100", + fee: str = "0", + amount: str | None = None, + external_id: str = "x", +) -> Activity: + return Activity( + external_id=external_id, + account_id="acct", + account_type=AccountType.ISA, + date=datetime(2026, 4, 1, tzinfo=UTC), + activity_type=activity_type, + currency="GBP", + quantity=Decimal(quantity) if quantity is not None else None, + unit_price=Decimal(unit_price) if unit_price is not None else None, + fee=Decimal(fee), + amount=Decimal(amount) if amount is not None else None, + ) + + +def test_matched_cash_flow_for_buy_is_deposit_with_total_cost() -> None: + buy = _make_activity( + ActivityType.BUY, quantity="10", unit_price="200.50", fee="1.25", + external_id="buy-1", + ) + match = _matched_cash_flow(buy) + assert match is not None + assert match.activity_type is ActivityType.DEPOSIT + assert match.amount == Decimal("2006.25") # 10*200.50 + 1.25 + assert match.currency == "GBP" + assert match.account_id == buy.account_id + assert match.date == buy.date + assert match.external_id == "cash-flow-match:buy:buy-1" + + +def test_matched_cash_flow_for_sell_is_withdrawal_net_of_fee() -> None: + sell = _make_activity( + ActivityType.SELL, quantity="5", unit_price="300", fee="2.50", + external_id="sell-7", + ) + match = _matched_cash_flow(sell) + assert match is not None + assert match.activity_type is ActivityType.WITHDRAWAL + assert match.amount == Decimal("1497.50") # 5*300 - 2.50 + assert match.external_id == "cash-flow-match:sell:sell-7" + + +def test_matched_cash_flow_none_for_deposit_withdrawal_dividend() -> None: + dep = 
_make_activity(ActivityType.DEPOSIT, quantity=None, unit_price=None, amount="100") + wit = _make_activity(ActivityType.WITHDRAWAL, quantity=None, unit_price=None, amount="50") + div = _make_activity(ActivityType.DIVIDEND, quantity=None, unit_price=None, amount="5") + assert _matched_cash_flow(dep) is None + assert _matched_cash_flow(wit) is None + assert _matched_cash_flow(div) is None + + +def test_matched_cash_flow_skips_zero_amount_trades() -> None: + zero_buy = _make_activity(ActivityType.BUY, quantity="0", unit_price="100") + assert _matched_cash_flow(zero_buy) is None + + +def test_with_cash_flow_match_returns_pair_for_buy_single_for_deposit() -> None: + buy = _make_activity(ActivityType.BUY, external_id="buy-2") + dep = _make_activity(ActivityType.DEPOSIT, quantity=None, unit_price=None, amount="500") + assert len(_with_cash_flow_match(buy)) == 2 + assert len(_with_cash_flow_match(dep)) == 1 From 6f3bcea23e73095eccc45b57dcb663898361417e Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sat, 18 Apr 2026 22:52:38 +0000 Subject: [PATCH 15/17] ci: fix ruff E501 + mypy None-comparison warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_imap.py:49 — one-line comment ran past the 100-char line limit introduced in commit c830856. Split the "£20,000 cap" note onto its own line above the call. test_fidelity_planviewer.py:108 — mypy flagged `offset.amount > 0` where amount is typed Decimal | None. Added an explicit `is not None` guard; runtime behaviour unchanged (we already check offset is not None two lines earlier). $ poetry run ruff check . → All checks passed! 
$ poetry run mypy broker_sync tests → Success: no issues found in 43 source files $ poetry run pytest -q → 133 passed, 1 skipped Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/providers/test_fidelity_planviewer.py | 2 +- tests/providers/test_imap.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/providers/test_fidelity_planviewer.py b/tests/providers/test_fidelity_planviewer.py index fe4feca..55b069e 100644 --- a/tests/providers/test_fidelity_planviewer.py +++ b/tests/providers/test_fidelity_planviewer.py @@ -105,7 +105,7 @@ def test_gains_offset_emits_deposit_when_pot_exceeds_contributions() -> None: offset = _gains_offset_activity(holdings, txs, as_of) assert offset is not None assert offset.activity_type in (ActivityType.DEPOSIT, ActivityType.WITHDRAWAL) - assert offset.amount > 0 + assert offset.amount is not None and offset.amount > 0 assert offset.external_id == "fidelity:gains:2026-04-18" diff --git a/tests/providers/test_imap.py b/tests/providers/test_imap.py index 5e1c14f..63638cb 100644 --- a/tests/providers/test_imap.py +++ b/tests/providers/test_imap.py @@ -46,7 +46,8 @@ def test_single_tax_year_under_cap_stays_isa() -> None: def test_overflow_past_cap_flips_to_gia() -> None: acts = [ _buy(datetime(2024, 5, 1, tzinfo=UTC), "100", "80"), # £8,000 - _buy(datetime(2024, 6, 1, tzinfo=UTC), "150", "80"), # +£12,000 → £20,000 total; prev £8k < cap → ISA + # +£12,000 → £20,000 total; prev £8k < cap → ISA + _buy(datetime(2024, 6, 1, tzinfo=UTC), "150", "80"), _buy(datetime(2024, 7, 1, tzinfo=UTC), "10", "80"), # prev £20,000 ≥ cap → GIA _buy(datetime(2024, 8, 1, tzinfo=UTC), "10", "80"), # GIA ] From 1d1e20b72b13fcecfc3caf34e48d6387d38fe14a Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Sun, 19 Apr 2026 18:27:58 +0000 Subject: [PATCH 16/17] schwab: detect vest-confirmation emails + emit VestEvent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends parse_schwab_email to handle 
Schwab's RSU Release Confirmation emails alongside the existing trade confirmations. Adds: - `VestEvent` dataclass in models.py — carries vest_date, ticker, shares_vested, shares_sold_to_cover, fmv_at_vest_usd, tax_withheld_usd. Written to payslip_ingest.rsu_vest_events by a postgres sink (pending a real email fixture + cross-service DB grant). - `parse_schwab_email_full()` — new entry point returning both `list[Activity]` and `VestEvent | None`. The legacy `parse_schwab_email()` shape is preserved for existing callers. - Vest-release dispatch heuristic: HTML body mentions "Release Confirmation" / "Award Vesting" / "RSU Release". On match, extract vest fields via label regexes; the full vest becomes a BUY Activity and the sell-to-cover slice becomes a SELL Activity at the same FMV (net zero cash on the day). Gross vest + sell-to-cover returned so Wealthfolio gets the full portfolio picture. - Tests: 3 new (vest roundtrip, unparseable-vest safety, legacy shape preserved); existing 6 unchanged. The regex heuristics will need tightening once a real email sample exists — the HTML structure observed in public Schwab emails may differ in material ways. For now, unmatched vest bodies return empty-result (no Activity, no VestEvent) rather than crashing the IMAP batch. Part of: code-860 --- broker_sync/models.py | 24 +++ broker_sync/providers/parsers/schwab.py | 197 ++++++++++++++++++++++-- tests/providers/parsers/test_schwab.py | 56 +++++++ 3 files changed, 261 insertions(+), 16 deletions(-) diff --git a/broker_sync/models.py b/broker_sync/models.py index 17eff39..dd1be88 100644 --- a/broker_sync/models.py +++ b/broker_sync/models.py @@ -102,3 +102,27 @@ def _fmt(v: Decimal | None) -> str: if v is None: return "" return format(v, "f") + + +@dataclass +class VestEvent: + """Schwab RSU vest event — written to payslip_ingest.rsu_vest_events. + + Carries both the gross vest (shares x FMV) and the sell-to-cover portion + (shares withheld for tax x FMV). 
Sibling Activity records (one BUY for + the full vest, one SELL for the sold-to-cover slice) are produced + separately for Wealthfolio. + + USD-only at parse time; FX conversion happens at the postgres sink via + the ECB daily rate so the DB row carries both the raw USD figures and + the GBP-translated values for dashboard joins. + """ + external_id: str # schwab:{date}:{ticker}:VEST:{shares_vested} + vest_date: datetime + ticker: str + shares_vested: Decimal + shares_sold_to_cover: Decimal | None + fmv_at_vest_usd: Decimal + tax_withheld_usd: Decimal | None + source: str = "schwab_email" + raw: dict[str, str] = field(default_factory=dict) diff --git a/broker_sync/providers/parsers/schwab.py b/broker_sync/providers/parsers/schwab.py index fe5f5f3..aeef7d0 100644 --- a/broker_sync/providers/parsers/schwab.py +++ b/broker_sync/providers/parsers/schwab.py @@ -1,37 +1,79 @@ """Schwab workplace-RSU email parser. -Schwab sends HTML transaction-confirmation emails with the core fields in -five `` elements: -1. Trade date (human format — e.g. "Jan 23, 2025") -2. Direction word ("Sold" for SELL; anything else is BUY) -3. Quantity (share count, float) -4. Ticker -5. Price ("$123.45" — currency-sign-prefixed) +Two email shapes are handled: -One email → one Activity. On any parse failure we return an empty list -(same as the original finance/ behaviour — an unparseable email shouldn't -crash the whole IMAP batch). +1. Trade confirmations (sell-to-cover or user-initiated trades): HTML + with five `` cells + holding date / direction / quantity / ticker / price. → one Activity. -Ported from finance/position/provider/schwab/message_parser.py (39 lines). -Dropped: per-row timestamp id suffix (we use ISO date + ticker + qty which -is stable across re-pulls), currency-from-sign hackery (US Schwab is USD- -only in practice — if that ever changes we'll add FX on parse). +2. 
Release Confirmations (RSU vest events): subject/body mentions + "Release Confirmation" or "Award Vesting"; body lists vest date, + shares released, FMV, shares sold to cover, and USD tax withheld. + → a VestParseResult: the gross vest (BUY at FMV), the sell-to-cover + (SELL at FMV, only when shares were sold), and a standalone VestEvent + for the payslip-ingest reconciliation pipeline. + +On any parse failure we return the neutral empty result (no Activities, +no VestEvent) — an unparseable email shouldn't crash the IMAP batch. """ from __future__ import annotations +import logging +import re +from dataclasses import dataclass from decimal import Decimal, InvalidOperation from bs4 import BeautifulSoup from dateutil import parser as dateparser -from broker_sync.models import AccountType, Activity, ActivityType +from broker_sync.models import AccountType, Activity, ActivityType, VestEvent + +log = logging.getLogger(__name__) _ACCOUNT_ID = "schwab-workplace" _DEFAULT_CURRENCY = "USD" +# Vest-confirmation emails reliably include one of these phrases. Matching +# is case-insensitive and on the raw HTML (cheap — no DOM parse needed). +_VEST_SUBJECT_RE = re.compile(r"Release Confirmation|Award Vesting|RSU Release", + re.IGNORECASE) + + +@dataclass +class VestParseResult: + activities: list[Activity] + vest_event: VestEvent | None + def parse_schwab_email(raw_html: str) -> list[Activity]: - """Return a single-item list of Activity on success, empty on failure.""" + """Return a single-item list of Activity on success, empty on failure. + + For vest-confirmation emails, returns the two Activity rows (gross + vest + sell-to-cover). Use `parse_schwab_email_full` when the caller + also needs the VestEvent. + """ + return parse_schwab_email_full(raw_html).activities + + +def parse_schwab_email_full(raw_html: str) -> VestParseResult: + """Full parse — returns activities + optional VestEvent.
+ + Dispatches: vest-confirmation emails → `_parse_vest_release`; + everything else → the legacy single-row confirmation parser. + """ + if _VEST_SUBJECT_RE.search(raw_html): + result = _parse_vest_release(raw_html) + if result is not None: + return result + log.warning("schwab: detected vest email but could not extract fields; " + "add a real fixture to broker-sync/tests/fixtures/") + return VestParseResult(activities=[], vest_event=None) + + return VestParseResult(activities=_parse_trade_confirmation(raw_html), vest_event=None) + + +def _parse_trade_confirmation(raw_html: str) -> list[Activity]: + """Legacy 5-cell trade confirmation parser.""" try: soup = BeautifulSoup(raw_html, "html.parser") cells = [ @@ -73,3 +115,126 @@ def parse_schwab_email(raw_html: str) -> list[Activity]: ] except (ValueError, InvalidOperation, IndexError, AttributeError): return [] + + +# Heuristic extractors for vest-release emails. Labels observed in public +# Schwab RSU release samples; real fixture needed to tighten these. 
+_VEST_DATE_RE = re.compile( + r"(?:Release Date|Vest Date|Vesting Date)\s*[:<][^0-9]*" + r"(\d{1,2}[\s/\-][A-Za-z]{3}[\s/\-]\d{2,4}|\d{2}/\d{2}/\d{4}|\d{4}-\d{2}-\d{2})", + re.IGNORECASE) +_VEST_TICKER_RE = re.compile(r"(?:Ticker|Symbol)\s*[:<]\s*([A-Z]{2,5})", + re.IGNORECASE) +_VEST_SHARES_RELEASED_RE = re.compile( + r"(?:Shares Released|Total Shares (?:Released|Vested))\s*[:<]\s*" + r"([\d,]+(?:\.\d+)?)", + re.IGNORECASE) +_VEST_SHARES_WITHHELD_RE = re.compile( + r"(?:Shares (?:Withheld|Sold)(?: for Taxes)?)\s*[:<]\s*" + r"([\d,]+(?:\.\d+)?)", + re.IGNORECASE) +_VEST_FMV_RE = re.compile( + r"(?:Market Price|FMV|Fair Market Value)\s*[:<]\s*" + r"\$?\s*([\d,]+(?:\.\d+)?)", + re.IGNORECASE) +_VEST_TAX_USD_RE = re.compile( + r"(?:Tax Withholding Amount|Total Tax Withholding|Tax Withheld)\s*[:<]\s*" + r"\$?\s*([\d,]+(?:\.\d+)?)", + re.IGNORECASE) + + +def _parse_vest_release(raw_html: str) -> VestParseResult | None: + """Best-effort extraction from a Schwab Release Confirmation email. + + Runs label regexes on the plain-text view of the HTML. Returns None if + the core four fields (date, ticker, shares released, FMV) don't all + resolve (the caller then logs a warning and returns an empty result): + the heuristics need a real fixture before they can be trusted on a live
+ """ + try: + soup = BeautifulSoup(raw_html, "html.parser") + text = soup.get_text(" ", strip=True) + except Exception: + return None + + date_str = _search_group(_VEST_DATE_RE, text) + ticker = _search_group(_VEST_TICKER_RE, text) + shares_released_str = _search_group(_VEST_SHARES_RELEASED_RE, text) + fmv_str = _search_group(_VEST_FMV_RE, text) + if not (date_str and ticker and shares_released_str and fmv_str): + return None + + try: + vest_date = dateparser.parse(date_str) + shares_vested = Decimal(shares_released_str.replace(",", "")) + fmv = Decimal(fmv_str.replace(",", "")) + except (ValueError, InvalidOperation): + return None + + shares_sold_str = _search_group(_VEST_SHARES_WITHHELD_RE, text) + shares_sold_to_cover = (Decimal(shares_sold_str.replace(",", "")) + if shares_sold_str else None) + tax_usd_str = _search_group(_VEST_TAX_USD_RE, text) + tax_withheld_usd = (Decimal(tax_usd_str.replace(",", "")) + if tax_usd_str else None) + + external_id = (f"schwab:{vest_date.date().isoformat()}:{ticker}:VEST:" + f"{shares_vested}") + + vest_event = VestEvent( + external_id=external_id, + vest_date=vest_date, + ticker=ticker, + shares_vested=shares_vested, + shares_sold_to_cover=shares_sold_to_cover, + fmv_at_vest_usd=fmv, + tax_withheld_usd=tax_withheld_usd, + source="schwab_email", + raw={ + "date": date_str, + "ticker": ticker, + "shares_released": shares_released_str, + "fmv": fmv_str, + "shares_withheld": shares_sold_str or "", + "tax_withheld": tax_usd_str or "", + }, + ) + + # Sibling Activities for Wealthfolio: full vest as BUY, sell-to-cover + # slice as SELL, both at the same FMV so net cash = 0 on that day. 
+ activities: list[Activity] = [ + Activity( + external_id=f"{external_id}:BUY", + account_id=_ACCOUNT_ID, + account_type=AccountType.GIA, + date=vest_date, + activity_type=ActivityType.BUY, + symbol=ticker, + quantity=shares_vested, + unit_price=fmv, + currency=_DEFAULT_CURRENCY, + notes="schwab-vest-release", + ) + ] + if shares_sold_to_cover is not None and shares_sold_to_cover > 0: + activities.append( + Activity( + external_id=f"{external_id}:SELL_TO_COVER", + account_id=_ACCOUNT_ID, + account_type=AccountType.GIA, + date=vest_date, + activity_type=ActivityType.SELL, + symbol=ticker, + quantity=shares_sold_to_cover, + unit_price=fmv, + currency=_DEFAULT_CURRENCY, + notes="schwab-sell-to-cover", + )) + + return VestParseResult(activities=activities, vest_event=vest_event) + + +def _search_group(pattern: re.Pattern[str], text: str) -> str | None: + m = pattern.search(text) + return m.group(1).strip() if m else None diff --git a/tests/providers/parsers/test_schwab.py b/tests/providers/parsers/test_schwab.py index 8e3c736..c39bd0c 100644 --- a/tests/providers/parsers/test_schwab.py +++ b/tests/providers/parsers/test_schwab.py @@ -82,3 +82,59 @@ def test_price_with_commas_parses() -> None: html = _SELL.replace("$612.34", "$1,612.34") a = parse_schwab_email(html)[0] assert a.unit_price == Decimal("1612.34") + + +# --- Vest-release parsing ------------------------------------------------- + +_VEST_RELEASE = """ +

Release Confirmation

+

+Release Date: 15 Mar 2026 +Ticker: META +Total Shares Released: 100.0 +Market Price: $612.34 +Shares Withheld for Taxes: 45 +Tax Withholding Amount: $27,555.30 +

+""" + + +def test_vest_release_returns_two_activities_and_vest_event() -> None: + """Release Confirmation yields a BUY (full vest) + SELL (sell-to-cover) + VestEvent.""" + from broker_sync.providers.parsers.schwab import parse_schwab_email_full + + result = parse_schwab_email_full(_VEST_RELEASE) + assert result.vest_event is not None + assert result.vest_event.ticker == "META" + assert result.vest_event.shares_vested == Decimal("100.0") + assert result.vest_event.shares_sold_to_cover == Decimal("45") + assert result.vest_event.fmv_at_vest_usd == Decimal("612.34") + assert result.vest_event.tax_withheld_usd == Decimal("27555.30") + assert result.vest_event.vest_date.date().isoformat() == "2026-03-15" + assert result.vest_event.external_id.startswith("schwab:2026-03-15:META:VEST:") + + assert len(result.activities) == 2 + buy = result.activities[0] + assert buy.activity_type is ActivityType.BUY + assert buy.quantity == Decimal("100.0") + sell = result.activities[1] + assert sell.activity_type is ActivityType.SELL + assert sell.quantity == Decimal("45") + assert sell.unit_price == Decimal("612.34") + + +def test_vest_email_with_unparseable_body_returns_empty() -> None: + """Subject says Release Confirmation but fields missing → empty result, no crash.""" + from broker_sync.providers.parsers.schwab import parse_schwab_email_full + + html = "Release Confirmation — please contact support" + result = parse_schwab_email_full(html) + assert result.vest_event is None + assert result.activities == [] + + +def test_back_compat_parse_schwab_email_drops_vest_event() -> None: + """The legacy list[Activity] shape remains stable for existing callers.""" + acts = parse_schwab_email(_VEST_RELEASE) + assert len(acts) == 2 + assert all(isinstance(a.activity_type, ActivityType) for a in acts) From dfee29fda72a8c70514ebad1872e2fae0664b245 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Thu, 7 May 2026 22:33:29 +0000 Subject: [PATCH 17/17] [ci] Add Woodpecker build pushing to 
forgejo.viktorbarzin.me/viktor/wealthfolio-sync Companion to the existing GHA pipeline that pushes broker-sync to DockerHub. The Woodpecker build pushes to Forgejo as wealthfolio-sync (image name kept to match the existing infra/stacks/wealthfolio/main.tf CronJob reference, which has been broken since registry-private lost the image). --- .woodpecker/build.yml | 45 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 .woodpecker/build.yml diff --git a/.woodpecker/build.yml b/.woodpecker/build.yml new file mode 100644 index 0000000..423ea0c --- /dev/null +++ b/.woodpecker/build.yml @@ -0,0 +1,45 @@ +when: + event: push + branch: [main, master] + +clone: + git: + image: woodpeckerci/plugin-git + settings: + attempts: 5 + backoff: 10s + +steps: + - name: lint-and-test + image: python:3.12-slim + commands: + - pip install --no-cache-dir "poetry==1.8.4" + - poetry install --no-interaction --no-root + - poetry run ruff check . + - poetry run mypy broker_sync tests + - poetry run pytest -q + + - name: build-and-push + image: woodpeckerci/plugin-docker-buildx + depends_on: + - lint-and-test + settings: + # Image name is `wealthfolio-sync` to match the deployment in + # infra/stacks/wealthfolio/main.tf (CronJob `wealthfolio-sync`). + # The repo is called `broker-sync` because the source covers + # multiple brokers (Trading 212, Schwab, Fidelity, IMAP-CSV) — + # we just happen to publish it under the wealthfolio name since + # that's the consumer stack. + repo: + - forgejo.viktorbarzin.me/viktor/wealthfolio-sync + logins: + - registry: forgejo.viktorbarzin.me + username: + from_secret: forgejo_user + password: + from_secret: forgejo_push_token + dockerfile: Dockerfile + context: . + auto_tag: true + platforms: + - linux/amd64