broker-sync/broker_sync/providers/imap.py
Viktor Barzin 6efd03570a Add imap-ingest CLI + ImapProvider: route emails to IE/Schwab parsers
Wires the IE + Schwab email parsers into an actual runnable sync. Walks
the IMAP mailbox, routes each message by sender domain:
  - *@investengine.com → invest_engine.parse_invest_engine_email
  - *@schwab.com       → schwab.parse_schwab_email
then pushes the resulting Activities through the shared pipeline.

broker-sync imap-ingest — new CLI command taking IMAP_HOST/USER/PASSWORD/
DIRECTORY (mirrors the old wealthfolio-sync image's env shape so the
Terraform CronJob's existing env wiring works unchanged).

Verified: poetry run pytest -q → 109 passed + 1 skipped; mypy strict
clean (37 files); ruff + yapf clean.
2026-04-17 22:12:05 +00:00

189 lines
6 KiB
Python

"""IMAP email ingestor: dispatches messages to the matching parser by sender.
Used by the `imap-ingest` CLI command for InvestEngine + Schwab confirmation
emails. Each message passes through:
1. Pull ALL messages from the configured mailbox directory.
2. Route each by `From:` to a parser:
- noreply@investengine.com (+ equivalents) → invest_engine parser
- Schwab confirmations (equityawards@schwab.com, etc.) → schwab parser
3. Merge parser output into one list[Activity] with source attribution.
Not imap-idle; runs once per invocation. Designed for a daily CronJob.
"""
from __future__ import annotations
import email
import imaplib
import logging
import re
import ssl
from collections.abc import AsyncIterator, Iterator
from datetime import datetime
from email.message import Message
from typing import NamedTuple
from broker_sync.models import Account, AccountType, Activity
from broker_sync.providers.parsers import invest_engine as ie_parser
from broker_sync.providers.parsers.schwab import parse_schwab_email
log = logging.getLogger(__name__)
_IE_SENDERS = {"noreply@investengine.com", "hello@investengine.com"}
_SCHWAB_SENDERS = {
"equityawards@schwab.com",
"donotreply@schwab.com",
"wealthnotify@schwab.com",
}
_ADDR_RE = re.compile(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+")
class ImapCreds(NamedTuple):
host: str
user: str
password: str
directory: str
def _extract_sender(msg: Message) -> str:
raw = msg.get("From", "")
m = _ADDR_RE.search(raw)
return (m.group(0) if m else "").lower()
def _html_or_text(msg: Message) -> str:
"""Return the richest body available (prefer HTML)."""
if msg.is_multipart():
html = None
plain = None
for part in msg.walk():
ct = part.get_content_type()
if ct == "text/html" and html is None:
html = part.get_payload(decode=True)
elif ct == "text/plain" and plain is None:
plain = part.get_payload(decode=True)
body = html or plain
else:
body = msg.get_payload(decode=True)
if body is None:
return ""
if isinstance(body, bytes):
charset = msg.get_content_charset() or "utf-8"
try:
return body.decode(charset, errors="replace")
except LookupError:
return body.decode("utf-8", errors="replace")
return str(body)
def _fetch_all(creds: ImapCreds) -> Iterator[bytes]:
ctx = ssl.create_default_context()
with imaplib.IMAP4_SSL(creds.host, ssl_context=ctx) as m:
m.login(creds.user, creds.password)
typ, _ = m.select(creds.directory, readonly=True)
if typ != "OK":
raise RuntimeError(f"IMAP select {creds.directory} failed: {typ}")
typ, data = m.search(None, "ALL")
if typ != "OK":
raise RuntimeError(f"IMAP search failed: {typ}")
ids = data[0].split()
log.info("imap: fetching %d messages from %s", len(ids), creds.directory)
for uid in ids:
typ, rsp = m.fetch(uid, "(RFC822)")
if typ != "OK" or not rsp or not rsp[0]:
continue
raw = rsp[0][1]
if isinstance(raw, bytes):
yield raw
def fetch_activities(creds: ImapCreds) -> list[Activity]:
out: list[Activity] = []
ie_parsed = schwab_parsed = skipped = 0
for raw in _fetch_all(creds):
try:
msg = email.message_from_bytes(raw)
except Exception:
skipped += 1
continue
sender = _extract_sender(msg)
if sender in _IE_SENDERS or sender.endswith("@investengine.com"):
out.extend(ie_parser.parse_invest_engine_email(raw))
ie_parsed += 1
elif sender in _SCHWAB_SENDERS or sender.endswith("@schwab.com"):
html = _html_or_text(msg)
out.extend(parse_schwab_email(html))
schwab_parsed += 1
else:
skipped += 1
log.info(
"imap: ie_parsed=%d schwab_parsed=%d skipped=%d%d activities",
ie_parsed,
schwab_parsed,
skipped,
len(out),
)
return out
class ImapProvider:
"""Wraps the IMAP fetch + per-sender parse into the Provider protocol.
Yields both InvestEngine AND Schwab activities — downstream the
pipeline's dedup keyed on (provider, account, external_id) already
isolates them by account_id.
"""
name = "imap"
def __init__(self, creds: ImapCreds) -> None:
self._creds = creds
def accounts(self) -> list[Account]:
return [
Account(
id="invest-engine-primary",
name="InvestEngine ISA",
account_type=AccountType.ISA,
currency="GBP",
provider="invest-engine",
),
Account(
id="schwab-workplace",
name="Schwab (US workplace)",
account_type=AccountType.GIA,
currency="USD",
provider="schwab",
),
]
async def fetch(
self,
*,
since: datetime | None = None,
before: datetime | None = None,
) -> AsyncIterator[Activity]:
# IMAP doesn't give us a server-side date range directly without
# constructing IMAP SEARCH criteria; filter client-side.
for a in fetch_activities(self._creds):
if since is not None and a.date < since:
continue
if before is not None and a.date >= before:
continue
yield a
if __name__ == "__main__":
# Local smoke — invoked manually for debug, never from the CronJob.
import os
logging.basicConfig(level=logging.INFO)
c = ImapCreds(
host=os.environ["IMAP_HOST"],
user=os.environ["IMAP_USER"],
password=os.environ["IMAP_PASSWORD"],
directory=os.environ.get("IMAP_DIRECTORY", "INBOX"),
)
acts = fetch_activities(c)
print(f"total={len(acts)}")
for a in acts[:5]:
print(f" {a.activity_type} {a.symbol} {a.date.isoformat()}")