"""IMAP email ingestor: dispatches messages to the matching parser by sender. Used by the `imap-ingest` CLI command for InvestEngine + Schwab confirmation emails. Each message passes through: 1. Pull ALL messages from the configured mailbox directory. 2. Route each by `From:` to a parser: - noreply@investengine.com (+ equivalents) → invest_engine parser - Schwab confirmations (equityawards@schwab.com, etc.) → schwab parser 3. Merge parser output into one list[Activity] with source attribution. Not imap-idle; runs once per invocation. Designed for a daily CronJob. """ from __future__ import annotations import email import imaplib import logging import re import ssl from collections.abc import AsyncIterator, Iterator from datetime import datetime from email.message import Message from typing import NamedTuple from broker_sync.models import Account, AccountType, Activity from broker_sync.providers.parsers import invest_engine as ie_parser from broker_sync.providers.parsers.schwab import parse_schwab_email log = logging.getLogger(__name__) _IE_SENDERS = {"noreply@investengine.com", "hello@investengine.com"} _SCHWAB_SENDERS = { "equityawards@schwab.com", "donotreply@schwab.com", "wealthnotify@schwab.com", } _ADDR_RE = re.compile(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+") class ImapCreds(NamedTuple): host: str user: str password: str directory: str def _extract_sender(msg: Message) -> str: raw = msg.get("From", "") m = _ADDR_RE.search(raw) return (m.group(0) if m else "").lower() def _html_or_text(msg: Message) -> str: """Return the richest body available (prefer HTML).""" if msg.is_multipart(): html = None plain = None for part in msg.walk(): ct = part.get_content_type() if ct == "text/html" and html is None: html = part.get_payload(decode=True) elif ct == "text/plain" and plain is None: plain = part.get_payload(decode=True) body = html or plain else: body = msg.get_payload(decode=True) if body is None: return "" if isinstance(body, bytes): charset = msg.get_content_charset() or "utf-8" try: return body.decode(charset, errors="replace") except LookupError: return body.decode("utf-8", errors="replace") return str(body) def _fetch_all(creds: ImapCreds) -> Iterator[bytes]: ctx = ssl.create_default_context() with imaplib.IMAP4_SSL(creds.host, ssl_context=ctx) as m: m.login(creds.user, creds.password) typ, _ = m.select(creds.directory, readonly=True) if typ != "OK": raise RuntimeError(f"IMAP select {creds.directory} failed: {typ}") typ, data = m.search(None, "ALL") if typ != "OK": raise RuntimeError(f"IMAP search failed: {typ}") ids = data[0].split() log.info("imap: fetching %d messages from %s", len(ids), creds.directory) for uid in ids: typ, rsp = m.fetch(uid, "(RFC822)") if typ != "OK" or not rsp or not rsp[0]: continue raw = rsp[0][1] if isinstance(raw, bytes): yield raw def fetch_activities(creds: ImapCreds) -> list[Activity]: out: list[Activity] = [] ie_parsed = schwab_parsed = skipped = 0 for raw in _fetch_all(creds): try: msg = email.message_from_bytes(raw) except Exception: skipped += 1 continue sender = _extract_sender(msg) if sender in _IE_SENDERS or sender.endswith("@investengine.com"): out.extend(ie_parser.parse_invest_engine_email(raw)) ie_parsed += 1 elif sender in _SCHWAB_SENDERS or sender.endswith("@schwab.com"): html = _html_or_text(msg) out.extend(parse_schwab_email(html)) schwab_parsed += 1 else: skipped += 1 log.info( "imap: ie_parsed=%d schwab_parsed=%d skipped=%d → %d activities", ie_parsed, schwab_parsed, skipped, len(out), ) return out class ImapProvider: """Wraps the IMAP fetch + per-sender parse into the Provider protocol. Yields both InvestEngine AND Schwab activities — downstream the pipeline's dedup keyed on (provider, account, external_id) already isolates them by account_id. """ name = "imap" def __init__(self, creds: ImapCreds) -> None: self._creds = creds def accounts(self) -> list[Account]: return [ Account( id="invest-engine-primary", name="InvestEngine ISA", account_type=AccountType.ISA, currency="GBP", provider="invest-engine", ), Account( id="schwab-workplace", name="Schwab (US workplace)", account_type=AccountType.GIA, currency="USD", provider="schwab", ), ] async def fetch( self, *, since: datetime | None = None, before: datetime | None = None, ) -> AsyncIterator[Activity]: # IMAP doesn't give us a server-side date range directly without # constructing IMAP SEARCH criteria; filter client-side. for a in fetch_activities(self._creds): if since is not None and a.date < since: continue if before is not None and a.date >= before: continue yield a if __name__ == "__main__": # Local smoke — invoked manually for debug, never from the CronJob. import os logging.basicConfig(level=logging.INFO) c = ImapCreds( host=os.environ["IMAP_HOST"], user=os.environ["IMAP_USER"], password=os.environ["IMAP_PASSWORD"], directory=os.environ.get("IMAP_DIRECTORY", "INBOX"), ) acts = fetch_activities(c) print(f"total={len(acts)}") for a in acts[:5]: print(f" {a.activity_type} {a.symbol} {a.date.isoformat()}")