Wires the IE + Schwab email parsers into an actual runnable sync.

Walks the IMAP mailbox and routes each message by sender domain:
- *@investengine.com → invest_engine.parse_invest_engine_email
- *@schwab.com → schwab.parse_schwab_email
then pushes the resulting Activities through the shared pipeline.

broker-sync imap-ingest — new CLI command taking IMAP_HOST/USER/PASSWORD/DIRECTORY (mirrors the old wealthfolio-sync image's env shape so the Terraform CronJob's existing env wiring works unchanged).

Verified: poetry run pytest -q → 109 passed + 1 skipped; mypy strict clean (37 files); ruff + yapf clean.
189 lines
6 KiB
Python
189 lines
6 KiB
Python
"""IMAP email ingestor: dispatches messages to the matching parser by sender.
|
|
|
|
Used by the `imap-ingest` CLI command for InvestEngine + Schwab confirmation
|
|
emails. Each message passes through:
|
|
|
|
1. Pull ALL messages from the configured mailbox directory.
|
|
2. Route each by `From:` to a parser:
|
|
- noreply@investengine.com (+ equivalents) → invest_engine parser
|
|
- Schwab confirmations (equityawards@schwab.com, etc.) → schwab parser
|
|
3. Merge parser output into one list[Activity] with source attribution.
|
|
|
|
Not imap-idle; runs once per invocation. Designed for a daily CronJob.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import email
|
|
import imaplib
|
|
import logging
|
|
import re
|
|
import ssl
|
|
from collections.abc import AsyncIterator, Iterator
|
|
from datetime import datetime
|
|
from email.message import Message
|
|
from typing import NamedTuple
|
|
|
|
from broker_sync.models import Account, AccountType, Activity
|
|
from broker_sync.providers.parsers import invest_engine as ie_parser
|
|
from broker_sync.providers.parsers.schwab import parse_schwab_email
|
|
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Exact sender addresses that are routed to the InvestEngine parser.
_IE_SENDERS = {
    "noreply@investengine.com",
    "hello@investengine.com",
}

# Exact sender addresses that are routed to the Schwab parser.
_SCHWAB_SENDERS = {
    "equityawards@schwab.com",
    "donotreply@schwab.com",
    "wealthnotify@schwab.com",
}

# Loose "looks like an email address" pattern: a local part, an "@", and a
# domain containing at least one dot. Used to pull an address out of a raw
# header value; it is not a full RFC 5322 validator.
_ADDR_RE = re.compile(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+")
|
|
|
|
|
|
class ImapCreds(NamedTuple):
    """Connection settings for one IMAP mailbox.

    An immutable 4-tuple, so it can be passed around and compared freely.
    """

    # Server hostname handed to ``imaplib.IMAP4_SSL``.
    host: str
    # Login name.
    user: str
    # Login password.
    password: str
    # Mailbox (folder) selected before searching, e.g. "INBOX".
    directory: str
|
|
|
|
|
|
def _extract_sender(msg: Message) -> str:
    """Return the lower-cased sender address from ``From:``, or ``""``.

    The header is parsed structurally first, so an address-looking display
    name (``"noreply@investengine.com" <someone@else.example>``) cannot be
    mistaken for the real sender. The regex is kept only as a fallback for
    header values the structured parser cannot make sense of.

    Args:
        msg: Parsed email message.

    Returns:
        The sender's address in lower case, or an empty string when the
        header is missing or contains nothing that looks like an address.
    """
    # Local import: keeps this function self-contained without touching the
    # module's import block.
    from email.utils import parseaddr

    # The header value is not guaranteed to be a plain ``str`` (the legacy
    # email policy can return a Header object), so coerce before parsing.
    raw = str(msg.get("From", "") or "")
    if not raw.strip():
        return ""

    _display_name, addr = parseaddr(raw)
    if "@" in addr:
        return addr.lower()

    # Fallback for malformed headers: first address-like token, as before.
    m = _ADDR_RE.search(raw)
    return (m.group(0) if m else "").lower()
|
|
|
|
|
|
def _html_or_text(msg: Message) -> str:
    """Return the richest body available (prefer HTML), decoded to ``str``.

    For multipart messages the first ``text/html`` part is preferred and the
    first ``text/plain`` part is the fallback when there is no HTML part or
    its payload is empty. The body is decoded with the charset declared on
    the part that was actually chosen; the multipart container itself
    normally declares none, which previously forced every part through
    UTF-8 and mangled e.g. ISO-8859-1 bodies.

    Args:
        msg: Parsed email message.

    Returns:
        The decoded body text, or ``""`` when no body could be found.
    """
    source = msg  # the message/part whose charset applies to ``body``
    if msg.is_multipart():
        html_part = None
        plain_part = None
        for part in msg.walk():
            ct = part.get_content_type()
            if ct == "text/html" and html_part is None:
                html_part = part
            elif ct == "text/plain" and plain_part is None:
                plain_part = part
        body = None
        for candidate in (html_part, plain_part):
            if candidate is None:
                continue
            payload = candidate.get_payload(decode=True)
            if payload:
                body, source = payload, candidate
                break
    else:
        body = msg.get_payload(decode=True)

    if body is None:
        return ""
    if isinstance(body, bytes):
        charset = source.get_content_charset() or "utf-8"
        try:
            return body.decode(charset, errors="replace")
        except LookupError:
            # Unknown/bogus charset label in the header: fall back to UTF-8.
            return body.decode("utf-8", errors="replace")
    return str(body)
|
|
|
|
|
|
def _fetch_all(creds: ImapCreds) -> Iterator[bytes]:
    """Yield the raw RFC822 bytes of every message in ``creds.directory``.

    Opens a TLS connection, logs in, selects the mailbox read-only and
    fetches each message returned by an ``ALL`` search. Messages whose fetch
    fails or whose response carries no bytes payload are skipped. The
    connection stays open for as long as the generator is being consumed.

    Raises:
        RuntimeError: If selecting the mailbox or the search does not
            return ``OK``.
    """
    tls_context = ssl.create_default_context()
    with imaplib.IMAP4_SSL(creds.host, ssl_context=tls_context) as conn:
        conn.login(creds.user, creds.password)

        status, _ = conn.select(creds.directory, readonly=True)
        if status != "OK":
            raise RuntimeError(f"IMAP select {creds.directory} failed: {status}")

        status, found = conn.search(None, "ALL")
        if status != "OK":
            raise RuntimeError(f"IMAP search failed: {status}")

        # The search result is one space-separated bytes string of message ids.
        message_ids = found[0].split()
        log.info("imap: fetching %d messages from %s", len(message_ids), creds.directory)

        for message_id in message_ids:
            status, response = conn.fetch(message_id, "(RFC822)")
            if status == "OK" and response and response[0]:
                # A successful fetch item is an (envelope, payload) pair.
                payload = response[0][1]
                if isinstance(payload, bytes):
                    yield payload
|
|
|
|
|
|
def fetch_activities(creds: ImapCreds) -> list[Activity]:
    """Fetch every message in the mailbox and parse it into activities.

    Each message is routed by its sender address: InvestEngine senders get
    the raw message bytes passed to the InvestEngine parser, Schwab senders
    get the extracted HTML/text body passed to the Schwab parser, and
    anything else is counted as skipped. A one-line summary of the counts is
    logged at the end.

    Returns:
        All activities produced by both parsers, in mailbox order.
    """
    activities: list[Activity] = []
    n_ie = 0
    n_schwab = 0
    n_skipped = 0

    for raw_message in _fetch_all(creds):
        try:
            parsed = email.message_from_bytes(raw_message)
        except Exception:
            n_skipped += 1
            continue

        from_addr = _extract_sender(parsed)
        is_invest_engine = from_addr in _IE_SENDERS or from_addr.endswith("@investengine.com")
        is_schwab = from_addr in _SCHWAB_SENDERS or from_addr.endswith("@schwab.com")

        if is_invest_engine:
            # The InvestEngine parser takes the raw bytes, not the parsed message.
            activities.extend(ie_parser.parse_invest_engine_email(raw_message))
            n_ie += 1
        elif is_schwab:
            body_text = _html_or_text(parsed)
            activities.extend(parse_schwab_email(body_text))
            n_schwab += 1
        else:
            n_skipped += 1

    log.info(
        "imap: ie_parsed=%d schwab_parsed=%d skipped=%d → %d activities",
        n_ie,
        n_schwab,
        n_skipped,
        len(activities),
    )
    return activities
|
|
|
|
|
|
class ImapProvider:
    """Adapter exposing the IMAP fetch + per-sender parse as a provider.

    A single instance yields both InvestEngine and Schwab activities. Per the
    original author's note, the downstream pipeline dedups on
    (provider, account, external_id), which keeps the two brokers' records
    apart by account id (that pipeline is not visible from this module).
    """

    name = "imap"

    def __init__(self, creds: ImapCreds) -> None:
        """Store the mailbox credentials; no connection is opened here."""
        self._creds = creds

    def accounts(self) -> list[Account]:
        """Return the two fixed accounts this provider feeds."""
        invest_engine = Account(
            id="invest-engine-primary",
            name="InvestEngine ISA",
            account_type=AccountType.ISA,
            currency="GBP",
            provider="invest-engine",
        )
        schwab = Account(
            id="schwab-workplace",
            name="Schwab (US workplace)",
            account_type=AccountType.GIA,
            currency="USD",
            provider="schwab",
        )
        return [invest_engine, schwab]

    async def fetch(
        self,
        *,
        since: datetime | None = None,
        before: datetime | None = None,
    ) -> AsyncIterator[Activity]:
        """Yield activities whose date falls in ``[since, before)``.

        Either bound may be ``None`` to leave that side open. The whole
        mailbox is fetched and parsed on every call and the date window is
        applied client-side rather than via IMAP SEARCH criteria.

        NOTE: ``fetch_activities`` is synchronous and is called directly, so
        the event loop is blocked while the mailbox is downloaded and parsed.
        """
        for activity in fetch_activities(self._creds):
            if (since is not None and activity.date < since) or (
                before is not None and activity.date >= before
            ):
                continue
            yield activity
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test for local debugging; the CronJob never takes this path.
    import os

    logging.basicConfig(level=logging.INFO)

    # Same environment variable names as the CLI; only the directory is optional.
    env = os.environ
    smoke_creds = ImapCreds(
        host=env["IMAP_HOST"],
        user=env["IMAP_USER"],
        password=env["IMAP_PASSWORD"],
        directory=env.get("IMAP_DIRECTORY", "INBOX"),
    )

    activities = fetch_activities(smoke_creds)
    print(f"total={len(activities)}")
    # Show just the first few parsed activities as a sanity check.
    for activity in activities[:5]:
        print(f" {activity.activity_type} {activity.symbol} {activity.date.isoformat()}")
|