# wrongmove/services/market_aggregator.py

"""Daily market-trend aggregator.

Two outputs per run:

1. Per-listing trend columns. For each row in RentListing / BuyListing we
   parse `price_history_json` and find the latest price entry whose
   `last_seen` is at or before `lookback_days` ago. That historical price
   and the percentage change from it to the current price land on
   `price_14d_ago` / `price_change_pct_14d` for the PropertyCard badge to
   render.

2. Aggregate market snapshot. For each configured (listing_type, bedroom
   band) we compute median/mean/count over the CURRENT listing pool and
   upsert one row in `dailylistingaggregate` keyed on today's date. The
   `MarketTrendStrip` UI consumes these rows.

Both steps are idempotent — re-running on the same day refreshes the
snapshot rather than appending. Designed to fire daily ~04:00 UTC (1h
after the 03:00 RENT scrape so the data is fresh).
"""
from __future__ import annotations

import json
import logging
import math
import time
from datetime import datetime, timedelta, timezone
from statistics import mean, median
from typing import Iterable

from sqlalchemy import Engine
from sqlmodel import Session, select

from models.listing import (
    BuyListing,
    DailyListingAggregate,
    PriceHistoryItem,
    RentListing,
)

# Log through the logger named "uvicorn" rather than a module-specific one.
# NOTE(review): presumably so messages reach the handlers uvicorn sets up —
# confirm before switching to `logging.getLogger(__name__)`.
logger = logging.getLogger("uvicorn")

# Default scope: the user's daily filter (1-2 bed, both listing types).
# Each band is an inclusive (min_bedrooms, max_bedrooms) pair. Listing types
# are matched by exact string; anything other than "RENT" is aggregated from
# BuyListing by `compute_aggregate_snapshot`.
DEFAULT_BEDROOM_BANDS: tuple[tuple[int, int], ...] = ((1, 2),)
DEFAULT_LISTING_TYPES: tuple[str, ...] = ("RENT", "BUY")

# Trend lookback window, in days, for the per-listing badge. Surfaces price
# moves that happened in the last fortnight (long enough for prices to
# actually settle, short enough to feel current). The destination columns
# are named `*_14d` regardless of the value passed as `lookback_days`.
DEFAULT_LOOKBACK_DAYS = 14


def _parse_history(price_history_json: str | None) -> list[PriceHistoryItem]:
    if not price_history_json:
        return []
    try:
        raw = json.loads(price_history_json)
    except (ValueError, TypeError):
        return []
    out: list[PriceHistoryItem] = []
    for item in raw:
        try:
            out.append(
                PriceHistoryItem(
                    first_seen=datetime.fromisoformat(item["first_seen"]),
                    last_seen=datetime.fromisoformat(item["last_seen"]),
                    price=float(item["price"]),
                )
            )
        except (KeyError, ValueError, TypeError):
            continue
    return out


def _price_at_or_before(
    history: list[PriceHistoryItem], cutoff: datetime
) -> float | None:
    """Return the price of the entry whose `last_seen` is closest to (but
    not after) `cutoff`. Returns None if no entry that old exists.

    History is in chronological order; we scan and keep the latest match.
    """
    found: float | None = None
    for item in history:
        if item.last_seen <= cutoff:
            found = item.price
        else:
            break
    return found


def compute_trend_for_listing(
    price_history_json: str | None,
    current_price: float | None,
    *,
    lookback_days: int = DEFAULT_LOOKBACK_DAYS,
    now: datetime | None = None,
) -> tuple[float | None, float | None]:
    """Return `(price_n_days_ago, change_pct)` for one listing.

    Args:
        price_history_json: Raw JSON price history of the listing; parsed
            best-effort by `_parse_history`.
        current_price: The listing's current price.
        lookback_days: How far back to look for the reference price.
        now: Reference instant. Defaults to the current UTC time as a naive
            datetime; a supplied value must be comparable with the
            `last_seen` timestamps in the history.

    Returns:
        `(past, pct)` where `past` is the price found at or before
        `now - lookback_days` and `pct` is `(current - past) / past * 100`
        rounded to 2dp (positive = price went up, negative = down). Both
        are None when there is no entry that old in the history, or when
        either price is not a finite number greater than zero.
    """
    # NaN and +/-inf slip through a plain `<= 0` test and would produce a
    # NaN/inf percentage, so reject non-finite prices explicitly.
    if (
        not isinstance(current_price, (int, float))
        or not math.isfinite(current_price)
        or current_price <= 0
    ):
        return None, None
    if now is None:
        # Same naive-UTC value `datetime.utcnow()` produced, without the
        # call that is deprecated since Python 3.12.
        now = datetime.now(timezone.utc).replace(tzinfo=None)
    cutoff = now - timedelta(days=lookback_days)
    history = _parse_history(price_history_json)
    past = _price_at_or_before(history, cutoff)
    if past is None or not math.isfinite(past) or past <= 0:
        return None, None
    pct = round((current_price - past) / past * 100.0, 2)
    return past, pct


def update_per_listing_trend(
    engine: Engine,
    *,
    lookback_days: int = DEFAULT_LOOKBACK_DAYS,
    batch_size: int = 1000,
    now: datetime | None = None,
) -> dict[str, int]:
    """Recompute the trend columns of every RentListing and BuyListing.

    Rows are read in pages of `batch_size`; for each row the trend is
    recomputed with `compute_trend_for_listing` and `price_14d_ago` /
    `price_change_pct_14d` are written only when a value changed. Each
    page is committed on its own.

    Args:
        engine: Engine the sessions are opened on.
        lookback_days: Lookback window passed to the trend computation.
        batch_size: Number of rows fetched and committed per page.
        now: Reference instant for the lookback. Defaults to the current
            UTC time (naive), fixed once for the whole run.

    Returns:
        `{"rent_updated": n, "buy_updated": m}` — the number of rows per
        model whose trend columns were changed.
    """
    counts = {"rent_updated": 0, "buy_updated": 0}
    t0 = time.monotonic()
    if now is None:
        # Pin the reference instant once so every row of the run is
        # measured against the same cutoff.
        now = datetime.now(timezone.utc).replace(tzinfo=None)
    for model_name, model in (("rent", RentListing), ("buy", BuyListing)):
        # OFFSET/LIMIT paging is only stable over a deterministic order;
        # without ORDER BY the database may return pages that overlap or
        # skip rows. Ordering by the primary key fixes the page contents.
        pk_columns = tuple(model.__table__.primary_key.columns)
        with Session(engine) as session:
            offset = 0
            while True:
                stmt = (
                    select(model)
                    .order_by(*pk_columns)
                    .offset(offset)
                    .limit(batch_size)
                )
                rows: list = list(session.exec(stmt).all())
                if not rows:
                    break
                for row in rows:
                    past, pct = compute_trend_for_listing(
                        row.price_history_json,
                        row.price,
                        lookback_days=lookback_days,
                        now=now,
                    )
                    # Skip unchanged rows so re-runs do not issue UPDATEs
                    # for listings whose trend is already current.
                    if row.price_14d_ago != past or row.price_change_pct_14d != pct:
                        row.price_14d_ago = past
                        row.price_change_pct_14d = pct
                        session.add(row)
                        counts[f"{model_name}_updated"] += 1
                session.commit()
                if len(rows) < batch_size:
                    # A short page means the table is exhausted.
                    break
                offset += batch_size
    logger.info(
        "Per-listing trend updated in %.1fs: rent=%d buy=%d (lookback=%dd)",
        time.monotonic() - t0,
        counts["rent_updated"],
        counts["buy_updated"],
        lookback_days,
    )
    return counts


def _stats(values: Iterable[float]) -> dict[str, float | None]:
    """Median + mean over the valid positive entries; null for empty input."""
    finite = [v for v in values if isinstance(v, (int, float)) and v > 0]
    if not finite:
        return {"median": None, "mean": None, "count": 0}
    return {
        "median": float(median(finite)),
        "mean": round(float(mean(finite)), 2),
        "count": len(finite),
    }


def _build_aggregate_upsert(dialect: str, values: dict):
    """Build the dialect-specific upsert statement for one aggregate row.

    MySQL gets `INSERT ... ON DUPLICATE KEY UPDATE`; every other dialect
    gets the SQLite `INSERT ... ON CONFLICT DO UPDATE` construct keyed on
    (snapshot_date, listing_type, min_bedrooms, max_bedrooms). In both
    cases only the statistic columns are refreshed on conflict.
    """
    if dialect == "mysql":
        from sqlalchemy.dialects.mysql import insert as mysql_insert

        stmt_ins = mysql_insert(DailyListingAggregate).values(**values)
        return stmt_ins.on_duplicate_key_update(
            listing_count=stmt_ins.inserted.listing_count,
            median_total_price=stmt_ins.inserted.median_total_price,
            median_qmprice=stmt_ins.inserted.median_qmprice,
            mean_total_price=stmt_ins.inserted.mean_total_price,
            mean_qmprice=stmt_ins.inserted.mean_qmprice,
        )

    from sqlalchemy.dialects.sqlite import insert as sqlite_insert

    stmt_ins = sqlite_insert(DailyListingAggregate).values(**values)
    return stmt_ins.on_conflict_do_update(
        index_elements=[
            "snapshot_date", "listing_type",
            "min_bedrooms", "max_bedrooms",
        ],
        set_={
            "listing_count": stmt_ins.excluded.listing_count,
            "median_total_price": stmt_ins.excluded.median_total_price,
            "median_qmprice": stmt_ins.excluded.median_qmprice,
            "mean_total_price": stmt_ins.excluded.mean_total_price,
            "mean_qmprice": stmt_ins.excluded.mean_qmprice,
        },
    )


def compute_aggregate_snapshot(
    engine: Engine,
    *,
    listing_types: tuple[str, ...] = DEFAULT_LISTING_TYPES,
    bedroom_bands: tuple[tuple[int, int], ...] = DEFAULT_BEDROOM_BANDS,
    snapshot_date: datetime | None = None,
) -> list[DailyListingAggregate]:
    """Compute one aggregate row per (listing_type * bedroom band) and
    upsert it onto `snapshot_date`. Returns the persisted rows.

    For every combination the current listings with a bedroom count inside
    the inclusive band are read, median/mean/count are computed over their
    prices and over price per square metre, and the result is written with
    a dialect-specific upsert (`ON DUPLICATE KEY UPDATE` on MySQL,
    `ON CONFLICT DO UPDATE` otherwise). Re-running for the same
    `snapshot_date` therefore refreshes the row in place — no duplicates,
    no DELETE. Each combination is committed separately.

    Args:
        engine: Engine the session is opened on.
        listing_types: Listing types to aggregate. "RENT" reads
            RentListing; any other value reads BuyListing.
        bedroom_bands: Inclusive (min_bedrooms, max_bedrooms) pairs.
        snapshot_date: Key date for the rows, used exactly as passed.
            Defaults to today's UTC midnight as a naive datetime.

    Returns:
        The upserted rows, re-read from the database, fully loaded and
        detached from the session so they stay readable after it closes.
    """
    # Naive UTC midnight, as `datetime.utcnow()` produced, without the call
    # that is deprecated since Python 3.12.
    today = snapshot_date or datetime.now(timezone.utc).replace(
        hour=0, minute=0, second=0, microsecond=0, tzinfo=None
    )
    written: list[DailyListingAggregate] = []
    dialect = engine.dialect.name
    with Session(engine) as session:
        for listing_type in listing_types:
            # NOTE: every value other than "RENT" falls through to BuyListing.
            model = RentListing if listing_type == "RENT" else BuyListing
            for min_bed, max_bed in bedroom_bands:
                stmt = select(model.price, model.square_meters).where(
                    model.number_of_bedrooms >= min_bed,
                    model.number_of_bedrooms <= max_bed,
                )
                rows = list(session.exec(stmt).all())
                prices = [price for price, _ in rows]
                # A listing without a price cannot contribute a per-sqm
                # value; dividing None would abort the whole snapshot.
                qmprices = [
                    price / sqm
                    for price, sqm in rows
                    if price is not None and sqm is not None and sqm > 0
                ]
                price_stats = _stats(prices)
                qm_stats = _stats(qmprices)
                values = {
                    "snapshot_date": today,
                    "listing_type": listing_type,
                    "min_bedrooms": min_bed,
                    "max_bedrooms": max_bed,
                    "listing_count": price_stats["count"],
                    "median_total_price": price_stats["median"],
                    "median_qmprice": qm_stats["median"],
                    "mean_total_price": price_stats["mean"],
                    "mean_qmprice": qm_stats["mean"],
                }
                session.execute(_build_aggregate_upsert(dialect, values))
                session.commit()
                row = session.exec(
                    select(DailyListingAggregate).where(
                        DailyListingAggregate.snapshot_date == today,
                        DailyListingAggregate.listing_type == listing_type,
                        DailyListingAggregate.min_bedrooms == min_bed,
                        DailyListingAggregate.max_bedrooms == max_bed,
                    )
                ).first()
                if row is not None:
                    # Detach the freshly loaded row: the commit of the next
                    # combination would otherwise expire it, and reading an
                    # expired row after the session closes raises
                    # DetachedInstanceError.
                    session.expunge(row)
                    written.append(row)
                logger.info(
                    "Aggregate %s %d-%d on %s: count=%s median=%s/%s mean=%s/%s",
                    listing_type, min_bed, max_bed, today.date(),
                    price_stats["count"],
                    price_stats["median"], qm_stats["median"],
                    price_stats["mean"], qm_stats["mean"],
                )
    return written


def fetch_trend_series(
    engine: Engine,
    *,
    listing_type: str,
    min_bedrooms: int,
    max_bedrooms: int,
    days: int = 30,
) -> list[DailyListingAggregate]:
    """Return the aggregate rows for the last `days` days, ordered ascending
    by date. Empty list when no rows match — the strip handles that case.

    Rows are matched on the exact `listing_type`, `min_bedrooms` and
    `max_bedrooms` values and on `snapshot_date >= now - days`, where `now`
    is the current UTC time as a naive datetime (time of day included).
    """
    # Same naive-UTC value `datetime.utcnow()` produced, without the call
    # that is deprecated since Python 3.12.
    utc_now = datetime.now(timezone.utc).replace(tzinfo=None)
    cutoff = utc_now - timedelta(days=days)
    with Session(engine) as session:
        stmt = (
            select(DailyListingAggregate)
            .where(
                DailyListingAggregate.listing_type == listing_type,
                DailyListingAggregate.min_bedrooms == min_bedrooms,
                DailyListingAggregate.max_bedrooms == max_bedrooms,
                DailyListingAggregate.snapshot_date >= cutoff,
            )
            .order_by(DailyListingAggregate.snapshot_date)
        )
        return list(session.exec(stmt).all())