2025-05-17 20:13:28 +00:00
|
|
|
from pathlib import Path
|
|
|
|
|
import pandas as pd
|
2026-02-06 20:55:10 +00:00
|
|
|
from models.listing import QueryParameters
|
2025-06-08 17:01:33 +00:00
|
|
|
from repositories.listing_repository import ListingRepository
|
2025-05-17 20:13:28 +00:00
|
|
|
|
|
|
|
|
|
2025-05-20 21:58:08 +00:00
|
|
|
async def export_to_csv(
    repository: ListingRepository,
    output_file: Path,
    query_parameters: QueryParameters | None = None,
) -> None:
    """Export listings from *repository* to a CSV file ordered by price per sqm.

    The listings returned by ``repository.get_listings`` are flattened via each
    object's ``__dict__`` into a DataFrame. The ``_sa_instance_state`` and
    ``additional_info`` columns are removed when present, the
    ``service_charge``, ``lease_left`` and ``square_meters`` columns are
    created with NaN when absent, ``-1`` values in ``square_meters`` are turned
    into NaN, and a ``price_per_sqm`` column (``price / square_meters`` rounded
    to 2 decimals) is added. Rows are written sorted by ``price_per_sqm``
    ascending, with rows lacking a value last.

    If ``data/decisions.json`` exists (relative to the working directory), a
    ``decision`` column is added by looking up each listing ``id`` in it.

    Args:
        repository: Source of listings; must expose an awaitable
            ``get_listings(query_parameters=...)``.
        output_file: Destination path of the CSV file.
        query_parameters: Optional filter forwarded unchanged to the
            repository.

    Raises:
        KeyError: If the listings are non-empty but carry no ``price``
            attribute.
    """
    listings = await repository.get_listings(query_parameters=query_parameters)
    df = pd.DataFrame([listing.__dict__ for listing in listings])

    if df.empty:
        # No rows means no known columns either; every later step would fail
        # on a missing column, so emit an empty file instead of crashing.
        Path(output_file).write_text("", encoding="utf-8")
        return

    # Decisions are optional: only merge them when the file is present.
    decisions_path = Path("data/decisions.json")
    if decisions_path.exists() and "id" in df.columns:
        decisions = pd.read_json(decisions_path)
        # NOTE(review): pd.read_json returns a DataFrame by default, so
        # ``decisions.get(x)`` is a column lookup keyed by listing id --
        # confirm this matches the layout of decisions.json.
        df.loc[:, "decision"] = df["id"].apply(lambda x: decisions.get(x))

    # Drop columns that should not be exported; tolerate listings that do not
    # carry them.
    df = df.drop(columns=["_sa_instance_state", "additional_info"], errors="ignore")

    # Ensure columns exist with NaN defaults for clean CSV output.
    for col in ("service_charge", "lease_left", "square_meters"):
        if col not in df.columns:
            df[col] = float("nan")

    # Replace -1 sentinel values with NaN. Plain column assignment lets the
    # column change dtype (int -> float) instead of forcing an in-place set.
    df["square_meters"] = df["square_meters"].replace({-1: float("nan")})

    # Price per sqm, vectorised. Non-numeric or missing values become NaN, and
    # zero or negative areas are masked so they never act as a divisor.
    price = pd.to_numeric(df["price"], errors="coerce")
    area = pd.to_numeric(df["square_meters"], errors="coerce")
    df["price_per_sqm"] = (price / area.where(area > 0)).round(2)

    df = df.sort_values(by=["price_per_sqm"], ascending=True)
    # NOTE(review): this write is synchronous and runs on the event loop.
    df.to_csv(output_file, index=False)
|