Refactor codebase following Clean Code principles and add 229 tests
- Extract helpers to reduce function sizes (listing_tasks, app.py, query.py, listing_fetcher)
- Replace nonlocal mutations with _PipelineState dataclass in listing_tasks
- Fix bugs: isinstance→equality check in repository, verify_exp for OIDC tokens
- Consolidate duplicate filter methods in listing_repository
- Move hardcoded config to env vars with backward-compatible defaults
- Simplify CLI decorator to auto-build QueryParameters
- Add deprecation docstring to data_access.py
- Test count: 158 → 387 (all passing)
This commit is contained in:
parent
7e05b3c971
commit
150342bb9e
48 changed files with 5029 additions and 990 deletions
|
|
@ -13,6 +13,8 @@ from services.query_splitter import QuerySplitter, SubQuery
|
|||
|
||||
logger = logging.getLogger("uvicorn.error")
|
||||
|
||||
# Number of concurrent workers that process listing details (fetch details,
|
||||
# download images, run OCR) from the streaming queue in parallel.
|
||||
NUM_WORKERS = 20
|
||||
|
||||
|
||||
|
|
@ -23,10 +25,104 @@ async def dump_listings_full(
|
|||
"""Fetches all listings, images as well as detects floorplans."""
|
||||
new_listings = await dump_listings(parameters, repository)
|
||||
logger.debug(f"Upserted {len(new_listings)} new listings")
|
||||
# refresh listings
|
||||
listings = await repository.get_listings(parameters) # this can be better
|
||||
new_listings = [x for x in listings if x.id in new_listings]
|
||||
return new_listings
|
||||
new_listing_ids = [listing.id for listing in new_listings]
|
||||
return await repository.get_listings(only_ids=new_listing_ids)
|
||||
|
||||
|
||||
async def _fetch_subquery(
    sq: SubQuery,
    parameters: QueryParameters,
    session: object,
    config: ScraperConfig,
    semaphore: asyncio.Semaphore,
    existing_ids: set[int],
    queue: asyncio.Queue[int | None],
) -> int:
    """Page through one subquery and queue every listing ID not seen before.

    Pages are requested one at a time, starting at page 1. Each request is
    made while holding ``semaphore`` and is preceded by a pause of
    ``config.request_delay_ms`` milliseconds. Paging stops at the first
    short page (fewer than ``page_size`` results), at the page cap, or at
    the first error of any kind.

    Args:
        sq: Subquery supplying the bedroom/price/district filters and the
            estimated result count.
        parameters: Original query parameters (page size, listing type,
            radius, age and furnishing filters).
        session: Session object handed through to ``listing_query``.
        config: Scraper configuration (page cap, request delay); also
            forwarded to ``listing_query``.
        semaphore: Limits how many requests run concurrently.
        existing_ids: Known listing IDs. Mutated in place: every newly
            discovered ID is added.
        queue: Receives each newly discovered listing ID.

    Returns:
        How many new IDs were discovered and put on the queue.
    """
    expected_total = sq.estimated_results or 0
    if expected_total == 0:
        # Nothing is expected for this subquery, so no request is made.
        return 0

    enqueued = 0
    per_page = parameters.page_size
    # One page beyond the estimate, but never above the configured cap.
    page_limit = min(
        config.max_pages_per_query,
        (expected_total // per_page) + 1,
    )

    for page_no in range(1, page_limit + 1):
        async with semaphore:
            await asyncio.sleep(config.request_delay_ms / 1000)
            try:
                response = await listing_query(
                    page=page_no,
                    channel=parameters.listing_type,
                    min_bedrooms=sq.min_bedrooms,
                    max_bedrooms=sq.max_bedrooms,
                    radius=parameters.radius,
                    min_price=sq.min_price,
                    max_price=sq.max_price,
                    district=sq.district,
                    page_size=per_page,
                    max_days_since_added=parameters.max_days_since_added,
                    furnish_types=parameters.furnish_types or [],
                    session=session,
                    config=config,
                )

                # Queue IDs as they are discovered instead of batching them.
                page_items = response.get("properties", [])
                for item in page_items:
                    listing_id = item.get("identifier")
                    if not listing_id or listing_id in existing_ids:
                        continue
                    existing_ids.add(listing_id)
                    enqueued += 1
                    await queue.put(listing_id)

                # A short page means this was the last page of results.
                if len(page_items) < per_page:
                    break

            except CircuitBreakerOpenError as exc:
                logger.error(f"Circuit breaker open: {exc}")
                break
            except ThrottlingError as exc:
                logger.warning(
                    f"Throttling error on page {page_no} for "
                    f"{sq.district}: {exc}"
                )
                break
            except Exception as exc:
                # Rightmove returns GENERIC_ERROR when requesting pages
                # past the last page of results. That is expected and only
                # means this subquery is exhausted, so it is logged quietly.
                if "GENERIC_ERROR" in str(exc):
                    logger.debug(
                        f"Max page for {sq.district}: {page_no - 1}"
                    )
                else:
                    logger.warning(
                        f"Error fetching page {page_no} for "
                        f"{sq.district}: {exc}"
                    )
                break

    return enqueued
|
||||
|
||||
|
||||
async def dump_listings(
|
||||
|
|
@ -63,82 +159,23 @@ async def dump_listings(
|
|||
# Phase 2: Streaming fetch & process
|
||||
queue: asyncio.Queue[int | None] = asyncio.Queue()
|
||||
semaphore = asyncio.Semaphore(config.max_concurrent_requests)
|
||||
ids_collected = 0
|
||||
processed_listings: list[Listing] = []
|
||||
|
||||
async def fetch_subquery(sq: SubQuery) -> None:
|
||||
nonlocal ids_collected
|
||||
|
||||
estimated = sq.estimated_results or 0
|
||||
if estimated == 0:
|
||||
return
|
||||
|
||||
page_size = parameters.page_size
|
||||
max_pages = min(
|
||||
config.max_pages_per_query,
|
||||
(estimated // page_size) + 1,
|
||||
)
|
||||
|
||||
for page_id in range(1, max_pages + 1):
|
||||
async with semaphore:
|
||||
await asyncio.sleep(config.request_delay_ms / 1000)
|
||||
try:
|
||||
result = await listing_query(
|
||||
page=page_id,
|
||||
channel=parameters.listing_type,
|
||||
min_bedrooms=sq.min_bedrooms,
|
||||
max_bedrooms=sq.max_bedrooms,
|
||||
radius=parameters.radius,
|
||||
min_price=sq.min_price,
|
||||
max_price=sq.max_price,
|
||||
district=sq.district,
|
||||
page_size=page_size,
|
||||
max_days_since_added=parameters.max_days_since_added,
|
||||
furnish_types=parameters.furnish_types or [],
|
||||
session=session,
|
||||
config=config,
|
||||
)
|
||||
|
||||
# Extract and enqueue new IDs inline
|
||||
properties = result.get("properties", [])
|
||||
for prop in properties:
|
||||
identifier = prop.get("identifier")
|
||||
if identifier and identifier not in existing_ids:
|
||||
existing_ids.add(identifier)
|
||||
ids_collected += 1
|
||||
await queue.put(identifier)
|
||||
|
||||
if len(properties) < page_size:
|
||||
break
|
||||
|
||||
except CircuitBreakerOpenError as e:
|
||||
logger.error(f"Circuit breaker open: {e}")
|
||||
break
|
||||
except ThrottlingError as e:
|
||||
logger.warning(
|
||||
f"Throttling error on page {page_id} for "
|
||||
f"{sq.district}: {e}"
|
||||
)
|
||||
break
|
||||
except Exception as e:
|
||||
if "GENERIC_ERROR" in str(e):
|
||||
logger.debug(
|
||||
f"Max page for {sq.district}: {page_id - 1}"
|
||||
)
|
||||
break
|
||||
logger.warning(
|
||||
f"Error fetching page {page_id} for "
|
||||
f"{sq.district}: {e}"
|
||||
)
|
||||
break
|
||||
|
||||
async def producer() -> None:
|
||||
await asyncio.gather(
|
||||
*[fetch_subquery(sq) for sq in subqueries]
|
||||
)
|
||||
async def producer() -> int:
|
||||
"""Fetch all subqueries and send sentinel values to workers."""
|
||||
tasks = [
|
||||
_fetch_subquery(
|
||||
sq, parameters, session, config,
|
||||
semaphore, existing_ids, queue,
|
||||
)
|
||||
for sq in subqueries
|
||||
]
|
||||
counts = await asyncio.gather(*tasks)
|
||||
ids_collected = sum(counts)
|
||||
logger.info(f"Fetch complete: {ids_collected} new IDs found")
|
||||
for _ in range(NUM_WORKERS):
|
||||
await queue.put(None)
|
||||
return ids_collected
|
||||
|
||||
async def worker() -> None:
|
||||
while True:
|
||||
|
|
@ -150,10 +187,11 @@ async def dump_listings(
|
|||
if listing is not None:
|
||||
processed_listings.append(listing)
|
||||
|
||||
await asyncio.gather(
|
||||
results = await asyncio.gather(
|
||||
producer(),
|
||||
*[worker() for _ in range(NUM_WORKERS)],
|
||||
)
|
||||
ids_collected = results[0]
|
||||
|
||||
except CircuitBreakerOpenError as e:
|
||||
logger.error(f"Circuit breaker prevented listing fetch: {e}")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue