Stream-process listings as IDs arrive via asyncio.Queue
Replace the sequential fetch-all-then-process pipeline with a streaming architecture in which listing processing starts as soon as IDs become available from each subquery. A producer task fetches pages and enqueues new IDs (filtered inline against the IDs already in the DB), while 20 consumer workers process listings concurrently from the queue.
- Add ListingRepository.get_listing_ids() for fast ID-only projection
- Refactor listing_tasks.py: remove get_ids_to_process/dump_listings_and_monitor and replace them with a unified producer/worker/monitor pipeline
- Apply the same pattern to the CLI path in listing_fetcher.py
- Remove the 'filtering' phase from the frontend and show combined fetch+process metrics
- Add a fetching_done flag to TaskResult for phase transition tracking
This commit is contained in:
parent
7e8f1f0339
commit
b9f576ae2b
6 changed files with 372 additions and 420 deletions
|
|
@ -180,7 +180,6 @@ export function TaskIndicator({ taskID, onTaskCancelled }: TaskIndicatorProps) {
|
||||||
splitting: 'Splitting',
|
splitting: 'Splitting',
|
||||||
splitting_complete: 'Split done',
|
splitting_complete: 'Split done',
|
||||||
fetching: 'Fetching',
|
fetching: 'Fetching',
|
||||||
filtering: 'Filtering',
|
|
||||||
};
|
};
|
||||||
return phaseLabels[taskResult.phase] ?? `${Math.round(progressPercentage)}%`;
|
return phaseLabels[taskResult.phase] ?? `${Math.round(progressPercentage)}%`;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -23,9 +23,8 @@ interface TaskProgressDrawerProps {
|
||||||
|
|
||||||
const PHASES: { key: TaskPhase; label: string }[] = [
|
const PHASES: { key: TaskPhase; label: string }[] = [
|
||||||
{ key: 'splitting', label: 'Splitting queries' },
|
{ key: 'splitting', label: 'Splitting queries' },
|
||||||
{ key: 'fetching', label: 'Fetching listings' },
|
{ key: 'fetching', label: 'Fetching & processing' },
|
||||||
{ key: 'filtering', label: 'Filtering results' },
|
{ key: 'processing', label: 'Processing remaining' },
|
||||||
{ key: 'processing', label: 'Processing listings' },
|
|
||||||
];
|
];
|
||||||
|
|
||||||
function getPhaseIndex(phase: TaskPhase | undefined): number {
|
function getPhaseIndex(phase: TaskPhase | undefined): number {
|
||||||
|
|
@ -175,7 +174,7 @@ function PhaseDetails({ result }: { result: TaskResult }) {
|
||||||
return (
|
return (
|
||||||
<div className="rounded-md border p-3 space-y-1">
|
<div className="rounded-md border p-3 space-y-1">
|
||||||
<p className="text-xs font-medium text-muted-foreground uppercase tracking-wide mb-2">
|
<p className="text-xs font-medium text-muted-foreground uppercase tracking-wide mb-2">
|
||||||
Fetching
|
{result.fetching_done ? 'Fetching complete' : 'Fetching & processing'}
|
||||||
</p>
|
</p>
|
||||||
<CounterRow
|
<CounterRow
|
||||||
label="Subqueries completed"
|
label="Subqueries completed"
|
||||||
|
|
@ -184,19 +183,24 @@ function PhaseDetails({ result }: { result: TaskResult }) {
|
||||||
/>
|
/>
|
||||||
<CounterRow label="IDs collected" value={result.ids_collected} />
|
<CounterRow label="IDs collected" value={result.ids_collected} />
|
||||||
<CounterRow label="Pages fetched" value={result.pages_fetched} />
|
<CounterRow label="Pages fetched" value={result.pages_fetched} />
|
||||||
</div>
|
{(result.details_fetched !== undefined && result.details_fetched > 0) && (
|
||||||
);
|
<>
|
||||||
}
|
<div className="border-t my-2" />
|
||||||
|
<CounterRow
|
||||||
if (phase === 'filtering') {
|
label="Details fetched"
|
||||||
return (
|
value={result.details_fetched}
|
||||||
<div className="rounded-md border p-3 space-y-1">
|
total={result.total}
|
||||||
<p className="text-xs font-medium text-muted-foreground uppercase tracking-wide mb-2">
|
/>
|
||||||
Filtering
|
<CounterRow label="Images downloaded" value={result.images_downloaded} />
|
||||||
</p>
|
<CounterRow label="OCR completed" value={result.ocr_completed} />
|
||||||
<CounterRow label="Total from API" value={result.total_found} />
|
{(result.failed ?? 0) > 0 && (
|
||||||
<CounterRow label="Already in DB" value={result.existing_in_db} />
|
<div className="flex justify-between text-sm">
|
||||||
<CounterRow label="New to process" value={result.new_listings} />
|
<span className="text-red-500">Failed</span>
|
||||||
|
<span className="font-mono tabular-nums text-red-500">{result.failed}</span>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
@ -306,7 +310,7 @@ export function TaskProgressDrawer({
|
||||||
|
|
||||||
{taskResult && <PhaseDetails result={taskResult} />}
|
{taskResult && <PhaseDetails result={taskResult} />}
|
||||||
|
|
||||||
{taskResult && taskResult.phase === 'processing' && (
|
{taskResult && (taskResult.phase === 'processing' || taskResult.phase === 'fetching') && (taskResult.total ?? 0) > 0 && (
|
||||||
<div className="space-y-1">
|
<div className="space-y-1">
|
||||||
<div className="w-full h-2 bg-primary/20 rounded-full overflow-hidden">
|
<div className="w-full h-2 bg-primary/20 rounded-full overflow-hidden">
|
||||||
<div
|
<div
|
||||||
|
|
|
||||||
|
|
@ -52,7 +52,7 @@ export interface TaskStatusResponse {
|
||||||
message?: string;
|
message?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export type TaskPhase = 'splitting' | 'splitting_complete' | 'fetching' | 'filtering' | 'processing' | 'completed';
|
export type TaskPhase = 'splitting' | 'splitting_complete' | 'fetching' | 'processing' | 'completed';
|
||||||
|
|
||||||
export interface TaskResult {
|
export interface TaskResult {
|
||||||
progress: number;
|
progress: number;
|
||||||
|
|
@ -69,10 +69,7 @@ export interface TaskResult {
|
||||||
subqueries_completed?: number;
|
subqueries_completed?: number;
|
||||||
ids_collected?: number;
|
ids_collected?: number;
|
||||||
pages_fetched?: number;
|
pages_fetched?: number;
|
||||||
// Filtering phase
|
fetching_done?: boolean;
|
||||||
total_found?: number;
|
|
||||||
existing_in_db?: number;
|
|
||||||
new_listings?: number;
|
|
||||||
// Processing phase
|
// Processing phase
|
||||||
details_fetched?: number;
|
details_fetched?: number;
|
||||||
images_downloaded?: number;
|
images_downloaded?: number;
|
||||||
|
|
|
||||||
|
|
@ -343,6 +343,20 @@ class ListingRepository:
|
||||||
|
|
||||||
return model_listing
|
return model_listing
|
||||||
|
|
||||||
|
def get_listing_ids(
|
||||||
|
self,
|
||||||
|
listing_type: ListingType = ListingType.RENT,
|
||||||
|
) -> set[int]:
|
||||||
|
"""Get all listing IDs from the database (ID-only projection).
|
||||||
|
|
||||||
|
Much faster than get_listings() when only IDs are needed for
|
||||||
|
filtering against API results.
|
||||||
|
"""
|
||||||
|
model = RentListing if listing_type == ListingType.RENT else BuyListing
|
||||||
|
with Session(self.engine) as session:
|
||||||
|
result = session.execute(sa_select(model.id))
|
||||||
|
return {row[0] for row in result.fetchall()}
|
||||||
|
|
||||||
async def mark_seen(self, listing_id: int) -> None:
|
async def mark_seen(self, listing_id: int) -> None:
|
||||||
listings = await self.get_listings(only_ids=[listing_id])
|
listings = await self.get_listings(only_ids=[listing_id])
|
||||||
if len(listings) == 0:
|
if len(listings) == 0:
|
||||||
|
|
|
||||||
|
|
@ -1,26 +1,25 @@
|
||||||
"""Listing fetcher service - fetches listing data from Rightmove API."""
|
"""Listing fetcher service - fetches listing data from Rightmove API."""
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
from config.scraper_config import ScraperConfig
|
from config.scraper_config import ScraperConfig
|
||||||
from listing_processor import ListingProcessor
|
from listing_processor import ListingProcessor
|
||||||
from rec.query import create_session, listing_query
|
from rec.query import create_session, listing_query
|
||||||
from rec.exceptions import CircuitBreakerOpenError, ThrottlingError
|
from rec.exceptions import CircuitBreakerOpenError, ThrottlingError
|
||||||
from rec.throttle_detector import get_throttle_metrics, reset_throttle_metrics
|
from rec.throttle_detector import get_throttle_metrics, reset_throttle_metrics
|
||||||
from models.listing import QueryParameters
|
from models.listing import Listing, QueryParameters
|
||||||
from repositories import ListingRepository
|
from repositories import ListingRepository
|
||||||
from tqdm.asyncio import tqdm
|
|
||||||
from models import Listing as modelListing
|
|
||||||
from services.query_splitter import QuerySplitter, SubQuery
|
from services.query_splitter import QuerySplitter, SubQuery
|
||||||
|
|
||||||
logger = logging.getLogger("uvicorn.error")
|
logger = logging.getLogger("uvicorn.error")
|
||||||
|
|
||||||
|
NUM_WORKERS = 20
|
||||||
|
|
||||||
|
|
||||||
async def dump_listings_full(
|
async def dump_listings_full(
|
||||||
parameters: QueryParameters,
|
parameters: QueryParameters,
|
||||||
repository: ListingRepository,
|
repository: ListingRepository,
|
||||||
) -> list[modelListing]:
|
) -> list[Listing]:
|
||||||
"""Fetches all listings, images as well as detects floorplans."""
|
"""Fetches all listings, images as well as detects floorplans."""
|
||||||
new_listings = await dump_listings(parameters, repository)
|
new_listings = await dump_listings(parameters, repository)
|
||||||
logger.debug(f"Upserted {len(new_listings)} new listings")
|
logger.debug(f"Upserted {len(new_listings)} new listings")
|
||||||
|
|
@ -33,11 +32,11 @@ async def dump_listings_full(
|
||||||
async def dump_listings(
|
async def dump_listings(
|
||||||
parameters: QueryParameters,
|
parameters: QueryParameters,
|
||||||
repository: ListingRepository,
|
repository: ListingRepository,
|
||||||
) -> list[modelListing]:
|
) -> list[Listing]:
|
||||||
"""Fetch listings from Rightmove API and process them.
|
"""Fetch listings from Rightmove API and process them.
|
||||||
|
|
||||||
Uses intelligent query splitting to maximize data extraction
|
Uses intelligent query splitting and a streaming pipeline so that
|
||||||
while respecting Rightmove's result caps.
|
listing processing starts as soon as IDs become available.
|
||||||
"""
|
"""
|
||||||
config = ScraperConfig.from_env()
|
config = ScraperConfig.from_env()
|
||||||
splitter = QuerySplitter(config)
|
splitter = QuerySplitter(config)
|
||||||
|
|
@ -47,7 +46,7 @@ async def dump_listings(
|
||||||
|
|
||||||
try:
|
try:
|
||||||
async with create_session(config) as session:
|
async with create_session(config) as session:
|
||||||
# Phase 1 & 2: Split and probe queries
|
# Phase 1: Split and probe queries
|
||||||
logger.info("Splitting query and probing result counts...")
|
logger.info("Splitting query and probing result counts...")
|
||||||
subqueries = await splitter.split(parameters, session)
|
subqueries = await splitter.split(parameters, session)
|
||||||
|
|
||||||
|
|
@ -57,16 +56,22 @@ async def dump_listings(
|
||||||
f"estimated {total_estimated} total results"
|
f"estimated {total_estimated} total results"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Phase 3: Fetch all pages for each subquery
|
# Load existing IDs (fast, ID-only projection)
|
||||||
semaphore = asyncio.Semaphore(config.max_concurrent_requests)
|
existing_ids = repository.get_listing_ids(parameters.listing_type)
|
||||||
|
logger.info(f"Found {len(existing_ids)} existing listings in DB")
|
||||||
|
|
||||||
async def fetch_subquery(sq: SubQuery) -> list[dict[str, Any]]:
|
# Phase 2: Streaming fetch & process
|
||||||
"""Fetch all pages for a single subquery."""
|
queue: asyncio.Queue[int | None] = asyncio.Queue()
|
||||||
results: list[dict[str, Any]] = []
|
semaphore = asyncio.Semaphore(config.max_concurrent_requests)
|
||||||
|
ids_collected = 0
|
||||||
|
processed_listings: list[Listing] = []
|
||||||
|
|
||||||
|
async def fetch_subquery(sq: SubQuery) -> None:
|
||||||
|
nonlocal ids_collected
|
||||||
|
|
||||||
estimated = sq.estimated_results or 0
|
estimated = sq.estimated_results or 0
|
||||||
if estimated == 0:
|
if estimated == 0:
|
||||||
return results
|
return
|
||||||
|
|
||||||
page_size = parameters.page_size
|
page_size = parameters.page_size
|
||||||
max_pages = min(
|
max_pages = min(
|
||||||
|
|
@ -93,9 +98,16 @@ async def dump_listings(
|
||||||
session=session,
|
session=session,
|
||||||
config=config,
|
config=config,
|
||||||
)
|
)
|
||||||
results.append(result)
|
|
||||||
|
|
||||||
|
# Extract and enqueue new IDs inline
|
||||||
properties = result.get("properties", [])
|
properties = result.get("properties", [])
|
||||||
|
for prop in properties:
|
||||||
|
identifier = prop.get("identifier")
|
||||||
|
if identifier and identifier not in existing_ids:
|
||||||
|
existing_ids.add(identifier)
|
||||||
|
ids_collected += 1
|
||||||
|
await queue.put(identifier)
|
||||||
|
|
||||||
if len(properties) < page_size:
|
if len(properties) < page_size:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
@ -104,7 +116,8 @@ async def dump_listings(
|
||||||
break
|
break
|
||||||
except ThrottlingError as e:
|
except ThrottlingError as e:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Throttling error on page {page_id} for {sq.district}: {e}"
|
f"Throttling error on page {page_id} for "
|
||||||
|
f"{sq.district}: {e}"
|
||||||
)
|
)
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -114,17 +127,34 @@ async def dump_listings(
|
||||||
)
|
)
|
||||||
break
|
break
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Error fetching page {page_id} for {sq.district}: {e}"
|
f"Error fetching page {page_id} for "
|
||||||
|
f"{sq.district}: {e}"
|
||||||
)
|
)
|
||||||
break
|
break
|
||||||
|
|
||||||
return results
|
async def producer() -> None:
|
||||||
|
await asyncio.gather(
|
||||||
|
*[fetch_subquery(sq) for sq in subqueries]
|
||||||
|
)
|
||||||
|
logger.info(f"Fetch complete: {ids_collected} new IDs found")
|
||||||
|
for _ in range(NUM_WORKERS):
|
||||||
|
await queue.put(None)
|
||||||
|
|
||||||
# Fetch all subqueries with progress bar
|
async def worker() -> None:
|
||||||
all_results = await tqdm.gather(
|
while True:
|
||||||
*[fetch_subquery(sq) for sq in subqueries],
|
listing_id = await queue.get()
|
||||||
desc="Fetching listings",
|
if listing_id is None:
|
||||||
|
break
|
||||||
|
listing_processor = ListingProcessor(repository)
|
||||||
|
listing = await listing_processor.process_listing(listing_id)
|
||||||
|
if listing is not None:
|
||||||
|
processed_listings.append(listing)
|
||||||
|
|
||||||
|
await asyncio.gather(
|
||||||
|
producer(),
|
||||||
|
*[worker() for _ in range(NUM_WORKERS)],
|
||||||
)
|
)
|
||||||
|
|
||||||
except CircuitBreakerOpenError as e:
|
except CircuitBreakerOpenError as e:
|
||||||
logger.error(f"Circuit breaker prevented listing fetch: {e}")
|
logger.error(f"Circuit breaker prevented listing fetch: {e}")
|
||||||
logger.info(get_throttle_metrics().summary())
|
logger.info(get_throttle_metrics().summary())
|
||||||
|
|
@ -135,36 +165,9 @@ async def dump_listings(
|
||||||
if metrics.total_requests > 0:
|
if metrics.total_requests > 0:
|
||||||
logger.info("\n" + metrics.summary())
|
logger.info("\n" + metrics.summary())
|
||||||
|
|
||||||
# Extract listing identifiers from results
|
logger.info(
|
||||||
listing_ids: list[int] = []
|
f"Processed {len(processed_listings)} new listings "
|
||||||
for subquery_results in all_results:
|
f"({ids_collected} total found)"
|
||||||
for response_json in subquery_results:
|
|
||||||
if not response_json:
|
|
||||||
continue
|
|
||||||
if response_json.get("totalAvailableResults", 0) == 0:
|
|
||||||
continue
|
|
||||||
for property_data in response_json.get("properties", []):
|
|
||||||
identifier = property_data.get("identifier")
|
|
||||||
if identifier:
|
|
||||||
listing_ids.append(identifier)
|
|
||||||
|
|
||||||
logger.info(f"Found {len(listing_ids)} total listings")
|
|
||||||
|
|
||||||
# Deduplicate
|
|
||||||
unique_ids = list(set(listing_ids))
|
|
||||||
logger.info(f"After deduplication: {len(unique_ids)} unique listings")
|
|
||||||
|
|
||||||
# Filter out listings already in database
|
|
||||||
all_listing_ids = [x.id for x in await repository.get_listings()]
|
|
||||||
missing_ids = [
|
|
||||||
listing_id for listing_id in unique_ids if listing_id not in all_listing_ids
|
|
||||||
]
|
|
||||||
|
|
||||||
listing_processor = ListingProcessor(repository)
|
|
||||||
logger.info(f"Starting processing {len(missing_ids)} new listings")
|
|
||||||
processed_listings = await tqdm.gather(
|
|
||||||
*[listing_processor.process_listing(id) for id in missing_ids]
|
|
||||||
)
|
)
|
||||||
filtered_listings = [x for x in processed_listings if x is not None]
|
|
||||||
|
|
||||||
return filtered_listings
|
return processed_listings
|
||||||
|
|
|
||||||
|
|
@ -134,144 +134,303 @@ async def dump_listings_full(
|
||||||
async def _dump_listings_full_inner(
|
async def _dump_listings_full_inner(
|
||||||
*, task: Task, parameters: QueryParameters
|
*, task: Task, parameters: QueryParameters
|
||||||
) -> list[Listing]:
|
) -> list[Listing]:
|
||||||
"""Inner implementation with log capture active."""
|
"""Inner implementation with log capture active.
|
||||||
|
|
||||||
|
Uses a streaming pipeline: an asyncio.Queue bridges the fetcher (producer)
|
||||||
|
and processor workers (consumers) so that listing processing starts as
|
||||||
|
soon as IDs become available from each subquery.
|
||||||
|
"""
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
NUM_WORKERS = 20
|
||||||
|
|
||||||
celery_logger.info("=" * 60)
|
celery_logger.info("=" * 60)
|
||||||
celery_logger.info("PHASE 1: Initializing listing fetch")
|
celery_logger.info("PHASE 1: Splitting queries")
|
||||||
celery_logger.info("=" * 60)
|
celery_logger.info("=" * 60)
|
||||||
|
|
||||||
repository = ListingRepository(engine)
|
repository = ListingRepository(engine)
|
||||||
|
config = ScraperConfig.from_env()
|
||||||
|
splitter = QuerySplitter(config)
|
||||||
|
|
||||||
_update_task_state(task, "Identifying missing listings", {"phase": "splitting", "progress": 0})
|
# Reset throttle metrics
|
||||||
celery_logger.info("Querying Rightmove API to identify new listings...")
|
reset_throttle_metrics()
|
||||||
ids_to_process = await get_ids_to_process(
|
|
||||||
parameters=parameters, repository=repository, task=task
|
|
||||||
)
|
|
||||||
|
|
||||||
celery_logger.info(f"Found {len(ids_to_process)} new listings to process")
|
def on_progress(phase: str, message: str, **kwargs: Any) -> None:
|
||||||
logger.info(f"Found {len(ids_to_process)} listings to process")
|
meta: dict[str, Any] = {"phase": phase, "message": message}
|
||||||
|
meta.update(kwargs)
|
||||||
|
_update_task_state(task, message, meta)
|
||||||
|
celery_logger.info(f"[{phase}] {message}")
|
||||||
|
|
||||||
if len(ids_to_process) == 0:
|
_update_task_state(task, "Analyzing query and splitting by price bands...", {
|
||||||
elapsed = time.time() - start_time
|
"phase": "splitting", "progress": 0,
|
||||||
celery_logger.info(f"No new listings found. Completed in {elapsed:.1f}s")
|
})
|
||||||
invalidate_cache()
|
celery_logger.info("Starting query splitting and probing...")
|
||||||
_update_task_state(task, "No new listings found", {
|
|
||||||
"phase": "completed", "progress": 1, "processed": 0, "total": 0,
|
try:
|
||||||
"message": "All listings are up to date",
|
async with create_session(config) as session:
|
||||||
})
|
subqueries = await splitter.split(parameters, session, on_progress)
|
||||||
|
|
||||||
|
total_estimated = splitter.calculate_total_estimated_results(subqueries)
|
||||||
|
celery_logger.info(
|
||||||
|
f"Query split complete: {len(subqueries)} subqueries, "
|
||||||
|
f"~{total_estimated} estimated total results"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Load existing IDs (fast, ID-only projection)
|
||||||
|
celery_logger.info("Loading existing listing IDs from database...")
|
||||||
|
existing_ids = repository.get_listing_ids(parameters.listing_type)
|
||||||
|
celery_logger.info(f"Found {len(existing_ids)} existing listings in DB")
|
||||||
|
|
||||||
|
celery_logger.info("=" * 60)
|
||||||
|
celery_logger.info("PHASE 2: Streaming fetch & process")
|
||||||
|
celery_logger.info("=" * 60)
|
||||||
|
|
||||||
|
# Shared state for the streaming pipeline
|
||||||
|
queue: asyncio.Queue[int | None] = asyncio.Queue()
|
||||||
|
ids_collected = 0
|
||||||
|
completed_subqueries = 0
|
||||||
|
total_pages_fetched = 0
|
||||||
|
fetching_done = False
|
||||||
|
processed_count = 0
|
||||||
|
failed_count = 0
|
||||||
|
details_fetched = 0
|
||||||
|
images_downloaded = 0
|
||||||
|
ocr_completed = 0
|
||||||
|
processed_listings: list[Listing] = []
|
||||||
|
semaphore = asyncio.Semaphore(config.max_concurrent_requests)
|
||||||
|
|
||||||
|
_update_task_state(
|
||||||
|
task,
|
||||||
|
f"Fetching listings from {len(subqueries)} subqueries...",
|
||||||
|
{
|
||||||
|
"phase": "fetching",
|
||||||
|
"subqueries_completed": 0,
|
||||||
|
"subqueries_total": len(subqueries),
|
||||||
|
"ids_collected": 0,
|
||||||
|
"pages_fetched": 0,
|
||||||
|
"estimated_results": total_estimated,
|
||||||
|
"fetching_done": False,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
listing_processor = ListingProcessor(repository)
|
||||||
|
|
||||||
|
# --- Producer: fetch subquery pages and enqueue new IDs ---
|
||||||
|
async def producer() -> None:
|
||||||
|
nonlocal ids_collected, completed_subqueries, total_pages_fetched
|
||||||
|
nonlocal fetching_done
|
||||||
|
|
||||||
|
async def fetch_subquery(sq: SubQuery) -> None:
|
||||||
|
nonlocal ids_collected, completed_subqueries, total_pages_fetched
|
||||||
|
|
||||||
|
estimated = sq.estimated_results or 0
|
||||||
|
if estimated == 0:
|
||||||
|
completed_subqueries += 1
|
||||||
|
return
|
||||||
|
|
||||||
|
page_size = parameters.page_size
|
||||||
|
max_pages = min(
|
||||||
|
config.max_pages_per_query,
|
||||||
|
(estimated // page_size) + 1,
|
||||||
|
)
|
||||||
|
|
||||||
|
for page_id in range(1, max_pages + 1):
|
||||||
|
async with semaphore:
|
||||||
|
await asyncio.sleep(config.request_delay_ms / 1000)
|
||||||
|
try:
|
||||||
|
result = await listing_query(
|
||||||
|
page=page_id,
|
||||||
|
channel=parameters.listing_type,
|
||||||
|
min_bedrooms=sq.min_bedrooms,
|
||||||
|
max_bedrooms=sq.max_bedrooms,
|
||||||
|
radius=parameters.radius,
|
||||||
|
min_price=sq.min_price,
|
||||||
|
max_price=sq.max_price,
|
||||||
|
district=sq.district,
|
||||||
|
page_size=page_size,
|
||||||
|
max_days_since_added=parameters.max_days_since_added,
|
||||||
|
furnish_types=parameters.furnish_types or [],
|
||||||
|
session=session,
|
||||||
|
config=config,
|
||||||
|
)
|
||||||
|
total_pages_fetched += 1
|
||||||
|
|
||||||
|
# Extract and enqueue new IDs inline
|
||||||
|
properties = result.get("properties", [])
|
||||||
|
for prop in properties:
|
||||||
|
identifier = prop.get("identifier")
|
||||||
|
if identifier and identifier not in existing_ids:
|
||||||
|
existing_ids.add(identifier)
|
||||||
|
ids_collected += 1
|
||||||
|
await queue.put(identifier)
|
||||||
|
|
||||||
|
if len(properties) < page_size:
|
||||||
|
break
|
||||||
|
|
||||||
|
except CircuitBreakerOpenError as e:
|
||||||
|
celery_logger.error(f"Circuit breaker open: {e}")
|
||||||
|
break
|
||||||
|
except ThrottlingError as e:
|
||||||
|
celery_logger.warning(
|
||||||
|
f"Throttling on {sq.district} page {page_id}: {e}"
|
||||||
|
)
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
if "GENERIC_ERROR" in str(e):
|
||||||
|
logger.debug(
|
||||||
|
f"Max page for {sq.district}: {page_id - 1}"
|
||||||
|
)
|
||||||
|
break
|
||||||
|
logger.warning(
|
||||||
|
f"Error fetching page {page_id} for "
|
||||||
|
f"{sq.district}: {e}"
|
||||||
|
)
|
||||||
|
break
|
||||||
|
|
||||||
|
completed_subqueries += 1
|
||||||
|
|
||||||
|
# Fetch all subqueries concurrently
|
||||||
|
await asyncio.gather(
|
||||||
|
*[fetch_subquery(sq) for sq in subqueries]
|
||||||
|
)
|
||||||
|
|
||||||
|
celery_logger.info(
|
||||||
|
f"Fetch complete: {total_pages_fetched} pages from "
|
||||||
|
f"{completed_subqueries} subqueries, "
|
||||||
|
f"{ids_collected} new IDs"
|
||||||
|
)
|
||||||
|
fetching_done = True
|
||||||
|
|
||||||
|
# Send sentinel values to stop workers
|
||||||
|
for _ in range(NUM_WORKERS):
|
||||||
|
await queue.put(None)
|
||||||
|
|
||||||
|
# --- Consumer workers: process listings from queue ---
|
||||||
|
async def worker() -> None:
|
||||||
|
nonlocal processed_count, failed_count
|
||||||
|
nonlocal details_fetched, images_downloaded, ocr_completed
|
||||||
|
|
||||||
|
while True:
|
||||||
|
listing_id = await queue.get()
|
||||||
|
if listing_id is None:
|
||||||
|
break
|
||||||
|
|
||||||
|
def step_callback(step_name: str) -> None:
|
||||||
|
nonlocal details_fetched, images_downloaded, ocr_completed
|
||||||
|
if step_name == "details":
|
||||||
|
details_fetched += 1
|
||||||
|
elif step_name == "images":
|
||||||
|
images_downloaded += 1
|
||||||
|
elif step_name == "ocr":
|
||||||
|
ocr_completed += 1
|
||||||
|
|
||||||
|
listing = await listing_processor.process_listing(
|
||||||
|
listing_id, on_step_complete=step_callback
|
||||||
|
)
|
||||||
|
if listing is not None:
|
||||||
|
processed_count += 1
|
||||||
|
processed_listings.append(listing)
|
||||||
|
else:
|
||||||
|
failed_count += 1
|
||||||
|
|
||||||
|
# --- Monitor: reports combined progress ---
|
||||||
|
async def monitor() -> None:
|
||||||
|
last_progress = 0.0
|
||||||
|
|
||||||
|
while True:
|
||||||
|
total = ids_collected
|
||||||
|
done = processed_count + failed_count
|
||||||
|
|
||||||
|
if fetching_done and done >= total and total > 0:
|
||||||
|
break
|
||||||
|
if fetching_done and total == 0:
|
||||||
|
break
|
||||||
|
|
||||||
|
# Determine phase label
|
||||||
|
phase = "processing" if fetching_done else "fetching"
|
||||||
|
|
||||||
|
if total > 0:
|
||||||
|
progress_ratio = round(done / total, 2)
|
||||||
|
else:
|
||||||
|
progress_ratio = 0.0
|
||||||
|
|
||||||
|
elapsed = time.time() - start_time
|
||||||
|
rate = done / elapsed if elapsed > 0 else 0
|
||||||
|
remaining = (total - done) if total > 0 else 0
|
||||||
|
eta = remaining / rate if rate > 0 else 0
|
||||||
|
|
||||||
|
if progress_ratio >= last_progress + 0.1 or done == 1:
|
||||||
|
celery_logger.info(
|
||||||
|
f"Progress: {progress_ratio * 100:.0f}% "
|
||||||
|
f"({done}/{total}) "
|
||||||
|
f"| Elapsed: {elapsed:.0f}s "
|
||||||
|
f"| Rate: {rate:.1f}/s "
|
||||||
|
f"| ETA: {eta:.0f}s"
|
||||||
|
)
|
||||||
|
last_progress = progress_ratio
|
||||||
|
|
||||||
|
_update_task_state(
|
||||||
|
task,
|
||||||
|
f"{'Processing' if fetching_done else 'Fetching & processing'}: "
|
||||||
|
f"{done}/{total}",
|
||||||
|
{
|
||||||
|
"phase": phase,
|
||||||
|
"progress": progress_ratio,
|
||||||
|
"processed": done,
|
||||||
|
"total": total,
|
||||||
|
"subqueries_completed": completed_subqueries,
|
||||||
|
"subqueries_total": len(subqueries),
|
||||||
|
"ids_collected": ids_collected,
|
||||||
|
"pages_fetched": total_pages_fetched,
|
||||||
|
"fetching_done": fetching_done,
|
||||||
|
"details_fetched": details_fetched,
|
||||||
|
"images_downloaded": images_downloaded,
|
||||||
|
"ocr_completed": ocr_completed,
|
||||||
|
"failed": failed_count,
|
||||||
|
"elapsed_seconds": round(elapsed, 1),
|
||||||
|
"rate_per_second": round(rate, 2),
|
||||||
|
"eta_seconds": round(eta, 1),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
await asyncio.sleep(1)
|
||||||
|
|
||||||
|
# Run producer, workers, and monitor concurrently
|
||||||
|
await asyncio.gather(
|
||||||
|
producer(),
|
||||||
|
*[worker() for _ in range(NUM_WORKERS)],
|
||||||
|
monitor(),
|
||||||
|
)
|
||||||
|
|
||||||
|
except CircuitBreakerOpenError as e:
|
||||||
|
celery_logger.error(f"Circuit breaker prevented query: {e}")
|
||||||
|
metrics = get_throttle_metrics()
|
||||||
|
if metrics.total_requests > 0:
|
||||||
|
celery_logger.info(metrics.summary())
|
||||||
return []
|
return []
|
||||||
|
finally:
|
||||||
celery_logger.info("=" * 60)
|
metrics = get_throttle_metrics()
|
||||||
celery_logger.info("PHASE 2: Processing listings (fetch details, images, OCR)")
|
if metrics.total_requests > 0:
|
||||||
celery_logger.info("=" * 60)
|
celery_logger.info(
|
||||||
|
f"API Stats: {metrics.total_requests} requests, "
|
||||||
listing_processor = ListingProcessor(repository)
|
f"avg {metrics.average_response_time:.2f}s, "
|
||||||
celery_logger.info(f"Starting processing {len(ids_to_process)} listings")
|
f"{metrics.total_throttling_events} throttled"
|
||||||
logger.info(f"Starting processing {len(ids_to_process)} listings")
|
)
|
||||||
|
|
||||||
result = await dump_listings_and_monitor(
|
|
||||||
task=task, listing_processor=listing_processor, missing_ids=ids_to_process
|
|
||||||
)
|
|
||||||
|
|
||||||
elapsed = time.time() - start_time
|
elapsed = time.time() - start_time
|
||||||
celery_logger.info("=" * 60)
|
celery_logger.info("=" * 60)
|
||||||
celery_logger.info(f"COMPLETED: Processed {len(result)} listings in {elapsed:.1f}s")
|
celery_logger.info(
|
||||||
|
f"COMPLETED: Processed {len(processed_listings)} listings in {elapsed:.1f}s"
|
||||||
|
)
|
||||||
celery_logger.info("=" * 60)
|
celery_logger.info("=" * 60)
|
||||||
|
|
||||||
invalidate_cache()
|
invalidate_cache()
|
||||||
|
|
||||||
# Send final state so the frontend has rich data even after task completes
|
|
||||||
_update_task_state(task, "Completed", {
|
_update_task_state(task, "Completed", {
|
||||||
"phase": "completed", "progress": 1,
|
"phase": "completed", "progress": 1,
|
||||||
"processed": len(result), "total": len(ids_to_process),
|
"processed": len(processed_listings), "total": ids_collected,
|
||||||
"message": f"Processed {len(result)} listings in {elapsed:.0f}s",
|
"message": f"Processed {len(processed_listings)} listings in {elapsed:.0f}s",
|
||||||
})
|
})
|
||||||
|
|
||||||
return result
|
return processed_listings
|
||||||
|
|
||||||
|
|
||||||
async def dump_listings_and_monitor(
    *, task: Task, listing_processor: ListingProcessor, missing_ids: set[int]
) -> list[Listing]:
    """Process every listing in ``missing_ids`` concurrently and report progress.

    One coroutine per ID calls ``listing_processor.process_listing``; a
    background monitor publishes aggregated counters through
    ``_update_task_state`` roughly once per second while work is outstanding.

    Args:
        task: Celery task whose state is updated with progress metadata.
        listing_processor: Processor used to fetch/process a single listing.
        missing_ids: Listing IDs that still need to be processed.

    Returns:
        The listings that were processed successfully (``None`` results from
        the processor are dropped).

    Raises:
        Exception: Whatever ``process_listing`` raises is propagated, as before.
            The monitor is always stopped first so it cannot outlive this call.
    """
    total = len(missing_ids)
    task_progress = {missing_id: 0 for missing_id in missing_ids}
    processed_count = 0
    failed_count = 0
    details_fetched = 0
    images_downloaded = 0
    ocr_completed = 0
    start_time = time.time()

    async def process(missing_id: int) -> Listing | None:
        nonlocal processed_count, failed_count

        def step_callback(step_name: str) -> None:
            nonlocal details_fetched, images_downloaded, ocr_completed
            if step_name == "details":
                details_fetched += 1
            elif step_name == "images":
                images_downloaded += 1
            elif step_name == "ocr":
                ocr_completed += 1

        try:
            listing = await listing_processor.process_listing(
                missing_id, on_step_complete=step_callback
            )
        except Exception:
            failed_count += 1
            raise
        finally:
            # Mark the ID as finished even on failure so the progress total
            # can still reach ``total`` and the monitor loop can terminate.
            task_progress[missing_id] = 1

        if listing is not None:
            processed_count += 1
        else:
            failed_count += 1
        return listing

    async def monitor() -> None:
        last_progress = 0
        while (progress := sum(task_progress.values())) < total:
            # ``total`` is > 0 here, otherwise the loop condition is False.
            progress_ratio = round(progress / total, 2)

            elapsed = time.time() - start_time
            rate = progress / elapsed if elapsed > 0 else 0
            eta = (total - progress) / rate if rate > 0 else 0

            # Log every 10% progress or at least every update
            if progress_ratio >= last_progress + 0.1 or progress == 1:
                celery_logger.info(
                    f"Progress: {progress_ratio * 100:.0f}% "
                    f"({progress}/{total}) "
                    f"| Elapsed: {elapsed:.0f}s "
                    f"| Rate: {rate:.1f}/s "
                    f"| ETA: {eta:.0f}s"
                )
                last_progress = progress_ratio

            _update_task_state(
                task,
                f"Processing: {progress_ratio * 100:.0f}% ({progress}/{total})",
                {
                    "phase": "processing",
                    "progress": progress_ratio,
                    "processed": progress,
                    "total": total,
                    "details_fetched": details_fetched,
                    "images_downloaded": images_downloaded,
                    "ocr_completed": ocr_completed,
                    "failed": failed_count,
                    "elapsed_seconds": round(elapsed, 1),
                    "rate_per_second": round(rate, 2),
                    "eta_seconds": round(eta, 1),
                },
            )
            await asyncio.sleep(1)

    # Run the monitor as its own task so it can be cancelled explicitly instead
    # of being left polling when one of the workers raises.
    monitor_task = asyncio.create_task(monitor())
    try:
        processed_listings = await asyncio.gather(
            *[process(listing_id) for listing_id in missing_ids]
        )
    finally:
        monitor_task.cancel()
        # return_exceptions=True absorbs the CancelledError of the monitor
        # without hiding a cancellation aimed at this coroutine itself.
        (monitor_outcome,) = await asyncio.gather(
            monitor_task, return_exceptions=True
        )
        if isinstance(monitor_outcome, Exception):
            celery_logger.warning(f"Progress monitor failed: {monitor_outcome}")

    filtered_listings = [
        listing for listing in processed_listings if listing is not None
    ]

    celery_logger.info(
        f"Processing complete: {processed_count} successful, {failed_count} failed"
    )

    return filtered_listings
|
|
||||||
|
|
||||||
|
|
||||||
@app.on_after_finalize.connect
|
@app.on_after_finalize.connect
|
||||||
|
|
@ -297,227 +456,3 @@ def setup_periodic_tasks(sender, **kwargs):
|
||||||
dump_listings_task.s(schedule.to_query_parameters().model_dump_json()),
|
dump_listings_task.s(schedule.to_query_parameters().model_dump_json()),
|
||||||
name=schedule.name,
|
name=schedule.name,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
async def get_ids_to_process(
    *,
    parameters: QueryParameters,
    repository: ListingRepository,
    task: Task,
) -> set[int]:
    """Fetch all listing IDs using intelligent query splitting.

    Uses the QuerySplitter to adaptively split large queries and maximize
    data extraction while respecting Rightmove's result caps. Identifiers are
    collected page by page as responses arrive, so the ``ids_collected`` value
    reported during the fetching phase reflects real progress.

    Args:
        parameters: Query parameters for the search.
        repository: Repository for checking existing listings.
        task: Celery task for progress updates.

    Returns:
        Set of new listing IDs that need to be processed. An empty set is
        returned when the circuit breaker prevents the query from running.
    """
    config = ScraperConfig.from_env()
    splitter = QuerySplitter(config)

    # Reset throttle metrics
    reset_throttle_metrics()

    identifiers: set[int] = set()

    def on_progress(phase: str, message: str, **kwargs: Any) -> None:
        meta: dict[str, Any] = {"phase": phase, "message": message}
        meta.update(kwargs)
        _update_task_state(task, message, meta)
        celery_logger.info(f"[{phase}] {message}")

    def collect_identifiers(response_json: dict[str, Any]) -> None:
        """Add the listing identifiers found in one page response."""
        if not response_json:
            return
        if response_json.get("totalAvailableResults", 0) == 0:
            return
        for property_data in response_json.get("properties", []):
            identifier = property_data.get("identifier")
            if identifier:
                identifiers.add(identifier)

    celery_logger.info("Starting query splitting and probing...")

    try:
        async with create_session(config) as session:
            # Phase 1 & 2: Split and probe queries
            _update_task_state(task, "Analyzing query and splitting by price bands...", {
                "phase": "splitting", "progress": 0,
            })
            subqueries = await splitter.split(parameters, session, on_progress)

            total_estimated = splitter.calculate_total_estimated_results(subqueries)
            celery_logger.info(
                f"Query split complete: {len(subqueries)} subqueries, "
                f"~{total_estimated} estimated total results"
            )
            logger.info(
                f"Split into {len(subqueries)} subqueries, "
                f"estimated {total_estimated} total results"
            )

            # Phase 3: Fetch all pages for each subquery
            _update_task_state(
                task,
                f"Fetching listings from {len(subqueries)} subqueries...",
                {
                    "phase": "fetching",
                    "subqueries_completed": 0,
                    "subqueries_total": len(subqueries),
                    "ids_collected": 0,
                    "pages_fetched": 0,
                    "estimated_results": total_estimated,
                },
            )

            celery_logger.info(f"Fetching pages from {len(subqueries)} subqueries...")

            semaphore = asyncio.Semaphore(config.max_concurrent_requests)
            completed_subqueries = 0
            total_pages_fetched = 0

            def report_subquery_done() -> None:
                """Count one finished subquery and publish the fetch counters."""
                nonlocal completed_subqueries
                completed_subqueries += 1
                _update_task_state(
                    task,
                    f"Fetching: {completed_subqueries}/{len(subqueries)} subqueries",
                    {
                        "phase": "fetching",
                        "subqueries_completed": completed_subqueries,
                        "subqueries_total": len(subqueries),
                        "ids_collected": len(identifiers),
                        "pages_fetched": total_pages_fetched,
                    },
                )

            async def fetch_subquery(sq: SubQuery) -> None:
                """Fetch all pages for a single subquery."""
                nonlocal total_pages_fetched

                # Calculate how many pages we need based on estimated results
                estimated = sq.estimated_results or 0
                if estimated == 0:
                    report_subquery_done()
                    return

                # Fetch pages up to max_pages_per_query or until no more results
                page_size = parameters.page_size
                max_pages = min(
                    config.max_pages_per_query,
                    (estimated // page_size) + 1,
                )

                for page_id in range(1, max_pages + 1):
                    async with semaphore:
                        await asyncio.sleep(config.request_delay_ms / 1000)
                        try:
                            result = await listing_query(
                                page=page_id,
                                channel=parameters.listing_type,
                                min_bedrooms=sq.min_bedrooms,
                                max_bedrooms=sq.max_bedrooms,
                                radius=parameters.radius,
                                min_price=sq.min_price,
                                max_price=sq.max_price,
                                district=sq.district,
                                page_size=page_size,
                                max_days_since_added=parameters.max_days_since_added,
                                furnish_types=parameters.furnish_types or [],
                                session=session,
                                config=config,
                            )
                            # Harvest IDs immediately so progress updates
                            # report a live ``ids_collected`` count.
                            collect_identifiers(result)
                            total_pages_fetched += 1

                            # Check if we've received all results
                            properties = result.get("properties", [])
                            if len(properties) < page_size:
                                # No more results on next page
                                break

                        except CircuitBreakerOpenError as e:
                            celery_logger.error(f"Circuit breaker open: {e}")
                            break
                        except ThrottlingError as e:
                            celery_logger.warning(
                                f"Throttling on {sq.district} page {page_id}: {e}"
                            )
                            break
                        except Exception as e:
                            if "GENERIC_ERROR" in str(e):
                                # Reached end of results
                                logger.debug(
                                    f"Max page for {sq.district}: {page_id - 1}"
                                )
                                break
                            logger.warning(
                                f"Error fetching page {page_id} for {sq.district}: {e}"
                            )
                            break

                report_subquery_done()

            # Fetch all subqueries concurrently
            await asyncio.gather(*[fetch_subquery(sq) for sq in subqueries])

            celery_logger.info(
                f"Fetch complete: {total_pages_fetched} pages from "
                f"{completed_subqueries} subqueries"
            )

    except CircuitBreakerOpenError as e:
        celery_logger.error(f"Circuit breaker prevented query: {e}")
        # Log throttle metrics
        metrics = get_throttle_metrics()
        if metrics.total_requests > 0:
            celery_logger.info(metrics.summary())
        return set()
    finally:
        # Log throttle metrics
        metrics = get_throttle_metrics()
        if metrics.total_requests > 0:
            celery_logger.info(f"API Stats: {metrics.total_requests} requests, "
                               f"avg {metrics.average_response_time:.2f}s, "
                               f"{metrics.total_throttling_events} throttled")

    celery_logger.info(f"Found {len(identifiers)} unique listing IDs from API")
    logger.info(f"Found {len(identifiers)} unique listings")

    # Filter out listings already in the database
    celery_logger.info("Checking database for existing listings...")
    all_listing_ids = {listing.id for listing in await repository.get_listings()}
    new_ids = identifiers - all_listing_ids

    celery_logger.info(
        f"Filtering: {len(identifiers)} total, "
        f"{len(all_listing_ids)} existing in DB, "
        f"{len(new_ids)} new to process"
    )

    _update_task_state(task, f"Found {len(new_ids)} new listings to process", {
        "phase": "filtering",
        "total_found": len(identifiers),
        "existing_in_db": len(all_listing_ids),
        "new_listings": len(new_ids),
    })

    return new_ids
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue