wrongmove/crawler/tasks/listing_tasks.py
2025-06-22 21:18:52 +00:00

45 lines
1.9 KiB
Python

import asyncio
import importlib
import logging
from pathlib import Path
import time
from typing import Any
from celery import Celery, Task
from celery_app import app
from models.listing import Listing, QueryParameters
from repositories.listing_repository import ListingRepository
from database import engine
from tasks.task_state import TaskStatus
# The pipeline modules have names starting with a digit, which is not a valid
# identifier for an ``import`` statement, so they are loaded by name instead.
dump_listings_module = importlib.import_module("1_dump_listings")
dump_images_module = importlib.import_module("3_dump_images")
detect_floorplan_module = importlib.import_module("4_detect_floorplan")
# Messages from this module go through the logger named "uvicorn.error".
logger = logging.getLogger("uvicorn.error")
@app.task(bind=True)
def dump_listings_task(self: Task, parameters_json: str) -> dict[str, Any]:
    """Run the full listing dump pipeline for a JSON-encoded query.

    Args:
        self: The task instance (the task is registered with ``bind=True``).
        parameters_json: JSON text that is validated into ``QueryParameters``.

    Returns:
        ``{"progress": 1}`` once the pipeline coroutine has finished.
    """
    query = QueryParameters.model_validate_json(parameters_json)
    # The pipeline is a coroutine; drive it to completion from this sync task.
    pipeline = dump_listings_full(self, query)
    asyncio.run(pipeline)
    return {"progress": 1}
async def dump_listings_full(self: Task, parameters: QueryParameters) -> list[Listing]:
    """Run the listing dump, image dump and floorplan detection in sequence.

    The task state is updated through ``self.update_state`` as the pipeline
    moves from one stage to the next.

    Args:
        self: Task object whose ``update_state`` receives the stage name and
            progress fraction.
        parameters: Query passed to the listing dump and to the final
            ``get_listings`` refresh.

    Returns:
        The refreshed listings whose ``id`` is contained in the result of the
        listing dump step.
    """
    # Stage 1: dump listings.
    self.update_state(state="FETCHING_LISTINGS", meta={"progress": 0.1})
    repo = ListingRepository(engine)
    upserted = await dump_listings_module.dump_listings(parameters, repo)

    # Stage 2: dump images.
    self.update_state(state="FETCHING_FLOORPLANS", meta={"progress": 0.3})
    logger.debug(f"Upserted {len(upserted)} new listings")
    logger.debug("Starting to fetch floorplans")
    await dump_images_module.dump_images(repo)

    # Stage 3: floorplan detection.
    self.update_state(state="RUNNING_OCR_ON_FLOORPLANS", meta={"progress": 0.6})
    logger.debug("Completed fetching floorplans")
    logger.debug("Starting floorplan detection")
    await detect_floorplan_module.detect_floorplan(repo)
    logger.debug("Completed floorplan detection")

    # Re-read the listings after the later stages have run, then keep only the
    # ones whose id appears in the dump result.
    # NOTE: re-querying everything for the parameters is a known rough edge.
    refreshed = await repo.get_listings(parameters)
    return [listing for listing in refreshed if listing.id in upserted]