diff --git a/crawler/1_dump_listings.py b/crawler/1_dump_listings.py
index c61a580..c96338e 100644
--- a/crawler/1_dump_listings.py
+++ b/crawler/1_dump_listings.py
@@ -16,7 +16,7 @@ from models import Listing as modelListing
 dump_images_module = importlib.import_module("3_dump_images")
 detect_floorplan_module = importlib.import_module("4_detect_floorplan")
 
-logger = logging.getLogger("uvicorn")
+logger = logging.getLogger("uvicorn.error")
 
 
 async def dump_listings_full(
@@ -26,8 +26,13 @@ async def dump_listings_full(
 ) -> list[modelListing]:
     """Fetches all listings, images as well as detects floorplans"""
     new_listings = await dump_listings(parameters, repository, data_dir)
+    logger.debug(f"Upserted {len(new_listings)} new listings")
+    logger.debug("Starting to fetch floorplans")
     await dump_images_module.dump_images(repository, image_base_path=data_dir)
+    logger.debug("Completed fetching floorplans")
+    logger.debug("Starting floorplan detection")
     await detect_floorplan_module.detect_floorplan(repository)
+    logger.debug("Completed floorplan detection")
     # refresh listings
     listings = await repository.get_listings(parameters)  # this can be better
     new_listings = [l for l in listings if l.id in new_listings]
@@ -47,7 +52,7 @@ async def dump_listings(
         }
     else:
         districts = get_districts()
-        print("Valid districts to scrape:", districts.keys())
+        logger.debug(f"Valid districts to scrape: {list(districts.keys())}")
 
     semaphore = asyncio.Semaphore(5)  # if too high, rightmove drops connections
     json_responses: list[list[dict[str, Any]]] = await tqdm.gather(
@@ -58,6 +63,7 @@ async def dump_listings(
         desc="Fetching listings",
     )
     json_responses_flat = list(itertools.chain.from_iterable(json_responses))
+    logger.debug(f"Total listings fetched: {len(json_responses_flat)}")
     listings: list[Listing] = []
     for response_json in json_responses_flat:
         if response_json == {}:
@@ -75,6 +81,7 @@ async def dump_listings(
     missing_listing = [
         listing for listing in listings if listing.identifier not in all_listing_ids
     ]
+    logger.debug(f"Fetching details for {len(missing_listing)} missing listings")
     listing_details = await tqdm.gather(
         *[
             _fetch_detail_with_semaphore(semaphore, listing.identifier)
@@ -85,7 +92,9 @@ async def dump_listings(
 
     for listing, detail in zip(missing_listing, listing_details):
         listing._details_object = detail
+    logger.debug("Dumping listings to fs")
     await dump_listings_to_fs(missing_listing)
+    logger.debug("Upserting listings in db")
     model_listings = await repository.upsert_listings_legacy(
         missing_listing
     )  # upsert in db
@@ -101,6 +110,7 @@ async def _fetch_listings_with_semaphore(
     result = []
     # we don't know how many pages we have but we stop as soon as there's no more
     for page_id in range(1, 3):
+        logger.debug(f"Processing {page_id=} for {district=}")
         # seems like all searches stop at 1500 entries (page_id * page_size)
         async with semaphore:
             try:
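
Note: the debug lines added above are emitted on the "uvicorn.error" logger, which uvicorn configures at INFO level by default, so they only appear when the server runs with --log-level debug (or an equivalent logging config). A minimal sketch for surfacing them when the crawler is run as a standalone script outside uvicorn; the handler setup here is an assumption for illustration, not part of this patch:

    import logging

    # Assumed standalone setup: uvicorn normally attaches handlers to
    # "uvicorn.error"; when running outside uvicorn, attach one manually
    # and lower the level so the logger.debug calls in this patch show up.
    logger = logging.getLogger("uvicorn.error")
    logger.setLevel(logging.DEBUG)
    logger.addHandler(logging.StreamHandler())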