Merge the dump-listings and dump-details commands: fetch both listings and their details in a single command

This commit is contained in:
Viktor Barzin 2025-06-07 12:00:23 +00:00
parent 29213f3d26
commit 842f7cefbe
No known key found for this signature in database
GPG key ID: 4056458DBDBF8863
7 changed files with 54 additions and 59 deletions

View file

@ -1,13 +1,18 @@
import asyncio
import json
import pathlib
from typing import Any
from rec.query import listing_query, QueryParameters
from rec.query import detail_query, listing_query, QueryParameters
from rec.districts import get_districts
import repositories
from sqlalchemy import Engine
from tqdm.asyncio import tqdm
from data_access import Listing
async def dump_listings(
parameters: QueryParameters,
db_engine: Engine,
data_dir: pathlib.Path = pathlib.Path("data/rs/"),
) -> list[Listing]:
if parameters.district_names:
@ -19,23 +24,20 @@ async def dump_listings(
else:
districts = get_districts()
print("Valid districts to scrape:", districts.keys())
listings = []
semaphore = asyncio.Semaphore(5) # if too high, rightmove drops connections
json_responses = await asyncio.gather(
json_responses = await tqdm.gather(
*[
_dump_listings_with_semaphore(semaphore, i, parameters, locid)
for locid in districts.values()
for i in [1, 2]
]
],
desc="Fetching listings",
)
listings = []
listings: list[Listing] = []
for response_json in json_responses:
if response_json["totalAvailableResults"] == 0:
print("No results found")
continue
if response_json["totalAvailableResults"] > 0:
print("totalAvailableResults: ", response_json["totalAvailableResults"])
for property in response_json["properties"]:
identifier = property["identifier"]
@ -43,6 +45,21 @@ async def dump_listings(
listing.dump_listing(property)
listings.append(listing)
# if listing is already in db, do not fetch details again
repository = repositories.ListingRepository(db_engine)
all_listings = await repository.get_listings(
only_ids=[listing.identifier for listing in listings]
)
all_listing_ids = {listing.id for listing in all_listings}
await tqdm.gather(
*[
_dump_detail_with_semaphore(semaphore, listing)
for listing in listings
# if listing.identifier not in all_listing_ids # One day we will rely solely on the model data
],
desc="Fetching details",
)
return listings
@ -53,7 +70,7 @@ async def _dump_listings_with_semaphore(
location_id: str,
) -> dict[str, Any]:
async with semaphore:
listing = await listing_query(
listing_query_result = await listing_query(
page=page_id,
channel=parameters.listing_type,
min_bedrooms=parameters.min_bedrooms,
@ -66,4 +83,16 @@ async def _dump_listings_with_semaphore(
max_days_since_added=parameters.max_days_since_added,
furnish_types=parameters.furnish_types or [],
)
return listing
return listing_query_result
async def _dump_detail_with_semaphore(semaphore: asyncio.Semaphore, listing: Listing):
    """Fetch the detail JSON for *listing* and cache it on disk.

    No-op if the detail file already exists, so repeated runs only hit
    the network for listings that are new since the last dump. The
    shared semaphore bounds concurrent requests (the remote site drops
    connections when queried too aggressively — see the caller).
    """
    detail_path = listing.path_detail_json()  # compute once, used twice below
    if detail_path.exists():
        return  # already cached — skip the network round-trip
    async with semaphore:
        detail = await detail_query(listing.identifier)
        with open(detail_path, "w") as f:
            json.dump(detail, f)

View file

@ -1,28 +0,0 @@
import asyncio
import json
from rec.query import detail_query
from tqdm.asyncio import tqdm
from data_access import Listing
# Setting this too high either crashes rightmove or gets us blocked
semaphore = asyncio.Semaphore(10)  # module-wide cap on concurrent detail requests
async def dump_detail(listing_paths: list[str]):
    """Concurrently fetch and persist the detail JSON for every listing.

    Loads the listings from *listing_paths*, fans out one detail fetch
    per listing (progress shown via ``tqdm.gather``), and returns the
    gathered per-listing results.
    """
    all_listings = Listing.get_all_listings(listing_paths)
    detail_tasks = (_dump_detail_for_listing(item) for item in all_listings)
    return await tqdm.gather(*detail_tasks)
async def _dump_detail_for_listing(listing: Listing):
    """Download and cache the detail JSON for one listing.

    Returns immediately when the detail file already exists; otherwise
    queries the remote API under the module-level semaphore and writes
    the response to the listing's detail path.
    """
    target = listing.path_detail_json()
    if target.exists():
        return
    async with semaphore:
        payload = await detail_query(listing.identifier)
        with open(target, "w") as f:
            json.dump(payload, f)

View file

@ -37,7 +37,7 @@ class Listing:
@staticmethod
def get_all_listings(
listing_paths: list[str],
listing_paths: list[pathlib.Path],
seen_in_the_last_n_days: int = 30,
) -> List["Listing"]:
identifiers = []
@ -256,8 +256,11 @@ class Listing:
@property
def detailobject(self) -> dict[str, Any]:
if self._cached is None:
with open(self.path_detail_json()) as f:
self._cached = json.load(f)
if self.path_detail_json().exists():
with open(self.path_detail_json()) as f:
self._cached = json.load(f)
else:
return {}
return self._cached # type: ignore
@property

View file

@ -11,14 +11,13 @@ from data_access import Listing
import csv_exporter
from rec.query import ListingType, FurnishType, QueryParameters
from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode
from repositories.listing_repositorty import ListingRepository
from repositories.listing_repository import ListingRepository
from ui_exporter import export_immoweb as export_immoweb_ui
from functools import wraps
from database import engine
dump_listings_module = importlib.import_module("1_dump_listings")
dump_detail_module = importlib.import_module("2_dump_detail")
dump_images_module = importlib.import_module("3_dump_images")
detect_floorplan_module = importlib.import_module("4_detect_floorplan")
routing_module = importlib.import_module("5_routing")
@ -157,21 +156,12 @@ def dump_listings(
)
data_dir_path = pathlib.Path(data_dir)
listings = asyncio.run(
dump_listings_module.dump_listings(query_parameters, data_dir_path)
dump_listings_module.dump_listings(query_parameters, engine, data_dir_path)
)
repository = ListingRepository(engine=engine)
asyncio.run(repository.upsert_listings(listings))
@cli.command()
@click.pass_context
def dump_details(ctx: click.core.Context):
data_dir = ctx.obj["data_dir"]
click.echo(f"Running dump_detail for listings stored in {data_dir}")
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
asyncio.run(dump_detail_module.dump_detail(listing_paths))
@cli.command()
@click.pass_context
def dump_images(ctx: click.core.Context):
@ -298,7 +288,7 @@ def export_csv(
)
output_file_path = pathlib.Path(output_file)
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
listings = Listing.get_all_listings([str(path) for path in listing_paths])
listings = Listing.get_all_listings([path for path in listing_paths])
asyncio.run(
csv_exporter.export_to_csv(
listings,
@ -365,7 +355,7 @@ def populate_db(
click.echo(f"Populating the database with data from {data_dir}")
repository = ListingRepository(engine=engine)
listings = Listing.get_all_listings(
[str(path) for path in pathlib.Path(data_dir).glob("*/listing.json")]
[path for path in pathlib.Path(data_dir).glob("*/listing.json")]
)
asyncio.run(repository.upsert_listings(listings))

View file

@ -90,7 +90,7 @@ class PropertyType(enum.StrEnum):
TERRACED = "terraced"
async def detail_query(detail_id: int):
async def detail_query(detail_id: int) -> dict[str, Any]:
params = {
"apiApplication": "ANDROID",
"appVersion": "3.70.0",

View file

@ -3,10 +3,11 @@
set -euxo pipefail
DATA_DIR="data/rs"
LISTING_FILTER_OPTIONS="--min-price 2000 --max-price 4000 --min-bedrooms 2 --max-bedrooms 4 -t rent --available-from $(date +%Y-%m-%d) --last-seen-days 7 --furnish-type furnished"
LISTING_FILTER_OPTIONS="--min-price 2000 --max-price 4000 --min-bedrooms 2 --max-bedrooms 4 -t rent --available-from $(date +%Y-%m-%d) --last-seen-days 7 --furnish-types furnished"
alembic upgrade head # init db
python main.py --data-dir $DATA_DIR dump-listings $LISTING_FILTER_OPTIONS
python main.py --data-dir $DATA_DIR dump-details
python main.py --data-dir $DATA_DIR dump-images
python main.py --data-dir $DATA_DIR detect-floorplan
#python main.py --data-dir $DATA_DIR routing --destination-address 'Meta Brock Street' -m transit # NOTE: THIS CONSUMES API CALLS; USE CAREFULLY; add -l to limit number of entries

View file

@ -49,6 +49,6 @@ async def export_immoweb(
prefix = "var data = "
serialized_data = {"type": "FeatureCollection", "features": immoweb_listings}
result = prefix + json.dumps(serialized_data, indent=4)
with open(output_file_path, "w") as f:
with open(str(output_file_path), "w") as f:
f.write(result)
# json.dump(serialized_data, f, indent=4)