diff --git a/.drone.yml b/.drone.yml
index 456e6f1..271575b 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -1,56 +1,62 @@
 kind: pipeline
 type: kubernetes
 name: frontend
+timeout: 30m
 
 trigger:
-  branch:
-    - master
   event:
-    - push
+    - push
+    - cron
+  branch:
+    - master
 
 steps:
   - name: Build frontend image
     image: plugins/docker
     settings:
-      username: viktorbarzin
+      username: "viktorbarzin"
       password:
         from_secret: dockerhub-token
       repo: viktorbarzin/immoweb
       dockerfile: crawler/frontend/Dockerfile
       context: crawler/frontend
+      cache_from: "viktorbarzin/immoweb:latest"
       auto_tag: true
-
   - name: Update deployment
     image: alpine
     commands:
-      - apk add curl
+      - "apk add curl"
       - 'curl -X PATCH https://kubernetes:6443/apis/apps/v1/namespaces/realestate-crawler/deployments/realestate-crawler-ui -H "Authorization:Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" -H "Content-Type:application/strategic-merge-patch+json" -k -d ''{"spec": {"template": {"metadata": { "annotations": {"kubectl.kubernetes.io/restartedAt": "''$(date +%Y-%m-%dT%TZ)''" }}}}}'' | head'
 
 ---
 kind: pipeline
 type: kubernetes
 name: api
+timeout: 30m
 
 trigger:
-  branch:
-    - master
   event:
-    - push
+    - push
+    - cron
+  branch:
+    - master
 
 steps:
-  - name: Build API image
+  - name: Build backend API image
     image: plugins/docker
     settings:
-      username: viktorbarzin
+      username: "viktorbarzin"
       password:
         from_secret: dockerhub-token
       repo: viktorbarzin/realestatecrawler
       dockerfile: crawler/Dockerfile
       context: crawler/
+      cache_from: "viktorbarzin/realestatecrawler:latest"
       auto_tag: true
-
   - name: Update deployment
     image: alpine
     commands:
-      - apk add curl
+      - "apk add curl"
       - 'curl -X PATCH https://kubernetes:6443/apis/apps/v1/namespaces/realestate-crawler/deployments/realestate-crawler-api -H "Authorization:Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" -H "Content-Type:application/strategic-merge-patch+json" -k -d ''{"spec": {"template": {"metadata": { "annotations": {"kubectl.kubernetes.io/restartedAt": "''$(date +%Y-%m-%dT%TZ)''" }}}}}'' | head'
+# Sun Feb 1 08:29:56 PM UTC 2026
+# Webhook test Sun Feb 1 08:45:52 PM UTC 2026
diff --git a/crawler/3_dump_images.py b/crawler/3_dump_images.py
new file mode 100644
index 0000000..afc3fd5
--- /dev/null
+++ b/crawler/3_dump_images.py
@@ -0,0 +1,92 @@
+"""Download floorplan images for stored listings onto local disk."""
+
+import asyncio
+from pathlib import Path
+
+import aiohttp
+from tenacity import retry, stop_after_attempt, wait_random
+from tqdm.asyncio import tqdm
+
+from models import Listing
+from repositories import ListingRepository
+
+# Setting this too high either crashes rightmove or gets us blocked
+semaphore = asyncio.Semaphore(5)
+
+
+async def dump_images(
+    repository: ListingRepository,
+    image_base_path: Path = Path("data/rs/"),
+):
+    """Download floorplans for every listing and save the updated listings.
+
+    Args:
+        repository: Source of the listings and sink for the updated ones.
+        image_base_path: Root directory for the per-listing image folders.
+    """
+    listings = await repository.get_listings()
+    updated_listings = await tqdm.gather(
+        *[dump_images_for_listing(listing, image_base_path) for listing in listings]
+    )
+    await repository.upsert_listings(
+        [listing for listing in updated_listings if listing is not None]
+    )
+
+
+async def _download_floorplan(url: str, destination: Path) -> bool:
+    """Fetch *url* into *destination*; return False when the server answers 404.
+
+    Raises:
+        Exception: For any HTTP status other than 200 or 404.
+    """
+    async with semaphore:
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url) as response:
+                if response.status == 404:
+                    return False
+                if response.status != 200:
+                    raise Exception(f"Error for {url}: {response.status}")
+                payload = await response.read()
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    # Write to a sibling temp file first so an interrupted run never leaves a
+    # truncated image that the exists() check would later treat as complete.
+    partial = destination.with_name(destination.name + ".part")
+    partial.write_bytes(payload)
+    partial.replace(destination)
+    return True
+
+
+@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
+async def dump_images_for_listing(listing: Listing, base_path: Path) -> Listing | None:
+    """Download every floorplan of *listing* that is not on disk yet.
+
+    Download errors are logged and re-raised so the ``retry`` decorator can run
+    the whole function again; files already written are skipped on the rerun.
+
+    Args:
+        listing: Listing whose ``additional_info`` may list floorplan URLs.
+        base_path: Root directory; images go to base_path/listing.id/floorplans/.
+
+    Returns:
+        The listing when ``floorplan_image_paths`` gained at least one entry,
+        otherwise ``None`` so the caller skips the upsert.
+    """
+    all_floorplans = listing.additional_info.get("property", {}).get("floorplans", [])
+    changed = False
+    for floorplan in all_floorplans:
+        url = floorplan["url"]
+        picname = url.split("/")[-1]
+        floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
+        if not floorplan_path.exists():
+            try:
+                if not await _download_floorplan(url, floorplan_path):
+                    continue  # 404: this image is gone, try the remaining ones
+            except Exception as e:
+                tqdm.write(f"Error for {url}: {e}")
+                raise  # raise so that we retry it
+        # Also records files left on disk by an earlier attempt that failed
+        # before the listing could be saved.
+        if str(floorplan_path) not in listing.floorplan_image_paths:
+            listing.floorplan_image_paths.append(str(floorplan_path))
+            changed = True
+    return listing if changed else None
diff --git a/crawler/4_detect_floorplan.py b/crawler/4_detect_floorplan.py
new file mode 100644
index 0000000..bf02f67
--- /dev/null
+++ b/crawler/4_detect_floorplan.py
@@ -0,0 +1,60 @@
+"""Estimate listing floor area by running OCR on downloaded floorplans."""
+
+import asyncio
+import multiprocessing
+
+from tqdm.asyncio import tqdm
+
+from models import Listing
+from rec import floorplan
+from repositories.listing_repository import ListingRepository
+
+
+async def detect_floorplan(repository: ListingRepository):
+    """Fill in ``square_meters`` for listings that lack it and save them.
+
+    Args:
+        repository: Source of the listings and sink for the updated ones.
+    """
+    listings = await repository.get_listings()
+    # A quarter of the cores, but never zero: Semaphore(0) would block every
+    # OCR task forever on machines with fewer than four CPUs.
+    max_parallel = max(1, multiprocessing.cpu_count() // 4)
+    semaphore = asyncio.Semaphore(max_parallel)
+
+    updated_listings = [
+        listing
+        for listing in await tqdm.gather(
+            *[_calculate_sqm_ocr(listing, semaphore) for listing in listings]
+        )
+        if listing is not None
+    ]
+    await repository.upsert_listings(updated_listings)
+
+
+async def _calculate_sqm_ocr(
+    listing: Listing, semaphore: asyncio.Semaphore
+) -> Listing | None:
+    """Set ``listing.square_meters`` from the largest OCR estimate.
+
+    Args:
+        listing: Listing whose ``floorplan_image_paths`` are scanned.
+        semaphore: Caps how many OCR calls run at the same time.
+
+    Returns:
+        ``None`` when the listing already has a value, otherwise the listing
+        with ``square_meters`` set (``0`` when no estimate was produced).
+    """
+    if listing.square_meters is not None:
+        return None
+    sqms = []
+    for floorplan_path in listing.floorplan_image_paths:
+        async with semaphore:
+            estimated_sqm, _ = await asyncio.to_thread(
+                floorplan.calculate_ocr, floorplan_path
+            )
+        if estimated_sqm is not None:
+            sqms.append(estimated_sqm)
+    # try once, if we fail, keep as 0
+    listing.square_meters = max(sqms, default=0)
+    return listing