diff --git a/.drone.yml b/.drone.yml index 271575b..456e6f1 100644 --- a/.drone.yml +++ b/.drone.yml @@ -1,62 +1,56 @@ kind: pipeline type: kubernetes name: frontend -timeout: 30m trigger: - event: - - push - - cron branch: - - master + - master + event: + - push steps: - name: Build frontend image image: plugins/docker settings: - username: "viktorbarzin" + username: viktorbarzin password: from_secret: dockerhub-token repo: viktorbarzin/immoweb dockerfile: crawler/frontend/Dockerfile context: crawler/frontend - cache_from: "viktorbarzin/immoweb:latest" auto_tag: true + - name: Update deployment image: alpine commands: - - "apk add curl" + - apk add curl - 'curl -X PATCH https://kubernetes:6443/apis/apps/v1/namespaces/realestate-crawler/deployments/realestate-crawler-ui -H "Authorization:Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" -H "Content-Type:application/strategic-merge-patch+json" -k -d ''{"spec": {"template": {"metadata": { "annotations": {"kubectl.kubernetes.io/restartedAt": "''$(date +%Y-%m-%dT%TZ)''" }}}}}'' | head' --- kind: pipeline type: kubernetes name: api -timeout: 30m trigger: - event: - - push - - cron branch: - - master + - master + event: + - push steps: - - name: Build backend API image + - name: Build API image image: plugins/docker settings: - username: "viktorbarzin" + username: viktorbarzin password: from_secret: dockerhub-token repo: viktorbarzin/realestatecrawler dockerfile: crawler/Dockerfile context: crawler/ - cache_from: "viktorbarzin/realestatecrawler:latest" auto_tag: true + - name: Update deployment image: alpine commands: - - "apk add curl" + - apk add curl - 'curl -X PATCH https://kubernetes:6443/apis/apps/v1/namespaces/realestate-crawler/deployments/realestate-crawler-api -H "Authorization:Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" -H "Content-Type:application/strategic-merge-patch+json" -k -d ''{"spec": {"template": {"metadata": { "annotations": {"kubectl.kubernetes.io/restartedAt": "''$(date +%Y-%m-%dT%TZ)''" }}}}}'' | head' -# Sun Feb 1 08:29:56 PM UTC 2026 -# Webhook test Sun Feb 1 08:45:52 PM UTC 2026 diff --git a/crawler/3_dump_images.py b/crawler/3_dump_images.py deleted file mode 100644 index afc3fd5..0000000 --- a/crawler/3_dump_images.py +++ /dev/null @@ -1,51 +0,0 @@ -import asyncio -from pathlib import Path -import aiohttp -from repositories import ListingRepository -from tenacity import retry, stop_after_attempt, wait_random -from tqdm.asyncio import tqdm - -from models import Listing - -# Setting this too high either crashes rightmove or gets us blocked -semaphore = asyncio.Semaphore(5) - - -async def dump_images( - repository: ListingRepository, - image_base_path: Path = Path("data/rs/"), -): - listings = await repository.get_listings() - updated_listings = await tqdm.gather( - *[dump_images_for_listing(listing, image_base_path) for listing in listings] - ) - await repository.upsert_listings( - [listing for listing in updated_listings if listing is not None] - ) - - -@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3)) -async def dump_images_for_listing(listing: Listing, base_path: Path) -> Listing | None: - all_floorplans = listing.additional_info.get("property", {}).get("floorplans", []) - for floorplan in all_floorplans: - url = floorplan["url"] - picname = url.split("/")[-1] - floorplan_path = Path(base_path, str(listing.id), "floorplans", picname) - if floorplan_path.exists(): - continue - try: - async with semaphore: - async with aiohttp.ClientSession() as session: - async with session.get(url) as response: - if response.status == 404: - return None - if response.status != 200: - raise Exception(f"Error for {url}: {response.status}") - floorplan_path.parent.mkdir(parents=True, exist_ok=True) - with open(floorplan_path, "wb") as f: - f.write(await response.read()) - listing.floorplan_image_paths.append(str(floorplan_path)) - return listing - except Exception as e: - tqdm.write(f"Error for {url}: {e}") - raise e # raise so that we retry it diff --git a/crawler/4_detect_floorplan.py b/crawler/4_detect_floorplan.py deleted file mode 100644 index bf02f67..0000000 --- a/crawler/4_detect_floorplan.py +++ /dev/null @@ -1,43 +0,0 @@ -import asyncio -from models import Listing -from rec import floorplan -from repositories.listing_repository import ListingRepository -from tqdm.asyncio import tqdm -import multiprocessing - - -async def detect_floorplan(repository: ListingRepository): - listings = await repository.get_listings() - cpu_count = multiprocessing.cpu_count() // 4 - semaphore = asyncio.Semaphore(cpu_count) - - updated_listings = [ - listing - for listing in await tqdm.gather( - *[_calculate_sqm_ocr(listing, semaphore) for listing in listings] - ) - if listing is not None - ] - await repository.upsert_listings(updated_listings) - - -async def _calculate_sqm_ocr( - listing: Listing, semaphore: asyncio.Semaphore -) -> Listing | None: - if listing.square_meters is not None: - return None - sqms = [] - for floorplan_path in listing.floorplan_image_paths: - async with semaphore: - estimated_sqm, _ = await asyncio.to_thread( - floorplan.calculate_ocr, floorplan_path - ) - if estimated_sqm is not None: - sqms.append(estimated_sqm) - max_sqm = max(sqms, default=0) # try once, if we fail, keep as 0 - # if max_sqm is not None: - listing.square_meters = max_sqm - return listing - # else: - # listing.square_meters = None - # return None