Compare commits
No commits in common. "4b1e971edf30c083fe5385cb32454d96f16c33ff" and "1680cda7b76fc575ee75ee4bf50d8e54268428be" have entirely different histories.
4b1e971edf
...
1680cda7b7
3 changed files with 113 additions and 13 deletions
32
.drone.yml
32
.drone.yml
|
|
@ -1,56 +1,62 @@
|
||||||
kind: pipeline
|
kind: pipeline
|
||||||
type: kubernetes
|
type: kubernetes
|
||||||
name: frontend
|
name: frontend
|
||||||
|
timeout: 30m
|
||||||
|
|
||||||
trigger:
|
trigger:
|
||||||
branch:
|
|
||||||
- master
|
|
||||||
event:
|
event:
|
||||||
- push
|
- push
|
||||||
|
- cron
|
||||||
|
branch:
|
||||||
|
- master
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Build frontend image
|
- name: Build frontend image
|
||||||
image: plugins/docker
|
image: plugins/docker
|
||||||
settings:
|
settings:
|
||||||
username: viktorbarzin
|
username: "viktorbarzin"
|
||||||
password:
|
password:
|
||||||
from_secret: dockerhub-token
|
from_secret: dockerhub-token
|
||||||
repo: viktorbarzin/immoweb
|
repo: viktorbarzin/immoweb
|
||||||
dockerfile: crawler/frontend/Dockerfile
|
dockerfile: crawler/frontend/Dockerfile
|
||||||
context: crawler/frontend
|
context: crawler/frontend
|
||||||
|
cache_from: "viktorbarzin/immoweb:latest"
|
||||||
auto_tag: true
|
auto_tag: true
|
||||||
|
|
||||||
- name: Update deployment
|
- name: Update deployment
|
||||||
image: alpine
|
image: alpine
|
||||||
commands:
|
commands:
|
||||||
- apk add curl
|
- "apk add curl"
|
||||||
- 'curl -X PATCH https://kubernetes:6443/apis/apps/v1/namespaces/realestate-crawler/deployments/realestate-crawler-ui -H "Authorization:Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" -H "Content-Type:application/strategic-merge-patch+json" -k -d ''{"spec": {"template": {"metadata": { "annotations": {"kubectl.kubernetes.io/restartedAt": "''$(date +%Y-%m-%dT%TZ)''" }}}}}'' | head'
|
- 'curl -X PATCH https://kubernetes:6443/apis/apps/v1/namespaces/realestate-crawler/deployments/realestate-crawler-ui -H "Authorization:Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" -H "Content-Type:application/strategic-merge-patch+json" -k -d ''{"spec": {"template": {"metadata": { "annotations": {"kubectl.kubernetes.io/restartedAt": "''$(date +%Y-%m-%dT%TZ)''" }}}}}'' | head'
|
||||||
|
|
||||||
---
|
---
|
||||||
kind: pipeline
|
kind: pipeline
|
||||||
type: kubernetes
|
type: kubernetes
|
||||||
name: api
|
name: api
|
||||||
|
timeout: 30m
|
||||||
|
|
||||||
trigger:
|
trigger:
|
||||||
branch:
|
|
||||||
- master
|
|
||||||
event:
|
event:
|
||||||
- push
|
- push
|
||||||
|
- cron
|
||||||
|
branch:
|
||||||
|
- master
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Build API image
|
- name: Build backend API image
|
||||||
image: plugins/docker
|
image: plugins/docker
|
||||||
settings:
|
settings:
|
||||||
username: viktorbarzin
|
username: "viktorbarzin"
|
||||||
password:
|
password:
|
||||||
from_secret: dockerhub-token
|
from_secret: dockerhub-token
|
||||||
repo: viktorbarzin/realestatecrawler
|
repo: viktorbarzin/realestatecrawler
|
||||||
dockerfile: crawler/Dockerfile
|
dockerfile: crawler/Dockerfile
|
||||||
context: crawler/
|
context: crawler/
|
||||||
|
cache_from: "viktorbarzin/realestatecrawler:latest"
|
||||||
auto_tag: true
|
auto_tag: true
|
||||||
|
|
||||||
- name: Update deployment
|
- name: Update deployment
|
||||||
image: alpine
|
image: alpine
|
||||||
commands:
|
commands:
|
||||||
- apk add curl
|
- "apk add curl"
|
||||||
- 'curl -X PATCH https://kubernetes:6443/apis/apps/v1/namespaces/realestate-crawler/deployments/realestate-crawler-api -H "Authorization:Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" -H "Content-Type:application/strategic-merge-patch+json" -k -d ''{"spec": {"template": {"metadata": { "annotations": {"kubectl.kubernetes.io/restartedAt": "''$(date +%Y-%m-%dT%TZ)''" }}}}}'' | head'
|
- 'curl -X PATCH https://kubernetes:6443/apis/apps/v1/namespaces/realestate-crawler/deployments/realestate-crawler-api -H "Authorization:Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" -H "Content-Type:application/strategic-merge-patch+json" -k -d ''{"spec": {"template": {"metadata": { "annotations": {"kubectl.kubernetes.io/restartedAt": "''$(date +%Y-%m-%dT%TZ)''" }}}}}'' | head'
|
||||||
|
# Sun Feb 1 08:29:56 PM UTC 2026
|
||||||
|
# Webhook test Sun Feb 1 08:45:52 PM UTC 2026
|
||||||
|
|
|
||||||
51
crawler/3_dump_images.py
Normal file
51
crawler/3_dump_images.py
Normal file
|
|
@ -0,0 +1,51 @@
|
||||||
|
import asyncio
|
||||||
|
from pathlib import Path
|
||||||
|
import aiohttp
|
||||||
|
from repositories import ListingRepository
|
||||||
|
from tenacity import retry, stop_after_attempt, wait_random
|
||||||
|
from tqdm.asyncio import tqdm
|
||||||
|
|
||||||
|
from models import Listing
|
||||||
|
|
||||||
|
# Cap on concurrent image downloads: setting this too high either crashes Rightmove or gets us blocked.
|
||||||
|
semaphore = asyncio.Semaphore(5)
|
||||||
|
|
||||||
|
|
||||||
|
async def dump_images(
    repository: ListingRepository,
    image_base_path: Path = Path("data/rs/"),
):
    """Download floorplan images for all listings and store the updated listings.

    Every listing returned by ``repository.get_listings()`` is handed to
    ``dump_images_for_listing``; the downloads are awaited together through
    ``tqdm.gather``. Results that are ``None`` are dropped and the remaining
    listings are passed to ``repository.upsert_listings``.

    Args:
        repository: Source of the listings and sink for the updated ones.
        image_base_path: Root directory under which images are written.
    """
    stored = await repository.get_listings()
    downloads = [dump_images_for_listing(item, image_base_path) for item in stored]
    results = await tqdm.gather(*downloads)

    # Only listings that came back from the download step are written back.
    changed = [item for item in results if item is not None]
    await repository.upsert_listings(changed)
|
||||||
|
|
||||||
|
|
||||||
|
@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
async def _download_floorplan(url: str, destination: Path) -> bool:
    """Fetch one floorplan image and write it to ``destination``.

    Retried as a unit by the ``retry`` decorator (up to three attempts with a
    random 1-2 second wait), so a transient failure on one image does not
    repeat work already done for the others.

    Returns:
        True if the image was written, False if the server answered 404.

    Raises:
        Exception: for any other non-200 status or any download/write error
            (re-raised so that the decorator retries).
    """
    try:
        async with semaphore:
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as response:
                    if response.status == 404:
                        return False
                    if response.status != 200:
                        raise Exception(f"Error for {url}: {response.status}")
                    # Read the body BEFORE creating the file: a failed read
                    # must not leave an empty file behind, because existing
                    # files are treated as already downloaded.
                    payload = await response.read()
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_bytes(payload)
        return True
    except Exception as e:
        tqdm.write(f"Error for {url}: {e}")
        raise  # raise so that we retry it


async def dump_images_for_listing(listing: Listing, base_path: Path) -> Listing | None:
    """Download every floorplan image of ``listing`` that is not on disk yet.

    Images are written to ``<base_path>/<listing.id>/floorplans/<file name>``
    where the file name is the last path segment of the floorplan URL. Each
    newly written path is appended to ``listing.floorplan_image_paths``.
    Files that already exist are skipped, and so are floorplans whose URL
    answers 404. All remaining floorplans are processed; the function does
    not stop after the first successful download.

    Args:
        listing: Listing whose ``additional_info["property"]["floorplans"]``
            entries (each carrying a ``"url"`` key) are downloaded.
        base_path: Root directory for downloaded images.

    Returns:
        The (mutated) listing if at least one new image was stored,
        otherwise None.

    Raises:
        Whatever ``_download_floorplan`` raises once its retries are
        exhausted.
    """
    all_floorplans = listing.additional_info.get("property", {}).get("floorplans", [])
    stored_any = False
    for floorplan in all_floorplans:
        url = floorplan["url"]
        picname = url.split("/")[-1]
        floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
        if floorplan_path.exists():
            continue
        if await _download_floorplan(url, floorplan_path):
            listing.floorplan_image_paths.append(str(floorplan_path))
            stored_any = True
    return listing if stored_any else None
|
||||||
43
crawler/4_detect_floorplan.py
Normal file
43
crawler/4_detect_floorplan.py
Normal file
|
|
@ -0,0 +1,43 @@
|
||||||
|
import asyncio
|
||||||
|
from models import Listing
|
||||||
|
from rec import floorplan
|
||||||
|
from repositories.listing_repository import ListingRepository
|
||||||
|
from tqdm.asyncio import tqdm
|
||||||
|
import multiprocessing
|
||||||
|
|
||||||
|
|
||||||
|
async def detect_floorplan(repository: ListingRepository):
    """Estimate square meters from floorplan images for all listings.

    Every listing from ``repository.get_listings()`` is passed to
    ``_calculate_sqm_ocr``; listings it returns (non-None) are handed to
    ``repository.upsert_listings``.

    Args:
        repository: Source of the listings and sink for the updated ones.
    """
    listings = await repository.get_listings()

    # Use a quarter of the cores for OCR, but never fewer than one:
    # on machines with under four cores the integer division yields 0, and
    # a Semaphore(0) would block every OCR task forever.
    max_parallel_ocr = max(1, multiprocessing.cpu_count() // 4)
    semaphore = asyncio.Semaphore(max_parallel_ocr)

    results = await tqdm.gather(
        *[_calculate_sqm_ocr(listing, semaphore) for listing in listings]
    )
    updated_listings = [listing for listing in results if listing is not None]
    await repository.upsert_listings(updated_listings)
|
||||||
|
|
||||||
|
|
||||||
|
async def _calculate_sqm_ocr(
    listing: Listing, semaphore: asyncio.Semaphore
) -> Listing | None:
    """Set ``listing.square_meters`` from OCR over its floorplan images.

    Listings that already have a ``square_meters`` value are left alone and
    reported as None. Otherwise ``floorplan.calculate_ocr`` is run in a worker
    thread for each path in ``listing.floorplan_image_paths``, one image at a
    time, with ``semaphore`` held for the duration of each call. The largest
    non-None estimate is stored; if there is none, 0 is stored, so the
    listing is not attempted again (the guard above only lets None through).

    Returns:
        The mutated listing, or None if it already had a value.
    """
    if listing.square_meters is not None:
        return None

    async def _estimate(image_path):
        # The semaphore bounds how many OCR calls run at the same time.
        async with semaphore:
            estimate, _unused = await asyncio.to_thread(
                floorplan.calculate_ocr, image_path
            )
        return estimate

    estimates = [await _estimate(path) for path in listing.floorplan_image_paths]
    usable = [value for value in estimates if value is not None]

    # Try once; if nothing could be read, keep 0.
    listing.square_meters = max(usable, default=0)
    return listing
|
||||||
Loading…
Add table
Add a link
Reference in a new issue