diff --git a/crawler/1_dump_listings.py b/crawler/1_dump_listings.py index fdc12ac..f3751ab 100644 --- a/crawler/1_dump_listings.py +++ b/crawler/1_dump_listings.py @@ -65,7 +65,7 @@ async def dump_listings( for listing, detail in zip(listings_without_details, listing_details): listing._details_object = detail - model_listings = await repository.upsert_listings(listings) # upsert in db + model_listings = await repository.upsert_listings_legacy(listings) # upsert in db await dump_listings_to_fs(listings) return model_listings diff --git a/crawler/3_dump_images.py b/crawler/3_dump_images.py index bb14855..2cac145 100644 --- a/crawler/3_dump_images.py +++ b/crawler/3_dump_images.py @@ -1,36 +1,45 @@ import asyncio import json +from pathlib import Path import aiohttp +from repositories import ListingRepository from tqdm.asyncio import tqdm -from data_access import Listing + +# from data_access import Listing +from models import Listing # Setting this too high either crashes rightmove or gets us blocked semaphore = asyncio.Semaphore(10) -async def dump_images(listing_paths: list[str]): - listings = Listing.get_all_listings(listing_paths) - await tqdm.gather(*[dump_images_for_listing(listing) for listing in listings]) +async def dump_images(repository: ListingRepository, image_base_path: Path): + listings = await repository.get_listings() + updated_listings = await tqdm.gather( + *[dump_images_for_listing(listing, image_base_path) for listing in listings] + ) + await repository.upsert_listings( + [listing for listing in updated_listings if listing is not None] + ) -async def dump_images_for_listing(listing: Listing): - with open(listing.path_detail_json()) as f: - detail = json.load(f) - - for photo in detail["property"]["floorplans"]: - url = photo["url"] +async def dump_images_for_listing(listing: Listing, base_path: Path) -> Listing | None: + all_floorplans = listing.additional_info["property"]["floorplans"] + for floorplan in all_floorplans: + url = floorplan["url"] picname = url.split("/")[-1] - order = photo["order"] - p = listing.path_floorplan_file(order, picname) - if p.exists(): + floorplan_path = Path(base_path, str(listing.id), "floorplans", picname) + if floorplan_path.exists(): continue try: - async with aiohttp.ClientSession() as session: - async with semaphore: + async with semaphore: + async with aiohttp.ClientSession() as session: async with session.get(url) as response: if response.status != 200: raise Exception(f"Error for {url}: {response.status}") - with open(p, "wb") as f: + floorplan_path.parent.mkdir(parents=True, exist_ok=True) + with open(floorplan_path, "wb") as f: f.write(await response.read()) + listing.floorplan_image_paths.append(str(floorplan_path)) + return listing except Exception as e: tqdm.write(f"Error for {url}: {e}") diff --git a/crawler/alembic/versions/8a7accc583c9_add_more_fields_to_tables.py b/crawler/alembic/versions/8a7accc583c9_add_more_fields_to_tables.py new file mode 100644 index 0000000..13b6981 --- /dev/null +++ b/crawler/alembic/versions/8a7accc583c9_add_more_fields_to_tables.py @@ -0,0 +1,34 @@ +"""add more fields to tables + +Revision ID: 8a7accc583c9 +Revises: b2ffa638aafc +Create Date: 2025-06-07 13:38:08.805386 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '8a7accc583c9' +down_revision: Union[str, None] = 'b2ffa638aafc' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('buylisting', sa.Column('floorplan_image_paths', sa.JSON(), nullable=False)) + op.add_column('rentlisting', sa.Column('floorplan_image_paths', sa.JSON(), nullable=False)) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column('rentlisting', 'floorplan_image_paths') + op.drop_column('buylisting', 'floorplan_image_paths') + # ### end Alembic commands ### diff --git a/crawler/alembic/versions/b2ffa638aafc_add_more_fields_to_tables.py b/crawler/alembic/versions/b2ffa638aafc_add_more_fields_to_tables.py new file mode 100644 index 0000000..853bc62 --- /dev/null +++ b/crawler/alembic/versions/b2ffa638aafc_add_more_fields_to_tables.py @@ -0,0 +1,32 @@ +"""add more fields to tables + +Revision ID: b2ffa638aafc +Revises: b78e1ed31eed +Create Date: 2025-06-07 12:18:28.963851 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'b2ffa638aafc' +down_revision: Union[str, None] = 'b78e1ed31eed' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + pass + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + pass + # ### end Alembic commands ### diff --git a/crawler/main.py b/crawler/main.py index b051530..a9942d7 100644 --- a/crawler/main.py +++ b/crawler/main.py @@ -166,8 +166,8 @@ def dump_listings( def dump_images(ctx: click.core.Context): data_dir = ctx.obj["data_dir"] click.echo(f"Running dump_images stored in {data_dir}") - listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json"))) - asyncio.run(dump_images_module.dump_images(listing_paths)) + repository = ListingRepository(engine=engine) + asyncio.run(dump_images_module.dump_images(repository, image_base_path=data_dir)) @cli.command() diff --git a/crawler/models/listing.py b/crawler/models/listing.py index 1b4c9aa..17ed15c 100644 --- a/crawler/models/listing.py +++ b/crawler/models/listing.py @@ -1,8 +1,9 @@ from dataclasses import dataclass from datetime import datetime import enum +from pathlib import Path from typing import Any, Dict, List -from sqlmodel import JSON, Column, Enum, SQLModel, Field +from sqlmodel import JSON, Column, Enum, SQLModel, Field, String, TypeDecorator @dataclass @@ -31,6 +32,9 @@ class Listing(SQLModel, table=False): listing_site: ListingSite = Field(nullable=False) last_seen: datetime = Field(default_factory=datetime.now, nullable=False) photo_thumbnail: str | None = Field(default=None, nullable=True) + floorplan_image_paths: List[str] = Field( + default_factory=list, sa_type=JSON, nullable=False + ) additional_info: Dict[str, Any] = Field( default_factory=dict, sa_type=JSON, nullable=False )