migrate dump images command to use model listings

This commit is contained in:
Viktor Barzin 2025-06-07 13:56:00 +00:00
parent 4f5a934fa9
commit ba87d07cd2
No known key found for this signature in database
GPG key ID: 4056458DBDBF8863
6 changed files with 99 additions and 20 deletions

View file

@ -65,7 +65,7 @@ async def dump_listings(
for listing, detail in zip(listings_without_details, listing_details): for listing, detail in zip(listings_without_details, listing_details):
listing._details_object = detail listing._details_object = detail
model_listings = await repository.upsert_listings(listings) # upsert in db model_listings = await repository.upsert_listings_legacy(listings) # upsert in db
await dump_listings_to_fs(listings) await dump_listings_to_fs(listings)
return model_listings return model_listings

View file

@ -1,36 +1,45 @@
import asyncio import asyncio
import json import json
from pathlib import Path
import aiohttp import aiohttp
from repositories import ListingRepository
from tqdm.asyncio import tqdm from tqdm.asyncio import tqdm
from data_access import Listing
# from data_access import Listing
from models import Listing
# Upper bound on simultaneous image requests.
# Setting this too high either crashes rightmove or gets us blocked
_MAX_CONCURRENT_DOWNLOADS = 10
semaphore = asyncio.Semaphore(_MAX_CONCURRENT_DOWNLOADS)
async def dump_images(repository: ListingRepository, image_base_path: Path):
    """Download floorplan images for every listing held by ``repository``.

    All listings returned by ``repository.get_listings()`` are handed to
    ``dump_images_for_listing`` and awaited together through ``tqdm.gather``.
    The listings it returns (``None`` results are dropped) are then written
    back with ``repository.upsert_listings``.

    Args:
        repository: Source of the listings and target of the upsert.
        image_base_path: Root directory passed on to
            ``dump_images_for_listing`` for storing the images.
    """
    stored_listings = await repository.get_listings()
    downloads = [
        dump_images_for_listing(entry, image_base_path) for entry in stored_listings
    ]
    results = await tqdm.gather(*downloads)
    # Only listings that were handed back need to be persisted again.
    changed = list(filter(lambda item: item is not None, results))
    await repository.upsert_listings(changed)
async def dump_images_for_listing(listing: Listing, base_path: Path) -> Listing | None:
    """Download every floorplan image of ``listing`` that is not on disk yet.

    Images are stored at ``<base_path>/<listing.id>/floorplans/<file name>``,
    where the file name is the last path segment of the floorplan URL. Files
    that already exist are skipped. A failed download is reported through
    ``tqdm.write`` and does not stop the remaining floorplans from being
    fetched.

    Args:
        listing: Listing whose ``additional_info["property"]["floorplans"]``
            entries (mappings with a ``"url"`` key) are downloaded.
        base_path: Root directory under which the images are written.

    Returns:
        ``listing`` with the newly written paths recorded in
        ``floorplan_image_paths`` when at least one image was downloaded,
        otherwise ``None``.
    """
    # additional_info defaults to an empty dict, so a listing without details
    # must be skipped here; a KeyError would propagate out of the gather.
    property_info = listing.additional_info.get("property") or {}
    all_floorplans = property_info.get("floorplans") or []

    new_paths: list[str] = []
    for floorplan in all_floorplans:
        url = floorplan["url"]
        picname = url.split("/")[-1]
        floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
        if floorplan_path.exists():
            continue
        try:
            async with semaphore:
                async with aiohttp.ClientSession() as session:
                    async with session.get(url) as response:
                        if response.status != 200:
                            raise Exception(f"Error for {url}: {response.status}")
                        content = await response.read()
            # Create the file only after the body was read completely, so a
            # failed read cannot leave an empty file that later runs would
            # mistake for a finished download.
            floorplan_path.parent.mkdir(parents=True, exist_ok=True)
            with open(floorplan_path, "wb") as f:
                f.write(content)
            new_paths.append(str(floorplan_path))
        except Exception as e:
            # Best effort: report and carry on with the next floorplan.
            tqdm.write(f"Error for {url}: {e}")

    if not new_paths:
        return None

    recorded = list(listing.floorplan_image_paths)
    recorded.extend(path for path in new_paths if path not in recorded)
    # Assign a fresh list instead of appending in place: a plain JSON column
    # does not detect in-place mutation, re-assignment marks it as changed.
    listing.floorplan_image_paths = recorded
    return listing

View file

@ -0,0 +1,34 @@
"""add more fields to tables
Revision ID: 8a7accc583c9
Revises: b2ffa638aafc
Create Date: 2025-06-07 13:38:08.805386
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
# `revision` is this migration's own ID; `down_revision` is the migration it
# revises (see "Revises" in the module docstring).
revision: str = '8a7accc583c9'
down_revision: Union[str, None] = 'b2ffa638aafc'
# No branch labels and no dependencies on other revisions are declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Upgrade schema: add ``floorplan_image_paths`` to both listing tables.

    The column is a non-nullable JSON column. A server-side default of an
    empty JSON array is supplied so that rows already present in
    ``buylisting`` and ``rentlisting`` receive a value; PostgreSQL and SQLite
    refuse to add a NOT NULL column without a default to a populated table.
    """
    # NOTE(review): the '[]' literal default is accepted by SQLite and
    # PostgreSQL - confirm it for any other backend this project targets.
    for table_name in ('buylisting', 'rentlisting'):
        op.add_column(
            table_name,
            sa.Column(
                'floorplan_image_paths',
                sa.JSON(),
                nullable=False,
                server_default=sa.text("'[]'"),
            ),
        )
def downgrade() -> None:
    """Downgrade schema: drop ``floorplan_image_paths`` from both listing tables."""
    # Same order as the auto-generated script: rentlisting first, then buylisting.
    for table_name in ('rentlisting', 'buylisting'):
        op.drop_column(table_name, 'floorplan_image_paths')

View file

@ -0,0 +1,32 @@
"""add more fields to tables
Revision ID: b2ffa638aafc
Revises: b78e1ed31eed
Create Date: 2025-06-07 12:18:28.963851
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
# `revision` is this migration's own ID; `down_revision` is the migration it
# revises (see "Revises" in the module docstring).
revision: str = 'b2ffa638aafc'
down_revision: Union[str, None] = 'b78e1ed31eed'
# No branch labels and no dependencies on other revisions are declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Upgrade schema.

    This revision performs no schema operations; it only occupies a place in
    the migration chain.
    """
    return None
def downgrade() -> None:
    """Downgrade schema.

    Nothing to undo: the matching upgrade step performs no schema operations.
    """
    return None

View file

@ -166,8 +166,8 @@ def dump_listings(
def dump_images(ctx: click.core.Context):
    # Reads the data directory from the click context, builds a repository on
    # the module-level engine and runs the async image dump to completion.
    # (A `#` comment is used on purpose: a docstring would become the
    # command's --help text.)
    data_dir = ctx.obj["data_dir"]
    click.echo(f"Running dump_images stored in {data_dir}")
    download = dump_images_module.dump_images(
        ListingRepository(engine=engine),
        image_base_path=data_dir,
    )
    asyncio.run(download)
@cli.command() @cli.command()

View file

@ -1,8 +1,9 @@
from dataclasses import dataclass from dataclasses import dataclass
from datetime import datetime from datetime import datetime
import enum import enum
from pathlib import Path
from typing import Any, Dict, List from typing import Any, Dict, List
from sqlmodel import JSON, Column, Enum, SQLModel, Field from sqlmodel import JSON, Column, Enum, SQLModel, Field, String, TypeDecorator
@dataclass @dataclass
@ -31,6 +32,9 @@ class Listing(SQLModel, table=False):
listing_site: ListingSite = Field(nullable=False) listing_site: ListingSite = Field(nullable=False)
last_seen: datetime = Field(default_factory=datetime.now, nullable=False) last_seen: datetime = Field(default_factory=datetime.now, nullable=False)
photo_thumbnail: str | None = Field(default=None, nullable=True) photo_thumbnail: str | None = Field(default=None, nullable=True)
floorplan_image_paths: List[str] = Field(
default_factory=list, sa_type=JSON, nullable=False
)
additional_info: Dict[str, Any] = Field( additional_info: Dict[str, Any] = Field(
default_factory=dict, sa_type=JSON, nullable=False default_factory=dict, sa_type=JSON, nullable=False
) )