migrate dump images command to use model listings

This commit is contained in:
Viktor Barzin 2025-06-07 13:56:00 +00:00
parent 4f5a934fa9
commit ba87d07cd2
No known key found for this signature in database
GPG key ID: 4056458DBDBF8863
6 changed files with 99 additions and 20 deletions

View file

@ -65,7 +65,7 @@ async def dump_listings(
for listing, detail in zip(listings_without_details, listing_details):
listing._details_object = detail
model_listings = await repository.upsert_listings(listings) # upsert in db
model_listings = await repository.upsert_listings_legacy(listings) # upsert in db
await dump_listings_to_fs(listings)
return model_listings

View file

@ -1,36 +1,45 @@
import asyncio
import json
from pathlib import Path
import aiohttp
from repositories import ListingRepository
from tqdm.asyncio import tqdm
from data_access import Listing
# from data_access import Listing
from models import Listing
# Caps how many image downloads run at once: dump_images_for_listing acquires
# it around every HTTP request.
# Setting this too high either crashes rightmove or gets us blocked
semaphore = asyncio.Semaphore(10)
async def dump_images(repository: ListingRepository, image_base_path: Path):
    """Download missing floorplan images for every listing in the repository.

    Args:
        repository: Source of the listings and sink for the updated ones.
        image_base_path: Directory under which the per-listing image folders
            are created.

    Listings for which ``dump_images_for_listing`` returns ``None`` (nothing
    new was stored) are not written back.
    """
    listings = await repository.get_listings()
    results = await tqdm.gather(
        *[dump_images_for_listing(listing, image_base_path) for listing in listings]
    )
    changed_listings = [listing for listing in results if listing is not None]
    # Nothing was downloaded: skip the database round trip entirely.
    if changed_listings:
        await repository.upsert_listings(changed_listings)
async def dump_images_for_listing(listing: Listing, base_path: Path) -> Listing | None:
    """Download every floorplan image of *listing* that is not on disk yet.

    Images are stored as ``<base_path>/<listing.id>/floorplans/<file name>``,
    where the file name is the last segment of the image URL. Each stored path
    is recorded in ``listing.floorplan_image_paths``.

    Args:
        listing: Listing whose ``additional_info["property"]["floorplans"]``
            entries (each carrying a ``"url"``) describe the images.
        base_path: Root directory for downloaded images.

    Returns:
        The listing when at least one new image was stored, otherwise ``None``.
        Download errors are reported through ``tqdm.write`` and do not abort
        the remaining floorplans.
    """
    # additional_info defaults to an empty dict, so tolerate listings without
    # floorplan data instead of raising KeyError and aborting the whole batch.
    property_info = listing.additional_info.get("property") or {}
    all_floorplans = property_info.get("floorplans") or []

    stored_any = False
    for floorplan in all_floorplans:
        url = floorplan["url"]
        picname = url.split("/")[-1]
        floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
        if floorplan_path.exists():
            continue
        try:
            # Take the semaphore before opening a session so at most ten
            # sessions/requests exist at any time.
            async with semaphore:
                async with aiohttp.ClientSession() as session:
                    async with session.get(url) as response:
                        if response.status != 200:
                            raise Exception(f"Error for {url}: {response.status}")
                        # Read the full body before touching the filesystem so
                        # a failed read cannot leave an empty file behind that
                        # later runs would skip as "already downloaded".
                        payload = await response.read()
            floorplan_path.parent.mkdir(parents=True, exist_ok=True)
            with open(floorplan_path, "wb") as f:
                f.write(payload)
            # NOTE(review): reassigning (rather than appending in place) so
            # the change is visible to ORM attribute tracking if the JSON
            # column is not wrapped in a mutable type; confirm against the
            # repository's upsert implementation.
            listing.floorplan_image_paths = [
                *listing.floorplan_image_paths,
                str(floorplan_path),
            ]
            stored_any = True
        except Exception as e:
            # Best effort: report and carry on with the next floorplan.
            tqdm.write(f"Error for {url}: {e}")
    # Return only after all floorplans were attempted; returning inside the
    # loop would stop after the first successful download.
    return listing if stored_any else None

View file

@ -0,0 +1,34 @@
"""add more fields to tables
Revision ID: 8a7accc583c9
Revises: b2ffa638aafc
Create Date: 2025-06-07 13:38:08.805386
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
# `revision` is this migration's id; `down_revision` is the migration it
# revises (its parent in the revision chain).
revision: str = '8a7accc583c9'
down_revision: Union[str, None] = 'b2ffa638aafc'
# No branch labels or cross-revision dependencies are declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Upgrade schema.

    Adds a non-nullable JSON ``floorplan_image_paths`` column to the
    ``buylisting`` and ``rentlisting`` tables.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    # A NOT NULL column cannot be added to a table that already holds rows
    # unless the database can fill it in, so give it a server-side default of
    # an empty JSON list.
    # NOTE(review): the literal-default syntax for JSON columns is
    # dialect-specific - confirm it against the target database.
    for table_name in ("buylisting", "rentlisting"):
        op.add_column(
            table_name,
            sa.Column(
                "floorplan_image_paths",
                sa.JSON(),
                nullable=False,
                server_default=sa.text("'[]'"),
            ),
        )
    # ### end Alembic commands ###
def downgrade() -> None:
    """Downgrade schema.

    Drops the ``floorplan_image_paths`` column from ``rentlisting`` and then
    from ``buylisting``.
    """
    # ### commands auto generated by Alembic - please adjust! ###
    column_name = "floorplan_image_paths"
    for table_name in ("rentlisting", "buylisting"):
        op.drop_column(table_name, column_name)
    # ### end Alembic commands ###

View file

@ -0,0 +1,32 @@
"""add more fields to tables
Revision ID: b2ffa638aafc
Revises: b78e1ed31eed
Create Date: 2025-06-07 12:18:28.963851
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
# `revision` is this migration's id; `down_revision` is the migration it
# revises (its parent in the revision chain).
revision: str = 'b2ffa638aafc'
down_revision: Union[str, None] = 'b78e1ed31eed'
# No branch labels or cross-revision dependencies are declared.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Upgrade schema (this revision performs no schema operations)."""
    # ### commands auto generated by Alembic - please adjust! ###
    # The autogenerated command section for this revision is empty.
    return None
    # ### end Alembic commands ###
def downgrade() -> None:
    """Downgrade schema (this revision performs no schema operations)."""
    # ### commands auto generated by Alembic - please adjust! ###
    # The autogenerated command section for this revision is empty.
    return None
    # ### end Alembic commands ###

View file

@ -166,8 +166,8 @@ def dump_listings(
def dump_images(ctx: click.core.Context):
    """Download floorplan images for the listings held in the database.

    Reads ``data_dir`` from the click context object and uses it as the base
    directory under which the images are stored.
    """
    data_dir = ctx.obj["data_dir"]
    click.echo(f"Running dump_images stored in {data_dir}")
    repository = ListingRepository(engine=engine)
    # The async entry point annotates image_base_path as a Path, so normalise
    # whatever the context holds before handing it over.
    asyncio.run(
        dump_images_module.dump_images(
            repository, image_base_path=pathlib.Path(data_dir)
        )
    )
@cli.command()

View file

@ -1,8 +1,9 @@
from dataclasses import dataclass
from datetime import datetime
import enum
from pathlib import Path
from typing import Any, Dict, List
from sqlmodel import JSON, Column, Enum, SQLModel, Field
from sqlmodel import JSON, Column, Enum, SQLModel, Field, String, TypeDecorator
@dataclass
@ -31,6 +32,9 @@ class Listing(SQLModel, table=False):
listing_site: ListingSite = Field(nullable=False)
last_seen: datetime = Field(default_factory=datetime.now, nullable=False)
photo_thumbnail: str | None = Field(default=None, nullable=True)
floorplan_image_paths: List[str] = Field(
default_factory=list, sa_type=JSON, nullable=False
)
additional_info: Dict[str, Any] = Field(
default_factory=dict, sa_type=JSON, nullable=False
)