diff --git a/crawler/5_routing.py b/crawler/5_routing.py index 2717d81..455ccd5 100644 --- a/crawler/5_routing.py +++ b/crawler/5_routing.py @@ -1,45 +1,67 @@ -from data_access import Listing -from tqdm import tqdm +from models.listing import DestinationMode, Route, RouteLegStep +from repositories.listing_repository import ListingRepository +from tqdm.asyncio import tqdm from rec import routing +from models import Listing async def calculate_route( - listing_paths: list[str], + repository: ListingRepository, destination_address: str, travel_mode: routing.TravelMode, + limit: int | None = None, ) -> None: + listings = await repository.get_listings() - listings = Listing.get_all_listings(listing_paths) + if limit is not None: + listings = listings[:limit] - # reduce listings to everything within 7 miles - filtered_listings = [] - for listing in listings: - print(f"Processing {listing.identifier}") - if listing.isRemoved: - print(f"Removed-Skip: Skipping {listing.identifier} " "is already removed.") - continue - sqm_ocr = await listing.sqm_ocr() - if sqm_ocr is None or sqm_ocr < 30 or sqm_ocr > 200: - print( - ( - f"Floorplan-Skip: Skipping {listing.identifier} as " - f"sqm_ocr is {sqm_ocr}" + destimation_mode = DestinationMode(destination_address, travel_mode) + updated_listings = await tqdm.gather( + *[update_routing_info(listing, destimation_mode) for listing in listings], + total=len(listings), + desc="Updating routing info", + ) + await repository.upsert_listings( + [listing for listing in updated_listings if listing is not None] + ) + + +async def update_routing_info( + listing: Listing, destination_mode: DestinationMode +) -> Listing | None: + if listing.routing_info.get(destination_mode) is not None: + # already calculated, do not recompute to save API calls + return None + + routes_data = routing.transit_route( + listing.latitude, + listing.longtitude, + destination_mode.destination_address, + destination_mode.travel_mode, + ) + + route_data = routes_data["routes"][0] + routes = [] + for route_data in routes_data["routes"]: + duration_s = int(route_data["duration"].split("s")[0]) + route = Route( + legs=[ + RouteLegStep( + distance_meters=step_data["distanceMeters"], + duration_s=int(step_data["staticDuration"].split("s")[0]), + travel_mode=routing.TravelMode(step_data["travelMode"]), ) - ) - continue - filtered_listings.append(listing) - - print(f"Filtered listings from {len(listings)} to {len(filtered_listings)}") - - for listing in tqdm(filtered_listings): - listing.calculate_route( - destination_address, - travel_mode, - recalculate=False, + for step_data in route_data["legs"][0]["steps"] + ], + distance_meters=route_data["distanceMeters"], + duration_s=duration_s, ) - traveltime = listing.travel_time(destination_address, travel_mode)[0] - duration_minutes = traveltime["duration"] / 60.0 - tqdm.write(f"{listing.identifier} {duration_minutes}") + routes.append(route) + listing.routing_info_json = listing.serialize_routing_info( + {**listing.routing_info, **{destination_mode: routes}} + ) + return listing # async def geocode_address( diff --git a/crawler/alembic/versions/b78e1ed31eed_add_more_fields_to_tables.py b/crawler/alembic/versions/72b7410ff3e6_add_more_fields_to_tables.py similarity index 89% rename from crawler/alembic/versions/b78e1ed31eed_add_more_fields_to_tables.py rename to crawler/alembic/versions/72b7410ff3e6_add_more_fields_to_tables.py index eed35ce..a4e2e0b 100644 --- a/crawler/alembic/versions/b78e1ed31eed_add_more_fields_to_tables.py +++ b/crawler/alembic/versions/72b7410ff3e6_add_more_fields_to_tables.py @@ -1,8 +1,8 @@ """add more fields to tables -Revision ID: b78e1ed31eed +Revision ID: 72b7410ff3e6 Revises: -Create Date: 2025-06-06 19:50:09.773676 +Create Date: 2025-06-08 10:56:22.448446 """ @@ -14,7 +14,7 @@ import sqlmodel # revision identifiers, used by Alembic. -revision: str = "b78e1ed31eed" +revision: str = "72b7410ff3e6" down_revision: Union[str, None] = None branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -30,6 +30,9 @@ def upgrade() -> None: sa.Column("number_of_bedrooms", sa.Integer(), nullable=False), sa.Column("square_meters", sa.Float(), nullable=True), sa.Column("agency", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + sa.Column( + "council_tax_band", sqlmodel.sql.sqltypes.AutoString(), nullable=True + ), sa.Column("longtitude", sa.Float(), nullable=False), sa.Column("latitude", sa.Float(), nullable=False), sa.Column("price_history", sa.JSON(), nullable=False), @@ -38,11 +41,10 @@ def upgrade() -> None: ), sa.Column("last_seen", sa.DateTime(), nullable=False), sa.Column("photo_thumbnail", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + sa.Column("floorplan_image_paths", sa.JSON(), nullable=False), sa.Column("additional_info", sa.JSON(), nullable=False), + sa.Column("routing_info_json", sa.String(), nullable=True), sa.Column("service_charge", sa.Float(), nullable=True), - sa.Column( - "council_tax_band", sqlmodel.sql.sqltypes.AutoString(), nullable=True - ), sa.Column("lease_left", sa.Integer(), nullable=True), sa.PrimaryKeyConstraint("id"), ) @@ -64,7 +66,9 @@ def upgrade() -> None: ), sa.Column("last_seen", sa.DateTime(), nullable=False), sa.Column("photo_thumbnail", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + sa.Column("floorplan_image_paths", sa.JSON(), nullable=False), sa.Column("additional_info", sa.JSON(), nullable=False), + sa.Column("routing_info_json", sa.String(), nullable=True), sa.Column("available_from", sa.DateTime(), nullable=True), sa.Column( "furnish_type", diff --git a/crawler/alembic/versions/8a7accc583c9_add_more_fields_to_tables.py b/crawler/alembic/versions/8a7accc583c9_add_more_fields_to_tables.py deleted file mode 100644 index 13b6981..0000000 --- a/crawler/alembic/versions/8a7accc583c9_add_more_fields_to_tables.py +++ /dev/null @@ -1,34 +0,0 @@ -"""add more fields to tables - -Revision ID: 8a7accc583c9 -Revises: b2ffa638aafc -Create Date: 2025-06-07 13:38:08.805386 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa - - -# revision identifiers, used by Alembic. -revision: str = '8a7accc583c9' -down_revision: Union[str, None] = 'b2ffa638aafc' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - """Upgrade schema.""" - # ### commands auto generated by Alembic - please adjust! ### - op.add_column('buylisting', sa.Column('floorplan_image_paths', sa.JSON(), nullable=False)) - op.add_column('rentlisting', sa.Column('floorplan_image_paths', sa.JSON(), nullable=False)) - # ### end Alembic commands ### - - -def downgrade() -> None: - """Downgrade schema.""" - # ### commands auto generated by Alembic - please adjust! ### - op.drop_column('rentlisting', 'floorplan_image_paths') - op.drop_column('buylisting', 'floorplan_image_paths') - # ### end Alembic commands ### diff --git a/crawler/alembic/versions/b2ffa638aafc_add_more_fields_to_tables.py b/crawler/alembic/versions/b2ffa638aafc_add_more_fields_to_tables.py deleted file mode 100644 index 853bc62..0000000 --- a/crawler/alembic/versions/b2ffa638aafc_add_more_fields_to_tables.py +++ /dev/null @@ -1,32 +0,0 @@ -"""add more fields to tables - -Revision ID: b2ffa638aafc -Revises: b78e1ed31eed -Create Date: 2025-06-07 12:18:28.963851 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa - - -# revision identifiers, used by Alembic. -revision: str = 'b2ffa638aafc' -down_revision: Union[str, None] = 'b78e1ed31eed' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - """Upgrade schema.""" - # ### commands auto generated by Alembic - please adjust! ### - pass - # ### end Alembic commands ### - - -def downgrade() -> None: - """Downgrade schema.""" - # ### commands auto generated by Alembic - please adjust! ### - pass - # ### end Alembic commands ### diff --git a/crawler/main.py b/crawler/main.py index 931518b..41a7331 100644 --- a/crawler/main.py +++ b/crawler/main.py @@ -118,7 +118,6 @@ def listing_filter_options(func): def cli(ctx, data_dir: str): ctx.ensure_object(dict) ctx.obj["data_dir"] = data_dir - pass @cli.command() @@ -218,12 +217,14 @@ def routing( "Please set it to your API key for the routing service." ) + repository = ListingRepository(engine=engine) asyncio.run( routing_module.calculate_route( - listing_paths, + repository, destination_address, # destination_address_coordinates, TravelMode[travel_mode], + limit=limit, ) ) diff --git a/crawler/models/listing.py b/crawler/models/listing.py index 17ed15c..fc8ff0d 100644 --- a/crawler/models/listing.py +++ b/crawler/models/listing.py @@ -1,9 +1,13 @@ -from dataclasses import dataclass -from datetime import datetime +from __future__ import annotations +from dataclasses import asdict, dataclass +import dataclasses +from datetime import datetime, timedelta import enum +import json from pathlib import Path from typing import Any, Dict, List -from sqlmodel import JSON, Column, Enum, SQLModel, Field, String, TypeDecorator +from rec import routing +from sqlmodel import JSON, SQLModel, Field, String @dataclass @@ -13,6 +17,28 @@ class PriceHistoryItem: price: float +@dataclass(frozen=True) +class Route: + legs: list[RouteLegStep] + distance_meters: int + duration_s: int + + @property + def duration(self) -> timedelta: + return timedelta(seconds=self.duration_s) + + +@dataclass(frozen=True) +class RouteLegStep: + distance_meters: int + duration_s: int + travel_mode: routing.TravelMode + + @property + def duration(self) -> timedelta: + return timedelta(seconds=self.duration_s) + + class ListingSite(enum.StrEnum): RIGHTMOVE = "rightmove" # ZOOPLA = "zoopla" @@ -38,6 +64,69 @@ class Listing(SQLModel, table=False): additional_info: Dict[str, Any] = Field( default_factory=dict, sa_type=JSON, nullable=False ) + routing_info_json: str = Field( + sa_type=String, nullable=True, default=None + ) # Store as JSON string for simplicity + + @property + def is_removed(self) -> bool: + return not self.additional_info["property"]["visible"] + + @property + def routing_info(self) -> dict[DestinationMode, List[Route]]: + """ + Returns a list of DestinationMode objects from the routing_info_str. + """ + if not self.routing_info_json: + return {} + + # TODO: move to a separate serializer class + json_data = json.loads(self.routing_info_json) + destimation_routes = {} + for destination_mode_str, routes_json in json_data.items(): + destination_mode = DestinationMode( + destination_address=json.loads(destination_mode_str)[ + "destination_address" + ], + travel_mode=routing.TravelMode( + json.loads(destination_mode_str)["travel_mode"] + ), + ) + parsed_route = json.loads(routes_json[0]) + routes = [ + Route( + legs=[ + RouteLegStep( + distance_meters=step["distance_meters"], + duration_s=step["duration_s"], + travel_mode=routing.TravelMode(step["travel_mode"]), + ) + for step in parsed_route["legs"] + ], + distance_meters=parsed_route["distance_meters"], + duration_s=int(parsed_route["duration_s"]), + ) + ] + destimation_routes[destination_mode] = routes + return destimation_routes + + def serialize_routing_info( + self, routing_info: dict[DestinationMode, list[Route]] + ) -> str: + """ + Serializes the routing_info to a JSON string. + """ + # TODO: move to a separate serializer class + # for destination_mode, routes in routing_info.items(): + serialized = json.dumps( + { + json.dumps(dataclasses.asdict(destination_mode)): [ + json.dumps(dataclasses.asdict(route)) for route in routes + ] + for destination_mode, routes in routing_info.items() + } + ) + return serialized class FurnishType(enum.StrEnum): @@ -57,3 +146,39 @@ class BuyListing(Listing, table=True): lease_left: int | None = Field( default=None, nullable=True ) # in years, e.g., 90, 80, etc. + + +@dataclass(frozen=True) +class DestinationMode: + destination_address: str + travel_mode: routing.TravelMode + + def __hash__(self) -> int: + return hash((self.destination_address, self.travel_mode)) + + # def to_dict(self) -> dict[str, str | routing.TravelMode]: + # return { + # "destination_address": self.destination_address, + # "travel_mode": self.travel_mode.value, + # } + + # @classmethod + # def from_dict(cls, data: dict): + # return cls( + # destination_address=data["destination_address"], + # travel_mode=routing.TravelMode(data["travel_mode"]), + # ) + + # def __json__(self) -> dict[str, str | routing.TravelMode]: + # return { + # "destination_address": self.destination_address, + # "travel_mode": self.travel_mode.value, + # } + + def __getstate__(self): + # This allows serializers to pick up a dict representation + return asdict(self) + + def __iter__(self): + # Makes it behave like a dict when expected + return iter(asdict(self).items())