diff --git a/crawler/alembic/env.py b/crawler/alembic/env.py index 4c91778..7d2c55c 100644 --- a/crawler/alembic/env.py +++ b/crawler/alembic/env.py @@ -4,7 +4,7 @@ from sqlalchemy import engine_from_config from sqlalchemy import pool from alembic import context -from models import Listing # Import all models here +from models import Listing, User # Import all models here from database import engine import sqlmodel from sqlmodel import SQLModel diff --git a/crawler/alembic/versions/73f976506aa8_add_more_fields_to_tables.py b/crawler/alembic/versions/73f976506aa8_add_more_fields_to_tables.py new file mode 100644 index 0000000..29961d7 --- /dev/null +++ b/crawler/alembic/versions/73f976506aa8_add_more_fields_to_tables.py @@ -0,0 +1,42 @@ +"""add more fields to tables + +Revision ID: 73f976506aa8 +Revises: 042751f52538 +Create Date: 2025-06-11 20:54:05.429912 + +""" + +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +import sqlmodel + + +# revision identifiers, used by Alembic. +revision: str = "73f976506aa8" +down_revision: Union[str, None] = "042751f52538" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.create_table( + "user", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("email", sqlmodel.sql.sqltypes.AutoString(), nullable=False), + sa.Column("password", sqlmodel.sql.sqltypes.AutoString(), nullable=False), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index(op.f("ix_user_email"), "user", ["email"], unique=True) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_index(op.f("ix_user_email"), table_name="user") + op.drop_table("user") + # ### end Alembic commands ### diff --git a/crawler/api/app.py b/crawler/api/app.py index 08f4705..3a4d096 100644 --- a/crawler/api/app.py +++ b/crawler/api/app.py @@ -1,14 +1,30 @@ -from fastapi import FastAPI +from typing import Annotated +from fastapi import Depends, FastAPI, HTTPException, status +from fastapi.security import OAuth2PasswordBearer +from models.user import User from repositories.listing_repository import ListingRepository -from repositories.listing_repository import ListingRepository from database import engine +from repositories.user_repository import UserRepository app = FastAPI() +oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token") + + +async def decode_token(token: Annotated[str, Depends(oauth2_scheme)]): + repository = UserRepository(engine) + user = await repository.get_user_from_token(token) + if not user: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Invalid authentication credentials", + headers={"WWW-Authenticate": "Bearer"}, + ) + return user @app.get("/listing") -async def get_listing(): +async def get_listing(user: Annotated[User, Depends(decode_token)]): repository = ListingRepository(engine) listings = await repository.get_listings() return {"listings": listings} diff --git a/crawler/models/__init__.py b/crawler/models/__init__.py index 51ad9dc..6c25b27 100644 --- a/crawler/models/__init__.py +++ b/crawler/models/__init__.py @@ -1,4 +1,5 @@ from models.listing import Listing +from models.user import User -__all__ = ["Listing"] +__all__ = ["Listing", "User"] diff --git a/crawler/models/user.py b/crawler/models/user.py new file mode 100644 index 0000000..eed53eb --- /dev/null +++ b/crawler/models/user.py @@ -0,0 +1,8 @@ +from pydantic import EmailStr +from sqlmodel import SQLModel, Field + + +class User(SQLModel, table=True): + id: int | None = Field(default=None, primary_key=True) + email: EmailStr = Field(index=True,
unique=True) + password: str = Field(nullable=False) diff --git a/crawler/repositories/__init__.py b/crawler/repositories/__init__.py new file mode 100644 index 0000000..4cb7a28 --- /dev/null +++ b/crawler/repositories/__init__.py @@ -0,0 +1,5 @@ +from repositories.listing_repository import ListingRepository +from repositories.user_repository import UserRepository + + +__all__ = ["ListingRepository", "UserRepository"] diff --git a/crawler/repositories/listing_repository.py b/crawler/repositories/listing_repository.py new file mode 100644 index 0000000..e628e2f --- /dev/null +++ b/crawler/repositories/listing_repository.py @@ -0,0 +1,182 @@ +from datetime import datetime, timedelta +from data_access import Listing +from models.listing import ( + BuyListing, + FurnishType, + Listing as modelListing, + QueryParameters, + RentListing, +) +from sqlalchemy import Engine +from sqlmodel import Sequence, Session, and_, col, select +from sqlmodel.sql.expression import SelectOfScalar +from tqdm import tqdm + + +class ListingRepository: + engine: Engine + # anything priced at or above 20k is considered buy type + buy_listing_price_threshold: int = 20_000 + + def __init__(self, engine: Engine): + self.engine = engine + + async def get_listings( + self, + query_parameters: QueryParameters | None = None, + only_ids: list[int] | None = None, + ) -> list[modelListing]: + """ + Get all listings from the database. 
+ """ + only_ids = only_ids or [] + + query = select( + RentListing + ) # TODO: one nice day I will think of a way to query both rent and buy + if only_ids: + query = query.where(RentListing.id.in_(only_ids)) # type: ignore + query = self._add_where_from_query_parameters(query, query_parameters) + + with Session(self.engine) as session: + # query = select(modelListing) + return list(session.exec(query).all()) + + def _add_where_from_query_parameters( + self, + query: SelectOfScalar[RentListing], + query_parameters: QueryParameters | None = None, + ) -> SelectOfScalar[RentListing]: + if query_parameters is None: + return query + query = query.where( + RentListing.number_of_bedrooms.between( + query_parameters.min_bedrooms, query_parameters.max_bedrooms + ), + RentListing.price.between( + query_parameters.min_price, query_parameters.max_price + ), + ) + if query_parameters.min_sqm is not None: + query = query.where(RentListing.square_meters >= query_parameters.min_sqm) + if query_parameters.furnish_types: + query = query.where( + RentListing.furnish_type.in_(query_parameters.furnish_types) + ) + if query_parameters.let_date_available_from is not None: + query = query.where( + RentListing.available_from >= query_parameters.let_date_available_from + ) + if query_parameters.last_seen_days is not None: + last_seen_threshold = datetime.now() - timedelta( + days=query_parameters.last_seen_days + ) + query = query.where(RentListing.last_seen >= last_seen_threshold) + return query + + async def upsert_listings( + self, + listings: list[modelListing], + ) -> list[modelListing]: + """ + Upsert listings into the database. + """ + models = [] + with Session(self.engine) as session: + for listing in listings: + session.merge(listing) + models.append(listing) + session.commit() + return models + + async def upsert_listings_legacy( + self, + listings: list[Listing], + ) -> list[modelListing]: + """ + Upsert listings into the database. 
+ """ + models = [] + failed_to_upsert = [] + with Session(self.engine) as session: + for listing in tqdm(listings, desc="Upserting listings"): + # Convert Listing to modelListing + try: + model_listing = await self._get_concrete_listing(listing) + except Exception as e: # TODO: investigate why conversion errors are so frequent + # NOTE(review): leftover ipdb debug hook removed so + + # unattended runs are not blocked at a breakpoint + # If for whatever reason we cannot add listing, ignore and retry + print(f"Error converting listing {listing.identifier}: {e}") + failed_to_upsert.append(listing) + continue + session.merge(model_listing) + models.append(model_listing) + session.commit() + print(f"Failed to upsert {len(failed_to_upsert)} listings.") + return models + + async def _get_concrete_listing( + self, + listing: Listing, + ) -> modelListing: + now = datetime.now() + + if ( + listing.detailobject is None + or listing.detailobject.get("property") is None + or listing.detailobject["property"].get("letFurnishType") is None + ): + furnish_type_str = "unknown" + else: + furnish_type_str = listing.detailobject["property"]["letFurnishType"] + if furnish_type_str is None: + furnish_type_str = "unknown" + elif "landlord" in furnish_type_str.lower(): + furnish_type_str = "ask landlord" + else: + furnish_type_str = furnish_type_str.lower() + furnish_type = FurnishType(furnish_type_str) + + if listing.price < self.buy_listing_price_threshold: + model_listing = RentListing( + id=listing.identifier, + price=listing.price, + number_of_bedrooms=listing.bedrooms, + square_meters=await listing.sqm_ocr(), + agency=listing.agency, + council_tax_band=listing.councilTaxBand, + longtitude=listing.longtitude, + latitude=listing.latitude, + price_history_json=modelListing.serialize_price_history( + listing.priceHistory + ), + listing_site=listing.listing_site, + last_seen=now, + photo_thumbnail=listing.photoThumbnail, + furnish_type=furnish_type, + available_from=listing.letDateAvailable, + additional_info=listing.detailobject, + ) + else: + model_listing = BuyListing( + 
id=listing.identifier, + price=listing.price, + number_of_bedrooms=listing.bedrooms, + square_meters=await listing.sqm_ocr(), + agency=listing.agency, + council_tax_band=listing.councilTaxBand, + longtitude=listing.longtitude, + latitude=listing.latitude, + price_history_json=modelListing.serialize_price_history( + listing.priceHistory + ), + listing_site=listing.listing_site, + last_seen=now, + photo_thumbnail=listing.photoThumbnail, + service_charge=listing.serviceCharge, + additional_info=listing.detailobject, + ) + + return model_listing diff --git a/crawler/repositories/user_repository.py b/crawler/repositories/user_repository.py new file mode 100644 index 0000000..028949c --- /dev/null +++ b/crawler/repositories/user_repository.py @@ -0,0 +1,17 @@ +from models.user import User +from sqlalchemy import Engine +from sqlmodel import Session, select + + +class UserRepository: + engine: Engine + + def __init__(self, engine: Engine): + self.engine = engine + + async def get_user_from_token(self, token: str) -> User | None: + # TODO: decode the token and look up the matching user. + # The earlier draft ran an unfiltered select(User).first(), + # which would have returned the first user for ANY token -- + # an authentication bypass; it was also unreachable dead code. + raise NotImplementedError()