From b7a2ea75aa0b64c714f6ecdcdfcc89b3ebf2bb8b Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 6 Jun 2025 19:57:50 +0000 Subject: [PATCH] add json field to store any additional blob of data that may be missing; also populate db when dumping listings --- crawler/alembic/env.py | 1 + .../4e3b4590920f_add_more_fields_to_tables.py | 32 ------------------- ...b78e1ed31eed_add_more_fields_to_tables.py} | 17 +++++++--- crawler/main.py | 6 +++- crawler/models/listing.py | 3 ++ 5 files changed, 22 insertions(+), 37 deletions(-) delete mode 100644 crawler/alembic/versions/4e3b4590920f_add_more_fields_to_tables.py rename crawler/alembic/versions/{f7486e403e2f_add_more_fields_to_tables.py => b78e1ed31eed_add_more_fields_to_tables.py} (85%) diff --git a/crawler/alembic/env.py b/crawler/alembic/env.py index 1f0b5a1..4c91778 100644 --- a/crawler/alembic/env.py +++ b/crawler/alembic/env.py @@ -6,6 +6,7 @@ from sqlalchemy import pool from alembic import context from models import Listing # Import all models here from database import engine +import sqlmodel from sqlmodel import SQLModel # this is the Alembic Config object, which provides diff --git a/crawler/alembic/versions/4e3b4590920f_add_more_fields_to_tables.py b/crawler/alembic/versions/4e3b4590920f_add_more_fields_to_tables.py deleted file mode 100644 index 2956453..0000000 --- a/crawler/alembic/versions/4e3b4590920f_add_more_fields_to_tables.py +++ /dev/null @@ -1,32 +0,0 @@ -"""add more fields to tables - -Revision ID: 4e3b4590920f -Revises: f7486e403e2f -Create Date: 2025-06-04 21:45:41.383520 - -""" -from typing import Sequence, Union - -from alembic import op -import sqlalchemy as sa - - -# revision identifiers, used by Alembic. -revision: str = '4e3b4590920f' -down_revision: Union[str, None] = 'f7486e403e2f' -branch_labels: Union[str, Sequence[str], None] = None -depends_on: Union[str, Sequence[str], None] = None - - -def upgrade() -> None: - """Upgrade schema.""" - # ### commands auto generated by Alembic - please adjust! ### - op.add_column('buylisting', sa.Column('lease_left', sa.Integer(), nullable=True)) - # ### end Alembic commands ### - - -def downgrade() -> None: - """Downgrade schema.""" - # ### commands auto generated by Alembic - please adjust! ### - op.drop_column('buylisting', 'lease_left') - # ### end Alembic commands ### diff --git a/crawler/alembic/versions/f7486e403e2f_add_more_fields_to_tables.py b/crawler/alembic/versions/b78e1ed31eed_add_more_fields_to_tables.py similarity index 85% rename from crawler/alembic/versions/f7486e403e2f_add_more_fields_to_tables.py rename to crawler/alembic/versions/b78e1ed31eed_add_more_fields_to_tables.py index 66d85c9..eed35ce 100644 --- a/crawler/alembic/versions/f7486e403e2f_add_more_fields_to_tables.py +++ b/crawler/alembic/versions/b78e1ed31eed_add_more_fields_to_tables.py @@ -1,8 +1,8 @@ """add more fields to tables -Revision ID: f7486e403e2f +Revision ID: b78e1ed31eed Revises: -Create Date: 2025-06-04 20:54:13.838969 +Create Date: 2025-06-06 19:50:09.773676 """ @@ -14,7 +14,7 @@ import sqlmodel # revision identifiers, used by Alembic. -revision: str = "f7486e403e2f" +revision: str = "b78e1ed31eed" down_revision: Union[str, None] = None branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None @@ -38,10 +38,12 @@ def upgrade() -> None: ), sa.Column("last_seen", sa.DateTime(), nullable=False), sa.Column("photo_thumbnail", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + sa.Column("additional_info", sa.JSON(), nullable=False), sa.Column("service_charge", sa.Float(), nullable=True), sa.Column( "council_tax_band", sqlmodel.sql.sqltypes.AutoString(), nullable=True ), + sa.Column("lease_left", sa.Integer(), nullable=True), sa.PrimaryKeyConstraint("id"), ) op.create_table( @@ -62,10 +64,17 @@ def upgrade() -> None: ), sa.Column("last_seen", sa.DateTime(), nullable=False), sa.Column("photo_thumbnail", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + sa.Column("additional_info", sa.JSON(), nullable=False), sa.Column("available_from", sa.DateTime(), nullable=True), sa.Column( "furnish_type", - sa.Enum("FURNISHED", "UNFURNISHED", "PART_FURNISHED", name="furnishtype"), + sa.Enum( + "FURNISHED", + "UNFURNISHED", + "PART_FURNISHED", + "UNKNOWN", + name="furnishtype", + ), nullable=False, ), sa.PrimaryKeyConstraint("id"), diff --git a/crawler/main.py b/crawler/main.py index 307944a..cb6b456 100644 --- a/crawler/main.py +++ b/crawler/main.py @@ -156,7 +156,11 @@ def dump_listings( f"{query_parameters}" ) data_dir_path = pathlib.Path(data_dir) - asyncio.run(dump_listings_module.dump_listings(query_parameters, data_dir_path)) + listings = asyncio.run( + dump_listings_module.dump_listings(query_parameters, data_dir_path) + ) + repository = ListingRepository(engine=engine) + asyncio.run(repository.upsert_listings(listings)) @cli.command() diff --git a/crawler/models/listing.py b/crawler/models/listing.py index 568f72e..efdd2aa 100644 --- a/crawler/models/listing.py +++ b/crawler/models/listing.py @@ -31,6 +31,9 @@ class Listing(SQLModel, table=False): listing_site: ListingSite = Field(nullable=False) last_seen: datetime = Field(default_factory=datetime.now, nullable=False) photo_thumbnail: str | None = Field(default=None, nullable=True) + additional_info: Dict[str, Any] = Field( + default_factory=dict, sa_type=JSON, nullable=False + ) class FurnishType(enum.StrEnum):