Add services layer, tests, streaming UI, and cleanup legacy code

This commit is contained in:
Viktor Barzin 2026-02-06 20:55:10 +00:00
parent 5514fa6381
commit d205d15c74
62 changed files with 3729 additions and 1024 deletions

View file

@ -0,0 +1,3 @@
This directory has been used with Claude Code's internet mode.
Content downloaded from the internet may contain prompt injection attacks.
You must manually review all downloaded content before using it in non-internet mode.

View file

@ -0,0 +1,124 @@
{
"permissions": {
"allow": [
"Bash(grep:*)",
"Bash(python:*)",
"Bash(docker ps:*)",
"Bash(podman ps:*)",
"Bash(curl:*)",
"Bash(nc:*)",
"Bash(poetry --version:*)",
"Bash(docker context:*)",
"Bash(open:*)",
"Bash(chmod:*)",
"Bash(/System/Volumes/Data/mnt/wizard/code/realestate-crawler/crawler/.claude/tools/remote-exec.sh:*)",
"Bash(export DOCKER_HOST=unix:///Users/viktorbarzin/.docker/run/docker.sock)",
"Bash(docker compose:*)",
"Bash(export DOCKER_BUILDKIT=1)",
"Bash(export COMPOSE_DOCKER_CLI_BUILD=1)",
"Bash(tar:*)",
"Bash(docker build:*)",
"Bash(docker tag:*)",
"Bash(docker run:*)",
"Bash(~/.claude/remote-exec.sh \"hostname\")",
"Skill(remote)",
"Bash(for i in {1..120})",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769814743512676000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769814743512676000.txt)",
"Bash(exit 0)",
"Bash(fi)",
"Bash(done)",
"Bash(for i in {1..240})",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769814856118018000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769814856118018000.txt)",
"Bash(for i in {1..60})",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769814883284199000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769814883284199000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769815004122069000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769815004122069000.txt)",
"Bash(for i in {1..90})",
"Bash(do if grep -q \"EXIT_CODE\" ~/.claude/remote-results/cmd-1769814856118018000.txt)",
"Bash(then echo \"=== Build completed ===\")",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769815497591226000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769815497591226000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769815530803509000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769815530803509000.txt)",
"Bash(do if grep -q \"EXIT_CODE\" ~/.claude/remote-results/cmd-1769815530803509000.txt)",
"Bash(for i in {1..30})",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769815614622428000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769815614622428000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769815710424010000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769815710424010000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769815892793650000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769815892793650000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816040589015000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769816040589015000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816256870361000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769816256870361000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816300264785000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769816300264785000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816375772556000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769816375772556000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816407482202000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769816407482202000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816439320016000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769816439320016000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816532941427000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769816532941427000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816611986724000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769816611986724000.txt)",
"Bash(for i in {1..40})",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816682085291000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769816682085291000.txt)",
"Bash(for i in {1..20})",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816742848870000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769816742848870000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816763327960000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769816763327960000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816784934447000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769816784934447000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816872796427000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769816872796427000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816892104231000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769816892104231000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816911037685000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769816911037685000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816946320457000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769816946320457000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816987766946000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769816987766946000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769817008932477000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769817008932477000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769817027145242000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769817027145242000.txt)",
"Bash(for file in /mnt/wizard/code/realestate-crawler/crawler/frontend/src/components/ui/*.tsx)",
"Bash(do)",
"Bash(basename:*)",
"Bash(wc:*)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769819894031906000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769819894031906000.txt)",
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769854789336791000.txt ])",
"Bash(then cat ~/.claude/remote-results/cmd-1769854789336791000.txt)",
"Bash(npx tsc:*)",
"Bash(npx eslint:*)",
"Bash(find:*)",
"Bash(sync)",
"Bash(echo:*)",
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875304344407000.txt ])",
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875304344407000.txt)",
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875708563896000.txt ])",
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875708563896000.txt)",
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875753067606000.txt ])",
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875753067606000.txt)",
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875830424071000.txt ])",
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875830424071000.txt)",
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875948670335000.txt ])",
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875948670335000.txt)",
"Bash(sort:*)",
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769876096467703000.txt ])",
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769876096467703000.txt)",
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769876529766339000.txt ])",
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769876529766339000.txt)"
]
}
}

View file

@ -0,0 +1,101 @@
---
name: python-313-redis-generic-type
description: |
Fix for "TypeError: <class 'redis.client.Redis'> is not a generic class" when using
redis-py with Python 3.13. Use when: (1) upgrading to Python 3.13 breaks redis type
annotations, (2) mypy passes but runtime fails with generic class error, (3) using
redis.Redis[str] or similar parameterized types. Covers redis-py generic type
compatibility with Python 3.13's stricter runtime generic checking.
author: Claude Code
version: 1.0.0
date: 2026-01-31
---
# Python 3.13 redis.Redis Generic Type Error
## Problem
Python 3.13 introduced stricter runtime checking for generic types. The redis-py library's
`Redis` class is not defined as a generic class at runtime, even though it works with type
checkers like mypy. This causes a `TypeError` when you use parameterized types like
`redis.Redis[str]` in type annotations that are evaluated at runtime.
## Context / Trigger Conditions
- Python 3.13 or later
- Using redis-py library
- Type annotation like `redis_client: redis.Redis[str]`
- Error message: `TypeError: <class 'redis.client.Redis'> is not a generic class`
- Works fine with mypy but fails at runtime
- Often appears when instantiating a class with this annotation
## Solution
### Option 1: Remove the type parameter (Recommended)
```python
# Before (breaks in Python 3.13)
redis_client: redis.Redis[str]
# After (works in all Python versions)
redis_client: redis.Redis # type: ignore[type-arg]
```
The `# type: ignore[type-arg]` comment silences mypy's warning about missing type arguments.
### Option 2: Use string annotation (deferred evaluation)
```python
from __future__ import annotations
redis_client: "redis.Redis[str]" # String annotation, not evaluated at runtime
```
### Option 3: Use TYPE_CHECKING guard
```python
from typing import TYPE_CHECKING
if TYPE_CHECKING:
RedisClient = redis.Redis[str]
else:
RedisClient = redis.Redis
redis_client: RedisClient
```
## Verification
1. Run your application with Python 3.13
2. The TypeError should no longer appear
3. Run mypy to ensure type checking still works (may need type: ignore comment)
## Example
### Before (Broken)
```python
import redis
class RedisRepository:
redis_client: redis.Redis[str] # TypeError at runtime in Python 3.13
def __init__(self):
self.redis_client = redis.Redis(host='localhost', decode_responses=True)
```
### After (Fixed)
```python
import redis
class RedisRepository:
redis_client: redis.Redis # type: ignore[type-arg]
def __init__(self):
self.redis_client = redis.Redis(host='localhost', decode_responses=True)
```
## Notes
- This is a breaking change in Python 3.13's handling of generic types
- The redis-py library may add proper generic support in future versions
- If using `decode_responses=True`, the client returns `str`; otherwise `bytes`
- The `type: ignore` comment is preferable to `Any` as it preserves some type safety
- This issue affects other libraries that aren't properly defined as Generic classes
## References
- [Python 3.13 Release Notes](https://docs.python.org/3.13/whatsnew/3.13.html)
- [redis-py GitHub Issues](https://github.com/redis/redis-py/issues)
- [PEP 585 - Type Hinting Generics In Standard Collections](https://peps.python.org/pep-0585/)

View file

@ -0,0 +1,132 @@
---
name: python-parentheses-comparison-bug
description: |
Debug Python comparison bug where parentheses around a variable cause unexpected behavior.
Use when: (1) condition always evaluates to False/True unexpectedly, (2) code like
"if (mylist) == 0" never triggers, (3) length check seems to not work, (4) comparison
with list/dict returns unexpected results. Common mistake where parentheses cause the
variable itself to be compared instead of its length.
author: Claude Code
version: 1.0.0
date: 2026-01-31
---
# Python Parentheses Comparison Bug
## Problem
A subtle Python bug where unnecessary parentheses around a variable in a comparison
cause the wrong value to be compared. The expression `(mylist) == 0` compares the list
itself to 0, not its length. Since a list is never equal to an integer, this always
returns False.
## Context / Trigger Conditions
- Condition that should sometimes be True is always False (or vice versa)
- Code pattern like `if (existing_items) == 0:` or `if (result) == expected:`
- The parentheses don't cause a syntax error but change semantics
- Often appears when copying/adapting code or during refactoring
- May pass code review because it "looks" correct
## Solution
### Identify the Bug Pattern
```python
# BUG: Compares list to 0, always False
if (existing_listings) == 0:
return True
# Also wrong: compares list to integer
if (items) == 5:
do_something()
```
### Fix: Use len() for Length Comparisons
```python
# CORRECT: Compares length to 0
if len(existing_listings) == 0:
return True
# Alternative: Use truthiness for empty check
if not existing_listings:
return True
# CORRECT: Compares length to integer
if len(items) == 5:
do_something()
```
## Verification
1. Add a debug print before the condition: `print(f"list={existing_listings}, len={len(existing_listings)}")`
2. Verify the condition now evaluates correctly
3. Write a unit test that exercises both branches of the condition
## Example
### Before (Broken)
```python
class FetchListingDetailsStep:
async def needs_processing(self, listing_id: int) -> bool:
existing_listings = await self.listing_repository.get_listings(
only_ids=[listing_id]
)
# BUG: This compares the list object to 0, which is always False
# The parentheses around existing_listings are misleading
if (existing_listings) == 0:
return True
return False
```
### After (Fixed)
```python
class FetchListingDetailsStep:
async def needs_processing(self, listing_id: int) -> bool:
existing_listings = await self.listing_repository.get_listings(
only_ids=[listing_id]
)
# CORRECT: Check if list is empty using len()
if len(existing_listings) == 0:
return True
return False
```
### Even Better (Pythonic)
```python
class FetchListingDetailsStep:
async def needs_processing(self, listing_id: int) -> bool:
existing_listings = await self.listing_repository.get_listings(
only_ids=[listing_id]
)
# Most Pythonic: Use truthiness
return not existing_listings
```
## Notes
- Python's truthiness: empty collections are falsy, non-empty are truthy
- This bug is particularly insidious because:
- It's syntactically valid
- It doesn't raise an exception
- The parentheses make it look intentional
- Code review may miss it
- Linters like pylint or flake8 won't catch this specific pattern
- Type checkers like mypy may warn about comparing incompatible types
- When debugging, add print statements to verify actual vs expected values
## Prevention
- Prefer `if not mylist:` over `if len(mylist) == 0:`
- Prefer `if mylist:` over `if len(mylist) > 0:`
- Remove unnecessary parentheses around single variables
- Enable mypy's strict mode which may catch type comparison issues
- Write unit tests that exercise both branches of conditions
## Related Patterns
```python
# These are all wrong (comparing object to number):
if (mydict) == 0: # Always False
if (mylist) > 0: # TypeError in Python 3
if (mystring) == 0: # Always False
# These are correct:
if len(mydict) == 0: # True if empty
if not mydict: # True if empty (preferred)
if len(mylist) > 0: # True if non-empty
if mylist: # True if non-empty (preferred)
```

View file

@ -1,13 +0,0 @@
from data_access import Listing
from tqdm import tqdm
# One-off maintenance script: flag listings whose OCR-derived square-meter
# value is missing or implausible (below 10 or above 200 sqm) and re-run
# the OCR extraction for just those listings.
# NOTE(review): calculate_sqm_ocr appears to be defined as async elsewhere
# in this codebase -- confirm whether this call needs to be awaited.
flagged = [
    listing
    for listing in Listing.get_all_listings()
    if (sqm := listing.sqm_ocr) is None or sqm < 10 or sqm > 200
]
for listing in tqdm(flagged):
    listing.calculate_sqm_ocr(recalculate=True)

View file

@ -1,15 +0,0 @@
# recalculate regex from sqm from already previously ocr'ed text
import json
from rec.floorplan import extract_total_sqm
from tqdm import tqdm
from data_access import Listing
# Re-run the regex-based total-sqm extraction over the OCR text that was
# already cached on disk for every listing, then write the records back.
for listing in tqdm(list(Listing.get_all_listings())):
    # Hoist the path lookup: the same file is read and then rewritten.
    ocr_path = listing.path_floorplan_ocr_json()
    with open(ocr_path) as f:
        floorplans = json.load(f)
    for floorplan in floorplans:
        floorplan["estimated_sqm"] = extract_total_sqm(floorplan["text"])
    with open(ocr_path, "w") as f:
        # Bug fix: the original did `floorplans = json.dump(...)`, rebinding
        # the list to None (json.dump returns None). Just write the file.
        json.dump(floorplans, f)

View file

@ -41,6 +41,7 @@ EXPOSE 5001
# Set the entry point (adjust to your CLI's entry point)
# ENTRYPOINT ["python", "/app/main.py"]
# ENTRYPOINT ["/app/runall.sh"]
# CMD ["/bin/bash" ,"-c" ,"alembic upgrade head && uvicorn api.app:app --host 0.0.0.0 --port 8000"]
# ENTRYPOINT ["uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "8000"]
CMD ["./start.sh"]
# For local dev with docker-compose:
# CMD ["./start.sh"]
# For Kubernetes deployment:
CMD ["uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "5001"]

View file

@ -0,0 +1,56 @@
"""add streaming indexes for query optimization
Revision ID: a1b2c3d4e5f6
Revises: e5f1bc4e3323
Create Date: 2026-02-01 12:00:00.000000
"""
from typing import Sequence, Union
from alembic import op
# revision identifiers, used by Alembic.
revision: str = 'a1b2c3d4e5f6'
down_revision: Union[str, None] = 'e5f1bc4e3323'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Add composite and single-column indexes for streaming query optimization."""
    # (index name, table, columns) specs, applied in the original order:
    # composite indexes for the main query pattern first, then the missing
    # single-column indexes for frequently filtered rentlisting columns.
    specs = [
        ('ix_rentlisting_query_composite', 'rentlisting',
         ['number_of_bedrooms', 'price', 'last_seen']),
        ('ix_buylisting_query_composite', 'buylisting',
         ['number_of_bedrooms', 'price', 'last_seen']),
        ('ix_rentlisting_furnish_type', 'rentlisting', ['furnish_type']),
        ('ix_rentlisting_available_from', 'rentlisting', ['available_from']),
    ]
    for index_name, table, columns in specs:
        op.create_index(index_name, table, columns, unique=False)
def downgrade() -> None:
    """Remove streaming indexes."""
    # Drop in reverse order of creation in upgrade().
    for index_name, table in (
        ('ix_rentlisting_available_from', 'rentlisting'),
        ('ix_rentlisting_furnish_type', 'rentlisting'),
        ('ix_buylisting_query_composite', 'buylisting'),
        ('ix_rentlisting_query_composite', 'rentlisting'),
    ):
        op.drop_index(index_name, table_name=table)

View file

@ -19,88 +19,12 @@ depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
"""Upgrade schema."""
# ### commands auto generated by Alembic - please adjust! ###
op.drop_index(op.f('ix_user_email'), table_name='user')
op.drop_table('user')
op.drop_index(op.f('ix_rentlisting_last_seen'), table_name='rentlisting')
op.drop_index(op.f('ix_rentlisting_number_of_bedrooms'), table_name='rentlisting')
op.drop_index(op.f('ix_rentlisting_price'), table_name='rentlisting')
op.drop_index(op.f('ix_rentlisting_square_meters'), table_name='rentlisting')
op.drop_table('rentlisting')
op.drop_index(op.f('ix_buylisting_last_seen'), table_name='buylisting')
op.drop_index(op.f('ix_buylisting_number_of_bedrooms'), table_name='buylisting')
op.drop_index(op.f('ix_buylisting_price'), table_name='buylisting')
op.drop_index(op.f('ix_buylisting_square_meters'), table_name='buylisting')
op.drop_table('buylisting')
# ### end Alembic commands ###
"""Upgrade schema - this migration is now a no-op since tables already have correct column name."""
# The tables were created with 'longitude' (correct spelling) in the initial migration.
# This migration was incorrectly auto-generated and has been fixed to be a no-op.
pass
def downgrade() -> None:
"""Downgrade schema."""
# ### commands auto generated by Alembic - please adjust! ###
op.create_table('buylisting',
sa.Column('id', mysql.INTEGER(), autoincrement=True, nullable=False),
sa.Column('price', mysql.FLOAT(), nullable=False),
sa.Column('number_of_bedrooms', mysql.INTEGER(), autoincrement=False, nullable=False),
sa.Column('square_meters', mysql.FLOAT(), nullable=True),
sa.Column('agency', mysql.VARCHAR(length=255), nullable=True),
sa.Column('council_tax_band', mysql.VARCHAR(length=255), nullable=True),
sa.Column('longtitude', mysql.FLOAT(), nullable=False),
sa.Column('latitude', mysql.FLOAT(), nullable=False),
sa.Column('price_history_json', mysql.TEXT(), nullable=False),
sa.Column('listing_site', mysql.ENUM('RIGHTMOVE'), nullable=False),
sa.Column('last_seen', mysql.DATETIME(), nullable=False),
sa.Column('photo_thumbnail', mysql.VARCHAR(length=255), nullable=True),
sa.Column('floorplan_image_paths', mysql.JSON(), nullable=False),
sa.Column('additional_info', mysql.JSON(), nullable=False),
sa.Column('routing_info_json', mysql.TEXT(), nullable=True),
sa.Column('service_charge', mysql.FLOAT(), nullable=True),
sa.Column('lease_left', mysql.INTEGER(), autoincrement=False, nullable=True),
sa.PrimaryKeyConstraint('id'),
mysql_collate='utf8mb4_0900_ai_ci',
mysql_default_charset='utf8mb4',
mysql_engine='InnoDB'
)
op.create_index(op.f('ix_buylisting_square_meters'), 'buylisting', ['square_meters'], unique=False)
op.create_index(op.f('ix_buylisting_price'), 'buylisting', ['price'], unique=False)
op.create_index(op.f('ix_buylisting_number_of_bedrooms'), 'buylisting', ['number_of_bedrooms'], unique=False)
op.create_index(op.f('ix_buylisting_last_seen'), 'buylisting', ['last_seen'], unique=False)
op.create_table('rentlisting',
sa.Column('id', mysql.INTEGER(), autoincrement=True, nullable=False),
sa.Column('price', mysql.FLOAT(), nullable=False),
sa.Column('number_of_bedrooms', mysql.INTEGER(), autoincrement=False, nullable=False),
sa.Column('square_meters', mysql.FLOAT(), nullable=True),
sa.Column('agency', mysql.VARCHAR(length=255), nullable=True),
sa.Column('council_tax_band', mysql.VARCHAR(length=255), nullable=True),
sa.Column('longtitude', mysql.FLOAT(), nullable=False),
sa.Column('latitude', mysql.FLOAT(), nullable=False),
sa.Column('price_history_json', mysql.TEXT(), nullable=False),
sa.Column('listing_site', mysql.ENUM('RIGHTMOVE'), nullable=False),
sa.Column('last_seen', mysql.DATETIME(), nullable=False),
sa.Column('photo_thumbnail', mysql.VARCHAR(length=255), nullable=True),
sa.Column('floorplan_image_paths', mysql.JSON(), nullable=False),
sa.Column('additional_info', mysql.JSON(), nullable=False),
sa.Column('routing_info_json', mysql.TEXT(), nullable=True),
sa.Column('available_from', mysql.DATETIME(), nullable=True),
sa.Column('furnish_type', mysql.ENUM('FURNISHED', 'UNFURNISHED', 'PART_FURNISHED', 'ASK_LANDLORD', 'UNKNOWN'), nullable=False),
sa.PrimaryKeyConstraint('id'),
mysql_collate='utf8mb4_0900_ai_ci',
mysql_default_charset='utf8mb4',
mysql_engine='InnoDB'
)
op.create_index(op.f('ix_rentlisting_square_meters'), 'rentlisting', ['square_meters'], unique=False)
op.create_index(op.f('ix_rentlisting_price'), 'rentlisting', ['price'], unique=False)
op.create_index(op.f('ix_rentlisting_number_of_bedrooms'), 'rentlisting', ['number_of_bedrooms'], unique=False)
op.create_index(op.f('ix_rentlisting_last_seen'), 'rentlisting', ['last_seen'], unique=False)
op.create_table('user',
sa.Column('id', mysql.INTEGER(), autoincrement=True, nullable=False),
sa.Column('email', mysql.VARCHAR(length=255), nullable=False),
sa.Column('password', mysql.VARCHAR(length=255), nullable=False),
sa.PrimaryKeyConstraint('id'),
mysql_collate='utf8mb4_0900_ai_ci',
mysql_default_charset='utf8mb4',
mysql_engine='InnoDB'
)
op.create_index(op.f('ix_user_email'), 'user', ['email'], unique=True)
# ### end Alembic commands ###
"""Downgrade schema - no-op since upgrade is no-op."""
pass

View file

@ -1,6 +1,6 @@
from pathlib import Path
import pandas as pd
from rec.query import QueryParameters
from models.listing import QueryParameters
from repositories.listing_repository import ListingRepository
@ -10,7 +10,7 @@ async def export_to_csv(
query_parameters: QueryParameters | None = None,
) -> None:
listings = await repository.get_listings(query_parameters=query_parameters)
ds = [*[listing.__dict__ for listing in listings]]
ds = [listing.__dict__ for listing in listings]
df = pd.DataFrame(ds)
# read decisions on file
@ -22,37 +22,19 @@ async def export_to_csv(
drop_columns = ["_sa_instance_state", "additional_info"]
df = df.drop(columns=drop_columns)
# remove all entries where we didnt calculate transit time (probably due to a too far distance)
# df2 = df[df.travel_time_fastest.notna()]
df2 = df
# fill in gap values for service charge and lease left for Excel filters
if "service_charge" not in df.columns:
df.loc[:, "service_charge"] = -1
df.loc[:, "service_charge"] = df.service_charge.fillna(-1)
if "lease_left" not in df.columns:
df.loc[:, "lease_left"] = -1
df.loc[:, "lease_left"] = df.lease_left.fillna(-1)
if "square_meters" not in df.columns:
df.loc[:, "square_meters"] = -1
df.loc[:, "square_meters"] = df.square_meters.fillna(-1)
# drop columns
# dropcolumns = ['distance_per_transit', 'duration_static', 'distance']
# s1 = df2['travel_time_fastest'].apply(pd.Series).drop(dropcolumns, axis=1)
# s1 = df2
# Add price per sqm column
df.loc[:, "price_per_sqm"] = df.price / df.square_meters
# fill in gap values for service charge and lease left. This is for excel so we can use filters better there
if "service_charge" not in df2.columns:
df2.loc[:, "service_charge"] = -1
df2.loc[:, "service_charge"] = df2.service_charge.fillna(-1)
if "lease_left" not in df2.columns:
df2.loc[:, "lease_left"] = -1
df2.loc[:, "lease_left"] = df2.lease_left.fillna(-1)
if "square_meters" not in df2.columns:
df2.loc[:, "square_meters"] = -1
df2.loc[:, "square_meters"] = df2.square_meters.fillna(-1)
df3 = df2
# df3 = pd.concat([df2.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1)
# df3.loc[:, 'duration'] = (df3.loc[:, ['duration']].min(axis=1) / 60).round()
df3.shape
df4 = df3
# df5 = df4[columns]
# Add some interesting columns
df4.loc[:, "price_per_sqm"] = df4.price / df4.square_meters
df5 = df4
df6 = df5.sort_values(by=["price_per_sqm"], ascending=True)
df6.to_csv(str(output_file), index=False)
df = df.sort_values(by=["price_per_sqm"], ascending=True)
df.to_csv(str(output_file), index=False)

View file

@ -4,6 +4,7 @@ from dataclasses import dataclass
import json
import pathlib
from typing import Any, List
import warnings
from models.listing import ListingSite, PriceHistoryItem
from rec import floorplan, routing
import re
@ -12,6 +13,12 @@ import datetime
@dataclass()
class Listing:
"""Legacy Listing class for filesystem-based data access.
.. deprecated::
Use models.listing.RentListing or models.listing.BuyListing instead.
This class is kept for backwards compatibility with the populate_db command.
"""
identifier: int
_details_object: dict[str, Any] | None = None
_listing_object: dict[str, Any] | None = None
@ -36,6 +43,14 @@ class Listing:
"council_tax_band",
]
def __post_init__(self) -> None:
warnings.warn(
"data_access.Listing is deprecated. Use models.listing.RentListing "
"or models.listing.BuyListing instead.",
DeprecationWarning,
stacklevel=3,
)
@staticmethod
def get_all_listings(
listing_paths: list[pathlib.Path],
@ -144,39 +159,6 @@ class Listing:
# todo add check if return is image
return images
def calculate_sqm_model(self):
objs = []
for floorplan_path in self.list_floorplans():
estimated_sqm, model_output, predictions = floorplan.calculate_model(
floorplan_path
)
objs.append(
{
"floorplan_path": str(floorplan_path),
"estimated_sqm": estimated_sqm,
"model_output": model_output,
"no_predictions": len(
predictions
), # cant serialize the predictions itself since its a tensor
}
)
with open(self.path_floorplan_model_json(), "w") as f:
json.dump(objs, f)
@property
def sqm_model(self, recalculate=True) -> float:
if not self.path_floorplan_model_json().exists() or recalculate:
self.calculate_sqm_model()
with open(self.path_floorplan_json()) as f:
objs = json.load(f)
max_sqm = max(
[o["estimated_sqm"] for o in objs if o is None]
) # filter out Nones
return max_sqm
async def calculate_sqm_ocr(self, recalculate=True):
objs = []
if self.path_floorplan_ocr_json().exists():
@ -405,63 +387,6 @@ class Listing:
def listing_site(self) -> ListingSite:
return ListingSite.RIGHTMOVE # this class supports only right move
async def dict_nicely(self):
travel_time_fastest = {}
travel_time_second = {}
if self.path_routing_json().exists():
with open(self.path_routing_json(), "r") as f:
travel_times = json.load(f)
for destination_mode in travel_times.keys():
destination_mode_clean = destination_mode.replace(" ", "_").replace(
",", "_"
)
destination, travel_mode = self.__from_routing_cache_key(
destination_mode
)
travel_time_fastest[destination_mode_clean] = self.travel_time(
destination, travel_mode
)[0]["duration"]
travel_time_second[destination_mode_clean] = self.travel_time(
destination, travel_mode
)[1]["duration"]
return {
"identifier": self.identifier,
"sqm_ocr": await self.sqm_ocr(),
"price": self.price,
"price_per_sqm": await self.price_per_sqm(),
"url": self.url,
"bedrooms": self.bedrooms,
"travel_time_fastest": ":".join(
sorted(
f"{dest} in {travel_mode//60}min"
for dest, travel_mode in travel_time_fastest.items()
)
),
"travel_time_second": ":".join(
sorted(
f"{dest} in {travel_mode//60}min"
for dest, travel_mode in travel_time_second.items()
)
),
"lease_left": self.leaseLeft,
"service_charge": self.serviceCharge,
"development": self.development,
"tenure_type": self.tenure_type,
"updated_days": self.updateDaysAgo,
"status": self.status,
"last_seen": self.last_seen,
"agency": self.agency,
"council_tax_band": self.councilTaxBand,
"photo_thumbnail": self.photoThumbnail,
"let_date_available": (
self.letDateAvailable.strftime("%d/%m/%Y")
if self.letDateAvailable
else "Ask agent"
),
"price_history": self.priceHistory,
}
def __routing_cache_key(
self,
dest_address: str,

View file

@ -14,10 +14,13 @@ services:
interval: 5s
timeout: 3s
retries: 5
networks:
- rec-network
mysql:
image: mysql:9
container_name: rec-mysql
hostname: mysql
ports:
- "3306:3306"
environment:
@ -32,6 +35,9 @@ services:
interval: 10s
timeout: 5s
retries: 5
start_period: 30s
networks:
- rec-network
app:
build:
@ -47,7 +53,7 @@ services:
- app_venv:/app/.venv
environment:
- ENV=dev
- DB_CONNECTION_STRING=mysql://wrongmove:wrongmove@mysql:3306/wrongmove
- DB_CONNECTION_STRING=mysql+mysqldb://wrongmove:wrongmove@mysql:3306/wrongmove
- CELERY_BROKER_URL=redis://redis:6379/0
- CELERY_RESULT_BACKEND=redis://redis:6379/0
- ROUTING_API_KEY=${ROUTING_API_KEY:-}
@ -57,6 +63,8 @@ services:
mysql:
condition: service_healthy
command: ["uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "5001", "--reload", "--reload-dir", "api", "--reload-dir", "services", "--reload-dir", "repositories", "--reload-dir", "models"]
networks:
- rec-network
celery:
build:
@ -68,7 +76,7 @@ services:
- app_venv:/app/.venv
environment:
- ENV=dev
- DB_CONNECTION_STRING=mysql://wrongmove:wrongmove@mysql:3306/wrongmove
- DB_CONNECTION_STRING=mysql+mysqldb://wrongmove:wrongmove@mysql:3306/wrongmove
- CELERY_BROKER_URL=redis://redis:6379/0
- CELERY_RESULT_BACKEND=redis://redis:6379/0
- ROUTING_API_KEY=${ROUTING_API_KEY:-}
@ -79,6 +87,8 @@ services:
mysql:
condition: service_healthy
command: ["celery", "-A", "celery_app", "worker", "--loglevel=info"]
networks:
- rec-network
celery-beat:
build:
@ -90,7 +100,7 @@ services:
- app_venv:/app/.venv
environment:
- ENV=dev
- DB_CONNECTION_STRING=mysql://wrongmove:wrongmove@mysql:3306/wrongmove
- DB_CONNECTION_STRING=mysql+mysqldb://wrongmove:wrongmove@mysql:3306/wrongmove
- CELERY_BROKER_URL=redis://redis:6379/0
- CELERY_RESULT_BACKEND=redis://redis:6379/0
- SCRAPE_SCHEDULES=${SCRAPE_SCHEDULES:-}
@ -98,6 +108,12 @@ services:
- redis
- celery
command: ["celery", "-A", "celery_app", "beat", "--loglevel=info"]
networks:
- rec-network
networks:
rec-network:
driver: bridge
volumes:
redis_data:

183
crawler/docs/BACKEND.md Normal file
View file

@ -0,0 +1,183 @@
# Real Estate Crawler - Backend Documentation
A property listing aggregator that scrapes Rightmove UK, extracts square meters via OCR, and calculates transit routes.
## Quick Start
```bash
# Docker (recommended) - starts Redis, MySQL, API, and Celery
./start.sh
# Or run locally with Poetry
poetry install
./start.sh --local
```
API available at `http://localhost:5001`
## Dependencies
| Dependency | Purpose |
|------------|---------|
| Python 3.11+ | Runtime |
| Redis | Celery message broker |
| MySQL/SQLite | Database |
| Tesseract OCR | Floorplan text extraction |
| Docker | Containerized deployment |
### Python Packages (key)
- `fastapi` + `uvicorn` - HTTP API
- `celery` - Background tasks
- `sqlmodel` - ORM
- `pytesseract` + `opencv` - OCR
- `aiohttp` - Async HTTP client
## API Endpoints
### Health Check
```bash
curl http://localhost:5001/api/status
# {"status": "OK"}
```
### Get Listings
```bash
curl -H "Authorization: Bearer $TOKEN" \
"http://localhost:5001/api/listing?limit=10"
```
### Get Listings as GeoJSON
```bash
curl -H "Authorization: Bearer $TOKEN" \
"http://localhost:5001/api/listing_geojson?listing_type=RENT&min_bedrooms=2&max_price=3000"
```
### Refresh Listings (async)
```bash
curl -X POST -H "Authorization: Bearer $TOKEN" \
"http://localhost:5001/api/refresh_listings?listing_type=RENT&min_bedrooms=2&max_bedrooms=3&min_price=2000&max_price=4000"
# {"task_id": "abc123", "message": "Task abc123 started"}
```
### Check Task Status
```bash
curl -H "Authorization: Bearer $TOKEN" \
"http://localhost:5001/api/task_status?task_id=abc123"
# {"task_id": "abc123", "status": "SUCCESS", "result": "..."}
```
### Get Districts
```bash
curl -H "Authorization: Bearer $TOKEN" \
"http://localhost:5001/api/get_districts"
# {"Westminster": "REGION^93965", "Camden": "REGION^93934", ...}
```
## CLI Commands
```bash
# Fetch listings from Rightmove
python main.py dump-listings -t rent --min-bedrooms 2 --max-price 4000
# Download floorplan images
python main.py dump-images
# Run OCR on floorplans
python main.py detect-floorplan
# Calculate transit routes
python main.py routing -d "10 Downing Street, London" -m TRANSIT -l 10
# Export to GeoJSON
python main.py export-immoweb -O output.geojson -t rent --min-bedrooms 2
# Export to CSV
python main.py export-csv -O output.csv -t rent
# List available districts
python main.py list-districts
```
## Query Parameters
| Parameter | Type | Description |
|-----------|------|-------------|
| `listing_type` | RENT/BUY | Property type |
| `min_bedrooms` | int | Minimum bedrooms |
| `max_bedrooms` | int | Maximum bedrooms |
| `min_price` | int | Minimum price |
| `max_price` | int | Maximum price |
| `min_sqm` | int | Minimum square meters |
| `district` | string | District name (repeatable) |
| `furnish_types` | string | FURNISHED/UNFURNISHED/PART_FURNISHED |
| `last_seen_days` | int | Only listings seen in last N days |
## Architecture
```
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ CLI │ │ HTTP API │ │ Celery │
│ (main.py) │ │ (api/app.py)│ │ Worker │
└──────┬──────┘ └──────┬──────┘ └──────┬──────┘
│ │ │
└───────────────────┼───────────────────┘
┌────────▼────────┐
│ Services │
│ (services/*.py) │
└────────┬────────┘
┌────────────┼────────────┐
│ │ │
┌──────▼──────┐ ┌───▼───┐ ┌──────▼──────┐
│ Repository │ │ Redis │ │ Rightmove │
│ (MySQL) │ │ │ │ API │
└─────────────┘ └───────┘ └─────────────┘
```
## Environment Variables
```bash
# Database
DB_CONNECTION_STRING=mysql+mysqldb://user:pass@localhost:3306/wrongmove
# Redis (Celery)
CELERY_BROKER_URL=redis://localhost:6379/0
CELERY_RESULT_BACKEND=redis://localhost:6379/0
# Google Maps (optional, for routing)
ROUTING_API_KEY=your_api_key
```
## Authentication
API endpoints (except `/api/status`) require JWT authentication via Authentik OIDC.
```bash
# Get token from Authentik, then:
curl -H "Authorization: Bearer $TOKEN" http://localhost:5001/api/listing
```
## Project Structure
```
├── main.py # CLI entry point
├── api/app.py # FastAPI application
├── services/ # Business logic (shared by CLI + API)
│ ├── listing_service.py
│ ├── export_service.py
│ ├── district_service.py
│ └── task_service.py
├── repositories/ # Database access
├── models/ # SQLModel entities
├── rec/ # Core logic (query, OCR, routing)
├── tasks/ # Celery background tasks
└── tests/ # Test suite
```
## Running Tests
```bash
pytest tests/ -v --cov=.
mypy .
```

View file

@ -12,130 +12,47 @@ import {
} from "@/components/ui/sidebar"
import * as React from "react"
// This is sample data.
const data = {
navMain: [
{
title: "Getting Started",
title: "Property Explorer",
url: "#",
items: [
{
title: "Installation",
url: "#",
},
{
title: "Project Structure",
url: "#",
},
],
},
{
title: "Building Your Application",
url: "#",
items: [
{
title: "Routing",
url: "#",
},
{
title: "Data Fetching",
title: "Map View",
url: "#",
isActive: true,
},
{
title: "Rendering",
url: "#",
},
{
title: "Caching",
url: "#",
},
{
title: "Styling",
url: "#",
},
{
title: "Optimizing",
url: "#",
},
{
title: "Configuring",
url: "#",
},
{
title: "Testing",
url: "#",
},
{
title: "Authentication",
url: "#",
},
{
title: "Deploying",
url: "#",
},
{
title: "Upgrading",
url: "#",
},
{
title: "Examples",
title: "List View",
url: "#",
},
],
},
{
title: "API Reference",
title: "Data Management",
url: "#",
items: [
{
title: "Components",
title: "Refresh Listings",
url: "#",
},
{
title: "File Conventions",
url: "#",
},
{
title: "Functions",
url: "#",
},
{
title: "next.config.js Options",
url: "#",
},
{
title: "CLI",
url: "#",
},
{
title: "Edge Runtime",
title: "Active Tasks",
url: "#",
},
],
},
{
title: "Architecture",
title: "Settings",
url: "#",
items: [
{
title: "Accessibility",
title: "Preferences",
url: "#",
},
{
title: "Fast Refresh",
url: "#",
},
{
title: "Next.js Compiler",
url: "#",
},
{
title: "Supported Browsers",
url: "#",
},
{
title: "Turbopack",
title: "Account",
url: "#",
},
],
@ -145,21 +62,19 @@ const data = {
export function AppSidebar({ ...props }: React.ComponentProps<typeof Sidebar>) {
return (
// create closed by default
<Sidebar {...props} >
<Sidebar {...props}>
<SidebarHeader>
</SidebarHeader>
<SidebarContent>
{/* We create a SidebarGroup for each parent. */}
{data.navMain.map((item) => (
<SidebarGroup key={item.title}>
<SidebarGroupLabel>{item.title}</SidebarGroupLabel>
<SidebarGroupContent>
<SidebarMenu>
{item.items.map((item) => (
<SidebarMenuItem key={item.title}>
<SidebarMenuButton asChild isActive={item.isActive}>
<a href={item.url}>{item.title}</a>
{item.items.map((subItem) => (
<SidebarMenuItem key={subItem.title}>
<SidebarMenuButton asChild isActive={subItem.isActive}>
<a href={subItem.url}>{subItem.title}</a>
</SidebarMenuButton>
</SidebarMenuItem>
))}

View file

@ -1,8 +1,6 @@
import { getUser } from "@/auth/authService";
import { zodResolver } from "@hookform/resolvers/zod";
import { DialogTitle } from "@radix-ui/react-dialog";
import type { User } from "oidc-client-ts";
import { useEffect, useState } from "react";
import { useState } from "react";
import { useForm } from "react-hook-form";
import { z } from "zod";
import { Button } from "./ui/button";
@ -24,6 +22,12 @@ export enum ListingType {
BUY = 'BUY'
}
export enum FurnishType {
FURNISHED = 'furnished',
PART_FURNISHED = 'partFurnished',
UNFURNISHED = 'unfurnished',
}
export interface ParameterValues {
metric: Metric
@ -33,30 +37,15 @@ export interface ParameterValues {
min_price?: number
max_price?: number
min_sqm?: number
max_sqm?: number
min_price_per_sqm?: number
max_price_per_sqm?: number
last_seen_days?: number
available_from?: Date
district: string
furnish_types?: FurnishType[]
}
const fetchDistricts = async (user: User | null) => {
const accessToken = user?.access_token;
const response = await fetch('/api/get_districts',
{
method: 'GET',
headers: {
'Authorization': `Bearer ${accessToken}`, // Pass the token
'Content-Type': 'application/json',
},
}
);
if (!response.ok) {
throw new Error('Error: ' + response.status);
}
const data: Response = await response.json();
return data;
};
export function Parameters(
props: {
isOpen: boolean,
@ -69,15 +58,6 @@ export function Parameters(
} = useForm<ParameterValues>()
const [action, setAction] = useState<'fetch-data' | 'visualize' | null>(null)
const [availableFromRawInput, setAvailableFromRawInput] = useState("now");
const [_districts, setDistricts] = useState<string[]>([]);
useEffect(() => {
getUser().then(user => {
fetchDistricts(user).then(data => {
setDistricts(Object.keys(data));
})
})
}, []);
const formSchema = z.object({
metric: z.nativeEnum(Metric, { required_error: "Metric is required" }),
@ -177,29 +157,6 @@ export function Parameters(
</FormItem>
)}
/>
{/* <FormField # listings don't have district stored as metadata; so only useful in rightmove querying
control={form.control}
name="district"
render={({ field }) => (
<FormItem className="flex flex-row items-center gap-4">
<FormLabel>District</FormLabel>
<Select onValueChange={field.onChange} defaultValue={field.value}>
<FormControl>
<SelectTrigger className="w-[180px]">
<SelectValue placeholder="District" />
</SelectTrigger>
</FormControl>
<SelectContent {...register('district')} >
{districts.map((district, index) => (
<SelectItem key={index} value={district}>{district}
</SelectItem>
))}
</SelectContent>
</Select>
<FormMessage />
</FormItem>
)}
/> */}
<FormField
control={form.control}
name="min_sqm"

View file

@ -0,0 +1,128 @@
import { BarChart3, MapPin, PoundSterling, Maximize2, List, Map as MapIcon } from 'lucide-react';
import { Button } from './ui/button';
import type { GeoJSONFeatureCollection, PropertyFeature } from '@/types';
// Layout modes for the listings area: map only, list only, or side-by-side split.
export type ViewMode = 'map' | 'list' | 'split';

interface StatsBarProps {
  // Currently loaded listings; null until the first fetch completes.
  listingData: GeoJSONFeatureCollection | null;
  // Active layout mode (highlighted in the toggle group).
  viewMode: ViewMode;
  // Invoked with the newly selected mode when a toggle button is pressed.
  onViewModeChange: (mode: ViewMode) => void;
}
// Aggregate figures shown in the stats bar.
interface ListingStats {
  count: number;
  avgPrice: number;
  avgPricePerSqm: number;
  avgSize: number;
}

/**
 * Compute aggregate stats over the loaded listings.
 * Only strictly positive numeric values contribute to each average, so
 * listings with missing or zero price/size are ignored per-metric while
 * still counting toward the total listing count.
 */
function calculateStats(data: GeoJSONFeatureCollection | null): ListingStats {
  const features = data?.features ?? [];
  if (features.length === 0) {
    return { count: 0, avgPrice: 0, avgPricePerSqm: 0, avgSize: 0 };
  }

  // Pull one numeric property off every feature, keeping only valid (> 0) values.
  const collect = (pick: (f: PropertyFeature) => number | undefined): number[] =>
    features
      .map(pick)
      .filter((v): v is number => typeof v === 'number' && v > 0);

  // Average of a list, or 0 when no valid values were found.
  const mean = (values: number[]): number =>
    values.length > 0 ? values.reduce((sum, v) => sum + v, 0) / values.length : 0;

  return {
    count: features.length,
    avgPrice: mean(collect((f) => f.properties.total_price)),
    avgPricePerSqm: mean(collect((f) => f.properties.qmprice)),
    avgSize: mean(collect((f) => f.properties.qm)),
  };
}
// Format a GBP amount compactly: values ≥ £1000 become "£N.Nk", smaller
// values are rounded to whole pounds.
function formatCurrency(value: number): string {
  return value >= 1000
    ? `£${(value / 1000).toFixed(1)}k`
    : `£${Math.round(value)}`;
}
/**
 * Bottom status bar: aggregate stats for the currently loaded listings on the
 * left, and a map / list / split view-mode toggle on the right.
 * Average price/size figures are hidden entirely until at least one listing
 * with a valid price has loaded (avgPrice > 0).
 */
export function StatsBar({ listingData, viewMode, onViewModeChange }: StatsBarProps) {
  const stats = calculateStats(listingData);

  return (
    <div className="flex items-center justify-between px-4 py-2 bg-muted/50 border-t text-sm">
      {/* Stats */}
      <div className="flex items-center gap-4 text-muted-foreground">
        {/* Listing count is always visible; the "listings" label drops out on very small screens. */}
        <div className="flex items-center gap-1.5">
          <MapPin className="h-4 w-4" />
          <span className="font-medium text-foreground">{stats.count.toLocaleString()}</span>
          <span className="hidden sm:inline">listings</span>
        </div>
        {stats.avgPrice > 0 && (
          <>
            {/* Averages are progressively revealed at md/lg breakpoints. */}
            <div className="hidden md:flex items-center gap-1.5">
              <PoundSterling className="h-4 w-4" />
              <span>Avg: <span className="font-medium text-foreground">{formatCurrency(stats.avgPrice)}</span></span>
            </div>
            <div className="hidden lg:flex items-center gap-1.5">
              <BarChart3 className="h-4 w-4" />
              <span>Avg £/m²: <span className="font-medium text-foreground">{formatCurrency(stats.avgPricePerSqm)}</span></span>
            </div>
            <div className="hidden lg:flex items-center gap-1.5">
              <Maximize2 className="h-4 w-4" />
              <span>Avg: <span className="font-medium text-foreground">{Math.round(stats.avgSize)} m²</span></span>
            </div>
          </>
        )}
      </div>

      {/* View Mode Toggle */}
      <div className="flex items-center gap-1 bg-background rounded-md border p-0.5">
        <Button
          variant={viewMode === 'map' ? 'secondary' : 'ghost'}
          size="sm"
          className="h-7 px-2"
          onClick={() => onViewModeChange('map')}
        >
          <MapIcon className="h-4 w-4" />
          <span className="hidden sm:inline ml-1">Map</span>
        </Button>
        <Button
          variant={viewMode === 'list' ? 'secondary' : 'ghost'}
          size="sm"
          className="h-7 px-2"
          onClick={() => onViewModeChange('list')}
        >
          <List className="h-4 w-4" />
          <span className="hidden sm:inline ml-1">List</span>
        </Button>
        {/* Split view is only offered on md+ screens where two panes fit. */}
        <Button
          variant={viewMode === 'split' ? 'secondary' : 'ghost'}
          size="sm"
          className="h-7 px-2 hidden md:flex"
          onClick={() => onViewModeChange('split')}
        >
          {/* Tiny hand-drawn "two panes" glyph instead of an icon. */}
          <div className="flex gap-0.5">
            <div className="w-2 h-4 bg-current rounded-sm opacity-60" />
            <div className="w-2 h-4 border border-current rounded-sm" />
          </div>
          <span className="hidden sm:inline ml-1">Split</span>
        </Button>
      </div>
    </div>
  );
}

View file

@ -0,0 +1,47 @@
import { Loader2 } from 'lucide-react';
import type { StreamingProgress } from '@/services';
interface StreamingProgressBarProps {
  // Latest progress event from the streaming service; null before the first event arrives.
  progress: StreamingProgress | null;
  // Whether a streaming load is in flight; the component renders nothing when false.
  isLoading: boolean;
}

/**
 * Overlay bar pinned to the top of its positioned ancestor showing streaming
 * load progress: a spinner, a running count of loaded listings, and a
 * progress bar. The bar is determinate when `progress.total` is known and
 * pulses at full width otherwise.
 */
export function StreamingProgressBar({ progress, isLoading }: StreamingProgressBarProps) {
  if (!isLoading) return null;

  return (
    <div className="absolute top-0 left-0 right-0 z-10 bg-background/95 backdrop-blur-sm border-b px-4 py-2">
      <div className="flex items-center gap-3">
        <Loader2 className="h-4 w-4 animate-spin text-primary" />
        <div className="flex-1">
          <div className="flex items-center justify-between text-sm">
            <span className="font-medium">
              {progress
                ? `Loading listings...`
                : 'Loading...'}
            </span>
            {/* Count readout, with "/ total" only when the server reported a total. */}
            {progress && (
              <span className="text-muted-foreground">
                {progress.count.toLocaleString()}
                {progress.total ? ` / ${progress.total.toLocaleString()}` : ''} loaded
              </span>
            )}
          </div>
          {progress && (
            <div className="mt-1 h-1.5 w-full bg-primary/20 rounded-full overflow-hidden">
              <div
                className="h-full bg-primary transition-all duration-300 ease-out rounded-full"
                style={{
                  // Determinate fill capped at 100%; indeterminate loads pulse at full width.
                  width: progress.total
                    ? `${Math.min((progress.count / progress.total) * 100, 100)}%`
                    : '100%',
                  animation: progress.total ? undefined : 'pulse 1.5s ease-in-out infinite',
                }}
              />
            </div>
          )}
        </div>
      </div>
    </div>
  );
}

View file

@ -0,0 +1,56 @@
"use client"
import * as React from "react"
import * as AccordionPrimitive from "@radix-ui/react-accordion"
import { ChevronDown } from "lucide-react"
import { cn } from "@/lib/utils"
// shadcn-style wrappers around the Radix Accordion primitives.

// Root container; passed straight through to Radix unchanged.
const Accordion = AccordionPrimitive.Root

// One collapsible section, rendered with a bottom border.
const AccordionItem = React.forwardRef<
  React.ComponentRef<typeof AccordionPrimitive.Item>,
  React.ComponentPropsWithoutRef<typeof AccordionPrimitive.Item>
>(({ className, ...props }, ref) => (
  <AccordionPrimitive.Item
    ref={ref}
    className={cn("border-b", className)}
    {...props}
  />
))
AccordionItem.displayName = "AccordionItem"

// Clickable header row; the chevron rotates 180° while the item is open,
// driven by Radix's data-state attribute via the arbitrary Tailwind selector.
const AccordionTrigger = React.forwardRef<
  React.ComponentRef<typeof AccordionPrimitive.Trigger>,
  React.ComponentPropsWithoutRef<typeof AccordionPrimitive.Trigger>
>(({ className, children, ...props }, ref) => (
  <AccordionPrimitive.Header className="flex">
    <AccordionPrimitive.Trigger
      ref={ref}
      className={cn(
        "flex flex-1 items-center justify-between py-4 text-sm font-medium transition-all hover:underline text-left [&[data-state=open]>svg]:rotate-180",
        className
      )}
      {...props}
    >
      {children}
      <ChevronDown className="h-4 w-4 shrink-0 text-muted-foreground transition-transform duration-200" />
    </AccordionPrimitive.Trigger>
  </AccordionPrimitive.Header>
))
AccordionTrigger.displayName = AccordionPrimitive.Trigger.displayName

// Collapsible body; open/close height animation comes from the
// animate-accordion-up/down classes defined in the global stylesheet.
const AccordionContent = React.forwardRef<
  React.ComponentRef<typeof AccordionPrimitive.Content>,
  React.ComponentPropsWithoutRef<typeof AccordionPrimitive.Content>
>(({ className, children, ...props }, ref) => (
  <AccordionPrimitive.Content
    ref={ref}
    className="overflow-hidden text-sm data-[state=closed]:animate-accordion-up data-[state=open]:animate-accordion-down"
    {...props}
  >
    <div className={cn("pb-4 pt-0", className)}>{children}</div>
  </AccordionPrimitive.Content>
))
AccordionContent.displayName = AccordionPrimitive.Content.displayName
export { Accordion, AccordionItem, AccordionTrigger, AccordionContent }

View file

@ -1,66 +0,0 @@
import * as React from "react"
import { cva, type VariantProps } from "class-variance-authority"
import { cn } from "@/lib/utils"
const alertVariants = cva(
"relative w-full rounded-lg border px-4 py-3 text-sm grid has-[>svg]:grid-cols-[calc(var(--spacing)*4)_1fr] grid-cols-[0_1fr] has-[>svg]:gap-x-3 gap-y-0.5 items-start [&>svg]:size-4 [&>svg]:translate-y-0.5 [&>svg]:text-current",
{
variants: {
variant: {
default: "bg-card text-card-foreground",
destructive:
"text-destructive bg-card [&>svg]:text-current *:data-[slot=alert-description]:text-destructive/90",
},
},
defaultVariants: {
variant: "default",
},
}
)
function Alert({
className,
variant,
...props
}: React.ComponentProps<"div"> & VariantProps<typeof alertVariants>) {
return (
<div
data-slot="alert"
role="alert"
className={cn(alertVariants({ variant }), className)}
{...props}
/>
)
}
function AlertTitle({ className, ...props }: React.ComponentProps<"div">) {
return (
<div
data-slot="alert-title"
className={cn(
"col-start-2 line-clamp-1 min-h-4 font-medium tracking-tight",
className
)}
{...props}
/>
)
}
function AlertDescription({
className,
...props
}: React.ComponentProps<"div">) {
return (
<div
data-slot="alert-description"
className={cn(
"text-muted-foreground col-start-2 grid justify-items-start gap-1 text-sm [&_p]:leading-relaxed",
className
)}
{...props}
/>
)
}
export { Alert, AlertTitle, AlertDescription }

View file

@ -1,46 +0,0 @@
import * as React from "react"
import { Slot } from "@radix-ui/react-slot"
import { cva, type VariantProps } from "class-variance-authority"
import { cn } from "@/lib/utils"
const badgeVariants = cva(
"inline-flex items-center justify-center rounded-md border px-2 py-0.5 text-xs font-medium w-fit whitespace-nowrap shrink-0 [&>svg]:size-3 gap-1 [&>svg]:pointer-events-none focus-visible:border-ring focus-visible:ring-ring/50 focus-visible:ring-[3px] aria-invalid:ring-destructive/20 dark:aria-invalid:ring-destructive/40 aria-invalid:border-destructive transition-[color,box-shadow] overflow-hidden",
{
variants: {
variant: {
default:
"border-transparent bg-primary text-primary-foreground [a&]:hover:bg-primary/90",
secondary:
"border-transparent bg-secondary text-secondary-foreground [a&]:hover:bg-secondary/90",
destructive:
"border-transparent bg-destructive text-white [a&]:hover:bg-destructive/90 focus-visible:ring-destructive/20 dark:focus-visible:ring-destructive/40 dark:bg-destructive/60",
outline:
"text-foreground [a&]:hover:bg-accent [a&]:hover:text-accent-foreground",
},
},
defaultVariants: {
variant: "default",
},
}
)
function Badge({
className,
variant,
asChild = false,
...props
}: React.ComponentProps<"span"> &
VariantProps<typeof badgeVariants> & { asChild?: boolean }) {
const Comp = asChild ? Slot : "span"
return (
<Comp
data-slot="badge"
className={cn(badgeVariants({ variant }), className)}
{...props}
/>
)
}
export { Badge, badgeVariants }

View file

@ -1,6 +1,6 @@
import * as React from "react"
import { Slot } from "@radix-ui/react-slot"
import { ChevronRight, MoreHorizontal } from "lucide-react"
import { ChevronRight } from "lucide-react"
import { cn } from "@/lib/utils"
@ -80,24 +80,6 @@ function BreadcrumbSeparator({
)
}
function BreadcrumbEllipsis({
className,
...props
}: React.ComponentProps<"span">) {
return (
<span
data-slot="breadcrumb-ellipsis"
role="presentation"
aria-hidden="true"
className={cn("flex size-9 items-center justify-center", className)}
{...props}
>
<MoreHorizontal className="size-4" />
<span className="sr-only">More</span>
</span>
)
}
export {
Breadcrumb,
BreadcrumbList,
@ -105,5 +87,4 @@ export {
BreadcrumbLink,
BreadcrumbPage,
BreadcrumbSeparator,
BreadcrumbEllipsis,
}

View file

@ -0,0 +1,29 @@
"use client"
import * as React from "react"
import * as CheckboxPrimitive from "@radix-ui/react-checkbox"
import { Check } from "lucide-react"
import { cn } from "@/lib/utils"
// shadcn-style checkbox wrapping the Radix Checkbox primitive.
// Forwards the ref to the Radix root; the Check icon is only shown while
// Radix renders the Indicator (i.e. when the box is checked/indeterminate).
const Checkbox = React.forwardRef<
  React.ComponentRef<typeof CheckboxPrimitive.Root>,
  React.ComponentPropsWithoutRef<typeof CheckboxPrimitive.Root>
>(({ className, ...props }, ref) => (
  <CheckboxPrimitive.Root
    ref={ref}
    className={cn(
      "peer h-4 w-4 shrink-0 rounded-sm border border-primary shadow focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:cursor-not-allowed disabled:opacity-50 data-[state=checked]:bg-primary data-[state=checked]:text-primary-foreground",
      className
    )}
    {...props}
  >
    <CheckboxPrimitive.Indicator
      className={cn("flex items-center justify-center text-current")}
    >
      <Check className="h-4 w-4" />
    </CheckboxPrimitive.Indicator>
  </CheckboxPrimitive.Root>
))
Checkbox.displayName = CheckboxPrimitive.Root.displayName
export { Checkbox }

View file

@ -0,0 +1,34 @@
"use client"
import * as React from "react"
import * as SliderPrimitive from "@radix-ui/react-slider"
import { cn } from "@/lib/utils"
const Slider = React.forwardRef<
React.ComponentRef<typeof SliderPrimitive.Root>,
React.ComponentPropsWithoutRef<typeof SliderPrimitive.Root>
>(({ className, ...props }, ref) => (
<SliderPrimitive.Root
ref={ref}
className={cn(
"relative flex w-full touch-none select-none items-center",
className
)}
{...props}
>
<SliderPrimitive.Track className="relative h-1.5 w-full grow overflow-hidden rounded-full bg-primary/20">
<SliderPrimitive.Range className="absolute h-full bg-primary" />
</SliderPrimitive.Track>
{props.defaultValue?.map((_, index) => (
<SliderPrimitive.Thumb
key={index}
className="block h-4 w-4 rounded-full border border-primary/50 bg-background shadow transition-colors focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:pointer-events-none disabled:opacity-50"
/>
)) ?? (
<SliderPrimitive.Thumb className="block h-4 w-4 rounded-full border border-primary/50 bg-background shadow transition-colors focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:pointer-events-none disabled:opacity-50" />
)}
</SliderPrimitive.Root>
))
Slider.displayName = SliderPrimitive.Root.displayName
export { Slider }

View file

@ -118,3 +118,30 @@
@apply bg-background text-foreground;
}
}
/* Accordion animations */
@keyframes accordion-down {
from {
height: 0;
}
to {
height: var(--radix-accordion-content-height);
}
}
@keyframes accordion-up {
from {
height: var(--radix-accordion-content-height);
}
to {
height: 0;
}
}
.animate-accordion-down {
animation: accordion-down 0.2s ease-out;
}
.animate-accordion-up {
animation: accordion-up 0.2s ease-out;
}

View file

@ -0,0 +1,62 @@
// Generic API client with authentication
import type { User } from 'oidc-client-ts';
import { ApiError } from '@/types';
export interface RequestOptions {
method?: 'GET' | 'POST' | 'PUT' | 'DELETE';
params?: Record<string, string | number | boolean | Date | undefined>;
}
/**
 * Serialize a params object into a URL query string.
 * Entries whose value is undefined, null, or the empty string are skipped;
 * Date values are serialized as ISO-8601 timestamps.
 */
function buildQueryString(params: Record<string, string | number | boolean | Date | undefined>): string {
  const search = new URLSearchParams();
  for (const [key, value] of Object.entries(params)) {
    // Guard clause: drop absent/empty values instead of sending them.
    if (value === undefined || value === null || value === '') continue;
    search.append(key, value instanceof Date ? value.toISOString() : String(value));
  }
  return search.toString();
}
/**
* Generic authenticated API request
*/
export async function apiRequest<T>(
user: User,
endpoint: string,
options: RequestOptions = {}
): Promise<T> {
const { method = 'GET', params } = options;
const accessToken = user.access_token;
let url = endpoint;
if (params) {
const queryString = buildQueryString(params);
if (queryString) {
url = `${endpoint}?${queryString}`;
}
}
const response = await fetch(url, {
method,
headers: {
Authorization: `Bearer ${accessToken}`,
'Content-Type': 'application/json',
},
});
if (!response.ok) {
throw new ApiError(`Error: ${response.status}`, response.status);
}
return response.json() as Promise<T>;
}

View file

@ -0,0 +1,54 @@
// Listing service for fetching and refreshing listings
import type { User } from 'oidc-client-ts';
import type { GeoJSONFeatureCollection, RefreshListingsResponse } from '@/types';
import type { ParameterValues } from '@/components/FilterPanel';
import { apiRequest } from './apiClient';
import { API_ENDPOINTS } from '@/constants';
/**
 * Build listing query parameters from form values.
 * Key order is preserved deliberately: it determines the order of query-string
 * parameters produced downstream. Empty district strings and empty
 * furnish-type lists are normalized to undefined so they are omitted.
 */
function buildListingParams(parameters: ParameterValues): Record<string, string | number | boolean | Date | undefined> {
  const {
    listing_type,
    min_bedrooms,
    max_bedrooms,
    min_price,
    max_price,
    min_sqm,
    max_sqm,
    min_price_per_sqm,
    max_price_per_sqm,
    last_seen_days,
    available_from,
    district,
    furnish_types,
  } = parameters;

  return {
    listing_type,
    min_bedrooms,
    max_bedrooms,
    max_price,
    min_price,
    min_sqm,
    max_sqm,
    min_price_per_sqm,
    max_price_per_sqm,
    last_seen_days,
    let_date_available_from: available_from,
    district_names: district || undefined,
    furnish_types: furnish_types && furnish_types.length > 0 ? furnish_types.join(',') : undefined,
  };
}
/**
 * Fetch listing data as GeoJSON.
 *
 * Issues an authenticated GET to the listing-GeoJSON endpoint with the filter
 * parameters translated from the form values by buildListingParams.
 *
 * @param user Authenticated OIDC user whose access token is attached.
 * @param parameters Filter values from the parameters form.
 * @returns The matching listings as a GeoJSON FeatureCollection.
 */
export async function fetchListingGeoJSON(
  user: User,
  parameters: ParameterValues
): Promise<GeoJSONFeatureCollection> {
  return apiRequest<GeoJSONFeatureCollection>(user, API_ENDPOINTS.LISTING_GEOJSON, {
    method: 'GET',
    params: buildListingParams(parameters),
  });
}
/**
 * Trigger a listing refresh task.
 *
 * Issues an authenticated POST to the refresh endpoint with the same filter
 * parameters used for fetching; the server responds with task information
 * (see RefreshListingsResponse) rather than listing data.
 *
 * @param user Authenticated OIDC user whose access token is attached.
 * @param parameters Filter values from the parameters form.
 */
export async function refreshListings(
  user: User,
  parameters: ParameterValues
): Promise<RefreshListingsResponse> {
  return apiRequest<RefreshListingsResponse>(user, API_ENDPOINTS.REFRESH_LISTINGS, {
    method: 'POST',
    params: buildListingParams(parameters),
  });
}

View file

@ -0,0 +1,45 @@
// Map utility functions
/**
 * Deep clone an object using JSON serialization.
 *
 * NOTE(review): the JSON round-trip drops `undefined` values, functions and
 * symbols, and converts `Date` instances to ISO strings — presumably fine for
 * the plain GeoJSON-style data this app passes around, but not a
 * general-purpose deep clone; confirm before reusing on richer objects.
 */
export function clone<T>(obj: T): T {
  return JSON.parse(JSON.stringify(obj));
}
/**
 * Calculate the value at a given percentile in a sorted array, linearly
 * interpolating between the two nearest ranks.
 * @param arr Sorted array of numbers (ascending)
 * @param p Percentile as a fraction in [0, 1]
 */
export function percentile(arr: number[], p: number): number {
  // Empty input yields 0; this check runs before p validation on purpose,
  // matching the original behavior for (empty array, bad p) inputs.
  if (arr.length === 0) return 0;
  if (typeof p !== 'number') throw new TypeError('p must be a number');
  // Out-of-range percentiles clamp to the array's extremes.
  if (p <= 0) return arr[0];
  if (p >= 1) return arr[arr.length - 1];

  const pos = arr.length * p;
  const lo = Math.floor(pos);
  const hi = lo + 1;
  const frac = pos - lo;

  // If the upper rank falls past the end, the lower rank stands alone.
  if (hi >= arr.length) return arr[lo];
  return arr[lo] * (1 - frac) + arr[hi] * frac;
}
/**
 * Convert percentage-based color stops to value-based color stops by linearly
 * mapping each percentage onto the [min, max] range.
 * @param colorStopsPerc Array of [percentage (0-100), color] tuples
 * @param min Minimum value of the target range
 * @param max Maximum value of the target range
 */
export function calculateColorStops(
  colorStopsPerc: [number, string][],
  min: number,
  max: number
): [number, string][] {
  const span = max - min;
  return colorStopsPerc.map(
    ([perc, color]): [number, string] => [min + (perc / 100) * span, color]
  );
}

View file

@ -1 +1 @@
{"root":["./src/App.tsx","./src/AppSidebar.tsx","./src/main.tsx","./src/vite-env.d.ts","./src/auth/authService.ts","./src/auth/config.ts","./src/components/ActiveQuery.tsx","./src/components/AlertError.tsx","./src/components/LoginModal.tsx","./src/components/Map.tsx","./src/components/Parameters.tsx","./src/components/ui/alert-dialog.tsx","./src/components/ui/alert.tsx","./src/components/ui/badge.tsx","./src/components/ui/breadcrumb.tsx","./src/components/ui/button.tsx","./src/components/ui/dialog.tsx","./src/components/ui/form.tsx","./src/components/ui/hover-card.tsx","./src/components/ui/input.tsx","./src/components/ui/label.tsx","./src/components/ui/progress.tsx","./src/components/ui/scroll-area.tsx","./src/components/ui/select.tsx","./src/components/ui/separator.tsx","./src/components/ui/sheet.tsx","./src/components/ui/sidebar.tsx","./src/components/ui/skeleton.tsx","./src/components/ui/tooltip.tsx","./src/hooks/use-mobile.ts","./src/lib/utils.ts"],"version":"5.8.3"}
{"root":["./src/App.tsx","./src/AppSidebar.tsx","./src/main.tsx","./src/vite-env.d.ts","./src/auth/authService.ts","./src/auth/config.ts","./src/auth/errors.ts","./src/components/ActiveQuery.tsx","./src/components/AlertError.tsx","./src/components/AuthCallback.tsx","./src/components/FilterPanel.tsx","./src/components/Header.tsx","./src/components/HealthIndicator.tsx","./src/components/ListView.tsx","./src/components/LoginModal.tsx","./src/components/Map.tsx","./src/components/Parameters.tsx","./src/components/PropertyCard.tsx","./src/components/Spinner.tsx","./src/components/StatsBar.tsx","./src/components/StreamingProgressBar.tsx","./src/components/TaskIndicator.tsx","./src/components/ui/DatePicker.tsx","./src/components/ui/accordion.tsx","./src/components/ui/alert-dialog.tsx","./src/components/ui/breadcrumb.tsx","./src/components/ui/button.tsx","./src/components/ui/calendar.tsx","./src/components/ui/checkbox.tsx","./src/components/ui/dialog.tsx","./src/components/ui/form.tsx","./src/components/ui/hover-card.tsx","./src/components/ui/input.tsx","./src/components/ui/label.tsx","./src/components/ui/popover.tsx","./src/components/ui/progress.tsx","./src/components/ui/scroll-area.tsx","./src/components/ui/select.tsx","./src/components/ui/separator.tsx","./src/components/ui/sheet.tsx","./src/components/ui/sidebar.tsx","./src/components/ui/skeleton.tsx","./src/components/ui/slider.tsx","./src/components/ui/tooltip.tsx","./src/constants/colorSchemes.ts","./src/constants/index.ts","./src/hooks/use-mobile.ts","./src/lib/utils.ts","./src/services/apiClient.ts","./src/services/healthService.ts","./src/services/index.ts","./src/services/listingService.ts","./src/services/streamingService.ts","./src/services/taskService.ts","./src/types/index.ts","./src/utils/mapUtils.ts"],"version":"5.8.3"}

View file

@ -19,7 +19,8 @@ export default defineConfig({
allowedHosts: [
env.DEV_HOST ?? 'localhost',
// Add more hosts here
'wrongmove.viktorbarzin.me'
'wrongmove.viktorbarzin.me',
'devvm.viktorbarzin.lan'
],
}
})

View file

@ -1,28 +1,28 @@
"""CLI entry point for the Real Estate Crawler."""
import asyncio
from datetime import datetime
import os
import pathlib
from typing import Callable, ParamSpec, TypeVar
import click
import importlib
from models.listing import FurnishType, ListingType, QueryParameters
from rec.districts import get_districts
from data_access import Listing
import csv_exporter
from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode
from repositories.listing_repository import ListingRepository
from ui_exporter import export_immoweb as export_immoweb_ui
from functools import wraps
from database import engine
from services import (
listing_service,
export_service,
district_service,
)
P = ParamSpec("P")
R = TypeVar("R")
dump_listings_module = importlib.import_module("1_dump_listings")
dump_images_module = importlib.import_module("3_dump_images")
detect_floorplan_module = importlib.import_module("4_detect_floorplan")
routing_module = importlib.import_module("5_routing")
def listing_filter_options(func):
def listing_filter_options(func: Callable[P, R]) -> Callable[P, R]:
"""Decorator to add common options for filtering listings."""
@click.option(
@ -45,7 +45,7 @@ def listing_filter_options(func):
"--max-bedrooms",
default=10,
help="Maximum number of bedrooms",
type=click.IntRange(min=1, max=10), # Right move gets unhappy with >10
type=click.IntRange(min=1, max=10),
)
@click.option(
"--min-price",
@ -57,13 +57,13 @@ def listing_filter_options(func):
"--max-price",
default=999_999,
help="Maximum price",
type=click.IntRange(min=0), # 40k for renting
type=click.IntRange(min=0),
)
@click.option(
"--district",
default=None,
help="Districts to scrape",
type=click.Choice(get_districts().keys(), case_sensitive=False),
type=click.Choice(district_service.get_district_names(), case_sensitive=False),
multiple=True,
)
@click.option(
@ -95,17 +95,50 @@ def listing_filter_options(func):
type=int,
)
@wraps(func)
def wrapper(*args, **kwargs):
def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
return func(*args, **kwargs)
return wrapper
def build_query_parameters(
    type: str,
    district: list[str],
    min_bedrooms: int,
    max_bedrooms: int,
    min_price: int,
    max_price: int,
    furnish_types: list[str],
    available_from: datetime | None,
    last_seen_days: int,
    min_sqm: int | None = None,
    radius: int = 0,
    page_size: int = 500,
    max_days_since_added: int = 14,
) -> QueryParameters:
    """Translate raw CLI option values into a ``QueryParameters`` model.

    The arguments mirror the click options declared by
    ``listing_filter_options``; enum names arrive as strings and are
    converted to their enum members here so commands never handle raw
    strings.
    """
    # Empty selections collapse to None so downstream code treats them as
    # "no filter" rather than "match nothing".
    districts = set(district) or None
    furnishing = [FurnishType[name] for name in furnish_types] if furnish_types else None
    return QueryParameters(
        listing_type=ListingType[type],
        district_names=districts,
        min_bedrooms=min_bedrooms,
        max_bedrooms=max_bedrooms,
        min_price=min_price,
        max_price=max_price,
        furnish_types=furnishing,
        let_date_available_from=available_from,
        last_seen_days=last_seen_days,
        min_sqm=min_sqm,
        radius=radius,
        page_size=page_size,
        max_days_since_added=max_days_since_added,
    )
@click.group()
@click.option(
"--data-dir",
default=pathlib.Path("data/rs/"),
help="Districts to scrape",
help="Data directory for storing listings",
type=click.Path(
writable=True,
file_okay=False,
@ -114,17 +147,18 @@ def listing_filter_options(func):
),
)
@click.pass_context
def cli(ctx, data_dir: str):
def cli(ctx: click.Context, data_dir: str) -> None:
ctx.ensure_object(dict)
ctx.obj["data_dir"] = data_dir
ctx.obj["data_dir"] = pathlib.Path(data_dir)
ctx.obj["repository"] = ListingRepository(engine=engine)
@cli.command()
@listing_filter_options
@click.option("--full", is_flag=True)
@click.option("--full", is_flag=True, help="Include images and floorplan detection")
@click.pass_context
def dump_listings(
ctx: click.core.Context,
ctx: click.Context,
full: bool,
district: list[str],
min_bedrooms: int,
@ -136,58 +170,63 @@ def dump_listings(
available_from: datetime | None,
last_seen_days: int,
min_sqm: int | None = None,
):
data_dir: str = ctx.obj["data_dir"]
query_parameters = QueryParameters(
listing_type=ListingType[type],
district_names=set(district),
) -> None:
"""Fetch listings from Rightmove API."""
data_dir: pathlib.Path = ctx.obj["data_dir"]
repository: ListingRepository = ctx.obj["repository"]
query_parameters = build_query_parameters(
type=type,
district=district,
min_bedrooms=min_bedrooms,
max_bedrooms=max_bedrooms,
min_price=min_price,
max_price=max_price,
furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types],
let_date_available_from=available_from,
furnish_types=furnish_types,
available_from=available_from,
last_seen_days=last_seen_days,
min_sqm=min_sqm,
radius=0,
page_size=500,
max_days_since_added=14,
)
click.echo(
f"Running dump_listings for districts {district}, data dir {data_dir} and parameters: "
f"{query_parameters}"
click.echo(f"Fetching listings with parameters: {query_parameters}")
result = asyncio.run(
listing_service.refresh_listings(
repository,
query_parameters,
full=full,
async_mode=False,
)
)
data_dir_path = pathlib.Path(data_dir)
repository = ListingRepository(engine=engine)
if not full: # only listings
asyncio.run(
dump_listings_module.dump_listings(
query_parameters, repository, data_dir_path
)
)
else: # include images, floorplan detection etc.
asyncio.run(
dump_listings_module.dump_listings_full(
query_parameters, repository, data_dir_path
)
)
click.echo(result.message)
@cli.command()
@click.pass_context
def dump_images(ctx: click.core.Context):
data_dir = ctx.obj["data_dir"]
click.echo(f"Running dump_images for listings stored in {engine.url}")
repository = ListingRepository(engine=engine)
asyncio.run(dump_images_module.dump_images(repository, image_base_path=data_dir))
def dump_images(ctx: click.Context) -> None:
"""Download floorplan images for all listings."""
data_dir: pathlib.Path = ctx.obj["data_dir"]
repository: ListingRepository = ctx.obj["repository"]
click.echo(f"Downloading images to {data_dir}")
count = asyncio.run(listing_service.download_images(repository, data_dir))
click.echo(f"Processed {count} listings")
@cli.command()
@click.pass_context
def detect_floorplan(ctx: click.core.Context):
click.echo(f"Running detect_floorplan for listings stored in {engine.url}")
repository = ListingRepository(engine=engine)
asyncio.run(detect_floorplan_module.detect_floorplan(repository))
def detect_floorplan(ctx: click.Context) -> None:
"""Run OCR on floorplan images to detect square meters."""
repository: ListingRepository = ctx.obj["repository"]
click.echo("Running floorplan detection...")
count = asyncio.run(listing_service.detect_floorplans(repository))
click.echo(f"Processed {count} listings")
@cli.command()
@ -202,10 +241,7 @@ def detect_floorplan(ctx: click.core.Context):
"--travel-mode",
"-m",
help="Travel mode for routing",
type=click.Choice(
TravelMode.__members__.keys(),
case_sensitive=False,
),
type=click.Choice(TravelMode.__members__.keys(), case_sensitive=False),
required=True,
)
@click.option(
@ -213,65 +249,50 @@ def detect_floorplan(ctx: click.core.Context):
"-l",
help="Limit the number of listings to process",
type=click.IntRange(min=1),
default=1, # by default limit to 1 to avoid accidental API usage
default=1,
)
@click.pass_context
def routing(
ctx: click.core.Context, destination_address: str, travel_mode: str, limit: int
):
data_dir = ctx.obj["data_dir"]
click.echo(f"Running routing for the first {limit} listings in {data_dir}")
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
listing_paths = listing_paths[:limit]
ctx: click.Context,
destination_address: str,
travel_mode: str,
limit: int,
) -> None:
"""Calculate transit routes for listings."""
repository: ListingRepository = ctx.obj["repository"]
if os.environ.get(API_KEY_ENVIRONMENT_VARIABLE) is None:
raise click.exceptions.MissingParameter(
f"{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set. "
"Please set it to your API key for the routing service."
raise click.ClickException(
f"{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set."
)
repository = ListingRepository(engine=engine)
asyncio.run(
routing_module.calculate_route(
click.echo(f"Calculating routes to '{destination_address}' for {limit} listings")
count = asyncio.run(
listing_service.calculate_routes(
repository,
destination_address,
# destination_address_coordinates,
TravelMode[travel_mode],
travel_mode,
limit=limit,
)
)
click.echo(f"Processed {count} listings")
@cli.command()
# @click.option(
# "--columns",
# "-C",
# help="Columns to include in the CSV file",
# type=click.Choice(
# # csv_exporter.get_columns_from_listings(),
# [1],
# case_sensitive=False,
# ),
# multiple=True,
# default=Listing.ALL_COLUMNS,
# )
@click.option(
"--output-file",
"-O",
help="Path to the output CSV file",
required=True,
type=click.Path(
writable=True,
file_okay=True,
dir_okay=False,
resolve_path=True,
),
type=click.Path(writable=True, file_okay=True, dir_okay=False, resolve_path=True),
)
@click.pass_context
@listing_filter_options
@click.pass_context
def export_csv(
ctx: click.core.Context,
ctx: click.Context,
output_file: str,
# columns: tuple[str],
district: list[str],
min_bedrooms: int,
max_bedrooms: int,
@ -282,53 +303,48 @@ def export_csv(
available_from: datetime | None,
last_seen_days: int,
min_sqm: int | None = None,
):
# use model
data_dir = ctx.obj["data_dir"]
query_parameters = QueryParameters(
listing_type=ListingType[type],
district_names=set(district),
) -> None:
"""Export listings to CSV file."""
repository: ListingRepository = ctx.obj["repository"]
query_parameters = build_query_parameters(
type=type,
district=district,
min_bedrooms=min_bedrooms,
max_bedrooms=max_bedrooms,
min_price=min_price,
max_price=max_price,
furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types],
let_date_available_from=available_from,
furnish_types=furnish_types,
available_from=available_from,
last_seen_days=last_seen_days,
min_sqm=min_sqm,
)
click.echo(
f"Exporting data to {output_file} using {data_dir=} and query parameters: {query_parameters}"
)
output_file_path = pathlib.Path(output_file)
repository = ListingRepository(engine=engine)
asyncio.run(
csv_exporter.export_to_csv(
click.echo(f"Exporting to {output_file}")
result = asyncio.run(
export_service.export_to_csv(
repository,
output_file_path,
# list(columns),
query_parameters=query_parameters,
),
pathlib.Path(output_file),
query_parameters,
)
)
click.echo(result.message)
@cli.command()
@click.option(
"--output-file",
"-O",
help="Path to the output immoweb file",
help="Path to the output GeoJSON file",
required=True,
type=click.Path(
writable=True,
file_okay=True,
dir_okay=False,
resolve_path=True,
),
type=click.Path(writable=True, file_okay=True, dir_okay=False, resolve_path=True),
)
@listing_filter_options
@click.pass_context
def export_immoweb(
ctx: click.core.Context,
ctx: click.Context,
output_file: str,
district: list[str],
min_bedrooms: int,
@ -340,39 +356,62 @@ def export_immoweb(
available_from: datetime | None,
last_seen_days: int,
min_sqm: int | None = None,
):
query_parameters = QueryParameters(
listing_type=ListingType[type],
district_names=set(district),
) -> None:
"""Export listings to GeoJSON file for map visualization."""
repository: ListingRepository = ctx.obj["repository"]
query_parameters = build_query_parameters(
type=type,
district=district,
min_bedrooms=min_bedrooms,
max_bedrooms=max_bedrooms,
min_price=min_price,
max_price=max_price,
furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types],
let_date_available_from=available_from,
furnish_types=furnish_types,
available_from=available_from,
last_seen_days=last_seen_days,
min_sqm=min_sqm,
)
click.echo(
f"Exporting data to {output_file} for listings stored in {engine.url} that match the query parameters: {query_parameters}"
click.echo(f"Exporting to {output_file}")
result = asyncio.run(
export_service.export_to_geojson(
repository,
query_parameters=query_parameters,
output_path=pathlib.Path(output_file),
)
)
repository = ListingRepository(engine=engine)
asyncio.run(export_immoweb_ui(repository, output_file, query_parameters))
click.echo(result.message)
@cli.command()
@click.pass_context
def populate_db(
ctx: click.core.Context,
):
data_dir = ctx.obj["data_dir"]
click.echo(f"Populating the database with data from {data_dir}")
repository = ListingRepository(engine=engine)
def populate_db(ctx: click.Context) -> None:
"""Populate database from filesystem data (legacy migration)."""
data_dir: pathlib.Path = ctx.obj["data_dir"]
repository: ListingRepository = ctx.obj["repository"]
click.echo(f"Populating database from {data_dir}")
listings = Listing.get_all_listings(
[path for path in pathlib.Path(data_dir).glob("*/listing.json")]
[path for path in data_dir.glob("*/listing.json")]
)
asyncio.run(repository.upsert_listings_legacy(listings))
click.echo(f"Imported {len(listings)} listings")
@cli.command()
def list_districts() -> None:
    """Print every district name the crawler knows about, sorted."""
    districts = district_service.get_all_districts()
    click.echo(f"Available districts ({len(districts)}):")
    for district_name in sorted(districts):
        click.echo(f" - {district_name}")
if __name__ == "__main__":
cli()

View file

@ -1,40 +0,0 @@
def record(
    output_dir: str = "/Users/kadir/code/realestate/crawler/code/json/queries",
) -> None:
    """Fetch one page of listings plus each property's detail payload and
    dump the raw JSON responses into *output_dir*.

    Legacy scratch tool; the directory used to be hard-coded to a
    user-specific absolute path, it is now a parameter with the same default.
    """
    from rec.query import listing_query, detail_query
    import json
    import pathlib

    out = pathlib.Path(output_dir)
    page = 1
    listing = listing_query(page, 2, 2, 5, 200000, 500000)
    with open(out / f"listing{page}.json", "w") as f:
        json.dump(listing, f)
    for prop in listing["properties"]:
        identifier = prop["identifier"]
        resp = detail_query(identifier)
        with open(out / f"detail_{identifier}.json", "w") as f:
            json.dump(resp, f)


def process(
    queries_dir: str = "/Users/kadir/code/realestate/crawler/code/json/queries/",
) -> None:
    """Print the floorplan URLs found in previously dumped detail_*.json files."""
    import json
    import pathlib

    path = pathlib.Path(queries_dir)
    for file in path.glob("detail_*json"):
        with open(file) as f:
            js = json.load(f)
        for floorplan in js["property"]["floorplans"]:
            print(floorplan["url"])


# record()
process()

View file

@ -5,7 +5,7 @@ from datetime import datetime, timedelta
import enum
import json
from typing import Any, Dict, List
from pydantic import BaseModel
from pydantic import BaseModel, Field as PydanticField
from rec import routing
from sqlmodel import JSON, TEXT, SQLModel, Field
@ -80,7 +80,10 @@ class Listing(SQLModel, table=False):
@property
def is_removed(self) -> bool:
return not self.additional_info["property"]["visible"]
if not self.additional_info:
return False
property_info = self.additional_info.get("property", {})
return not property_info.get("visible", True)
@property
def price_per_square_meter(self) -> float | None:
@ -231,14 +234,16 @@ class ListingType(enum.StrEnum):
RENT = "RENT"
@dataclass(frozen=True)
class QueryParameters(BaseModel):
"""Query parameters for filtering listings."""
model_config = {"frozen": True}
listing_type: ListingType
min_bedrooms: int = 1
max_bedrooms: int = 999
min_price: int = 0
max_price: int = 10_000_000
district_names: set[str] = dataclasses.field(default_factory=set)
district_names: set[str] = PydanticField(default_factory=set)
radius: float = 0
page_size: int = 500 # items per page
max_days_since_added: int = 14 # for buy listings

36
crawler/poetry.lock generated
View file

@ -120,6 +120,22 @@ yarl = ">=1.17.0,<2.0"
[package.extras]
speedups = ["Brotli ; platform_python_implementation == \"CPython\"", "aiodns (>=3.3.0)", "brotlicffi ; platform_python_implementation != \"CPython\""]
[[package]]
name = "aiohttp-socks"
version = "0.8.4"
description = "Proxy connector for aiohttp"
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "aiohttp_socks-0.8.4-py3-none-any.whl", hash = "sha256:74b21105634ed31d56ed6fee43701ca16218b53475e606d56950a4d17e8290ea"},
{file = "aiohttp_socks-0.8.4.tar.gz", hash = "sha256:6b611d4ce838e9cf2c2fed5e0dba447cc84824a6cba95dc5747606201da46cb4"},
]
[package.dependencies]
aiohttp = ">=2.3.2"
python-socks = {version = ">=2.4.3,<3.0.0", extras = ["asyncio"]}
[[package]]
name = "aioresponses"
version = "0.7.8"
@ -4246,6 +4262,24 @@ files = [
{file = "python_multipart-0.0.20.tar.gz", hash = "sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13"},
]
[[package]]
name = "python-socks"
version = "2.8.0"
description = "Proxy (SOCKS4, SOCKS5, HTTP CONNECT) client for Python"
optional = false
python-versions = ">=3.8.0"
groups = ["main"]
files = [
{file = "python_socks-2.8.0-py3-none-any.whl", hash = "sha256:57c24b416569ccea493a101d38b0c82ed54be603aa50b6afbe64c46e4a4e4315"},
{file = "python_socks-2.8.0.tar.gz", hash = "sha256:340f82778b20a290bdd538ee47492978d603dff7826aaf2ce362d21ad9ee6f1b"},
]
[package.extras]
anyio = ["anyio (>=3.3.4,<5.0.0)"]
asyncio = ["async-timeout (>=4.0) ; python_version < \"3.11\""]
curio = ["curio (>=1.4)"]
trio = ["trio (>=0.24)"]
[[package]]
name = "pytz"
version = "2025.2"
@ -6203,4 +6237,4 @@ type = ["pytest-mypy"]
[metadata]
lock-version = "2.1"
python-versions = ">3.11"
content-hash = "10a74594d9f695ab1077ff992bcd012b93b174b25c3f2ca681d6308653abbd14"
content-hash = "6f9ce2af71a995db179aa4fb682e8a9ccde59566d14e26c7b0dbf4edc8d8e583"

View file

@ -1,13 +0,0 @@
import requests

# Rightmove's media CDN expects the mobile app's User-Agent.
headers = {
    "Host": "media.rightmove.co.uk",
    "User-Agent": "okhttp/4.10.0",
}

# NOTE(security): this previously passed verify=False, disabling TLS
# certificate validation; verification is now left at the secure default.
# A timeout is set because requests has no default timeout.
response = requests.get(
    "https://media.rightmove.co.uk/47k/46001/138680705/46001_32532509_IMG_00_0000.jpeg",
    headers=headers,
    timeout=30,
)

View file

@ -1,67 +0,0 @@
import requests

# Rightmove's public API only answers requests that look like the Android app.
_HEADERS = {
    "Host": "api.rightmove.co.uk",
    "User-Agent": "okhttp/4.10.0",
    "Connection": "close",
}


def _fetch_listing_page(page: int, max_price: int, must_have: str | None = None):
    """Fetch one page of BUY listings near POSTCODE^4228216.

    Args:
        page: 1-based results page.
        max_price: Upper price bound in GBP.
        must_have: Optional comma-separated "mustHave" filter (e.g. "newHome").

    Returns:
        The raw ``requests.Response``.
    """
    params = {
        "locationIdentifier": "POSTCODE^4228216",
        "channel": "BUY",
        "page": str(page),
        "numberOfPropertiesPerPage": "25",
        "radius": "3.0",
        "sortBy": "distance",
        "includeUnavailableProperties": "false",
        "propertyTypes": "flat",
        "dontShow": "sharedOwnership,retirement",
        "minPrice": "150000",
        "maxPrice": str(max_price),
        "minBedrooms": "2",
        "maxBedrooms": "2",
        "apiApplication": "ANDROID",
        "appVersion": "3.70.0",
    }
    if must_have:
        params["mustHave"] = must_have
    # NOTE(security): verify=False (disabled TLS validation) was removed;
    # a request timeout is set since requests has no default.
    return requests.get(
        "https://api.rightmove.co.uk/api/property-listing",
        params=params,
        headers=_HEADERS,
        timeout=30,
    )


# The two near-identical request blocks were deduplicated into the helper;
# same two calls as before, in the same order.
response = _fetch_listing_page(1, 500000, must_have="newHome")
response = _fetch_listing_page(2, 600000)

View file

@ -1,22 +0,0 @@
import os

import requests

# NOTE(security): a live Google API key used to be hard-coded here; it is in
# git history and must be considered leaked (rotate it). Read from env now.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY", "")

url = "https://maps.googleapis.com/maps/api/distancematrix/json"
origin = "51.5636306598907,-0.11061106079085892"
dest = "51.53836609846008,-0.12743940233824352"

params = {
    "origins": origin,
    "destinations": dest,
    "key": API_KEY,
    "departure_time": "",  # epoch timestamp, optional
    "mode": "transit",
}

# Timeout added: requests has no default and would hang forever otherwise.
r = requests.get(url, params=params, timeout=30)
print(r.status_code)
print(r.json())

with open("code/json/routing_distancematrix.json", "w") as f:
    f.write(r.text)

View file

@ -1,83 +0,0 @@
import os
from collections import defaultdict

import requests

from utils import nextMonday

# NOTE(security): a live Google API key used to be hard-coded here; it is in
# git history and must be considered leaked (rotate it). Read from env now.
API_KEY = os.environ.get("GOOGLE_MAPS_API_KEY", "")
url = "https://routes.googleapis.com/directions/v2:computeRoutes"


def travel_time(origin_lat: float, origin_lon: float, dest_lat: float, dest_lon: float):
    """Return the Google Routes API response for a TRANSIT trip departing
    next Monday (as produced by ``nextMonday()``).

    Raises:
        Exception: carrying the API's error payload on a non-200 response.
    """
    monday9am = nextMonday()
    header = {
        "X-Goog-Api-Key": API_KEY,
        "Content-Type": "application/json",
        "X-Goog-FieldMask": "routes.distanceMeters,routes.duration,routes.staticDuration,routes.legs.steps.distanceMeters,routes.legs.steps.staticDuration,routes.legs.steps.travelMode",
    }
    body = {
        "origin": {
            "location": {"latLng": {"latitude": origin_lat, "longitude": origin_lon}}
        },
        "destination": {
            "location": {"latLng": {"latitude": dest_lat, "longitude": dest_lon}}
        },
        "travelMode": "TRANSIT",
        # RFC3339 timestamp, e.g. "2023-10-15T15:01:23.045123456Z"
        "departureTime": monday9am.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
        "computeAlternativeRoutes": False,
        "languageCode": "en-US",
        "units": "METRIC",
    }
    # Timeout added: requests has no default and would hang forever otherwise.
    r = requests.post(url, json=body, headers=header, timeout=30)
    if r.status_code == 200:
        return r.json()
    raise Exception(r.json())


def extract_time(d):
    """Print distance/duration summaries for the first route in a Routes API
    response dict *d*, broken down per travel mode."""
    r = d["routes"][0]
    print(r.keys())
    distance = r["distanceMeters"]
    duration = r["duration"]
    duration_static = r["staticDuration"]
    steps = r["legs"][0]["steps"]
    duration_per_transit = defaultdict(lambda: 0)
    distance_per_transit = defaultdict(lambda: 0)
    for step in steps:
        # staticDuration is formatted like "123s" -> strip the unit suffix.
        duration_per_transit[step["travelMode"]] += int(
            step["staticDuration"].strip("s")
        )
        distance_per_transit[step["travelMode"]] += step.get("distanceMeters", 0)
    print(
        f"dis {distance}, dur {duration}, duration per transit {dict(duration_per_transit)}, distance per transit {dict(distance_per_transit)}, duration_static {duration_static}"
    )


if __name__ == "__main__":
    import json

    with open("code/json/routing_routeapi.json", "r") as f:
        d = json.load(f)
    extract_time(d)

View file

@ -1,20 +0,0 @@
import requests

# Rightmove's API only answers clients that identify as the Android app.
headers = {
    "Host": "api.rightmove.co.uk",
    "User-Agent": "okhttp/4.10.0",
    "Connection": "close",
}

params = {
    "apiApplication": "ANDROID",
    "appVersion": "3.70.0",
}

# NOTE(security): verify=False (disabled TLS validation) was removed; a
# request timeout is set because requests has no default.
response = requests.get(
    "https://api.rightmove.co.uk/api/property/119578451",
    params=params,
    headers=headers,
    timeout=30,
)

View file

@ -1,4 +1,4 @@
def get_districts():
def get_districts() -> dict[str, str]:
return {
"Barking and Dagenham": "REGION^61400",
"Barnet": "REGION^93929",

View file

@ -1,10 +1,12 @@
import re
from pathlib import Path
from typing import Any
from PIL import Image
import cv2
import numpy as np
def inference(image_path):
def inference(image_path: str | Path) -> tuple[str, Any]:
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
image = Image.open(image_path)
@ -19,7 +21,7 @@ def inference(image_path):
return output, predictions
def extract_total_sqm(input_str: str):
def extract_total_sqm(input_str: str) -> float | None:
sqmregex = r"(\d+\.?\d*) ?(sq ?m|sq. ?m)"
matches = re.findall(sqmregex, input_str.lower())
sqms = [float(m[0]) for m in matches]
@ -29,13 +31,13 @@ def extract_total_sqm(input_str: str):
return max(filtered)
def calculate_model(image_path):
def calculate_model(image_path: str | Path) -> tuple[float | None, str, Any]:
output, predictions_tensor = inference(image_path)
estimated_sqm = extract_total_sqm(output)
return estimated_sqm, output, predictions_tensor
def improve_img_for_ocr(img: Image):
def improve_img_for_ocr(img: Image.Image) -> Image.Image:
img2 = np.array(img.convert("L"))
cv2.resize(img2, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC)
thresh = cv2.adaptiveThreshold(
@ -44,7 +46,7 @@ def improve_img_for_ocr(img: Image):
return Image.fromarray(thresh)
def calculate_ocr(image_path) -> tuple[float | None, str]:
def calculate_ocr(image_path: str | Path) -> tuple[float | None, str]:
import pytesseract
img = Image.open(image_path)

View file

@ -0,0 +1,41 @@
import json
from typing import List
from models.listing import DestinationMode, Route, RouteLegStep
from rec import routing
class RouteSerializer:
    """(De)serializes per-destination route data stored as JSON strings."""

    @staticmethod
    def serialize(route): ...

    @staticmethod
    def deserialize(route_data_json: str) -> dict[DestinationMode, List[Route]]:
        """Parse a stored JSON blob back into DestinationMode -> routes.

        The payload maps a JSON-encoded DestinationMode key to a list of
        JSON-encoded Route strings.

        Fixes: the previous implementation parsed only ``routes_json[0]``,
        silently dropping every other stored route, and decoded the
        destination-mode key twice.
        """
        json_data = json.loads(route_data_json)
        destination_routes: dict[DestinationMode, List[Route]] = {}
        for destination_mode_str, routes_json in json_data.items():
            # Decode the composite key exactly once.
            mode_data = json.loads(destination_mode_str)
            destination_mode = DestinationMode(
                destination_address=mode_data["destination_address"],
                travel_mode=routing.TravelMode(mode_data["travel_mode"]),
            )
            routes: List[Route] = []
            for route_json in routes_json:
                parsed_route = json.loads(route_json)
                routes.append(
                    Route(
                        legs=[
                            RouteLegStep(
                                distance_meters=step["distance_meters"],
                                duration_s=step["duration_s"],
                                travel_mode=routing.TravelMode(step["travel_mode"]),
                            )
                            for step in parsed_route["legs"]
                        ],
                        distance_meters=parsed_route["distance_meters"],
                        duration_s=int(parsed_route["duration_s"]),
                    )
                )
            destination_routes[destination_mode] = routes
        return destination_routes

View file

@ -0,0 +1,41 @@
"""Services package for real estate crawler.
This package contains two layers of services:
## Low-level services (internal implementation):
- listing_fetcher: Fetches listing data from Rightmove API
- image_fetcher: Downloads floorplan images
- floorplan_detector: OCR-based square meter detection from floorplans
- route_calculator: Calculates transit routes using Google Maps API
## High-level services (use these in CLI and API):
- listing_service: Unified listing operations (get, refresh, download images, etc.)
- export_service: Export listings to CSV, GeoJSON
- district_service: District lookup and validation
- task_service: Background task management
"""
# Low-level services (internal)
from services.listing_fetcher import dump_listings, dump_listings_full
from services.image_fetcher import dump_images
from services.floorplan_detector import detect_floorplan
from services.route_calculator import calculate_route
# High-level services (CLI and API should use these)
from services import listing_service
from services import export_service
from services import district_service
from services import task_service
__all__ = [
# Low-level
"dump_listings",
"dump_listings_full",
"dump_images",
"detect_floorplan",
"calculate_route",
# High-level
"listing_service",
"export_service",
"district_service",
"task_service",
]

View file

@ -0,0 +1,38 @@
"""Unified district service - shared between CLI and HTTP API."""
from rec.districts import get_districts as _get_districts
def get_all_districts() -> dict[str, str]:
    """Return the full district-name -> region-id mapping.

    Consumers:
    - CLI: ``--district`` option choices
    - API: GET /api/get_districts
    """
    return _get_districts()
def get_district_names() -> list[str]:
    """Return just the district names, in mapping order."""
    return [name for name in _get_districts()]
def validate_districts(district_names: list[str]) -> tuple[bool, list[str]]:
    """Check district names against the known district set.

    Args:
        district_names: Names to validate.

    Returns:
        ``(all_valid, invalid_names)`` where ``invalid_names`` keeps the
        input order of unknown entries.
    """
    known = _get_districts()
    invalid = [name for name in district_names if name not in known]
    return not invalid, invalid

View file

@ -0,0 +1,92 @@
"""Unified export service - shared between CLI and HTTP API.
This module provides export functionality for listings in various formats.
"""
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from models.listing import QueryParameters
from repositories.listing_repository import ListingRepository
@dataclass
class ExportResult:
    """Result of an export operation."""

    success: bool  # True when the export completed without error
    output_path: str | None  # Destination file path for file exports, else None
    data: Any | None  # In-memory payload (e.g. GeoJSON dict) when no file was written
    record_count: int  # Number of records/features exported
    message: str  # Human-readable summary, echoed by CLI/API callers
async def export_to_csv(
    repository: ListingRepository,
    output_path: Path,
    query_parameters: QueryParameters | None = None,
) -> ExportResult:
    """Export listings to CSV file.

    Args:
        repository: Database repository to read listings from.
        output_path: Destination CSV file path.
        query_parameters: Optional filter applied to the export.

    Returns:
        ExportResult describing the written file.

    Used by:
    - CLI: export-csv
    - API: (could be added as download endpoint)
    """
    # Imported lazily so this module stays importable without csv_exporter.
    from csv_exporter import export_to_csv as _export_csv

    await _export_csv(repository, output_path, query_parameters)

    # NOTE(review): the listings are queried a second time here purely to
    # report a count; if csv_exporter returned the row count, this extra
    # query (and any drift between the two reads) could be removed.
    listings = await repository.get_listings(query_parameters=query_parameters)

    return ExportResult(
        success=True,
        output_path=str(output_path),
        data=None,
        record_count=len(listings),
        message=f"Exported {len(listings)} listings to {output_path}",
    )
async def export_to_geojson(
    repository: ListingRepository,
    query_parameters: QueryParameters | None = None,
    output_path: Path | None = None,
    limit: int | None = None,
) -> ExportResult:
    """Export listings as GeoJSON, either to *output_path* or in memory.

    Args:
        repository: Database repository
        query_parameters: Filtering parameters
        output_path: If provided, write to file. Otherwise return data.
        limit: Maximum number of listings to export

    Used by:
    - CLI: export-immoweb
    - API: GET /api/listing_geojson
    """
    # Lazy import keeps module import light.
    from ui_exporter import export_immoweb

    geojson_data = await export_immoweb(
        repository,
        output_file=str(output_path) if output_path else None,
        query_parameters=query_parameters,
        limit=limit,
    )

    if geojson_data:
        feature_count = len(geojson_data.get("features", []))
    else:
        feature_count = 0

    if output_path:
        # File export: payload lives on disk, so no in-memory data returned.
        return ExportResult(
            success=True,
            output_path=str(output_path),
            data=None,
            record_count=feature_count,
            message=f"Exported {feature_count} listings to {output_path}",
        )

    # In-memory export for API consumers.
    return ExportResult(
        success=True,
        output_path=None,
        data=geojson_data,
        record_count=feature_count,
        message=f"Generated GeoJSON with {feature_count} features",
    )

View file

@ -0,0 +1,42 @@
"""Floorplan detector service - OCR-based square meter detection."""
import asyncio
from models import Listing
from rec import floorplan
from repositories.listing_repository import ListingRepository
from tqdm.asyncio import tqdm
import multiprocessing
async def detect_floorplan(repository: ListingRepository) -> None:
    """Detect square meters from floorplan images for all listings.

    OCR is CPU-heavy, so concurrency is capped at a quarter of the machine's
    CPUs to avoid starving other work.
    """
    listings = await repository.get_listings()
    # max(1, ...) guards against cpu_count() < 4, which would previously
    # yield Semaphore(0) and deadlock every task forever.
    concurrency = max(1, multiprocessing.cpu_count() // 4)
    semaphore = asyncio.Semaphore(concurrency)
    updated_listings = [
        listing
        for listing in await tqdm.gather(
            *[_calculate_sqm_ocr(listing, semaphore) for listing in listings]
        )
        if listing is not None
    ]
    await repository.upsert_listings(updated_listings)
async def _calculate_sqm_ocr(
    listing: Listing, semaphore: asyncio.Semaphore
) -> Listing | None:
    """Estimate a listing's square meters via OCR over its floorplan images.

    Returns the updated listing, or None when the listing already has a
    value (nothing to do).
    """
    if listing.square_meters is not None:
        return None

    estimates: list[float] = []
    for image_path in listing.floorplan_image_paths:
        # OCR is CPU-bound: run it off the event loop, gated by the semaphore.
        async with semaphore:
            sqm, _ = await asyncio.to_thread(floorplan.calculate_ocr, image_path)
            if sqm is not None:
                estimates.append(sqm)

    # A fully failed OCR pass deliberately records 0 so we don't retry forever.
    listing.square_meters = max(estimates, default=0)
    return listing

View file

@ -0,0 +1,55 @@
"""Image fetcher service - downloads floorplan images for listings."""
import asyncio
from pathlib import Path
import aiohttp
from repositories import ListingRepository
from tenacity import retry, stop_after_attempt, wait_random
from tqdm.asyncio import tqdm
from models import Listing
# Setting this too high either crashes rightmove or gets us blocked
semaphore = asyncio.Semaphore(5)
async def dump_images(
    repository: ListingRepository,
    image_base_path: Path = Path("data/rs/"),
) -> None:
    """Download floorplan images for every listing and persist the updates."""
    listings = await repository.get_listings()
    results = await tqdm.gather(
        *[dump_images_for_listing(listing, image_base_path) for listing in listings]
    )
    # Only listings that actually changed are written back.
    changed = [listing for listing in results if listing is not None]
    await repository.upsert_listings(changed)
@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
async def dump_images_for_listing(listing: Listing, base_path: Path) -> Listing | None:
    """Download every missing floorplan image for a single listing.

    Returns the listing when at least one new image was downloaded (so the
    caller persists the updated paths), otherwise None.

    Raises:
        Exception: on non-404 HTTP errors, so the tenacity retry fires.

    Fixes: the previous version returned after the FIRST downloaded image,
    leaving any remaining floorplans undownloaded, and a single 404 aborted
    the whole listing instead of skipping just that image.
    """
    all_floorplans = listing.additional_info.get("property", {}).get("floorplans", [])
    downloaded_any = False
    for floorplan in all_floorplans:
        url = floorplan["url"]
        picname = url.split("/")[-1]
        floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
        if floorplan_path.exists():
            continue
        try:
            async with semaphore:
                async with aiohttp.ClientSession() as session:
                    async with session.get(url) as response:
                        if response.status == 404:
                            # Image gone upstream; skip it but keep the rest.
                            continue
                        if response.status != 200:
                            raise Exception(f"Error for {url}: {response.status}")
                        floorplan_path.parent.mkdir(parents=True, exist_ok=True)
                        with open(floorplan_path, "wb") as f:
                            f.write(await response.read())
            listing.floorplan_image_paths.append(str(floorplan_path))
            downloaded_any = True
        except Exception as e:
            tqdm.write(f"Error for {url}: {e}")
            raise e  # re-raise so tenacity retries this listing
    return listing if downloaded_any else None

View file

@ -0,0 +1,168 @@
"""Unified listing service - shared between CLI and HTTP API.
This module provides the core business logic for listing operations.
Both the CLI (main.py) and HTTP API (api/app.py) should use these functions.
"""
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any
from models.listing import Listing, QueryParameters
from repositories.listing_repository import ListingRepository
@dataclass
class ListingResult:
    """Result of a listing operation."""

    listings: list[Listing]  # The matched listings themselves
    total_count: int  # Count of listings returned (len of `listings`)
    message: str | None = None  # Optional human-readable status text
@dataclass
class RefreshResult:
    """Result of a refresh operation."""

    task_id: str | None  # Background task id; None if run synchronously
    new_listings_count: int  # New listings fetched (0 in async mode)
    message: str  # Human-readable summary for CLI/API display
async def get_listings(
    repository: ListingRepository,
    query_parameters: QueryParameters | None = None,
    limit: int | None = None,
    only_ids: list[int] | None = None,
) -> ListingResult:
    """Fetch listings from the database with optional filtering.

    Used by:
    - CLI: export-csv, export-immoweb
    - API: GET /api/listing, GET /api/listing_geojson
    """
    matched = await repository.get_listings(
        query_parameters=query_parameters,
        limit=limit,
        only_ids=only_ids,
    )
    return ListingResult(listings=matched, total_count=len(matched))
async def refresh_listings(
    repository: ListingRepository,
    query_parameters: QueryParameters,
    full: bool = False,
    async_mode: bool = False,
    user_email: str | None = None,
) -> RefreshResult:
    """Refresh listings by fetching them from the external API.

    Shared by the CLI (dump-listings) and the HTTP API
    (POST /api/refresh_listings).

    Args:
        repository: Data-access layer for listings.
        query_parameters: Filter criteria for the fetch.
        full: When True, also fetch images and run OCR (synchronous mode).
        async_mode: When True, enqueue a Celery task and return its id.
        user_email: User email for tracking (API mode).

    Returns:
        A RefreshResult; ``task_id`` is set only in async mode.
    """
    if async_mode:
        # Imported lazily to avoid a circular import with the task module.
        from datetime import timedelta

        from tasks.listing_tasks import dump_listings_task

        # NOTE(review): `full` is ignored on this path — the background task
        # always runs the plain dump; confirm this is intended.
        deadline = datetime.now() + timedelta(minutes=10)
        task = dump_listings_task.apply_async(
            args=(query_parameters.model_dump_json(),),
            expires=deadline,
        )
        return RefreshResult(
            task_id=task.id,
            new_listings_count=0,
            message=f"Task {task.id} started",
        )

    # Synchronous path: run the fetch inline.
    from services.listing_fetcher import dump_listings, dump_listings_full

    fetcher = dump_listings_full if full else dump_listings
    fetched = await fetcher(query_parameters, repository)
    return RefreshResult(
        task_id=None,
        new_listings_count=len(fetched),
        message=f"Fetched {len(fetched)} new listings",
    )
async def download_images(
    repository: ListingRepository,
    data_dir: Path = Path("data/rs/"),
) -> int:
    """Download floorplan images for every listing.

    Shared by the CLI (dump-images); not yet exposed via the API.

    Args:
        repository: Data-access layer for listings.
        data_dir: Base directory images are written under.

    Returns:
        The number of listings currently in the database.
    """
    from services.image_fetcher import dump_images

    await dump_images(repository, image_base_path=data_dir)
    return len(await repository.get_listings())
async def detect_floorplans(
    repository: ListingRepository,
) -> int:
    """Run OCR over floorplan images to extract square meters.

    Shared by the CLI (detect-floorplan); not yet exposed via the API.

    Args:
        repository: Data-access layer for listings.

    Returns:
        The number of listings currently in the database.
    """
    from services.floorplan_detector import detect_floorplan

    await detect_floorplan(repository)
    return len(await repository.get_listings())
async def calculate_routes(
    repository: ListingRepository,
    destination_address: str,
    travel_mode: str,
    limit: int | None = None,
) -> int:
    """Calculate transit routes from listings to a destination.

    Shared by the CLI (routing); not yet exposed via the API.

    Args:
        repository: Data-access layer for listings.
        destination_address: Where routes should end.
        travel_mode: Name of a ``rec.routing.TravelMode`` member.
        limit: Optional cap on how many listings to process.

    Raises:
        KeyError: If ``travel_mode`` is not a valid TravelMode name.

    Returns:
        The number of listings processed. (Fix: previously returned
        ``limit or 0``, i.e. 0 when no limit was set, contradicting the
        documented contract and the sibling services.)
    """
    from rec.routing import TravelMode

    from services.route_calculator import calculate_route

    await calculate_route(
        repository,
        destination_address,
        TravelMode[travel_mode],
        limit=limit,
    )
    if limit is not None:
        return limit
    # No limit: report the full listing count, mirroring download_images
    # and detect_floorplans.
    listings = await repository.get_listings()
    return len(listings)

View file

@ -1,3 +1,4 @@
"""Route calculator service - calculates transit routes using Google Maps API."""
from models.listing import DestinationMode, Route, RouteLegStep
from repositories.listing_repository import ListingRepository
from tqdm.asyncio import tqdm
@ -11,6 +12,7 @@ async def calculate_route(
travel_mode: routing.TravelMode,
limit: int | None = None,
) -> None:
"""Calculate transit routes for listings to a destination."""
listings = await repository.get_listings()
if limit is not None:
@ -30,6 +32,7 @@ async def calculate_route(
async def update_routing_info(
listing: Listing, destination_mode: DestinationMode
) -> Listing | None:
"""Update routing information for a single listing."""
if listing.routing_info.get(destination_mode) is not None:
# already calculated, do not recompute to save API calls
return None
@ -41,8 +44,7 @@ async def update_routing_info(
destination_mode.travel_mode,
)
route_data = routes_data["routes"][0]
routes = []
routes: list[Route] = []
for route_data in routes_data["routes"]:
duration_s = int(route_data["duration"].split("s")[0])
route = Route(
@ -61,47 +63,4 @@ async def update_routing_info(
listing.routing_info_json = listing.serialize_routing_info(
{**listing.routing_info, **{destination_mode: routes}}
)
return listing
# async def geocode_address(
# address: str,
# geocoding_cache: pathlib.Path,
# ) -> tuple[int, int]:
# cache = get_geocoding_cache(geocoding_cache)
# cached_results = cache.get(address)
# if cached_results is None:
# # resolve
# async with aiohttp.ClientSession() as session:
# async with session.get(
# ("https://maps.googleapis.com/maps/api/geocode/json"
# f"?address={address}"
# f"&key={API_KEY_ENVIRONMENT_VARIABLE}")) as response:
# if response.status != 200:
# raise Exception(
# f"Error {response.status} from geocoding API")
# cached_results = await response.json()
# with open(geocoding_cache, 'w') as f:
# json.dump({
# **{
# address: cached_results,
# },
# **cache
# }, f)
# # API format
# lat = cached_results["results"][0]["geometry"]["location"]["lat"]
# lng = cached_results["results"][0]["geometry"]["location"]["lng"]
# cache[address] = (lat, lng)
# with open(geocoding_cache, 'w') as f:
# json.dump(cache, f)
# return lat, lng
# def get_geocoding_cache(geocoding_cache: pathlib.Path) -> dict[str, Any]:
# try:
# with open(geocoding_cache, 'x') as f:
# json.dump({}, f)
# return {}
# except FileExistsError:
# pass # File already exists
# with open(geocoding_cache, 'r') as f:
# return json.load(f)
return listing

View file

@ -11,9 +11,14 @@ import json
class TaskStatus:
"""Status of a background task."""
task_id: str
status: str # PENDING, STARTED, SUCCESS, FAILURE, REVOKED
status: str # PENDING, STARTED, SUCCESS, FAILURE, REVOKED, SKIPPED
result: Any | None
progress: float | None # 0.0 to 1.0
processed: int | None # Number of items processed
total: int | None # Total number of items
message: str | None # Human-readable status message (e.g., "Fetching listings")
error: str | None # Error message if failed
traceback: str | None # Full traceback if failed
def get_task_status(task_id: str) -> TaskStatus:
@ -33,21 +38,50 @@ def get_task_status(task_id: str) -> TaskStatus:
task_result = dump_listings_task.AsyncResult(task_id)
# Try to serialize result
try:
result = json.loads(json.dumps(task_result.result))
except (TypeError, json.JSONDecodeError):
result = str(task_result.result) if task_result.result else None
result = None
error = None
if task_result.failed():
# Extract error message from failed task
error = str(task_result.result) if task_result.result else None
else:
try:
result = json.loads(json.dumps(task_result.result))
except (TypeError, json.JSONDecodeError):
result = str(task_result.result) if task_result.result else None
# Extract progress from task meta if available
# Extract traceback if available
task_traceback = task_result.traceback if task_result.failed() else None
# Extract progress, processed, total, and message from task meta
progress = None
processed = None
total = None
message = None
if task_result.info and isinstance(task_result.info, dict):
progress = task_result.info.get("progress")
processed = task_result.info.get("processed")
total = task_result.info.get("total")
# Use 'message' if available, fall back to 'reason' for SKIPPED tasks
message = task_result.info.get("message") or task_result.info.get("reason")
# For custom states (like "Fetching listings"), use the state as message
# if no message was provided in info
if not message and task_result.status not in (
"PENDING", "STARTED", "SUCCESS", "FAILURE", "REVOKED", "RETRY"
):
message = task_result.status
return TaskStatus(
task_id=task_id,
status=task_result.status,
result=result,
progress=progress,
processed=processed,
total=total,
message=message,
error=error,
traceback=task_traceback,
)

View file

@ -1,55 +1,153 @@
#!/usr/bin/env bash
set -eu
# This script is used to start the backend services and configure them according to what's available in the system
set -eux
ENV_MODE=${ENV:-"dev"} # Defaults to "dev" if ENV_MODE is unset
case "$ENV_MODE" in
dev)
echo "🛠️ Running in DEVELOPMENT mode"
set +e
pkill -f celery
pkill watchmedo
set -e
if ! netstat -tlnp |grep 6379; then
echo "Did not find a running redis on 6379. Starting a new instance..."
docker run -d --rm --name redis-server -p 6379:6379 redis:latest
fi
echo "Checking connection to redis is successful..."
python celery_app.py
watchmedo auto-restart --directory=./ --pattern='*.py' --recursive -- celery -A celery_app worker & # DEV to autoreload on changes
CELERY_PID=$!
;;
prod)
echo "🚀 Running in PRODUCTION mode"
echo "Checking connection to redis is successful..."
python celery_app.py
alembic upgrade head
celery -A celery_app worker --beat &
CELERY_PID=$!
;;
*)
echo "❌ Unknown ENV_MODE: $ENV_MODE. Defaulting to DEV."
exit 1
;;
esac
cleanup() {
echo "Stopping background process (PID: $CELERY_PID)..."
kill "$CELERY_PID" 2>/dev/null # Graceful shutdown (SIGTERM)
wait "$CELERY_PID" 2>/dev/null # Wait for process to exit
# Real Estate Crawler - Development Server
# Usage:
# ./start.sh - Start with Docker (recommended)
# ./start.sh --local - Start locally (requires Poetry and dependencies)
# ./start.sh --help - Show help
# Print CLI usage: supported flags and example invocations.
show_help() {
echo "Real Estate Crawler - Development Server"
echo ""
echo "Usage: ./start.sh [OPTIONS]"
echo ""
echo "Options:"
echo " (default) Start all services with Docker Compose"
echo " --local Run locally with Poetry (requires local deps)"
echo " --build Rebuild Docker images before starting"
echo " --down Stop and remove all containers"
echo " --logs Follow logs from all services"
echo " --help Show this help message"
echo ""
echo "Examples:"
echo " ./start.sh # Start with Docker"
echo " ./start.sh --build # Rebuild and start"
echo " ./start.sh --local # Run locally with Poetry"
}
trap cleanup EXIT SIGINT SIGTERM
# celery -A celery_app worker -D # PROD
uvicorn api.app:app --host 0.0.0.0 --port 5001 --log-level debug
# UVICORN_PID=$!
# Bring the full stack up via compose; accepts an optional --build flag.
start_docker() {
    local build_flag=""
    [[ "${1:-}" == "--build" ]] && build_flag="--build"

    echo "🐳 Starting services with Docker Compose..."
    echo ""

    # Prefer docker compose; fall back to podman-compose; otherwise bail out.
    if command -v docker &> /dev/null; then
        COMPOSE_CMD="docker compose"
    elif command -v podman-compose &> /dev/null; then
        COMPOSE_CMD="podman-compose"
    else
        echo "❌ Error: Neither docker nor podman-compose found."
        echo " Install Docker: https://docs.docker.com/get-docker/"
        echo " Or run locally: ./start.sh --local"
        exit 1
    fi

    $COMPOSE_CMD up $build_flag
}
# Tear down the compose stack with whichever engine is installed.
# Silently does nothing if neither docker nor podman-compose is found.
stop_docker() {
echo "🛑 Stopping all containers..."
if command -v docker &> /dev/null; then
docker compose down
elif command -v podman-compose &> /dev/null; then
podman-compose down
fi
}
# Follow logs of all compose services (blocks until interrupted).
# Silently does nothing if neither docker nor podman-compose is found.
show_logs() {
if command -v docker &> /dev/null; then
docker compose logs -f
elif command -v podman-compose &> /dev/null; then
podman-compose logs -f
fi
}
# Run the API + Celery worker directly on the host via Poetry (dev workflow).
start_local() {
    echo "🛠️ Starting locally with Poetry..."
    echo ""

    # Poetry is required for the local workflow.
    if ! command -v poetry &> /dev/null; then
        echo "❌ Error: Poetry not found."
        echo " Install: curl -sSL https://install.python-poetry.org | python3 -"
        echo " Or use Docker: ./start.sh"
        exit 1
    fi

    # Export everything from .env into the environment, if present.
    if [[ -f .env ]]; then
        set -a
        source .env
        set +a
    fi

    ENV_MODE=${ENV:-"dev"}

    # Ensure Redis is reachable; start a throwaway container if not.
    # Fix: fall back to podman when docker is absent, consistent with
    # start_docker's engine detection.
    if ! nc -z localhost 6379 2>/dev/null; then
        echo "📦 Starting Redis container..."
        if command -v docker &> /dev/null; then
            docker run -d --rm --name rec-redis-local -p 6379:6379 redis:latest || true
        elif command -v podman &> /dev/null; then
            podman run -d --rm --name rec-redis-local -p 6379:6379 redis:latest || true
        fi
        sleep 2
    fi
    echo "✅ Redis OK"

    # Sanity-check the broker connection before starting workers.
    poetry run python celery_app.py

    # Start the Celery worker in the background; prod also runs migrations + beat.
    echo "🔧 Starting Celery worker..."
    if [[ "$ENV_MODE" == "dev" ]]; then
        poetry run celery -A celery_app worker --loglevel=info &
    else
        poetry run alembic upgrade head
        poetry run celery -A celery_app worker --beat --loglevel=info &
    fi
    CELERY_PID=$!

    # Stop the worker when the API server exits or on Ctrl-C.
    cleanup() {
        echo ""
        echo "🛑 Stopping Celery worker (PID: $CELERY_PID)..."
        kill "$CELERY_PID" 2>/dev/null || true
        wait "$CELERY_PID" 2>/dev/null || true
    }
    trap cleanup EXIT SIGINT SIGTERM

    # Foreground API server with auto-reload.
    echo "🚀 Starting API server on http://localhost:5001"
    echo ""
    poetry run uvicorn api.app:app --host 0.0.0.0 --port 5001 --reload
}
# Dispatch on the first CLI argument (patterns are disjoint, so order is
# purely cosmetic).
case "${1:-}" in
    "")
        start_docker
        ;;
    --build)
        start_docker --build
        ;;
    --local)
        start_local
        ;;
    --down)
        stop_docker
        ;;
    --logs)
        show_logs
        ;;
    --help|-h)
        show_help
        ;;
    *)
        echo "❌ Unknown option: $1"
        echo ""
        show_help
        exit 1
        ;;
esac

View file

@ -0,0 +1 @@
# Tests package

186
crawler/tests/conftest.py Normal file
View file

@ -0,0 +1,186 @@
"""Shared pytest fixtures for the test suite."""
from datetime import datetime
from typing import AsyncGenerator, Generator
import pytest
from sqlalchemy import Engine
from sqlmodel import SQLModel, Session, create_engine
from httpx import ASGITransport, AsyncClient
from models.listing import (
BuyListing,
FurnishType,
ListingSite,
RentListing,
Listing,
)
from repositories.listing_repository import ListingRepository
from api.auth import User
@pytest.fixture
def in_memory_engine() -> Generator[Engine, None, None]:
    """Yield a throwaway in-memory SQLite engine with the schema created."""
    test_engine = create_engine(
        "sqlite:///:memory:",
        echo=False,
        # Allow the connection to be used across test worker threads.
        connect_args={"check_same_thread": False},
    )
    SQLModel.metadata.create_all(test_engine)
    yield test_engine
    SQLModel.metadata.drop_all(test_engine)
@pytest.fixture
def listing_repository(in_memory_engine: Engine) -> ListingRepository:
    """Repository wired to the throwaway in-memory engine."""
    repo = ListingRepository(engine=in_memory_engine)
    return repo
@pytest.fixture
def sample_rent_listing() -> RentListing:
    """A single fully-populated RentListing for testing."""
    return RentListing(
        id=12345678,
        listing_site=ListingSite.RIGHTMOVE,
        price=2500.0,
        number_of_bedrooms=2,
        square_meters=65.0,
        agency="Test Agency",
        council_tax_band="C",
        longitude=-0.1276,
        latitude=51.5074,
        price_history_json="[]",
        last_seen=datetime.now(),
        photo_thumbnail="https://example.com/photo.jpg",
        floorplan_image_paths=[],
        additional_info={"property": {"visible": True}},
        routing_info_json=None,
        furnish_type=FurnishType.FURNISHED,
        available_from=datetime.now(),
    )
@pytest.fixture
def sample_buy_listing() -> BuyListing:
    """A single fully-populated BuyListing for testing."""
    return BuyListing(
        id=87654321,
        listing_site=ListingSite.RIGHTMOVE,
        price=450000.0,
        number_of_bedrooms=3,
        square_meters=95.0,
        agency="Test Estate Agents",
        council_tax_band="D",
        longitude=-0.1180,
        latitude=51.5100,
        price_history_json="[]",
        last_seen=datetime.now(),
        photo_thumbnail="https://example.com/buy_photo.jpg",
        floorplan_image_paths=[],
        additional_info={"property": {"visible": True}},
        routing_info_json=None,
        service_charge=1500.0,
        lease_left=90,
    )
@pytest.fixture
def sample_rent_listings() -> list[RentListing]:
    """Three RentListings with varied price/size/furnishing for filter tests."""
    now = datetime.now()

    def build(id_, price, beds, sqm, agency, band, lon, lat, furnish):
        # All other fields are identical across the three fixtures.
        return RentListing(
            id=id_,
            price=price,
            number_of_bedrooms=beds,
            square_meters=sqm,
            agency=agency,
            council_tax_band=band,
            longitude=lon,
            latitude=lat,
            price_history_json="[]",
            listing_site=ListingSite.RIGHTMOVE,
            last_seen=now,
            photo_thumbnail=None,
            floorplan_image_paths=[],
            additional_info={"property": {"visible": True}},
            routing_info_json=None,
            furnish_type=furnish,
            available_from=now,
        )

    return [
        build(1, 1500.0, 1, 40.0, "Agency A", "B", -0.1, 51.5, FurnishType.FURNISHED),
        build(2, 2000.0, 2, 55.0, "Agency B", "C", -0.12, 51.51, FurnishType.UNFURNISHED),
        build(3, 3000.0, 3, 80.0, "Agency C", "D", -0.14, 51.52, FurnishType.FURNISHED),
    ]
@pytest.fixture
def mock_user() -> User:
    """An authenticated user stub for API tests."""
    return User(sub="test-user-id", email="test@example.com", name="Test User")
@pytest.fixture
async def async_client(
    in_memory_engine: Engine, mock_user: User
) -> AsyncGenerator[AsyncClient, None]:
    """ASGI test client with authentication stubbed out.

    Overrides the app's auth dependency so every request is made as
    ``mock_user``. Fix: the original imported ``database.engine`` into an
    unused local "to patch the engine" but never patched anything — that
    dead no-op was removed. NOTE(review): the app therefore still uses its
    own engine, not ``in_memory_engine``; confirm whether engine patching
    should actually be implemented.
    """
    from api.app import app
    from api.auth import get_current_user

    app.dependency_overrides[get_current_user] = lambda: mock_user

    transport = ASGITransport(app=app)
    async with AsyncClient(transport=transport, base_url="http://test") as client:
        yield client

    # Remove the auth override so later tests see the real dependency.
    app.dependency_overrides.clear()

View file

@ -0,0 +1 @@
# Integration tests package

View file

@ -0,0 +1,180 @@
"""Integration tests for API endpoints."""
from unittest.mock import AsyncMock, patch
import pytest
from httpx import AsyncClient
from api.auth import User
class TestStatusEndpoint:
    """Tests for the /api/status endpoint."""

    async def test_status_endpoint_returns_ok(
        self, async_client: AsyncClient
    ) -> None:
        """The health-check endpoint answers 200 with a static OK payload."""
        resp = await async_client.get("/api/status")
        assert resp.status_code == 200
        assert resp.json() == {"status": "OK"}
class TestListingEndpoint:
    """Tests for the /api/listing endpoint."""

    async def test_listing_endpoint_requires_auth(self) -> None:
        """Without credentials the endpoint rejects the request."""
        from httpx import ASGITransport, AsyncClient

        from api.app import app

        # Drop any auth override so the real dependency runs.
        app.dependency_overrides.clear()
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            resp = await client.get("/api/listing")
        assert resp.status_code in (401, 403)

    async def test_listing_endpoint_with_auth(
        self, async_client: AsyncClient
    ) -> None:
        """With auth stubbed, the endpoint returns a listings payload."""
        repo_stub = patch(
            "api.app.ListingRepository.get_listings",
            new_callable=AsyncMock,
            return_value=[],
        )
        with repo_stub:
            resp = await async_client.get("/api/listing")
        assert resp.status_code == 200
        assert "listings" in resp.json()
class TestListingGeoJsonEndpoint:
    """Tests for the /api/listing_geojson endpoint."""

    async def test_listing_geojson_requires_auth(self) -> None:
        """Without credentials the endpoint rejects the request."""
        from httpx import ASGITransport, AsyncClient

        from api.app import app

        # Drop any auth override so the real dependency runs.
        app.dependency_overrides.clear()
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            resp = await client.get(
                "/api/listing_geojson",
                params={"listing_type": "RENT"},
            )
        assert resp.status_code in (401, 403)

    async def test_listing_geojson_with_filters(
        self, async_client: AsyncClient
    ) -> None:
        """Filter query parameters are accepted and GeoJSON comes back."""
        query = {
            "listing_type": "RENT",
            "min_bedrooms": 2,
            "max_bedrooms": 3,
            "min_price": 1500,
            "max_price": 3000,
        }
        with patch(
            "api.app.export_immoweb",
            new_callable=AsyncMock,
            return_value={"type": "FeatureCollection", "features": []},
        ):
            resp = await async_client.get("/api/listing_geojson", params=query)
        assert resp.status_code == 200
        assert resp.json()["type"] == "FeatureCollection"
class TestGetDistrictsEndpoint:
    """Tests for the /api/get_districts endpoint."""

    async def test_get_districts_requires_auth(self) -> None:
        """Without credentials the endpoint rejects the request."""
        from httpx import ASGITransport, AsyncClient

        from api.app import app

        # Drop any auth override so the real dependency runs.
        app.dependency_overrides.clear()
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            resp = await client.get("/api/get_districts")
        assert resp.status_code in (401, 403)

    async def test_get_districts_returns_dict(
        self, async_client: AsyncClient
    ) -> None:
        """The endpoint returns a mapping that contains known districts."""
        resp = await async_client.get("/api/get_districts")
        assert resp.status_code == 200
        districts = resp.json()
        assert isinstance(districts, dict)
        for known in ("London", "Westminster", "Camden"):
            assert known in districts

    async def test_get_districts_values_are_region_ids(
        self, async_client: AsyncClient
    ) -> None:
        """Every district value is a REGION^... identifier."""
        resp = await async_client.get("/api/get_districts")
        invalid = {
            name: region_id
            for name, region_id in resp.json().items()
            if not region_id.startswith("REGION^")
        }
        assert not invalid, f"Districts with invalid region IDs: {invalid}"
class TestRefreshListingsEndpoint:
    """Tests for the /api/refresh_listings endpoint."""

    async def test_refresh_listings_requires_auth(self) -> None:
        """Without credentials the refresh endpoint rejects the request."""
        from httpx import ASGITransport, AsyncClient

        from api.app import app

        # Drop any auth override so the real dependency runs.
        app.dependency_overrides.clear()
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            resp = await client.post(
                "/api/refresh_listings",
                params={"listing_type": "RENT"},
            )
        assert resp.status_code in (401, 403)
class TestTaskStatusEndpoint:
    """Tests for the /api/task_status endpoint."""

    async def test_task_status_requires_auth(self) -> None:
        """Without credentials the task-status endpoint rejects the request."""
        from httpx import ASGITransport, AsyncClient

        from api.app import app

        # Drop any auth override so the real dependency runs.
        app.dependency_overrides.clear()
        transport = ASGITransport(app=app)
        async with AsyncClient(transport=transport, base_url="http://test") as client:
            resp = await client.get(
                "/api/task_status",
                params={"task_id": "test-task-id"},
            )
        assert resp.status_code in (401, 403)

View file

@ -0,0 +1,299 @@
"""Tests for the listing_geojson API endpoint and QueryParameters parsing."""
import json
import pytest
from datetime import datetime
from unittest.mock import patch, MagicMock, AsyncMock
class TestQueryParametersModel:
    """Test QueryParameters model directly."""

    def test_datetime_parsing_z_suffix(self):
        """A trailing 'Z' (UTC) suffix parses into a datetime."""
        from models.listing import ListingType, QueryParameters

        parsed = QueryParameters(
            listing_type=ListingType.RENT,
            let_date_available_from="2026-02-01T11:33:01.248Z",
        )
        available = parsed.let_date_available_from
        assert available is not None
        assert available.year == 2026

    def test_datetime_parsing_offset(self):
        """An explicit +00:00 offset parses into a datetime."""
        from models.listing import ListingType, QueryParameters

        parsed = QueryParameters(
            listing_type=ListingType.RENT,
            let_date_available_from="2026-02-01T11:33:01.248+00:00",
        )
        assert parsed.let_date_available_from is not None

    def test_defaults_work(self):
        """Omitted fields fall back to their documented defaults."""
        from models.listing import ListingType, QueryParameters

        defaults = QueryParameters(listing_type=ListingType.RENT)
        assert defaults.min_bedrooms == 1
        assert defaults.max_bedrooms == 999
        assert defaults.min_price == 0
        assert defaults.max_price == 10_000_000
        assert defaults.district_names == set()
        assert defaults.let_date_available_from is None

    def test_full_frontend_params(self):
        """The full set of parameters the frontend sends round-trips."""
        from models.listing import ListingType, QueryParameters

        parsed = QueryParameters(
            listing_type=ListingType.RENT,
            min_bedrooms=1,
            max_bedrooms=3,
            max_price=3000,
            min_price=2000,
            min_sqm=50,
            last_seen_days=28,
            let_date_available_from="2026-02-01T11:19:22.072Z",
        )
        assert parsed.listing_type == ListingType.RENT
        assert parsed.min_bedrooms == 1
        assert parsed.max_bedrooms == 3
        assert parsed.min_sqm == 50
class TestGetQueryParametersDependency:
    """Test the get_query_parameters FastAPI dependency."""

    def test_parses_datetime_correctly(self):
        """A datetime argument passes through the dependency intact."""
        from api.app import get_query_parameters
        from models.listing import ListingType

        result = get_query_parameters(
            listing_type=ListingType.RENT,
            let_date_available_from=datetime(2026, 2, 1, 11, 33, 1),
        )
        assert result.let_date_available_from is not None

    def test_defaults_applied(self):
        """Omitted query params fall back to defaults."""
        from api.app import get_query_parameters
        from models.listing import ListingType

        result = get_query_parameters(listing_type=ListingType.RENT)
        assert result.min_bedrooms == 1
        assert result.max_bedrooms == 999
class TestListingGeoJsonEndpoint:
    """Test the /api/listing_geojson endpoint."""

    @pytest.fixture
    def client(self):
        """Synchronous test client with the auth dependency stubbed."""
        from fastapi.testclient import TestClient

        from api.app import app, get_current_user
        from api.auth import User

        async def mock_auth():
            return User(email="test@example.com", name="Test User")

        app.dependency_overrides[get_current_user] = mock_auth
        yield TestClient(app)
        app.dependency_overrides.clear()

    @pytest.fixture
    def mock_export(self):
        """Stub the export service so no database work happens."""
        stub_result = MagicMock(
            data={"type": "FeatureCollection", "features": [{"type": "Feature"}]}
        )
        with patch("api.app.export_service.export_to_geojson") as mocked:
            mocked.return_value = stub_result
            yield mocked

    def test_minimal_params_no_422(self, client, mock_export):
        """Only listing_type set: request must not fail validation."""
        resp = client.get("/api/listing_geojson?listing_type=RENT")
        assert resp.status_code != 422, f"Got 422: {resp.json()}"

    def test_with_datetime_z_suffix_no_422(self, client, mock_export):
        """A Z-suffixed datetime query param must not fail validation."""
        url = (
            "/api/listing_geojson?"
            "listing_type=RENT"
            "&let_date_available_from=2026-02-01T11:33:01.248Z"
        )
        resp = client.get(url)
        assert resp.status_code != 422, f"Got 422: {resp.json()}"

    def test_full_frontend_params_no_422(self, client, mock_export):
        """The complete frontend query string must not fail validation."""
        url = (
            "/api/listing_geojson?"
            "listing_type=RENT"
            "&min_bedrooms=1"
            "&max_bedrooms=3"
            "&max_price=3000"
            "&min_price=2000"
            "&min_sqm=50"
            "&last_seen_days=28"
            "&let_date_available_from=2026-02-01T11:19:22.072Z"
        )
        resp = client.get(url)
        assert resp.status_code != 422, f"Got 422: {resp.json()}"

    def test_returns_geojson_structure(self, client, mock_export):
        """Response is a GeoJSON FeatureCollection."""
        resp = client.get("/api/listing_geojson?listing_type=RENT")
        assert resp.status_code == 200
        body = resp.json()
        assert "type" in body
        assert body["type"] == "FeatureCollection"
        assert "features" in body
class TestStreamingEndpoint:
    """Test the /api/listing_geojson/stream endpoint."""

    @pytest.fixture
    def client(self):
        """Synchronous test client with the auth dependency stubbed."""
        from fastapi.testclient import TestClient

        from api.app import app
        from api.auth import User, get_current_user

        async def mock_auth():
            return User(sub="test-id", email="test@example.com", name="Test User")

        app.dependency_overrides[get_current_user] = mock_auth
        yield TestClient(app)
        app.dependency_overrides.clear()

    @pytest.fixture
    def mock_repository(self):
        """Stub the repository with three canned listing rows."""

        def row(id_, price, beds, sqm, lon, lat, thumb, agency, history, available):
            return {
                'id': id_,
                'price': price,
                'number_of_bedrooms': beds,
                'square_meters': sqm,
                'longitude': lon,
                'latitude': lat,
                'photo_thumbnail': thumb,
                'last_seen': datetime.now(),
                'agency': agency,
                'price_history_json': history,
                'available_from': available,
            }

        rows = [
            row(1, 2000.0, 2, 50.0, -0.1, 51.5, 'https://example.com/1.jpg',
                'Test Agency', '[]', datetime.now()),
            row(2, 2500.0, 2, 60.0, -0.12, 51.51, 'https://example.com/2.jpg',
                'Test Agency 2', '[]', None),
            # id=3 deliberately lacks square_meters / thumbnail / agency.
            row(3, 3000.0, 3, None, -0.14, 51.52, None, None,
                '[{"first_seen": "2026-01-01", "last_seen": "2026-01-15", "price": 2800}]',
                None),
        ]

        with patch("api.app.ListingRepository") as MockRepo:
            stub = MagicMock()
            stub.count_listings.return_value = 3
            stub.stream_listings_optimized.return_value = iter(rows)
            MockRepo.return_value = stub
            yield stub

    @staticmethod
    def _messages(response):
        """Decode an NDJSON response body into a list of dicts."""
        return [json.loads(line) for line in response.text.strip().split("\n")]

    def test_streaming_returns_ndjson(self, client, mock_repository):
        """Streaming endpoint replies with the NDJSON content type."""
        resp = client.get("/api/listing_geojson/stream?listing_type=RENT&limit=10")
        assert resp.status_code == 200
        assert resp.headers["content-type"] == "application/x-ndjson"

    def test_streaming_metadata_includes_total_expected(self, client, mock_repository):
        """The first NDJSON line is metadata carrying the expected total."""
        resp = client.get("/api/listing_geojson/stream?listing_type=RENT&limit=10")
        messages = self._messages(resp)
        assert len(messages) >= 1
        metadata = messages[0]
        assert metadata["type"] == "metadata"
        assert "total_expected" in metadata
        assert metadata["total_expected"] == 3
        assert "batch_size" in metadata

    def test_streaming_returns_batches_and_complete(self, client, mock_repository):
        """Stream is metadata, then >=1 batch, then a complete message."""
        resp = client.get("/api/listing_geojson/stream?listing_type=RENT&limit=10")
        messages = self._messages(resp)
        assert messages[0]["type"] == "metadata"
        batches = [m for m in messages if m["type"] == "batch"]
        assert len(batches) >= 1
        final = messages[-1]
        assert final["type"] == "complete"
        assert "total" in final

    def test_streaming_features_have_correct_structure(self, client, mock_repository):
        """Streamed features are well-formed GeoJSON Point features."""
        resp = client.get(
            "/api/listing_geojson/stream?listing_type=RENT&batch_size=10&limit=10"
        )
        batches = [m for m in self._messages(resp) if m["type"] == "batch"]
        assert len(batches) >= 1
        features = batches[0]["features"]
        assert len(features) > 0
        first = features[0]
        assert first["type"] == "Feature"
        assert "properties" in first
        assert "geometry" in first
        assert first["geometry"]["type"] == "Point"
        assert "coordinates" in first["geometry"]
        # Domain-specific properties the frontend relies on.
        for expected_key in ("total_price", "rooms", "url", "last_seen"):
            assert expected_key in first["properties"]

    def test_streaming_handles_null_square_meters(self, client, mock_repository):
        """A row with null square_meters streams with null qm/qmprice."""
        resp = client.get(
            "/api/listing_geojson/stream?listing_type=RENT&batch_size=10&limit=10"
        )
        assert resp.status_code == 200
        for message in self._messages(resp):
            if message["type"] != "batch":
                continue
            # Feature for id=3 is the one with the missing square_meters.
            for feature in message["features"]:
                if feature["properties"]["url"].endswith("/3"):
                    assert feature["properties"]["qm"] is None
                    assert feature["properties"]["qmprice"] is None

View file

@ -0,0 +1 @@
# Unit tests package

View file

@ -0,0 +1,343 @@
"""Unit tests for Listing models."""
from datetime import datetime
import json
import pytest
from models.listing import (
BuyListing,
FurnishType,
ListingSite,
PriceHistoryItem,
RentListing,
Listing,
)
class TestListing:
    """Tests for the base Listing model."""

    @staticmethod
    def _make_rent_listing(**overrides) -> RentListing:
        """Build a valid RentListing with sensible defaults.

        Keyword overrides replace individual fields, so each test only spells
        out what it is actually exercising instead of all 18 constructor args.
        """
        defaults = dict(
            id=1,
            price=2000.0,
            number_of_bedrooms=2,
            square_meters=50.0,
            agency="Test",
            council_tax_band="C",
            longitude=0.0,
            latitude=0.0,
            price_history_json="[]",
            listing_site=ListingSite.RIGHTMOVE,
            last_seen=datetime.now(),
            photo_thumbnail=None,
            floorplan_image_paths=[],
            additional_info={"property": {"visible": True}},
            routing_info_json=None,
            furnish_type=FurnishType.FURNISHED,
            available_from=None,
        )
        defaults.update(overrides)
        return RentListing(**defaults)

    def test_price_per_square_meter_calculation(self) -> None:
        """price_per_square_meter divides price by square_meters (2000/50 == 40)."""
        assert self._make_rent_listing().price_per_square_meter == 40.0

    def test_price_per_square_meter_none_when_no_sqm(self) -> None:
        """price_per_square_meter is None when square_meters is None."""
        listing = self._make_rent_listing(square_meters=None)
        assert listing.price_per_square_meter is None

    def test_price_per_square_meter_none_when_sqm_zero(self) -> None:
        """price_per_square_meter is None when square_meters is 0 (no ZeroDivisionError)."""
        listing = self._make_rent_listing(square_meters=0.0)
        assert listing.price_per_square_meter is None

    def test_url_property(self) -> None:
        """url builds the Rightmove property URL from the listing id."""
        listing = self._make_rent_listing(id=123456789)
        assert listing.url == "https://www.rightmove.co.uk/properties/123456789"

    def test_is_removed_property_visible(self) -> None:
        """is_removed is False while additional_info marks the property visible."""
        assert self._make_rent_listing().is_removed is False

    def test_is_removed_property_not_visible(self) -> None:
        """is_removed is True when additional_info marks the property not visible."""
        listing = self._make_rent_listing(
            additional_info={"property": {"visible": False}}
        )
        assert listing.is_removed is True
class TestPriceHistory:
    """Tests for price history serialization/deserialization."""

    @staticmethod
    def _make_listing(price_history_json: str, **overrides) -> RentListing:
        """Build a valid RentListing carrying *price_history_json*.

        Keyword overrides replace individual fields so tests avoid repeating
        the full constructor call.
        """
        defaults = dict(
            id=1,
            price=2000.0,
            number_of_bedrooms=2,
            square_meters=50.0,
            agency="Test",
            council_tax_band="C",
            longitude=0.0,
            latitude=0.0,
            price_history_json=price_history_json,
            listing_site=ListingSite.RIGHTMOVE,
            last_seen=datetime.now(),
            photo_thumbnail=None,
            floorplan_image_paths=[],
            additional_info={"property": {"visible": True}},
            routing_info_json=None,
            furnish_type=FurnishType.FURNISHED,
            available_from=None,
        )
        defaults.update(overrides)
        return RentListing(**defaults)

    def test_price_history_serialization_roundtrip(self) -> None:
        """Price history survives serialize -> store on listing -> deserialize."""
        now = datetime.now()
        history = [
            PriceHistoryItem(first_seen=now, last_seen=now, price=2000.0),
            PriceHistoryItem(first_seen=now, last_seen=now, price=2100.0),
        ]

        serialized = Listing.serialize_price_history(history)
        assert isinstance(serialized, str)

        listing = self._make_listing(serialized, price=2100.0, last_seen=now)

        restored = listing.price_history
        assert len(restored) == 2
        assert restored[0].price == 2000.0
        assert restored[1].price == 2100.0

    def test_price_history_empty(self) -> None:
        """An empty price_history_json string deserializes to an empty list."""
        assert self._make_listing("").price_history == []

    def test_price_history_item_to_dict(self) -> None:
        """to_dict() emits the price plus ISO-formatted timestamps."""
        now = datetime.now()
        item = PriceHistoryItem(first_seen=now, last_seen=now, price=2500.0)

        result = item.to_dict()

        assert result["price"] == 2500.0
        assert result["first_seen"] == now.isoformat()
        assert result["last_seen"] == now.isoformat()
class TestRentListing:
    """Tests specific to RentListing model."""

    @staticmethod
    def _make_listing(**overrides) -> RentListing:
        """Build a valid RentListing; keyword overrides replace default fields."""
        defaults = dict(
            id=1,
            price=2000.0,
            number_of_bedrooms=2,
            square_meters=50.0,
            agency="Test",
            council_tax_band="C",
            longitude=0.0,
            latitude=0.0,
            price_history_json="[]",
            listing_site=ListingSite.RIGHTMOVE,
            last_seen=datetime.now(),
            photo_thumbnail=None,
            floorplan_image_paths=[],
            additional_info={"property": {"visible": True}},
            routing_info_json=None,
            furnish_type=FurnishType.FURNISHED,
            available_from=None,
        )
        defaults.update(overrides)
        return RentListing(**defaults)

    def test_rent_listing_has_furnish_type(self) -> None:
        """RentListing exposes the furnish_type it was constructed with."""
        listing = self._make_listing(furnish_type=FurnishType.PART_FURNISHED)
        assert listing.furnish_type == FurnishType.PART_FURNISHED

    def test_rent_listing_has_available_from(self) -> None:
        """RentListing exposes the available_from it was constructed with."""
        now = datetime.now()
        listing = self._make_listing(last_seen=now, available_from=now)
        assert listing.available_from == now
class TestBuyListing:
    """Tests specific to BuyListing model."""

    @staticmethod
    def _make_listing(**overrides) -> BuyListing:
        """Build a valid BuyListing; keyword overrides replace default fields."""
        defaults = dict(
            id=1,
            price=450000.0,
            number_of_bedrooms=3,
            square_meters=95.0,
            agency="Test",
            council_tax_band="D",
            longitude=0.0,
            latitude=0.0,
            price_history_json="[]",
            listing_site=ListingSite.RIGHTMOVE,
            last_seen=datetime.now(),
            photo_thumbnail=None,
            floorplan_image_paths=[],
            additional_info={"property": {"visible": True}},
            routing_info_json=None,
            service_charge=None,
            lease_left=85,
        )
        defaults.update(overrides)
        return BuyListing(**defaults)

    def test_buy_listing_has_service_charge(self) -> None:
        """BuyListing exposes the service_charge it was constructed with."""
        listing = self._make_listing(service_charge=2500.0)
        assert listing.service_charge == 2500.0

    def test_buy_listing_has_lease_left(self) -> None:
        """BuyListing exposes the lease_left it was constructed with."""
        listing = self._make_listing(service_charge=None, lease_left=120)
        assert listing.lease_left == 120

View file

@ -0,0 +1,74 @@
"""Unit tests for Redis distributed lock."""
from unittest import mock
import pytest
from utils.redis_lock import redis_lock, get_redis_client
class TestRedisLock:
    """Tests for redis_lock context manager."""

    @staticmethod
    def _client_with_set_result(set_result):
        """Mock Redis client whose SET (NX) call returns *set_result*."""
        client = mock.MagicMock()
        client.set.return_value = set_result
        return client

    @mock.patch("utils.redis_lock.get_redis_client")
    def test_lock_acquired_successfully(self, mock_get_client):
        """Lock is acquired — and deleted on exit — when no other lock exists."""
        client = self._client_with_set_result(True)
        mock_get_client.return_value = client

        with redis_lock("test_lock") as acquired:
            assert acquired is True

        client.set.assert_called_once_with("lock:test_lock", "1", nx=True, ex=3600 * 4)
        client.delete.assert_called_once_with("lock:test_lock")

    @mock.patch("utils.redis_lock.get_redis_client")
    def test_lock_not_acquired(self, mock_get_client):
        """Lock is reported unavailable when another holder owns it."""
        # redis-py returns None when SET ... NX fails to set the key.
        client = self._client_with_set_result(None)
        mock_get_client.return_value = client

        with redis_lock("test_lock") as acquired:
            assert acquired is False

        client.set.assert_called_once_with("lock:test_lock", "1", nx=True, ex=3600 * 4)
        # We never owned the lock, so we must not delete it.
        client.delete.assert_not_called()

    @mock.patch("utils.redis_lock.get_redis_client")
    def test_lock_released_on_exception(self, mock_get_client):
        """The lock is released even when the guarded block raises."""
        client = self._client_with_set_result(True)
        mock_get_client.return_value = client

        with pytest.raises(ValueError):
            with redis_lock("test_lock") as acquired:
                assert acquired is True
                raise ValueError("Test error")

        # Lock should still be released.
        client.delete.assert_called_once_with("lock:test_lock")

    @mock.patch("utils.redis_lock.get_redis_client")
    def test_custom_timeout(self, mock_get_client):
        """A custom timeout is forwarded to SET's expiry argument."""
        client = self._client_with_set_result(True)
        mock_get_client.return_value = client

        with redis_lock("test_lock", timeout=300) as acquired:
            assert acquired is True

        client.set.assert_called_once_with("lock:test_lock", "1", nx=True, ex=300)

    @mock.patch("utils.redis_lock.redis")
    def test_get_redis_client_uses_broker_url(self, mock_redis):
        """The Redis client is built from CELERY_BROKER_URL."""
        with mock.patch.dict(
            "os.environ", {"CELERY_BROKER_URL": "redis://testhost:1234/5"}
        ):
            get_redis_client()
        mock_redis.from_url.assert_called_once_with(
            "redis://testhost:1234/5", decode_responses=True
        )

View file

@ -0,0 +1,227 @@
"""Unit tests for ListingRepository."""
from datetime import datetime, timedelta
import pytest
from sqlalchemy import Engine
from models.listing import (
FurnishType,
ListingType,
QueryParameters,
RentListing,
)
from repositories.listing_repository import ListingRepository
class TestListingRepository:
    """Tests for ListingRepository methods."""

    # NOTE(review): these are bare `async def` tests — presumably pytest-asyncio
    # runs them via asyncio_mode=auto; confirm against the project's pytest config.

    async def test_get_listings_empty_db(
        self, listing_repository: ListingRepository
    ) -> None:
        """An empty database yields an empty listings list."""
        assert await listing_repository.get_listings() == []

    async def test_get_listings_returns_inserted_listings(
        self,
        listing_repository: ListingRepository,
        sample_rent_listing: RentListing,
    ) -> None:
        """Listings that were upserted come back from get_listings."""
        await listing_repository.upsert_listings([sample_rent_listing])

        fetched = await listing_repository.get_listings()

        assert len(fetched) == 1
        assert fetched[0].id == sample_rent_listing.id

    async def test_upsert_listings_creates_new(
        self,
        listing_repository: ListingRepository,
        sample_rent_listing: RentListing,
    ) -> None:
        """upsert_listings inserts listings that do not yet exist."""
        inserted = await listing_repository.upsert_listings([sample_rent_listing])

        assert len(inserted) == 1
        assert inserted[0].id == sample_rent_listing.id
        # The row must now be persisted.
        assert len(await listing_repository.get_listings()) == 1

    async def test_upsert_listings_updates_existing(
        self,
        listing_repository: ListingRepository,
        sample_rent_listing: RentListing,
    ) -> None:
        """A second upsert of the same id updates the stored row in place."""
        await listing_repository.upsert_listings([sample_rent_listing])

        sample_rent_listing.price = 3000.0
        await listing_repository.upsert_listings([sample_rent_listing])

        stored = await listing_repository.get_listings()
        assert len(stored) == 1
        assert stored[0].price == 3000.0

    async def test_mark_seen_updates_timestamp(
        self,
        listing_repository: ListingRepository,
        sample_rent_listing: RentListing,
    ) -> None:
        """mark_seen advances the stored last_seen timestamp."""
        week_ago = datetime.now() - timedelta(days=7)
        sample_rent_listing.last_seen = week_ago
        await listing_repository.upsert_listings([sample_rent_listing])

        await listing_repository.mark_seen(sample_rent_listing.id)

        stored = await listing_repository.get_listings()
        assert len(stored) == 1
        assert stored[0].last_seen > week_ago

    async def test_mark_seen_nonexistent_listing(
        self, listing_repository: ListingRepository
    ) -> None:
        """mark_seen on an unknown id must not raise."""
        await listing_repository.mark_seen(999999)

    async def test_get_listings_with_only_ids(
        self,
        listing_repository: ListingRepository,
        sample_rent_listings: list[RentListing],
    ) -> None:
        """only_ids restricts the result to exactly the requested ids."""
        await listing_repository.upsert_listings(sample_rent_listings)

        fetched = await listing_repository.get_listings(only_ids=[1, 3])

        assert sorted(listing.id for listing in fetched) == [1, 3]

    async def test_get_listings_with_limit(
        self,
        listing_repository: ListingRepository,
        sample_rent_listings: list[RentListing],
    ) -> None:
        """The limit parameter caps the number of returned rows."""
        await listing_repository.upsert_listings(sample_rent_listings)

        assert len(await listing_repository.get_listings(limit=2)) == 2
class TestListingRepositoryFilters:
    """Tests for ListingRepository query parameter filtering."""

    async def test_filter_by_bedrooms(
        self,
        listing_repository: ListingRepository,
        sample_rent_listings: list[RentListing],
    ) -> None:
        """A bedroom range of exactly 2 matches only the 2-bed listing."""
        await listing_repository.upsert_listings(sample_rent_listings)

        params = QueryParameters(
            listing_type=ListingType.RENT, min_bedrooms=2, max_bedrooms=2
        )
        matched = await listing_repository.get_listings(query_parameters=params)

        assert len(matched) == 1
        assert matched[0].number_of_bedrooms == 2

    async def test_filter_by_price_range(
        self,
        listing_repository: ListingRepository,
        sample_rent_listings: list[RentListing],
    ) -> None:
        """Only listings priced within [min_price, max_price] are returned."""
        await listing_repository.upsert_listings(sample_rent_listings)

        params = QueryParameters(
            listing_type=ListingType.RENT, min_price=1800, max_price=2500
        )
        matched = await listing_repository.get_listings(query_parameters=params)

        assert len(matched) == 1
        assert matched[0].price == 2000.0

    async def test_filter_by_min_sqm(
        self,
        listing_repository: ListingRepository,
        sample_rent_listings: list[RentListing],
    ) -> None:
        """min_sqm excludes listings smaller than the threshold."""
        await listing_repository.upsert_listings(sample_rent_listings)

        params = QueryParameters(listing_type=ListingType.RENT, min_sqm=60)
        matched = await listing_repository.get_listings(query_parameters=params)

        assert len(matched) == 1
        assert matched[0].square_meters == 80.0

    async def test_filter_by_furnish_type(
        self,
        listing_repository: ListingRepository,
        sample_rent_listings: list[RentListing],
    ) -> None:
        """furnish_types restricts results to the requested furnishing."""
        await listing_repository.upsert_listings(sample_rent_listings)

        params = QueryParameters(
            listing_type=ListingType.RENT,
            furnish_types=[FurnishType.UNFURNISHED],
        )
        matched = await listing_repository.get_listings(query_parameters=params)

        assert len(matched) == 1
        assert matched[0].furnish_type == FurnishType.UNFURNISHED

    async def test_filter_by_last_seen_days(
        self,
        listing_repository: ListingRepository,
        sample_rent_listings: list[RentListing],
    ) -> None:
        """Listings older than last_seen_days are filtered out."""
        # Age one listing beyond the window before inserting.
        sample_rent_listings[0].last_seen = datetime.now() - timedelta(days=30)
        await listing_repository.upsert_listings(sample_rent_listings)

        params = QueryParameters(listing_type=ListingType.RENT, last_seen_days=7)
        matched = await listing_repository.get_listings(query_parameters=params)

        # The 30-day-old listing drops out; the other two remain.
        assert len(matched) == 2

    async def test_combined_filters(
        self,
        listing_repository: ListingRepository,
        sample_rent_listings: list[RentListing],
    ) -> None:
        """Multiple filters are combined (ANDed) in a single query."""
        await listing_repository.upsert_listings(sample_rent_listings)

        params = QueryParameters(
            listing_type=ListingType.RENT,
            min_bedrooms=1,
            max_bedrooms=2,
            min_price=1000,
            max_price=2500,
            furnish_types=[FurnishType.FURNISHED, FurnishType.UNFURNISHED],
        )
        matched = await listing_repository.get_listings(query_parameters=params)

        # Listings with 1-2 bedrooms inside the price range.
        assert len(matched) == 2

View file

@ -0,0 +1,293 @@
"""Unit tests for schedule configuration."""
import os
from unittest import mock
import pytest
from pydantic import ValidationError
from config.schedule_config import ScheduleConfig, SchedulesConfig
from models.listing import FurnishType, ListingType
class TestScheduleConfig:
    """Tests for ScheduleConfig model."""

    def test_basic_creation_with_defaults(self):
        """A schedule built from only the required fields gets the documented defaults."""
        cfg = ScheduleConfig(name="Test Schedule", listing_type=ListingType.RENT)

        assert cfg.name == "Test Schedule"
        assert cfg.listing_type == ListingType.RENT
        # Defaults: enabled, daily at 02:00, wide-open filters.
        assert cfg.enabled is True
        assert (cfg.minute, cfg.hour, cfg.day_of_week) == ("0", "2", "*")
        assert (cfg.min_bedrooms, cfg.max_bedrooms) == (1, 999)
        assert (cfg.min_price, cfg.max_price) == (0, 10_000_000)
        assert cfg.district_names == []
        assert cfg.furnish_types is None

    def test_full_creation(self):
        """Every field round-trips when explicitly provided."""
        cfg = ScheduleConfig(
            name="Full Schedule",
            enabled=False,
            minute="30",
            hour="4",
            day_of_week="1,3,5",
            listing_type=ListingType.BUY,
            min_bedrooms=2,
            max_bedrooms=3,
            min_price=400000,
            max_price=800000,
            district_names=["Westminster", "Camden"],
            furnish_types=["furnished", "unfurnished"],
        )

        assert cfg.name == "Full Schedule"
        assert cfg.enabled is False
        assert (cfg.minute, cfg.hour, cfg.day_of_week) == ("30", "4", "1,3,5")
        assert cfg.listing_type == ListingType.BUY
        assert (cfg.min_bedrooms, cfg.max_bedrooms) == (2, 3)
        assert (cfg.min_price, cfg.max_price) == (400000, 800000)
        assert cfg.district_names == ["Westminster", "Camden"]
        assert cfg.furnish_types == ["furnished", "unfurnished"]

    def test_to_query_parameters(self):
        """to_query_parameters maps schedule fields onto QueryParameters."""
        cfg = ScheduleConfig(
            name="Test",
            listing_type=ListingType.RENT,
            min_bedrooms=2,
            max_bedrooms=3,
            min_price=2000,
            max_price=4000,
            district_names=["Westminster"],
            furnish_types=["furnished"],
        )

        params = cfg.to_query_parameters()

        assert params.listing_type == ListingType.RENT
        assert (params.min_bedrooms, params.max_bedrooms) == (2, 3)
        assert (params.min_price, params.max_price) == (2000, 4000)
        # District names become a set; furnish strings become enum members.
        assert params.district_names == {"Westminster"}
        assert params.furnish_types == [FurnishType.FURNISHED]

    def test_to_query_parameters_no_furnish_types(self):
        """A None furnish_types is passed through unchanged."""
        cfg = ScheduleConfig(name="Test", listing_type=ListingType.BUY)
        assert cfg.to_query_parameters().furnish_types is None
class TestCronValidation:
    """Tests for cron field validation."""

    @pytest.mark.parametrize("minute", ["0", "59", "*", "*/5", "*/15", "0,15,30,45"])
    def test_valid_minute(self, minute: str):
        """Well-formed minute expressions are accepted verbatim."""
        cfg = ScheduleConfig(name="Test", listing_type=ListingType.RENT, minute=minute)
        assert cfg.minute == minute

    @pytest.mark.parametrize("minute", ["60", "-1", "abc", "*/0"])
    def test_invalid_minute(self, minute: str):
        """Out-of-range or malformed minute expressions are rejected."""
        with pytest.raises(ValidationError):
            ScheduleConfig(name="Test", listing_type=ListingType.RENT, minute=minute)

    @pytest.mark.parametrize("hour", ["0", "23", "*", "*/6", "0,6,12,18"])
    def test_valid_hour(self, hour: str):
        """Well-formed hour expressions are accepted verbatim."""
        cfg = ScheduleConfig(name="Test", listing_type=ListingType.RENT, hour=hour)
        assert cfg.hour == hour

    @pytest.mark.parametrize("hour", ["24", "-1", "abc", "*/0"])
    def test_invalid_hour(self, hour: str):
        """Out-of-range or malformed hour expressions are rejected."""
        with pytest.raises(ValidationError):
            ScheduleConfig(name="Test", listing_type=ListingType.RENT, hour=hour)

    @pytest.mark.parametrize("day_of_week", ["0", "6", "*", "1,3,5", "*/2"])
    def test_valid_day_of_week(self, day_of_week: str):
        """Well-formed day_of_week expressions are accepted verbatim."""
        cfg = ScheduleConfig(
            name="Test", listing_type=ListingType.RENT, day_of_week=day_of_week
        )
        assert cfg.day_of_week == day_of_week

    @pytest.mark.parametrize("day_of_week", ["7", "-1", "abc", "*/0"])
    def test_invalid_day_of_week(self, day_of_week: str):
        """Out-of-range or malformed day_of_week expressions are rejected."""
        with pytest.raises(ValidationError):
            ScheduleConfig(
                name="Test", listing_type=ListingType.RENT, day_of_week=day_of_week
            )
class TestSchedulesConfig:
    """Tests for SchedulesConfig container."""

    def test_from_env_empty(self):
        """An empty SCRAPE_SCHEDULES value yields no schedules."""
        with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": ""}, clear=False):
            config = SchedulesConfig.from_env()
        assert config.schedules == []

    def test_from_env_missing(self):
        """An unset SCRAPE_SCHEDULES variable yields no schedules."""
        # clear=True empties the patched environ, so SCRAPE_SCHEDULES is
        # guaranteed absent (the previous explicit pop inside the context
        # was redundant dead code).
        with mock.patch.dict(os.environ, {}, clear=True):
            config = SchedulesConfig.from_env()
        assert config.schedules == []

    def test_from_env_valid_single(self):
        """A single valid schedule is parsed from the JSON array."""
        json_config = '[{"name":"Daily RENT","listing_type":"RENT","hour":"2"}]'
        with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": json_config}):
            config = SchedulesConfig.from_env()

        assert len(config.schedules) == 1
        assert config.schedules[0].name == "Daily RENT"
        assert config.schedules[0].listing_type == ListingType.RENT
        assert config.schedules[0].hour == "2"

    def test_from_env_valid_multiple(self):
        """Multiple schedules parse, preserving order and the enabled flag."""
        json_config = """[
            {"name":"Daily RENT","listing_type":"RENT","hour":"2"},
            {"name":"Daily BUY","listing_type":"BUY","hour":"4","enabled":false}
        ]"""
        with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": json_config}):
            config = SchedulesConfig.from_env()

        assert len(config.schedules) == 2
        assert config.schedules[0].name == "Daily RENT"
        assert config.schedules[0].enabled is True
        assert config.schedules[1].name == "Daily BUY"
        assert config.schedules[1].enabled is False

    def test_from_env_invalid_json(self):
        """Unparseable JSON raises a ValueError mentioning the cause."""
        with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": "not json"}):
            with pytest.raises(ValueError, match="Invalid JSON"):
                SchedulesConfig.from_env()

    def test_from_env_not_array(self):
        """A JSON object (not an array) is rejected with a clear message."""
        with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": '{"name":"test"}'}):
            with pytest.raises(ValueError, match="must be a JSON array"):
                SchedulesConfig.from_env()

    def test_from_env_invalid_schedule(self):
        """A schedule missing required fields fails pydantic validation."""
        # Missing required listing_type.
        json_config = '[{"name":"Invalid"}]'
        with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": json_config}):
            with pytest.raises(ValidationError):
                SchedulesConfig.from_env()

    def test_get_enabled_schedules(self):
        """Only enabled schedules are returned, in their original order."""
        config = SchedulesConfig(
            schedules=[
                ScheduleConfig(name="Enabled", listing_type=ListingType.RENT, enabled=True),
                ScheduleConfig(name="Disabled", listing_type=ListingType.BUY, enabled=False),
                ScheduleConfig(name="Also Enabled", listing_type=ListingType.RENT, enabled=True),
            ]
        )

        enabled = config.get_enabled_schedules()

        assert [schedule.name for schedule in enabled] == ["Enabled", "Also Enabled"]

    def test_get_enabled_schedules_all_disabled(self):
        """When every schedule is disabled, the result is empty."""
        config = SchedulesConfig(
            schedules=[
                ScheduleConfig(name="Disabled1", listing_type=ListingType.RENT, enabled=False),
                ScheduleConfig(name="Disabled2", listing_type=ListingType.BUY, enabled=False),
            ]
        )
        assert config.get_enabled_schedules() == []

View file

@ -0,0 +1,4 @@
"""Utility modules."""
from utils.redis_lock import redis_lock
__all__ = ["redis_lock"]

View file

@ -0,0 +1,50 @@
"""Redis-based distributed locking for task coordination."""
import logging
import os
from contextlib import contextmanager
from typing import Generator
import redis
logger = logging.getLogger("uvicorn.error")
def get_redis_client() -> redis.Redis:
    """Build a Redis client from the Celery broker URL.

    Falls back to a local Redis instance when ``CELERY_BROKER_URL`` is unset.
    A fresh client is created on every call.
    """
    url = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
    return redis.from_url(url, decode_responses=True)
@contextmanager
def redis_lock(
    lock_name: str, timeout: int = 3600 * 4
) -> Generator[bool, None, None]:
    """Distributed lock using Redis SET NX.

    Args:
        lock_name: Unique name for the lock.
        timeout: Lock expiration time in seconds (default: 4 hours).

    Yields:
        bool: True if the lock was acquired, False otherwise.

    Example:
        with redis_lock("scrape_listings") as acquired:
            if not acquired:
                logger.warning("Another scrape is already running")
                return
            # ... do work ...
    """
    client = get_redis_client()
    key = f"lock:{lock_name}"
    # SET ... NX returns a truthy value only when the key did not already exist.
    got_lock = client.set(key, "1", nx=True, ex=timeout)
    try:
        yield bool(got_lock)
    finally:
        if got_lock:
            # NOTE(review): if the guarded work outlives `timeout`, the key
            # expires and another process may hold the lock by now; this
            # unconditional delete would then release *their* lock. A unique
            # token plus compare-and-delete would be safer — confirm whether
            # tasks can run longer than the timeout.
            client.delete(key)
            logger.info(f"Released lock: {lock_name}")