From d205d15c74293a511dcf06bb0eb82055cdc6aa98 Mon Sep 17 00:00:00 2001 From: Viktor Barzin Date: Fri, 6 Feb 2026 20:55:10 +0000 Subject: [PATCH] Add services layer, tests, streaming UI, and clean up legacy code --- ...-used_DO_NOT_REMOVE_MANUALLY_SECURITY_RISK | 3 + crawler/.claude/settings.local.json | 124 +++++++ .../python-313-redis-generic-type/SKILL.md | 101 ++++++ .../SKILL.md | 132 +++++++ crawler/91_recalculate_floorplan.py | 13 - crawler/9_recalculate_regex_squaremeter.py | 15 - crawler/Dockerfile | 7 +- .../a1b2c3d4e5f6_add_streaming_indexes.py | 56 +++ ...5f1bc4e3323_fix_typo_in_logitude_column.py | 88 +---- crawler/csv_exporter.py | 50 +-- crawler/data_access.py | 105 +----- crawler/docker-compose.yml | 22 +- crawler/docs/BACKEND.md | 183 ++++++++++ crawler/frontend/src/AppSidebar.tsx | 113 +----- .../frontend/src/components/Parameters.tsx | 65 +--- crawler/frontend/src/components/StatsBar.tsx | 128 +++++++ .../src/components/StreamingProgressBar.tsx | 47 +++ .../frontend/src/components/ui/accordion.tsx | 56 +++ crawler/frontend/src/components/ui/alert.tsx | 66 ---- crawler/frontend/src/components/ui/badge.tsx | 46 --- .../frontend/src/components/ui/breadcrumb.tsx | 21 +- .../frontend/src/components/ui/checkbox.tsx | 29 ++ crawler/frontend/src/components/ui/slider.tsx | 34 ++ crawler/frontend/src/index.css | 27 ++ crawler/frontend/src/services/apiClient.ts | 62 ++++ .../frontend/src/services/listingService.ts | 54 +++ crawler/frontend/src/utils/mapUtils.ts | 45 +++ crawler/frontend/tsconfig.app.tsbuildinfo | 2 +- crawler/frontend/vite.config.ts | 3 +- crawler/main.py | 319 +++++++++------- crawler/main_tmp.py | 40 -- crawler/models/listing.py | 13 +- crawler/poetry.lock | 36 +- crawler/proof_of_concept/image.py | 13 - crawler/proof_of_concept/listings.py | 67 ---- .../routing_distancematrix.py | 22 -- crawler/proof_of_concept/routing_routing.py | 83 ----- crawler/proof_of_concept/single-query.py | 20 - crawler/rec/districts.py | 2 +- 
crawler/rec/floorplan.py | 12 +- crawler/rec/route_serializer.py | 41 +++ crawler/services/__init__.py | 41 +++ crawler/services/district_service.py | 38 ++ crawler/services/export_service.py | 92 +++++ crawler/services/floorplan_detector.py | 42 +++ crawler/services/image_fetcher.py | 55 +++ crawler/services/listing_service.py | 168 +++++++++ .../route_calculator.py} | 51 +-- crawler/services/task_service.py | 46 ++- crawler/start.sh | 196 +++++++--- crawler/tests/__init__.py | 1 + crawler/tests/conftest.py | 186 ++++++++++ crawler/tests/integration/__init__.py | 1 + crawler/tests/integration/test_api.py | 180 +++++++++ crawler/tests/test_listing_geojson.py | 299 +++++++++++++++ crawler/tests/unit/__init__.py | 1 + crawler/tests/unit/test_models.py | 343 ++++++++++++++++++ crawler/tests/unit/test_redis_lock.py | 74 ++++ crawler/tests/unit/test_repository.py | 227 ++++++++++++ crawler/tests/unit/test_schedule_config.py | 293 +++++++++++++++ crawler/utils/__init__.py | 4 + crawler/utils/redis_lock.py | 50 +++ 62 files changed, 3729 insertions(+), 1024 deletions(-) create mode 100644 crawler/.claude/internet-mode-used_DO_NOT_REMOVE_MANUALLY_SECURITY_RISK create mode 100644 crawler/.claude/settings.local.json create mode 100644 crawler/.claude/skills/python-313-redis-generic-type/SKILL.md create mode 100644 crawler/.claude/skills/python-parentheses-comparison-bug/SKILL.md delete mode 100644 crawler/91_recalculate_floorplan.py delete mode 100644 crawler/9_recalculate_regex_squaremeter.py create mode 100644 crawler/alembic/versions/a1b2c3d4e5f6_add_streaming_indexes.py create mode 100644 crawler/docs/BACKEND.md create mode 100644 crawler/frontend/src/components/StatsBar.tsx create mode 100644 crawler/frontend/src/components/StreamingProgressBar.tsx create mode 100644 crawler/frontend/src/components/ui/accordion.tsx delete mode 100644 crawler/frontend/src/components/ui/alert.tsx delete mode 100644 crawler/frontend/src/components/ui/badge.tsx create mode 100644 
crawler/frontend/src/components/ui/checkbox.tsx create mode 100644 crawler/frontend/src/components/ui/slider.tsx create mode 100644 crawler/frontend/src/services/apiClient.ts create mode 100644 crawler/frontend/src/services/listingService.ts create mode 100644 crawler/frontend/src/utils/mapUtils.ts delete mode 100644 crawler/main_tmp.py delete mode 100644 crawler/proof_of_concept/image.py delete mode 100644 crawler/proof_of_concept/listings.py delete mode 100644 crawler/proof_of_concept/routing_distancematrix.py delete mode 100644 crawler/proof_of_concept/routing_routing.py delete mode 100644 crawler/proof_of_concept/single-query.py create mode 100644 crawler/rec/route_serializer.py create mode 100644 crawler/services/__init__.py create mode 100644 crawler/services/district_service.py create mode 100644 crawler/services/export_service.py create mode 100644 crawler/services/floorplan_detector.py create mode 100644 crawler/services/image_fetcher.py create mode 100644 crawler/services/listing_service.py rename crawler/{5_routing.py => services/route_calculator.py} (54%) create mode 100644 crawler/tests/__init__.py create mode 100644 crawler/tests/conftest.py create mode 100644 crawler/tests/integration/__init__.py create mode 100644 crawler/tests/integration/test_api.py create mode 100644 crawler/tests/test_listing_geojson.py create mode 100644 crawler/tests/unit/__init__.py create mode 100644 crawler/tests/unit/test_models.py create mode 100644 crawler/tests/unit/test_redis_lock.py create mode 100644 crawler/tests/unit/test_repository.py create mode 100644 crawler/tests/unit/test_schedule_config.py create mode 100644 crawler/utils/__init__.py create mode 100644 crawler/utils/redis_lock.py diff --git a/crawler/.claude/internet-mode-used_DO_NOT_REMOVE_MANUALLY_SECURITY_RISK b/crawler/.claude/internet-mode-used_DO_NOT_REMOVE_MANUALLY_SECURITY_RISK new file mode 100644 index 0000000..f61efc8 --- /dev/null +++ 
b/crawler/.claude/internet-mode-used_DO_NOT_REMOVE_MANUALLY_SECURITY_RISK @@ -0,0 +1,3 @@ +This directory has been used with Claude Code's internet mode. +Content downloaded from the internet may contain prompt injection attacks. +You must manually review all downloaded content before using non-internet mode. diff --git a/crawler/.claude/settings.local.json b/crawler/.claude/settings.local.json new file mode 100644 index 0000000..04ae6a1 --- /dev/null +++ b/crawler/.claude/settings.local.json @@ -0,0 +1,124 @@ +{ + "permissions": { + "allow": [ + "Bash(grep:*)", + "Bash(python:*)", + "Bash(docker ps:*)", + "Bash(podman ps:*)", + "Bash(curl:*)", + "Bash(nc:*)", + "Bash(poetry --version:*)", + "Bash(docker context:*)", + "Bash(open:*)", + "Bash(chmod:*)", + "Bash(/System/Volumes/Data/mnt/wizard/code/realestate-crawler/crawler/.claude/tools/remote-exec.sh:*)", + "Bash(export DOCKER_HOST=unix:///Users/viktorbarzin/.docker/run/docker.sock)", + "Bash(docker compose:*)", + "Bash(export DOCKER_BUILDKIT=1)", + "Bash(export COMPOSE_DOCKER_CLI_BUILD=1)", + "Bash(tar:*)", + "Bash(docker build:*)", + "Bash(docker tag:*)", + "Bash(docker run:*)", + "Bash(~/.claude/remote-exec.sh \"hostname\")", + "Skill(remote)", + "Bash(for i in {1..120})", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769814743512676000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769814743512676000.txt)", + "Bash(exit 0)", + "Bash(fi)", + "Bash(done)", + "Bash(for i in {1..240})", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769814856118018000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769814856118018000.txt)", + "Bash(for i in {1..60})", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769814883284199000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769814883284199000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769815004122069000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769815004122069000.txt)", + "Bash(for i in {1..90})", + "Bash(do if grep 
-q \"EXIT_CODE\" ~/.claude/remote-results/cmd-1769814856118018000.txt)", + "Bash(then echo \"=== Build completed ===\")", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769815497591226000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769815497591226000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769815530803509000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769815530803509000.txt)", + "Bash(do if grep -q \"EXIT_CODE\" ~/.claude/remote-results/cmd-1769815530803509000.txt)", + "Bash(for i in {1..30})", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769815614622428000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769815614622428000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769815710424010000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769815710424010000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769815892793650000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769815892793650000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769816040589015000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769816040589015000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769816256870361000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769816256870361000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769816300264785000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769816300264785000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769816375772556000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769816375772556000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769816407482202000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769816407482202000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769816439320016000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769816439320016000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769816532941427000.txt ])", + "Bash(then cat 
~/.claude/remote-results/cmd-1769816532941427000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769816611986724000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769816611986724000.txt)", + "Bash(for i in {1..40})", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769816682085291000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769816682085291000.txt)", + "Bash(for i in {1..20})", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769816742848870000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769816742848870000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769816763327960000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769816763327960000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769816784934447000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769816784934447000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769816872796427000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769816872796427000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769816892104231000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769816892104231000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769816911037685000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769816911037685000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769816946320457000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769816946320457000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769816987766946000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769816987766946000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769817008932477000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769817008932477000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769817027145242000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769817027145242000.txt)", + "Bash(for file in 
/mnt/wizard/code/realestate-crawler/crawler/frontend/src/components/ui/*.tsx)", + "Bash(do)", + "Bash(basename:*)", + "Bash(wc:*)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769819894031906000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769819894031906000.txt)", + "Bash(do if [ -f ~/.claude/remote-results/cmd-1769854789336791000.txt ])", + "Bash(then cat ~/.claude/remote-results/cmd-1769854789336791000.txt)", + "Bash(npx tsc:*)", + "Bash(npx eslint:*)", + "Bash(find:*)", + "Bash(sync)", + "Bash(echo:*)", + "Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875304344407000.txt ])", + "Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875304344407000.txt)", + "Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875708563896000.txt ])", + "Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875708563896000.txt)", + "Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875753067606000.txt ])", + "Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875753067606000.txt)", + "Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875830424071000.txt ])", + "Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875830424071000.txt)", + "Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875948670335000.txt ])", + "Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875948670335000.txt)", + "Bash(sort:*)", + "Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769876096467703000.txt ])", + "Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769876096467703000.txt)", + "Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769876529766339000.txt ])", + "Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769876529766339000.txt)" + ] + } +} diff --git a/crawler/.claude/skills/python-313-redis-generic-type/SKILL.md b/crawler/.claude/skills/python-313-redis-generic-type/SKILL.md 
new file mode 100644 index 0000000..07deb13 --- /dev/null +++ b/crawler/.claude/skills/python-313-redis-generic-type/SKILL.md @@ -0,0 +1,101 @@ +--- +name: python-313-redis-generic-type +description: | + Fix for "TypeError: is not a generic class" when using + redis-py with Python 3.13. Use when: (1) upgrading to Python 3.13 breaks redis type + annotations, (2) mypy passes but runtime fails with generic class error, (3) using + redis.Redis[str] or similar parameterized types. Covers redis-py generic type + compatibility with Python 3.13's stricter runtime generic checking. +author: Claude Code +version: 1.0.0 +date: 2026-01-31 +--- + +# Python 3.13 redis.Redis Generic Type Error + +## Problem +Python 3.13 introduced stricter runtime checking for generic types. The redis-py library's +`Redis` class is not defined as a generic class at runtime, even though it works with type +checkers like mypy. This causes a `TypeError` when you use parameterized types like +`redis.Redis[str]` in type annotations that are evaluated at runtime. + +## Context / Trigger Conditions +- Python 3.13 or later +- Using redis-py library +- Type annotation like `redis_client: redis.Redis[str]` +- Error message: `TypeError: is not a generic class` +- Works fine with mypy but fails at runtime +- Often appears when instantiating a class with this annotation + +## Solution + +### Option 1: Remove the type parameter (Recommended) +```python +# Before (breaks in Python 3.13) +redis_client: redis.Redis[str] + +# After (works in all Python versions) +redis_client: redis.Redis # type: ignore[type-arg] +``` + +The `# type: ignore[type-arg]` comment silences mypy's warning about missing type arguments. 
+ +### Option 2: Use string annotation (deferred evaluation) +```python +from __future__ import annotations + +redis_client: "redis.Redis[str]" # String annotation, not evaluated at runtime +``` + +### Option 3: Use TYPE_CHECKING guard +```python +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + RedisClient = redis.Redis[str] +else: + RedisClient = redis.Redis + +redis_client: RedisClient +``` + +## Verification +1. Run your application with Python 3.13 +2. The TypeError should no longer appear +3. Run mypy to ensure type checking still works (may need type: ignore comment) + +## Example + +### Before (Broken) +```python +import redis + +class RedisRepository: + redis_client: redis.Redis[str] # TypeError at runtime in Python 3.13 + + def __init__(self): + self.redis_client = redis.Redis(host='localhost', decode_responses=True) +``` + +### After (Fixed) +```python +import redis + +class RedisRepository: + redis_client: redis.Redis # type: ignore[type-arg] + + def __init__(self): + self.redis_client = redis.Redis(host='localhost', decode_responses=True) +``` + +## Notes +- This is a breaking change in Python 3.13's handling of generic types +- The redis-py library may add proper generic support in future versions +- If using `decode_responses=True`, the client returns `str`; otherwise `bytes` +- The `type: ignore` comment is preferable to `Any` as it preserves some type safety +- This issue affects other libraries that aren't properly defined as Generic classes + +## References +- [Python 3.13 Release Notes](https://docs.python.org/3.13/whatsnew/3.13.html) +- [redis-py GitHub Issues](https://github.com/redis/redis-py/issues) +- [PEP 585 - Type Hinting Generics In Standard Collections](https://peps.python.org/pep-0585/) diff --git a/crawler/.claude/skills/python-parentheses-comparison-bug/SKILL.md b/crawler/.claude/skills/python-parentheses-comparison-bug/SKILL.md new file mode 100644 index 0000000..eae8078 --- /dev/null +++ 
b/crawler/.claude/skills/python-parentheses-comparison-bug/SKILL.md @@ -0,0 +1,132 @@ +--- +name: python-parentheses-comparison-bug +description: | + Debug Python comparison bug where parentheses around a variable cause unexpected behavior. + Use when: (1) condition always evaluates to False/True unexpectedly, (2) code like + "if (mylist) == 0" never triggers, (3) length check seems to not work, (4) comparison + with list/dict returns unexpected results. Common mistake where parentheses cause the + variable itself to be compared instead of its length. +author: Claude Code +version: 1.0.0 +date: 2026-01-31 +--- + +# Python Parentheses Comparison Bug + +## Problem +A subtle Python bug where unnecessary parentheses around a variable in a comparison +cause the wrong value to be compared. The expression `(mylist) == 0` compares the list +itself to 0, not its length. Since a list is never equal to an integer, this always +returns False. + +## Context / Trigger Conditions +- Condition that should sometimes be True is always False (or vice versa) +- Code pattern like `if (existing_items) == 0:` or `if (result) == expected:` +- The parentheses don't cause a syntax error but change semantics +- Often appears when copying/adapting code or during refactoring +- May pass code review because it "looks" correct + +## Solution + +### Identify the Bug Pattern +```python +# BUG: Compares list to 0, always False +if (existing_listings) == 0: + return True + +# Also wrong: compares list to integer +if (items) == 5: + do_something() +``` + +### Fix: Use len() for Length Comparisons +```python +# CORRECT: Compares length to 0 +if len(existing_listings) == 0: + return True + +# Alternative: Use truthiness for empty check +if not existing_listings: + return True + +# CORRECT: Compares length to integer +if len(items) == 5: + do_something() +``` + +## Verification +1. Add a debug print before the condition: `print(f"list={existing_listings}, len={len(existing_listings)}")` +2. 
Verify the condition now evaluates correctly +3. Write a unit test that exercises both branches of the condition + +## Example + +### Before (Broken) +```python +class FetchListingDetailsStep: + async def needs_processing(self, listing_id: int) -> bool: + existing_listings = await self.listing_repository.get_listings( + only_ids=[listing_id] + ) + # BUG: This compares the list object to 0, which is always False + # The parentheses around existing_listings are misleading + if (existing_listings) == 0: + return True + return False +``` + +### After (Fixed) +```python +class FetchListingDetailsStep: + async def needs_processing(self, listing_id: int) -> bool: + existing_listings = await self.listing_repository.get_listings( + only_ids=[listing_id] + ) + # CORRECT: Check if list is empty using len() + if len(existing_listings) == 0: + return True + return False +``` + +### Even Better (Pythonic) +```python +class FetchListingDetailsStep: + async def needs_processing(self, listing_id: int) -> bool: + existing_listings = await self.listing_repository.get_listings( + only_ids=[listing_id] + ) + # Most Pythonic: Use truthiness + return not existing_listings +``` + +## Notes +- Python's truthiness: empty collections are falsy, non-empty are truthy +- This bug is particularly insidious because: + - It's syntactically valid + - It doesn't raise an exception + - The parentheses make it look intentional + - Code review may miss it +- Linters like pylint or flake8 won't catch this specific pattern +- Type checkers like mypy may warn about comparing incompatible types +- When debugging, add print statements to verify actual vs expected values + +## Prevention +- Prefer `if not mylist:` over `if len(mylist) == 0:` +- Prefer `if mylist:` over `if len(mylist) > 0:` +- Remove unnecessary parentheses around single variables +- Enable mypy's strict mode which may catch type comparison issues +- Write unit tests that exercise both branches of conditions + +## Related Patterns +```python 
+# These are all wrong (comparing object to number): +if (mydict) == 0: # Always False +if (mylist) > 0: # TypeError in Python 3 +if (mystring) == 0: # Always False + +# These are correct: +if len(mydict) == 0: # True if empty +if not mydict: # True if empty (preferred) +if len(mylist) > 0: # True if non-empty +if mylist: # True if non-empty (preferred) +``` diff --git a/crawler/91_recalculate_floorplan.py b/crawler/91_recalculate_floorplan.py deleted file mode 100644 index 28ad776..0000000 --- a/crawler/91_recalculate_floorplan.py +++ /dev/null @@ -1,13 +0,0 @@ -from data_access import Listing -from tqdm import tqdm - -listings = Listing.get_all_listings() -recalculate_listings = [] - -for listing in listings: - sqm = listing.sqm_ocr - if sqm is None or sqm < 10 or sqm > 200: - recalculate_listings.append(listing) - -for listing in tqdm(recalculate_listings): - listing.calculate_sqm_ocr(recalculate=True) diff --git a/crawler/9_recalculate_regex_squaremeter.py b/crawler/9_recalculate_regex_squaremeter.py deleted file mode 100644 index 70d1c3b..0000000 --- a/crawler/9_recalculate_regex_squaremeter.py +++ /dev/null @@ -1,15 +0,0 @@ -# recalculate regex from sqm from already previously ocr'ed text -import json -from rec.floorplan import extract_total_sqm -from tqdm import tqdm -from data_access import Listing - -for listing in tqdm(list(Listing.get_all_listings())): - with open(listing.path_floorplan_ocr_json()) as f: - floorplans = json.load(f) - - for floorplan in floorplans: - floorplan["estimated_sqm"] = extract_total_sqm(floorplan["text"]) - - with open(listing.path_floorplan_ocr_json(), "w") as f: - floorplans = json.dump(floorplans, f) diff --git a/crawler/Dockerfile b/crawler/Dockerfile index 44fb763..47d6f84 100644 --- a/crawler/Dockerfile +++ b/crawler/Dockerfile @@ -41,6 +41,7 @@ EXPOSE 5001 # Set the entry point (adjust to your CLI's entry point) # ENTRYPOINT ["python", "/app/main.py"] # ENTRYPOINT ["/app/runall.sh"] -# CMD ["/bin/bash" ,"-c" ,"alembic 
upgrade head && uvicorn api.app:app --host 0.0.0.0 --port 8000"] -# ENTRYPOINT ["uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "8000"] -CMD ["./start.sh"] +# For local dev with docker-compose: +# CMD ["./start.sh"] +# For Kubernetes deployment: +CMD ["uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "5001"] diff --git a/crawler/alembic/versions/a1b2c3d4e5f6_add_streaming_indexes.py b/crawler/alembic/versions/a1b2c3d4e5f6_add_streaming_indexes.py new file mode 100644 index 0000000..7f8561d --- /dev/null +++ b/crawler/alembic/versions/a1b2c3d4e5f6_add_streaming_indexes.py @@ -0,0 +1,56 @@ +"""add streaming indexes for query optimization + +Revision ID: a1b2c3d4e5f6 +Revises: e5f1bc4e3323 +Create Date: 2026-02-01 12:00:00.000000 + +""" +from typing import Sequence, Union + +from alembic import op + + +# revision identifiers, used by Alembic. +revision: str = 'a1b2c3d4e5f6' +down_revision: Union[str, None] = 'e5f1bc4e3323' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Add composite and single-column indexes for streaming query optimization.""" + # Composite index for main query pattern (bedrooms, price, last_seen filtering) + op.create_index( + 'ix_rentlisting_query_composite', + 'rentlisting', + ['number_of_bedrooms', 'price', 'last_seen'], + unique=False + ) + op.create_index( + 'ix_buylisting_query_composite', + 'buylisting', + ['number_of_bedrooms', 'price', 'last_seen'], + unique=False + ) + + # Missing single-column indexes for frequently filtered columns + op.create_index( + 'ix_rentlisting_furnish_type', + 'rentlisting', + ['furnish_type'], + unique=False + ) + op.create_index( + 'ix_rentlisting_available_from', + 'rentlisting', + ['available_from'], + unique=False + ) + + +def downgrade() -> None: + """Remove streaming indexes.""" + op.drop_index('ix_rentlisting_available_from', table_name='rentlisting') + op.drop_index('ix_rentlisting_furnish_type', 
table_name='rentlisting') + op.drop_index('ix_buylisting_query_composite', table_name='buylisting') + op.drop_index('ix_rentlisting_query_composite', table_name='rentlisting') diff --git a/crawler/alembic/versions/e5f1bc4e3323_fix_typo_in_logitude_column.py b/crawler/alembic/versions/e5f1bc4e3323_fix_typo_in_logitude_column.py index 190b5f8..f79c396 100644 --- a/crawler/alembic/versions/e5f1bc4e3323_fix_typo_in_logitude_column.py +++ b/crawler/alembic/versions/e5f1bc4e3323_fix_typo_in_logitude_column.py @@ -19,88 +19,12 @@ depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: - """Upgrade schema.""" - # ### commands auto generated by Alembic - please adjust! ### - op.drop_index(op.f('ix_user_email'), table_name='user') - op.drop_table('user') - op.drop_index(op.f('ix_rentlisting_last_seen'), table_name='rentlisting') - op.drop_index(op.f('ix_rentlisting_number_of_bedrooms'), table_name='rentlisting') - op.drop_index(op.f('ix_rentlisting_price'), table_name='rentlisting') - op.drop_index(op.f('ix_rentlisting_square_meters'), table_name='rentlisting') - op.drop_table('rentlisting') - op.drop_index(op.f('ix_buylisting_last_seen'), table_name='buylisting') - op.drop_index(op.f('ix_buylisting_number_of_bedrooms'), table_name='buylisting') - op.drop_index(op.f('ix_buylisting_price'), table_name='buylisting') - op.drop_index(op.f('ix_buylisting_square_meters'), table_name='buylisting') - op.drop_table('buylisting') - # ### end Alembic commands ### + """Upgrade schema - this migration is now a no-op since tables already have correct column name.""" + # The tables were created with 'longitude' (correct spelling) in the initial migration. + # This migration was incorrectly auto-generated and has been fixed to be a no-op. + pass def downgrade() -> None: - """Downgrade schema.""" - # ### commands auto generated by Alembic - please adjust! 
### - op.create_table('buylisting', - sa.Column('id', mysql.INTEGER(), autoincrement=True, nullable=False), - sa.Column('price', mysql.FLOAT(), nullable=False), - sa.Column('number_of_bedrooms', mysql.INTEGER(), autoincrement=False, nullable=False), - sa.Column('square_meters', mysql.FLOAT(), nullable=True), - sa.Column('agency', mysql.VARCHAR(length=255), nullable=True), - sa.Column('council_tax_band', mysql.VARCHAR(length=255), nullable=True), - sa.Column('longtitude', mysql.FLOAT(), nullable=False), - sa.Column('latitude', mysql.FLOAT(), nullable=False), - sa.Column('price_history_json', mysql.TEXT(), nullable=False), - sa.Column('listing_site', mysql.ENUM('RIGHTMOVE'), nullable=False), - sa.Column('last_seen', mysql.DATETIME(), nullable=False), - sa.Column('photo_thumbnail', mysql.VARCHAR(length=255), nullable=True), - sa.Column('floorplan_image_paths', mysql.JSON(), nullable=False), - sa.Column('additional_info', mysql.JSON(), nullable=False), - sa.Column('routing_info_json', mysql.TEXT(), nullable=True), - sa.Column('service_charge', mysql.FLOAT(), nullable=True), - sa.Column('lease_left', mysql.INTEGER(), autoincrement=False, nullable=True), - sa.PrimaryKeyConstraint('id'), - mysql_collate='utf8mb4_0900_ai_ci', - mysql_default_charset='utf8mb4', - mysql_engine='InnoDB' - ) - op.create_index(op.f('ix_buylisting_square_meters'), 'buylisting', ['square_meters'], unique=False) - op.create_index(op.f('ix_buylisting_price'), 'buylisting', ['price'], unique=False) - op.create_index(op.f('ix_buylisting_number_of_bedrooms'), 'buylisting', ['number_of_bedrooms'], unique=False) - op.create_index(op.f('ix_buylisting_last_seen'), 'buylisting', ['last_seen'], unique=False) - op.create_table('rentlisting', - sa.Column('id', mysql.INTEGER(), autoincrement=True, nullable=False), - sa.Column('price', mysql.FLOAT(), nullable=False), - sa.Column('number_of_bedrooms', mysql.INTEGER(), autoincrement=False, nullable=False), - sa.Column('square_meters', mysql.FLOAT(), 
nullable=True), - sa.Column('agency', mysql.VARCHAR(length=255), nullable=True), - sa.Column('council_tax_band', mysql.VARCHAR(length=255), nullable=True), - sa.Column('longtitude', mysql.FLOAT(), nullable=False), - sa.Column('latitude', mysql.FLOAT(), nullable=False), - sa.Column('price_history_json', mysql.TEXT(), nullable=False), - sa.Column('listing_site', mysql.ENUM('RIGHTMOVE'), nullable=False), - sa.Column('last_seen', mysql.DATETIME(), nullable=False), - sa.Column('photo_thumbnail', mysql.VARCHAR(length=255), nullable=True), - sa.Column('floorplan_image_paths', mysql.JSON(), nullable=False), - sa.Column('additional_info', mysql.JSON(), nullable=False), - sa.Column('routing_info_json', mysql.TEXT(), nullable=True), - sa.Column('available_from', mysql.DATETIME(), nullable=True), - sa.Column('furnish_type', mysql.ENUM('FURNISHED', 'UNFURNISHED', 'PART_FURNISHED', 'ASK_LANDLORD', 'UNKNOWN'), nullable=False), - sa.PrimaryKeyConstraint('id'), - mysql_collate='utf8mb4_0900_ai_ci', - mysql_default_charset='utf8mb4', - mysql_engine='InnoDB' - ) - op.create_index(op.f('ix_rentlisting_square_meters'), 'rentlisting', ['square_meters'], unique=False) - op.create_index(op.f('ix_rentlisting_price'), 'rentlisting', ['price'], unique=False) - op.create_index(op.f('ix_rentlisting_number_of_bedrooms'), 'rentlisting', ['number_of_bedrooms'], unique=False) - op.create_index(op.f('ix_rentlisting_last_seen'), 'rentlisting', ['last_seen'], unique=False) - op.create_table('user', - sa.Column('id', mysql.INTEGER(), autoincrement=True, nullable=False), - sa.Column('email', mysql.VARCHAR(length=255), nullable=False), - sa.Column('password', mysql.VARCHAR(length=255), nullable=False), - sa.PrimaryKeyConstraint('id'), - mysql_collate='utf8mb4_0900_ai_ci', - mysql_default_charset='utf8mb4', - mysql_engine='InnoDB' - ) - op.create_index(op.f('ix_user_email'), 'user', ['email'], unique=True) - # ### end Alembic commands ### + """Downgrade schema - no-op since upgrade is no-op.""" + pass 
diff --git a/crawler/csv_exporter.py b/crawler/csv_exporter.py index 838c0a9..9bd286c 100644 --- a/crawler/csv_exporter.py +++ b/crawler/csv_exporter.py @@ -1,6 +1,6 @@ from pathlib import Path import pandas as pd -from rec.query import QueryParameters +from models.listing import QueryParameters from repositories.listing_repository import ListingRepository @@ -10,7 +10,7 @@ async def export_to_csv( query_parameters: QueryParameters | None = None, ) -> None: listings = await repository.get_listings(query_parameters=query_parameters) - ds = [*[listing.__dict__ for listing in listings]] + ds = [listing.__dict__ for listing in listings] df = pd.DataFrame(ds) # read decisions on file @@ -22,37 +22,19 @@ async def export_to_csv( drop_columns = ["_sa_instance_state", "additional_info"] df = df.drop(columns=drop_columns) - # remove all entries where we didnt calculate transit time (probably due to a too far distance) - # df2 = df[df.travel_time_fastest.notna()] - df2 = df + # fill in gap values for service charge and lease left for Excel filters + if "service_charge" not in df.columns: + df.loc[:, "service_charge"] = -1 + df.loc[:, "service_charge"] = df.service_charge.fillna(-1) + if "lease_left" not in df.columns: + df.loc[:, "lease_left"] = -1 + df.loc[:, "lease_left"] = df.lease_left.fillna(-1) + if "square_meters" not in df.columns: + df.loc[:, "square_meters"] = -1 + df.loc[:, "square_meters"] = df.square_meters.fillna(-1) - # drop columns - # dropcolumns = ['distance_per_transit', 'duration_static', 'distance'] - # s1 = df2['travel_time_fastest'].apply(pd.Series).drop(dropcolumns, axis=1) - # s1 = df2 + # Add price per sqm column + df.loc[:, "price_per_sqm"] = df.price / df.square_meters - # fill in gap values for service charge and lease left. 
This is for excel so we can use filters better there - if "service_charge" not in df2.columns: - df2.loc[:, "service_charge"] = -1 - df2.loc[:, "service_charge"] = df2.service_charge.fillna(-1) - if "lease_left" not in df2.columns: - df2.loc[:, "lease_left"] = -1 - df2.loc[:, "lease_left"] = df2.lease_left.fillna(-1) - if "square_meters" not in df2.columns: - df2.loc[:, "square_meters"] = -1 - df2.loc[:, "square_meters"] = df2.square_meters.fillna(-1) - - df3 = df2 - # df3 = pd.concat([df2.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1) - # df3.loc[:, 'duration'] = (df3.loc[:, ['duration']].min(axis=1) / 60).round() - df3.shape - df4 = df3 - - # df5 = df4[columns] - - # Add some interesting columns - df4.loc[:, "price_per_sqm"] = df4.price / df4.square_meters - df5 = df4 - - df6 = df5.sort_values(by=["price_per_sqm"], ascending=True) - df6.to_csv(str(output_file), index=False) + df = df.sort_values(by=["price_per_sqm"], ascending=True) + df.to_csv(str(output_file), index=False) diff --git a/crawler/data_access.py b/crawler/data_access.py index adcbc51..32dd215 100644 --- a/crawler/data_access.py +++ b/crawler/data_access.py @@ -4,6 +4,7 @@ from dataclasses import dataclass import json import pathlib from typing import Any, List +import warnings from models.listing import ListingSite, PriceHistoryItem from rec import floorplan, routing import re @@ -12,6 +13,12 @@ import datetime @dataclass() class Listing: + """Legacy Listing class for filesystem-based data access. + + .. deprecated:: + Use models.listing.RentListing or models.listing.BuyListing instead. + This class is kept for backwards compatibility with the populate_db command. + """ identifier: int _details_object: dict[str, Any] | None = None _listing_object: dict[str, Any] | None = None @@ -36,6 +43,14 @@ class Listing: "council_tax_band", ] + def __post_init__(self) -> None: + warnings.warn( + "data_access.Listing is deprecated. 
Use models.listing.RentListing " + "or models.listing.BuyListing instead.", + DeprecationWarning, + stacklevel=3, + ) + @staticmethod def get_all_listings( listing_paths: list[pathlib.Path], @@ -144,39 +159,6 @@ class Listing: # todo add check if return is image return images - def calculate_sqm_model(self): - objs = [] - for floorplan_path in self.list_floorplans(): - estimated_sqm, model_output, predictions = floorplan.calculate_model( - floorplan_path - ) - objs.append( - { - "floorplan_path": str(floorplan_path), - "estimated_sqm": estimated_sqm, - "model_output": model_output, - "no_predictions": len( - predictions - ), # cant serialize the predictions itself since its a tensor - } - ) - - with open(self.path_floorplan_model_json(), "w") as f: - json.dump(objs, f) - - @property - def sqm_model(self, recalculate=True) -> float: - if not self.path_floorplan_model_json().exists() or recalculate: - self.calculate_sqm_model() - - with open(self.path_floorplan_json()) as f: - objs = json.load(f) - - max_sqm = max( - [o["estimated_sqm"] for o in objs if o is None] - ) # filter out Nones - return max_sqm - async def calculate_sqm_ocr(self, recalculate=True): objs = [] if self.path_floorplan_ocr_json().exists(): @@ -405,63 +387,6 @@ class Listing: def listing_site(self) -> ListingSite: return ListingSite.RIGHTMOVE # this class supports only right move - async def dict_nicely(self): - travel_time_fastest = {} - travel_time_second = {} - if self.path_routing_json().exists(): - with open(self.path_routing_json(), "r") as f: - travel_times = json.load(f) - for destination_mode in travel_times.keys(): - destination_mode_clean = destination_mode.replace(" ", "_").replace( - ",", "_" - ) - destination, travel_mode = self.__from_routing_cache_key( - destination_mode - ) - travel_time_fastest[destination_mode_clean] = self.travel_time( - destination, travel_mode - )[0]["duration"] - travel_time_second[destination_mode_clean] = self.travel_time( - destination, travel_mode - 
)[1]["duration"] - - return { - "identifier": self.identifier, - "sqm_ocr": await self.sqm_ocr(), - "price": self.price, - "price_per_sqm": await self.price_per_sqm(), - "url": self.url, - "bedrooms": self.bedrooms, - "travel_time_fastest": ":".join( - sorted( - f"{dest} in {travel_mode//60}min" - for dest, travel_mode in travel_time_fastest.items() - ) - ), - "travel_time_second": ":".join( - sorted( - f"{dest} in {travel_mode//60}min" - for dest, travel_mode in travel_time_second.items() - ) - ), - "lease_left": self.leaseLeft, - "service_charge": self.serviceCharge, - "development": self.development, - "tenure_type": self.tenure_type, - "updated_days": self.updateDaysAgo, - "status": self.status, - "last_seen": self.last_seen, - "agency": self.agency, - "council_tax_band": self.councilTaxBand, - "photo_thumbnail": self.photoThumbnail, - "let_date_available": ( - self.letDateAvailable.strftime("%d/%m/%Y") - if self.letDateAvailable - else "Ask agent" - ), - "price_history": self.priceHistory, - } - def __routing_cache_key( self, dest_address: str, diff --git a/crawler/docker-compose.yml b/crawler/docker-compose.yml index b071423..5756abe 100644 --- a/crawler/docker-compose.yml +++ b/crawler/docker-compose.yml @@ -14,10 +14,13 @@ services: interval: 5s timeout: 3s retries: 5 + networks: + - rec-network mysql: image: mysql:9 container_name: rec-mysql + hostname: mysql ports: - "3306:3306" environment: @@ -32,6 +35,9 @@ services: interval: 10s timeout: 5s retries: 5 + start_period: 30s + networks: + - rec-network app: build: @@ -47,7 +53,7 @@ services: - app_venv:/app/.venv environment: - ENV=dev - - DB_CONNECTION_STRING=mysql://wrongmove:wrongmove@mysql:3306/wrongmove + - DB_CONNECTION_STRING=mysql+mysqldb://wrongmove:wrongmove@mysql:3306/wrongmove - CELERY_BROKER_URL=redis://redis:6379/0 - CELERY_RESULT_BACKEND=redis://redis:6379/0 - ROUTING_API_KEY=${ROUTING_API_KEY:-} @@ -57,6 +63,8 @@ services: mysql: condition: service_healthy command: ["uvicorn", 
"api.app:app", "--host", "0.0.0.0", "--port", "5001", "--reload", "--reload-dir", "api", "--reload-dir", "services", "--reload-dir", "repositories", "--reload-dir", "models"] + networks: + - rec-network celery: build: @@ -68,7 +76,7 @@ services: - app_venv:/app/.venv environment: - ENV=dev - - DB_CONNECTION_STRING=mysql://wrongmove:wrongmove@mysql:3306/wrongmove + - DB_CONNECTION_STRING=mysql+mysqldb://wrongmove:wrongmove@mysql:3306/wrongmove - CELERY_BROKER_URL=redis://redis:6379/0 - CELERY_RESULT_BACKEND=redis://redis:6379/0 - ROUTING_API_KEY=${ROUTING_API_KEY:-} @@ -79,6 +87,8 @@ services: mysql: condition: service_healthy command: ["celery", "-A", "celery_app", "worker", "--loglevel=info"] + networks: + - rec-network celery-beat: build: @@ -90,7 +100,7 @@ services: - app_venv:/app/.venv environment: - ENV=dev - - DB_CONNECTION_STRING=mysql://wrongmove:wrongmove@mysql:3306/wrongmove + - DB_CONNECTION_STRING=mysql+mysqldb://wrongmove:wrongmove@mysql:3306/wrongmove - CELERY_BROKER_URL=redis://redis:6379/0 - CELERY_RESULT_BACKEND=redis://redis:6379/0 - SCRAPE_SCHEDULES=${SCRAPE_SCHEDULES:-} @@ -98,6 +108,12 @@ services: - redis - celery command: ["celery", "-A", "celery_app", "beat", "--loglevel=info"] + networks: + - rec-network + +networks: + rec-network: + driver: bridge volumes: redis_data: diff --git a/crawler/docs/BACKEND.md b/crawler/docs/BACKEND.md new file mode 100644 index 0000000..e2e1f88 --- /dev/null +++ b/crawler/docs/BACKEND.md @@ -0,0 +1,183 @@ +# Real Estate Crawler - Backend Documentation + +A property listing aggregator that scrapes Rightmove UK, extracts square meters via OCR, and calculates transit routes. 
+ +## Quick Start + +```bash +# Docker (recommended) - starts Redis, MySQL, API, and Celery +./start.sh + +# Or run locally with Poetry +poetry install +./start.sh --local +``` + +API available at `http://localhost:5001` + +## Dependencies + +| Dependency | Purpose | +|------------|---------| +| Python 3.11+ | Runtime | +| Redis | Celery message broker | +| MySQL/SQLite | Database | +| Tesseract OCR | Floorplan text extraction | +| Docker | Containerized deployment | + +### Python Packages (key) +- `fastapi` + `uvicorn` - HTTP API +- `celery` - Background tasks +- `sqlmodel` - ORM +- `pytesseract` + `opencv` - OCR +- `aiohttp` - Async HTTP client + +## API Endpoints + +### Health Check +```bash +curl http://localhost:5001/api/status +# {"status": "OK"} +``` + +### Get Listings +```bash +curl -H "Authorization: Bearer $TOKEN" \ + "http://localhost:5001/api/listing?limit=10" +``` + +### Get Listings as GeoJSON +```bash +curl -H "Authorization: Bearer $TOKEN" \ + "http://localhost:5001/api/listing_geojson?listing_type=RENT&min_bedrooms=2&max_price=3000" +``` + +### Refresh Listings (async) +```bash +curl -X POST -H "Authorization: Bearer $TOKEN" \ + "http://localhost:5001/api/refresh_listings?listing_type=RENT&min_bedrooms=2&max_bedrooms=3&min_price=2000&max_price=4000" +# {"task_id": "abc123", "message": "Task abc123 started"} +``` + +### Check Task Status +```bash +curl -H "Authorization: Bearer $TOKEN" \ + "http://localhost:5001/api/task_status?task_id=abc123" +# {"task_id": "abc123", "status": "SUCCESS", "result": "..."} +``` + +### Get Districts +```bash +curl -H "Authorization: Bearer $TOKEN" \ + "http://localhost:5001/api/get_districts" +# {"Westminster": "REGION^93965", "Camden": "REGION^93934", ...} +``` + +## CLI Commands + +```bash +# Fetch listings from Rightmove +python main.py dump-listings -t rent --min-bedrooms 2 --max-price 4000 + +# Download floorplan images +python main.py dump-images + +# Run OCR on floorplans +python main.py detect-floorplan + +# 
Calculate transit routes +python main.py routing -d "10 Downing Street, London" -m TRANSIT -l 10 + +# Export to GeoJSON +python main.py export-immoweb -O output.geojson -t rent --min-bedrooms 2 + +# Export to CSV +python main.py export-csv -O output.csv -t rent + +# List available districts +python main.py list-districts +``` + +## Query Parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `listing_type` | RENT/BUY | Property type | +| `min_bedrooms` | int | Minimum bedrooms | +| `max_bedrooms` | int | Maximum bedrooms | +| `min_price` | int | Minimum price | +| `max_price` | int | Maximum price | +| `min_sqm` | int | Minimum square meters | +| `district` | string | District name (repeatable) | +| `furnish_types` | string | FURNISHED/UNFURNISHED/PART_FURNISHED | +| `last_seen_days` | int | Only listings seen in last N days | + +## Architecture + +``` +┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +│ CLI │ │ HTTP API │ │ Celery │ +│ (main.py) │ │ (api/app.py)│ │ Worker │ +└──────┬──────┘ └──────┬──────┘ └──────┬──────┘ + │ │ │ + └───────────────────┼───────────────────┘ + │ + ┌────────▼────────┐ + │ Services │ + │ (services/*.py) │ + └────────┬────────┘ + │ + ┌────────────┼────────────┐ + │ │ │ + ┌──────▼──────┐ ┌───▼───┐ ┌──────▼──────┐ + │ Repository │ │ Redis │ │ Rightmove │ + │ (MySQL) │ │ │ │ API │ + └─────────────┘ └───────┘ └─────────────┘ +``` + +## Environment Variables + +```bash +# Database +DB_CONNECTION_STRING=mysql://user:pass@localhost:3306/wrongmove + +# Redis (Celery) +CELERY_BROKER_URL=redis://localhost:6379/0 +CELERY_RESULT_BACKEND=redis://localhost:6379/0 + +# Google Maps (optional, for routing) +ROUTING_API_KEY=your_api_key +``` + +## Authentication + +API endpoints (except `/api/status`) require JWT authentication via Authentik OIDC. 
+ +```bash +# Get token from Authentik, then: +curl -H "Authorization: Bearer $TOKEN" http://localhost:5001/api/listing +``` + +## Project Structure + +``` +├── main.py # CLI entry point +├── api/app.py # FastAPI application +├── services/ # Business logic (shared by CLI + API) +│ ├── listing_service.py +│ ├── export_service.py +│ ├── district_service.py +│ └── task_service.py +├── repositories/ # Database access +├── models/ # SQLModel entities +├── rec/ # Core logic (query, OCR, routing) +├── tasks/ # Celery background tasks +└── tests/ # Test suite +``` + +## Running Tests + +```bash +pytest tests/ -v --cov=. +mypy . +``` diff --git a/crawler/frontend/src/AppSidebar.tsx b/crawler/frontend/src/AppSidebar.tsx index 51f6861..03e172d 100644 --- a/crawler/frontend/src/AppSidebar.tsx +++ b/crawler/frontend/src/AppSidebar.tsx @@ -12,130 +12,47 @@ import { } from "@/components/ui/sidebar" import * as React from "react" -// This is sample data. const data = { navMain: [ { - title: "Getting Started", + title: "Property Explorer", url: "#", items: [ { - title: "Installation", - url: "#", - }, - { - title: "Project Structure", - url: "#", - }, - ], - }, - { - title: "Building Your Application", - url: "#", - items: [ - { - title: "Routing", - url: "#", - }, - { - title: "Data Fetching", + title: "Map View", url: "#", isActive: true, }, { - title: "Rendering", - url: "#", - }, - { - title: "Caching", - url: "#", - }, - { - title: "Styling", - url: "#", - }, - { - title: "Optimizing", - url: "#", - }, - { - title: "Configuring", - url: "#", - }, - { - title: "Testing", - url: "#", - }, - { - title: "Authentication", - url: "#", - }, - { - title: "Deploying", - url: "#", - }, - { - title: "Upgrading", - url: "#", - }, - { - title: "Examples", + title: "List View", url: "#", }, ], }, { - title: "API Reference", + title: "Data Management", url: "#", items: [ { - title: "Components", + title: "Refresh Listings", url: "#", }, { - title: "File Conventions", - url: "#", - }, - { - 
title: "Functions", - url: "#", - }, - { - title: "next.config.js Options", - url: "#", - }, - { - title: "CLI", - url: "#", - }, - { - title: "Edge Runtime", + title: "Active Tasks", url: "#", }, ], }, { - title: "Architecture", + title: "Settings", url: "#", items: [ { - title: "Accessibility", + title: "Preferences", url: "#", }, { - title: "Fast Refresh", - url: "#", - }, - { - title: "Next.js Compiler", - url: "#", - }, - { - title: "Supported Browsers", - url: "#", - }, - { - title: "Turbopack", + title: "Account", url: "#", }, ], @@ -145,21 +62,19 @@ const data = { export function AppSidebar({ ...props }: React.ComponentProps) { return ( - // create closed by default - + - {/* We create a SidebarGroup for each parent. */} {data.navMain.map((item) => ( {item.title} - {item.items.map((item) => ( - - - {item.title} + {item.items.map((subItem) => ( + + + {subItem.title} ))} diff --git a/crawler/frontend/src/components/Parameters.tsx b/crawler/frontend/src/components/Parameters.tsx index ed4ccdf..9693bc1 100644 --- a/crawler/frontend/src/components/Parameters.tsx +++ b/crawler/frontend/src/components/Parameters.tsx @@ -1,8 +1,6 @@ -import { getUser } from "@/auth/authService"; import { zodResolver } from "@hookform/resolvers/zod"; import { DialogTitle } from "@radix-ui/react-dialog"; -import type { User } from "oidc-client-ts"; -import { useEffect, useState } from "react"; +import { useState } from "react"; import { useForm } from "react-hook-form"; import { z } from "zod"; import { Button } from "./ui/button"; @@ -24,6 +22,12 @@ export enum ListingType { BUY = 'BUY' } +export enum FurnishType { + FURNISHED = 'furnished', + PART_FURNISHED = 'partFurnished', + UNFURNISHED = 'unfurnished', +} + export interface ParameterValues { metric: Metric @@ -33,30 +37,15 @@ export interface ParameterValues { min_price?: number max_price?: number min_sqm?: number + max_sqm?: number + min_price_per_sqm?: number + max_price_per_sqm?: number last_seen_days?: number 
available_from?: Date district: string + furnish_types?: FurnishType[] } -const fetchDistricts = async (user: User | null) => { - const accessToken = user?.access_token; - - const response = await fetch('/api/get_districts', - { - method: 'GET', - headers: { - 'Authorization': `Bearer ${accessToken}`, // Pass the token - 'Content-Type': 'application/json', - }, - } - ); - if (!response.ok) { - throw new Error('Error: ' + response.status); - } - const data: Response = await response.json(); - return data; -}; - export function Parameters( props: { isOpen: boolean, @@ -69,15 +58,6 @@ export function Parameters( } = useForm() const [action, setAction] = useState<'fetch-data' | 'visualize' | null>(null) const [availableFromRawInput, setAvailableFromRawInput] = useState("now"); - const [_districts, setDistricts] = useState([]); - - useEffect(() => { - getUser().then(user => { - fetchDistricts(user).then(data => { - setDistricts(Object.keys(data)); - }) - }) - }, []); const formSchema = z.object({ metric: z.nativeEnum(Metric, { required_error: "Metric is required" }), @@ -177,29 +157,6 @@ export function Parameters( )} /> - {/* ( - - District - - - - )} - /> */} void; +} + +interface ListingStats { + count: number; + avgPrice: number; + avgPricePerSqm: number; + avgSize: number; +} + +function calculateStats(data: GeoJSONFeatureCollection | null): ListingStats { + if (!data || data.features.length === 0) { + return { count: 0, avgPrice: 0, avgPricePerSqm: 0, avgSize: 0 }; + } + + const features = data.features; + const count = features.length; + + const validPrices = features + .map((f: PropertyFeature) => f.properties.total_price) + .filter((p): p is number => typeof p === 'number' && p > 0); + + const validPricesPerSqm = features + .map((f: PropertyFeature) => f.properties.qmprice) + .filter((p): p is number => typeof p === 'number' && p > 0); + + const validSizes = features + .map((f: PropertyFeature) => f.properties.qm) + .filter((s): s is number => typeof s === 
'number' && s > 0); + + const avgPrice = validPrices.length > 0 + ? validPrices.reduce((a, b) => a + b, 0) / validPrices.length + : 0; + + const avgPricePerSqm = validPricesPerSqm.length > 0 + ? validPricesPerSqm.reduce((a, b) => a + b, 0) / validPricesPerSqm.length + : 0; + + const avgSize = validSizes.length > 0 + ? validSizes.reduce((a, b) => a + b, 0) / validSizes.length + : 0; + + return { count, avgPrice, avgPricePerSqm, avgSize }; +} + +function formatCurrency(value: number): string { + if (value >= 1000) { + return `£${(value / 1000).toFixed(1)}k`; + } + return `£${Math.round(value)}`; +} + +export function StatsBar({ listingData, viewMode, onViewModeChange }: StatsBarProps) { + const stats = calculateStats(listingData); + + return ( +
+ {/* Stats */} +
+
+ + {stats.count.toLocaleString()} + listings +
+ + {stats.avgPrice > 0 && ( + <> +
+ + Avg: {formatCurrency(stats.avgPrice)} +
+
+ + Avg £/m²: {formatCurrency(stats.avgPricePerSqm)} +
+
+ + Avg: {Math.round(stats.avgSize)} m² +
+ + )} +
+ + {/* View Mode Toggle */} +
+ + + +
+
+ ); +} diff --git a/crawler/frontend/src/components/StreamingProgressBar.tsx b/crawler/frontend/src/components/StreamingProgressBar.tsx new file mode 100644 index 0000000..fd28105 --- /dev/null +++ b/crawler/frontend/src/components/StreamingProgressBar.tsx @@ -0,0 +1,47 @@ +import { Loader2 } from 'lucide-react'; +import type { StreamingProgress } from '@/services'; + +interface StreamingProgressBarProps { + progress: StreamingProgress | null; + isLoading: boolean; +} + +export function StreamingProgressBar({ progress, isLoading }: StreamingProgressBarProps) { + if (!isLoading) return null; + + return ( +
+
+ +
+
+ + {progress + ? `Loading listings...` + : 'Loading...'} + + {progress && ( + + {progress.count.toLocaleString()} + {progress.total ? ` / ${progress.total.toLocaleString()}` : ''} loaded + + )} +
+ {progress && ( +
+
+
+ )} +
+
+
+ ); +} diff --git a/crawler/frontend/src/components/ui/accordion.tsx b/crawler/frontend/src/components/ui/accordion.tsx new file mode 100644 index 0000000..8be5856 --- /dev/null +++ b/crawler/frontend/src/components/ui/accordion.tsx @@ -0,0 +1,56 @@ +"use client" + +import * as React from "react" +import * as AccordionPrimitive from "@radix-ui/react-accordion" +import { ChevronDown } from "lucide-react" +import { cn } from "@/lib/utils" + +const Accordion = AccordionPrimitive.Root + +const AccordionItem = React.forwardRef< + React.ComponentRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => ( + +)) +AccordionItem.displayName = "AccordionItem" + +const AccordionTrigger = React.forwardRef< + React.ComponentRef, + React.ComponentPropsWithoutRef +>(({ className, children, ...props }, ref) => ( + + svg]:rotate-180", + className + )} + {...props} + > + {children} + + + +)) +AccordionTrigger.displayName = AccordionPrimitive.Trigger.displayName + +const AccordionContent = React.forwardRef< + React.ComponentRef, + React.ComponentPropsWithoutRef +>(({ className, children, ...props }, ref) => ( + +
{children}
+
+)) +AccordionContent.displayName = AccordionPrimitive.Content.displayName + +export { Accordion, AccordionItem, AccordionTrigger, AccordionContent } diff --git a/crawler/frontend/src/components/ui/alert.tsx b/crawler/frontend/src/components/ui/alert.tsx deleted file mode 100644 index 1421354..0000000 --- a/crawler/frontend/src/components/ui/alert.tsx +++ /dev/null @@ -1,66 +0,0 @@ -import * as React from "react" -import { cva, type VariantProps } from "class-variance-authority" - -import { cn } from "@/lib/utils" - -const alertVariants = cva( - "relative w-full rounded-lg border px-4 py-3 text-sm grid has-[>svg]:grid-cols-[calc(var(--spacing)*4)_1fr] grid-cols-[0_1fr] has-[>svg]:gap-x-3 gap-y-0.5 items-start [&>svg]:size-4 [&>svg]:translate-y-0.5 [&>svg]:text-current", - { - variants: { - variant: { - default: "bg-card text-card-foreground", - destructive: - "text-destructive bg-card [&>svg]:text-current *:data-[slot=alert-description]:text-destructive/90", - }, - }, - defaultVariants: { - variant: "default", - }, - } -) - -function Alert({ - className, - variant, - ...props -}: React.ComponentProps<"div"> & VariantProps) { - return ( -
- ) -} - -function AlertTitle({ className, ...props }: React.ComponentProps<"div">) { - return ( -
- ) -} - -function AlertDescription({ - className, - ...props -}: React.ComponentProps<"div">) { - return ( -
- ) -} - -export { Alert, AlertTitle, AlertDescription } diff --git a/crawler/frontend/src/components/ui/badge.tsx b/crawler/frontend/src/components/ui/badge.tsx deleted file mode 100644 index 0205413..0000000 --- a/crawler/frontend/src/components/ui/badge.tsx +++ /dev/null @@ -1,46 +0,0 @@ -import * as React from "react" -import { Slot } from "@radix-ui/react-slot" -import { cva, type VariantProps } from "class-variance-authority" - -import { cn } from "@/lib/utils" - -const badgeVariants = cva( - "inline-flex items-center justify-center rounded-md border px-2 py-0.5 text-xs font-medium w-fit whitespace-nowrap shrink-0 [&>svg]:size-3 gap-1 [&>svg]:pointer-events-none focus-visible:border-ring focus-visible:ring-ring/50 focus-visible:ring-[3px] aria-invalid:ring-destructive/20 dark:aria-invalid:ring-destructive/40 aria-invalid:border-destructive transition-[color,box-shadow] overflow-hidden", - { - variants: { - variant: { - default: - "border-transparent bg-primary text-primary-foreground [a&]:hover:bg-primary/90", - secondary: - "border-transparent bg-secondary text-secondary-foreground [a&]:hover:bg-secondary/90", - destructive: - "border-transparent bg-destructive text-white [a&]:hover:bg-destructive/90 focus-visible:ring-destructive/20 dark:focus-visible:ring-destructive/40 dark:bg-destructive/60", - outline: - "text-foreground [a&]:hover:bg-accent [a&]:hover:text-accent-foreground", - }, - }, - defaultVariants: { - variant: "default", - }, - } -) - -function Badge({ - className, - variant, - asChild = false, - ...props -}: React.ComponentProps<"span"> & - VariantProps & { asChild?: boolean }) { - const Comp = asChild ? 
Slot : "span" - - return ( - - ) -} - -export { Badge, badgeVariants } diff --git a/crawler/frontend/src/components/ui/breadcrumb.tsx b/crawler/frontend/src/components/ui/breadcrumb.tsx index eb88f32..27b5c51 100644 --- a/crawler/frontend/src/components/ui/breadcrumb.tsx +++ b/crawler/frontend/src/components/ui/breadcrumb.tsx @@ -1,6 +1,6 @@ import * as React from "react" import { Slot } from "@radix-ui/react-slot" -import { ChevronRight, MoreHorizontal } from "lucide-react" +import { ChevronRight } from "lucide-react" import { cn } from "@/lib/utils" @@ -80,24 +80,6 @@ function BreadcrumbSeparator({ ) } -function BreadcrumbEllipsis({ - className, - ...props -}: React.ComponentProps<"span">) { - return ( - - ) -} - export { Breadcrumb, BreadcrumbList, @@ -105,5 +87,4 @@ export { BreadcrumbLink, BreadcrumbPage, BreadcrumbSeparator, - BreadcrumbEllipsis, } diff --git a/crawler/frontend/src/components/ui/checkbox.tsx b/crawler/frontend/src/components/ui/checkbox.tsx new file mode 100644 index 0000000..488095a --- /dev/null +++ b/crawler/frontend/src/components/ui/checkbox.tsx @@ -0,0 +1,29 @@ +"use client" + +import * as React from "react" +import * as CheckboxPrimitive from "@radix-ui/react-checkbox" +import { Check } from "lucide-react" +import { cn } from "@/lib/utils" + +const Checkbox = React.forwardRef< + React.ComponentRef, + React.ComponentPropsWithoutRef +>(({ className, ...props }, ref) => ( + + + + + +)) +Checkbox.displayName = CheckboxPrimitive.Root.displayName + +export { Checkbox } diff --git a/crawler/frontend/src/components/ui/slider.tsx b/crawler/frontend/src/components/ui/slider.tsx new file mode 100644 index 0000000..3c20cab --- /dev/null +++ b/crawler/frontend/src/components/ui/slider.tsx @@ -0,0 +1,34 @@ +"use client" + +import * as React from "react" +import * as SliderPrimitive from "@radix-ui/react-slider" +import { cn } from "@/lib/utils" + +const Slider = React.forwardRef< + React.ComponentRef, + React.ComponentPropsWithoutRef +>(({ 
className, ...props }, ref) => ( + + + + + {props.defaultValue?.map((_, index) => ( + + )) ?? ( + + )} + +)) +Slider.displayName = SliderPrimitive.Root.displayName + +export { Slider } diff --git a/crawler/frontend/src/index.css b/crawler/frontend/src/index.css index f4c1e9b..4f3edd6 100644 --- a/crawler/frontend/src/index.css +++ b/crawler/frontend/src/index.css @@ -118,3 +118,30 @@ @apply bg-background text-foreground; } } + +/* Accordion animations */ +@keyframes accordion-down { + from { + height: 0; + } + to { + height: var(--radix-accordion-content-height); + } +} + +@keyframes accordion-up { + from { + height: var(--radix-accordion-content-height); + } + to { + height: 0; + } +} + +.animate-accordion-down { + animation: accordion-down 0.2s ease-out; +} + +.animate-accordion-up { + animation: accordion-up 0.2s ease-out; +} diff --git a/crawler/frontend/src/services/apiClient.ts b/crawler/frontend/src/services/apiClient.ts new file mode 100644 index 0000000..7efd8a6 --- /dev/null +++ b/crawler/frontend/src/services/apiClient.ts @@ -0,0 +1,62 @@ +// Generic API client with authentication + +import type { User } from 'oidc-client-ts'; +import { ApiError } from '@/types'; + +export interface RequestOptions { + method?: 'GET' | 'POST' | 'PUT' | 'DELETE'; + params?: Record; +} + +/** + * Build query string from parameters object + */ +function buildQueryString(params: Record): string { + const queryString = new URLSearchParams(); + + for (const [key, value] of Object.entries(params)) { + if (value !== undefined && value !== null && value !== '') { + if (value instanceof Date) { + queryString.append(key, value.toISOString()); + } else { + queryString.append(key, String(value)); + } + } + } + + return queryString.toString(); +} + +/** + * Generic authenticated API request + */ +export async function apiRequest( + user: User, + endpoint: string, + options: RequestOptions = {} +): Promise { + const { method = 'GET', params } = options; + const accessToken = 
user.access_token; + + let url = endpoint; + if (params) { + const queryString = buildQueryString(params); + if (queryString) { + url = `${endpoint}?${queryString}`; + } + } + + const response = await fetch(url, { + method, + headers: { + Authorization: `Bearer ${accessToken}`, + 'Content-Type': 'application/json', + }, + }); + + if (!response.ok) { + throw new ApiError(`Error: ${response.status}`, response.status); + } + + return response.json() as Promise; +} diff --git a/crawler/frontend/src/services/listingService.ts b/crawler/frontend/src/services/listingService.ts new file mode 100644 index 0000000..4520989 --- /dev/null +++ b/crawler/frontend/src/services/listingService.ts @@ -0,0 +1,54 @@ +// Listing service for fetching and refreshing listings + +import type { User } from 'oidc-client-ts'; +import type { GeoJSONFeatureCollection, RefreshListingsResponse } from '@/types'; +import type { ParameterValues } from '@/components/FilterPanel'; +import { apiRequest } from './apiClient'; +import { API_ENDPOINTS } from '@/constants'; + +/** + * Build listing query parameters from form values + */ +function buildListingParams(parameters: ParameterValues): Record { + return { + listing_type: parameters.listing_type, + min_bedrooms: parameters.min_bedrooms, + max_bedrooms: parameters.max_bedrooms, + max_price: parameters.max_price, + min_price: parameters.min_price, + min_sqm: parameters.min_sqm, + max_sqm: parameters.max_sqm, + min_price_per_sqm: parameters.min_price_per_sqm, + max_price_per_sqm: parameters.max_price_per_sqm, + last_seen_days: parameters.last_seen_days, + let_date_available_from: parameters.available_from, + district_names: parameters.district || undefined, + furnish_types: parameters.furnish_types?.join(',') || undefined, + }; +} + +/** + * Fetch listing data as GeoJSON + */ +export async function fetchListingGeoJSON( + user: User, + parameters: ParameterValues +): Promise { + return apiRequest(user, API_ENDPOINTS.LISTING_GEOJSON, { + method: 'GET', + 
params: buildListingParams(parameters), + }); +} + +/** + * Trigger a listing refresh task + */ +export async function refreshListings( + user: User, + parameters: ParameterValues +): Promise { + return apiRequest(user, API_ENDPOINTS.REFRESH_LISTINGS, { + method: 'POST', + params: buildListingParams(parameters), + }); +} diff --git a/crawler/frontend/src/utils/mapUtils.ts b/crawler/frontend/src/utils/mapUtils.ts new file mode 100644 index 0000000..4bd8d41 --- /dev/null +++ b/crawler/frontend/src/utils/mapUtils.ts @@ -0,0 +1,45 @@ +// Map utility functions + +/** + * Deep clone an object using JSON serialization + */ +export function clone(obj: T): T { + return JSON.parse(JSON.stringify(obj)); +} + +/** + * Calculate the value at a given percentile in a sorted array + * @param arr Sorted array of numbers + * @param p Percentile (0-1) + */ +export function percentile(arr: number[], p: number): number { + if (arr.length === 0) return 0; + if (typeof p !== 'number') throw new TypeError('p must be a number'); + if (p <= 0) return arr[0]; + if (p >= 1) return arr[arr.length - 1]; + + const index = arr.length * p; + const lower = Math.floor(index); + const upper = lower + 1; + const weight = index % 1; + + if (upper >= arr.length) return arr[lower]; + return arr[lower] * (1 - weight) + arr[upper] * weight; +} + +/** + * Convert percentage-based color stops to value-based color stops + * @param colorStopsPerc Array of [percentage, color] tuples + * @param min Minimum value + * @param max Maximum value + */ +export function calculateColorStops( + colorStopsPerc: [number, string][], + min: number, + max: number +): [number, string][] { + return colorStopsPerc.map(([perc, color]) => [ + min + (perc * (max - min)) / 100, + color, + ]); +} diff --git a/crawler/frontend/tsconfig.app.tsbuildinfo b/crawler/frontend/tsconfig.app.tsbuildinfo index c9dbc69..2030281 100644 --- a/crawler/frontend/tsconfig.app.tsbuildinfo +++ b/crawler/frontend/tsconfig.app.tsbuildinfo @@ -1 +1 @@ 
-{"root":["./src/App.tsx","./src/AppSidebar.tsx","./src/main.tsx","./src/vite-env.d.ts","./src/auth/authService.ts","./src/auth/config.ts","./src/components/ActiveQuery.tsx","./src/components/AlertError.tsx","./src/components/LoginModal.tsx","./src/components/Map.tsx","./src/components/Parameters.tsx","./src/components/ui/alert-dialog.tsx","./src/components/ui/alert.tsx","./src/components/ui/badge.tsx","./src/components/ui/breadcrumb.tsx","./src/components/ui/button.tsx","./src/components/ui/dialog.tsx","./src/components/ui/form.tsx","./src/components/ui/hover-card.tsx","./src/components/ui/input.tsx","./src/components/ui/label.tsx","./src/components/ui/progress.tsx","./src/components/ui/scroll-area.tsx","./src/components/ui/select.tsx","./src/components/ui/separator.tsx","./src/components/ui/sheet.tsx","./src/components/ui/sidebar.tsx","./src/components/ui/skeleton.tsx","./src/components/ui/tooltip.tsx","./src/hooks/use-mobile.ts","./src/lib/utils.ts"],"version":"5.8.3"} \ No newline at end of file 
+{"root":["./src/App.tsx","./src/AppSidebar.tsx","./src/main.tsx","./src/vite-env.d.ts","./src/auth/authService.ts","./src/auth/config.ts","./src/auth/errors.ts","./src/components/ActiveQuery.tsx","./src/components/AlertError.tsx","./src/components/AuthCallback.tsx","./src/components/FilterPanel.tsx","./src/components/Header.tsx","./src/components/HealthIndicator.tsx","./src/components/ListView.tsx","./src/components/LoginModal.tsx","./src/components/Map.tsx","./src/components/Parameters.tsx","./src/components/PropertyCard.tsx","./src/components/Spinner.tsx","./src/components/StatsBar.tsx","./src/components/StreamingProgressBar.tsx","./src/components/TaskIndicator.tsx","./src/components/ui/DatePicker.tsx","./src/components/ui/accordion.tsx","./src/components/ui/alert-dialog.tsx","./src/components/ui/breadcrumb.tsx","./src/components/ui/button.tsx","./src/components/ui/calendar.tsx","./src/components/ui/checkbox.tsx","./src/components/ui/dialog.tsx","./src/components/ui/form.tsx","./src/components/ui/hover-card.tsx","./src/components/ui/input.tsx","./src/components/ui/label.tsx","./src/components/ui/popover.tsx","./src/components/ui/progress.tsx","./src/components/ui/scroll-area.tsx","./src/components/ui/select.tsx","./src/components/ui/separator.tsx","./src/components/ui/sheet.tsx","./src/components/ui/sidebar.tsx","./src/components/ui/skeleton.tsx","./src/components/ui/slider.tsx","./src/components/ui/tooltip.tsx","./src/constants/colorSchemes.ts","./src/constants/index.ts","./src/hooks/use-mobile.ts","./src/lib/utils.ts","./src/services/apiClient.ts","./src/services/healthService.ts","./src/services/index.ts","./src/services/listingService.ts","./src/services/streamingService.ts","./src/services/taskService.ts","./src/types/index.ts","./src/utils/mapUtils.ts"],"version":"5.8.3"} \ No newline at end of file diff --git a/crawler/frontend/vite.config.ts b/crawler/frontend/vite.config.ts index 4e658f6..2511867 100644 --- a/crawler/frontend/vite.config.ts +++ 
b/crawler/frontend/vite.config.ts @@ -19,7 +19,8 @@ export default defineConfig({ allowedHosts: [ env.DEV_HOST ?? 'localhost', // Add more hosts here - 'wrongmove.viktorbarzin.me' + 'wrongmove.viktorbarzin.me', + 'devvm.viktorbarzin.lan' ], } }) diff --git a/crawler/main.py b/crawler/main.py index a7acbf9..b9fdee4 100644 --- a/crawler/main.py +++ b/crawler/main.py @@ -1,28 +1,28 @@ +"""CLI entry point for the Real Estate Crawler.""" import asyncio from datetime import datetime import os import pathlib +from typing import Callable, ParamSpec, TypeVar import click -import importlib from models.listing import FurnishType, ListingType, QueryParameters -from rec.districts import get_districts from data_access import Listing -import csv_exporter from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode from repositories.listing_repository import ListingRepository -from ui_exporter import export_immoweb as export_immoweb_ui from functools import wraps from database import engine +from services import ( + listing_service, + export_service, + district_service, +) + +P = ParamSpec("P") +R = TypeVar("R") -dump_listings_module = importlib.import_module("1_dump_listings") -dump_images_module = importlib.import_module("3_dump_images") -detect_floorplan_module = importlib.import_module("4_detect_floorplan") -routing_module = importlib.import_module("5_routing") - - -def listing_filter_options(func): +def listing_filter_options(func: Callable[P, R]) -> Callable[P, R]: """Decorator to add common options for filtering listings.""" @click.option( @@ -45,7 +45,7 @@ def listing_filter_options(func): "--max-bedrooms", default=10, help="Maximum number of bedrooms", - type=click.IntRange(min=1, max=10), # Right move gets unhappy with >10 + type=click.IntRange(min=1, max=10), ) @click.option( "--min-price", @@ -57,13 +57,13 @@ def listing_filter_options(func): "--max-price", default=999_999, help="Maximum price", - type=click.IntRange(min=0), # 40k for renting + 
type=click.IntRange(min=0), ) @click.option( "--district", default=None, help="Districts to scrape", - type=click.Choice(get_districts().keys(), case_sensitive=False), + type=click.Choice(district_service.get_district_names(), case_sensitive=False), multiple=True, ) @click.option( @@ -95,17 +95,50 @@ def listing_filter_options(func): type=int, ) @wraps(func) - def wrapper(*args, **kwargs): + def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: return func(*args, **kwargs) return wrapper +def build_query_parameters( + type: str, + district: list[str], + min_bedrooms: int, + max_bedrooms: int, + min_price: int, + max_price: int, + furnish_types: list[str], + available_from: datetime | None, + last_seen_days: int, + min_sqm: int | None = None, + radius: int = 0, + page_size: int = 500, + max_days_since_added: int = 14, +) -> QueryParameters: + """Build QueryParameters from CLI options.""" + return QueryParameters( + listing_type=ListingType[type], + district_names=set(district) if district else None, + min_bedrooms=min_bedrooms, + max_bedrooms=max_bedrooms, + min_price=min_price, + max_price=max_price, + furnish_types=[FurnishType[ft] for ft in furnish_types] if furnish_types else None, + let_date_available_from=available_from, + last_seen_days=last_seen_days, + min_sqm=min_sqm, + radius=radius, + page_size=page_size, + max_days_since_added=max_days_since_added, + ) + + @click.group() @click.option( "--data-dir", default=pathlib.Path("data/rs/"), - help="Districts to scrape", + help="Data directory for storing listings", type=click.Path( writable=True, file_okay=False, @@ -114,17 +147,18 @@ def listing_filter_options(func): ), ) @click.pass_context -def cli(ctx, data_dir: str): +def cli(ctx: click.Context, data_dir: str) -> None: ctx.ensure_object(dict) - ctx.obj["data_dir"] = data_dir + ctx.obj["data_dir"] = pathlib.Path(data_dir) + ctx.obj["repository"] = ListingRepository(engine=engine) @cli.command() @listing_filter_options -@click.option("--full", is_flag=True) 
+@click.option("--full", is_flag=True, help="Include images and floorplan detection") @click.pass_context def dump_listings( - ctx: click.core.Context, + ctx: click.Context, full: bool, district: list[str], min_bedrooms: int, @@ -136,58 +170,63 @@ def dump_listings( available_from: datetime | None, last_seen_days: int, min_sqm: int | None = None, -): - data_dir: str = ctx.obj["data_dir"] - query_parameters = QueryParameters( - listing_type=ListingType[type], - district_names=set(district), +) -> None: + """Fetch listings from Rightmove API.""" + data_dir: pathlib.Path = ctx.obj["data_dir"] + repository: ListingRepository = ctx.obj["repository"] + + query_parameters = build_query_parameters( + type=type, + district=district, min_bedrooms=min_bedrooms, max_bedrooms=max_bedrooms, min_price=min_price, max_price=max_price, - furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types], - let_date_available_from=available_from, + furnish_types=furnish_types, + available_from=available_from, last_seen_days=last_seen_days, min_sqm=min_sqm, - radius=0, - page_size=500, - max_days_since_added=14, ) - click.echo( - f"Running dump_listings for districts {district}, data dir {data_dir} and parameters: " - f"{query_parameters}" + + click.echo(f"Fetching listings with parameters: {query_parameters}") + + result = asyncio.run( + listing_service.refresh_listings( + repository, + query_parameters, + full=full, + async_mode=False, + ) ) - data_dir_path = pathlib.Path(data_dir) - repository = ListingRepository(engine=engine) - if not full: # only listings - asyncio.run( - dump_listings_module.dump_listings( - query_parameters, repository, data_dir_path - ) - ) - else: # include images, floorplan detection etc. 
- asyncio.run( - dump_listings_module.dump_listings_full( - query_parameters, repository, data_dir_path - ) - ) + + click.echo(result.message) @cli.command() @click.pass_context -def dump_images(ctx: click.core.Context): - data_dir = ctx.obj["data_dir"] - click.echo(f"Running dump_images for listings stored in {engine.url}") - repository = ListingRepository(engine=engine) - asyncio.run(dump_images_module.dump_images(repository, image_base_path=data_dir)) +def dump_images(ctx: click.Context) -> None: + """Download floorplan images for all listings.""" + data_dir: pathlib.Path = ctx.obj["data_dir"] + repository: ListingRepository = ctx.obj["repository"] + + click.echo(f"Downloading images to {data_dir}") + + count = asyncio.run(listing_service.download_images(repository, data_dir)) + + click.echo(f"Processed {count} listings") @cli.command() @click.pass_context -def detect_floorplan(ctx: click.core.Context): - click.echo(f"Running detect_floorplan for listings stored in {engine.url}") - repository = ListingRepository(engine=engine) - asyncio.run(detect_floorplan_module.detect_floorplan(repository)) +def detect_floorplan(ctx: click.Context) -> None: + """Run OCR on floorplan images to detect square meters.""" + repository: ListingRepository = ctx.obj["repository"] + + click.echo("Running floorplan detection...") + + count = asyncio.run(listing_service.detect_floorplans(repository)) + + click.echo(f"Processed {count} listings") @cli.command() @@ -202,10 +241,7 @@ def detect_floorplan(ctx: click.core.Context): "--travel-mode", "-m", help="Travel mode for routing", - type=click.Choice( - TravelMode.__members__.keys(), - case_sensitive=False, - ), + type=click.Choice(TravelMode.__members__.keys(), case_sensitive=False), required=True, ) @click.option( @@ -213,65 +249,50 @@ def detect_floorplan(ctx: click.core.Context): "-l", help="Limit the number of listings to process", type=click.IntRange(min=1), - default=1, # by default limit to 1 to avoid accidental API usage + 
default=1, ) @click.pass_context def routing( - ctx: click.core.Context, destination_address: str, travel_mode: str, limit: int -): - data_dir = ctx.obj["data_dir"] - click.echo(f"Running routing for the first {limit} listings in {data_dir}") - listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json"))) - listing_paths = listing_paths[:limit] + ctx: click.Context, + destination_address: str, + travel_mode: str, + limit: int, +) -> None: + """Calculate transit routes for listings.""" + repository: ListingRepository = ctx.obj["repository"] + if os.environ.get(API_KEY_ENVIRONMENT_VARIABLE) is None: - raise click.exceptions.MissingParameter( - f"{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set. " - "Please set it to your API key for the routing service." + raise click.ClickException( + f"{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set." ) - repository = ListingRepository(engine=engine) - asyncio.run( - routing_module.calculate_route( + click.echo(f"Calculating routes to '{destination_address}' for {limit} listings") + + count = asyncio.run( + listing_service.calculate_routes( repository, destination_address, - # destination_address_coordinates, - TravelMode[travel_mode], + travel_mode, limit=limit, ) ) + click.echo(f"Processed {count} listings") + @cli.command() -# @click.option( -# "--columns", -# "-C", -# help="Columns to include in the CSV file", -# type=click.Choice( -# # csv_exporter.get_columns_from_listings(), -# [1], -# case_sensitive=False, -# ), -# multiple=True, -# default=Listing.ALL_COLUMNS, -# ) @click.option( "--output-file", "-O", help="Path to the output CSV file", required=True, - type=click.Path( - writable=True, - file_okay=True, - dir_okay=False, - resolve_path=True, - ), + type=click.Path(writable=True, file_okay=True, dir_okay=False, resolve_path=True), ) -@click.pass_context @listing_filter_options +@click.pass_context def export_csv( - ctx: click.core.Context, + ctx: click.Context, output_file: str, 
- # columns: tuple[str], district: list[str], min_bedrooms: int, max_bedrooms: int, @@ -282,53 +303,48 @@ def export_csv( available_from: datetime | None, last_seen_days: int, min_sqm: int | None = None, -): - # use model - data_dir = ctx.obj["data_dir"] - query_parameters = QueryParameters( - listing_type=ListingType[type], - district_names=set(district), +) -> None: + """Export listings to CSV file.""" + repository: ListingRepository = ctx.obj["repository"] + + query_parameters = build_query_parameters( + type=type, + district=district, min_bedrooms=min_bedrooms, max_bedrooms=max_bedrooms, min_price=min_price, max_price=max_price, - furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types], - let_date_available_from=available_from, + furnish_types=furnish_types, + available_from=available_from, last_seen_days=last_seen_days, min_sqm=min_sqm, ) - click.echo( - f"Exporting data to {output_file} using {data_dir=} and query parameters: {query_parameters}" - ) - output_file_path = pathlib.Path(output_file) - repository = ListingRepository(engine=engine) - asyncio.run( - csv_exporter.export_to_csv( + + click.echo(f"Exporting to {output_file}") + + result = asyncio.run( + export_service.export_to_csv( repository, - output_file_path, - # list(columns), - query_parameters=query_parameters, - ), + pathlib.Path(output_file), + query_parameters, + ) ) + click.echo(result.message) + @cli.command() @click.option( "--output-file", "-O", - help="Path to the output immoweb file", + help="Path to the output GeoJSON file", required=True, - type=click.Path( - writable=True, - file_okay=True, - dir_okay=False, - resolve_path=True, - ), + type=click.Path(writable=True, file_okay=True, dir_okay=False, resolve_path=True), ) @listing_filter_options @click.pass_context def export_immoweb( - ctx: click.core.Context, + ctx: click.Context, output_file: str, district: list[str], min_bedrooms: int, @@ -340,39 +356,62 @@ def export_immoweb( available_from: datetime | None, 
last_seen_days: int, min_sqm: int | None = None, -): - query_parameters = QueryParameters( - listing_type=ListingType[type], - district_names=set(district), +) -> None: + """Export listings to GeoJSON file for map visualization.""" + repository: ListingRepository = ctx.obj["repository"] + + query_parameters = build_query_parameters( + type=type, + district=district, min_bedrooms=min_bedrooms, max_bedrooms=max_bedrooms, min_price=min_price, max_price=max_price, - furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types], - let_date_available_from=available_from, + furnish_types=furnish_types, + available_from=available_from, last_seen_days=last_seen_days, min_sqm=min_sqm, ) - click.echo( - f"Exporting data to {output_file} for listings stored in {engine.url} that match the query parameters: {query_parameters}" + + click.echo(f"Exporting to {output_file}") + + result = asyncio.run( + export_service.export_to_geojson( + repository, + query_parameters=query_parameters, + output_path=pathlib.Path(output_file), + ) ) - repository = ListingRepository(engine=engine) - asyncio.run(export_immoweb_ui(repository, output_file, query_parameters)) + + click.echo(result.message) @cli.command() @click.pass_context -def populate_db( - ctx: click.core.Context, -): - data_dir = ctx.obj["data_dir"] - click.echo(f"Populating the database with data from {data_dir}") - repository = ListingRepository(engine=engine) +def populate_db(ctx: click.Context) -> None: + """Populate database from filesystem data (legacy migration).""" + data_dir: pathlib.Path = ctx.obj["data_dir"] + repository: ListingRepository = ctx.obj["repository"] + + click.echo(f"Populating database from {data_dir}") + listings = Listing.get_all_listings( - [path for path in pathlib.Path(data_dir).glob("*/listing.json")] + [path for path in data_dir.glob("*/listing.json")] ) + asyncio.run(repository.upsert_listings_legacy(listings)) + click.echo(f"Imported {len(listings)} listings") + + +@cli.command() +def 
list_districts() -> None: + """List all available districts.""" + districts = district_service.get_all_districts() + click.echo(f"Available districts ({len(districts)}):") + for name in sorted(districts.keys()): + click.echo(f" - {name}") + if __name__ == "__main__": cli() diff --git a/crawler/main_tmp.py b/crawler/main_tmp.py deleted file mode 100644 index 0741966..0000000 --- a/crawler/main_tmp.py +++ /dev/null @@ -1,40 +0,0 @@ -def record(): - from rec.query import listing_query, detail_query - import json - - page = 1 - listing = listing_query(page, 2, 2, 5, 200000, 500000) - with open( - f"/Users/kadir/code/realestate/crawler/code/json/queries/listing{page}.json", - "w", - ) as f: - json.dump(listing, f) - - for prop in listing["properties"]: - identifier = prop["identifier"] - resp = detail_query(identifier) - # print(identifier, resp.status_code) - with open( - f"/Users/kadir/code/realestate/crawler/code/json/queries/detail_{identifier}.json", - "w", - ) as f: - json.dump(resp, f) - - -def process(): - import json - import pathlib - - path = pathlib.Path("/Users/kadir/code/realestate/crawler/code/json/queries/") - - detailjsons = list(path.glob("detail_*json")) - for file in detailjsons: - with open(file) as f: - js = json.load(f) - - for floorplan in js["property"]["floorplans"]: - print(floorplan["url"]) - - -# record() -process() diff --git a/crawler/models/listing.py b/crawler/models/listing.py index cd0584b..3c645c8 100644 --- a/crawler/models/listing.py +++ b/crawler/models/listing.py @@ -5,7 +5,7 @@ from datetime import datetime, timedelta import enum import json from typing import Any, Dict, List -from pydantic import BaseModel +from pydantic import BaseModel, Field as PydanticField from rec import routing from sqlmodel import JSON, TEXT, SQLModel, Field @@ -80,7 +80,10 @@ class Listing(SQLModel, table=False): @property def is_removed(self) -> bool: - return not self.additional_info["property"]["visible"] + if not self.additional_info: + return False 
+ property_info = self.additional_info.get("property", {}) + return not property_info.get("visible", True) @property def price_per_square_meter(self) -> float | None: @@ -231,14 +234,16 @@ class ListingType(enum.StrEnum): RENT = "RENT" -@dataclass(frozen=True) class QueryParameters(BaseModel): + """Query parameters for filtering listings.""" + model_config = {"frozen": True} + listing_type: ListingType min_bedrooms: int = 1 max_bedrooms: int = 999 min_price: int = 0 max_price: int = 10_000_000 - district_names: set[str] = dataclasses.field(default_factory=set) + district_names: set[str] = PydanticField(default_factory=set) radius: float = 0 page_size: int = 500 # items per page max_days_since_added: int = 14 # for buy listings diff --git a/crawler/poetry.lock b/crawler/poetry.lock index c822ef9..daad2a3 100644 --- a/crawler/poetry.lock +++ b/crawler/poetry.lock @@ -120,6 +120,22 @@ yarl = ">=1.17.0,<2.0" [package.extras] speedups = ["Brotli ; platform_python_implementation == \"CPython\"", "aiodns (>=3.3.0)", "brotlicffi ; platform_python_implementation != \"CPython\""] +[[package]] +name = "aiohttp-socks" +version = "0.8.4" +description = "Proxy connector for aiohttp" +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "aiohttp_socks-0.8.4-py3-none-any.whl", hash = "sha256:74b21105634ed31d56ed6fee43701ca16218b53475e606d56950a4d17e8290ea"}, + {file = "aiohttp_socks-0.8.4.tar.gz", hash = "sha256:6b611d4ce838e9cf2c2fed5e0dba447cc84824a6cba95dc5747606201da46cb4"}, +] + +[package.dependencies] +aiohttp = ">=2.3.2" +python-socks = {version = ">=2.4.3,<3.0.0", extras = ["asyncio"]} + [[package]] name = "aioresponses" version = "0.7.8" @@ -4246,6 +4262,24 @@ files = [ {file = "python_multipart-0.0.20.tar.gz", hash = "sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13"}, ] +[[package]] +name = "python-socks" +version = "2.8.0" +description = "Proxy (SOCKS4, SOCKS5, HTTP CONNECT) client for Python" +optional = false 
+python-versions = ">=3.8.0" +groups = ["main"] +files = [ + {file = "python_socks-2.8.0-py3-none-any.whl", hash = "sha256:57c24b416569ccea493a101d38b0c82ed54be603aa50b6afbe64c46e4a4e4315"}, + {file = "python_socks-2.8.0.tar.gz", hash = "sha256:340f82778b20a290bdd538ee47492978d603dff7826aaf2ce362d21ad9ee6f1b"}, +] + +[package.extras] +anyio = ["anyio (>=3.3.4,<5.0.0)"] +asyncio = ["async-timeout (>=4.0) ; python_version < \"3.11\""] +curio = ["curio (>=1.4)"] +trio = ["trio (>=0.24)"] + [[package]] name = "pytz" version = "2025.2" @@ -6203,4 +6237,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">3.11" -content-hash = "10a74594d9f695ab1077ff992bcd012b93b174b25c3f2ca681d6308653abbd14" +content-hash = "6f9ce2af71a995db179aa4fb682e8a9ccde59566d14e26c7b0dbf4edc8d8e583" diff --git a/crawler/proof_of_concept/image.py b/crawler/proof_of_concept/image.py deleted file mode 100644 index ce8c4b5..0000000 --- a/crawler/proof_of_concept/image.py +++ /dev/null @@ -1,13 +0,0 @@ -import requests - -headers = { - "Host": "media.rightmove.co.uk", - # 'Accept-Encoding': 'gzip, deflate, br', - "User-Agent": "okhttp/4.10.0", -} - -response = requests.get( - "https://media.rightmove.co.uk/47k/46001/138680705/46001_32532509_IMG_00_0000.jpeg", - headers=headers, - verify=False, -) diff --git a/crawler/proof_of_concept/listings.py b/crawler/proof_of_concept/listings.py deleted file mode 100644 index eb5b807..0000000 --- a/crawler/proof_of_concept/listings.py +++ /dev/null @@ -1,67 +0,0 @@ -import requests - -headers = { - "Host": "api.rightmove.co.uk", - # 'Accept-Encoding': 'gzip, deflate, br', - "User-Agent": "okhttp/4.10.0", - "Connection": "close", -} - -params = { - "locationIdentifier": "POSTCODE^4228216", - "channel": "BUY", - "page": "1", - "numberOfPropertiesPerPage": "25", - "radius": "3.0", - "sortBy": "distance", - "includeUnavailableProperties": "false", - "propertyTypes": "flat", - "mustHave": "newHome", # added manually later - "dontShow": 
"sharedOwnership,retirement", - "minPrice": "150000", - "maxPrice": "500000", - "minBedrooms": "2", - "maxBedrooms": "2", - "apiApplication": "ANDROID", - "appVersion": "3.70.0", -} - -response = requests.get( - "https://api.rightmove.co.uk/api/property-listing", - params=params, - headers=headers, - verify=False, -) - - -headers = { - "Host": "api.rightmove.co.uk", - # 'Accept-Encoding': 'gzip, deflate, br', - "User-Agent": "okhttp/4.10.0", - "Connection": "close", -} - -params = { - "locationIdentifier": "POSTCODE^4228216", - "channel": "BUY", - "page": "2", - "numberOfPropertiesPerPage": "25", - "radius": "3.0", - "sortBy": "distance", - "includeUnavailableProperties": "false", - "propertyTypes": "flat", - "dontShow": "sharedOwnership,retirement", - "minPrice": "150000", - "maxPrice": "600000", - "minBedrooms": "2", - "maxBedrooms": "2", - "apiApplication": "ANDROID", - "appVersion": "3.70.0", -} - -response = requests.get( - "https://api.rightmove.co.uk/api/property-listing", - params=params, - headers=headers, - verify=False, -) diff --git a/crawler/proof_of_concept/routing_distancematrix.py b/crawler/proof_of_concept/routing_distancematrix.py deleted file mode 100644 index 42057da..0000000 --- a/crawler/proof_of_concept/routing_distancematrix.py +++ /dev/null @@ -1,22 +0,0 @@ -import requests - -API_KEY = "AIzaSyBoBHzeQFgR7O-NlNsuHXQcC1B7ccEHpl8" -url = "https://maps.googleapis.com/maps/api/distancematrix/json" -origin = "51.5636306598907,-0.11061106079085892" -dest = "51.53836609846008,-0.12743940233824352" - -params = { - "origins": origin, - "destinations": dest, - "key": API_KEY, - "departure_time": "", # timstamp, optional - "mode": "transit", -} - -r = requests.get(url, params=params) -print(r.status_code) - -print(r.json()) - -with open("code/json/routing_distancematrix.json", "w") as f: - f.write(r.text) diff --git a/crawler/proof_of_concept/routing_routing.py b/crawler/proof_of_concept/routing_routing.py deleted file mode 100644 index 
d0990d7..0000000 --- a/crawler/proof_of_concept/routing_routing.py +++ /dev/null @@ -1,83 +0,0 @@ -import requests -from utils import nextMonday -from collections import defaultdict - -API_KEY = "AIzaSyBoBHzeQFgR7O-NlNsuHXQcC1B7ccEHpl8" -url = "https://routes.googleapis.com/directions/v2:computeRoutes" - - -def travel_time(origin_lat: float, origin_lon: float, dest_lat: float, dest_lon: float): - monday9am = nextMonday() - - header = { - "X-Goog-Api-Key": API_KEY, - "Content-Type": "application/json", - "X-Goog-FieldMask": "routes.distanceMeters,routes.duration,routes.staticDuration,routes.legs.steps.distanceMeters,routes.legs.steps.staticDuration,routes.legs.steps.travelMode", - } - - body = { - "origin": { - "location": {"latLng": {"latitude": origin_lat, "longitude": origin_lon}} - }, - "destination": { - "location": {"latLng": {"latitude": dest_lat, "longitude": dest_lon}} - }, - "travelMode": "TRANSIT", - # "2023-10-15T15:01:23.045123456Z" - "departureTime": monday9am.strftime("%Y-%m-%dT%H:%M:%S.%fZ"), - "computeAlternativeRoutes": False, - # "routeModifiers": { - # "avoidTolls": false, - # "avoidHighways": false, - # "avoidFerries": false - # }, - "languageCode": "en-US", - "units": "METRIC", - } - - r = requests.post(url, json=body, headers=header) - if r.status_code == 200: - return r.json() - - raise Exception(r.json()) - - -def extract_time(d): - r = d["routes"][0] - print(r.keys()) - distance = r["distanceMeters"] - duration = r["duration"] - duration_static = r["staticDuration"] - - steps = r["legs"][0]["steps"] - # print(steps) - duration_per_transit = defaultdict(lambda: 0) - distance_per_transit = defaultdict(lambda: 0) - - for step in steps: - duration_per_transit[step["travelMode"]] += int( - step["staticDuration"].strip("s") - ) - distance_per_transit[step["travelMode"]] += step.get("distanceMeters", 0) - - print( - f"dis {distance}, dur {duration}, duration per transit {dict(duration_per_transit)}, distance per transit 
{dict(distance_per_transit)}, duration_static {duration_static}" - ) - - -if __name__ == "__main__": - import json - - with open("code/json/routing_routeapi.json", "r") as f: - d = json.load(f) - - extract_time(d) - - -# if __name__ == "__main__": -# origin = 51.5635664310333, -0.1107173751570373 # home -# dest = 51.50475678313417, 0.04915321000190009 # london city airport -# d = travel_time(origin[0], origin[1], dest[0], dest[1]) -# import json -# with open('code/json/routing_routeapi.json', 'w') as f: -# json.dump(d, f) diff --git a/crawler/proof_of_concept/single-query.py b/crawler/proof_of_concept/single-query.py deleted file mode 100644 index 4879fd3..0000000 --- a/crawler/proof_of_concept/single-query.py +++ /dev/null @@ -1,20 +0,0 @@ -import requests - -headers = { - "Host": "api.rightmove.co.uk", - # 'Accept-Encoding': 'gzip, deflate, br', - "User-Agent": "okhttp/4.10.0", - "Connection": "close", -} - -params = { - "apiApplication": "ANDROID", - "appVersion": "3.70.0", -} - -response = requests.get( - "https://api.rightmove.co.uk/api/property/119578451", - params=params, - headers=headers, - verify=False, -) diff --git a/crawler/rec/districts.py b/crawler/rec/districts.py index 7c7cfc2..4d72789 100644 --- a/crawler/rec/districts.py +++ b/crawler/rec/districts.py @@ -1,4 +1,4 @@ -def get_districts(): +def get_districts() -> dict[str, str]: return { "Barking and Dagenham": "REGION^61400", "Barnet": "REGION^93929", diff --git a/crawler/rec/floorplan.py b/crawler/rec/floorplan.py index 657e8a6..7d134cc 100644 --- a/crawler/rec/floorplan.py +++ b/crawler/rec/floorplan.py @@ -1,10 +1,12 @@ import re +from pathlib import Path +from typing import Any from PIL import Image import cv2 import numpy as np -def inference(image_path): +def inference(image_path: str | Path) -> tuple[str, Any]: from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration image = Image.open(image_path) @@ -19,7 +21,7 @@ def inference(image_path): return output, 
predictions -def extract_total_sqm(input_str: str): +def extract_total_sqm(input_str: str) -> float | None: sqmregex = r"(\d+\.?\d*) ?(sq ?m|sq. ?m)" matches = re.findall(sqmregex, input_str.lower()) sqms = [float(m[0]) for m in matches] @@ -29,13 +31,13 @@ def extract_total_sqm(input_str: str): return max(filtered) -def calculate_model(image_path): +def calculate_model(image_path: str | Path) -> tuple[float | None, str, Any]: output, predictions_tensor = inference(image_path) estimated_sqm = extract_total_sqm(output) return estimated_sqm, output, predictions_tensor -def improve_img_for_ocr(img: Image): +def improve_img_for_ocr(img: Image.Image) -> Image.Image: img2 = np.array(img.convert("L")) cv2.resize(img2, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC) thresh = cv2.adaptiveThreshold( @@ -44,7 +46,7 @@ def improve_img_for_ocr(img: Image): return Image.fromarray(thresh) -def calculate_ocr(image_path) -> tuple[float | None, str]: +def calculate_ocr(image_path: str | Path) -> tuple[float | None, str]: import pytesseract img = Image.open(image_path) diff --git a/crawler/rec/route_serializer.py b/crawler/rec/route_serializer.py new file mode 100644 index 0000000..ba86d25 --- /dev/null +++ b/crawler/rec/route_serializer.py @@ -0,0 +1,41 @@ +import json +from typing import List + +from models.listing import DestinationMode, Route, RouteLegStep +from rec import routing + + +class RouteSerializer: + @staticmethod + def serialize(route): ... 
+ + @staticmethod + def deserialize(route_data_json: str) -> dict[DestinationMode, List[Route]]: + json_data = json.loads(route_data_json) + destination_routes = {} + for destination_mode_str, routes_json in json_data.items(): + destination_mode = DestinationMode( + destination_address=json.loads(destination_mode_str)[ + "destination_address" + ], + travel_mode=routing.TravelMode( + json.loads(destination_mode_str)["travel_mode"] + ), + ) + parsed_route = json.loads(routes_json[0]) + routes = [ + Route( + legs=[ + RouteLegStep( + distance_meters=step["distance_meters"], + duration_s=step["duration_s"], + travel_mode=routing.TravelMode(step["travel_mode"]), + ) + for step in parsed_route["legs"] + ], + distance_meters=parsed_route["distance_meters"], + duration_s=int(parsed_route["duration_s"]), + ) + ] + destination_routes[destination_mode] = routes + return destination_routes diff --git a/crawler/services/__init__.py b/crawler/services/__init__.py new file mode 100644 index 0000000..d13064b --- /dev/null +++ b/crawler/services/__init__.py @@ -0,0 +1,41 @@ +"""Services package for real estate crawler. + +This package contains two layers of services: + +## Low-level services (internal implementation): +- listing_fetcher: Fetches listing data from Rightmove API +- image_fetcher: Downloads floorplan images +- floorplan_detector: OCR-based square meter detection from floorplans +- route_calculator: Calculates transit routes using Google Maps API + +## High-level services (use these in CLI and API): +- listing_service: Unified listing operations (get, refresh, download images, etc.)
+- export_service: Export listings to CSV, GeoJSON +- district_service: District lookup and validation +- task_service: Background task management +""" +# Low-level services (internal) +from services.listing_fetcher import dump_listings, dump_listings_full +from services.image_fetcher import dump_images +from services.floorplan_detector import detect_floorplan +from services.route_calculator import calculate_route + +# High-level services (CLI and API should use these) +from services import listing_service +from services import export_service +from services import district_service +from services import task_service + +__all__ = [ + # Low-level + "dump_listings", + "dump_listings_full", + "dump_images", + "detect_floorplan", + "calculate_route", + # High-level + "listing_service", + "export_service", + "district_service", + "task_service", +] diff --git a/crawler/services/district_service.py b/crawler/services/district_service.py new file mode 100644 index 0000000..5beb01d --- /dev/null +++ b/crawler/services/district_service.py @@ -0,0 +1,38 @@ +"""Unified district service - shared between CLI and HTTP API.""" +from rec.districts import get_districts as _get_districts + + +def get_all_districts() -> dict[str, str]: + """Get all available districts with their region IDs. + + Used by: + - CLI: --district option choices + - API: GET /api/get_districts + + Returns: + Dictionary mapping district names to region IDs + """ + return _get_districts() + + +def get_district_names() -> list[str]: + """Get list of all district names. + + Returns: + List of district names + """ + return list(_get_districts().keys()) + + +def validate_districts(district_names: list[str]) -> tuple[bool, list[str]]: + """Validate that district names exist. 
+ + Args: + district_names: List of district names to validate + + Returns: + Tuple of (all_valid, invalid_names) + """ + valid_districts = set(_get_districts().keys()) + invalid = [d for d in district_names if d not in valid_districts] + return len(invalid) == 0, invalid diff --git a/crawler/services/export_service.py b/crawler/services/export_service.py new file mode 100644 index 0000000..f7b599b --- /dev/null +++ b/crawler/services/export_service.py @@ -0,0 +1,92 @@ +"""Unified export service - shared between CLI and HTTP API. + +This module provides export functionality for listings in various formats. +""" +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from models.listing import QueryParameters +from repositories.listing_repository import ListingRepository + + +@dataclass +class ExportResult: + """Result of an export operation.""" + success: bool + output_path: str | None # For file exports + data: Any | None # For in-memory exports (GeoJSON) + record_count: int + message: str + + +async def export_to_csv( + repository: ListingRepository, + output_path: Path, + query_parameters: QueryParameters | None = None, +) -> ExportResult: + """Export listings to CSV file. + + Used by: + - CLI: export-csv + - API: (could be added as download endpoint) + """ + from csv_exporter import export_to_csv as _export_csv + + await _export_csv(repository, output_path, query_parameters) + + listings = await repository.get_listings(query_parameters=query_parameters) + return ExportResult( + success=True, + output_path=str(output_path), + data=None, + record_count=len(listings), + message=f"Exported {len(listings)} listings to {output_path}", + ) + + +async def export_to_geojson( + repository: ListingRepository, + query_parameters: QueryParameters | None = None, + output_path: Path | None = None, + limit: int | None = None, +) -> ExportResult: + """Export listings to GeoJSON format. 
+ + Args: + repository: Database repository + query_parameters: Filtering parameters + output_path: If provided, write to file. Otherwise return data. + limit: Maximum number of listings to export + + Used by: + - CLI: export-immoweb + - API: GET /api/listing_geojson + """ + from ui_exporter import export_immoweb + + geojson_data = await export_immoweb( + repository, + output_file=str(output_path) if output_path else None, + query_parameters=query_parameters, + limit=limit, + ) + + feature_count = len(geojson_data.get("features", [])) if geojson_data else 0 + + if output_path: + return ExportResult( + success=True, + output_path=str(output_path), + data=None, + record_count=feature_count, + message=f"Exported {feature_count} listings to {output_path}", + ) + + return ExportResult( + success=True, + output_path=None, + data=geojson_data, + record_count=feature_count, + message=f"Generated GeoJSON with {feature_count} features", + ) diff --git a/crawler/services/floorplan_detector.py b/crawler/services/floorplan_detector.py new file mode 100644 index 0000000..27d74e1 --- /dev/null +++ b/crawler/services/floorplan_detector.py @@ -0,0 +1,42 @@ +"""Floorplan detector service - OCR-based square meter detection.""" +import asyncio +from models import Listing +from rec import floorplan +from repositories.listing_repository import ListingRepository +from tqdm.asyncio import tqdm +import multiprocessing + + +async def detect_floorplan(repository: ListingRepository) -> None: + """Detect square meters from floorplan images for all listings.""" + listings = await repository.get_listings() + cpu_count = multiprocessing.cpu_count() // 4 + semaphore = asyncio.Semaphore(cpu_count) + + updated_listings = [ + listing + for listing in await tqdm.gather( + *[_calculate_sqm_ocr(listing, semaphore) for listing in listings] + ) + if listing is not None + ] + await repository.upsert_listings(updated_listings) + + +async def _calculate_sqm_ocr( + listing: Listing, semaphore: 
asyncio.Semaphore +) -> Listing | None: + """Calculate square meters from floorplan images using OCR.""" + if listing.square_meters is not None: + return None + sqms: list[float] = [] + for floorplan_path in listing.floorplan_image_paths: + async with semaphore: + estimated_sqm, _ = await asyncio.to_thread( + floorplan.calculate_ocr, floorplan_path + ) + if estimated_sqm is not None: + sqms.append(estimated_sqm) + max_sqm = max(sqms, default=0) # try once, if we fail, keep as 0 + listing.square_meters = max_sqm + return listing diff --git a/crawler/services/image_fetcher.py b/crawler/services/image_fetcher.py new file mode 100644 index 0000000..580e099 --- /dev/null +++ b/crawler/services/image_fetcher.py @@ -0,0 +1,55 @@ +"""Image fetcher service - downloads floorplan images for listings.""" +import asyncio +from pathlib import Path +import aiohttp +from repositories import ListingRepository +from tenacity import retry, stop_after_attempt, wait_random +from tqdm.asyncio import tqdm + +from models import Listing + +# Setting this too high either crashes rightmove or gets us blocked +semaphore = asyncio.Semaphore(5) + + +async def dump_images( + repository: ListingRepository, + image_base_path: Path = Path("data/rs/"), +) -> None: + """Download floorplan images for all listings.""" + listings = await repository.get_listings() + updated_listings = await tqdm.gather( + *[dump_images_for_listing(listing, image_base_path) for listing in listings] + ) + await repository.upsert_listings( + [listing for listing in updated_listings if listing is not None] + ) + + +@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3)) +async def dump_images_for_listing(listing: Listing, base_path: Path) -> Listing | None: + """Download floorplan images for a single listing.""" + all_floorplans = listing.additional_info.get("property", {}).get("floorplans", []) + for floorplan in all_floorplans: + url = floorplan["url"] + picname = url.split("/")[-1] + floorplan_path = 
Path(base_path, str(listing.id), "floorplans", picname) + if floorplan_path.exists(): + continue + try: + async with semaphore: + async with aiohttp.ClientSession() as session: + async with session.get(url) as response: + if response.status == 404: + return None + if response.status != 200: + raise Exception(f"Error for {url}: {response.status}") + floorplan_path.parent.mkdir(parents=True, exist_ok=True) + with open(floorplan_path, "wb") as f: + f.write(await response.read()) + listing.floorplan_image_paths.append(str(floorplan_path)) + return listing + except Exception as e: + tqdm.write(f"Error for {url}: {e}") + raise e # raise so that we retry it + return None diff --git a/crawler/services/listing_service.py b/crawler/services/listing_service.py new file mode 100644 index 0000000..83199e2 --- /dev/null +++ b/crawler/services/listing_service.py @@ -0,0 +1,168 @@ +"""Unified listing service - shared between CLI and HTTP API. + +This module provides the core business logic for listing operations. +Both the CLI (main.py) and HTTP API (api/app.py) should use these functions. +""" +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Any + +from models.listing import Listing, QueryParameters +from repositories.listing_repository import ListingRepository + + +@dataclass +class ListingResult: + """Result of a listing operation.""" + listings: list[Listing] + total_count: int + message: str | None = None + + +@dataclass +class RefreshResult: + """Result of a refresh operation.""" + task_id: str | None # None if run synchronously + new_listings_count: int + message: str + + +async def get_listings( + repository: ListingRepository, + query_parameters: QueryParameters | None = None, + limit: int | None = None, + only_ids: list[int] | None = None, +) -> ListingResult: + """Get listings from the database with optional filtering. 
+ + Used by: + - CLI: export-csv, export-immoweb + - API: GET /api/listing, GET /api/listing_geojson + """ + listings = await repository.get_listings( + query_parameters=query_parameters, + limit=limit, + only_ids=only_ids, + ) + return ListingResult( + listings=listings, + total_count=len(listings), + ) + + +async def refresh_listings( + repository: ListingRepository, + query_parameters: QueryParameters, + full: bool = False, + async_mode: bool = False, + user_email: str | None = None, +) -> RefreshResult: + """Refresh listings by fetching from external API. + + Args: + repository: Database repository + query_parameters: Filtering parameters + full: If True, also fetch images and run OCR + async_mode: If True, run as background task and return task_id + user_email: User email for tracking (API mode) + + Used by: + - CLI: dump-listings + - API: POST /api/refresh_listings + """ + if async_mode: + # Import here to avoid circular imports + from tasks.listing_tasks import dump_listings_task + from datetime import timedelta + + expiry_time = datetime.now() + timedelta(minutes=10) + task = dump_listings_task.apply_async( + args=(query_parameters.model_dump_json(),), + expires=expiry_time, + ) + return RefreshResult( + task_id=task.id, + new_listings_count=0, + message=f"Task {task.id} started", + ) + + # Synchronous mode - run directly + from services.listing_fetcher import dump_listings, dump_listings_full + + if full: + new_listings = await dump_listings_full(query_parameters, repository) + else: + new_listings = await dump_listings(query_parameters, repository) + + return RefreshResult( + task_id=None, + new_listings_count=len(new_listings), + message=f"Fetched {len(new_listings)} new listings", + ) + + +async def download_images( + repository: ListingRepository, + data_dir: Path = Path("data/rs/"), +) -> int: + """Download floorplan images for all listings. 
+ + Used by: + - CLI: dump-images + - API: (could be added) + + Returns: + Number of listings processed + """ + from services.image_fetcher import dump_images + + await dump_images(repository, image_base_path=data_dir) + listings = await repository.get_listings() + return len(listings) + + +async def detect_floorplans( + repository: ListingRepository, +) -> int: + """Run OCR on floorplan images to detect square meters. + + Used by: + - CLI: detect-floorplan + - API: (could be added) + + Returns: + Number of listings processed + """ + from services.floorplan_detector import detect_floorplan + + await detect_floorplan(repository) + listings = await repository.get_listings() + return len(listings) + + +async def calculate_routes( + repository: ListingRepository, + destination_address: str, + travel_mode: str, + limit: int | None = None, +) -> int: + """Calculate transit routes for listings. + + Used by: + - CLI: routing + - API: (could be added) + + Returns: + Number of listings processed + """ + from services.route_calculator import calculate_route + from rec.routing import TravelMode + + await calculate_route( + repository, + destination_address, + TravelMode[travel_mode], + limit=limit, + ) + return limit or 0 diff --git a/crawler/5_routing.py b/crawler/services/route_calculator.py similarity index 54% rename from crawler/5_routing.py rename to crawler/services/route_calculator.py index 090316b..26e2dc5 100644 --- a/crawler/5_routing.py +++ b/crawler/services/route_calculator.py @@ -1,3 +1,4 @@ +"""Route calculator service - calculates transit routes using Google Maps API.""" from models.listing import DestinationMode, Route, RouteLegStep from repositories.listing_repository import ListingRepository from tqdm.asyncio import tqdm @@ -11,6 +12,7 @@ async def calculate_route( travel_mode: routing.TravelMode, limit: int | None = None, ) -> None: + """Calculate transit routes for listings to a destination.""" listings = await repository.get_listings() if limit is not 
None: @@ -30,6 +32,7 @@ async def calculate_route( async def update_routing_info( listing: Listing, destination_mode: DestinationMode ) -> Listing | None: + """Update routing information for a single listing.""" if listing.routing_info.get(destination_mode) is not None: # already calculated, do not recompute to save API calls return None @@ -41,8 +44,7 @@ async def update_routing_info( destination_mode.travel_mode, ) - route_data = routes_data["routes"][0] - routes = [] + routes: list[Route] = [] for route_data in routes_data["routes"]: duration_s = int(route_data["duration"].split("s")[0]) route = Route( @@ -61,47 +63,4 @@ async def update_routing_info( listing.routing_info_json = listing.serialize_routing_info( {**listing.routing_info, **{destination_mode: routes}} ) - return listing - - -# async def geocode_address( -# address: str, -# geocoding_cache: pathlib.Path, -# ) -> tuple[int, int]: -# cache = get_geocoding_cache(geocoding_cache) -# cached_results = cache.get(address) -# if cached_results is None: -# # resolve -# async with aiohttp.ClientSession() as session: -# async with session.get( -# ("https://maps.googleapis.com/maps/api/geocode/json" -# f"?address={address}" -# f"&key={API_KEY_ENVIRONMENT_VARIABLE}")) as response: -# if response.status != 200: -# raise Exception( -# f"Error {response.status} from geocoding API") -# cached_results = await response.json() -# with open(geocoding_cache, 'w') as f: -# json.dump({ -# **{ -# address: cached_results, -# }, -# **cache -# }, f) -# # API format -# lat = cached_results["results"][0]["geometry"]["location"]["lat"] -# lng = cached_results["results"][0]["geometry"]["location"]["lng"] -# cache[address] = (lat, lng) -# with open(geocoding_cache, 'w') as f: -# json.dump(cache, f) -# return lat, lng - -# def get_geocoding_cache(geocoding_cache: pathlib.Path) -> dict[str, Any]: -# try: -# with open(geocoding_cache, 'x') as f: -# json.dump({}, f) -# return {} -# except FileExistsError: -# pass # File already exists -# 
with open(geocoding_cache, 'r') as f: -# return json.load(f) + return listing \ No newline at end of file diff --git a/crawler/services/task_service.py b/crawler/services/task_service.py index 69eb296..81a59fb 100644 --- a/crawler/services/task_service.py +++ b/crawler/services/task_service.py @@ -11,9 +11,14 @@ import json class TaskStatus: """Status of a background task.""" task_id: str - status: str # PENDING, STARTED, SUCCESS, FAILURE, REVOKED + status: str # PENDING, STARTED, SUCCESS, FAILURE, REVOKED, SKIPPED result: Any | None progress: float | None # 0.0 to 1.0 + processed: int | None # Number of items processed + total: int | None # Total number of items + message: str | None # Human-readable status message (e.g., "Fetching listings") + error: str | None # Error message if failed + traceback: str | None # Full traceback if failed def get_task_status(task_id: str) -> TaskStatus: @@ -33,21 +38,50 @@ def get_task_status(task_id: str) -> TaskStatus: task_result = dump_listings_task.AsyncResult(task_id) # Try to serialize result - try: - result = json.loads(json.dumps(task_result.result)) - except (TypeError, json.JSONDecodeError): - result = str(task_result.result) if task_result.result else None + result = None + error = None + if task_result.failed(): + # Extract error message from failed task + error = str(task_result.result) if task_result.result else None + else: + try: + result = json.loads(json.dumps(task_result.result)) + except (TypeError, json.JSONDecodeError): + result = str(task_result.result) if task_result.result else None - # Extract progress from task meta if available + # Extract traceback if available + task_traceback = task_result.traceback if task_result.failed() else None + + # Extract progress, processed, total, and message from task meta progress = None + processed = None + total = None + message = None + if task_result.info and isinstance(task_result.info, dict): progress = task_result.info.get("progress") + processed = 
task_result.info.get("processed") + total = task_result.info.get("total") + # Use 'message' if available, fall back to 'reason' for SKIPPED tasks + message = task_result.info.get("message") or task_result.info.get("reason") + + # For custom states (like "Fetching listings"), use the state as message + # if no message was provided in info + if not message and task_result.status not in ( + "PENDING", "STARTED", "SUCCESS", "FAILURE", "REVOKED", "RETRY" + ): + message = task_result.status return TaskStatus( task_id=task_id, status=task_result.status, result=result, progress=progress, + processed=processed, + total=total, + message=message, + error=error, + traceback=task_traceback, ) diff --git a/crawler/start.sh b/crawler/start.sh index cf9f98e..965e330 100755 --- a/crawler/start.sh +++ b/crawler/start.sh @@ -1,55 +1,153 @@ #!/usr/bin/env bash +set -eu -# This sript is used to start the backend services and configure them according to what's available in the system - -set -eux - -ENV_MODE=${ENV:-"dev"} # Defaults to "dev" if ENV_MODE is unset - - -case "$ENV_MODE" in - dev) - echo "🛠️ Running in DEVELOPMENT mode" - set +e - pkill -f celery - pkill watchmedo - set -e - if ! netstat -tlnp |grep 6379; then - echo "Did not find a running redis on 6379. Starting a new instance..." - docker run -d --rm --name redis-server -p 6379:6379 redis:latest - fi - echo "Checking connection to redis is successful..." - python celery_app.py - - watchmedo auto-restart --directory=./ --pattern='*.py' --recursive -- celery -A celery_app worker & # DEV to autoreload on changes - CELERY_PID=$! - ;; - prod) - echo "🚀 Running in PRODUCTION mode" - echo "Checking connection to redis is successful..." - python celery_app.py - alembic upgrade head - celery -A celery_app worker --beat & - CELERY_PID=$! - ;; - *) - echo "❌ Unknown ENV_MODE: $ENV_MODE. Defaulting to DEV." - exit 1 - ;; -esac - - -cleanup() { - echo "Stopping background process (PID: $CELERY_PID)..." 
- kill "$CELERY_PID" 2>/dev/null # Graceful shutdown (SIGTERM) - wait "$CELERY_PID" 2>/dev/null # Wait for process to exit +# Real Estate Crawler - Development Server +# Usage: +# ./start.sh - Start with Docker (recommended) +# ./start.sh --local - Start locally (requires Poetry and dependencies) +# ./start.sh --help - Show help +show_help() { + echo "Real Estate Crawler - Development Server" + echo "" + echo "Usage: ./start.sh [OPTIONS]" + echo "" + echo "Options:" + echo " (default) Start all services with Docker Compose" + echo " --local Run locally with Poetry (requires local deps)" + echo " --build Rebuild Docker images before starting" + echo " --down Stop and remove all containers" + echo " --logs Follow logs from all services" + echo " --help Show this help message" + echo "" + echo "Examples:" + echo " ./start.sh # Start with Docker" + echo " ./start.sh --build # Rebuild and start" + echo " ./start.sh --local # Run locally with Poetry" } -trap cleanup EXIT SIGINT SIGTERM -# celery -A celery_app worker -D # PROD -uvicorn api.app:app --host 0.0.0.0 --port 5001 --log-level debug -# UVICORN_PID=$! +start_docker() { + local build_flag="" + if [[ "${1:-}" == "--build" ]]; then + build_flag="--build" + fi -# wait for -# less /etc/passwd > /dev/null + echo "🐳 Starting services with Docker Compose..." + echo "" + + # Check if docker/podman is available + if command -v docker &> /dev/null; then + COMPOSE_CMD="docker compose" + elif command -v podman-compose &> /dev/null; then + COMPOSE_CMD="podman-compose" + else + echo "❌ Error: Neither docker nor podman-compose found." + echo " Install Docker: https://docs.docker.com/get-docker/" + echo " Or run locally: ./start.sh --local" + exit 1 + fi + + $COMPOSE_CMD up $build_flag +} + +stop_docker() { + echo "🛑 Stopping all containers..." 
+ if command -v docker &> /dev/null; then + docker compose down + elif command -v podman-compose &> /dev/null; then + podman-compose down + fi +} + +show_logs() { + if command -v docker &> /dev/null; then + docker compose logs -f + elif command -v podman-compose &> /dev/null; then + podman-compose logs -f + fi +} + +start_local() { + echo "🛠️ Starting locally with Poetry..." + echo "" + + # Check Poetry is available + if ! command -v poetry &> /dev/null; then + echo "❌ Error: Poetry not found." + echo " Install: curl -sSL https://install.python-poetry.org | python3 -" + echo " Or use Docker: ./start.sh" + exit 1 + fi + + # Source .env if it exists + if [[ -f .env ]]; then + set -a + source .env + set +a + fi + + ENV_MODE=${ENV:-"dev"} + + # Ensure Redis is running + if ! nc -z localhost 6379 2>/dev/null; then + echo "📦 Starting Redis container..." + docker run -d --rm --name rec-redis-local -p 6379:6379 redis:latest || true + sleep 2 + fi + + echo "✅ Redis OK" + + # Test celery connection + poetry run python celery_app.py + + # Start Celery worker in background + echo "🔧 Starting Celery worker..." + if [[ "$ENV_MODE" == "dev" ]]; then + poetry run celery -A celery_app worker --loglevel=info & + else + poetry run alembic upgrade head + poetry run celery -A celery_app worker --beat --loglevel=info & + fi + CELERY_PID=$! + + cleanup() { + echo "" + echo "🛑 Stopping Celery worker (PID: $CELERY_PID)..." 
+ kill "$CELERY_PID" 2>/dev/null || true + wait "$CELERY_PID" 2>/dev/null || true + } + trap cleanup EXIT SIGINT SIGTERM + + # Start uvicorn + echo "🚀 Starting API server on http://localhost:5001" + echo "" + poetry run uvicorn api.app:app --host 0.0.0.0 --port 5001 --reload +} + +# Parse arguments +case "${1:-}" in + --help|-h) + show_help + ;; + --local) + start_local + ;; + --down) + stop_docker + ;; + --logs) + show_logs + ;; + --build) + start_docker --build + ;; + "") + start_docker + ;; + *) + echo "❌ Unknown option: $1" + echo "" + show_help + exit 1 + ;; +esac diff --git a/crawler/tests/__init__.py b/crawler/tests/__init__.py new file mode 100644 index 0000000..d4839a6 --- /dev/null +++ b/crawler/tests/__init__.py @@ -0,0 +1 @@ +# Tests package diff --git a/crawler/tests/conftest.py b/crawler/tests/conftest.py new file mode 100644 index 0000000..012e6e3 --- /dev/null +++ b/crawler/tests/conftest.py @@ -0,0 +1,186 @@ +"""Shared pytest fixtures for the test suite.""" +from datetime import datetime +from typing import AsyncGenerator, Generator +import pytest +from sqlalchemy import Engine +from sqlmodel import SQLModel, Session, create_engine +from httpx import ASGITransport, AsyncClient + +from models.listing import ( + BuyListing, + FurnishType, + ListingSite, + RentListing, + Listing, +) +from repositories.listing_repository import ListingRepository +from api.auth import User + + +@pytest.fixture +def in_memory_engine() -> Generator[Engine, None, None]: + """Create an in-memory SQLite engine for testing.""" + engine = create_engine( + "sqlite:///:memory:", + echo=False, + connect_args={"check_same_thread": False}, + ) + SQLModel.metadata.create_all(engine) + yield engine + SQLModel.metadata.drop_all(engine) + + +@pytest.fixture +def listing_repository(in_memory_engine: Engine) -> ListingRepository: + """Create a ListingRepository with the in-memory engine.""" + return ListingRepository(engine=in_memory_engine) + + +@pytest.fixture +def 
sample_rent_listing() -> RentListing: + """Create a sample RentListing for testing.""" + return RentListing( + id=12345678, + price=2500.0, + number_of_bedrooms=2, + square_meters=65.0, + agency="Test Agency", + council_tax_band="C", + longitude=-0.1276, + latitude=51.5074, + price_history_json="[]", + listing_site=ListingSite.RIGHTMOVE, + last_seen=datetime.now(), + photo_thumbnail="https://example.com/photo.jpg", + floorplan_image_paths=[], + additional_info={"property": {"visible": True}}, + routing_info_json=None, + furnish_type=FurnishType.FURNISHED, + available_from=datetime.now(), + ) + + +@pytest.fixture +def sample_buy_listing() -> BuyListing: + """Create a sample BuyListing for testing.""" + return BuyListing( + id=87654321, + price=450000.0, + number_of_bedrooms=3, + square_meters=95.0, + agency="Test Estate Agents", + council_tax_band="D", + longitude=-0.1180, + latitude=51.5100, + price_history_json="[]", + listing_site=ListingSite.RIGHTMOVE, + last_seen=datetime.now(), + photo_thumbnail="https://example.com/buy_photo.jpg", + floorplan_image_paths=[], + additional_info={"property": {"visible": True}}, + routing_info_json=None, + service_charge=1500.0, + lease_left=90, + ) + + +@pytest.fixture +def sample_rent_listings() -> list[RentListing]: + """Create multiple sample RentListings for testing filters.""" + now = datetime.now() + return [ + RentListing( + id=1, + price=1500.0, + number_of_bedrooms=1, + square_meters=40.0, + agency="Agency A", + council_tax_band="B", + longitude=-0.1, + latitude=51.5, + price_history_json="[]", + listing_site=ListingSite.RIGHTMOVE, + last_seen=now, + photo_thumbnail=None, + floorplan_image_paths=[], + additional_info={"property": {"visible": True}}, + routing_info_json=None, + furnish_type=FurnishType.FURNISHED, + available_from=now, + ), + RentListing( + id=2, + price=2000.0, + number_of_bedrooms=2, + square_meters=55.0, + agency="Agency B", + council_tax_band="C", + longitude=-0.12, + latitude=51.51, + 
price_history_json="[]", + listing_site=ListingSite.RIGHTMOVE, + last_seen=now, + photo_thumbnail=None, + floorplan_image_paths=[], + additional_info={"property": {"visible": True}}, + routing_info_json=None, + furnish_type=FurnishType.UNFURNISHED, + available_from=now, + ), + RentListing( + id=3, + price=3000.0, + number_of_bedrooms=3, + square_meters=80.0, + agency="Agency C", + council_tax_band="D", + longitude=-0.14, + latitude=51.52, + price_history_json="[]", + listing_site=ListingSite.RIGHTMOVE, + last_seen=now, + photo_thumbnail=None, + floorplan_image_paths=[], + additional_info={"property": {"visible": True}}, + routing_info_json=None, + furnish_type=FurnishType.FURNISHED, + available_from=now, + ), + ] + + +@pytest.fixture +def mock_user() -> User: + """Create a mock user for API tests.""" + return User( + sub="test-user-id", + email="test@example.com", + name="Test User", + ) + + +@pytest.fixture +async def async_client( + in_memory_engine: Engine, mock_user: User +) -> AsyncGenerator[AsyncClient, None]: + """Create an AsyncClient for API testing with mock auth.""" + from api.app import app + from api.auth import get_current_user + + # Override dependencies + app.dependency_overrides[get_current_user] = lambda: mock_user + + # Patch the engine used by the repository + original_engine = None + try: + from database import engine as db_engine + original_engine = db_engine + except Exception: + pass + + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://test") as client: + yield client + + # Clean up dependency overrides + app.dependency_overrides.clear() diff --git a/crawler/tests/integration/__init__.py b/crawler/tests/integration/__init__.py new file mode 100644 index 0000000..a265048 --- /dev/null +++ b/crawler/tests/integration/__init__.py @@ -0,0 +1 @@ +# Integration tests package diff --git a/crawler/tests/integration/test_api.py b/crawler/tests/integration/test_api.py new file mode 100644 index 
0000000..c254d35 --- /dev/null +++ b/crawler/tests/integration/test_api.py @@ -0,0 +1,180 @@ +"""Integration tests for API endpoints.""" +from unittest.mock import AsyncMock, patch +import pytest +from httpx import AsyncClient + +from api.auth import User + + +class TestStatusEndpoint: + """Tests for the /api/status endpoint.""" + + async def test_status_endpoint_returns_ok( + self, async_client: AsyncClient + ) -> None: + """Test that status endpoint returns OK status.""" + response = await async_client.get("/api/status") + assert response.status_code == 200 + assert response.json() == {"status": "OK"} + + +class TestListingEndpoint: + """Tests for the /api/listing endpoint.""" + + async def test_listing_endpoint_requires_auth(self) -> None: + """Test that listing endpoint requires authentication.""" + from api.app import app + from httpx import ASGITransport, AsyncClient + + # Clear any dependency overrides to test auth requirement + app.dependency_overrides.clear() + + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://test") as client: + response = await client.get("/api/listing") + # Should return 401 or 403 without valid auth + assert response.status_code in (401, 403) + + async def test_listing_endpoint_with_auth( + self, async_client: AsyncClient + ) -> None: + """Test that listing endpoint works with authentication.""" + # Mock the repository to return empty list + with patch( + "api.app.ListingRepository.get_listings", + new_callable=AsyncMock, + return_value=[], + ): + response = await async_client.get("/api/listing") + assert response.status_code == 200 + data = response.json() + assert "listings" in data + + +class TestListingGeoJsonEndpoint: + """Tests for the /api/listing_geojson endpoint.""" + + async def test_listing_geojson_requires_auth(self) -> None: + """Test that listing_geojson endpoint requires authentication.""" + from api.app import app + from httpx import ASGITransport, AsyncClient + + # 
Clear any dependency overrides to test auth requirement + app.dependency_overrides.clear() + + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://test") as client: + response = await client.get( + "/api/listing_geojson", + params={"listing_type": "RENT"}, + ) + # Should return 401 or 403 without valid auth + assert response.status_code in (401, 403) + + async def test_listing_geojson_with_filters( + self, async_client: AsyncClient + ) -> None: + """Test that listing_geojson accepts filter parameters.""" + with patch( + "api.app.export_immoweb", + new_callable=AsyncMock, + return_value={"type": "FeatureCollection", "features": []}, + ): + response = await async_client.get( + "/api/listing_geojson", + params={ + "listing_type": "RENT", + "min_bedrooms": 2, + "max_bedrooms": 3, + "min_price": 1500, + "max_price": 3000, + }, + ) + assert response.status_code == 200 + data = response.json() + assert data["type"] == "FeatureCollection" + + +class TestGetDistrictsEndpoint: + """Tests for the /api/get_districts endpoint.""" + + async def test_get_districts_requires_auth(self) -> None: + """Test that get_districts endpoint requires authentication.""" + from api.app import app + from httpx import ASGITransport, AsyncClient + + # Clear any dependency overrides to test auth requirement + app.dependency_overrides.clear() + + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://test") as client: + response = await client.get("/api/get_districts") + # Should return 401 or 403 without valid auth + assert response.status_code in (401, 403) + + async def test_get_districts_returns_dict( + self, async_client: AsyncClient + ) -> None: + """Test that get_districts returns a dictionary of districts.""" + response = await async_client.get("/api/get_districts") + assert response.status_code == 200 + data = response.json() + assert isinstance(data, dict) + # Check some known districts exist + assert 
"London" in data + assert "Westminster" in data + assert "Camden" in data + + async def test_get_districts_values_are_region_ids( + self, async_client: AsyncClient + ) -> None: + """Test that district values are REGION identifiers.""" + response = await async_client.get("/api/get_districts") + data = response.json() + # All values should be REGION^... format + for district_name, region_id in data.items(): + assert region_id.startswith("REGION^"), ( + f"District {district_name} has invalid region ID: {region_id}" + ) + + +class TestRefreshListingsEndpoint: + """Tests for the /api/refresh_listings endpoint.""" + + async def test_refresh_listings_requires_auth(self) -> None: + """Test that refresh_listings endpoint requires authentication.""" + from api.app import app + from httpx import ASGITransport, AsyncClient + + # Clear any dependency overrides to test auth requirement + app.dependency_overrides.clear() + + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://test") as client: + response = await client.post( + "/api/refresh_listings", + params={"listing_type": "RENT"}, + ) + # Should return 401 or 403 without valid auth + assert response.status_code in (401, 403) + + +class TestTaskStatusEndpoint: + """Tests for the /api/task_status endpoint.""" + + async def test_task_status_requires_auth(self) -> None: + """Test that task_status endpoint requires authentication.""" + from api.app import app + from httpx import ASGITransport, AsyncClient + + # Clear any dependency overrides to test auth requirement + app.dependency_overrides.clear() + + transport = ASGITransport(app=app) + async with AsyncClient(transport=transport, base_url="http://test") as client: + response = await client.get( + "/api/task_status", + params={"task_id": "test-task-id"}, + ) + # Should return 401 or 403 without valid auth + assert response.status_code in (401, 403) diff --git a/crawler/tests/test_listing_geojson.py 
"""Tests for the listing_geojson API endpoint and QueryParameters parsing."""
import json
import pytest
from datetime import datetime
from unittest.mock import patch, MagicMock, AsyncMock


class TestQueryParametersModel:
    """Exercise the QueryParameters model directly (no HTTP layer)."""

    def test_datetime_parsing_z_suffix(self):
        """A trailing 'Z' (UTC) suffix must parse into a datetime."""
        from models.listing import QueryParameters, ListingType

        qp = QueryParameters(
            listing_type=ListingType.RENT,
            let_date_available_from="2026-02-01T11:33:01.248Z",
        )
        assert qp.let_date_available_from is not None
        assert qp.let_date_available_from.year == 2026

    def test_datetime_parsing_offset(self):
        """An explicit '+00:00' offset must also parse into a datetime."""
        from models.listing import QueryParameters, ListingType

        qp = QueryParameters(
            listing_type=ListingType.RENT,
            let_date_available_from="2026-02-01T11:33:01.248+00:00",
        )
        assert qp.let_date_available_from is not None

    def test_defaults_work(self):
        """Omitted fields fall back to the documented defaults."""
        from models.listing import QueryParameters, ListingType

        qp = QueryParameters(listing_type=ListingType.RENT)
        assert qp.min_bedrooms == 1
        assert qp.max_bedrooms == 999
        assert qp.min_price == 0
        assert qp.max_price == 10_000_000
        assert qp.district_names == set()
        assert qp.let_date_available_from is None

    def test_full_frontend_params(self):
        """All parameters the frontend sends are accepted together."""
        from models.listing import QueryParameters, ListingType

        qp = QueryParameters(
            listing_type=ListingType.RENT,
            min_bedrooms=1,
            max_bedrooms=3,
            max_price=3000,
            min_price=2000,
            min_sqm=50,
            last_seen_days=28,
            let_date_available_from="2026-02-01T11:19:22.072Z",
        )
        assert qp.listing_type == ListingType.RENT
        assert qp.min_bedrooms == 1
        assert qp.max_bedrooms == 3
        assert qp.min_sqm == 50


class TestGetQueryParametersDependency:
    """Exercise the get_query_parameters FastAPI dependency."""

    def test_parses_datetime_correctly(self):
        """The dependency accepts a datetime value for let_date_available_from."""
        # NOTE(review): a datetime instance is passed here, so this does not
        # cover the 'Z'-suffix string path — confirm intent with the author.
        from api.app import get_query_parameters
        from models.listing import ListingType

        qp = get_query_parameters(
            listing_type=ListingType.RENT,
            let_date_available_from=datetime(2026, 2, 1, 11, 33, 1),
        )
        assert qp.let_date_available_from is not None

    def test_defaults_applied(self):
        """Defaults are applied when optional arguments are omitted."""
        from api.app import get_query_parameters
        from models.listing import ListingType

        qp = get_query_parameters(listing_type=ListingType.RENT)
        assert qp.min_bedrooms == 1
        assert qp.max_bedrooms == 999


class TestListingGeoJsonEndpoint:
    """Exercise the /api/listing_geojson endpoint over HTTP."""

    @pytest.fixture
    def client(self):
        """Yield a TestClient whose auth dependency is stubbed out."""
        from fastapi.testclient import TestClient
        from api.app import app, get_current_user
        from api.auth import User

        async def mock_auth():
            return User(email="test@example.com", name="Test User")

        app.dependency_overrides[get_current_user] = mock_auth
        yield TestClient(app)
        app.dependency_overrides.clear()

    @pytest.fixture
    def mock_export(self):
        """Patch the export service to return a one-feature collection."""
        with patch("api.app.export_service.export_to_geojson") as mock:
            mock.return_value = MagicMock(
                data={"type": "FeatureCollection", "features": [{"type": "Feature"}]}
            )
            yield mock

    def test_minimal_params_no_422(self, client, mock_export):
        """The bare minimum query string must not be rejected with 422."""
        resp = client.get("/api/listing_geojson?listing_type=RENT")
        assert resp.status_code != 422, f"Got 422: {resp.json()}"

    def test_with_datetime_z_suffix_no_422(self, client, mock_export):
        """A 'Z'-suffixed datetime in the query string must not cause 422."""
        url = (
            "/api/listing_geojson?listing_type=RENT"
            "&let_date_available_from=2026-02-01T11:33:01.248Z"
        )
        resp = client.get(url)
        assert resp.status_code != 422, f"Got 422: {resp.json()}"

    def test_full_frontend_params_no_422(self, client, mock_export):
        """The full query string the frontend sends must not cause 422."""
        url = (
            "/api/listing_geojson?listing_type=RENT"
            "&min_bedrooms=1"
            "&max_bedrooms=3"
            "&max_price=3000"
            "&min_price=2000"
            "&min_sqm=50"
            "&last_seen_days=28"
            "&let_date_available_from=2026-02-01T11:19:22.072Z"
        )
        resp = client.get(url)
        assert resp.status_code != 422, f"Got 422: {resp.json()}"

    def test_returns_geojson_structure(self, client, mock_export):
        """A successful response is a FeatureCollection with features."""
        resp = client.get("/api/listing_geojson?listing_type=RENT")
        assert resp.status_code == 200
        body = resp.json()
        assert "type" in body
        assert body["type"] == "FeatureCollection"
        assert "features" in body


class TestStreamingEndpoint:
    """Exercise the /api/listing_geojson/stream NDJSON endpoint."""

    @pytest.fixture
    def client(self):
        """Yield a TestClient whose auth dependency is stubbed out."""
        from fastapi.testclient import TestClient
        from api.app import app
        from api.auth import get_current_user, User

        # NOTE(review): this fixture imports get_current_user from api.auth
        # and passes `sub=` to User, while the fixture in the class above
        # imports it from api.app and omits `sub=` — confirm which is
        # canonical.
        async def mock_auth():
            return User(sub="test-id", email="test@example.com", name="Test User")

        app.dependency_overrides[get_current_user] = mock_auth
        yield TestClient(app)
        app.dependency_overrides.clear()

    @pytest.fixture
    def mock_repository(self):
        """Patch ListingRepository to stream three canned listing rows."""

        def _row(**overrides):
            # Baseline row; per-row differences are expressed as overrides.
            base = {
                'id': 1,
                'price': 2000.0,
                'number_of_bedrooms': 2,
                'square_meters': 50.0,
                'longitude': -0.1,
                'latitude': 51.5,
                'photo_thumbnail': 'https://example.com/1.jpg',
                'last_seen': datetime.now(),
                'agency': 'Test Agency',
                'price_history_json': '[]',
                'available_from': datetime.now(),
            }
            base.update(overrides)
            return base

        rows = [
            _row(),
            _row(
                id=2,
                price=2500.0,
                square_meters=60.0,
                longitude=-0.12,
                latitude=51.51,
                photo_thumbnail='https://example.com/2.jpg',
                agency='Test Agency 2',
                available_from=None,
            ),
            # Row 3 deliberately has NULLs to exercise the null-handling path.
            _row(
                id=3,
                price=3000.0,
                number_of_bedrooms=3,
                square_meters=None,
                longitude=-0.14,
                latitude=51.52,
                photo_thumbnail=None,
                agency=None,
                price_history_json='[{"first_seen": "2026-01-01", "last_seen": "2026-01-15", "price": 2800}]',
                available_from=None,
            ),
        ]

        with patch("api.app.ListingRepository") as MockRepo:
            repo = MagicMock()
            repo.count_listings.return_value = 3
            repo.stream_listings_optimized.return_value = iter(rows)
            MockRepo.return_value = repo
            yield repo

    def test_streaming_returns_ndjson(self, client, mock_repository):
        """The stream endpoint responds with the NDJSON media type."""
        resp = client.get("/api/listing_geojson/stream?listing_type=RENT&limit=10")
        assert resp.status_code == 200
        assert resp.headers["content-type"] == "application/x-ndjson"

    def test_streaming_metadata_includes_total_expected(self, client, mock_repository):
        """The first NDJSON line carries metadata with total_expected."""
        resp = client.get("/api/listing_geojson/stream?listing_type=RENT&limit=10")
        lines = resp.text.strip().split("\n")
        assert len(lines) >= 1

        meta = json.loads(lines[0])
        assert meta["type"] == "metadata"
        assert "total_expected" in meta
        assert meta["total_expected"] == 3
        assert "batch_size" in meta

    def test_streaming_returns_batches_and_complete(self, client, mock_repository):
        """Messages are metadata, then >=1 batch, terminated by complete."""
        resp = client.get("/api/listing_geojson/stream?listing_type=RENT&limit=10")
        messages = [json.loads(line) for line in resp.text.strip().split("\n")]

        # First should be metadata.
        assert messages[0]["type"] == "metadata"

        # Should have at least one batch.
        batches = [m for m in messages if m["type"] == "batch"]
        assert len(batches) >= 1

        # Last should be complete.
        assert messages[-1]["type"] == "complete"
        assert "total" in messages[-1]

    def test_streaming_features_have_correct_structure(self, client, mock_repository):
        """Streamed features are GeoJSON Point features with expected props."""
        resp = client.get(
            "/api/listing_geojson/stream?listing_type=RENT&batch_size=10&limit=10"
        )
        messages = [json.loads(line) for line in resp.text.strip().split("\n")]

        batches = [m for m in messages if m["type"] == "batch"]
        assert len(batches) >= 1

        features = batches[0]["features"]
        assert len(features) > 0

        feature = features[0]
        assert feature["type"] == "Feature"
        assert "properties" in feature
        assert "geometry" in feature
        assert feature["geometry"]["type"] == "Point"
        assert "coordinates" in feature["geometry"]

        # Check properties.
        props = feature["properties"]
        assert "total_price" in props
        assert "rooms" in props
        assert "url" in props
        assert "last_seen" in props

    def test_streaming_handles_null_square_meters(self, client, mock_repository):
        """A row with NULL square_meters streams with qm/qmprice == None."""
        resp = client.get(
            "/api/listing_geojson/stream?listing_type=RENT&batch_size=10&limit=10"
        )
        assert resp.status_code == 200

        messages = [json.loads(line) for line in resp.text.strip().split("\n")]

        # Find the feature for id=3 (has null square_meters).
        for msg in messages:
            if msg["type"] == "batch":
                for feature in msg["features"]:
                    if feature["properties"]["url"].endswith("/3"):
                        assert feature["properties"]["qm"] is None
                        assert feature["properties"]["qmprice"] is None
# ---------------------------------------------------------------------------
# crawler/tests/unit/__init__.py
# ---------------------------------------------------------------------------
# Unit tests package

# ---------------------------------------------------------------------------
# crawler/tests/unit/test_models.py
# ---------------------------------------------------------------------------
"""Unit tests for Listing models."""
from datetime import datetime
import json
import pytest

from models.listing import (
    BuyListing,
    FurnishType,
    ListingSite,
    PriceHistoryItem,
    RentListing,
    Listing,
)


def _make_rent_listing(**overrides):
    """Build a RentListing with sensible defaults; keyword overrides win.

    Factored out because every test previously repeated this ~20-keyword
    constructor call verbatim; each test now states only what it varies.
    """
    defaults = dict(
        id=1,
        price=2000.0,
        number_of_bedrooms=2,
        square_meters=50.0,
        agency="Test",
        council_tax_band="C",
        longitude=0.0,
        latitude=0.0,
        price_history_json="[]",
        listing_site=ListingSite.RIGHTMOVE,
        last_seen=datetime.now(),
        photo_thumbnail=None,
        floorplan_image_paths=[],
        additional_info={"property": {"visible": True}},
        routing_info_json=None,
        furnish_type=FurnishType.FURNISHED,
        available_from=None,
    )
    defaults.update(overrides)
    return RentListing(**defaults)


def _make_buy_listing(**overrides):
    """Build a BuyListing with sensible defaults; keyword overrides win."""
    defaults = dict(
        id=1,
        price=450000.0,
        number_of_bedrooms=3,
        square_meters=95.0,
        agency="Test",
        council_tax_band="D",
        longitude=0.0,
        latitude=0.0,
        price_history_json="[]",
        listing_site=ListingSite.RIGHTMOVE,
        last_seen=datetime.now(),
        photo_thumbnail=None,
        floorplan_image_paths=[],
        additional_info={"property": {"visible": True}},
        routing_info_json=None,
        service_charge=2500.0,
        lease_left=85,
    )
    defaults.update(overrides)
    return BuyListing(**defaults)


class TestListing:
    """Tests for the base Listing model."""

    def test_price_per_square_meter_calculation(self) -> None:
        """Test that price_per_square_meter is calculated correctly."""
        # 2000.0 / 50.0 == 40.0
        assert _make_rent_listing().price_per_square_meter == 40.0

    def test_price_per_square_meter_none_when_no_sqm(self) -> None:
        """Test that price_per_square_meter is None when square_meters is None."""
        listing = _make_rent_listing(square_meters=None)
        assert listing.price_per_square_meter is None

    def test_price_per_square_meter_none_when_sqm_zero(self) -> None:
        """Test that price_per_square_meter is None when square_meters is 0."""
        listing = _make_rent_listing(square_meters=0.0)
        assert listing.price_per_square_meter is None

    def test_url_property(self) -> None:
        """Test that url property returns correct Rightmove URL."""
        listing = _make_rent_listing(id=123456789)
        assert listing.url == "https://www.rightmove.co.uk/properties/123456789"

    def test_is_removed_property_visible(self) -> None:
        """Test that is_removed returns False when property is visible."""
        listing = _make_rent_listing(
            additional_info={"property": {"visible": True}}
        )
        assert listing.is_removed is False

    def test_is_removed_property_not_visible(self) -> None:
        """Test that is_removed returns True when property is not visible."""
        listing = _make_rent_listing(
            additional_info={"property": {"visible": False}}
        )
        assert listing.is_removed is True


class TestPriceHistory:
    """Tests for price history serialization/deserialization."""

    def test_price_history_serialization_roundtrip(self) -> None:
        """Test that price history can be serialized and deserialized."""
        now = datetime.now()
        history = [
            PriceHistoryItem(first_seen=now, last_seen=now, price=2000.0),
            PriceHistoryItem(first_seen=now, last_seen=now, price=2100.0),
        ]

        # Serialize.
        serialized = Listing.serialize_price_history(history)
        assert isinstance(serialized, str)

        # Create a listing carrying the serialized history.
        listing = _make_rent_listing(
            price=2100.0, price_history_json=serialized, last_seen=now
        )

        # Deserialize and verify round-trip.
        restored = listing.price_history
        assert len(restored) == 2
        assert restored[0].price == 2000.0
        assert restored[1].price == 2100.0

    def test_price_history_empty(self) -> None:
        """Test that empty price history works correctly."""
        listing = _make_rent_listing(price_history_json="")
        assert listing.price_history == []

    def test_price_history_item_to_dict(self) -> None:
        """Test PriceHistoryItem.to_dict() method."""
        now = datetime.now()
        item = PriceHistoryItem(first_seen=now, last_seen=now, price=2500.0)
        result = item.to_dict()
        assert result["price"] == 2500.0
        assert result["first_seen"] == now.isoformat()
        assert result["last_seen"] == now.isoformat()


class TestRentListing:
    """Tests specific to RentListing model."""

    def test_rent_listing_has_furnish_type(self) -> None:
        """Test that RentListing has furnish_type field."""
        listing = _make_rent_listing(furnish_type=FurnishType.PART_FURNISHED)
        assert listing.furnish_type == FurnishType.PART_FURNISHED

    def test_rent_listing_has_available_from(self) -> None:
        """Test that RentListing has available_from field."""
        now = datetime.now()
        listing = _make_rent_listing(last_seen=now, available_from=now)
        assert listing.available_from == now


class TestBuyListing:
    """Tests specific to BuyListing model."""

    def test_buy_listing_has_service_charge(self) -> None:
        """Test that BuyListing has service_charge field."""
        listing = _make_buy_listing(service_charge=2500.0, lease_left=85)
        assert listing.service_charge == 2500.0

    def test_buy_listing_has_lease_left(self) -> None:
        """Test that BuyListing has lease_left field."""
        listing = _make_buy_listing(service_charge=None, lease_left=120)
        assert listing.lease_left == 120


# ---------------------------------------------------------------------------
# crawler/tests/unit/test_redis_lock.py
# ---------------------------------------------------------------------------
"""Unit tests for Redis distributed lock."""
from unittest import mock

import pytest

from utils.redis_lock import redis_lock, get_redis_client


class TestRedisLock:
    """Tests for redis_lock context manager."""

    @mock.patch("utils.redis_lock.get_redis_client")
    def test_lock_acquired_successfully(self, mock_get_client):
        """Test lock acquisition when no other lock exists."""
        client = mock.MagicMock()
        client.set.return_value = True
        mock_get_client.return_value = client

        with redis_lock("test_lock") as acquired:
            assert acquired is True

        # SET NX with the default 4h expiry, then DELETE on exit.
        client.set.assert_called_once_with("lock:test_lock", "1", nx=True, ex=3600 * 4)
        client.delete.assert_called_once_with("lock:test_lock")

    @mock.patch("utils.redis_lock.get_redis_client")
    def test_lock_not_acquired(self, mock_get_client):
        """Test lock not acquired when another lock exists."""
        client = mock.MagicMock()
        client.set.return_value = None  # Redis returns None when nx=True fails
        mock_get_client.return_value = client

        with redis_lock("test_lock") as acquired:
            assert acquired is False

        client.set.assert_called_once_with("lock:test_lock", "1", nx=True, ex=3600 * 4)
        # Should NOT call delete since we didn't acquire the lock.
        client.delete.assert_not_called()

    @mock.patch("utils.redis_lock.get_redis_client")
    def test_lock_released_on_exception(self, mock_get_client):
        """Test lock is released even when exception occurs."""
        client = mock.MagicMock()
        client.set.return_value = True
        mock_get_client.return_value = client

        with pytest.raises(ValueError):
            with redis_lock("test_lock") as acquired:
                assert acquired is True
                raise ValueError("Test error")

        # Lock should still be released.
        client.delete.assert_called_once_with("lock:test_lock")

    @mock.patch("utils.redis_lock.get_redis_client")
    def test_custom_timeout(self, mock_get_client):
        """Test lock with custom timeout."""
        client = mock.MagicMock()
        client.set.return_value = True
        mock_get_client.return_value = client

        with redis_lock("test_lock", timeout=300) as acquired:
            assert acquired is True

        client.set.assert_called_once_with("lock:test_lock", "1", nx=True, ex=300)

    @mock.patch("utils.redis_lock.redis")
    def test_get_redis_client_uses_broker_url(self, mock_redis):
        """Test Redis client is created from CELERY_BROKER_URL."""
        with mock.patch.dict("os.environ", {"CELERY_BROKER_URL": "redis://testhost:1234/5"}):
            get_redis_client()

        mock_redis.from_url.assert_called_once_with(
            "redis://testhost:1234/5", decode_responses=True
        )


# ---------------------------------------------------------------------------
# crawler/tests/unit/test_repository.py
# ---------------------------------------------------------------------------
"""Unit tests for ListingRepository."""
from datetime import datetime, timedelta
import pytest
from sqlalchemy import Engine  # NOTE(review): apparently unused here — kept; verify before removing

from models.listing import (
    FurnishType,
    ListingType,
    QueryParameters,
    RentListing,
)
from repositories.listing_repository import ListingRepository


class TestListingRepository:
    """Tests for ListingRepository methods."""

    async def test_get_listings_empty_db(
        self, listing_repository: ListingRepository
    ) -> None:
        """Test that get_listings returns empty list for empty database."""
        assert await listing_repository.get_listings() == []

    async def test_get_listings_returns_inserted_listings(
        self,
        listing_repository: ListingRepository,
        sample_rent_listing: RentListing,
    ) -> None:
        """Test that get_listings returns listings that were inserted."""
        await listing_repository.upsert_listings([sample_rent_listing])
        fetched = await listing_repository.get_listings()
        assert len(fetched) == 1
        assert fetched[0].id == sample_rent_listing.id

    async def test_upsert_listings_creates_new(
        self,
        listing_repository: ListingRepository,
        sample_rent_listing: RentListing,
    ) -> None:
        """Test that upsert_listings creates new listings."""
        result = await listing_repository.upsert_listings([sample_rent_listing])
        assert len(result) == 1
        assert result[0].id == sample_rent_listing.id

        # Verify it's in the database.
        assert len(await listing_repository.get_listings()) == 1

    async def test_upsert_listings_updates_existing(
        self,
        listing_repository: ListingRepository,
        sample_rent_listing: RentListing,
    ) -> None:
        """Test that upsert_listings updates existing listings."""
        # Insert initial listing.
        await listing_repository.upsert_listings([sample_rent_listing])

        # Update the listing and upsert again.
        sample_rent_listing.price = 3000.0
        await listing_repository.upsert_listings([sample_rent_listing])

        # Verify the update took, with no duplicate row.
        fetched = await listing_repository.get_listings()
        assert len(fetched) == 1
        assert fetched[0].price == 3000.0

    async def test_mark_seen_updates_timestamp(
        self,
        listing_repository: ListingRepository,
        sample_rent_listing: RentListing,
    ) -> None:
        """Test that mark_seen updates the last_seen timestamp."""
        # Insert with an old timestamp.
        old_time = datetime.now() - timedelta(days=7)
        sample_rent_listing.last_seen = old_time
        await listing_repository.upsert_listings([sample_rent_listing])

        # Mark as seen.
        await listing_repository.mark_seen(sample_rent_listing.id)

        # Verify the timestamp moved forward.
        fetched = await listing_repository.get_listings()
        assert len(fetched) == 1
        assert fetched[0].last_seen > old_time

    async def test_mark_seen_nonexistent_listing(
        self, listing_repository: ListingRepository
    ) -> None:
        """Test that mark_seen handles nonexistent listings gracefully."""
        # Should not raise an exception.
        await listing_repository.mark_seen(999999)

    async def test_get_listings_with_only_ids(
        self,
        listing_repository: ListingRepository,
        sample_rent_listings: list[RentListing],
    ) -> None:
        """Test that get_listings filters by only_ids."""
        await listing_repository.upsert_listings(sample_rent_listings)

        # Request only specific IDs.
        fetched = await listing_repository.get_listings(only_ids=[1, 3])
        assert len(fetched) == 2
        assert {listing.id for listing in fetched} == {1, 3}

    async def test_get_listings_with_limit(
        self,
        listing_repository: ListingRepository,
        sample_rent_listings: list[RentListing],
    ) -> None:
        """Test that get_listings respects limit parameter."""
        await listing_repository.upsert_listings(sample_rent_listings)

        fetched = await listing_repository.get_listings(limit=2)
        assert len(fetched) == 2


class TestListingRepositoryFilters:
    """Tests for ListingRepository query parameter filtering."""

    async def test_filter_by_bedrooms(
        self,
        listing_repository: ListingRepository,
        sample_rent_listings: list[RentListing],
    ) -> None:
        """Test filtering by bedroom count."""
        await listing_repository.upsert_listings(sample_rent_listings)

        qp = QueryParameters(
            listing_type=ListingType.RENT,
            min_bedrooms=2,
            max_bedrooms=2,
        )
        fetched = await listing_repository.get_listings(query_parameters=qp)
        assert len(fetched) == 1
        assert fetched[0].number_of_bedrooms == 2

    async def test_filter_by_price_range(
        self,
        listing_repository: ListingRepository,
        sample_rent_listings: list[RentListing],
    ) -> None:
        """Test filtering by price range."""
        await listing_repository.upsert_listings(sample_rent_listings)

        qp = QueryParameters(
            listing_type=ListingType.RENT,
            min_price=1800,
            max_price=2500,
        )
        fetched = await listing_repository.get_listings(query_parameters=qp)
        assert len(fetched) == 1
        assert fetched[0].price == 2000.0

    async def test_filter_by_min_sqm(
        self,
        listing_repository: ListingRepository,
        sample_rent_listings: list[RentListing],
    ) -> None:
        """Test filtering by minimum square meters."""
        await listing_repository.upsert_listings(sample_rent_listings)

        qp = QueryParameters(
            listing_type=ListingType.RENT,
            min_sqm=60,
        )
        fetched = await listing_repository.get_listings(query_parameters=qp)
        assert len(fetched) == 1
        assert fetched[0].square_meters == 80.0

    async def test_filter_by_furnish_type(
        self,
        listing_repository: ListingRepository,
        sample_rent_listings: list[RentListing],
    ) -> None:
        """Test filtering by furnish type."""
        await listing_repository.upsert_listings(sample_rent_listings)

        qp = QueryParameters(
            listing_type=ListingType.RENT,
            furnish_types=[FurnishType.UNFURNISHED],
        )
        fetched = await listing_repository.get_listings(query_parameters=qp)
        assert len(fetched) == 1
        assert fetched[0].furnish_type == FurnishType.UNFURNISHED

    async def test_filter_by_last_seen_days(
        self,
        listing_repository: ListingRepository,
        sample_rent_listings: list[RentListing],
    ) -> None:
        """Test filtering by last_seen_days."""
        # Make one listing old.
        sample_rent_listings[0].last_seen = datetime.now() - timedelta(days=30)
        await listing_repository.upsert_listings(sample_rent_listings)

        qp = QueryParameters(
            listing_type=ListingType.RENT,
            last_seen_days=7,
        )
        fetched = await listing_repository.get_listings(query_parameters=qp)
        # Only 2 should be recent enough.
        assert len(fetched) == 2

    async def test_combined_filters(
        self,
        listing_repository: ListingRepository,
        sample_rent_listings: list[RentListing],
    ) -> None:
        """Test combining multiple filters."""
        await listing_repository.upsert_listings(sample_rent_listings)

        qp = QueryParameters(
            listing_type=ListingType.RENT,
            min_bedrooms=1,
            max_bedrooms=2,
            min_price=1000,
            max_price=2500,
            furnish_types=[FurnishType.FURNISHED, FurnishType.UNFURNISHED],
        )
        fetched = await listing_repository.get_listings(query_parameters=qp)
        # Should match listings with 1-2 bedrooms in the price range.
        assert len(fetched) == 2
from models.listing import FurnishType, ListingType


class TestScheduleConfig:
    """Tests for the ScheduleConfig model: defaults, full construction, and
    conversion to QueryParameters."""

    def test_basic_creation_with_defaults(self):
        """A schedule built from only the required fields gets documented defaults."""
        schedule = ScheduleConfig(name="Test Schedule", listing_type=ListingType.RENT)

        assert schedule.name == "Test Schedule"
        assert schedule.enabled is True
        # Default cron: 02:00 every day.
        assert schedule.minute == "0"
        assert schedule.hour == "2"
        assert schedule.day_of_week == "*"
        assert schedule.listing_type == ListingType.RENT
        # Wide-open bedroom/price bounds by default.
        assert schedule.min_bedrooms == 1
        assert schedule.max_bedrooms == 999
        assert schedule.min_price == 0
        assert schedule.max_price == 10_000_000
        assert schedule.district_names == []
        assert schedule.furnish_types is None

    def test_full_creation(self):
        """Every field can be set explicitly and is preserved verbatim."""
        schedule = ScheduleConfig(
            name="Full Schedule",
            enabled=False,
            minute="30",
            hour="4",
            day_of_week="1,3,5",
            listing_type=ListingType.BUY,
            min_bedrooms=2,
            max_bedrooms=3,
            min_price=400000,
            max_price=800000,
            district_names=["Westminster", "Camden"],
            furnish_types=["furnished", "unfurnished"],
        )

        assert schedule.name == "Full Schedule"
        assert schedule.enabled is False
        assert schedule.minute == "30"
        assert schedule.hour == "4"
        assert schedule.day_of_week == "1,3,5"
        assert schedule.listing_type == ListingType.BUY
        assert schedule.min_bedrooms == 2
        assert schedule.max_bedrooms == 3
        assert schedule.min_price == 400000
        assert schedule.max_price == 800000
        assert schedule.district_names == ["Westminster", "Camden"]
        assert schedule.furnish_types == ["furnished", "unfurnished"]

    def test_to_query_parameters(self):
        """Conversion to QueryParameters copies scalars and normalizes collections."""
        schedule = ScheduleConfig(
            name="Test",
            listing_type=ListingType.RENT,
            min_bedrooms=2,
            max_bedrooms=3,
            min_price=2000,
            max_price=4000,
            district_names=["Westminster"],
            furnish_types=["furnished"],
        )

        params = schedule.to_query_parameters()

        assert params.listing_type == ListingType.RENT
        assert params.min_bedrooms == 2
        assert params.max_bedrooms == 3
        assert params.min_price == 2000
        assert params.max_price == 4000
        # district_names becomes a set; furnish strings are coerced to enums.
        assert params.district_names == {"Westminster"}
        assert params.furnish_types == [FurnishType.FURNISHED]
district_names=["Westminster"], + furnish_types=["furnished"], + ) + + params = schedule.to_query_parameters() + + assert params.listing_type == ListingType.RENT + assert params.min_bedrooms == 2 + assert params.max_bedrooms == 3 + assert params.min_price == 2000 + assert params.max_price == 4000 + assert params.district_names == {"Westminster"} + assert params.furnish_types == [FurnishType.FURNISHED] + + def test_to_query_parameters_no_furnish_types(self): + """Test conversion when furnish_types is None.""" + schedule = ScheduleConfig( + name="Test", + listing_type=ListingType.BUY, + ) + + params = schedule.to_query_parameters() + + assert params.furnish_types is None + + +class TestCronValidation: + """Tests for cron field validation.""" + + # Valid minute values + @pytest.mark.parametrize( + "minute", + [ + "0", + "59", + "*", + "*/5", + "*/15", + "0,15,30,45", + ], + ) + def test_valid_minute(self, minute: str): + """Test valid minute values are accepted.""" + schedule = ScheduleConfig( + name="Test", listing_type=ListingType.RENT, minute=minute + ) + assert schedule.minute == minute + + # Invalid minute values + @pytest.mark.parametrize( + "minute", + [ + "60", + "-1", + "abc", + "*/0", + ], + ) + def test_invalid_minute(self, minute: str): + """Test invalid minute values are rejected.""" + with pytest.raises(ValidationError): + ScheduleConfig(name="Test", listing_type=ListingType.RENT, minute=minute) + + # Valid hour values + @pytest.mark.parametrize( + "hour", + [ + "0", + "23", + "*", + "*/6", + "0,6,12,18", + ], + ) + def test_valid_hour(self, hour: str): + """Test valid hour values are accepted.""" + schedule = ScheduleConfig( + name="Test", listing_type=ListingType.RENT, hour=hour + ) + assert schedule.hour == hour + + # Invalid hour values + @pytest.mark.parametrize( + "hour", + [ + "24", + "-1", + "abc", + "*/0", + ], + ) + def test_invalid_hour(self, hour: str): + """Test invalid hour values are rejected.""" + with pytest.raises(ValidationError): + 
ScheduleConfig(name="Test", listing_type=ListingType.RENT, hour=hour) + + # Valid day_of_week values + @pytest.mark.parametrize( + "day_of_week", + [ + "0", + "6", + "*", + "1,3,5", + "*/2", + ], + ) + def test_valid_day_of_week(self, day_of_week: str): + """Test valid day_of_week values are accepted.""" + schedule = ScheduleConfig( + name="Test", listing_type=ListingType.RENT, day_of_week=day_of_week + ) + assert schedule.day_of_week == day_of_week + + # Invalid day_of_week values + @pytest.mark.parametrize( + "day_of_week", + [ + "7", + "-1", + "abc", + "*/0", + ], + ) + def test_invalid_day_of_week(self, day_of_week: str): + """Test invalid day_of_week values are rejected.""" + with pytest.raises(ValidationError): + ScheduleConfig( + name="Test", listing_type=ListingType.RENT, day_of_week=day_of_week + ) + + +class TestSchedulesConfig: + """Tests for SchedulesConfig container.""" + + def test_from_env_empty(self): + """Test loading from empty environment variable.""" + with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": ""}, clear=False): + config = SchedulesConfig.from_env() + assert config.schedules == [] + + def test_from_env_missing(self): + """Test loading when environment variable is not set.""" + with mock.patch.dict(os.environ, {}, clear=True): + # Ensure SCRAPE_SCHEDULES is not set + os.environ.pop("SCRAPE_SCHEDULES", None) + config = SchedulesConfig.from_env() + assert config.schedules == [] + + def test_from_env_valid_single(self): + """Test loading a single valid schedule.""" + json_config = '[{"name":"Daily RENT","listing_type":"RENT","hour":"2"}]' + with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": json_config}): + config = SchedulesConfig.from_env() + + assert len(config.schedules) == 1 + assert config.schedules[0].name == "Daily RENT" + assert config.schedules[0].listing_type == ListingType.RENT + assert config.schedules[0].hour == "2" + + def test_from_env_valid_multiple(self): + """Test loading multiple valid schedules.""" + json_config = 
"""[ + {"name":"Daily RENT","listing_type":"RENT","hour":"2"}, + {"name":"Daily BUY","listing_type":"BUY","hour":"4","enabled":false} + ]""" + with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": json_config}): + config = SchedulesConfig.from_env() + + assert len(config.schedules) == 2 + assert config.schedules[0].name == "Daily RENT" + assert config.schedules[0].enabled is True + assert config.schedules[1].name == "Daily BUY" + assert config.schedules[1].enabled is False + + def test_from_env_invalid_json(self): + """Test error on invalid JSON.""" + with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": "not json"}): + with pytest.raises(ValueError, match="Invalid JSON"): + SchedulesConfig.from_env() + + def test_from_env_not_array(self): + """Test error when JSON is not an array.""" + with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": '{"name":"test"}'}): + with pytest.raises(ValueError, match="must be a JSON array"): + SchedulesConfig.from_env() + + def test_from_env_invalid_schedule(self): + """Test error when schedule validation fails.""" + # Missing required listing_type + json_config = '[{"name":"Invalid"}]' + with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": json_config}): + with pytest.raises(ValidationError): + SchedulesConfig.from_env() + + def test_get_enabled_schedules(self): + """Test filtering to only enabled schedules.""" + config = SchedulesConfig( + schedules=[ + ScheduleConfig(name="Enabled", listing_type=ListingType.RENT, enabled=True), + ScheduleConfig(name="Disabled", listing_type=ListingType.BUY, enabled=False), + ScheduleConfig(name="Also Enabled", listing_type=ListingType.RENT, enabled=True), + ] + ) + + enabled = config.get_enabled_schedules() + + assert len(enabled) == 2 + assert enabled[0].name == "Enabled" + assert enabled[1].name == "Also Enabled" + + def test_get_enabled_schedules_all_disabled(self): + """Test when all schedules are disabled.""" + config = SchedulesConfig( + schedules=[ + ScheduleConfig(name="Disabled1", 
# ===== new file: crawler/utils/__init__.py =====
"""Utility modules."""
from utils.redis_lock import redis_lock

__all__ = ["redis_lock"]


# ===== new file: crawler/utils/redis_lock.py =====
"""Redis-based distributed locking for task coordination."""
import logging
import os
import uuid
from contextlib import contextmanager
from typing import Generator

import redis

logger = logging.getLogger("uvicorn.error")

# Atomic compare-and-delete: remove the lock key only while it still holds our
# token.  Run server-side so "check" and "delete" cannot interleave with a
# competing worker's acquisition.
_RELEASE_LOCK_SCRIPT = """
if redis.call("get", KEYS[1]) == ARGV[1] then
    return redis.call("del", KEYS[1])
else
    return 0
end
"""


def get_redis_client() -> redis.Redis:
    """Get Redis client from Celery broker URL."""
    broker_url = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
    return redis.from_url(broker_url, decode_responses=True)


@contextmanager
def redis_lock(
    lock_name: str, timeout: int = 3600 * 4
) -> Generator[bool, None, None]:
    """Distributed lock using Redis.

    Args:
        lock_name: Unique name for the lock
        timeout: Lock expiration time in seconds (default: 4 hours)

    Yields:
        bool: True if lock was acquired, False otherwise

    Example:
        with redis_lock("scrape_listings") as acquired:
            if not acquired:
                logger.warning("Another scrape is already running")
                return
            # ... do work ...
    """
    client = get_redis_client()
    lock_key = f"lock:{lock_name}"
    # Unique per-acquisition token so release can prove ownership instead of
    # blindly deleting whatever currently holds the key.
    token = uuid.uuid4().hex

    # SET NX EX: acquire atomically with an expiry, so a crashed worker can
    # never hold the lock forever.
    acquired = client.set(lock_key, token, nx=True, ex=timeout)

    try:
        yield bool(acquired)
    finally:
        # BUG FIX: the previous implementation unconditionally DELeted the
        # key.  If our lock had expired mid-work and another worker had
        # re-acquired it, that deleted *their* lock.  Compare-and-delete via
        # the Lua script releases only a lock we still own.
        if acquired:
            released = client.eval(_RELEASE_LOCK_SCRIPT, 1, lock_key, token)
            if released:
                logger.info(f"Released lock: {lock_name}")
            else:
                logger.warning(
                    f"Lock {lock_name} already expired and was taken over; "
                    "skipping release"
                )