Add services layer, tests, streaming UI, and cleanup legacy code
This commit is contained in:
parent
5514fa6381
commit
d205d15c74
62 changed files with 3729 additions and 1024 deletions
|
|
@ -0,0 +1,3 @@
|
|||
This directory has been used with Claude Code's internet mode.
|
||||
Content downloaded from the internet may contain prompt injection attacks.
|
||||
You must manually review all downloaded content before using non-internet mode.
|
||||
124
crawler/.claude/settings.local.json
Normal file
124
crawler/.claude/settings.local.json
Normal file
|
|
@ -0,0 +1,124 @@
|
|||
{
|
||||
"permissions": {
|
||||
"allow": [
|
||||
"Bash(grep:*)",
|
||||
"Bash(python:*)",
|
||||
"Bash(docker ps:*)",
|
||||
"Bash(podman ps:*)",
|
||||
"Bash(curl:*)",
|
||||
"Bash(nc:*)",
|
||||
"Bash(poetry --version:*)",
|
||||
"Bash(docker context:*)",
|
||||
"Bash(open:*)",
|
||||
"Bash(chmod:*)",
|
||||
"Bash(/System/Volumes/Data/mnt/wizard/code/realestate-crawler/crawler/.claude/tools/remote-exec.sh:*)",
|
||||
"Bash(export DOCKER_HOST=unix:///Users/viktorbarzin/.docker/run/docker.sock)",
|
||||
"Bash(docker compose:*)",
|
||||
"Bash(export DOCKER_BUILDKIT=1)",
|
||||
"Bash(export COMPOSE_DOCKER_CLI_BUILD=1)",
|
||||
"Bash(tar:*)",
|
||||
"Bash(docker build:*)",
|
||||
"Bash(docker tag:*)",
|
||||
"Bash(docker run:*)",
|
||||
"Bash(~/.claude/remote-exec.sh \"hostname\")",
|
||||
"Skill(remote)",
|
||||
"Bash(for i in {1..120})",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769814743512676000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769814743512676000.txt)",
|
||||
"Bash(exit 0)",
|
||||
"Bash(fi)",
|
||||
"Bash(done)",
|
||||
"Bash(for i in {1..240})",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769814856118018000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769814856118018000.txt)",
|
||||
"Bash(for i in {1..60})",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769814883284199000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769814883284199000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769815004122069000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769815004122069000.txt)",
|
||||
"Bash(for i in {1..90})",
|
||||
"Bash(do if grep -q \"EXIT_CODE\" ~/.claude/remote-results/cmd-1769814856118018000.txt)",
|
||||
"Bash(then echo \"=== Build completed ===\")",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769815497591226000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769815497591226000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769815530803509000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769815530803509000.txt)",
|
||||
"Bash(do if grep -q \"EXIT_CODE\" ~/.claude/remote-results/cmd-1769815530803509000.txt)",
|
||||
"Bash(for i in {1..30})",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769815614622428000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769815614622428000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769815710424010000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769815710424010000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769815892793650000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769815892793650000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816040589015000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769816040589015000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816256870361000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769816256870361000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816300264785000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769816300264785000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816375772556000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769816375772556000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816407482202000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769816407482202000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816439320016000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769816439320016000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816532941427000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769816532941427000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816611986724000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769816611986724000.txt)",
|
||||
"Bash(for i in {1..40})",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816682085291000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769816682085291000.txt)",
|
||||
"Bash(for i in {1..20})",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816742848870000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769816742848870000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816763327960000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769816763327960000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816784934447000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769816784934447000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816872796427000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769816872796427000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816892104231000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769816892104231000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816911037685000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769816911037685000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816946320457000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769816946320457000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816987766946000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769816987766946000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769817008932477000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769817008932477000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769817027145242000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769817027145242000.txt)",
|
||||
"Bash(for file in /mnt/wizard/code/realestate-crawler/crawler/frontend/src/components/ui/*.tsx)",
|
||||
"Bash(do)",
|
||||
"Bash(basename:*)",
|
||||
"Bash(wc:*)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769819894031906000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769819894031906000.txt)",
|
||||
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769854789336791000.txt ])",
|
||||
"Bash(then cat ~/.claude/remote-results/cmd-1769854789336791000.txt)",
|
||||
"Bash(npx tsc:*)",
|
||||
"Bash(npx eslint:*)",
|
||||
"Bash(find:*)",
|
||||
"Bash(sync)",
|
||||
"Bash(echo:*)",
|
||||
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875304344407000.txt ])",
|
||||
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875304344407000.txt)",
|
||||
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875708563896000.txt ])",
|
||||
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875708563896000.txt)",
|
||||
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875753067606000.txt ])",
|
||||
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875753067606000.txt)",
|
||||
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875830424071000.txt ])",
|
||||
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875830424071000.txt)",
|
||||
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875948670335000.txt ])",
|
||||
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875948670335000.txt)",
|
||||
"Bash(sort:*)",
|
||||
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769876096467703000.txt ])",
|
||||
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769876096467703000.txt)",
|
||||
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769876529766339000.txt ])",
|
||||
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769876529766339000.txt)"
|
||||
]
|
||||
}
|
||||
}
|
||||
101
crawler/.claude/skills/python-313-redis-generic-type/SKILL.md
Normal file
101
crawler/.claude/skills/python-313-redis-generic-type/SKILL.md
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
---
|
||||
name: python-313-redis-generic-type
|
||||
description: |
|
||||
Fix for "TypeError: <class 'redis.client.Redis'> is not a generic class" when using
|
||||
redis-py with Python 3.13. Use when: (1) upgrading to Python 3.13 breaks redis type
|
||||
annotations, (2) mypy passes but runtime fails with generic class error, (3) using
|
||||
redis.Redis[str] or similar parameterized types. Covers redis-py generic type
|
||||
compatibility with Python 3.13's stricter runtime generic checking.
|
||||
author: Claude Code
|
||||
version: 1.0.0
|
||||
date: 2026-01-31
|
||||
---
|
||||
|
||||
# Python 3.13 redis.Redis Generic Type Error
|
||||
|
||||
## Problem
|
||||
Python 3.13 introduced stricter runtime checking for generic types. The redis-py library's
|
||||
`Redis` class is not defined as a generic class at runtime, even though it works with type
|
||||
checkers like mypy. This causes a `TypeError` when you use parameterized types like
|
||||
`redis.Redis[str]` in type annotations that are evaluated at runtime.
|
||||
|
||||
## Context / Trigger Conditions
|
||||
- Python 3.13 or later
|
||||
- Using redis-py library
|
||||
- Type annotation like `redis_client: redis.Redis[str]`
|
||||
- Error message: `TypeError: <class 'redis.client.Redis'> is not a generic class`
|
||||
- Works fine with mypy but fails at runtime
|
||||
- Often appears when instantiating a class with this annotation
|
||||
|
||||
## Solution
|
||||
|
||||
### Option 1: Remove the type parameter (Recommended)
|
||||
```python
|
||||
# Before (breaks in Python 3.13)
|
||||
redis_client: redis.Redis[str]
|
||||
|
||||
# After (works in all Python versions)
|
||||
redis_client: redis.Redis # type: ignore[type-arg]
|
||||
```
|
||||
|
||||
The `# type: ignore[type-arg]` comment silences mypy's warning about missing type arguments.
|
||||
|
||||
### Option 2: Use string annotation (deferred evaluation)
|
||||
```python
|
||||
from __future__ import annotations
|
||||
|
||||
redis_client: "redis.Redis[str]" # String annotation, not evaluated at runtime
|
||||
```
|
||||
|
||||
### Option 3: Use TYPE_CHECKING guard
|
||||
```python
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
RedisClient = redis.Redis[str]
|
||||
else:
|
||||
RedisClient = redis.Redis
|
||||
|
||||
redis_client: RedisClient
|
||||
```
|
||||
|
||||
## Verification
|
||||
1. Run your application with Python 3.13
|
||||
2. The TypeError should no longer appear
|
||||
3. Run mypy to ensure type checking still works (may need type: ignore comment)
|
||||
|
||||
## Example
|
||||
|
||||
### Before (Broken)
|
||||
```python
|
||||
import redis
|
||||
|
||||
class RedisRepository:
|
||||
redis_client: redis.Redis[str] # TypeError at runtime in Python 3.13
|
||||
|
||||
def __init__(self):
|
||||
self.redis_client = redis.Redis(host='localhost', decode_responses=True)
|
||||
```
|
||||
|
||||
### After (Fixed)
|
||||
```python
|
||||
import redis
|
||||
|
||||
class RedisRepository:
|
||||
redis_client: redis.Redis # type: ignore[type-arg]
|
||||
|
||||
def __init__(self):
|
||||
self.redis_client = redis.Redis(host='localhost', decode_responses=True)
|
||||
```
|
||||
|
||||
## Notes
|
||||
- This is a breaking change in Python 3.13's handling of generic types
|
||||
- The redis-py library may add proper generic support in future versions
|
||||
- If using `decode_responses=True`, the client returns `str`; otherwise `bytes`
|
||||
- The `type: ignore` comment is preferable to `Any` as it preserves some type safety
|
||||
- This issue affects other libraries that aren't properly defined as Generic classes
|
||||
|
||||
## References
|
||||
- [Python 3.13 Release Notes](https://docs.python.org/3.13/whatsnew/3.13.html)
|
||||
- [redis-py GitHub Issues](https://github.com/redis/redis-py/issues)
|
||||
- [PEP 585 - Type Hinting Generics In Standard Collections](https://peps.python.org/pep-0585/)
|
||||
|
|
@ -0,0 +1,132 @@
|
|||
---
|
||||
name: python-parentheses-comparison-bug
|
||||
description: |
|
||||
Debug Python comparison bug where parentheses around a variable cause unexpected behavior.
|
||||
Use when: (1) condition always evaluates to False/True unexpectedly, (2) code like
|
||||
"if (mylist) == 0" never triggers, (3) length check seems to not work, (4) comparison
|
||||
with list/dict returns unexpected results. Common mistake where parentheses cause the
|
||||
variable itself to be compared instead of its length.
|
||||
author: Claude Code
|
||||
version: 1.0.0
|
||||
date: 2026-01-31
|
||||
---
|
||||
|
||||
# Python Parentheses Comparison Bug
|
||||
|
||||
## Problem
|
||||
A subtle Python bug where unnecessary parentheses around a variable in a comparison
|
||||
cause the wrong value to be compared. The expression `(mylist) == 0` compares the list
|
||||
itself to 0, not its length. Since a list is never equal to an integer, this always
|
||||
returns False.
|
||||
|
||||
## Context / Trigger Conditions
|
||||
- Condition that should sometimes be True is always False (or vice versa)
|
||||
- Code pattern like `if (existing_items) == 0:` or `if (result) == expected:`
|
||||
- The parentheses don't cause a syntax error but change semantics
|
||||
- Often appears when copying/adapting code or during refactoring
|
||||
- May pass code review because it "looks" correct
|
||||
|
||||
## Solution
|
||||
|
||||
### Identify the Bug Pattern
|
||||
```python
|
||||
# BUG: Compares list to 0, always False
|
||||
if (existing_listings) == 0:
|
||||
return True
|
||||
|
||||
# Also wrong: compares list to integer
|
||||
if (items) == 5:
|
||||
do_something()
|
||||
```
|
||||
|
||||
### Fix: Use len() for Length Comparisons
|
||||
```python
|
||||
# CORRECT: Compares length to 0
|
||||
if len(existing_listings) == 0:
|
||||
return True
|
||||
|
||||
# Alternative: Use truthiness for empty check
|
||||
if not existing_listings:
|
||||
return True
|
||||
|
||||
# CORRECT: Compares length to integer
|
||||
if len(items) == 5:
|
||||
do_something()
|
||||
```
|
||||
|
||||
## Verification
|
||||
1. Add a debug print before the condition: `print(f"list={existing_listings}, len={len(existing_listings)}")`
|
||||
2. Verify the condition now evaluates correctly
|
||||
3. Write a unit test that exercises both branches of the condition
|
||||
|
||||
## Example
|
||||
|
||||
### Before (Broken)
|
||||
```python
|
||||
class FetchListingDetailsStep:
|
||||
async def needs_processing(self, listing_id: int) -> bool:
|
||||
existing_listings = await self.listing_repository.get_listings(
|
||||
only_ids=[listing_id]
|
||||
)
|
||||
# BUG: This compares the list object to 0, which is always False
|
||||
# The parentheses around existing_listings are misleading
|
||||
if (existing_listings) == 0:
|
||||
return True
|
||||
return False
|
||||
```
|
||||
|
||||
### After (Fixed)
|
||||
```python
|
||||
class FetchListingDetailsStep:
|
||||
async def needs_processing(self, listing_id: int) -> bool:
|
||||
existing_listings = await self.listing_repository.get_listings(
|
||||
only_ids=[listing_id]
|
||||
)
|
||||
# CORRECT: Check if list is empty using len()
|
||||
if len(existing_listings) == 0:
|
||||
return True
|
||||
return False
|
||||
```
|
||||
|
||||
### Even Better (Pythonic)
|
||||
```python
|
||||
class FetchListingDetailsStep:
|
||||
async def needs_processing(self, listing_id: int) -> bool:
|
||||
existing_listings = await self.listing_repository.get_listings(
|
||||
only_ids=[listing_id]
|
||||
)
|
||||
# Most Pythonic: Use truthiness
|
||||
return not existing_listings
|
||||
```
|
||||
|
||||
## Notes
|
||||
- Python's truthiness: empty collections are falsy, non-empty are truthy
|
||||
- This bug is particularly insidious because:
|
||||
- It's syntactically valid
|
||||
- It doesn't raise an exception
|
||||
- The parentheses make it look intentional
|
||||
- Code review may miss it
|
||||
- Linters like pylint or flake8 won't catch this specific pattern
|
||||
- Type checkers like mypy may warn about comparing incompatible types
|
||||
- When debugging, add print statements to verify actual vs expected values
|
||||
|
||||
## Prevention
|
||||
- Prefer `if not mylist:` over `if len(mylist) == 0:`
|
||||
- Prefer `if mylist:` over `if len(mylist) > 0:`
|
||||
- Remove unnecessary parentheses around single variables
|
||||
- Enable mypy's strict mode which may catch type comparison issues
|
||||
- Write unit tests that exercise both branches of conditions
|
||||
|
||||
## Related Patterns
|
||||
```python
|
||||
# These are all wrong (comparing object to number):
|
||||
if (mydict) == 0: # Always False
|
||||
if (mylist) > 0: # TypeError in Python 3
|
||||
if (mystring) == 0: # Always False
|
||||
|
||||
# These are correct:
|
||||
if len(mydict) == 0: # True if empty
|
||||
if not mydict: # True if empty (preferred)
|
||||
if len(mylist) > 0: # True if non-empty
|
||||
if mylist: # True if non-empty (preferred)
|
||||
```
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
from data_access import Listing
|
||||
from tqdm import tqdm
|
||||
|
||||
listings = Listing.get_all_listings()
|
||||
recalculate_listings = []
|
||||
|
||||
for listing in listings:
|
||||
sqm = listing.sqm_ocr
|
||||
if sqm is None or sqm < 10 or sqm > 200:
|
||||
recalculate_listings.append(listing)
|
||||
|
||||
for listing in tqdm(recalculate_listings):
|
||||
listing.calculate_sqm_ocr(recalculate=True)
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
# recalculate regex from sqm from already previously ocr'ed text
|
||||
import json
|
||||
from rec.floorplan import extract_total_sqm
|
||||
from tqdm import tqdm
|
||||
from data_access import Listing
|
||||
|
||||
for listing in tqdm(list(Listing.get_all_listings())):
|
||||
with open(listing.path_floorplan_ocr_json()) as f:
|
||||
floorplans = json.load(f)
|
||||
|
||||
for floorplan in floorplans:
|
||||
floorplan["estimated_sqm"] = extract_total_sqm(floorplan["text"])
|
||||
|
||||
with open(listing.path_floorplan_ocr_json(), "w") as f:
|
||||
floorplans = json.dump(floorplans, f)
|
||||
|
|
@ -41,6 +41,7 @@ EXPOSE 5001
|
|||
# Set the entry point (adjust to your CLI's entry point)
|
||||
# ENTRYPOINT ["python", "/app/main.py"]
|
||||
# ENTRYPOINT ["/app/runall.sh"]
|
||||
# CMD ["/bin/bash" ,"-c" ,"alembic upgrade head && uvicorn api.app:app --host 0.0.0.0 --port 8000"]
|
||||
# ENTRYPOINT ["uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
CMD ["./start.sh"]
|
||||
# For local dev with docker-compose:
|
||||
# CMD ["./start.sh"]
|
||||
# For Kubernetes deployment:
|
||||
CMD ["uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "5001"]
|
||||
|
|
|
|||
|
|
@ -0,0 +1,56 @@
|
|||
"""add streaming indexes for query optimization
|
||||
|
||||
Revision ID: a1b2c3d4e5f6
|
||||
Revises: e5f1bc4e3323
|
||||
Create Date: 2026-02-01 12:00:00.000000
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = 'a1b2c3d4e5f6'
|
||||
down_revision: Union[str, None] = 'e5f1bc4e3323'
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
"""Add composite and single-column indexes for streaming query optimization."""
|
||||
# Composite index for main query pattern (bedrooms, price, last_seen filtering)
|
||||
op.create_index(
|
||||
'ix_rentlisting_query_composite',
|
||||
'rentlisting',
|
||||
['number_of_bedrooms', 'price', 'last_seen'],
|
||||
unique=False
|
||||
)
|
||||
op.create_index(
|
||||
'ix_buylisting_query_composite',
|
||||
'buylisting',
|
||||
['number_of_bedrooms', 'price', 'last_seen'],
|
||||
unique=False
|
||||
)
|
||||
|
||||
# Missing single-column indexes for frequently filtered columns
|
||||
op.create_index(
|
||||
'ix_rentlisting_furnish_type',
|
||||
'rentlisting',
|
||||
['furnish_type'],
|
||||
unique=False
|
||||
)
|
||||
op.create_index(
|
||||
'ix_rentlisting_available_from',
|
||||
'rentlisting',
|
||||
['available_from'],
|
||||
unique=False
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
"""Remove streaming indexes."""
|
||||
op.drop_index('ix_rentlisting_available_from', table_name='rentlisting')
|
||||
op.drop_index('ix_rentlisting_furnish_type', table_name='rentlisting')
|
||||
op.drop_index('ix_buylisting_query_composite', table_name='buylisting')
|
||||
op.drop_index('ix_rentlisting_query_composite', table_name='rentlisting')
|
||||
|
|
@ -19,88 +19,12 @@ depends_on: Union[str, Sequence[str], None] = None
|
|||
|
||||
|
||||
def upgrade() -> None:
|
||||
"""Upgrade schema."""
|
||||
# ### commands auto generated by Alembic - please adjust! ###
|
||||
op.drop_index(op.f('ix_user_email'), table_name='user')
|
||||
op.drop_table('user')
|
||||
op.drop_index(op.f('ix_rentlisting_last_seen'), table_name='rentlisting')
|
||||
op.drop_index(op.f('ix_rentlisting_number_of_bedrooms'), table_name='rentlisting')
|
||||
op.drop_index(op.f('ix_rentlisting_price'), table_name='rentlisting')
|
||||
op.drop_index(op.f('ix_rentlisting_square_meters'), table_name='rentlisting')
|
||||
op.drop_table('rentlisting')
|
||||
op.drop_index(op.f('ix_buylisting_last_seen'), table_name='buylisting')
|
||||
op.drop_index(op.f('ix_buylisting_number_of_bedrooms'), table_name='buylisting')
|
||||
op.drop_index(op.f('ix_buylisting_price'), table_name='buylisting')
|
||||
op.drop_index(op.f('ix_buylisting_square_meters'), table_name='buylisting')
|
||||
op.drop_table('buylisting')
|
||||
# ### end Alembic commands ###
|
||||
"""Upgrade schema - this migration is now a no-op since tables already have correct column name."""
|
||||
# The tables were created with 'longitude' (correct spelling) in the initial migration.
|
||||
# This migration was incorrectly auto-generated and has been fixed to be a no-op.
|
||||
pass
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
"""Downgrade schema."""
|
||||
# ### commands auto generated by Alembic - please adjust! ###
|
||||
op.create_table('buylisting',
|
||||
sa.Column('id', mysql.INTEGER(), autoincrement=True, nullable=False),
|
||||
sa.Column('price', mysql.FLOAT(), nullable=False),
|
||||
sa.Column('number_of_bedrooms', mysql.INTEGER(), autoincrement=False, nullable=False),
|
||||
sa.Column('square_meters', mysql.FLOAT(), nullable=True),
|
||||
sa.Column('agency', mysql.VARCHAR(length=255), nullable=True),
|
||||
sa.Column('council_tax_band', mysql.VARCHAR(length=255), nullable=True),
|
||||
sa.Column('longtitude', mysql.FLOAT(), nullable=False),
|
||||
sa.Column('latitude', mysql.FLOAT(), nullable=False),
|
||||
sa.Column('price_history_json', mysql.TEXT(), nullable=False),
|
||||
sa.Column('listing_site', mysql.ENUM('RIGHTMOVE'), nullable=False),
|
||||
sa.Column('last_seen', mysql.DATETIME(), nullable=False),
|
||||
sa.Column('photo_thumbnail', mysql.VARCHAR(length=255), nullable=True),
|
||||
sa.Column('floorplan_image_paths', mysql.JSON(), nullable=False),
|
||||
sa.Column('additional_info', mysql.JSON(), nullable=False),
|
||||
sa.Column('routing_info_json', mysql.TEXT(), nullable=True),
|
||||
sa.Column('service_charge', mysql.FLOAT(), nullable=True),
|
||||
sa.Column('lease_left', mysql.INTEGER(), autoincrement=False, nullable=True),
|
||||
sa.PrimaryKeyConstraint('id'),
|
||||
mysql_collate='utf8mb4_0900_ai_ci',
|
||||
mysql_default_charset='utf8mb4',
|
||||
mysql_engine='InnoDB'
|
||||
)
|
||||
op.create_index(op.f('ix_buylisting_square_meters'), 'buylisting', ['square_meters'], unique=False)
|
||||
op.create_index(op.f('ix_buylisting_price'), 'buylisting', ['price'], unique=False)
|
||||
op.create_index(op.f('ix_buylisting_number_of_bedrooms'), 'buylisting', ['number_of_bedrooms'], unique=False)
|
||||
op.create_index(op.f('ix_buylisting_last_seen'), 'buylisting', ['last_seen'], unique=False)
|
||||
op.create_table('rentlisting',
|
||||
sa.Column('id', mysql.INTEGER(), autoincrement=True, nullable=False),
|
||||
sa.Column('price', mysql.FLOAT(), nullable=False),
|
||||
sa.Column('number_of_bedrooms', mysql.INTEGER(), autoincrement=False, nullable=False),
|
||||
sa.Column('square_meters', mysql.FLOAT(), nullable=True),
|
||||
sa.Column('agency', mysql.VARCHAR(length=255), nullable=True),
|
||||
sa.Column('council_tax_band', mysql.VARCHAR(length=255), nullable=True),
|
||||
sa.Column('longtitude', mysql.FLOAT(), nullable=False),
|
||||
sa.Column('latitude', mysql.FLOAT(), nullable=False),
|
||||
sa.Column('price_history_json', mysql.TEXT(), nullable=False),
|
||||
sa.Column('listing_site', mysql.ENUM('RIGHTMOVE'), nullable=False),
|
||||
sa.Column('last_seen', mysql.DATETIME(), nullable=False),
|
||||
sa.Column('photo_thumbnail', mysql.VARCHAR(length=255), nullable=True),
|
||||
sa.Column('floorplan_image_paths', mysql.JSON(), nullable=False),
|
||||
sa.Column('additional_info', mysql.JSON(), nullable=False),
|
||||
sa.Column('routing_info_json', mysql.TEXT(), nullable=True),
|
||||
sa.Column('available_from', mysql.DATETIME(), nullable=True),
|
||||
sa.Column('furnish_type', mysql.ENUM('FURNISHED', 'UNFURNISHED', 'PART_FURNISHED', 'ASK_LANDLORD', 'UNKNOWN'), nullable=False),
|
||||
sa.PrimaryKeyConstraint('id'),
|
||||
mysql_collate='utf8mb4_0900_ai_ci',
|
||||
mysql_default_charset='utf8mb4',
|
||||
mysql_engine='InnoDB'
|
||||
)
|
||||
op.create_index(op.f('ix_rentlisting_square_meters'), 'rentlisting', ['square_meters'], unique=False)
|
||||
op.create_index(op.f('ix_rentlisting_price'), 'rentlisting', ['price'], unique=False)
|
||||
op.create_index(op.f('ix_rentlisting_number_of_bedrooms'), 'rentlisting', ['number_of_bedrooms'], unique=False)
|
||||
op.create_index(op.f('ix_rentlisting_last_seen'), 'rentlisting', ['last_seen'], unique=False)
|
||||
op.create_table('user',
|
||||
sa.Column('id', mysql.INTEGER(), autoincrement=True, nullable=False),
|
||||
sa.Column('email', mysql.VARCHAR(length=255), nullable=False),
|
||||
sa.Column('password', mysql.VARCHAR(length=255), nullable=False),
|
||||
sa.PrimaryKeyConstraint('id'),
|
||||
mysql_collate='utf8mb4_0900_ai_ci',
|
||||
mysql_default_charset='utf8mb4',
|
||||
mysql_engine='InnoDB'
|
||||
)
|
||||
op.create_index(op.f('ix_user_email'), 'user', ['email'], unique=True)
|
||||
# ### end Alembic commands ###
|
||||
"""Downgrade schema - no-op since upgrade is no-op."""
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
from pathlib import Path
|
||||
import pandas as pd
|
||||
from rec.query import QueryParameters
|
||||
from models.listing import QueryParameters
|
||||
from repositories.listing_repository import ListingRepository
|
||||
|
||||
|
||||
|
|
@ -10,7 +10,7 @@ async def export_to_csv(
|
|||
query_parameters: QueryParameters | None = None,
|
||||
) -> None:
|
||||
listings = await repository.get_listings(query_parameters=query_parameters)
|
||||
ds = [*[listing.__dict__ for listing in listings]]
|
||||
ds = [listing.__dict__ for listing in listings]
|
||||
df = pd.DataFrame(ds)
|
||||
|
||||
# read decisions on file
|
||||
|
|
@ -22,37 +22,19 @@ async def export_to_csv(
|
|||
drop_columns = ["_sa_instance_state", "additional_info"]
|
||||
df = df.drop(columns=drop_columns)
|
||||
|
||||
# remove all entries where we didnt calculate transit time (probably due to a too far distance)
|
||||
# df2 = df[df.travel_time_fastest.notna()]
|
||||
df2 = df
|
||||
# fill in gap values for service charge and lease left for Excel filters
|
||||
if "service_charge" not in df.columns:
|
||||
df.loc[:, "service_charge"] = -1
|
||||
df.loc[:, "service_charge"] = df.service_charge.fillna(-1)
|
||||
if "lease_left" not in df.columns:
|
||||
df.loc[:, "lease_left"] = -1
|
||||
df.loc[:, "lease_left"] = df.lease_left.fillna(-1)
|
||||
if "square_meters" not in df.columns:
|
||||
df.loc[:, "square_meters"] = -1
|
||||
df.loc[:, "square_meters"] = df.square_meters.fillna(-1)
|
||||
|
||||
# drop columns
|
||||
# dropcolumns = ['distance_per_transit', 'duration_static', 'distance']
|
||||
# s1 = df2['travel_time_fastest'].apply(pd.Series).drop(dropcolumns, axis=1)
|
||||
# s1 = df2
|
||||
# Add price per sqm column
|
||||
df.loc[:, "price_per_sqm"] = df.price / df.square_meters
|
||||
|
||||
# fill in gap values for service charge and lease left. This is for excel so we can use filters better there
|
||||
if "service_charge" not in df2.columns:
|
||||
df2.loc[:, "service_charge"] = -1
|
||||
df2.loc[:, "service_charge"] = df2.service_charge.fillna(-1)
|
||||
if "lease_left" not in df2.columns:
|
||||
df2.loc[:, "lease_left"] = -1
|
||||
df2.loc[:, "lease_left"] = df2.lease_left.fillna(-1)
|
||||
if "square_meters" not in df2.columns:
|
||||
df2.loc[:, "square_meters"] = -1
|
||||
df2.loc[:, "square_meters"] = df2.square_meters.fillna(-1)
|
||||
|
||||
df3 = df2
|
||||
# df3 = pd.concat([df2.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1)
|
||||
# df3.loc[:, 'duration'] = (df3.loc[:, ['duration']].min(axis=1) / 60).round()
|
||||
df3.shape
|
||||
df4 = df3
|
||||
|
||||
# df5 = df4[columns]
|
||||
|
||||
# Add some interesting columns
|
||||
df4.loc[:, "price_per_sqm"] = df4.price / df4.square_meters
|
||||
df5 = df4
|
||||
|
||||
df6 = df5.sort_values(by=["price_per_sqm"], ascending=True)
|
||||
df6.to_csv(str(output_file), index=False)
|
||||
df = df.sort_values(by=["price_per_sqm"], ascending=True)
|
||||
df.to_csv(str(output_file), index=False)
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ from dataclasses import dataclass
|
|||
import json
|
||||
import pathlib
|
||||
from typing import Any, List
|
||||
import warnings
|
||||
from models.listing import ListingSite, PriceHistoryItem
|
||||
from rec import floorplan, routing
|
||||
import re
|
||||
|
|
@ -12,6 +13,12 @@ import datetime
|
|||
|
||||
@dataclass()
|
||||
class Listing:
|
||||
"""Legacy Listing class for filesystem-based data access.
|
||||
|
||||
.. deprecated::
|
||||
Use models.listing.RentListing or models.listing.BuyListing instead.
|
||||
This class is kept for backwards compatibility with the populate_db command.
|
||||
"""
|
||||
identifier: int
|
||||
_details_object: dict[str, Any] | None = None
|
||||
_listing_object: dict[str, Any] | None = None
|
||||
|
|
@ -36,6 +43,14 @@ class Listing:
|
|||
"council_tax_band",
|
||||
]
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
warnings.warn(
|
||||
"data_access.Listing is deprecated. Use models.listing.RentListing "
|
||||
"or models.listing.BuyListing instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=3,
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def get_all_listings(
|
||||
listing_paths: list[pathlib.Path],
|
||||
|
|
@ -144,39 +159,6 @@ class Listing:
|
|||
# todo add check if return is image
|
||||
return images
|
||||
|
||||
def calculate_sqm_model(self):
|
||||
objs = []
|
||||
for floorplan_path in self.list_floorplans():
|
||||
estimated_sqm, model_output, predictions = floorplan.calculate_model(
|
||||
floorplan_path
|
||||
)
|
||||
objs.append(
|
||||
{
|
||||
"floorplan_path": str(floorplan_path),
|
||||
"estimated_sqm": estimated_sqm,
|
||||
"model_output": model_output,
|
||||
"no_predictions": len(
|
||||
predictions
|
||||
), # cant serialize the predictions itself since its a tensor
|
||||
}
|
||||
)
|
||||
|
||||
with open(self.path_floorplan_model_json(), "w") as f:
|
||||
json.dump(objs, f)
|
||||
|
||||
@property
|
||||
def sqm_model(self, recalculate=True) -> float:
|
||||
if not self.path_floorplan_model_json().exists() or recalculate:
|
||||
self.calculate_sqm_model()
|
||||
|
||||
with open(self.path_floorplan_json()) as f:
|
||||
objs = json.load(f)
|
||||
|
||||
max_sqm = max(
|
||||
[o["estimated_sqm"] for o in objs if o is None]
|
||||
) # filter out Nones
|
||||
return max_sqm
|
||||
|
||||
async def calculate_sqm_ocr(self, recalculate=True):
|
||||
objs = []
|
||||
if self.path_floorplan_ocr_json().exists():
|
||||
|
|
@ -405,63 +387,6 @@ class Listing:
|
|||
def listing_site(self) -> ListingSite:
|
||||
return ListingSite.RIGHTMOVE # this class supports only right move
|
||||
|
||||
async def dict_nicely(self):
|
||||
travel_time_fastest = {}
|
||||
travel_time_second = {}
|
||||
if self.path_routing_json().exists():
|
||||
with open(self.path_routing_json(), "r") as f:
|
||||
travel_times = json.load(f)
|
||||
for destination_mode in travel_times.keys():
|
||||
destination_mode_clean = destination_mode.replace(" ", "_").replace(
|
||||
",", "_"
|
||||
)
|
||||
destination, travel_mode = self.__from_routing_cache_key(
|
||||
destination_mode
|
||||
)
|
||||
travel_time_fastest[destination_mode_clean] = self.travel_time(
|
||||
destination, travel_mode
|
||||
)[0]["duration"]
|
||||
travel_time_second[destination_mode_clean] = self.travel_time(
|
||||
destination, travel_mode
|
||||
)[1]["duration"]
|
||||
|
||||
return {
|
||||
"identifier": self.identifier,
|
||||
"sqm_ocr": await self.sqm_ocr(),
|
||||
"price": self.price,
|
||||
"price_per_sqm": await self.price_per_sqm(),
|
||||
"url": self.url,
|
||||
"bedrooms": self.bedrooms,
|
||||
"travel_time_fastest": ":".join(
|
||||
sorted(
|
||||
f"{dest} in {travel_mode//60}min"
|
||||
for dest, travel_mode in travel_time_fastest.items()
|
||||
)
|
||||
),
|
||||
"travel_time_second": ":".join(
|
||||
sorted(
|
||||
f"{dest} in {travel_mode//60}min"
|
||||
for dest, travel_mode in travel_time_second.items()
|
||||
)
|
||||
),
|
||||
"lease_left": self.leaseLeft,
|
||||
"service_charge": self.serviceCharge,
|
||||
"development": self.development,
|
||||
"tenure_type": self.tenure_type,
|
||||
"updated_days": self.updateDaysAgo,
|
||||
"status": self.status,
|
||||
"last_seen": self.last_seen,
|
||||
"agency": self.agency,
|
||||
"council_tax_band": self.councilTaxBand,
|
||||
"photo_thumbnail": self.photoThumbnail,
|
||||
"let_date_available": (
|
||||
self.letDateAvailable.strftime("%d/%m/%Y")
|
||||
if self.letDateAvailable
|
||||
else "Ask agent"
|
||||
),
|
||||
"price_history": self.priceHistory,
|
||||
}
|
||||
|
||||
def __routing_cache_key(
|
||||
self,
|
||||
dest_address: str,
|
||||
|
|
|
|||
|
|
@ -14,10 +14,13 @@ services:
|
|||
interval: 5s
|
||||
timeout: 3s
|
||||
retries: 5
|
||||
networks:
|
||||
- rec-network
|
||||
|
||||
mysql:
|
||||
image: mysql:9
|
||||
container_name: rec-mysql
|
||||
hostname: mysql
|
||||
ports:
|
||||
- "3306:3306"
|
||||
environment:
|
||||
|
|
@ -32,6 +35,9 @@ services:
|
|||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
start_period: 30s
|
||||
networks:
|
||||
- rec-network
|
||||
|
||||
app:
|
||||
build:
|
||||
|
|
@ -47,7 +53,7 @@ services:
|
|||
- app_venv:/app/.venv
|
||||
environment:
|
||||
- ENV=dev
|
||||
- DB_CONNECTION_STRING=mysql://wrongmove:wrongmove@mysql:3306/wrongmove
|
||||
- DB_CONNECTION_STRING=mysql+mysqldb://wrongmove:wrongmove@mysql:3306/wrongmove
|
||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
||||
- ROUTING_API_KEY=${ROUTING_API_KEY:-}
|
||||
|
|
@ -57,6 +63,8 @@ services:
|
|||
mysql:
|
||||
condition: service_healthy
|
||||
command: ["uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "5001", "--reload", "--reload-dir", "api", "--reload-dir", "services", "--reload-dir", "repositories", "--reload-dir", "models"]
|
||||
networks:
|
||||
- rec-network
|
||||
|
||||
celery:
|
||||
build:
|
||||
|
|
@ -68,7 +76,7 @@ services:
|
|||
- app_venv:/app/.venv
|
||||
environment:
|
||||
- ENV=dev
|
||||
- DB_CONNECTION_STRING=mysql://wrongmove:wrongmove@mysql:3306/wrongmove
|
||||
- DB_CONNECTION_STRING=mysql+mysqldb://wrongmove:wrongmove@mysql:3306/wrongmove
|
||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
||||
- ROUTING_API_KEY=${ROUTING_API_KEY:-}
|
||||
|
|
@ -79,6 +87,8 @@ services:
|
|||
mysql:
|
||||
condition: service_healthy
|
||||
command: ["celery", "-A", "celery_app", "worker", "--loglevel=info"]
|
||||
networks:
|
||||
- rec-network
|
||||
|
||||
celery-beat:
|
||||
build:
|
||||
|
|
@ -90,7 +100,7 @@ services:
|
|||
- app_venv:/app/.venv
|
||||
environment:
|
||||
- ENV=dev
|
||||
- DB_CONNECTION_STRING=mysql://wrongmove:wrongmove@mysql:3306/wrongmove
|
||||
- DB_CONNECTION_STRING=mysql+mysqldb://wrongmove:wrongmove@mysql:3306/wrongmove
|
||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
||||
- SCRAPE_SCHEDULES=${SCRAPE_SCHEDULES:-}
|
||||
|
|
@ -98,6 +108,12 @@ services:
|
|||
- redis
|
||||
- celery
|
||||
command: ["celery", "-A", "celery_app", "beat", "--loglevel=info"]
|
||||
networks:
|
||||
- rec-network
|
||||
|
||||
networks:
|
||||
rec-network:
|
||||
driver: bridge
|
||||
|
||||
volumes:
|
||||
redis_data:
|
||||
|
|
|
|||
183
crawler/docs/BACKEND.md
Normal file
183
crawler/docs/BACKEND.md
Normal file
|
|
@ -0,0 +1,183 @@
|
|||
# Real Estate Crawler - Backend Documentation
|
||||
|
||||
A property listing aggregator that scrapes Rightmove UK, extracts square meters via OCR, and calculates transit routes.
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# Docker (recommended) - starts Redis, MySQL, API, and Celery
|
||||
./start.sh
|
||||
|
||||
# Or run locally with Poetry
|
||||
poetry install
|
||||
./start.sh --local
|
||||
```
|
||||
|
||||
API available at `http://localhost:5001`
|
||||
|
||||
## Dependencies
|
||||
|
||||
| Dependency | Purpose |
|
||||
|------------|---------|
|
||||
| Python 3.11+ | Runtime |
|
||||
| Redis | Celery message broker |
|
||||
| MySQL/SQLite | Database |
|
||||
| Tesseract OCR | Floorplan text extraction |
|
||||
| Docker | Containerized deployment |
|
||||
|
||||
### Python Packages (key)
|
||||
- `fastapi` + `uvicorn` - HTTP API
|
||||
- `celery` - Background tasks
|
||||
- `sqlmodel` - ORM
|
||||
- `pytesseract` + `opencv` - OCR
|
||||
- `aiohttp` - Async HTTP client
|
||||
|
||||
## API Endpoints
|
||||
|
||||
### Health Check
|
||||
```bash
|
||||
curl http://localhost:5001/api/status
|
||||
# {"status": "OK"}
|
||||
```
|
||||
|
||||
### Get Listings
|
||||
```bash
|
||||
curl -H "Authorization: Bearer $TOKEN" \
|
||||
"http://localhost:5001/api/listing?limit=10"
|
||||
```
|
||||
|
||||
### Get Listings as GeoJSON
|
||||
```bash
|
||||
curl -H "Authorization: Bearer $TOKEN" \
|
||||
"http://localhost:5001/api/listing_geojson?listing_type=RENT&min_bedrooms=2&max_price=3000"
|
||||
```
|
||||
|
||||
### Refresh Listings (async)
|
||||
```bash
|
||||
curl -X POST -H "Authorization: Bearer $TOKEN" \
|
||||
"http://localhost:5001/api/refresh_listings?listing_type=RENT&min_bedrooms=2&max_bedrooms=3&min_price=2000&max_price=4000"
|
||||
# {"task_id": "abc123", "message": "Task abc123 started"}
|
||||
```
|
||||
|
||||
### Check Task Status
|
||||
```bash
|
||||
curl -H "Authorization: Bearer $TOKEN" \
|
||||
"http://localhost:5001/api/task_status?task_id=abc123"
|
||||
# {"task_id": "abc123", "status": "SUCCESS", "result": "..."}
|
||||
```
|
||||
|
||||
### Get Districts
|
||||
```bash
|
||||
curl -H "Authorization: Bearer $TOKEN" \
|
||||
"http://localhost:5001/api/get_districts"
|
||||
# {"Westminster": "REGION^93965", "Camden": "REGION^93934", ...}
|
||||
```
|
||||
|
||||
## CLI Commands
|
||||
|
||||
```bash
|
||||
# Fetch listings from Rightmove
|
||||
python main.py dump-listings -t rent --min-bedrooms 2 --max-price 4000
|
||||
|
||||
# Download floorplan images
|
||||
python main.py dump-images
|
||||
|
||||
# Run OCR on floorplans
|
||||
python main.py detect-floorplan
|
||||
|
||||
# Calculate transit routes
|
||||
python main.py routing -d "10 Downing Street, London" -m TRANSIT -l 10
|
||||
|
||||
# Export to GeoJSON
|
||||
python main.py export-immoweb -O output.geojson -t rent --min-bedrooms 2
|
||||
|
||||
# Export to CSV
|
||||
python main.py export-csv -O output.csv -t rent
|
||||
|
||||
# List available districts
|
||||
python main.py list-districts
|
||||
```
|
||||
|
||||
## Query Parameters
|
||||
|
||||
| Parameter | Type | Description |
|
||||
|-----------|------|-------------|
|
||||
| `listing_type` | RENT/BUY | Property type |
|
||||
| `min_bedrooms` | int | Minimum bedrooms |
|
||||
| `max_bedrooms` | int | Maximum bedrooms |
|
||||
| `min_price` | int | Minimum price |
|
||||
| `max_price` | int | Maximum price |
|
||||
| `min_sqm` | int | Minimum square meters |
|
||||
| `district` | string | District name (repeatable) |
|
||||
| `furnish_types` | string | FURNISHED/UNFURNISHED/PART_FURNISHED |
|
||||
| `last_seen_days` | int | Only listings seen in last N days |
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
||||
│ CLI │ │ HTTP API │ │ Celery │
|
||||
│ (main.py) │ │ (api/app.py)│ │ Worker │
|
||||
└──────┬──────┘ └──────┬──────┘ └──────┬──────┘
|
||||
│ │ │
|
||||
└───────────────────┼───────────────────┘
|
||||
│
|
||||
┌────────▼────────┐
|
||||
│ Services │
|
||||
│ (services/*.py) │
|
||||
└────────┬────────┘
|
||||
│
|
||||
┌────────────┼────────────┐
|
||||
│ │ │
|
||||
┌──────▼──────┐ ┌───▼───┐ ┌──────▼──────┐
|
||||
│ Repository │ │ Redis │ │ Rightmove │
|
||||
│ (MySQL) │ │ │ │ API │
|
||||
└─────────────┘ └───────┘ └─────────────┘
|
||||
```
|
||||
|
||||
## Environment Variables
|
||||
|
||||
```bash
|
||||
# Database
|
||||
DB_CONNECTION_STRING=mysql://user:pass@localhost:3306/wrongmove
|
||||
|
||||
# Redis (Celery)
|
||||
CELERY_BROKER_URL=redis://localhost:6379/0
|
||||
CELERY_RESULT_BACKEND=redis://localhost:6379/0
|
||||
|
||||
# Google Maps (optional, for routing)
|
||||
ROUTING_API_KEY=your_api_key
|
||||
```
|
||||
|
||||
## Authentication
|
||||
|
||||
API endpoints (except `/api/status`) require JWT authentication via Authentik OIDC.
|
||||
|
||||
```bash
|
||||
# Get token from Authentik, then:
|
||||
curl -H "Authorization: Bearer $TOKEN" http://localhost:5001/api/listing
|
||||
```
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
├── main.py # CLI entry point
|
||||
├── api/app.py # FastAPI application
|
||||
├── services/ # Business logic (shared by CLI + API)
|
||||
│ ├── listing_service.py
|
||||
│ ├── export_service.py
|
||||
│ ├── district_service.py
|
||||
│ └── task_service.py
|
||||
├── repositories/ # Database access
|
||||
├── models/ # SQLModel entities
|
||||
├── rec/ # Core logic (query, OCR, routing)
|
||||
├── tasks/ # Celery background tasks
|
||||
└── tests/ # Test suite
|
||||
```
|
||||
|
||||
## Running Tests
|
||||
|
||||
```bash
|
||||
pytest tests/ -v --cov=.
|
||||
mypy .
|
||||
```
|
||||
|
|
@ -12,130 +12,47 @@ import {
|
|||
} from "@/components/ui/sidebar"
|
||||
import * as React from "react"
|
||||
|
||||
// This is sample data.
|
||||
const data = {
|
||||
navMain: [
|
||||
{
|
||||
title: "Getting Started",
|
||||
title: "Property Explorer",
|
||||
url: "#",
|
||||
items: [
|
||||
{
|
||||
title: "Installation",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "Project Structure",
|
||||
url: "#",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
title: "Building Your Application",
|
||||
url: "#",
|
||||
items: [
|
||||
{
|
||||
title: "Routing",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "Data Fetching",
|
||||
title: "Map View",
|
||||
url: "#",
|
||||
isActive: true,
|
||||
},
|
||||
{
|
||||
title: "Rendering",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "Caching",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "Styling",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "Optimizing",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "Configuring",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "Testing",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "Authentication",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "Deploying",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "Upgrading",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "Examples",
|
||||
title: "List View",
|
||||
url: "#",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
title: "API Reference",
|
||||
title: "Data Management",
|
||||
url: "#",
|
||||
items: [
|
||||
{
|
||||
title: "Components",
|
||||
title: "Refresh Listings",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "File Conventions",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "Functions",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "next.config.js Options",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "CLI",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "Edge Runtime",
|
||||
title: "Active Tasks",
|
||||
url: "#",
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
title: "Architecture",
|
||||
title: "Settings",
|
||||
url: "#",
|
||||
items: [
|
||||
{
|
||||
title: "Accessibility",
|
||||
title: "Preferences",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "Fast Refresh",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "Next.js Compiler",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "Supported Browsers",
|
||||
url: "#",
|
||||
},
|
||||
{
|
||||
title: "Turbopack",
|
||||
title: "Account",
|
||||
url: "#",
|
||||
},
|
||||
],
|
||||
|
|
@ -145,21 +62,19 @@ const data = {
|
|||
|
||||
export function AppSidebar({ ...props }: React.ComponentProps<typeof Sidebar>) {
|
||||
return (
|
||||
// create closed by default
|
||||
<Sidebar {...props} >
|
||||
<Sidebar {...props}>
|
||||
<SidebarHeader>
|
||||
</SidebarHeader>
|
||||
<SidebarContent>
|
||||
{/* We create a SidebarGroup for each parent. */}
|
||||
{data.navMain.map((item) => (
|
||||
<SidebarGroup key={item.title}>
|
||||
<SidebarGroupLabel>{item.title}</SidebarGroupLabel>
|
||||
<SidebarGroupContent>
|
||||
<SidebarMenu>
|
||||
{item.items.map((item) => (
|
||||
<SidebarMenuItem key={item.title}>
|
||||
<SidebarMenuButton asChild isActive={item.isActive}>
|
||||
<a href={item.url}>{item.title}</a>
|
||||
{item.items.map((subItem) => (
|
||||
<SidebarMenuItem key={subItem.title}>
|
||||
<SidebarMenuButton asChild isActive={subItem.isActive}>
|
||||
<a href={subItem.url}>{subItem.title}</a>
|
||||
</SidebarMenuButton>
|
||||
</SidebarMenuItem>
|
||||
))}
|
||||
|
|
|
|||
|
|
@ -1,8 +1,6 @@
|
|||
import { getUser } from "@/auth/authService";
|
||||
import { zodResolver } from "@hookform/resolvers/zod";
|
||||
import { DialogTitle } from "@radix-ui/react-dialog";
|
||||
import type { User } from "oidc-client-ts";
|
||||
import { useEffect, useState } from "react";
|
||||
import { useState } from "react";
|
||||
import { useForm } from "react-hook-form";
|
||||
import { z } from "zod";
|
||||
import { Button } from "./ui/button";
|
||||
|
|
@ -24,6 +22,12 @@ export enum ListingType {
|
|||
BUY = 'BUY'
|
||||
}
|
||||
|
||||
export enum FurnishType {
|
||||
FURNISHED = 'furnished',
|
||||
PART_FURNISHED = 'partFurnished',
|
||||
UNFURNISHED = 'unfurnished',
|
||||
}
|
||||
|
||||
|
||||
export interface ParameterValues {
|
||||
metric: Metric
|
||||
|
|
@ -33,30 +37,15 @@ export interface ParameterValues {
|
|||
min_price?: number
|
||||
max_price?: number
|
||||
min_sqm?: number
|
||||
max_sqm?: number
|
||||
min_price_per_sqm?: number
|
||||
max_price_per_sqm?: number
|
||||
last_seen_days?: number
|
||||
available_from?: Date
|
||||
district: string
|
||||
furnish_types?: FurnishType[]
|
||||
}
|
||||
|
||||
const fetchDistricts = async (user: User | null) => {
|
||||
const accessToken = user?.access_token;
|
||||
|
||||
const response = await fetch('/api/get_districts',
|
||||
{
|
||||
method: 'GET',
|
||||
headers: {
|
||||
'Authorization': `Bearer ${accessToken}`, // Pass the token
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
}
|
||||
);
|
||||
if (!response.ok) {
|
||||
throw new Error('Error: ' + response.status);
|
||||
}
|
||||
const data: Response = await response.json();
|
||||
return data;
|
||||
};
|
||||
|
||||
export function Parameters(
|
||||
props: {
|
||||
isOpen: boolean,
|
||||
|
|
@ -69,15 +58,6 @@ export function Parameters(
|
|||
} = useForm<ParameterValues>()
|
||||
const [action, setAction] = useState<'fetch-data' | 'visualize' | null>(null)
|
||||
const [availableFromRawInput, setAvailableFromRawInput] = useState("now");
|
||||
const [_districts, setDistricts] = useState<string[]>([]);
|
||||
|
||||
useEffect(() => {
|
||||
getUser().then(user => {
|
||||
fetchDistricts(user).then(data => {
|
||||
setDistricts(Object.keys(data));
|
||||
})
|
||||
})
|
||||
}, []);
|
||||
|
||||
const formSchema = z.object({
|
||||
metric: z.nativeEnum(Metric, { required_error: "Metric is required" }),
|
||||
|
|
@ -177,29 +157,6 @@ export function Parameters(
|
|||
</FormItem>
|
||||
)}
|
||||
/>
|
||||
{/* <FormField # listings don't have district stored as metadata; so only useful in rightmove querying
|
||||
control={form.control}
|
||||
name="district"
|
||||
render={({ field }) => (
|
||||
<FormItem className="flex flex-row items-center gap-4">
|
||||
<FormLabel>District</FormLabel>
|
||||
<Select onValueChange={field.onChange} defaultValue={field.value}>
|
||||
<FormControl>
|
||||
<SelectTrigger className="w-[180px]">
|
||||
<SelectValue placeholder="District" />
|
||||
</SelectTrigger>
|
||||
</FormControl>
|
||||
<SelectContent {...register('district')} >
|
||||
{districts.map((district, index) => (
|
||||
<SelectItem key={index} value={district}>{district}
|
||||
</SelectItem>
|
||||
))}
|
||||
</SelectContent>
|
||||
</Select>
|
||||
<FormMessage />
|
||||
</FormItem>
|
||||
)}
|
||||
/> */}
|
||||
<FormField
|
||||
control={form.control}
|
||||
name="min_sqm"
|
||||
|
|
|
|||
128
crawler/frontend/src/components/StatsBar.tsx
Normal file
128
crawler/frontend/src/components/StatsBar.tsx
Normal file
|
|
@ -0,0 +1,128 @@
|
|||
import { BarChart3, MapPin, PoundSterling, Maximize2, List, Map as MapIcon } from 'lucide-react';
|
||||
import { Button } from './ui/button';
|
||||
import type { GeoJSONFeatureCollection, PropertyFeature } from '@/types';
|
||||
|
||||
export type ViewMode = 'map' | 'list' | 'split';
|
||||
|
||||
interface StatsBarProps {
|
||||
listingData: GeoJSONFeatureCollection | null;
|
||||
viewMode: ViewMode;
|
||||
onViewModeChange: (mode: ViewMode) => void;
|
||||
}
|
||||
|
||||
interface ListingStats {
|
||||
count: number;
|
||||
avgPrice: number;
|
||||
avgPricePerSqm: number;
|
||||
avgSize: number;
|
||||
}
|
||||
|
||||
function calculateStats(data: GeoJSONFeatureCollection | null): ListingStats {
|
||||
if (!data || data.features.length === 0) {
|
||||
return { count: 0, avgPrice: 0, avgPricePerSqm: 0, avgSize: 0 };
|
||||
}
|
||||
|
||||
const features = data.features;
|
||||
const count = features.length;
|
||||
|
||||
const validPrices = features
|
||||
.map((f: PropertyFeature) => f.properties.total_price)
|
||||
.filter((p): p is number => typeof p === 'number' && p > 0);
|
||||
|
||||
const validPricesPerSqm = features
|
||||
.map((f: PropertyFeature) => f.properties.qmprice)
|
||||
.filter((p): p is number => typeof p === 'number' && p > 0);
|
||||
|
||||
const validSizes = features
|
||||
.map((f: PropertyFeature) => f.properties.qm)
|
||||
.filter((s): s is number => typeof s === 'number' && s > 0);
|
||||
|
||||
const avgPrice = validPrices.length > 0
|
||||
? validPrices.reduce((a, b) => a + b, 0) / validPrices.length
|
||||
: 0;
|
||||
|
||||
const avgPricePerSqm = validPricesPerSqm.length > 0
|
||||
? validPricesPerSqm.reduce((a, b) => a + b, 0) / validPricesPerSqm.length
|
||||
: 0;
|
||||
|
||||
const avgSize = validSizes.length > 0
|
||||
? validSizes.reduce((a, b) => a + b, 0) / validSizes.length
|
||||
: 0;
|
||||
|
||||
return { count, avgPrice, avgPricePerSqm, avgSize };
|
||||
}
|
||||
|
||||
function formatCurrency(value: number): string {
|
||||
if (value >= 1000) {
|
||||
return `£${(value / 1000).toFixed(1)}k`;
|
||||
}
|
||||
return `£${Math.round(value)}`;
|
||||
}
|
||||
|
||||
export function StatsBar({ listingData, viewMode, onViewModeChange }: StatsBarProps) {
|
||||
const stats = calculateStats(listingData);
|
||||
|
||||
return (
|
||||
<div className="flex items-center justify-between px-4 py-2 bg-muted/50 border-t text-sm">
|
||||
{/* Stats */}
|
||||
<div className="flex items-center gap-4 text-muted-foreground">
|
||||
<div className="flex items-center gap-1.5">
|
||||
<MapPin className="h-4 w-4" />
|
||||
<span className="font-medium text-foreground">{stats.count.toLocaleString()}</span>
|
||||
<span className="hidden sm:inline">listings</span>
|
||||
</div>
|
||||
|
||||
{stats.avgPrice > 0 && (
|
||||
<>
|
||||
<div className="hidden md:flex items-center gap-1.5">
|
||||
<PoundSterling className="h-4 w-4" />
|
||||
<span>Avg: <span className="font-medium text-foreground">{formatCurrency(stats.avgPrice)}</span></span>
|
||||
</div>
|
||||
<div className="hidden lg:flex items-center gap-1.5">
|
||||
<BarChart3 className="h-4 w-4" />
|
||||
<span>Avg £/m²: <span className="font-medium text-foreground">{formatCurrency(stats.avgPricePerSqm)}</span></span>
|
||||
</div>
|
||||
<div className="hidden lg:flex items-center gap-1.5">
|
||||
<Maximize2 className="h-4 w-4" />
|
||||
<span>Avg: <span className="font-medium text-foreground">{Math.round(stats.avgSize)} m²</span></span>
|
||||
</div>
|
||||
</>
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* View Mode Toggle */}
|
||||
<div className="flex items-center gap-1 bg-background rounded-md border p-0.5">
|
||||
<Button
|
||||
variant={viewMode === 'map' ? 'secondary' : 'ghost'}
|
||||
size="sm"
|
||||
className="h-7 px-2"
|
||||
onClick={() => onViewModeChange('map')}
|
||||
>
|
||||
<MapIcon className="h-4 w-4" />
|
||||
<span className="hidden sm:inline ml-1">Map</span>
|
||||
</Button>
|
||||
<Button
|
||||
variant={viewMode === 'list' ? 'secondary' : 'ghost'}
|
||||
size="sm"
|
||||
className="h-7 px-2"
|
||||
onClick={() => onViewModeChange('list')}
|
||||
>
|
||||
<List className="h-4 w-4" />
|
||||
<span className="hidden sm:inline ml-1">List</span>
|
||||
</Button>
|
||||
<Button
|
||||
variant={viewMode === 'split' ? 'secondary' : 'ghost'}
|
||||
size="sm"
|
||||
className="h-7 px-2 hidden md:flex"
|
||||
onClick={() => onViewModeChange('split')}
|
||||
>
|
||||
<div className="flex gap-0.5">
|
||||
<div className="w-2 h-4 bg-current rounded-sm opacity-60" />
|
||||
<div className="w-2 h-4 border border-current rounded-sm" />
|
||||
</div>
|
||||
<span className="hidden sm:inline ml-1">Split</span>
|
||||
</Button>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
47
crawler/frontend/src/components/StreamingProgressBar.tsx
Normal file
47
crawler/frontend/src/components/StreamingProgressBar.tsx
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
import { Loader2 } from 'lucide-react';
|
||||
import type { StreamingProgress } from '@/services';
|
||||
|
||||
interface StreamingProgressBarProps {
|
||||
progress: StreamingProgress | null;
|
||||
isLoading: boolean;
|
||||
}
|
||||
|
||||
export function StreamingProgressBar({ progress, isLoading }: StreamingProgressBarProps) {
|
||||
if (!isLoading) return null;
|
||||
|
||||
return (
|
||||
<div className="absolute top-0 left-0 right-0 z-10 bg-background/95 backdrop-blur-sm border-b px-4 py-2">
|
||||
<div className="flex items-center gap-3">
|
||||
<Loader2 className="h-4 w-4 animate-spin text-primary" />
|
||||
<div className="flex-1">
|
||||
<div className="flex items-center justify-between text-sm">
|
||||
<span className="font-medium">
|
||||
{progress
|
||||
? `Loading listings...`
|
||||
: 'Loading...'}
|
||||
</span>
|
||||
{progress && (
|
||||
<span className="text-muted-foreground">
|
||||
{progress.count.toLocaleString()}
|
||||
{progress.total ? ` / ${progress.total.toLocaleString()}` : ''} loaded
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
{progress && (
|
||||
<div className="mt-1 h-1.5 w-full bg-primary/20 rounded-full overflow-hidden">
|
||||
<div
|
||||
className="h-full bg-primary transition-all duration-300 ease-out rounded-full"
|
||||
style={{
|
||||
width: progress.total
|
||||
? `${Math.min((progress.count / progress.total) * 100, 100)}%`
|
||||
: '100%',
|
||||
animation: progress.total ? undefined : 'pulse 1.5s ease-in-out infinite',
|
||||
}}
|
||||
/>
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
);
|
||||
}
|
||||
56
crawler/frontend/src/components/ui/accordion.tsx
Normal file
56
crawler/frontend/src/components/ui/accordion.tsx
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
"use client"
|
||||
|
||||
import * as React from "react"
|
||||
import * as AccordionPrimitive from "@radix-ui/react-accordion"
|
||||
import { ChevronDown } from "lucide-react"
|
||||
import { cn } from "@/lib/utils"
|
||||
|
||||
const Accordion = AccordionPrimitive.Root
|
||||
|
||||
const AccordionItem = React.forwardRef<
|
||||
React.ComponentRef<typeof AccordionPrimitive.Item>,
|
||||
React.ComponentPropsWithoutRef<typeof AccordionPrimitive.Item>
|
||||
>(({ className, ...props }, ref) => (
|
||||
<AccordionPrimitive.Item
|
||||
ref={ref}
|
||||
className={cn("border-b", className)}
|
||||
{...props}
|
||||
/>
|
||||
))
|
||||
AccordionItem.displayName = "AccordionItem"
|
||||
|
||||
const AccordionTrigger = React.forwardRef<
|
||||
React.ComponentRef<typeof AccordionPrimitive.Trigger>,
|
||||
React.ComponentPropsWithoutRef<typeof AccordionPrimitive.Trigger>
|
||||
>(({ className, children, ...props }, ref) => (
|
||||
<AccordionPrimitive.Header className="flex">
|
||||
<AccordionPrimitive.Trigger
|
||||
ref={ref}
|
||||
className={cn(
|
||||
"flex flex-1 items-center justify-between py-4 text-sm font-medium transition-all hover:underline text-left [&[data-state=open]>svg]:rotate-180",
|
||||
className
|
||||
)}
|
||||
{...props}
|
||||
>
|
||||
{children}
|
||||
<ChevronDown className="h-4 w-4 shrink-0 text-muted-foreground transition-transform duration-200" />
|
||||
</AccordionPrimitive.Trigger>
|
||||
</AccordionPrimitive.Header>
|
||||
))
|
||||
AccordionTrigger.displayName = AccordionPrimitive.Trigger.displayName
|
||||
|
||||
const AccordionContent = React.forwardRef<
|
||||
React.ComponentRef<typeof AccordionPrimitive.Content>,
|
||||
React.ComponentPropsWithoutRef<typeof AccordionPrimitive.Content>
|
||||
>(({ className, children, ...props }, ref) => (
|
||||
<AccordionPrimitive.Content
|
||||
ref={ref}
|
||||
className="overflow-hidden text-sm data-[state=closed]:animate-accordion-up data-[state=open]:animate-accordion-down"
|
||||
{...props}
|
||||
>
|
||||
<div className={cn("pb-4 pt-0", className)}>{children}</div>
|
||||
</AccordionPrimitive.Content>
|
||||
))
|
||||
AccordionContent.displayName = AccordionPrimitive.Content.displayName
|
||||
|
||||
export { Accordion, AccordionItem, AccordionTrigger, AccordionContent }
|
||||
|
|
@ -1,66 +0,0 @@
|
|||
import * as React from "react"
|
||||
import { cva, type VariantProps } from "class-variance-authority"
|
||||
|
||||
import { cn } from "@/lib/utils"
|
||||
|
||||
const alertVariants = cva(
|
||||
"relative w-full rounded-lg border px-4 py-3 text-sm grid has-[>svg]:grid-cols-[calc(var(--spacing)*4)_1fr] grid-cols-[0_1fr] has-[>svg]:gap-x-3 gap-y-0.5 items-start [&>svg]:size-4 [&>svg]:translate-y-0.5 [&>svg]:text-current",
|
||||
{
|
||||
variants: {
|
||||
variant: {
|
||||
default: "bg-card text-card-foreground",
|
||||
destructive:
|
||||
"text-destructive bg-card [&>svg]:text-current *:data-[slot=alert-description]:text-destructive/90",
|
||||
},
|
||||
},
|
||||
defaultVariants: {
|
||||
variant: "default",
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
function Alert({
|
||||
className,
|
||||
variant,
|
||||
...props
|
||||
}: React.ComponentProps<"div"> & VariantProps<typeof alertVariants>) {
|
||||
return (
|
||||
<div
|
||||
data-slot="alert"
|
||||
role="alert"
|
||||
className={cn(alertVariants({ variant }), className)}
|
||||
{...props}
|
||||
/>
|
||||
)
|
||||
}
|
||||
|
||||
function AlertTitle({ className, ...props }: React.ComponentProps<"div">) {
|
||||
return (
|
||||
<div
|
||||
data-slot="alert-title"
|
||||
className={cn(
|
||||
"col-start-2 line-clamp-1 min-h-4 font-medium tracking-tight",
|
||||
className
|
||||
)}
|
||||
{...props}
|
||||
/>
|
||||
)
|
||||
}
|
||||
|
||||
function AlertDescription({
|
||||
className,
|
||||
...props
|
||||
}: React.ComponentProps<"div">) {
|
||||
return (
|
||||
<div
|
||||
data-slot="alert-description"
|
||||
className={cn(
|
||||
"text-muted-foreground col-start-2 grid justify-items-start gap-1 text-sm [&_p]:leading-relaxed",
|
||||
className
|
||||
)}
|
||||
{...props}
|
||||
/>
|
||||
)
|
||||
}
|
||||
|
||||
export { Alert, AlertTitle, AlertDescription }
|
||||
|
|
@ -1,46 +0,0 @@
|
|||
import * as React from "react"
|
||||
import { Slot } from "@radix-ui/react-slot"
|
||||
import { cva, type VariantProps } from "class-variance-authority"
|
||||
|
||||
import { cn } from "@/lib/utils"
|
||||
|
||||
const badgeVariants = cva(
|
||||
"inline-flex items-center justify-center rounded-md border px-2 py-0.5 text-xs font-medium w-fit whitespace-nowrap shrink-0 [&>svg]:size-3 gap-1 [&>svg]:pointer-events-none focus-visible:border-ring focus-visible:ring-ring/50 focus-visible:ring-[3px] aria-invalid:ring-destructive/20 dark:aria-invalid:ring-destructive/40 aria-invalid:border-destructive transition-[color,box-shadow] overflow-hidden",
|
||||
{
|
||||
variants: {
|
||||
variant: {
|
||||
default:
|
||||
"border-transparent bg-primary text-primary-foreground [a&]:hover:bg-primary/90",
|
||||
secondary:
|
||||
"border-transparent bg-secondary text-secondary-foreground [a&]:hover:bg-secondary/90",
|
||||
destructive:
|
||||
"border-transparent bg-destructive text-white [a&]:hover:bg-destructive/90 focus-visible:ring-destructive/20 dark:focus-visible:ring-destructive/40 dark:bg-destructive/60",
|
||||
outline:
|
||||
"text-foreground [a&]:hover:bg-accent [a&]:hover:text-accent-foreground",
|
||||
},
|
||||
},
|
||||
defaultVariants: {
|
||||
variant: "default",
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
function Badge({
|
||||
className,
|
||||
variant,
|
||||
asChild = false,
|
||||
...props
|
||||
}: React.ComponentProps<"span"> &
|
||||
VariantProps<typeof badgeVariants> & { asChild?: boolean }) {
|
||||
const Comp = asChild ? Slot : "span"
|
||||
|
||||
return (
|
||||
<Comp
|
||||
data-slot="badge"
|
||||
className={cn(badgeVariants({ variant }), className)}
|
||||
{...props}
|
||||
/>
|
||||
)
|
||||
}
|
||||
|
||||
export { Badge, badgeVariants }
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
import * as React from "react"
|
||||
import { Slot } from "@radix-ui/react-slot"
|
||||
import { ChevronRight, MoreHorizontal } from "lucide-react"
|
||||
import { ChevronRight } from "lucide-react"
|
||||
|
||||
import { cn } from "@/lib/utils"
|
||||
|
||||
|
|
@ -80,24 +80,6 @@ function BreadcrumbSeparator({
|
|||
)
|
||||
}
|
||||
|
||||
function BreadcrumbEllipsis({
|
||||
className,
|
||||
...props
|
||||
}: React.ComponentProps<"span">) {
|
||||
return (
|
||||
<span
|
||||
data-slot="breadcrumb-ellipsis"
|
||||
role="presentation"
|
||||
aria-hidden="true"
|
||||
className={cn("flex size-9 items-center justify-center", className)}
|
||||
{...props}
|
||||
>
|
||||
<MoreHorizontal className="size-4" />
|
||||
<span className="sr-only">More</span>
|
||||
</span>
|
||||
)
|
||||
}
|
||||
|
||||
export {
|
||||
Breadcrumb,
|
||||
BreadcrumbList,
|
||||
|
|
@ -105,5 +87,4 @@ export {
|
|||
BreadcrumbLink,
|
||||
BreadcrumbPage,
|
||||
BreadcrumbSeparator,
|
||||
BreadcrumbEllipsis,
|
||||
}
|
||||
|
|
|
|||
29
crawler/frontend/src/components/ui/checkbox.tsx
Normal file
29
crawler/frontend/src/components/ui/checkbox.tsx
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
"use client"
|
||||
|
||||
import * as React from "react"
|
||||
import * as CheckboxPrimitive from "@radix-ui/react-checkbox"
|
||||
import { Check } from "lucide-react"
|
||||
import { cn } from "@/lib/utils"
|
||||
|
||||
const Checkbox = React.forwardRef<
|
||||
React.ComponentRef<typeof CheckboxPrimitive.Root>,
|
||||
React.ComponentPropsWithoutRef<typeof CheckboxPrimitive.Root>
|
||||
>(({ className, ...props }, ref) => (
|
||||
<CheckboxPrimitive.Root
|
||||
ref={ref}
|
||||
className={cn(
|
||||
"peer h-4 w-4 shrink-0 rounded-sm border border-primary shadow focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:cursor-not-allowed disabled:opacity-50 data-[state=checked]:bg-primary data-[state=checked]:text-primary-foreground",
|
||||
className
|
||||
)}
|
||||
{...props}
|
||||
>
|
||||
<CheckboxPrimitive.Indicator
|
||||
className={cn("flex items-center justify-center text-current")}
|
||||
>
|
||||
<Check className="h-4 w-4" />
|
||||
</CheckboxPrimitive.Indicator>
|
||||
</CheckboxPrimitive.Root>
|
||||
))
|
||||
Checkbox.displayName = CheckboxPrimitive.Root.displayName
|
||||
|
||||
export { Checkbox }
|
||||
34
crawler/frontend/src/components/ui/slider.tsx
Normal file
34
crawler/frontend/src/components/ui/slider.tsx
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
"use client"
|
||||
|
||||
import * as React from "react"
|
||||
import * as SliderPrimitive from "@radix-ui/react-slider"
|
||||
import { cn } from "@/lib/utils"
|
||||
|
||||
const Slider = React.forwardRef<
|
||||
React.ComponentRef<typeof SliderPrimitive.Root>,
|
||||
React.ComponentPropsWithoutRef<typeof SliderPrimitive.Root>
|
||||
>(({ className, ...props }, ref) => (
|
||||
<SliderPrimitive.Root
|
||||
ref={ref}
|
||||
className={cn(
|
||||
"relative flex w-full touch-none select-none items-center",
|
||||
className
|
||||
)}
|
||||
{...props}
|
||||
>
|
||||
<SliderPrimitive.Track className="relative h-1.5 w-full grow overflow-hidden rounded-full bg-primary/20">
|
||||
<SliderPrimitive.Range className="absolute h-full bg-primary" />
|
||||
</SliderPrimitive.Track>
|
||||
{props.defaultValue?.map((_, index) => (
|
||||
<SliderPrimitive.Thumb
|
||||
key={index}
|
||||
className="block h-4 w-4 rounded-full border border-primary/50 bg-background shadow transition-colors focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:pointer-events-none disabled:opacity-50"
|
||||
/>
|
||||
)) ?? (
|
||||
<SliderPrimitive.Thumb className="block h-4 w-4 rounded-full border border-primary/50 bg-background shadow transition-colors focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:pointer-events-none disabled:opacity-50" />
|
||||
)}
|
||||
</SliderPrimitive.Root>
|
||||
))
|
||||
Slider.displayName = SliderPrimitive.Root.displayName
|
||||
|
||||
export { Slider }
|
||||
|
|
@ -118,3 +118,30 @@
|
|||
@apply bg-background text-foreground;
|
||||
}
|
||||
}
|
||||
|
||||
/* Accordion animations */
|
||||
@keyframes accordion-down {
|
||||
from {
|
||||
height: 0;
|
||||
}
|
||||
to {
|
||||
height: var(--radix-accordion-content-height);
|
||||
}
|
||||
}
|
||||
|
||||
@keyframes accordion-up {
|
||||
from {
|
||||
height: var(--radix-accordion-content-height);
|
||||
}
|
||||
to {
|
||||
height: 0;
|
||||
}
|
||||
}
|
||||
|
||||
.animate-accordion-down {
|
||||
animation: accordion-down 0.2s ease-out;
|
||||
}
|
||||
|
||||
.animate-accordion-up {
|
||||
animation: accordion-up 0.2s ease-out;
|
||||
}
|
||||
|
|
|
|||
62
crawler/frontend/src/services/apiClient.ts
Normal file
62
crawler/frontend/src/services/apiClient.ts
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
// Generic API client with authentication
|
||||
|
||||
import type { User } from 'oidc-client-ts';
|
||||
import { ApiError } from '@/types';
|
||||
|
||||
export interface RequestOptions {
|
||||
method?: 'GET' | 'POST' | 'PUT' | 'DELETE';
|
||||
params?: Record<string, string | number | boolean | Date | undefined>;
|
||||
}
|
||||
|
||||
/**
|
||||
* Build query string from parameters object
|
||||
*/
|
||||
function buildQueryString(params: Record<string, string | number | boolean | Date | undefined>): string {
|
||||
const queryString = new URLSearchParams();
|
||||
|
||||
for (const [key, value] of Object.entries(params)) {
|
||||
if (value !== undefined && value !== null && value !== '') {
|
||||
if (value instanceof Date) {
|
||||
queryString.append(key, value.toISOString());
|
||||
} else {
|
||||
queryString.append(key, String(value));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return queryString.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Generic authenticated API request
|
||||
*/
|
||||
export async function apiRequest<T>(
|
||||
user: User,
|
||||
endpoint: string,
|
||||
options: RequestOptions = {}
|
||||
): Promise<T> {
|
||||
const { method = 'GET', params } = options;
|
||||
const accessToken = user.access_token;
|
||||
|
||||
let url = endpoint;
|
||||
if (params) {
|
||||
const queryString = buildQueryString(params);
|
||||
if (queryString) {
|
||||
url = `${endpoint}?${queryString}`;
|
||||
}
|
||||
}
|
||||
|
||||
const response = await fetch(url, {
|
||||
method,
|
||||
headers: {
|
||||
Authorization: `Bearer ${accessToken}`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new ApiError(`Error: ${response.status}`, response.status);
|
||||
}
|
||||
|
||||
return response.json() as Promise<T>;
|
||||
}
|
||||
54
crawler/frontend/src/services/listingService.ts
Normal file
54
crawler/frontend/src/services/listingService.ts
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
// Listing service for fetching and refreshing listings
|
||||
|
||||
import type { User } from 'oidc-client-ts';
|
||||
import type { GeoJSONFeatureCollection, RefreshListingsResponse } from '@/types';
|
||||
import type { ParameterValues } from '@/components/FilterPanel';
|
||||
import { apiRequest } from './apiClient';
|
||||
import { API_ENDPOINTS } from '@/constants';
|
||||
|
||||
/**
|
||||
* Build listing query parameters from form values
|
||||
*/
|
||||
function buildListingParams(parameters: ParameterValues): Record<string, string | number | boolean | Date | undefined> {
|
||||
return {
|
||||
listing_type: parameters.listing_type,
|
||||
min_bedrooms: parameters.min_bedrooms,
|
||||
max_bedrooms: parameters.max_bedrooms,
|
||||
max_price: parameters.max_price,
|
||||
min_price: parameters.min_price,
|
||||
min_sqm: parameters.min_sqm,
|
||||
max_sqm: parameters.max_sqm,
|
||||
min_price_per_sqm: parameters.min_price_per_sqm,
|
||||
max_price_per_sqm: parameters.max_price_per_sqm,
|
||||
last_seen_days: parameters.last_seen_days,
|
||||
let_date_available_from: parameters.available_from,
|
||||
district_names: parameters.district || undefined,
|
||||
furnish_types: parameters.furnish_types?.join(',') || undefined,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch listing data as GeoJSON
|
||||
*/
|
||||
export async function fetchListingGeoJSON(
|
||||
user: User,
|
||||
parameters: ParameterValues
|
||||
): Promise<GeoJSONFeatureCollection> {
|
||||
return apiRequest<GeoJSONFeatureCollection>(user, API_ENDPOINTS.LISTING_GEOJSON, {
|
||||
method: 'GET',
|
||||
params: buildListingParams(parameters),
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Trigger a listing refresh task
|
||||
*/
|
||||
export async function refreshListings(
|
||||
user: User,
|
||||
parameters: ParameterValues
|
||||
): Promise<RefreshListingsResponse> {
|
||||
return apiRequest<RefreshListingsResponse>(user, API_ENDPOINTS.REFRESH_LISTINGS, {
|
||||
method: 'POST',
|
||||
params: buildListingParams(parameters),
|
||||
});
|
||||
}
|
||||
45
crawler/frontend/src/utils/mapUtils.ts
Normal file
45
crawler/frontend/src/utils/mapUtils.ts
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
// Map utility functions
|
||||
|
||||
/**
|
||||
* Deep clone an object using JSON serialization
|
||||
*/
|
||||
export function clone<T>(obj: T): T {
|
||||
return JSON.parse(JSON.stringify(obj));
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate the value at a given percentile in a sorted array
|
||||
* @param arr Sorted array of numbers
|
||||
* @param p Percentile (0-1)
|
||||
*/
|
||||
export function percentile(arr: number[], p: number): number {
|
||||
if (arr.length === 0) return 0;
|
||||
if (typeof p !== 'number') throw new TypeError('p must be a number');
|
||||
if (p <= 0) return arr[0];
|
||||
if (p >= 1) return arr[arr.length - 1];
|
||||
|
||||
const index = arr.length * p;
|
||||
const lower = Math.floor(index);
|
||||
const upper = lower + 1;
|
||||
const weight = index % 1;
|
||||
|
||||
if (upper >= arr.length) return arr[lower];
|
||||
return arr[lower] * (1 - weight) + arr[upper] * weight;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert percentage-based color stops to value-based color stops
|
||||
* @param colorStopsPerc Array of [percentage, color] tuples
|
||||
* @param min Minimum value
|
||||
* @param max Maximum value
|
||||
*/
|
||||
export function calculateColorStops(
|
||||
colorStopsPerc: [number, string][],
|
||||
min: number,
|
||||
max: number
|
||||
): [number, string][] {
|
||||
return colorStopsPerc.map(([perc, color]) => [
|
||||
min + (perc * (max - min)) / 100,
|
||||
color,
|
||||
]);
|
||||
}
|
||||
|
|
@ -1 +1 @@
|
|||
{"root":["./src/App.tsx","./src/AppSidebar.tsx","./src/main.tsx","./src/vite-env.d.ts","./src/auth/authService.ts","./src/auth/config.ts","./src/components/ActiveQuery.tsx","./src/components/AlertError.tsx","./src/components/LoginModal.tsx","./src/components/Map.tsx","./src/components/Parameters.tsx","./src/components/ui/alert-dialog.tsx","./src/components/ui/alert.tsx","./src/components/ui/badge.tsx","./src/components/ui/breadcrumb.tsx","./src/components/ui/button.tsx","./src/components/ui/dialog.tsx","./src/components/ui/form.tsx","./src/components/ui/hover-card.tsx","./src/components/ui/input.tsx","./src/components/ui/label.tsx","./src/components/ui/progress.tsx","./src/components/ui/scroll-area.tsx","./src/components/ui/select.tsx","./src/components/ui/separator.tsx","./src/components/ui/sheet.tsx","./src/components/ui/sidebar.tsx","./src/components/ui/skeleton.tsx","./src/components/ui/tooltip.tsx","./src/hooks/use-mobile.ts","./src/lib/utils.ts"],"version":"5.8.3"}
|
||||
{"root":["./src/App.tsx","./src/AppSidebar.tsx","./src/main.tsx","./src/vite-env.d.ts","./src/auth/authService.ts","./src/auth/config.ts","./src/auth/errors.ts","./src/components/ActiveQuery.tsx","./src/components/AlertError.tsx","./src/components/AuthCallback.tsx","./src/components/FilterPanel.tsx","./src/components/Header.tsx","./src/components/HealthIndicator.tsx","./src/components/ListView.tsx","./src/components/LoginModal.tsx","./src/components/Map.tsx","./src/components/Parameters.tsx","./src/components/PropertyCard.tsx","./src/components/Spinner.tsx","./src/components/StatsBar.tsx","./src/components/StreamingProgressBar.tsx","./src/components/TaskIndicator.tsx","./src/components/ui/DatePicker.tsx","./src/components/ui/accordion.tsx","./src/components/ui/alert-dialog.tsx","./src/components/ui/breadcrumb.tsx","./src/components/ui/button.tsx","./src/components/ui/calendar.tsx","./src/components/ui/checkbox.tsx","./src/components/ui/dialog.tsx","./src/components/ui/form.tsx","./src/components/ui/hover-card.tsx","./src/components/ui/input.tsx","./src/components/ui/label.tsx","./src/components/ui/popover.tsx","./src/components/ui/progress.tsx","./src/components/ui/scroll-area.tsx","./src/components/ui/select.tsx","./src/components/ui/separator.tsx","./src/components/ui/sheet.tsx","./src/components/ui/sidebar.tsx","./src/components/ui/skeleton.tsx","./src/components/ui/slider.tsx","./src/components/ui/tooltip.tsx","./src/constants/colorSchemes.ts","./src/constants/index.ts","./src/hooks/use-mobile.ts","./src/lib/utils.ts","./src/services/apiClient.ts","./src/services/healthService.ts","./src/services/index.ts","./src/services/listingService.ts","./src/services/streamingService.ts","./src/services/taskService.ts","./src/types/index.ts","./src/utils/mapUtils.ts"],"version":"5.8.3"}
|
||||
|
|
@ -19,7 +19,8 @@ export default defineConfig({
|
|||
allowedHosts: [
|
||||
env.DEV_HOST ?? 'localhost',
|
||||
// Add more hosts here
|
||||
'wrongmove.viktorbarzin.me'
|
||||
'wrongmove.viktorbarzin.me',
|
||||
'devvm.viktorbarzin.lan'
|
||||
],
|
||||
}
|
||||
})
|
||||
|
|
|
|||
319
crawler/main.py
319
crawler/main.py
|
|
@ -1,28 +1,28 @@
|
|||
"""CLI entry point for the Real Estate Crawler."""
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
import os
|
||||
import pathlib
|
||||
from typing import Callable, ParamSpec, TypeVar
|
||||
import click
|
||||
import importlib
|
||||
|
||||
from models.listing import FurnishType, ListingType, QueryParameters
|
||||
from rec.districts import get_districts
|
||||
from data_access import Listing
|
||||
import csv_exporter
|
||||
from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode
|
||||
from repositories.listing_repository import ListingRepository
|
||||
from ui_exporter import export_immoweb as export_immoweb_ui
|
||||
from functools import wraps
|
||||
from database import engine
|
||||
from services import (
|
||||
listing_service,
|
||||
export_service,
|
||||
district_service,
|
||||
)
|
||||
|
||||
P = ParamSpec("P")
|
||||
R = TypeVar("R")
|
||||
|
||||
|
||||
dump_listings_module = importlib.import_module("1_dump_listings")
|
||||
dump_images_module = importlib.import_module("3_dump_images")
|
||||
detect_floorplan_module = importlib.import_module("4_detect_floorplan")
|
||||
routing_module = importlib.import_module("5_routing")
|
||||
|
||||
|
||||
def listing_filter_options(func):
|
||||
def listing_filter_options(func: Callable[P, R]) -> Callable[P, R]:
|
||||
"""Decorator to add common options for filtering listings."""
|
||||
|
||||
@click.option(
|
||||
|
|
@ -45,7 +45,7 @@ def listing_filter_options(func):
|
|||
"--max-bedrooms",
|
||||
default=10,
|
||||
help="Maximum number of bedrooms",
|
||||
type=click.IntRange(min=1, max=10), # Right move gets unhappy with >10
|
||||
type=click.IntRange(min=1, max=10),
|
||||
)
|
||||
@click.option(
|
||||
"--min-price",
|
||||
|
|
@ -57,13 +57,13 @@ def listing_filter_options(func):
|
|||
"--max-price",
|
||||
default=999_999,
|
||||
help="Maximum price",
|
||||
type=click.IntRange(min=0), # 40k for renting
|
||||
type=click.IntRange(min=0),
|
||||
)
|
||||
@click.option(
|
||||
"--district",
|
||||
default=None,
|
||||
help="Districts to scrape",
|
||||
type=click.Choice(get_districts().keys(), case_sensitive=False),
|
||||
type=click.Choice(district_service.get_district_names(), case_sensitive=False),
|
||||
multiple=True,
|
||||
)
|
||||
@click.option(
|
||||
|
|
@ -95,17 +95,50 @@ def listing_filter_options(func):
|
|||
type=int,
|
||||
)
|
||||
@wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
|
||||
return func(*args, **kwargs)
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
def build_query_parameters(
|
||||
type: str,
|
||||
district: list[str],
|
||||
min_bedrooms: int,
|
||||
max_bedrooms: int,
|
||||
min_price: int,
|
||||
max_price: int,
|
||||
furnish_types: list[str],
|
||||
available_from: datetime | None,
|
||||
last_seen_days: int,
|
||||
min_sqm: int | None = None,
|
||||
radius: int = 0,
|
||||
page_size: int = 500,
|
||||
max_days_since_added: int = 14,
|
||||
) -> QueryParameters:
|
||||
"""Build QueryParameters from CLI options."""
|
||||
return QueryParameters(
|
||||
listing_type=ListingType[type],
|
||||
district_names=set(district) if district else None,
|
||||
min_bedrooms=min_bedrooms,
|
||||
max_bedrooms=max_bedrooms,
|
||||
min_price=min_price,
|
||||
max_price=max_price,
|
||||
furnish_types=[FurnishType[ft] for ft in furnish_types] if furnish_types else None,
|
||||
let_date_available_from=available_from,
|
||||
last_seen_days=last_seen_days,
|
||||
min_sqm=min_sqm,
|
||||
radius=radius,
|
||||
page_size=page_size,
|
||||
max_days_since_added=max_days_since_added,
|
||||
)
|
||||
|
||||
|
||||
@click.group()
|
||||
@click.option(
|
||||
"--data-dir",
|
||||
default=pathlib.Path("data/rs/"),
|
||||
help="Districts to scrape",
|
||||
help="Data directory for storing listings",
|
||||
type=click.Path(
|
||||
writable=True,
|
||||
file_okay=False,
|
||||
|
|
@ -114,17 +147,18 @@ def listing_filter_options(func):
|
|||
),
|
||||
)
|
||||
@click.pass_context
|
||||
def cli(ctx, data_dir: str):
|
||||
def cli(ctx: click.Context, data_dir: str) -> None:
|
||||
ctx.ensure_object(dict)
|
||||
ctx.obj["data_dir"] = data_dir
|
||||
ctx.obj["data_dir"] = pathlib.Path(data_dir)
|
||||
ctx.obj["repository"] = ListingRepository(engine=engine)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@listing_filter_options
|
||||
@click.option("--full", is_flag=True)
|
||||
@click.option("--full", is_flag=True, help="Include images and floorplan detection")
|
||||
@click.pass_context
|
||||
def dump_listings(
|
||||
ctx: click.core.Context,
|
||||
ctx: click.Context,
|
||||
full: bool,
|
||||
district: list[str],
|
||||
min_bedrooms: int,
|
||||
|
|
@ -136,58 +170,63 @@ def dump_listings(
|
|||
available_from: datetime | None,
|
||||
last_seen_days: int,
|
||||
min_sqm: int | None = None,
|
||||
):
|
||||
data_dir: str = ctx.obj["data_dir"]
|
||||
query_parameters = QueryParameters(
|
||||
listing_type=ListingType[type],
|
||||
district_names=set(district),
|
||||
) -> None:
|
||||
"""Fetch listings from Rightmove API."""
|
||||
data_dir: pathlib.Path = ctx.obj["data_dir"]
|
||||
repository: ListingRepository = ctx.obj["repository"]
|
||||
|
||||
query_parameters = build_query_parameters(
|
||||
type=type,
|
||||
district=district,
|
||||
min_bedrooms=min_bedrooms,
|
||||
max_bedrooms=max_bedrooms,
|
||||
min_price=min_price,
|
||||
max_price=max_price,
|
||||
furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types],
|
||||
let_date_available_from=available_from,
|
||||
furnish_types=furnish_types,
|
||||
available_from=available_from,
|
||||
last_seen_days=last_seen_days,
|
||||
min_sqm=min_sqm,
|
||||
radius=0,
|
||||
page_size=500,
|
||||
max_days_since_added=14,
|
||||
)
|
||||
click.echo(
|
||||
f"Running dump_listings for districts {district}, data dir {data_dir} and parameters: "
|
||||
f"{query_parameters}"
|
||||
|
||||
click.echo(f"Fetching listings with parameters: {query_parameters}")
|
||||
|
||||
result = asyncio.run(
|
||||
listing_service.refresh_listings(
|
||||
repository,
|
||||
query_parameters,
|
||||
full=full,
|
||||
async_mode=False,
|
||||
)
|
||||
)
|
||||
data_dir_path = pathlib.Path(data_dir)
|
||||
repository = ListingRepository(engine=engine)
|
||||
if not full: # only listings
|
||||
asyncio.run(
|
||||
dump_listings_module.dump_listings(
|
||||
query_parameters, repository, data_dir_path
|
||||
)
|
||||
)
|
||||
else: # include images, floorplan detection etc.
|
||||
asyncio.run(
|
||||
dump_listings_module.dump_listings_full(
|
||||
query_parameters, repository, data_dir_path
|
||||
)
|
||||
)
|
||||
|
||||
click.echo(result.message)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.pass_context
|
||||
def dump_images(ctx: click.core.Context):
|
||||
data_dir = ctx.obj["data_dir"]
|
||||
click.echo(f"Running dump_images for listings stored in {engine.url}")
|
||||
repository = ListingRepository(engine=engine)
|
||||
asyncio.run(dump_images_module.dump_images(repository, image_base_path=data_dir))
|
||||
def dump_images(ctx: click.Context) -> None:
|
||||
"""Download floorplan images for all listings."""
|
||||
data_dir: pathlib.Path = ctx.obj["data_dir"]
|
||||
repository: ListingRepository = ctx.obj["repository"]
|
||||
|
||||
click.echo(f"Downloading images to {data_dir}")
|
||||
|
||||
count = asyncio.run(listing_service.download_images(repository, data_dir))
|
||||
|
||||
click.echo(f"Processed {count} listings")
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.pass_context
|
||||
def detect_floorplan(ctx: click.core.Context):
|
||||
click.echo(f"Running detect_floorplan for listings stored in {engine.url}")
|
||||
repository = ListingRepository(engine=engine)
|
||||
asyncio.run(detect_floorplan_module.detect_floorplan(repository))
|
||||
def detect_floorplan(ctx: click.Context) -> None:
|
||||
"""Run OCR on floorplan images to detect square meters."""
|
||||
repository: ListingRepository = ctx.obj["repository"]
|
||||
|
||||
click.echo("Running floorplan detection...")
|
||||
|
||||
count = asyncio.run(listing_service.detect_floorplans(repository))
|
||||
|
||||
click.echo(f"Processed {count} listings")
|
||||
|
||||
|
||||
@cli.command()
|
||||
|
|
@ -202,10 +241,7 @@ def detect_floorplan(ctx: click.core.Context):
|
|||
"--travel-mode",
|
||||
"-m",
|
||||
help="Travel mode for routing",
|
||||
type=click.Choice(
|
||||
TravelMode.__members__.keys(),
|
||||
case_sensitive=False,
|
||||
),
|
||||
type=click.Choice(TravelMode.__members__.keys(), case_sensitive=False),
|
||||
required=True,
|
||||
)
|
||||
@click.option(
|
||||
|
|
@ -213,65 +249,50 @@ def detect_floorplan(ctx: click.core.Context):
|
|||
"-l",
|
||||
help="Limit the number of listings to process",
|
||||
type=click.IntRange(min=1),
|
||||
default=1, # by default limit to 1 to avoid accidental API usage
|
||||
default=1,
|
||||
)
|
||||
@click.pass_context
|
||||
def routing(
|
||||
ctx: click.core.Context, destination_address: str, travel_mode: str, limit: int
|
||||
):
|
||||
data_dir = ctx.obj["data_dir"]
|
||||
click.echo(f"Running routing for the first {limit} listings in {data_dir}")
|
||||
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
|
||||
listing_paths = listing_paths[:limit]
|
||||
ctx: click.Context,
|
||||
destination_address: str,
|
||||
travel_mode: str,
|
||||
limit: int,
|
||||
) -> None:
|
||||
"""Calculate transit routes for listings."""
|
||||
repository: ListingRepository = ctx.obj["repository"]
|
||||
|
||||
if os.environ.get(API_KEY_ENVIRONMENT_VARIABLE) is None:
|
||||
raise click.exceptions.MissingParameter(
|
||||
f"{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set. "
|
||||
"Please set it to your API key for the routing service."
|
||||
raise click.ClickException(
|
||||
f"{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set."
|
||||
)
|
||||
|
||||
repository = ListingRepository(engine=engine)
|
||||
asyncio.run(
|
||||
routing_module.calculate_route(
|
||||
click.echo(f"Calculating routes to '{destination_address}' for {limit} listings")
|
||||
|
||||
count = asyncio.run(
|
||||
listing_service.calculate_routes(
|
||||
repository,
|
||||
destination_address,
|
||||
# destination_address_coordinates,
|
||||
TravelMode[travel_mode],
|
||||
travel_mode,
|
||||
limit=limit,
|
||||
)
|
||||
)
|
||||
|
||||
click.echo(f"Processed {count} listings")
|
||||
|
||||
|
||||
@cli.command()
|
||||
# @click.option(
|
||||
# "--columns",
|
||||
# "-C",
|
||||
# help="Columns to include in the CSV file",
|
||||
# type=click.Choice(
|
||||
# # csv_exporter.get_columns_from_listings(),
|
||||
# [1],
|
||||
# case_sensitive=False,
|
||||
# ),
|
||||
# multiple=True,
|
||||
# default=Listing.ALL_COLUMNS,
|
||||
# )
|
||||
@click.option(
|
||||
"--output-file",
|
||||
"-O",
|
||||
help="Path to the output CSV file",
|
||||
required=True,
|
||||
type=click.Path(
|
||||
writable=True,
|
||||
file_okay=True,
|
||||
dir_okay=False,
|
||||
resolve_path=True,
|
||||
),
|
||||
type=click.Path(writable=True, file_okay=True, dir_okay=False, resolve_path=True),
|
||||
)
|
||||
@click.pass_context
|
||||
@listing_filter_options
|
||||
@click.pass_context
|
||||
def export_csv(
|
||||
ctx: click.core.Context,
|
||||
ctx: click.Context,
|
||||
output_file: str,
|
||||
# columns: tuple[str],
|
||||
district: list[str],
|
||||
min_bedrooms: int,
|
||||
max_bedrooms: int,
|
||||
|
|
@ -282,53 +303,48 @@ def export_csv(
|
|||
available_from: datetime | None,
|
||||
last_seen_days: int,
|
||||
min_sqm: int | None = None,
|
||||
):
|
||||
# use model
|
||||
data_dir = ctx.obj["data_dir"]
|
||||
query_parameters = QueryParameters(
|
||||
listing_type=ListingType[type],
|
||||
district_names=set(district),
|
||||
) -> None:
|
||||
"""Export listings to CSV file."""
|
||||
repository: ListingRepository = ctx.obj["repository"]
|
||||
|
||||
query_parameters = build_query_parameters(
|
||||
type=type,
|
||||
district=district,
|
||||
min_bedrooms=min_bedrooms,
|
||||
max_bedrooms=max_bedrooms,
|
||||
min_price=min_price,
|
||||
max_price=max_price,
|
||||
furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types],
|
||||
let_date_available_from=available_from,
|
||||
furnish_types=furnish_types,
|
||||
available_from=available_from,
|
||||
last_seen_days=last_seen_days,
|
||||
min_sqm=min_sqm,
|
||||
)
|
||||
click.echo(
|
||||
f"Exporting data to {output_file} using {data_dir=} and query parameters: {query_parameters}"
|
||||
)
|
||||
output_file_path = pathlib.Path(output_file)
|
||||
repository = ListingRepository(engine=engine)
|
||||
asyncio.run(
|
||||
csv_exporter.export_to_csv(
|
||||
|
||||
click.echo(f"Exporting to {output_file}")
|
||||
|
||||
result = asyncio.run(
|
||||
export_service.export_to_csv(
|
||||
repository,
|
||||
output_file_path,
|
||||
# list(columns),
|
||||
query_parameters=query_parameters,
|
||||
),
|
||||
pathlib.Path(output_file),
|
||||
query_parameters,
|
||||
)
|
||||
)
|
||||
|
||||
click.echo(result.message)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.option(
|
||||
"--output-file",
|
||||
"-O",
|
||||
help="Path to the output immoweb file",
|
||||
help="Path to the output GeoJSON file",
|
||||
required=True,
|
||||
type=click.Path(
|
||||
writable=True,
|
||||
file_okay=True,
|
||||
dir_okay=False,
|
||||
resolve_path=True,
|
||||
),
|
||||
type=click.Path(writable=True, file_okay=True, dir_okay=False, resolve_path=True),
|
||||
)
|
||||
@listing_filter_options
|
||||
@click.pass_context
|
||||
def export_immoweb(
|
||||
ctx: click.core.Context,
|
||||
ctx: click.Context,
|
||||
output_file: str,
|
||||
district: list[str],
|
||||
min_bedrooms: int,
|
||||
|
|
@ -340,39 +356,62 @@ def export_immoweb(
|
|||
available_from: datetime | None,
|
||||
last_seen_days: int,
|
||||
min_sqm: int | None = None,
|
||||
):
|
||||
query_parameters = QueryParameters(
|
||||
listing_type=ListingType[type],
|
||||
district_names=set(district),
|
||||
) -> None:
|
||||
"""Export listings to GeoJSON file for map visualization."""
|
||||
repository: ListingRepository = ctx.obj["repository"]
|
||||
|
||||
query_parameters = build_query_parameters(
|
||||
type=type,
|
||||
district=district,
|
||||
min_bedrooms=min_bedrooms,
|
||||
max_bedrooms=max_bedrooms,
|
||||
min_price=min_price,
|
||||
max_price=max_price,
|
||||
furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types],
|
||||
let_date_available_from=available_from,
|
||||
furnish_types=furnish_types,
|
||||
available_from=available_from,
|
||||
last_seen_days=last_seen_days,
|
||||
min_sqm=min_sqm,
|
||||
)
|
||||
click.echo(
|
||||
f"Exporting data to {output_file} for listings stored in {engine.url} that match the query parameters: {query_parameters}"
|
||||
|
||||
click.echo(f"Exporting to {output_file}")
|
||||
|
||||
result = asyncio.run(
|
||||
export_service.export_to_geojson(
|
||||
repository,
|
||||
query_parameters=query_parameters,
|
||||
output_path=pathlib.Path(output_file),
|
||||
)
|
||||
)
|
||||
repository = ListingRepository(engine=engine)
|
||||
asyncio.run(export_immoweb_ui(repository, output_file, query_parameters))
|
||||
|
||||
click.echo(result.message)
|
||||
|
||||
|
||||
@cli.command()
|
||||
@click.pass_context
|
||||
def populate_db(
|
||||
ctx: click.core.Context,
|
||||
):
|
||||
data_dir = ctx.obj["data_dir"]
|
||||
click.echo(f"Populating the database with data from {data_dir}")
|
||||
repository = ListingRepository(engine=engine)
|
||||
def populate_db(ctx: click.Context) -> None:
|
||||
"""Populate database from filesystem data (legacy migration)."""
|
||||
data_dir: pathlib.Path = ctx.obj["data_dir"]
|
||||
repository: ListingRepository = ctx.obj["repository"]
|
||||
|
||||
click.echo(f"Populating database from {data_dir}")
|
||||
|
||||
listings = Listing.get_all_listings(
|
||||
[path for path in pathlib.Path(data_dir).glob("*/listing.json")]
|
||||
[path for path in data_dir.glob("*/listing.json")]
|
||||
)
|
||||
|
||||
asyncio.run(repository.upsert_listings_legacy(listings))
|
||||
|
||||
click.echo(f"Imported {len(listings)} listings")
|
||||
|
||||
|
||||
@cli.command()
|
||||
def list_districts() -> None:
|
||||
"""List all available districts."""
|
||||
districts = district_service.get_all_districts()
|
||||
click.echo(f"Available districts ({len(districts)}):")
|
||||
for name in sorted(districts.keys()):
|
||||
click.echo(f" - {name}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli()
|
||||
|
|
|
|||
|
|
@ -1,40 +0,0 @@
|
|||
def record():
|
||||
from rec.query import listing_query, detail_query
|
||||
import json
|
||||
|
||||
page = 1
|
||||
listing = listing_query(page, 2, 2, 5, 200000, 500000)
|
||||
with open(
|
||||
f"/Users/kadir/code/realestate/crawler/code/json/queries/listing{page}.json",
|
||||
"w",
|
||||
) as f:
|
||||
json.dump(listing, f)
|
||||
|
||||
for prop in listing["properties"]:
|
||||
identifier = prop["identifier"]
|
||||
resp = detail_query(identifier)
|
||||
# print(identifier, resp.status_code)
|
||||
with open(
|
||||
f"/Users/kadir/code/realestate/crawler/code/json/queries/detail_{identifier}.json",
|
||||
"w",
|
||||
) as f:
|
||||
json.dump(resp, f)
|
||||
|
||||
|
||||
def process():
|
||||
import json
|
||||
import pathlib
|
||||
|
||||
path = pathlib.Path("/Users/kadir/code/realestate/crawler/code/json/queries/")
|
||||
|
||||
detailjsons = list(path.glob("detail_*json"))
|
||||
for file in detailjsons:
|
||||
with open(file) as f:
|
||||
js = json.load(f)
|
||||
|
||||
for floorplan in js["property"]["floorplans"]:
|
||||
print(floorplan["url"])
|
||||
|
||||
|
||||
# record()
|
||||
process()
|
||||
|
|
@ -5,7 +5,7 @@ from datetime import datetime, timedelta
|
|||
import enum
|
||||
import json
|
||||
from typing import Any, Dict, List
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, Field as PydanticField
|
||||
from rec import routing
|
||||
from sqlmodel import JSON, TEXT, SQLModel, Field
|
||||
|
||||
|
|
@ -80,7 +80,10 @@ class Listing(SQLModel, table=False):
|
|||
|
||||
@property
|
||||
def is_removed(self) -> bool:
|
||||
return not self.additional_info["property"]["visible"]
|
||||
if not self.additional_info:
|
||||
return False
|
||||
property_info = self.additional_info.get("property", {})
|
||||
return not property_info.get("visible", True)
|
||||
|
||||
@property
|
||||
def price_per_square_meter(self) -> float | None:
|
||||
|
|
@ -231,14 +234,16 @@ class ListingType(enum.StrEnum):
|
|||
RENT = "RENT"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class QueryParameters(BaseModel):
|
||||
"""Query parameters for filtering listings."""
|
||||
model_config = {"frozen": True}
|
||||
|
||||
listing_type: ListingType
|
||||
min_bedrooms: int = 1
|
||||
max_bedrooms: int = 999
|
||||
min_price: int = 0
|
||||
max_price: int = 10_000_000
|
||||
district_names: set[str] = dataclasses.field(default_factory=set)
|
||||
district_names: set[str] = PydanticField(default_factory=set)
|
||||
radius: float = 0
|
||||
page_size: int = 500 # items per page
|
||||
max_days_since_added: int = 14 # for buy listings
|
||||
|
|
|
|||
36
crawler/poetry.lock
generated
36
crawler/poetry.lock
generated
|
|
@ -120,6 +120,22 @@ yarl = ">=1.17.0,<2.0"
|
|||
[package.extras]
|
||||
speedups = ["Brotli ; platform_python_implementation == \"CPython\"", "aiodns (>=3.3.0)", "brotlicffi ; platform_python_implementation != \"CPython\""]
|
||||
|
||||
[[package]]
|
||||
name = "aiohttp-socks"
|
||||
version = "0.8.4"
|
||||
description = "Proxy connector for aiohttp"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "aiohttp_socks-0.8.4-py3-none-any.whl", hash = "sha256:74b21105634ed31d56ed6fee43701ca16218b53475e606d56950a4d17e8290ea"},
|
||||
{file = "aiohttp_socks-0.8.4.tar.gz", hash = "sha256:6b611d4ce838e9cf2c2fed5e0dba447cc84824a6cba95dc5747606201da46cb4"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
aiohttp = ">=2.3.2"
|
||||
python-socks = {version = ">=2.4.3,<3.0.0", extras = ["asyncio"]}
|
||||
|
||||
[[package]]
|
||||
name = "aioresponses"
|
||||
version = "0.7.8"
|
||||
|
|
@ -4246,6 +4262,24 @@ files = [
|
|||
{file = "python_multipart-0.0.20.tar.gz", hash = "sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "python-socks"
|
||||
version = "2.8.0"
|
||||
description = "Proxy (SOCKS4, SOCKS5, HTTP CONNECT) client for Python"
|
||||
optional = false
|
||||
python-versions = ">=3.8.0"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "python_socks-2.8.0-py3-none-any.whl", hash = "sha256:57c24b416569ccea493a101d38b0c82ed54be603aa50b6afbe64c46e4a4e4315"},
|
||||
{file = "python_socks-2.8.0.tar.gz", hash = "sha256:340f82778b20a290bdd538ee47492978d603dff7826aaf2ce362d21ad9ee6f1b"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
anyio = ["anyio (>=3.3.4,<5.0.0)"]
|
||||
asyncio = ["async-timeout (>=4.0) ; python_version < \"3.11\""]
|
||||
curio = ["curio (>=1.4)"]
|
||||
trio = ["trio (>=0.24)"]
|
||||
|
||||
[[package]]
|
||||
name = "pytz"
|
||||
version = "2025.2"
|
||||
|
|
@ -6203,4 +6237,4 @@ type = ["pytest-mypy"]
|
|||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = ">3.11"
|
||||
content-hash = "10a74594d9f695ab1077ff992bcd012b93b174b25c3f2ca681d6308653abbd14"
|
||||
content-hash = "6f9ce2af71a995db179aa4fb682e8a9ccde59566d14e26c7b0dbf4edc8d8e583"
|
||||
|
|
|
|||
|
|
@ -1,13 +0,0 @@
|
|||
import requests

# Minimal headers mimicking the Rightmove Android app's HTTP client.
headers = {
    "Host": "media.rightmove.co.uk",
    # 'Accept-Encoding': 'gzip, deflate, br',
    "User-Agent": "okhttp/4.10.0",
}

# One-off probe: fetch a single listing image from the media CDN.
# NOTE(review): verify=False disables TLS certificate verification — fine for a
# throwaway experiment, not for production use.
response = requests.get(
    "https://media.rightmove.co.uk/47k/46001/138680705/46001_32532509_IMG_00_0000.jpeg",
    headers=headers,
    verify=False,
)
|
||||
|
|
@ -1,67 +0,0 @@
|
|||
import requests

# Headers mimicking the Rightmove Android app client (Host pinning plus the
# okhttp User-Agent the official app sends).
headers = {
    "Host": "api.rightmove.co.uk",
    # 'Accept-Encoding': 'gzip, deflate, br',
    "User-Agent": "okhttp/4.10.0",
    "Connection": "close",
}

# Page 1 of a BUY search around a postcode: 2-bed flats, 150k-500k, new-home
# only, excluding shared ownership and retirement properties.
params = {
    "locationIdentifier": "POSTCODE^4228216",
    "channel": "BUY",
    "page": "1",
    "numberOfPropertiesPerPage": "25",
    "radius": "3.0",
    "sortBy": "distance",
    "includeUnavailableProperties": "false",
    "propertyTypes": "flat",
    "mustHave": "newHome",  # added manually later
    "dontShow": "sharedOwnership,retirement",
    "minPrice": "150000",
    "maxPrice": "500000",
    "minBedrooms": "2",
    "maxBedrooms": "2",
    "apiApplication": "ANDROID",
    "appVersion": "3.70.0",
}

# NOTE(review): verify=False disables TLS certificate verification — acceptable
# only for this throwaway probe.
response = requests.get(
    "https://api.rightmove.co.uk/api/property-listing",
    params=params,
    headers=headers,
    verify=False,
)


# Second probe: page 2 with a wider price cap (600k) and no newHome filter.
headers = {
    "Host": "api.rightmove.co.uk",
    # 'Accept-Encoding': 'gzip, deflate, br',
    "User-Agent": "okhttp/4.10.0",
    "Connection": "close",
}

params = {
    "locationIdentifier": "POSTCODE^4228216",
    "channel": "BUY",
    "page": "2",
    "numberOfPropertiesPerPage": "25",
    "radius": "3.0",
    "sortBy": "distance",
    "includeUnavailableProperties": "false",
    "propertyTypes": "flat",
    "dontShow": "sharedOwnership,retirement",
    "minPrice": "150000",
    "maxPrice": "600000",
    "minBedrooms": "2",
    "maxBedrooms": "2",
    "apiApplication": "ANDROID",
    "appVersion": "3.70.0",
}

response = requests.get(
    "https://api.rightmove.co.uk/api/property-listing",
    params=params,
    headers=headers,
    verify=False,
)
|
||||
|
|
@ -1,22 +0,0 @@
|
|||
import requests

# NOTE(review): hard-coded Google Maps API key committed to the repo — rotate
# it and load it from the environment before reusing this snippet.
API_KEY = "AIzaSyBoBHzeQFgR7O-NlNsuHXQcC1B7ccEHpl8"
url = "https://maps.googleapis.com/maps/api/distancematrix/json"
origin = "51.5636306598907,-0.11061106079085892"
dest = "51.53836609846008,-0.12743940233824352"

params = {
    "origins": origin,
    "destinations": dest,
    "key": API_KEY,
    "departure_time": "",  # timestamp, optional
    "mode": "transit",
}

# One-off probe of the Distance Matrix API; dumps the raw response to disk for
# later offline inspection.
r = requests.get(url, params=params)
print(r.status_code)

print(r.json())

with open("code/json/routing_distancematrix.json", "w") as f:
    f.write(r.text)
|
||||
|
|
@ -1,83 +0,0 @@
|
|||
import requests
|
||||
from utils import nextMonday
|
||||
from collections import defaultdict
|
||||
|
||||
API_KEY = "AIzaSyBoBHzeQFgR7O-NlNsuHXQcC1B7ccEHpl8"
|
||||
url = "https://routes.googleapis.com/directions/v2:computeRoutes"
|
||||
|
||||
|
||||
def travel_time(origin_lat: float, origin_lon: float, dest_lat: float, dest_lon: float):
    """Query the Google Routes API for a transit route between two coordinates.

    Departure is pinned to next Monday morning (via ``nextMonday``) so results
    are comparable across calls.  Returns the parsed JSON response on HTTP 200;
    raises ``Exception`` carrying the error payload otherwise.
    """
    monday9am = nextMonday()

    header = {
        "X-Goog-Api-Key": API_KEY,
        "Content-Type": "application/json",
        # Field mask keeps the response small: route totals plus a per-step
        # distance/duration/mode breakdown.
        "X-Goog-FieldMask": "routes.distanceMeters,routes.duration,routes.staticDuration,routes.legs.steps.distanceMeters,routes.legs.steps.staticDuration,routes.legs.steps.travelMode",
    }

    body = {
        "origin": {
            "location": {"latLng": {"latitude": origin_lat, "longitude": origin_lon}}
        },
        "destination": {
            "location": {"latLng": {"latitude": dest_lat, "longitude": dest_lon}}
        },
        "travelMode": "TRANSIT",
        # Expected timestamp format: "2023-10-15T15:01:23.045123456Z"
        "departureTime": monday9am.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
        "computeAlternativeRoutes": False,
        # "routeModifiers": {
        # "avoidTolls": false,
        # "avoidHighways": false,
        # "avoidFerries": false
        # },
        "languageCode": "en-US",
        "units": "METRIC",
    }

    r = requests.post(url, json=body, headers=header)
    if r.status_code == 200:
        return r.json()

    raise Exception(r.json())
|
||||
|
||||
|
||||
def extract_time(d):
    """Summarise the first route of a Google Routes API response.

    Aggregates duration and distance per travel mode over the route's steps,
    prints the human-readable summary (as before), and now also returns the
    figures so callers can use them programmatically (the original printed
    only and returned None; the stray ``print(r.keys())`` debug line is
    removed).

    Args:
        d: Parsed JSON body of a ``computeRoutes`` response.

    Returns:
        Dict with ``distance_meters``, ``duration``, ``static_duration`` and
        the per-travel-mode ``duration_per_transit`` / ``distance_per_transit``
        breakdowns.
    """
    r = d["routes"][0]
    distance = r["distanceMeters"]
    duration = r["duration"]
    duration_static = r["staticDuration"]

    steps = r["legs"][0]["steps"]
    duration_per_transit = defaultdict(lambda: 0)
    distance_per_transit = defaultdict(lambda: 0)

    for step in steps:
        # Durations come back as strings like "123s".
        duration_per_transit[step["travelMode"]] += int(
            step["staticDuration"].strip("s")
        )
        # Some steps (e.g. transfers) carry no distance.
        distance_per_transit[step["travelMode"]] += step.get("distanceMeters", 0)

    print(
        f"dis {distance}, dur {duration}, duration per transit {dict(duration_per_transit)}, distance per transit {dict(distance_per_transit)}, duration_static {duration_static}"
    )
    return {
        "distance_meters": distance,
        "duration": duration,
        "static_duration": duration_static,
        "duration_per_transit": dict(duration_per_transit),
        "distance_per_transit": dict(distance_per_transit),
    }
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import json

    # Replay a previously captured Routes API response from disk instead of
    # spending a live API call.
    with open("code/json/routing_routeapi.json", "r") as f:
        d = json.load(f)

    extract_time(d)


# Live variant kept for reference: computes home -> London City Airport and
# caches the raw response into the JSON file replayed above.
# if __name__ == "__main__":
#     origin = 51.5635664310333, -0.1107173751570373  # home
#     dest = 51.50475678313417, 0.04915321000190009  # london city airport
#     d = travel_time(origin[0], origin[1], dest[0], dest[1])
#     import json
#     with open('code/json/routing_routeapi.json', 'w') as f:
#         json.dump(d, f)
|
||||
|
|
@ -1,20 +0,0 @@
|
|||
import requests

# Headers mimicking the Rightmove Android app client.
headers = {
    "Host": "api.rightmove.co.uk",
    # 'Accept-Encoding': 'gzip, deflate, br',
    "User-Agent": "okhttp/4.10.0",
    "Connection": "close",
}

params = {
    "apiApplication": "ANDROID",
    "appVersion": "3.70.0",
}

# One-off probe: fetch the full detail payload for a single property ID.
# NOTE(review): verify=False disables TLS certificate verification — only
# acceptable for this throwaway experiment.
response = requests.get(
    "https://api.rightmove.co.uk/api/property/119578451",
    params=params,
    headers=headers,
    verify=False,
)
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
def get_districts():
|
||||
def get_districts() -> dict[str, str]:
|
||||
return {
|
||||
"Barking and Dagenham": "REGION^61400",
|
||||
"Barnet": "REGION^93929",
|
||||
|
|
|
|||
|
|
@ -1,10 +1,12 @@
|
|||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from PIL import Image
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
|
||||
def inference(image_path):
|
||||
def inference(image_path: str | Path) -> tuple[str, Any]:
|
||||
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
|
||||
|
||||
image = Image.open(image_path)
|
||||
|
|
@ -19,7 +21,7 @@ def inference(image_path):
|
|||
return output, predictions
|
||||
|
||||
|
||||
def extract_total_sqm(input_str: str):
|
||||
def extract_total_sqm(input_str: str) -> float | None:
|
||||
sqmregex = r"(\d+\.?\d*) ?(sq ?m|sq. ?m)"
|
||||
matches = re.findall(sqmregex, input_str.lower())
|
||||
sqms = [float(m[0]) for m in matches]
|
||||
|
|
@ -29,13 +31,13 @@ def extract_total_sqm(input_str: str):
|
|||
return max(filtered)
|
||||
|
||||
|
||||
def calculate_model(image_path):
|
||||
def calculate_model(image_path: str | Path) -> tuple[float | None, str, Any]:
|
||||
output, predictions_tensor = inference(image_path)
|
||||
estimated_sqm = extract_total_sqm(output)
|
||||
return estimated_sqm, output, predictions_tensor
|
||||
|
||||
|
||||
def improve_img_for_ocr(img: Image):
|
||||
def improve_img_for_ocr(img: Image.Image) -> Image.Image:
|
||||
img2 = np.array(img.convert("L"))
|
||||
cv2.resize(img2, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC)
|
||||
thresh = cv2.adaptiveThreshold(
|
||||
|
|
@ -44,7 +46,7 @@ def improve_img_for_ocr(img: Image):
|
|||
return Image.fromarray(thresh)
|
||||
|
||||
|
||||
def calculate_ocr(image_path) -> tuple[float | None, str]:
|
||||
def calculate_ocr(image_path: str | Path) -> tuple[float | None, str]:
|
||||
import pytesseract
|
||||
|
||||
img = Image.open(image_path)
|
||||
|
|
|
|||
41
crawler/rec/route_serializer.py
Normal file
41
crawler/rec/route_serializer.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
import json
|
||||
from typing import List
|
||||
|
||||
from models.listing import DestinationMode, Route, RouteLegStep
|
||||
from rec import routing
|
||||
|
||||
|
||||
class RouteSerializer:
    """(De)serialises the per-destination route data stored as JSON strings."""

    @staticmethod
    def serialize(route): ...

    @staticmethod
    def deserialize(route_data_json: str) -> dict[DestinationMode, List[Route]]:
        """Rebuild the ``{DestinationMode: [Route]}`` mapping from JSON.

        Each key of the outer JSON object is itself a JSON-encoded
        DestinationMode; each value is a list of JSON-encoded routes, of which
        only the first is reconstructed (mirroring the stored format).
        """
        decoded = json.loads(route_data_json)
        destination_routes: dict[DestinationMode, List[Route]] = {}
        for mode_key, routes_json in decoded.items():
            mode_fields = json.loads(mode_key)
            destination_mode = DestinationMode(
                destination_address=mode_fields["destination_address"],
                travel_mode=routing.TravelMode(mode_fields["travel_mode"]),
            )
            route_fields = json.loads(routes_json[0])
            steps = [
                RouteLegStep(
                    distance_meters=step["distance_meters"],
                    duration_s=step["duration_s"],
                    travel_mode=routing.TravelMode(step["travel_mode"]),
                )
                for step in route_fields["legs"]
            ]
            destination_routes[destination_mode] = [
                Route(
                    legs=steps,
                    distance_meters=route_fields["distance_meters"],
                    duration_s=int(route_fields["duration_s"]),
                )
            ]
        return destination_routes
|
||||
41
crawler/services/__init__.py
Normal file
41
crawler/services/__init__.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
"""Services package for real estate crawler.
|
||||
|
||||
This package contains two layers of services:
|
||||
|
||||
## Low-level services (internal implementation):
|
||||
- listing_fetcher: Fetches listing data from Rightmove API
|
||||
- image_fetcher: Downloads floorplan images
|
||||
- floorplan_detector: OCR-based square meter detection from floorplans
|
||||
- route_calculator: Calculates transit routes using Google Maps API
|
||||
|
||||
## High-level services (use these in CLI and API):
|
||||
- listing_service: Unified listing operations (get, refresh, download images, etc.)
|
||||
- export_service: Export listings to CSV, GeoJSON
|
||||
- district_service: District lookup and validation
|
||||
- task_service: Background task management
|
||||
"""
|
||||
# Low-level services (internal)
|
||||
from services.listing_fetcher import dump_listings, dump_listings_full
|
||||
from services.image_fetcher import dump_images
|
||||
from services.floorplan_detector import detect_floorplan
|
||||
from services.route_calculator import calculate_route
|
||||
|
||||
# High-level services (CLI and API should use these)
|
||||
from services import listing_service
|
||||
from services import export_service
|
||||
from services import district_service
|
||||
from services import task_service
|
||||
|
||||
__all__ = [
|
||||
# Low-level
|
||||
"dump_listings",
|
||||
"dump_listings_full",
|
||||
"dump_images",
|
||||
"detect_floorplan",
|
||||
"calculate_route",
|
||||
# High-level
|
||||
"listing_service",
|
||||
"export_service",
|
||||
"district_service",
|
||||
"task_service",
|
||||
]
|
||||
38
crawler/services/district_service.py
Normal file
38
crawler/services/district_service.py
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
"""Unified district service - shared between CLI and HTTP API."""
|
||||
from rec.districts import get_districts as _get_districts
|
||||
|
||||
|
||||
def get_all_districts() -> dict[str, str]:
    """Return every available district mapped to its region ID.

    Used by:
    - CLI: --district option choices
    - API: GET /api/get_districts

    Returns:
        Dictionary mapping district names to region IDs.
    """
    districts = _get_districts()
    return districts
|
||||
|
||||
|
||||
def get_district_names() -> list[str]:
    """Return the names of all known districts.

    Returns:
        List of district names, in the districts mapping's order.
    """
    return [name for name in _get_districts()]
|
||||
|
||||
|
||||
def validate_districts(district_names: list[str]) -> tuple[bool, list[str]]:
    """Check that every supplied district name is known.

    Args:
        district_names: District names to check.

    Returns:
        ``(all_valid, invalid_names)`` where ``invalid_names`` keeps the
        input order of the unrecognised entries.
    """
    known = _get_districts()
    invalid = [name for name in district_names if name not in known]
    return not invalid, invalid
|
||||
92
crawler/services/export_service.py
Normal file
92
crawler/services/export_service.py
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
"""Unified export service - shared between CLI and HTTP API.
|
||||
|
||||
This module provides export functionality for listings in various formats.
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from models.listing import QueryParameters
|
||||
from repositories.listing_repository import ListingRepository
|
||||
|
||||
|
||||
@dataclass
class ExportResult:
    """Result of an export operation."""

    success: bool  # True when the export completed without error
    output_path: str | None  # For file exports
    data: Any | None  # For in-memory exports (GeoJSON)
    record_count: int  # Number of listings/features exported
    message: str  # Human-readable summary of the export
|
||||
|
||||
|
||||
async def export_to_csv(
    repository: ListingRepository,
    output_path: Path,
    query_parameters: QueryParameters | None = None,
) -> ExportResult:
    """Export listings to a CSV file.

    Used by:
    - CLI: export-csv
    - API: (could be added as download endpoint)
    """
    from csv_exporter import export_to_csv as _export_csv

    await _export_csv(repository, output_path, query_parameters)

    # Re-query with the same filters to report how many rows were written.
    exported = await repository.get_listings(query_parameters=query_parameters)
    count = len(exported)
    return ExportResult(
        success=True,
        output_path=str(output_path),
        data=None,
        record_count=count,
        message=f"Exported {count} listings to {output_path}",
    )
|
||||
|
||||
|
||||
async def export_to_geojson(
    repository: ListingRepository,
    query_parameters: QueryParameters | None = None,
    output_path: Path | None = None,
    limit: int | None = None,
) -> ExportResult:
    """Export listings to GeoJSON format.

    Args:
        repository: Database repository
        query_parameters: Filtering parameters
        output_path: If provided, write to file. Otherwise return data.
        limit: Maximum number of listings to export

    Used by:
    - CLI: export-immoweb
    - API: GET /api/listing_geojson
    """
    from ui_exporter import export_immoweb

    geojson_data = await export_immoweb(
        repository,
        output_file=str(output_path) if output_path else None,
        query_parameters=query_parameters,
        limit=limit,
    )

    feature_count = 0
    if geojson_data:
        feature_count = len(geojson_data.get("features", []))

    if output_path:
        # File export: the data already lives on disk, so none is returned.
        return ExportResult(
            success=True,
            output_path=str(output_path),
            data=None,
            record_count=feature_count,
            message=f"Exported {feature_count} listings to {output_path}",
        )

    # In-memory export: hand the GeoJSON back to the caller.
    return ExportResult(
        success=True,
        output_path=None,
        data=geojson_data,
        record_count=feature_count,
        message=f"Generated GeoJSON with {feature_count} features",
    )
|
||||
42
crawler/services/floorplan_detector.py
Normal file
42
crawler/services/floorplan_detector.py
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
"""Floorplan detector service - OCR-based square meter detection."""
|
||||
import asyncio
|
||||
from models import Listing
|
||||
from rec import floorplan
|
||||
from repositories.listing_repository import ListingRepository
|
||||
from tqdm.asyncio import tqdm
|
||||
import multiprocessing
|
||||
|
||||
|
||||
async def detect_floorplan(repository: ListingRepository) -> None:
    """Detect square meters from floorplan images for all listings.

    OCR is CPU-heavy, so concurrency is throttled to a quarter of the
    machine's cores.  ``cpu_count() // 4`` evaluates to 0 on machines with
    fewer than 4 cores, which would create a ``Semaphore(0)`` that never lets
    any worker proceed — clamp to at least 1.
    """
    listings = await repository.get_listings()
    worker_count = max(1, multiprocessing.cpu_count() // 4)
    semaphore = asyncio.Semaphore(worker_count)

    updated_listings = [
        listing
        for listing in await tqdm.gather(
            *[_calculate_sqm_ocr(listing, semaphore) for listing in listings]
        )
        if listing is not None
    ]
    await repository.upsert_listings(updated_listings)
|
||||
|
||||
|
||||
async def _calculate_sqm_ocr(
    listing: Listing, semaphore: asyncio.Semaphore
) -> Listing | None:
    """OCR the listing's floorplans and record the largest detected area.

    Returns None when the listing already has a square-meter value (nothing
    to update); otherwise returns the listing with ``square_meters`` set.
    """
    if listing.square_meters is not None:
        return None

    estimates: list[float] = []
    for image_path in listing.floorplan_image_paths:
        async with semaphore:
            # OCR is blocking; run it off the event loop.
            sqm, _ = await asyncio.to_thread(floorplan.calculate_ocr, image_path)
            if sqm is not None:
                estimates.append(sqm)

    # try once, if we fail, keep as 0
    listing.square_meters = max(estimates, default=0)
    return listing
|
||||
55
crawler/services/image_fetcher.py
Normal file
55
crawler/services/image_fetcher.py
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
"""Image fetcher service - downloads floorplan images for listings."""
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
import aiohttp
|
||||
from repositories import ListingRepository
|
||||
from tenacity import retry, stop_after_attempt, wait_random
|
||||
from tqdm.asyncio import tqdm
|
||||
|
||||
from models import Listing
|
||||
|
||||
# Setting this too high either crashes rightmove or gets us blocked
|
||||
semaphore = asyncio.Semaphore(5)
|
||||
|
||||
|
||||
async def dump_images(
    repository: ListingRepository,
    image_base_path: Path = Path("data/rs/"),
) -> None:
    """Download floorplan images for all listings and persist updated paths."""
    listings = await repository.get_listings()
    results = await tqdm.gather(
        *[dump_images_for_listing(listing, image_base_path) for listing in listings]
    )
    # Only listings that actually changed come back non-None.
    changed = [listing for listing in results if listing is not None]
    await repository.upsert_listings(changed)
|
||||
|
||||
|
||||
@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
async def dump_images_for_listing(listing: Listing, base_path: Path) -> Listing | None:
    """Download all floorplan images for a single listing.

    Fixes two defects in the original: it returned as soon as the first image
    was downloaded, so listings with several floorplans only ever fetched one
    per run; and a 404 on any image aborted the whole listing instead of just
    skipping that image.

    Returns:
        The updated listing when at least one new image was downloaded,
        otherwise None (everything cached, no floorplans, or all missing).
    """
    all_floorplans = listing.additional_info.get("property", {}).get("floorplans", [])
    downloaded_any = False
    for floorplan in all_floorplans:
        url = floorplan["url"]
        picname = url.split("/")[-1]
        floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
        if floorplan_path.exists():
            continue  # already cached locally
        try:
            async with semaphore:
                async with aiohttp.ClientSession() as session:
                    async with session.get(url) as response:
                        if response.status == 404:
                            continue  # image gone upstream; skip just this one
                        if response.status != 200:
                            raise Exception(f"Error for {url}: {response.status}")
                        floorplan_path.parent.mkdir(parents=True, exist_ok=True)
                        with open(floorplan_path, "wb") as f:
                            f.write(await response.read())
            listing.floorplan_image_paths.append(str(floorplan_path))
            downloaded_any = True
        except Exception as e:
            tqdm.write(f"Error for {url}: {e}")
            raise e  # raise so that we retry it
    return listing if downloaded_any else None
|
||||
168
crawler/services/listing_service.py
Normal file
168
crawler/services/listing_service.py
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
"""Unified listing service - shared between CLI and HTTP API.
|
||||
|
||||
This module provides the core business logic for listing operations.
|
||||
Both the CLI (main.py) and HTTP API (api/app.py) should use these functions.
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from models.listing import Listing, QueryParameters
|
||||
from repositories.listing_repository import ListingRepository
|
||||
|
||||
|
||||
@dataclass
class ListingResult:
    """Result of a listing operation."""

    listings: list[Listing]  # The matched listings
    total_count: int  # Number of listings returned
    message: str | None = None  # Optional human-readable note
|
||||
|
||||
|
||||
@dataclass
class RefreshResult:
    """Result of a refresh operation."""

    task_id: str | None  # None if run synchronously
    new_listings_count: int  # Listings fetched synchronously (0 in async mode)
    message: str  # Human-readable outcome summary
|
||||
|
||||
|
||||
async def get_listings(
    repository: ListingRepository,
    query_parameters: QueryParameters | None = None,
    limit: int | None = None,
    only_ids: list[int] | None = None,
) -> ListingResult:
    """Fetch listings from the database with optional filtering.

    Used by:
    - CLI: export-csv, export-immoweb
    - API: GET /api/listing, GET /api/listing_geojson
    """
    matched = await repository.get_listings(
        query_parameters=query_parameters,
        limit=limit,
        only_ids=only_ids,
    )
    return ListingResult(listings=matched, total_count=len(matched))
|
||||
|
||||
|
||||
async def refresh_listings(
    repository: ListingRepository,
    query_parameters: QueryParameters,
    full: bool = False,
    async_mode: bool = False,
    user_email: str | None = None,
) -> RefreshResult:
    """Refresh listings by fetching from the external API.

    Args:
        repository: Database repository
        query_parameters: Filtering parameters
        full: If True, also fetch images and run OCR
        async_mode: If True, run as background task and return task_id
        user_email: User email for tracking (API mode)

    Used by:
    - CLI: dump-listings
    - API: POST /api/refresh_listings
    """
    if async_mode:
        # Import here to avoid circular imports
        from datetime import timedelta

        from tasks.listing_tasks import dump_listings_task

        task = dump_listings_task.apply_async(
            args=(query_parameters.model_dump_json(),),
            # Drop the task if no worker picks it up within 10 minutes.
            expires=datetime.now() + timedelta(minutes=10),
        )
        return RefreshResult(
            task_id=task.id,
            new_listings_count=0,
            message=f"Task {task.id} started",
        )

    # Synchronous mode - run directly
    from services.listing_fetcher import dump_listings, dump_listings_full

    fetcher = dump_listings_full if full else dump_listings
    new_listings = await fetcher(query_parameters, repository)
    count = len(new_listings)
    return RefreshResult(
        task_id=None,
        new_listings_count=count,
        message=f"Fetched {count} new listings",
    )
|
||||
|
||||
|
||||
async def download_images(
    repository: ListingRepository,
    data_dir: Path = Path("data/rs/"),
) -> int:
    """Download floorplan images for all listings.

    Used by:
    - CLI: dump-images
    - API: (could be added)

    Returns:
        Number of listings in the database after the run.
    """
    from services.image_fetcher import dump_images

    await dump_images(repository, image_base_path=data_dir)
    return len(await repository.get_listings())
|
||||
|
||||
|
||||
async def detect_floorplans(
    repository: ListingRepository,
) -> int:
    """Run OCR on floorplan images to detect square meters.

    Used by:
    - CLI: detect-floorplan
    - API: (could be added)

    Returns:
        Number of listings in the database after the run.
    """
    from services.floorplan_detector import detect_floorplan

    await detect_floorplan(repository)
    return len(await repository.get_listings())
|
||||
|
||||
|
||||
async def calculate_routes(
    repository: ListingRepository,
    destination_address: str,
    travel_mode: str,
    limit: int | None = None,
) -> int:
    """Calculate transit routes for listings.

    Args:
        repository: Database repository
        destination_address: Destination the routes are computed to
        travel_mode: Name of a rec.routing.TravelMode member (looked up by key,
            so an unknown name raises KeyError)
        limit: Maximum number of listings to route

    Used by:
    - CLI: routing
    - API: (could be added)

    Returns:
        ``limit`` when given, else 0.
        NOTE(review): this is not the number of listings actually processed —
        confirm whether any caller relies on this value before changing it.
    """
    from services.route_calculator import calculate_route
    from rec.routing import TravelMode

    await calculate_route(
        repository,
        destination_address,
        TravelMode[travel_mode],
        limit=limit,
    )
    return limit or 0
|
||||
|
|
@ -1,3 +1,4 @@
|
|||
"""Route calculator service - calculates transit routes using Google Maps API."""
|
||||
from models.listing import DestinationMode, Route, RouteLegStep
|
||||
from repositories.listing_repository import ListingRepository
|
||||
from tqdm.asyncio import tqdm
|
||||
|
|
@ -11,6 +12,7 @@ async def calculate_route(
|
|||
travel_mode: routing.TravelMode,
|
||||
limit: int | None = None,
|
||||
) -> None:
|
||||
"""Calculate transit routes for listings to a destination."""
|
||||
listings = await repository.get_listings()
|
||||
|
||||
if limit is not None:
|
||||
|
|
@ -30,6 +32,7 @@ async def calculate_route(
|
|||
async def update_routing_info(
|
||||
listing: Listing, destination_mode: DestinationMode
|
||||
) -> Listing | None:
|
||||
"""Update routing information for a single listing."""
|
||||
if listing.routing_info.get(destination_mode) is not None:
|
||||
# already calculated, do not recompute to save API calls
|
||||
return None
|
||||
|
|
@ -41,8 +44,7 @@ async def update_routing_info(
|
|||
destination_mode.travel_mode,
|
||||
)
|
||||
|
||||
route_data = routes_data["routes"][0]
|
||||
routes = []
|
||||
routes: list[Route] = []
|
||||
for route_data in routes_data["routes"]:
|
||||
duration_s = int(route_data["duration"].split("s")[0])
|
||||
route = Route(
|
||||
|
|
@ -61,47 +63,4 @@ async def update_routing_info(
|
|||
listing.routing_info_json = listing.serialize_routing_info(
|
||||
{**listing.routing_info, **{destination_mode: routes}}
|
||||
)
|
||||
return listing
|
||||
|
||||
|
||||
# async def geocode_address(
|
||||
# address: str,
|
||||
# geocoding_cache: pathlib.Path,
|
||||
# ) -> tuple[int, int]:
|
||||
# cache = get_geocoding_cache(geocoding_cache)
|
||||
# cached_results = cache.get(address)
|
||||
# if cached_results is None:
|
||||
# # resolve
|
||||
# async with aiohttp.ClientSession() as session:
|
||||
# async with session.get(
|
||||
# ("https://maps.googleapis.com/maps/api/geocode/json"
|
||||
# f"?address={address}"
|
||||
# f"&key={API_KEY_ENVIRONMENT_VARIABLE}")) as response:
|
||||
# if response.status != 200:
|
||||
# raise Exception(
|
||||
# f"Error {response.status} from geocoding API")
|
||||
# cached_results = await response.json()
|
||||
# with open(geocoding_cache, 'w') as f:
|
||||
# json.dump({
|
||||
# **{
|
||||
# address: cached_results,
|
||||
# },
|
||||
# **cache
|
||||
# }, f)
|
||||
# # API format
|
||||
# lat = cached_results["results"][0]["geometry"]["location"]["lat"]
|
||||
# lng = cached_results["results"][0]["geometry"]["location"]["lng"]
|
||||
# cache[address] = (lat, lng)
|
||||
# with open(geocoding_cache, 'w') as f:
|
||||
# json.dump(cache, f)
|
||||
# return lat, lng
|
||||
|
||||
# def get_geocoding_cache(geocoding_cache: pathlib.Path) -> dict[str, Any]:
|
||||
# try:
|
||||
# with open(geocoding_cache, 'x') as f:
|
||||
# json.dump({}, f)
|
||||
# return {}
|
||||
# except FileExistsError:
|
||||
# pass # File already exists
|
||||
# with open(geocoding_cache, 'r') as f:
|
||||
# return json.load(f)
|
||||
return listing
|
||||
|
|
@ -11,9 +11,14 @@ import json
|
|||
class TaskStatus:
|
||||
"""Status of a background task."""
|
||||
task_id: str
|
||||
status: str # PENDING, STARTED, SUCCESS, FAILURE, REVOKED
|
||||
status: str # PENDING, STARTED, SUCCESS, FAILURE, REVOKED, SKIPPED
|
||||
result: Any | None
|
||||
progress: float | None # 0.0 to 1.0
|
||||
processed: int | None # Number of items processed
|
||||
total: int | None # Total number of items
|
||||
message: str | None # Human-readable status message (e.g., "Fetching listings")
|
||||
error: str | None # Error message if failed
|
||||
traceback: str | None # Full traceback if failed
|
||||
|
||||
|
||||
def get_task_status(task_id: str) -> TaskStatus:
|
||||
|
|
@ -33,21 +38,50 @@ def get_task_status(task_id: str) -> TaskStatus:
|
|||
task_result = dump_listings_task.AsyncResult(task_id)
|
||||
|
||||
# Try to serialize result
|
||||
try:
|
||||
result = json.loads(json.dumps(task_result.result))
|
||||
except (TypeError, json.JSONDecodeError):
|
||||
result = str(task_result.result) if task_result.result else None
|
||||
result = None
|
||||
error = None
|
||||
if task_result.failed():
|
||||
# Extract error message from failed task
|
||||
error = str(task_result.result) if task_result.result else None
|
||||
else:
|
||||
try:
|
||||
result = json.loads(json.dumps(task_result.result))
|
||||
except (TypeError, json.JSONDecodeError):
|
||||
result = str(task_result.result) if task_result.result else None
|
||||
|
||||
# Extract progress from task meta if available
|
||||
# Extract traceback if available
|
||||
task_traceback = task_result.traceback if task_result.failed() else None
|
||||
|
||||
# Extract progress, processed, total, and message from task meta
|
||||
progress = None
|
||||
processed = None
|
||||
total = None
|
||||
message = None
|
||||
|
||||
if task_result.info and isinstance(task_result.info, dict):
|
||||
progress = task_result.info.get("progress")
|
||||
processed = task_result.info.get("processed")
|
||||
total = task_result.info.get("total")
|
||||
# Use 'message' if available, fall back to 'reason' for SKIPPED tasks
|
||||
message = task_result.info.get("message") or task_result.info.get("reason")
|
||||
|
||||
# For custom states (like "Fetching listings"), use the state as message
|
||||
# if no message was provided in info
|
||||
if not message and task_result.status not in (
|
||||
"PENDING", "STARTED", "SUCCESS", "FAILURE", "REVOKED", "RETRY"
|
||||
):
|
||||
message = task_result.status
|
||||
|
||||
return TaskStatus(
|
||||
task_id=task_id,
|
||||
status=task_result.status,
|
||||
result=result,
|
||||
progress=progress,
|
||||
processed=processed,
|
||||
total=total,
|
||||
message=message,
|
||||
error=error,
|
||||
traceback=task_traceback,
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
196
crawler/start.sh
196
crawler/start.sh
|
|
@ -1,55 +1,153 @@
|
|||
#!/usr/bin/env bash
|
||||
set -eu
|
||||
|
||||
# This script is used to start the backend services and configure them according to what's available in the system
|
||||
|
||||
set -eux
|
||||
|
||||
ENV_MODE=${ENV:-"dev"} # Defaults to "dev" if ENV_MODE is unset
|
||||
|
||||
|
||||
case "$ENV_MODE" in
|
||||
dev)
|
||||
echo "🛠️ Running in DEVELOPMENT mode"
|
||||
set +e
|
||||
pkill -f celery
|
||||
pkill watchmedo
|
||||
set -e
|
||||
if ! netstat -tlnp |grep 6379; then
|
||||
echo "Did not find a running redis on 6379. Starting a new instance..."
|
||||
docker run -d --rm --name redis-server -p 6379:6379 redis:latest
|
||||
fi
|
||||
echo "Checking connection to redis is successful..."
|
||||
python celery_app.py
|
||||
|
||||
watchmedo auto-restart --directory=./ --pattern='*.py' --recursive -- celery -A celery_app worker & # DEV to autoreload on changes
|
||||
CELERY_PID=$!
|
||||
;;
|
||||
prod)
|
||||
echo "🚀 Running in PRODUCTION mode"
|
||||
echo "Checking connection to redis is successful..."
|
||||
python celery_app.py
|
||||
alembic upgrade head
|
||||
celery -A celery_app worker --beat &
|
||||
CELERY_PID=$!
|
||||
;;
|
||||
*)
|
||||
echo "❌ Unknown ENV_MODE: $ENV_MODE. Defaulting to DEV."
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
|
||||
cleanup() {
|
||||
echo "Stopping background process (PID: $CELERY_PID)..."
|
||||
kill "$CELERY_PID" 2>/dev/null # Graceful shutdown (SIGTERM)
|
||||
wait "$CELERY_PID" 2>/dev/null # Wait for process to exit
|
||||
# Real Estate Crawler - Development Server
|
||||
# Usage:
|
||||
# ./start.sh - Start with Docker (recommended)
|
||||
# ./start.sh --local - Start locally (requires Poetry and dependencies)
|
||||
# ./start.sh --help - Show help
|
||||
|
||||
show_help() {
|
||||
echo "Real Estate Crawler - Development Server"
|
||||
echo ""
|
||||
echo "Usage: ./start.sh [OPTIONS]"
|
||||
echo ""
|
||||
echo "Options:"
|
||||
echo " (default) Start all services with Docker Compose"
|
||||
echo " --local Run locally with Poetry (requires local deps)"
|
||||
echo " --build Rebuild Docker images before starting"
|
||||
echo " --down Stop and remove all containers"
|
||||
echo " --logs Follow logs from all services"
|
||||
echo " --help Show this help message"
|
||||
echo ""
|
||||
echo "Examples:"
|
||||
echo " ./start.sh # Start with Docker"
|
||||
echo " ./start.sh --build # Rebuild and start"
|
||||
echo " ./start.sh --local # Run locally with Poetry"
|
||||
}
|
||||
trap cleanup EXIT SIGINT SIGTERM
|
||||
|
||||
# celery -A celery_app worker -D # PROD
|
||||
uvicorn api.app:app --host 0.0.0.0 --port 5001 --log-level debug
|
||||
# UVICORN_PID=$!
|
||||
start_docker() {
|
||||
local build_flag=""
|
||||
if [[ "${1:-}" == "--build" ]]; then
|
||||
build_flag="--build"
|
||||
fi
|
||||
|
||||
# wait for
|
||||
# less /etc/passwd > /dev/null
|
||||
echo "🐳 Starting services with Docker Compose..."
|
||||
echo ""
|
||||
|
||||
# Check if docker/podman is available
|
||||
if command -v docker &> /dev/null; then
|
||||
COMPOSE_CMD="docker compose"
|
||||
elif command -v podman-compose &> /dev/null; then
|
||||
COMPOSE_CMD="podman-compose"
|
||||
else
|
||||
echo "❌ Error: Neither docker nor podman-compose found."
|
||||
echo " Install Docker: https://docs.docker.com/get-docker/"
|
||||
echo " Or run locally: ./start.sh --local"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
$COMPOSE_CMD up $build_flag
|
||||
}
|
||||
|
||||
stop_docker() {
|
||||
echo "🛑 Stopping all containers..."
|
||||
if command -v docker &> /dev/null; then
|
||||
docker compose down
|
||||
elif command -v podman-compose &> /dev/null; then
|
||||
podman-compose down
|
||||
fi
|
||||
}
|
||||
|
||||
show_logs() {
|
||||
if command -v docker &> /dev/null; then
|
||||
docker compose logs -f
|
||||
elif command -v podman-compose &> /dev/null; then
|
||||
podman-compose logs -f
|
||||
fi
|
||||
}
|
||||
|
||||
start_local() {
|
||||
echo "🛠️ Starting locally with Poetry..."
|
||||
echo ""
|
||||
|
||||
# Check Poetry is available
|
||||
if ! command -v poetry &> /dev/null; then
|
||||
echo "❌ Error: Poetry not found."
|
||||
echo " Install: curl -sSL https://install.python-poetry.org | python3 -"
|
||||
echo " Or use Docker: ./start.sh"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Source .env if it exists
|
||||
if [[ -f .env ]]; then
|
||||
set -a
|
||||
source .env
|
||||
set +a
|
||||
fi
|
||||
|
||||
ENV_MODE=${ENV:-"dev"}
|
||||
|
||||
# Ensure Redis is running
|
||||
if ! nc -z localhost 6379 2>/dev/null; then
|
||||
echo "📦 Starting Redis container..."
|
||||
docker run -d --rm --name rec-redis-local -p 6379:6379 redis:latest || true
|
||||
sleep 2
|
||||
fi
|
||||
|
||||
echo "✅ Redis OK"
|
||||
|
||||
# Test celery connection
|
||||
poetry run python celery_app.py
|
||||
|
||||
# Start Celery worker in background
|
||||
echo "🔧 Starting Celery worker..."
|
||||
if [[ "$ENV_MODE" == "dev" ]]; then
|
||||
poetry run celery -A celery_app worker --loglevel=info &
|
||||
else
|
||||
poetry run alembic upgrade head
|
||||
poetry run celery -A celery_app worker --beat --loglevel=info &
|
||||
fi
|
||||
CELERY_PID=$!
|
||||
|
||||
cleanup() {
|
||||
echo ""
|
||||
echo "🛑 Stopping Celery worker (PID: $CELERY_PID)..."
|
||||
kill "$CELERY_PID" 2>/dev/null || true
|
||||
wait "$CELERY_PID" 2>/dev/null || true
|
||||
}
|
||||
trap cleanup EXIT SIGINT SIGTERM
|
||||
|
||||
# Start uvicorn
|
||||
echo "🚀 Starting API server on http://localhost:5001"
|
||||
echo ""
|
||||
poetry run uvicorn api.app:app --host 0.0.0.0 --port 5001 --reload
|
||||
}
|
||||
|
||||
# Parse arguments
|
||||
case "${1:-}" in
|
||||
--help|-h)
|
||||
show_help
|
||||
;;
|
||||
--local)
|
||||
start_local
|
||||
;;
|
||||
--down)
|
||||
stop_docker
|
||||
;;
|
||||
--logs)
|
||||
show_logs
|
||||
;;
|
||||
--build)
|
||||
start_docker --build
|
||||
;;
|
||||
"")
|
||||
start_docker
|
||||
;;
|
||||
*)
|
||||
echo "❌ Unknown option: $1"
|
||||
echo ""
|
||||
show_help
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
|
|
|||
1
crawler/tests/__init__.py
Normal file
1
crawler/tests/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
# Tests package
|
||||
186
crawler/tests/conftest.py
Normal file
186
crawler/tests/conftest.py
Normal file
|
|
@ -0,0 +1,186 @@
|
|||
"""Shared pytest fixtures for the test suite."""
|
||||
from datetime import datetime
|
||||
from typing import AsyncGenerator, Generator
|
||||
import pytest
|
||||
from sqlalchemy import Engine
|
||||
from sqlmodel import SQLModel, Session, create_engine
|
||||
from httpx import ASGITransport, AsyncClient
|
||||
|
||||
from models.listing import (
|
||||
BuyListing,
|
||||
FurnishType,
|
||||
ListingSite,
|
||||
RentListing,
|
||||
Listing,
|
||||
)
|
||||
from repositories.listing_repository import ListingRepository
|
||||
from api.auth import User
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def in_memory_engine() -> Generator[Engine, None, None]:
|
||||
"""Create an in-memory SQLite engine for testing."""
|
||||
engine = create_engine(
|
||||
"sqlite:///:memory:",
|
||||
echo=False,
|
||||
connect_args={"check_same_thread": False},
|
||||
)
|
||||
SQLModel.metadata.create_all(engine)
|
||||
yield engine
|
||||
SQLModel.metadata.drop_all(engine)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def listing_repository(in_memory_engine: Engine) -> ListingRepository:
|
||||
"""Create a ListingRepository with the in-memory engine."""
|
||||
return ListingRepository(engine=in_memory_engine)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_rent_listing() -> RentListing:
|
||||
"""Create a sample RentListing for testing."""
|
||||
return RentListing(
|
||||
id=12345678,
|
||||
price=2500.0,
|
||||
number_of_bedrooms=2,
|
||||
square_meters=65.0,
|
||||
agency="Test Agency",
|
||||
council_tax_band="C",
|
||||
longitude=-0.1276,
|
||||
latitude=51.5074,
|
||||
price_history_json="[]",
|
||||
listing_site=ListingSite.RIGHTMOVE,
|
||||
last_seen=datetime.now(),
|
||||
photo_thumbnail="https://example.com/photo.jpg",
|
||||
floorplan_image_paths=[],
|
||||
additional_info={"property": {"visible": True}},
|
||||
routing_info_json=None,
|
||||
furnish_type=FurnishType.FURNISHED,
|
||||
available_from=datetime.now(),
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_buy_listing() -> BuyListing:
|
||||
"""Create a sample BuyListing for testing."""
|
||||
return BuyListing(
|
||||
id=87654321,
|
||||
price=450000.0,
|
||||
number_of_bedrooms=3,
|
||||
square_meters=95.0,
|
||||
agency="Test Estate Agents",
|
||||
council_tax_band="D",
|
||||
longitude=-0.1180,
|
||||
latitude=51.5100,
|
||||
price_history_json="[]",
|
||||
listing_site=ListingSite.RIGHTMOVE,
|
||||
last_seen=datetime.now(),
|
||||
photo_thumbnail="https://example.com/buy_photo.jpg",
|
||||
floorplan_image_paths=[],
|
||||
additional_info={"property": {"visible": True}},
|
||||
routing_info_json=None,
|
||||
service_charge=1500.0,
|
||||
lease_left=90,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_rent_listings() -> list[RentListing]:
|
||||
"""Create multiple sample RentListings for testing filters."""
|
||||
now = datetime.now()
|
||||
return [
|
||||
RentListing(
|
||||
id=1,
|
||||
price=1500.0,
|
||||
number_of_bedrooms=1,
|
||||
square_meters=40.0,
|
||||
agency="Agency A",
|
||||
council_tax_band="B",
|
||||
longitude=-0.1,
|
||||
latitude=51.5,
|
||||
price_history_json="[]",
|
||||
listing_site=ListingSite.RIGHTMOVE,
|
||||
last_seen=now,
|
||||
photo_thumbnail=None,
|
||||
floorplan_image_paths=[],
|
||||
additional_info={"property": {"visible": True}},
|
||||
routing_info_json=None,
|
||||
furnish_type=FurnishType.FURNISHED,
|
||||
available_from=now,
|
||||
),
|
||||
RentListing(
|
||||
id=2,
|
||||
price=2000.0,
|
||||
number_of_bedrooms=2,
|
||||
square_meters=55.0,
|
||||
agency="Agency B",
|
||||
council_tax_band="C",
|
||||
longitude=-0.12,
|
||||
latitude=51.51,
|
||||
price_history_json="[]",
|
||||
listing_site=ListingSite.RIGHTMOVE,
|
||||
last_seen=now,
|
||||
photo_thumbnail=None,
|
||||
floorplan_image_paths=[],
|
||||
additional_info={"property": {"visible": True}},
|
||||
routing_info_json=None,
|
||||
furnish_type=FurnishType.UNFURNISHED,
|
||||
available_from=now,
|
||||
),
|
||||
RentListing(
|
||||
id=3,
|
||||
price=3000.0,
|
||||
number_of_bedrooms=3,
|
||||
square_meters=80.0,
|
||||
agency="Agency C",
|
||||
council_tax_band="D",
|
||||
longitude=-0.14,
|
||||
latitude=51.52,
|
||||
price_history_json="[]",
|
||||
listing_site=ListingSite.RIGHTMOVE,
|
||||
last_seen=now,
|
||||
photo_thumbnail=None,
|
||||
floorplan_image_paths=[],
|
||||
additional_info={"property": {"visible": True}},
|
||||
routing_info_json=None,
|
||||
furnish_type=FurnishType.FURNISHED,
|
||||
available_from=now,
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_user() -> User:
|
||||
"""Create a mock user for API tests."""
|
||||
return User(
|
||||
sub="test-user-id",
|
||||
email="test@example.com",
|
||||
name="Test User",
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
async def async_client(
|
||||
in_memory_engine: Engine, mock_user: User
|
||||
) -> AsyncGenerator[AsyncClient, None]:
|
||||
"""Create an AsyncClient for API testing with mock auth."""
|
||||
from api.app import app
|
||||
from api.auth import get_current_user
|
||||
|
||||
# Override dependencies
|
||||
app.dependency_overrides[get_current_user] = lambda: mock_user
|
||||
|
||||
# Patch the engine used by the repository
|
||||
original_engine = None
|
||||
try:
|
||||
from database import engine as db_engine
|
||||
original_engine = db_engine
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
transport = ASGITransport(app=app)
|
||||
async with AsyncClient(transport=transport, base_url="http://test") as client:
|
||||
yield client
|
||||
|
||||
# Clean up dependency overrides
|
||||
app.dependency_overrides.clear()
|
||||
1
crawler/tests/integration/__init__.py
Normal file
1
crawler/tests/integration/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
# Integration tests package
|
||||
180
crawler/tests/integration/test_api.py
Normal file
180
crawler/tests/integration/test_api.py
Normal file
|
|
@ -0,0 +1,180 @@
|
|||
"""Integration tests for API endpoints."""
|
||||
from unittest.mock import AsyncMock, patch
|
||||
import pytest
|
||||
from httpx import AsyncClient
|
||||
|
||||
from api.auth import User
|
||||
|
||||
|
||||
class TestStatusEndpoint:
|
||||
"""Tests for the /api/status endpoint."""
|
||||
|
||||
async def test_status_endpoint_returns_ok(
|
||||
self, async_client: AsyncClient
|
||||
) -> None:
|
||||
"""Test that status endpoint returns OK status."""
|
||||
response = await async_client.get("/api/status")
|
||||
assert response.status_code == 200
|
||||
assert response.json() == {"status": "OK"}
|
||||
|
||||
|
||||
class TestListingEndpoint:
|
||||
"""Tests for the /api/listing endpoint."""
|
||||
|
||||
async def test_listing_endpoint_requires_auth(self) -> None:
|
||||
"""Test that listing endpoint requires authentication."""
|
||||
from api.app import app
|
||||
from httpx import ASGITransport, AsyncClient
|
||||
|
||||
# Clear any dependency overrides to test auth requirement
|
||||
app.dependency_overrides.clear()
|
||||
|
||||
transport = ASGITransport(app=app)
|
||||
async with AsyncClient(transport=transport, base_url="http://test") as client:
|
||||
response = await client.get("/api/listing")
|
||||
# Should return 401 or 403 without valid auth
|
||||
assert response.status_code in (401, 403)
|
||||
|
||||
async def test_listing_endpoint_with_auth(
|
||||
self, async_client: AsyncClient
|
||||
) -> None:
|
||||
"""Test that listing endpoint works with authentication."""
|
||||
# Mock the repository to return empty list
|
||||
with patch(
|
||||
"api.app.ListingRepository.get_listings",
|
||||
new_callable=AsyncMock,
|
||||
return_value=[],
|
||||
):
|
||||
response = await async_client.get("/api/listing")
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert "listings" in data
|
||||
|
||||
|
||||
class TestListingGeoJsonEndpoint:
|
||||
"""Tests for the /api/listing_geojson endpoint."""
|
||||
|
||||
async def test_listing_geojson_requires_auth(self) -> None:
|
||||
"""Test that listing_geojson endpoint requires authentication."""
|
||||
from api.app import app
|
||||
from httpx import ASGITransport, AsyncClient
|
||||
|
||||
# Clear any dependency overrides to test auth requirement
|
||||
app.dependency_overrides.clear()
|
||||
|
||||
transport = ASGITransport(app=app)
|
||||
async with AsyncClient(transport=transport, base_url="http://test") as client:
|
||||
response = await client.get(
|
||||
"/api/listing_geojson",
|
||||
params={"listing_type": "RENT"},
|
||||
)
|
||||
# Should return 401 or 403 without valid auth
|
||||
assert response.status_code in (401, 403)
|
||||
|
||||
async def test_listing_geojson_with_filters(
|
||||
self, async_client: AsyncClient
|
||||
) -> None:
|
||||
"""Test that listing_geojson accepts filter parameters."""
|
||||
with patch(
|
||||
"api.app.export_immoweb",
|
||||
new_callable=AsyncMock,
|
||||
return_value={"type": "FeatureCollection", "features": []},
|
||||
):
|
||||
response = await async_client.get(
|
||||
"/api/listing_geojson",
|
||||
params={
|
||||
"listing_type": "RENT",
|
||||
"min_bedrooms": 2,
|
||||
"max_bedrooms": 3,
|
||||
"min_price": 1500,
|
||||
"max_price": 3000,
|
||||
},
|
||||
)
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert data["type"] == "FeatureCollection"
|
||||
|
||||
|
||||
class TestGetDistrictsEndpoint:
|
||||
"""Tests for the /api/get_districts endpoint."""
|
||||
|
||||
async def test_get_districts_requires_auth(self) -> None:
|
||||
"""Test that get_districts endpoint requires authentication."""
|
||||
from api.app import app
|
||||
from httpx import ASGITransport, AsyncClient
|
||||
|
||||
# Clear any dependency overrides to test auth requirement
|
||||
app.dependency_overrides.clear()
|
||||
|
||||
transport = ASGITransport(app=app)
|
||||
async with AsyncClient(transport=transport, base_url="http://test") as client:
|
||||
response = await client.get("/api/get_districts")
|
||||
# Should return 401 or 403 without valid auth
|
||||
assert response.status_code in (401, 403)
|
||||
|
||||
async def test_get_districts_returns_dict(
|
||||
self, async_client: AsyncClient
|
||||
) -> None:
|
||||
"""Test that get_districts returns a dictionary of districts."""
|
||||
response = await async_client.get("/api/get_districts")
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert isinstance(data, dict)
|
||||
# Check some known districts exist
|
||||
assert "London" in data
|
||||
assert "Westminster" in data
|
||||
assert "Camden" in data
|
||||
|
||||
async def test_get_districts_values_are_region_ids(
|
||||
self, async_client: AsyncClient
|
||||
) -> None:
|
||||
"""Test that district values are REGION identifiers."""
|
||||
response = await async_client.get("/api/get_districts")
|
||||
data = response.json()
|
||||
# All values should be REGION^... format
|
||||
for district_name, region_id in data.items():
|
||||
assert region_id.startswith("REGION^"), (
|
||||
f"District {district_name} has invalid region ID: {region_id}"
|
||||
)
|
||||
|
||||
|
||||
class TestRefreshListingsEndpoint:
|
||||
"""Tests for the /api/refresh_listings endpoint."""
|
||||
|
||||
async def test_refresh_listings_requires_auth(self) -> None:
|
||||
"""Test that refresh_listings endpoint requires authentication."""
|
||||
from api.app import app
|
||||
from httpx import ASGITransport, AsyncClient
|
||||
|
||||
# Clear any dependency overrides to test auth requirement
|
||||
app.dependency_overrides.clear()
|
||||
|
||||
transport = ASGITransport(app=app)
|
||||
async with AsyncClient(transport=transport, base_url="http://test") as client:
|
||||
response = await client.post(
|
||||
"/api/refresh_listings",
|
||||
params={"listing_type": "RENT"},
|
||||
)
|
||||
# Should return 401 or 403 without valid auth
|
||||
assert response.status_code in (401, 403)
|
||||
|
||||
|
||||
class TestTaskStatusEndpoint:
|
||||
"""Tests for the /api/task_status endpoint."""
|
||||
|
||||
async def test_task_status_requires_auth(self) -> None:
|
||||
"""Test that task_status endpoint requires authentication."""
|
||||
from api.app import app
|
||||
from httpx import ASGITransport, AsyncClient
|
||||
|
||||
# Clear any dependency overrides to test auth requirement
|
||||
app.dependency_overrides.clear()
|
||||
|
||||
transport = ASGITransport(app=app)
|
||||
async with AsyncClient(transport=transport, base_url="http://test") as client:
|
||||
response = await client.get(
|
||||
"/api/task_status",
|
||||
params={"task_id": "test-task-id"},
|
||||
)
|
||||
# Should return 401 or 403 without valid auth
|
||||
assert response.status_code in (401, 403)
|
||||
299
crawler/tests/test_listing_geojson.py
Normal file
299
crawler/tests/test_listing_geojson.py
Normal file
|
|
@ -0,0 +1,299 @@
|
|||
"""Tests for the listing_geojson API endpoint and QueryParameters parsing."""
|
||||
import json
|
||||
import pytest
|
||||
from datetime import datetime
|
||||
from unittest.mock import patch, MagicMock, AsyncMock
|
||||
|
||||
|
||||
class TestQueryParametersModel:
|
||||
"""Test QueryParameters model directly."""
|
||||
|
||||
def test_datetime_parsing_z_suffix(self):
|
||||
"""Test that datetime with Z suffix is parsed correctly."""
|
||||
from models.listing import QueryParameters, ListingType
|
||||
|
||||
params = QueryParameters(
|
||||
listing_type=ListingType.RENT,
|
||||
let_date_available_from="2026-02-01T11:33:01.248Z",
|
||||
)
|
||||
assert params.let_date_available_from is not None
|
||||
assert params.let_date_available_from.year == 2026
|
||||
|
||||
def test_datetime_parsing_offset(self):
|
||||
"""Test that datetime with offset is parsed correctly."""
|
||||
from models.listing import QueryParameters, ListingType
|
||||
|
||||
params = QueryParameters(
|
||||
listing_type=ListingType.RENT,
|
||||
let_date_available_from="2026-02-01T11:33:01.248+00:00",
|
||||
)
|
||||
assert params.let_date_available_from is not None
|
||||
|
||||
def test_defaults_work(self):
|
||||
"""Test that default values are applied correctly."""
|
||||
from models.listing import QueryParameters, ListingType
|
||||
|
||||
params = QueryParameters(listing_type=ListingType.RENT)
|
||||
assert params.min_bedrooms == 1
|
||||
assert params.max_bedrooms == 999
|
||||
assert params.min_price == 0
|
||||
assert params.max_price == 10_000_000
|
||||
assert params.district_names == set()
|
||||
assert params.let_date_available_from is None
|
||||
|
||||
def test_full_frontend_params(self):
|
||||
"""Test with all parameters as sent by frontend."""
|
||||
from models.listing import QueryParameters, ListingType
|
||||
|
||||
params = QueryParameters(
|
||||
listing_type=ListingType.RENT,
|
||||
min_bedrooms=1,
|
||||
max_bedrooms=3,
|
||||
max_price=3000,
|
||||
min_price=2000,
|
||||
min_sqm=50,
|
||||
last_seen_days=28,
|
||||
let_date_available_from="2026-02-01T11:19:22.072Z",
|
||||
)
|
||||
assert params.listing_type == ListingType.RENT
|
||||
assert params.min_bedrooms == 1
|
||||
assert params.max_bedrooms == 3
|
||||
assert params.min_sqm == 50
|
||||
|
||||
|
||||
class TestGetQueryParametersDependency:
|
||||
"""Test the get_query_parameters FastAPI dependency."""
|
||||
|
||||
def test_parses_datetime_correctly(self):
|
||||
"""Test that the dependency parses datetime Z suffix."""
|
||||
from api.app import get_query_parameters
|
||||
from models.listing import ListingType
|
||||
|
||||
params = get_query_parameters(
|
||||
listing_type=ListingType.RENT,
|
||||
let_date_available_from=datetime(2026, 2, 1, 11, 33, 1),
|
||||
)
|
||||
assert params.let_date_available_from is not None
|
||||
|
||||
def test_defaults_applied(self):
|
||||
"""Test that defaults are applied when not provided."""
|
||||
from api.app import get_query_parameters
|
||||
from models.listing import ListingType
|
||||
|
||||
params = get_query_parameters(listing_type=ListingType.RENT)
|
||||
assert params.min_bedrooms == 1
|
||||
assert params.max_bedrooms == 999
|
||||
|
||||
|
||||
class TestListingGeoJsonEndpoint:
|
||||
"""Test the /api/listing_geojson endpoint."""
|
||||
|
||||
@pytest.fixture
|
||||
def client(self):
|
||||
"""Create test client with mocked auth."""
|
||||
from fastapi.testclient import TestClient
|
||||
from api.app import app, get_current_user
|
||||
from api.auth import User
|
||||
|
||||
# Override auth dependency
|
||||
async def mock_auth():
|
||||
return User(email="test@example.com", name="Test User")
|
||||
|
||||
app.dependency_overrides[get_current_user] = mock_auth
|
||||
yield TestClient(app)
|
||||
app.dependency_overrides.clear()
|
||||
|
||||
@pytest.fixture
|
||||
def mock_export(self):
|
||||
"""Mock the export service."""
|
||||
with patch("api.app.export_service.export_to_geojson") as mock:
|
||||
mock.return_value = MagicMock(
|
||||
data={"type": "FeatureCollection", "features": [{"type": "Feature"}]}
|
||||
)
|
||||
yield mock
|
||||
|
||||
def test_minimal_params_no_422(self, client, mock_export):
|
||||
"""Test that minimal params don't cause 422."""
|
||||
response = client.get("/api/listing_geojson?listing_type=RENT")
|
||||
assert response.status_code != 422, f"Got 422: {response.json()}"
|
||||
|
||||
def test_with_datetime_z_suffix_no_422(self, client, mock_export):
|
||||
"""Test datetime parsing with Z suffix doesn't cause 422."""
|
||||
response = client.get(
|
||||
"/api/listing_geojson?"
|
||||
"listing_type=RENT"
|
||||
"&let_date_available_from=2026-02-01T11:33:01.248Z"
|
||||
)
|
||||
assert response.status_code != 422, f"Got 422: {response.json()}"
|
||||
|
||||
def test_full_frontend_params_no_422(self, client, mock_export):
|
||||
"""Test with all parameters as sent by frontend."""
|
||||
response = client.get(
|
||||
"/api/listing_geojson?"
|
||||
"listing_type=RENT"
|
||||
"&min_bedrooms=1"
|
||||
"&max_bedrooms=3"
|
||||
"&max_price=3000"
|
||||
"&min_price=2000"
|
||||
"&min_sqm=50"
|
||||
"&last_seen_days=28"
|
||||
"&let_date_available_from=2026-02-01T11:19:22.072Z"
|
||||
)
|
||||
assert response.status_code != 422, f"Got 422: {response.json()}"
|
||||
|
||||
def test_returns_geojson_structure(self, client, mock_export):
|
||||
"""Test that endpoint returns proper GeoJSON structure."""
|
||||
response = client.get("/api/listing_geojson?listing_type=RENT")
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
assert "type" in data
|
||||
assert data["type"] == "FeatureCollection"
|
||||
assert "features" in data
|
||||
|
||||
|
||||
class TestStreamingEndpoint:
|
||||
"""Test the /api/listing_geojson/stream endpoint."""
|
||||
|
||||
@pytest.fixture
|
||||
def client(self):
|
||||
"""Create test client with mocked auth."""
|
||||
from fastapi.testclient import TestClient
|
||||
from api.app import app
|
||||
from api.auth import get_current_user, User
|
||||
|
||||
async def mock_auth():
|
||||
return User(sub="test-id", email="test@example.com", name="Test User")
|
||||
|
||||
app.dependency_overrides[get_current_user] = mock_auth
|
||||
yield TestClient(app)
|
||||
app.dependency_overrides.clear()
|
||||
|
||||
@pytest.fixture
|
||||
def mock_repository(self):
|
||||
"""Mock the repository methods."""
|
||||
with patch("api.app.ListingRepository") as MockRepo:
|
||||
mock_instance = MagicMock()
|
||||
mock_instance.count_listings.return_value = 3
|
||||
mock_instance.stream_listings_optimized.return_value = iter([
|
||||
{
|
||||
'id': 1,
|
||||
'price': 2000.0,
|
||||
'number_of_bedrooms': 2,
|
||||
'square_meters': 50.0,
|
||||
'longitude': -0.1,
|
||||
'latitude': 51.5,
|
||||
'photo_thumbnail': 'https://example.com/1.jpg',
|
||||
'last_seen': datetime.now(),
|
||||
'agency': 'Test Agency',
|
||||
'price_history_json': '[]',
|
||||
'available_from': datetime.now(),
|
||||
},
|
||||
{
|
||||
'id': 2,
|
||||
'price': 2500.0,
|
||||
'number_of_bedrooms': 2,
|
||||
'square_meters': 60.0,
|
||||
'longitude': -0.12,
|
||||
'latitude': 51.51,
|
||||
'photo_thumbnail': 'https://example.com/2.jpg',
|
||||
'last_seen': datetime.now(),
|
||||
'agency': 'Test Agency 2',
|
||||
'price_history_json': '[]',
|
||||
'available_from': None,
|
||||
},
|
||||
{
|
||||
'id': 3,
|
||||
'price': 3000.0,
|
||||
'number_of_bedrooms': 3,
|
||||
'square_meters': None,
|
||||
'longitude': -0.14,
|
||||
'latitude': 51.52,
|
||||
'photo_thumbnail': None,
|
||||
'last_seen': datetime.now(),
|
||||
'agency': None,
|
||||
'price_history_json': '[{"first_seen": "2026-01-01", "last_seen": "2026-01-15", "price": 2800}]',
|
||||
'available_from': None,
|
||||
},
|
||||
])
|
||||
MockRepo.return_value = mock_instance
|
||||
yield mock_instance
|
||||
|
||||
def test_streaming_returns_ndjson(self, client, mock_repository):
|
||||
"""Test that streaming endpoint returns NDJSON format."""
|
||||
response = client.get("/api/listing_geojson/stream?listing_type=RENT&limit=10")
|
||||
assert response.status_code == 200
|
||||
assert response.headers["content-type"] == "application/x-ndjson"
|
||||
|
||||
def test_streaming_metadata_includes_total_expected(self, client, mock_repository):
|
||||
"""Test that first line includes total_expected count."""
|
||||
response = client.get("/api/listing_geojson/stream?listing_type=RENT&limit=10")
|
||||
lines = response.text.strip().split("\n")
|
||||
assert len(lines) >= 1
|
||||
|
||||
metadata = json.loads(lines[0])
|
||||
assert metadata["type"] == "metadata"
|
||||
assert "total_expected" in metadata
|
||||
assert metadata["total_expected"] == 3
|
||||
assert "batch_size" in metadata
|
||||
|
||||
def test_streaming_returns_batches_and_complete(self, client, mock_repository):
|
||||
"""Test that streaming returns batch and complete messages."""
|
||||
response = client.get("/api/listing_geojson/stream?listing_type=RENT&limit=10")
|
||||
lines = response.text.strip().split("\n")
|
||||
|
||||
# Parse all lines
|
||||
messages = [json.loads(line) for line in lines]
|
||||
|
||||
# First should be metadata
|
||||
assert messages[0]["type"] == "metadata"
|
||||
|
||||
# Should have at least one batch
|
||||
batch_messages = [m for m in messages if m["type"] == "batch"]
|
||||
assert len(batch_messages) >= 1
|
||||
|
||||
# Last should be complete
|
||||
assert messages[-1]["type"] == "complete"
|
||||
assert "total" in messages[-1]
|
||||
|
||||
def test_streaming_features_have_correct_structure(self, client, mock_repository):
|
||||
"""Test that streamed features have correct GeoJSON structure."""
|
||||
response = client.get("/api/listing_geojson/stream?listing_type=RENT&batch_size=10&limit=10")
|
||||
lines = response.text.strip().split("\n")
|
||||
messages = [json.loads(line) for line in lines]
|
||||
|
||||
batch_messages = [m for m in messages if m["type"] == "batch"]
|
||||
assert len(batch_messages) >= 1
|
||||
|
||||
features = batch_messages[0]["features"]
|
||||
assert len(features) > 0
|
||||
|
||||
feature = features[0]
|
||||
assert feature["type"] == "Feature"
|
||||
assert "properties" in feature
|
||||
assert "geometry" in feature
|
||||
assert feature["geometry"]["type"] == "Point"
|
||||
assert "coordinates" in feature["geometry"]
|
||||
|
||||
# Check properties
|
||||
props = feature["properties"]
|
||||
assert "total_price" in props
|
||||
assert "rooms" in props
|
||||
assert "url" in props
|
||||
assert "last_seen" in props
|
||||
|
||||
def test_streaming_handles_null_square_meters(self, client, mock_repository):
|
||||
"""Test that null square_meters doesn't cause errors."""
|
||||
response = client.get("/api/listing_geojson/stream?listing_type=RENT&batch_size=10&limit=10")
|
||||
assert response.status_code == 200
|
||||
|
||||
lines = response.text.strip().split("\n")
|
||||
messages = [json.loads(line) for line in lines]
|
||||
|
||||
# Find feature with id=3 (has null square_meters)
|
||||
for msg in messages:
|
||||
if msg["type"] == "batch":
|
||||
for feature in msg["features"]:
|
||||
if feature["properties"]["url"].endswith("/3"):
|
||||
assert feature["properties"]["qm"] is None
|
||||
assert feature["properties"]["qmprice"] is None
|
||||
|
||||
1
crawler/tests/unit/__init__.py
Normal file
1
crawler/tests/unit/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
|||
# Unit tests package
|
||||
343
crawler/tests/unit/test_models.py
Normal file
343
crawler/tests/unit/test_models.py
Normal file
|
|
@ -0,0 +1,343 @@
|
|||
"""Unit tests for Listing models."""
|
||||
from datetime import datetime
|
||||
import json
|
||||
import pytest
|
||||
|
||||
from models.listing import (
|
||||
BuyListing,
|
||||
FurnishType,
|
||||
ListingSite,
|
||||
PriceHistoryItem,
|
||||
RentListing,
|
||||
Listing,
|
||||
)
|
||||
|
||||
|
||||
class TestListing:
|
||||
"""Tests for the base Listing model."""
|
||||
|
||||
def test_price_per_square_meter_calculation(self) -> None:
|
||||
"""Test that price_per_square_meter is calculated correctly."""
|
||||
listing = RentListing(
|
||||
id=1,
|
||||
price=2000.0,
|
||||
number_of_bedrooms=2,
|
||||
square_meters=50.0,
|
||||
agency="Test",
|
||||
council_tax_band="C",
|
||||
longitude=0.0,
|
||||
latitude=0.0,
|
||||
price_history_json="[]",
|
||||
listing_site=ListingSite.RIGHTMOVE,
|
||||
last_seen=datetime.now(),
|
||||
photo_thumbnail=None,
|
||||
floorplan_image_paths=[],
|
||||
additional_info={"property": {"visible": True}},
|
||||
routing_info_json=None,
|
||||
furnish_type=FurnishType.FURNISHED,
|
||||
available_from=None,
|
||||
)
|
||||
assert listing.price_per_square_meter == 40.0
|
||||
|
||||
def test_price_per_square_meter_none_when_no_sqm(self) -> None:
|
||||
"""Test that price_per_square_meter is None when square_meters is None."""
|
||||
listing = RentListing(
|
||||
id=1,
|
||||
price=2000.0,
|
||||
number_of_bedrooms=2,
|
||||
square_meters=None,
|
||||
agency="Test",
|
||||
council_tax_band="C",
|
||||
longitude=0.0,
|
||||
latitude=0.0,
|
||||
price_history_json="[]",
|
||||
listing_site=ListingSite.RIGHTMOVE,
|
||||
last_seen=datetime.now(),
|
||||
photo_thumbnail=None,
|
||||
floorplan_image_paths=[],
|
||||
additional_info={"property": {"visible": True}},
|
||||
routing_info_json=None,
|
||||
furnish_type=FurnishType.FURNISHED,
|
||||
available_from=None,
|
||||
)
|
||||
assert listing.price_per_square_meter is None
|
||||
|
||||
def test_price_per_square_meter_none_when_sqm_zero(self) -> None:
|
||||
"""Test that price_per_square_meter is None when square_meters is 0."""
|
||||
listing = RentListing(
|
||||
id=1,
|
||||
price=2000.0,
|
||||
number_of_bedrooms=2,
|
||||
square_meters=0.0,
|
||||
agency="Test",
|
||||
council_tax_band="C",
|
||||
longitude=0.0,
|
||||
latitude=0.0,
|
||||
price_history_json="[]",
|
||||
listing_site=ListingSite.RIGHTMOVE,
|
||||
last_seen=datetime.now(),
|
||||
photo_thumbnail=None,
|
||||
floorplan_image_paths=[],
|
||||
additional_info={"property": {"visible": True}},
|
||||
routing_info_json=None,
|
||||
furnish_type=FurnishType.FURNISHED,
|
||||
available_from=None,
|
||||
)
|
||||
assert listing.price_per_square_meter is None
|
||||
|
||||
def test_url_property(self) -> None:
|
||||
"""Test that url property returns correct Rightmove URL."""
|
||||
listing = RentListing(
|
||||
id=123456789,
|
||||
price=2000.0,
|
||||
number_of_bedrooms=2,
|
||||
square_meters=50.0,
|
||||
agency="Test",
|
||||
council_tax_band="C",
|
||||
longitude=0.0,
|
||||
latitude=0.0,
|
||||
price_history_json="[]",
|
||||
listing_site=ListingSite.RIGHTMOVE,
|
||||
last_seen=datetime.now(),
|
||||
photo_thumbnail=None,
|
||||
floorplan_image_paths=[],
|
||||
additional_info={"property": {"visible": True}},
|
||||
routing_info_json=None,
|
||||
furnish_type=FurnishType.FURNISHED,
|
||||
available_from=None,
|
||||
)
|
||||
assert listing.url == "https://www.rightmove.co.uk/properties/123456789"
|
||||
|
||||
def test_is_removed_property_visible(self) -> None:
|
||||
"""Test that is_removed returns False when property is visible."""
|
||||
listing = RentListing(
|
||||
id=1,
|
||||
price=2000.0,
|
||||
number_of_bedrooms=2,
|
||||
square_meters=50.0,
|
||||
agency="Test",
|
||||
council_tax_band="C",
|
||||
longitude=0.0,
|
||||
latitude=0.0,
|
||||
price_history_json="[]",
|
||||
listing_site=ListingSite.RIGHTMOVE,
|
||||
last_seen=datetime.now(),
|
||||
photo_thumbnail=None,
|
||||
floorplan_image_paths=[],
|
||||
additional_info={"property": {"visible": True}},
|
||||
routing_info_json=None,
|
||||
furnish_type=FurnishType.FURNISHED,
|
||||
available_from=None,
|
||||
)
|
||||
assert listing.is_removed is False
|
||||
|
||||
def test_is_removed_property_not_visible(self) -> None:
|
||||
"""Test that is_removed returns True when property is not visible."""
|
||||
listing = RentListing(
|
||||
id=1,
|
||||
price=2000.0,
|
||||
number_of_bedrooms=2,
|
||||
square_meters=50.0,
|
||||
agency="Test",
|
||||
council_tax_band="C",
|
||||
longitude=0.0,
|
||||
latitude=0.0,
|
||||
price_history_json="[]",
|
||||
listing_site=ListingSite.RIGHTMOVE,
|
||||
last_seen=datetime.now(),
|
||||
photo_thumbnail=None,
|
||||
floorplan_image_paths=[],
|
||||
additional_info={"property": {"visible": False}},
|
||||
routing_info_json=None,
|
||||
furnish_type=FurnishType.FURNISHED,
|
||||
available_from=None,
|
||||
)
|
||||
assert listing.is_removed is True
|
||||
|
||||
|
||||
class TestPriceHistory:
|
||||
"""Tests for price history serialization/deserialization."""
|
||||
|
||||
def test_price_history_serialization_roundtrip(self) -> None:
|
||||
"""Test that price history can be serialized and deserialized."""
|
||||
now = datetime.now()
|
||||
price_history = [
|
||||
PriceHistoryItem(
|
||||
first_seen=now,
|
||||
last_seen=now,
|
||||
price=2000.0,
|
||||
),
|
||||
PriceHistoryItem(
|
||||
first_seen=now,
|
||||
last_seen=now,
|
||||
price=2100.0,
|
||||
),
|
||||
]
|
||||
|
||||
# Serialize
|
||||
serialized = Listing.serialize_price_history(price_history)
|
||||
assert isinstance(serialized, str)
|
||||
|
||||
# Create listing with serialized history
|
||||
listing = RentListing(
|
||||
id=1,
|
||||
price=2100.0,
|
||||
number_of_bedrooms=2,
|
||||
square_meters=50.0,
|
||||
agency="Test",
|
||||
council_tax_band="C",
|
||||
longitude=0.0,
|
||||
latitude=0.0,
|
||||
price_history_json=serialized,
|
||||
listing_site=ListingSite.RIGHTMOVE,
|
||||
last_seen=now,
|
||||
photo_thumbnail=None,
|
||||
floorplan_image_paths=[],
|
||||
additional_info={"property": {"visible": True}},
|
||||
routing_info_json=None,
|
||||
furnish_type=FurnishType.FURNISHED,
|
||||
available_from=None,
|
||||
)
|
||||
|
||||
# Deserialize
|
||||
deserialized = listing.price_history
|
||||
assert len(deserialized) == 2
|
||||
assert deserialized[0].price == 2000.0
|
||||
assert deserialized[1].price == 2100.0
|
||||
|
||||
def test_price_history_empty(self) -> None:
|
||||
"""Test that empty price history works correctly."""
|
||||
listing = RentListing(
|
||||
id=1,
|
||||
price=2000.0,
|
||||
number_of_bedrooms=2,
|
||||
square_meters=50.0,
|
||||
agency="Test",
|
||||
council_tax_band="C",
|
||||
longitude=0.0,
|
||||
latitude=0.0,
|
||||
price_history_json="",
|
||||
listing_site=ListingSite.RIGHTMOVE,
|
||||
last_seen=datetime.now(),
|
||||
photo_thumbnail=None,
|
||||
floorplan_image_paths=[],
|
||||
additional_info={"property": {"visible": True}},
|
||||
routing_info_json=None,
|
||||
furnish_type=FurnishType.FURNISHED,
|
||||
available_from=None,
|
||||
)
|
||||
assert listing.price_history == []
|
||||
|
||||
def test_price_history_item_to_dict(self) -> None:
|
||||
"""Test PriceHistoryItem.to_dict() method."""
|
||||
now = datetime.now()
|
||||
item = PriceHistoryItem(
|
||||
first_seen=now,
|
||||
last_seen=now,
|
||||
price=2500.0,
|
||||
)
|
||||
result = item.to_dict()
|
||||
assert result["price"] == 2500.0
|
||||
assert result["first_seen"] == now.isoformat()
|
||||
assert result["last_seen"] == now.isoformat()
|
||||
|
||||
|
||||
class TestRentListing:
|
||||
"""Tests specific to RentListing model."""
|
||||
|
||||
def test_rent_listing_has_furnish_type(self) -> None:
|
||||
"""Test that RentListing has furnish_type field."""
|
||||
listing = RentListing(
|
||||
id=1,
|
||||
price=2000.0,
|
||||
number_of_bedrooms=2,
|
||||
square_meters=50.0,
|
||||
agency="Test",
|
||||
council_tax_band="C",
|
||||
longitude=0.0,
|
||||
latitude=0.0,
|
||||
price_history_json="[]",
|
||||
listing_site=ListingSite.RIGHTMOVE,
|
||||
last_seen=datetime.now(),
|
||||
photo_thumbnail=None,
|
||||
floorplan_image_paths=[],
|
||||
additional_info={"property": {"visible": True}},
|
||||
routing_info_json=None,
|
||||
furnish_type=FurnishType.PART_FURNISHED,
|
||||
available_from=None,
|
||||
)
|
||||
assert listing.furnish_type == FurnishType.PART_FURNISHED
|
||||
|
||||
def test_rent_listing_has_available_from(self) -> None:
|
||||
"""Test that RentListing has available_from field."""
|
||||
now = datetime.now()
|
||||
listing = RentListing(
|
||||
id=1,
|
||||
price=2000.0,
|
||||
number_of_bedrooms=2,
|
||||
square_meters=50.0,
|
||||
agency="Test",
|
||||
council_tax_band="C",
|
||||
longitude=0.0,
|
||||
latitude=0.0,
|
||||
price_history_json="[]",
|
||||
listing_site=ListingSite.RIGHTMOVE,
|
||||
last_seen=now,
|
||||
photo_thumbnail=None,
|
||||
floorplan_image_paths=[],
|
||||
additional_info={"property": {"visible": True}},
|
||||
routing_info_json=None,
|
||||
furnish_type=FurnishType.FURNISHED,
|
||||
available_from=now,
|
||||
)
|
||||
assert listing.available_from == now
|
||||
|
||||
|
||||
class TestBuyListing:
|
||||
"""Tests specific to BuyListing model."""
|
||||
|
||||
def test_buy_listing_has_service_charge(self) -> None:
|
||||
"""Test that BuyListing has service_charge field."""
|
||||
listing = BuyListing(
|
||||
id=1,
|
||||
price=450000.0,
|
||||
number_of_bedrooms=3,
|
||||
square_meters=95.0,
|
||||
agency="Test",
|
||||
council_tax_band="D",
|
||||
longitude=0.0,
|
||||
latitude=0.0,
|
||||
price_history_json="[]",
|
||||
listing_site=ListingSite.RIGHTMOVE,
|
||||
last_seen=datetime.now(),
|
||||
photo_thumbnail=None,
|
||||
floorplan_image_paths=[],
|
||||
additional_info={"property": {"visible": True}},
|
||||
routing_info_json=None,
|
||||
service_charge=2500.0,
|
||||
lease_left=85,
|
||||
)
|
||||
assert listing.service_charge == 2500.0
|
||||
|
||||
def test_buy_listing_has_lease_left(self) -> None:
|
||||
"""Test that BuyListing has lease_left field."""
|
||||
listing = BuyListing(
|
||||
id=1,
|
||||
price=450000.0,
|
||||
number_of_bedrooms=3,
|
||||
square_meters=95.0,
|
||||
agency="Test",
|
||||
council_tax_band="D",
|
||||
longitude=0.0,
|
||||
latitude=0.0,
|
||||
price_history_json="[]",
|
||||
listing_site=ListingSite.RIGHTMOVE,
|
||||
last_seen=datetime.now(),
|
||||
photo_thumbnail=None,
|
||||
floorplan_image_paths=[],
|
||||
additional_info={"property": {"visible": True}},
|
||||
routing_info_json=None,
|
||||
service_charge=None,
|
||||
lease_left=120,
|
||||
)
|
||||
assert listing.lease_left == 120
|
||||
74
crawler/tests/unit/test_redis_lock.py
Normal file
74
crawler/tests/unit/test_redis_lock.py
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
"""Unit tests for Redis distributed lock."""
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
|
||||
from utils.redis_lock import redis_lock, get_redis_client
|
||||
|
||||
|
||||
class TestRedisLock:
|
||||
"""Tests for redis_lock context manager."""
|
||||
|
||||
@mock.patch("utils.redis_lock.get_redis_client")
|
||||
def test_lock_acquired_successfully(self, mock_get_client):
|
||||
"""Test lock acquisition when no other lock exists."""
|
||||
mock_client = mock.MagicMock()
|
||||
mock_client.set.return_value = True
|
||||
mock_get_client.return_value = mock_client
|
||||
|
||||
with redis_lock("test_lock") as acquired:
|
||||
assert acquired is True
|
||||
|
||||
mock_client.set.assert_called_once_with("lock:test_lock", "1", nx=True, ex=3600 * 4)
|
||||
mock_client.delete.assert_called_once_with("lock:test_lock")
|
||||
|
||||
@mock.patch("utils.redis_lock.get_redis_client")
|
||||
def test_lock_not_acquired(self, mock_get_client):
|
||||
"""Test lock not acquired when another lock exists."""
|
||||
mock_client = mock.MagicMock()
|
||||
mock_client.set.return_value = None # Redis returns None when nx=True fails
|
||||
mock_get_client.return_value = mock_client
|
||||
|
||||
with redis_lock("test_lock") as acquired:
|
||||
assert acquired is False
|
||||
|
||||
mock_client.set.assert_called_once_with("lock:test_lock", "1", nx=True, ex=3600 * 4)
|
||||
# Should NOT call delete since we didn't acquire the lock
|
||||
mock_client.delete.assert_not_called()
|
||||
|
||||
@mock.patch("utils.redis_lock.get_redis_client")
|
||||
def test_lock_released_on_exception(self, mock_get_client):
|
||||
"""Test lock is released even when exception occurs."""
|
||||
mock_client = mock.MagicMock()
|
||||
mock_client.set.return_value = True
|
||||
mock_get_client.return_value = mock_client
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
with redis_lock("test_lock") as acquired:
|
||||
assert acquired is True
|
||||
raise ValueError("Test error")
|
||||
|
||||
# Lock should still be released
|
||||
mock_client.delete.assert_called_once_with("lock:test_lock")
|
||||
|
||||
@mock.patch("utils.redis_lock.get_redis_client")
|
||||
def test_custom_timeout(self, mock_get_client):
|
||||
"""Test lock with custom timeout."""
|
||||
mock_client = mock.MagicMock()
|
||||
mock_client.set.return_value = True
|
||||
mock_get_client.return_value = mock_client
|
||||
|
||||
with redis_lock("test_lock", timeout=300) as acquired:
|
||||
assert acquired is True
|
||||
|
||||
mock_client.set.assert_called_once_with("lock:test_lock", "1", nx=True, ex=300)
|
||||
|
||||
@mock.patch("utils.redis_lock.redis")
|
||||
def test_get_redis_client_uses_broker_url(self, mock_redis):
|
||||
"""Test Redis client is created from CELERY_BROKER_URL."""
|
||||
with mock.patch.dict("os.environ", {"CELERY_BROKER_URL": "redis://testhost:1234/5"}):
|
||||
get_redis_client()
|
||||
|
||||
mock_redis.from_url.assert_called_once_with(
|
||||
"redis://testhost:1234/5", decode_responses=True
|
||||
)
|
||||
227
crawler/tests/unit/test_repository.py
Normal file
227
crawler/tests/unit/test_repository.py
Normal file
|
|
@ -0,0 +1,227 @@
|
|||
"""Unit tests for ListingRepository."""
|
||||
from datetime import datetime, timedelta
|
||||
import pytest
|
||||
from sqlalchemy import Engine
|
||||
|
||||
from models.listing import (
|
||||
FurnishType,
|
||||
ListingType,
|
||||
QueryParameters,
|
||||
RentListing,
|
||||
)
|
||||
from repositories.listing_repository import ListingRepository
|
||||
|
||||
|
||||
class TestListingRepository:
|
||||
"""Tests for ListingRepository methods."""
|
||||
|
||||
async def test_get_listings_empty_db(
|
||||
self, listing_repository: ListingRepository
|
||||
) -> None:
|
||||
"""Test that get_listings returns empty list for empty database."""
|
||||
listings = await listing_repository.get_listings()
|
||||
assert listings == []
|
||||
|
||||
async def test_get_listings_returns_inserted_listings(
|
||||
self,
|
||||
listing_repository: ListingRepository,
|
||||
sample_rent_listing: RentListing,
|
||||
) -> None:
|
||||
"""Test that get_listings returns listings that were inserted."""
|
||||
await listing_repository.upsert_listings([sample_rent_listing])
|
||||
listings = await listing_repository.get_listings()
|
||||
assert len(listings) == 1
|
||||
assert listings[0].id == sample_rent_listing.id
|
||||
|
||||
async def test_upsert_listings_creates_new(
|
||||
self,
|
||||
listing_repository: ListingRepository,
|
||||
sample_rent_listing: RentListing,
|
||||
) -> None:
|
||||
"""Test that upsert_listings creates new listings."""
|
||||
result = await listing_repository.upsert_listings([sample_rent_listing])
|
||||
assert len(result) == 1
|
||||
assert result[0].id == sample_rent_listing.id
|
||||
|
||||
# Verify it's in the database
|
||||
listings = await listing_repository.get_listings()
|
||||
assert len(listings) == 1
|
||||
|
||||
async def test_upsert_listings_updates_existing(
|
||||
self,
|
||||
listing_repository: ListingRepository,
|
||||
sample_rent_listing: RentListing,
|
||||
) -> None:
|
||||
"""Test that upsert_listings updates existing listings."""
|
||||
# Insert initial listing
|
||||
await listing_repository.upsert_listings([sample_rent_listing])
|
||||
|
||||
# Update the listing
|
||||
sample_rent_listing.price = 3000.0
|
||||
await listing_repository.upsert_listings([sample_rent_listing])
|
||||
|
||||
# Verify update
|
||||
listings = await listing_repository.get_listings()
|
||||
assert len(listings) == 1
|
||||
assert listings[0].price == 3000.0
|
||||
|
||||
async def test_mark_seen_updates_timestamp(
|
||||
self,
|
||||
listing_repository: ListingRepository,
|
||||
sample_rent_listing: RentListing,
|
||||
) -> None:
|
||||
"""Test that mark_seen updates the last_seen timestamp."""
|
||||
# Set an old timestamp
|
||||
old_time = datetime.now() - timedelta(days=7)
|
||||
sample_rent_listing.last_seen = old_time
|
||||
await listing_repository.upsert_listings([sample_rent_listing])
|
||||
|
||||
# Mark as seen
|
||||
await listing_repository.mark_seen(sample_rent_listing.id)
|
||||
|
||||
# Verify timestamp was updated
|
||||
listings = await listing_repository.get_listings()
|
||||
assert len(listings) == 1
|
||||
assert listings[0].last_seen > old_time
|
||||
|
||||
async def test_mark_seen_nonexistent_listing(
|
||||
self, listing_repository: ListingRepository
|
||||
) -> None:
|
||||
"""Test that mark_seen handles nonexistent listings gracefully."""
|
||||
# Should not raise an exception
|
||||
await listing_repository.mark_seen(999999)
|
||||
|
||||
async def test_get_listings_with_only_ids(
|
||||
self,
|
||||
listing_repository: ListingRepository,
|
||||
sample_rent_listings: list[RentListing],
|
||||
) -> None:
|
||||
"""Test that get_listings filters by only_ids."""
|
||||
await listing_repository.upsert_listings(sample_rent_listings)
|
||||
|
||||
# Request only specific IDs
|
||||
listings = await listing_repository.get_listings(only_ids=[1, 3])
|
||||
assert len(listings) == 2
|
||||
listing_ids = [l.id for l in listings]
|
||||
assert 1 in listing_ids
|
||||
assert 3 in listing_ids
|
||||
assert 2 not in listing_ids
|
||||
|
||||
async def test_get_listings_with_limit(
|
||||
self,
|
||||
listing_repository: ListingRepository,
|
||||
sample_rent_listings: list[RentListing],
|
||||
) -> None:
|
||||
"""Test that get_listings respects limit parameter."""
|
||||
await listing_repository.upsert_listings(sample_rent_listings)
|
||||
|
||||
listings = await listing_repository.get_listings(limit=2)
|
||||
assert len(listings) == 2
|
||||
|
||||
|
||||
class TestListingRepositoryFilters:
|
||||
"""Tests for ListingRepository query parameter filtering."""
|
||||
|
||||
async def test_filter_by_bedrooms(
|
||||
self,
|
||||
listing_repository: ListingRepository,
|
||||
sample_rent_listings: list[RentListing],
|
||||
) -> None:
|
||||
"""Test filtering by bedroom count."""
|
||||
await listing_repository.upsert_listings(sample_rent_listings)
|
||||
|
||||
query_params = QueryParameters(
|
||||
listing_type=ListingType.RENT,
|
||||
min_bedrooms=2,
|
||||
max_bedrooms=2,
|
||||
)
|
||||
listings = await listing_repository.get_listings(query_parameters=query_params)
|
||||
assert len(listings) == 1
|
||||
assert listings[0].number_of_bedrooms == 2
|
||||
|
||||
async def test_filter_by_price_range(
|
||||
self,
|
||||
listing_repository: ListingRepository,
|
||||
sample_rent_listings: list[RentListing],
|
||||
) -> None:
|
||||
"""Test filtering by price range."""
|
||||
await listing_repository.upsert_listings(sample_rent_listings)
|
||||
|
||||
query_params = QueryParameters(
|
||||
listing_type=ListingType.RENT,
|
||||
min_price=1800,
|
||||
max_price=2500,
|
||||
)
|
||||
listings = await listing_repository.get_listings(query_parameters=query_params)
|
||||
assert len(listings) == 1
|
||||
assert listings[0].price == 2000.0
|
||||
|
||||
async def test_filter_by_min_sqm(
|
||||
self,
|
||||
listing_repository: ListingRepository,
|
||||
sample_rent_listings: list[RentListing],
|
||||
) -> None:
|
||||
"""Test filtering by minimum square meters."""
|
||||
await listing_repository.upsert_listings(sample_rent_listings)
|
||||
|
||||
query_params = QueryParameters(
|
||||
listing_type=ListingType.RENT,
|
||||
min_sqm=60,
|
||||
)
|
||||
listings = await listing_repository.get_listings(query_parameters=query_params)
|
||||
assert len(listings) == 1
|
||||
assert listings[0].square_meters == 80.0
|
||||
|
||||
async def test_filter_by_furnish_type(
|
||||
self,
|
||||
listing_repository: ListingRepository,
|
||||
sample_rent_listings: list[RentListing],
|
||||
) -> None:
|
||||
"""Test filtering by furnish type."""
|
||||
await listing_repository.upsert_listings(sample_rent_listings)
|
||||
|
||||
query_params = QueryParameters(
|
||||
listing_type=ListingType.RENT,
|
||||
furnish_types=[FurnishType.UNFURNISHED],
|
||||
)
|
||||
listings = await listing_repository.get_listings(query_parameters=query_params)
|
||||
assert len(listings) == 1
|
||||
assert listings[0].furnish_type == FurnishType.UNFURNISHED
|
||||
|
||||
async def test_filter_by_last_seen_days(
|
||||
self,
|
||||
listing_repository: ListingRepository,
|
||||
sample_rent_listings: list[RentListing],
|
||||
) -> None:
|
||||
"""Test filtering by last_seen_days."""
|
||||
# Make one listing old
|
||||
sample_rent_listings[0].last_seen = datetime.now() - timedelta(days=30)
|
||||
await listing_repository.upsert_listings(sample_rent_listings)
|
||||
|
||||
query_params = QueryParameters(
|
||||
listing_type=ListingType.RENT,
|
||||
last_seen_days=7,
|
||||
)
|
||||
listings = await listing_repository.get_listings(query_parameters=query_params)
|
||||
# Only 2 should be recent enough
|
||||
assert len(listings) == 2
|
||||
|
||||
async def test_combined_filters(
|
||||
self,
|
||||
listing_repository: ListingRepository,
|
||||
sample_rent_listings: list[RentListing],
|
||||
) -> None:
|
||||
"""Test combining multiple filters."""
|
||||
await listing_repository.upsert_listings(sample_rent_listings)
|
||||
|
||||
query_params = QueryParameters(
|
||||
listing_type=ListingType.RENT,
|
||||
min_bedrooms=1,
|
||||
max_bedrooms=2,
|
||||
min_price=1000,
|
||||
max_price=2500,
|
||||
furnish_types=[FurnishType.FURNISHED, FurnishType.UNFURNISHED],
|
||||
)
|
||||
listings = await listing_repository.get_listings(query_parameters=query_params)
|
||||
# Should match listings with 1-2 bedrooms in price range
|
||||
assert len(listings) == 2
|
||||
293
crawler/tests/unit/test_schedule_config.py
Normal file
293
crawler/tests/unit/test_schedule_config.py
Normal file
|
|
@ -0,0 +1,293 @@
|
|||
"""Unit tests for schedule configuration."""
|
||||
import os
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
from pydantic import ValidationError
|
||||
|
||||
from config.schedule_config import ScheduleConfig, SchedulesConfig
|
||||
from models.listing import FurnishType, ListingType
|
||||
|
||||
|
||||
class TestScheduleConfig:
|
||||
"""Tests for ScheduleConfig model."""
|
||||
|
||||
def test_basic_creation_with_defaults(self):
|
||||
"""Test creating a schedule with minimal required fields."""
|
||||
schedule = ScheduleConfig(name="Test Schedule", listing_type=ListingType.RENT)
|
||||
|
||||
assert schedule.name == "Test Schedule"
|
||||
assert schedule.enabled is True
|
||||
assert schedule.minute == "0"
|
||||
assert schedule.hour == "2"
|
||||
assert schedule.day_of_week == "*"
|
||||
assert schedule.listing_type == ListingType.RENT
|
||||
assert schedule.min_bedrooms == 1
|
||||
assert schedule.max_bedrooms == 999
|
||||
assert schedule.min_price == 0
|
||||
assert schedule.max_price == 10_000_000
|
||||
assert schedule.district_names == []
|
||||
assert schedule.furnish_types is None
|
||||
|
||||
def test_full_creation(self):
|
||||
"""Test creating a schedule with all fields specified."""
|
||||
schedule = ScheduleConfig(
|
||||
name="Full Schedule",
|
||||
enabled=False,
|
||||
minute="30",
|
||||
hour="4",
|
||||
day_of_week="1,3,5",
|
||||
listing_type=ListingType.BUY,
|
||||
min_bedrooms=2,
|
||||
max_bedrooms=3,
|
||||
min_price=400000,
|
||||
max_price=800000,
|
||||
district_names=["Westminster", "Camden"],
|
||||
furnish_types=["furnished", "unfurnished"],
|
||||
)
|
||||
|
||||
assert schedule.name == "Full Schedule"
|
||||
assert schedule.enabled is False
|
||||
assert schedule.minute == "30"
|
||||
assert schedule.hour == "4"
|
||||
assert schedule.day_of_week == "1,3,5"
|
||||
assert schedule.listing_type == ListingType.BUY
|
||||
assert schedule.min_bedrooms == 2
|
||||
assert schedule.max_bedrooms == 3
|
||||
assert schedule.min_price == 400000
|
||||
assert schedule.max_price == 800000
|
||||
assert schedule.district_names == ["Westminster", "Camden"]
|
||||
assert schedule.furnish_types == ["furnished", "unfurnished"]
|
||||
|
||||
def test_to_query_parameters(self):
|
||||
"""Test conversion to QueryParameters."""
|
||||
schedule = ScheduleConfig(
|
||||
name="Test",
|
||||
listing_type=ListingType.RENT,
|
||||
min_bedrooms=2,
|
||||
max_bedrooms=3,
|
||||
min_price=2000,
|
||||
max_price=4000,
|
||||
district_names=["Westminster"],
|
||||
furnish_types=["furnished"],
|
||||
)
|
||||
|
||||
params = schedule.to_query_parameters()
|
||||
|
||||
assert params.listing_type == ListingType.RENT
|
||||
assert params.min_bedrooms == 2
|
||||
assert params.max_bedrooms == 3
|
||||
assert params.min_price == 2000
|
||||
assert params.max_price == 4000
|
||||
assert params.district_names == {"Westminster"}
|
||||
assert params.furnish_types == [FurnishType.FURNISHED]
|
||||
|
||||
def test_to_query_parameters_no_furnish_types(self):
|
||||
"""Test conversion when furnish_types is None."""
|
||||
schedule = ScheduleConfig(
|
||||
name="Test",
|
||||
listing_type=ListingType.BUY,
|
||||
)
|
||||
|
||||
params = schedule.to_query_parameters()
|
||||
|
||||
assert params.furnish_types is None
|
||||
|
||||
|
||||
class TestCronValidation:
|
||||
"""Tests for cron field validation."""
|
||||
|
||||
# Valid minute values
|
||||
@pytest.mark.parametrize(
|
||||
"minute",
|
||||
[
|
||||
"0",
|
||||
"59",
|
||||
"*",
|
||||
"*/5",
|
||||
"*/15",
|
||||
"0,15,30,45",
|
||||
],
|
||||
)
|
||||
def test_valid_minute(self, minute: str):
|
||||
"""Test valid minute values are accepted."""
|
||||
schedule = ScheduleConfig(
|
||||
name="Test", listing_type=ListingType.RENT, minute=minute
|
||||
)
|
||||
assert schedule.minute == minute
|
||||
|
||||
# Invalid minute values
|
||||
@pytest.mark.parametrize(
|
||||
"minute",
|
||||
[
|
||||
"60",
|
||||
"-1",
|
||||
"abc",
|
||||
"*/0",
|
||||
],
|
||||
)
|
||||
def test_invalid_minute(self, minute: str):
|
||||
"""Test invalid minute values are rejected."""
|
||||
with pytest.raises(ValidationError):
|
||||
ScheduleConfig(name="Test", listing_type=ListingType.RENT, minute=minute)
|
||||
|
||||
# Valid hour values
|
||||
@pytest.mark.parametrize(
|
||||
"hour",
|
||||
[
|
||||
"0",
|
||||
"23",
|
||||
"*",
|
||||
"*/6",
|
||||
"0,6,12,18",
|
||||
],
|
||||
)
|
||||
def test_valid_hour(self, hour: str):
|
||||
"""Test valid hour values are accepted."""
|
||||
schedule = ScheduleConfig(
|
||||
name="Test", listing_type=ListingType.RENT, hour=hour
|
||||
)
|
||||
assert schedule.hour == hour
|
||||
|
||||
# Invalid hour values
|
||||
@pytest.mark.parametrize(
|
||||
"hour",
|
||||
[
|
||||
"24",
|
||||
"-1",
|
||||
"abc",
|
||||
"*/0",
|
||||
],
|
||||
)
|
||||
def test_invalid_hour(self, hour: str):
|
||||
"""Test invalid hour values are rejected."""
|
||||
with pytest.raises(ValidationError):
|
||||
ScheduleConfig(name="Test", listing_type=ListingType.RENT, hour=hour)
|
||||
|
||||
# Valid day_of_week values
|
||||
@pytest.mark.parametrize(
|
||||
"day_of_week",
|
||||
[
|
||||
"0",
|
||||
"6",
|
||||
"*",
|
||||
"1,3,5",
|
||||
"*/2",
|
||||
],
|
||||
)
|
||||
def test_valid_day_of_week(self, day_of_week: str):
|
||||
"""Test valid day_of_week values are accepted."""
|
||||
schedule = ScheduleConfig(
|
||||
name="Test", listing_type=ListingType.RENT, day_of_week=day_of_week
|
||||
)
|
||||
assert schedule.day_of_week == day_of_week
|
||||
|
||||
# Invalid day_of_week values
|
||||
@pytest.mark.parametrize(
|
||||
"day_of_week",
|
||||
[
|
||||
"7",
|
||||
"-1",
|
||||
"abc",
|
||||
"*/0",
|
||||
],
|
||||
)
|
||||
def test_invalid_day_of_week(self, day_of_week: str):
|
||||
"""Test invalid day_of_week values are rejected."""
|
||||
with pytest.raises(ValidationError):
|
||||
ScheduleConfig(
|
||||
name="Test", listing_type=ListingType.RENT, day_of_week=day_of_week
|
||||
)
|
||||
|
||||
|
||||
class TestSchedulesConfig:
|
||||
"""Tests for SchedulesConfig container."""
|
||||
|
||||
def test_from_env_empty(self):
|
||||
"""Test loading from empty environment variable."""
|
||||
with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": ""}, clear=False):
|
||||
config = SchedulesConfig.from_env()
|
||||
assert config.schedules == []
|
||||
|
||||
def test_from_env_missing(self):
|
||||
"""Test loading when environment variable is not set."""
|
||||
with mock.patch.dict(os.environ, {}, clear=True):
|
||||
# Ensure SCRAPE_SCHEDULES is not set
|
||||
os.environ.pop("SCRAPE_SCHEDULES", None)
|
||||
config = SchedulesConfig.from_env()
|
||||
assert config.schedules == []
|
||||
|
||||
def test_from_env_valid_single(self):
|
||||
"""Test loading a single valid schedule."""
|
||||
json_config = '[{"name":"Daily RENT","listing_type":"RENT","hour":"2"}]'
|
||||
with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": json_config}):
|
||||
config = SchedulesConfig.from_env()
|
||||
|
||||
assert len(config.schedules) == 1
|
||||
assert config.schedules[0].name == "Daily RENT"
|
||||
assert config.schedules[0].listing_type == ListingType.RENT
|
||||
assert config.schedules[0].hour == "2"
|
||||
|
||||
def test_from_env_valid_multiple(self):
|
||||
"""Test loading multiple valid schedules."""
|
||||
json_config = """[
|
||||
{"name":"Daily RENT","listing_type":"RENT","hour":"2"},
|
||||
{"name":"Daily BUY","listing_type":"BUY","hour":"4","enabled":false}
|
||||
]"""
|
||||
with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": json_config}):
|
||||
config = SchedulesConfig.from_env()
|
||||
|
||||
assert len(config.schedules) == 2
|
||||
assert config.schedules[0].name == "Daily RENT"
|
||||
assert config.schedules[0].enabled is True
|
||||
assert config.schedules[1].name == "Daily BUY"
|
||||
assert config.schedules[1].enabled is False
|
||||
|
||||
def test_from_env_invalid_json(self):
|
||||
"""Test error on invalid JSON."""
|
||||
with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": "not json"}):
|
||||
with pytest.raises(ValueError, match="Invalid JSON"):
|
||||
SchedulesConfig.from_env()
|
||||
|
||||
def test_from_env_not_array(self):
|
||||
"""Test error when JSON is not an array."""
|
||||
with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": '{"name":"test"}'}):
|
||||
with pytest.raises(ValueError, match="must be a JSON array"):
|
||||
SchedulesConfig.from_env()
|
||||
|
||||
def test_from_env_invalid_schedule(self):
|
||||
"""Test error when schedule validation fails."""
|
||||
# Missing required listing_type
|
||||
json_config = '[{"name":"Invalid"}]'
|
||||
with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": json_config}):
|
||||
with pytest.raises(ValidationError):
|
||||
SchedulesConfig.from_env()
|
||||
|
||||
def test_get_enabled_schedules(self):
    """get_enabled_schedules keeps only enabled entries, in their original order."""
    schedules = [
        ScheduleConfig(name="Enabled", listing_type=ListingType.RENT, enabled=True),
        ScheduleConfig(name="Disabled", listing_type=ListingType.BUY, enabled=False),
        ScheduleConfig(name="Also Enabled", listing_type=ListingType.RENT, enabled=True),
    ]
    config = SchedulesConfig(schedules=schedules)

    enabled = config.get_enabled_schedules()

    assert len(enabled) == 2
    assert [s.name for s in enabled] == ["Enabled", "Also Enabled"]
|
||||
|
||||
def test_get_enabled_schedules_all_disabled(self):
    """A configuration where every schedule is disabled yields an empty result."""
    config = SchedulesConfig(
        schedules=[
            ScheduleConfig(name="Disabled1", listing_type=ListingType.RENT, enabled=False),
            ScheduleConfig(name="Disabled2", listing_type=ListingType.BUY, enabled=False),
        ]
    )

    assert len(config.get_enabled_schedules()) == 0
|
||||
4
crawler/utils/__init__.py
Normal file
4
crawler/utils/__init__.py
Normal file
|
|
@@ -0,0 +1,4 @@
|
|||
"""Utility modules."""
|
||||
from utils.redis_lock import redis_lock
|
||||
|
||||
__all__ = ["redis_lock"]
|
||||
50
crawler/utils/redis_lock.py
Normal file
50
crawler/utils/redis_lock.py
Normal file
|
|
@@ -0,0 +1,50 @@
|
|||
"""Redis-based distributed locking for task coordination."""
|
||||
import logging
import os
import uuid
from contextlib import contextmanager
from typing import Generator

import redis
|
||||
|
||||
logger = logging.getLogger("uvicorn.error")
|
||||
|
||||
|
||||
def get_redis_client() -> redis.Redis:
    """Build a Redis client from the Celery broker URL.

    The connection string is read from CELERY_BROKER_URL, falling back to a
    local Redis instance, and responses are decoded to str.
    """
    url = os.environ.get("CELERY_BROKER_URL", "redis://localhost:6379/0")
    return redis.from_url(url, decode_responses=True)
|
||||
|
||||
|
||||
@contextmanager
def redis_lock(
    lock_name: str, timeout: int = 3600 * 4
) -> Generator[bool, None, None]:
    """Distributed lock using Redis.

    The lock value is a unique per-acquisition token, and release is an
    atomic compare-and-delete: if our TTL expired while the work was still
    running and another worker re-acquired the lock, we must NOT delete
    their lock. (The original unconditional DELETE had exactly that bug.)

    Args:
        lock_name: Unique name for the lock
        timeout: Lock expiration time in seconds (default: 4 hours)

    Yields:
        bool: True if lock was acquired, False otherwise

    Example:
        with redis_lock("scrape_listings") as acquired:
            if not acquired:
                logger.warning("Another scrape is already running")
                return
            # ... do work ...
    """
    client = get_redis_client()
    lock_key = f"lock:{lock_name}"
    # Unique owner token so only the acquirer can release the lock.
    token = uuid.uuid4().hex

    # SET NX EX: acquire only if the key does not exist, with auto-expiry
    # so a crashed worker cannot hold the lock forever.
    acquired = client.set(lock_key, token, nx=True, ex=timeout)

    # Lua runs atomically in Redis: delete only if we still own the key.
    release_script = """
    if redis.call("get", KEYS[1]) == ARGV[1] then
        return redis.call("del", KEYS[1])
    else
        return 0
    end
    """

    try:
        yield bool(acquired)
    finally:
        if acquired:
            released = client.eval(release_script, 1, lock_key, token)
            if released:
                logger.info("Released lock: %s", lock_name)
            else:
                # Our TTL lapsed mid-work; the key is gone or owned by
                # someone else now, so there is nothing safe to delete.
                logger.warning(
                    "Lock %s expired before release; skipping delete", lock_name
                )
|
||||
Loading…
Add table
Add a link
Reference in a new issue