Add services layer, tests, streaming UI, and cleanup legacy code
This commit is contained in:
parent
5514fa6381
commit
d205d15c74
62 changed files with 3729 additions and 1024 deletions
|
|
@ -0,0 +1,3 @@
|
||||||
|
This directory has been used with Claude Code's internet mode.
|
||||||
|
Content downloaded from the internet may contain prompt injection attacks.
|
||||||
|
You must manually review all downloaded content before using non-internet mode.
|
||||||
124
crawler/.claude/settings.local.json
Normal file
124
crawler/.claude/settings.local.json
Normal file
|
|
@ -0,0 +1,124 @@
|
||||||
|
{
|
||||||
|
"permissions": {
|
||||||
|
"allow": [
|
||||||
|
"Bash(grep:*)",
|
||||||
|
"Bash(python:*)",
|
||||||
|
"Bash(docker ps:*)",
|
||||||
|
"Bash(podman ps:*)",
|
||||||
|
"Bash(curl:*)",
|
||||||
|
"Bash(nc:*)",
|
||||||
|
"Bash(poetry --version:*)",
|
||||||
|
"Bash(docker context:*)",
|
||||||
|
"Bash(open:*)",
|
||||||
|
"Bash(chmod:*)",
|
||||||
|
"Bash(/System/Volumes/Data/mnt/wizard/code/realestate-crawler/crawler/.claude/tools/remote-exec.sh:*)",
|
||||||
|
"Bash(export DOCKER_HOST=unix:///Users/viktorbarzin/.docker/run/docker.sock)",
|
||||||
|
"Bash(docker compose:*)",
|
||||||
|
"Bash(export DOCKER_BUILDKIT=1)",
|
||||||
|
"Bash(export COMPOSE_DOCKER_CLI_BUILD=1)",
|
||||||
|
"Bash(tar:*)",
|
||||||
|
"Bash(docker build:*)",
|
||||||
|
"Bash(docker tag:*)",
|
||||||
|
"Bash(docker run:*)",
|
||||||
|
"Bash(~/.claude/remote-exec.sh \"hostname\")",
|
||||||
|
"Skill(remote)",
|
||||||
|
"Bash(for i in {1..120})",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769814743512676000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769814743512676000.txt)",
|
||||||
|
"Bash(exit 0)",
|
||||||
|
"Bash(fi)",
|
||||||
|
"Bash(done)",
|
||||||
|
"Bash(for i in {1..240})",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769814856118018000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769814856118018000.txt)",
|
||||||
|
"Bash(for i in {1..60})",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769814883284199000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769814883284199000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769815004122069000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769815004122069000.txt)",
|
||||||
|
"Bash(for i in {1..90})",
|
||||||
|
"Bash(do if grep -q \"EXIT_CODE\" ~/.claude/remote-results/cmd-1769814856118018000.txt)",
|
||||||
|
"Bash(then echo \"=== Build completed ===\")",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769815497591226000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769815497591226000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769815530803509000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769815530803509000.txt)",
|
||||||
|
"Bash(do if grep -q \"EXIT_CODE\" ~/.claude/remote-results/cmd-1769815530803509000.txt)",
|
||||||
|
"Bash(for i in {1..30})",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769815614622428000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769815614622428000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769815710424010000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769815710424010000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769815892793650000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769815892793650000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816040589015000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769816040589015000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816256870361000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769816256870361000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816300264785000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769816300264785000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816375772556000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769816375772556000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816407482202000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769816407482202000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816439320016000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769816439320016000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816532941427000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769816532941427000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816611986724000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769816611986724000.txt)",
|
||||||
|
"Bash(for i in {1..40})",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816682085291000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769816682085291000.txt)",
|
||||||
|
"Bash(for i in {1..20})",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816742848870000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769816742848870000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816763327960000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769816763327960000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816784934447000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769816784934447000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816872796427000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769816872796427000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816892104231000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769816892104231000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816911037685000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769816911037685000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816946320457000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769816946320457000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769816987766946000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769816987766946000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769817008932477000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769817008932477000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769817027145242000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769817027145242000.txt)",
|
||||||
|
"Bash(for file in /mnt/wizard/code/realestate-crawler/crawler/frontend/src/components/ui/*.tsx)",
|
||||||
|
"Bash(do)",
|
||||||
|
"Bash(basename:*)",
|
||||||
|
"Bash(wc:*)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769819894031906000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769819894031906000.txt)",
|
||||||
|
"Bash(do if [ -f ~/.claude/remote-results/cmd-1769854789336791000.txt ])",
|
||||||
|
"Bash(then cat ~/.claude/remote-results/cmd-1769854789336791000.txt)",
|
||||||
|
"Bash(npx tsc:*)",
|
||||||
|
"Bash(npx eslint:*)",
|
||||||
|
"Bash(find:*)",
|
||||||
|
"Bash(sync)",
|
||||||
|
"Bash(echo:*)",
|
||||||
|
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875304344407000.txt ])",
|
||||||
|
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875304344407000.txt)",
|
||||||
|
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875708563896000.txt ])",
|
||||||
|
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875708563896000.txt)",
|
||||||
|
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875753067606000.txt ])",
|
||||||
|
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875753067606000.txt)",
|
||||||
|
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875830424071000.txt ])",
|
||||||
|
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875830424071000.txt)",
|
||||||
|
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769875948670335000.txt ])",
|
||||||
|
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769875948670335000.txt)",
|
||||||
|
"Bash(sort:*)",
|
||||||
|
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769876096467703000.txt ])",
|
||||||
|
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769876096467703000.txt)",
|
||||||
|
"Bash(do if [ -f /Users/viktorbarzin/.claude/remote-results/cmd-1769876529766339000.txt ])",
|
||||||
|
"Bash(then cat /Users/viktorbarzin/.claude/remote-results/cmd-1769876529766339000.txt)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
101
crawler/.claude/skills/python-313-redis-generic-type/SKILL.md
Normal file
101
crawler/.claude/skills/python-313-redis-generic-type/SKILL.md
Normal file
|
|
@ -0,0 +1,101 @@
|
||||||
|
---
|
||||||
|
name: python-313-redis-generic-type
|
||||||
|
description: |
|
||||||
|
Fix for "TypeError: <class 'redis.client.Redis'> is not a generic class" when using
|
||||||
|
redis-py with Python 3.13. Use when: (1) upgrading to Python 3.13 breaks redis type
|
||||||
|
annotations, (2) mypy passes but runtime fails with generic class error, (3) using
|
||||||
|
redis.Redis[str] or similar parameterized types. Covers redis-py generic type
|
||||||
|
compatibility with Python 3.13's stricter runtime generic checking.
|
||||||
|
author: Claude Code
|
||||||
|
version: 1.0.0
|
||||||
|
date: 2026-01-31
|
||||||
|
---
|
||||||
|
|
||||||
|
# Python 3.13 redis.Redis Generic Type Error
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
Python 3.13 introduced stricter runtime checking for generic types. The redis-py library's
|
||||||
|
`Redis` class is not defined as a generic class at runtime, even though it works with type
|
||||||
|
checkers like mypy. This causes a `TypeError` when you use parameterized types like
|
||||||
|
`redis.Redis[str]` in type annotations that are evaluated at runtime.
|
||||||
|
|
||||||
|
## Context / Trigger Conditions
|
||||||
|
- Python 3.13 or later
|
||||||
|
- Using redis-py library
|
||||||
|
- Type annotation like `redis_client: redis.Redis[str]`
|
||||||
|
- Error message: `TypeError: <class 'redis.client.Redis'> is not a generic class`
|
||||||
|
- Works fine with mypy but fails at runtime
|
||||||
|
- Often appears when instantiating a class with this annotation
|
||||||
|
|
||||||
|
## Solution
|
||||||
|
|
||||||
|
### Option 1: Remove the type parameter (Recommended)
|
||||||
|
```python
|
||||||
|
# Before (breaks in Python 3.13)
|
||||||
|
redis_client: redis.Redis[str]
|
||||||
|
|
||||||
|
# After (works in all Python versions)
|
||||||
|
redis_client: redis.Redis # type: ignore[type-arg]
|
||||||
|
```
|
||||||
|
|
||||||
|
The `# type: ignore[type-arg]` comment silences mypy's warning about missing type arguments.
|
||||||
|
|
||||||
|
### Option 2: Use string annotation (deferred evaluation)
|
||||||
|
```python
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
redis_client: "redis.Redis[str]" # String annotation, not evaluated at runtime
|
||||||
|
```
|
||||||
|
|
||||||
|
### Option 3: Use TYPE_CHECKING guard
|
||||||
|
```python
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
RedisClient = redis.Redis[str]
|
||||||
|
else:
|
||||||
|
RedisClient = redis.Redis
|
||||||
|
|
||||||
|
redis_client: RedisClient
|
||||||
|
```
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
1. Run your application with Python 3.13
|
||||||
|
2. The TypeError should no longer appear
|
||||||
|
3. Run mypy to ensure type checking still works (may need type: ignore comment)
|
||||||
|
|
||||||
|
## Example
|
||||||
|
|
||||||
|
### Before (Broken)
|
||||||
|
```python
|
||||||
|
import redis
|
||||||
|
|
||||||
|
class RedisRepository:
|
||||||
|
redis_client: redis.Redis[str] # TypeError at runtime in Python 3.13
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.redis_client = redis.Redis(host='localhost', decode_responses=True)
|
||||||
|
```
|
||||||
|
|
||||||
|
### After (Fixed)
|
||||||
|
```python
|
||||||
|
import redis
|
||||||
|
|
||||||
|
class RedisRepository:
|
||||||
|
redis_client: redis.Redis # type: ignore[type-arg]
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.redis_client = redis.Redis(host='localhost', decode_responses=True)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
- This is a breaking change in Python 3.13's handling of generic types
|
||||||
|
- The redis-py library may add proper generic support in future versions
|
||||||
|
- If using `decode_responses=True`, the client returns `str`; otherwise `bytes`
|
||||||
|
- The `type: ignore` comment is preferable to `Any` as it preserves some type safety
|
||||||
|
- This issue affects other libraries that aren't properly defined as Generic classes
|
||||||
|
|
||||||
|
## References
|
||||||
|
- [Python 3.13 Release Notes](https://docs.python.org/3.13/whatsnew/3.13.html)
|
||||||
|
- [redis-py GitHub Issues](https://github.com/redis/redis-py/issues)
|
||||||
|
- [PEP 585 - Type Hinting Generics In Standard Collections](https://peps.python.org/pep-0585/)
|
||||||
|
|
@ -0,0 +1,132 @@
|
||||||
|
---
|
||||||
|
name: python-parentheses-comparison-bug
|
||||||
|
description: |
|
||||||
|
Debug Python comparison bug where parentheses around a variable cause unexpected behavior.
|
||||||
|
Use when: (1) condition always evaluates to False/True unexpectedly, (2) code like
|
||||||
|
"if (mylist) == 0" never triggers, (3) length check seems to not work, (4) comparison
|
||||||
|
with list/dict returns unexpected results. Common mistake where parentheses cause the
|
||||||
|
variable itself to be compared instead of its length.
|
||||||
|
author: Claude Code
|
||||||
|
version: 1.0.0
|
||||||
|
date: 2026-01-31
|
||||||
|
---
|
||||||
|
|
||||||
|
# Python Parentheses Comparison Bug
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
A subtle Python bug where unnecessary parentheses around a variable in a comparison
|
||||||
|
cause the wrong value to be compared. The expression `(mylist) == 0` compares the list
|
||||||
|
itself to 0, not its length. Since a list is never equal to an integer, this always
|
||||||
|
returns False.
|
||||||
|
|
||||||
|
## Context / Trigger Conditions
|
||||||
|
- Condition that should sometimes be True is always False (or vice versa)
|
||||||
|
- Code pattern like `if (existing_items) == 0:` or `if (result) == expected:`
|
||||||
|
- The parentheses don't cause a syntax error but change semantics
|
||||||
|
- Often appears when copying/adapting code or during refactoring
|
||||||
|
- May pass code review because it "looks" correct
|
||||||
|
|
||||||
|
## Solution
|
||||||
|
|
||||||
|
### Identify the Bug Pattern
|
||||||
|
```python
|
||||||
|
# BUG: Compares list to 0, always False
|
||||||
|
if (existing_listings) == 0:
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Also wrong: compares list to integer
|
||||||
|
if (items) == 5:
|
||||||
|
do_something()
|
||||||
|
```
|
||||||
|
|
||||||
|
### Fix: Use len() for Length Comparisons
|
||||||
|
```python
|
||||||
|
# CORRECT: Compares length to 0
|
||||||
|
if len(existing_listings) == 0:
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Alternative: Use truthiness for empty check
|
||||||
|
if not existing_listings:
|
||||||
|
return True
|
||||||
|
|
||||||
|
# CORRECT: Compares length to integer
|
||||||
|
if len(items) == 5:
|
||||||
|
do_something()
|
||||||
|
```
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
1. Add a debug print before the condition: `print(f"list={existing_listings}, len={len(existing_listings)}")`
|
||||||
|
2. Verify the condition now evaluates correctly
|
||||||
|
3. Write a unit test that exercises both branches of the condition
|
||||||
|
|
||||||
|
## Example
|
||||||
|
|
||||||
|
### Before (Broken)
|
||||||
|
```python
|
||||||
|
class FetchListingDetailsStep:
|
||||||
|
async def needs_processing(self, listing_id: int) -> bool:
|
||||||
|
existing_listings = await self.listing_repository.get_listings(
|
||||||
|
only_ids=[listing_id]
|
||||||
|
)
|
||||||
|
# BUG: This compares the list object to 0, which is always False
|
||||||
|
# The parentheses around existing_listings are misleading
|
||||||
|
if (existing_listings) == 0:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
```
|
||||||
|
|
||||||
|
### After (Fixed)
|
||||||
|
```python
|
||||||
|
class FetchListingDetailsStep:
|
||||||
|
async def needs_processing(self, listing_id: int) -> bool:
|
||||||
|
existing_listings = await self.listing_repository.get_listings(
|
||||||
|
only_ids=[listing_id]
|
||||||
|
)
|
||||||
|
# CORRECT: Check if list is empty using len()
|
||||||
|
if len(existing_listings) == 0:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
```
|
||||||
|
|
||||||
|
### Even Better (Pythonic)
|
||||||
|
```python
|
||||||
|
class FetchListingDetailsStep:
|
||||||
|
async def needs_processing(self, listing_id: int) -> bool:
|
||||||
|
existing_listings = await self.listing_repository.get_listings(
|
||||||
|
only_ids=[listing_id]
|
||||||
|
)
|
||||||
|
# Most Pythonic: Use truthiness
|
||||||
|
return not existing_listings
|
||||||
|
```
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
- Python's truthiness: empty collections are falsy, non-empty are truthy
|
||||||
|
- This bug is particularly insidious because:
|
||||||
|
- It's syntactically valid
|
||||||
|
- It doesn't raise an exception
|
||||||
|
- The parentheses make it look intentional
|
||||||
|
- Code review may miss it
|
||||||
|
- Linters like pylint or flake8 won't catch this specific pattern
|
||||||
|
- Type checkers like mypy may warn about comparing incompatible types
|
||||||
|
- When debugging, add print statements to verify actual vs expected values
|
||||||
|
|
||||||
|
## Prevention
|
||||||
|
- Prefer `if not mylist:` over `if len(mylist) == 0:`
|
||||||
|
- Prefer `if mylist:` over `if len(mylist) > 0:`
|
||||||
|
- Remove unnecessary parentheses around single variables
|
||||||
|
- Enable mypy's strict mode which may catch type comparison issues
|
||||||
|
- Write unit tests that exercise both branches of conditions
|
||||||
|
|
||||||
|
## Related Patterns
|
||||||
|
```python
|
||||||
|
# These are all wrong (comparing object to number):
|
||||||
|
if (mydict) == 0: # Always False
|
||||||
|
if (mylist) > 0: # TypeError in Python 3
|
||||||
|
if (mystring) == 0: # Always False
|
||||||
|
|
||||||
|
# These are correct:
|
||||||
|
if len(mydict) == 0: # True if empty
|
||||||
|
if not mydict: # True if empty (preferred)
|
||||||
|
if len(mylist) > 0: # True if non-empty
|
||||||
|
if mylist: # True if non-empty (preferred)
|
||||||
|
```
|
||||||
|
|
@ -1,13 +0,0 @@
|
||||||
from data_access import Listing
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
listings = Listing.get_all_listings()
|
|
||||||
recalculate_listings = []
|
|
||||||
|
|
||||||
for listing in listings:
|
|
||||||
sqm = listing.sqm_ocr
|
|
||||||
if sqm is None or sqm < 10 or sqm > 200:
|
|
||||||
recalculate_listings.append(listing)
|
|
||||||
|
|
||||||
for listing in tqdm(recalculate_listings):
|
|
||||||
listing.calculate_sqm_ocr(recalculate=True)
|
|
||||||
|
|
@ -1,15 +0,0 @@
|
||||||
# recalculate regex from sqm from already previously ocr'ed text
|
|
||||||
import json
|
|
||||||
from rec.floorplan import extract_total_sqm
|
|
||||||
from tqdm import tqdm
|
|
||||||
from data_access import Listing
|
|
||||||
|
|
||||||
for listing in tqdm(list(Listing.get_all_listings())):
|
|
||||||
with open(listing.path_floorplan_ocr_json()) as f:
|
|
||||||
floorplans = json.load(f)
|
|
||||||
|
|
||||||
for floorplan in floorplans:
|
|
||||||
floorplan["estimated_sqm"] = extract_total_sqm(floorplan["text"])
|
|
||||||
|
|
||||||
with open(listing.path_floorplan_ocr_json(), "w") as f:
|
|
||||||
floorplans = json.dump(floorplans, f)
|
|
||||||
|
|
@ -41,6 +41,7 @@ EXPOSE 5001
|
||||||
# Set the entry point (adjust to your CLI's entry point)
|
# Set the entry point (adjust to your CLI's entry point)
|
||||||
# ENTRYPOINT ["python", "/app/main.py"]
|
# ENTRYPOINT ["python", "/app/main.py"]
|
||||||
# ENTRYPOINT ["/app/runall.sh"]
|
# ENTRYPOINT ["/app/runall.sh"]
|
||||||
# CMD ["/bin/bash" ,"-c" ,"alembic upgrade head && uvicorn api.app:app --host 0.0.0.0 --port 8000"]
|
# For local dev with docker-compose:
|
||||||
# ENTRYPOINT ["uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
# CMD ["./start.sh"]
|
||||||
CMD ["./start.sh"]
|
# For Kubernetes deployment:
|
||||||
|
CMD ["uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "5001"]
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,56 @@
|
||||||
|
"""add streaming indexes for query optimization
|
||||||
|
|
||||||
|
Revision ID: a1b2c3d4e5f6
|
||||||
|
Revises: e5f1bc4e3323
|
||||||
|
Create Date: 2026-02-01 12:00:00.000000
|
||||||
|
|
||||||
|
"""
|
||||||
|
from typing import Sequence, Union
|
||||||
|
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
|
||||||
|
# revision identifiers, used by Alembic.
|
||||||
|
revision: str = 'a1b2c3d4e5f6'
|
||||||
|
down_revision: Union[str, None] = 'e5f1bc4e3323'
|
||||||
|
branch_labels: Union[str, Sequence[str], None] = None
|
||||||
|
depends_on: Union[str, Sequence[str], None] = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
|
||||||
|
"""Add composite and single-column indexes for streaming query optimization."""
|
||||||
|
# Composite index for main query pattern (bedrooms, price, last_seen filtering)
|
||||||
|
op.create_index(
|
||||||
|
'ix_rentlisting_query_composite',
|
||||||
|
'rentlisting',
|
||||||
|
['number_of_bedrooms', 'price', 'last_seen'],
|
||||||
|
unique=False
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
'ix_buylisting_query_composite',
|
||||||
|
'buylisting',
|
||||||
|
['number_of_bedrooms', 'price', 'last_seen'],
|
||||||
|
unique=False
|
||||||
|
)
|
||||||
|
|
||||||
|
# Missing single-column indexes for frequently filtered columns
|
||||||
|
op.create_index(
|
||||||
|
'ix_rentlisting_furnish_type',
|
||||||
|
'rentlisting',
|
||||||
|
['furnish_type'],
|
||||||
|
unique=False
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
'ix_rentlisting_available_from',
|
||||||
|
'rentlisting',
|
||||||
|
['available_from'],
|
||||||
|
unique=False
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
|
||||||
|
"""Remove streaming indexes."""
|
||||||
|
op.drop_index('ix_rentlisting_available_from', table_name='rentlisting')
|
||||||
|
op.drop_index('ix_rentlisting_furnish_type', table_name='rentlisting')
|
||||||
|
op.drop_index('ix_buylisting_query_composite', table_name='buylisting')
|
||||||
|
op.drop_index('ix_rentlisting_query_composite', table_name='rentlisting')
|
||||||
|
|
@ -19,88 +19,12 @@ depends_on: Union[str, Sequence[str], None] = None
|
||||||
|
|
||||||
|
|
||||||
def upgrade() -> None:
|
def upgrade() -> None:
|
||||||
"""Upgrade schema."""
|
"""Upgrade schema - this migration is now a no-op since tables already have correct column name."""
|
||||||
# ### commands auto generated by Alembic - please adjust! ###
|
# The tables were created with 'longitude' (correct spelling) in the initial migration.
|
||||||
op.drop_index(op.f('ix_user_email'), table_name='user')
|
# This migration was incorrectly auto-generated and has been fixed to be a no-op.
|
||||||
op.drop_table('user')
|
pass
|
||||||
op.drop_index(op.f('ix_rentlisting_last_seen'), table_name='rentlisting')
|
|
||||||
op.drop_index(op.f('ix_rentlisting_number_of_bedrooms'), table_name='rentlisting')
|
|
||||||
op.drop_index(op.f('ix_rentlisting_price'), table_name='rentlisting')
|
|
||||||
op.drop_index(op.f('ix_rentlisting_square_meters'), table_name='rentlisting')
|
|
||||||
op.drop_table('rentlisting')
|
|
||||||
op.drop_index(op.f('ix_buylisting_last_seen'), table_name='buylisting')
|
|
||||||
op.drop_index(op.f('ix_buylisting_number_of_bedrooms'), table_name='buylisting')
|
|
||||||
op.drop_index(op.f('ix_buylisting_price'), table_name='buylisting')
|
|
||||||
op.drop_index(op.f('ix_buylisting_square_meters'), table_name='buylisting')
|
|
||||||
op.drop_table('buylisting')
|
|
||||||
# ### end Alembic commands ###
|
|
||||||
|
|
||||||
|
|
||||||
def downgrade() -> None:
|
def downgrade() -> None:
|
||||||
"""Downgrade schema."""
|
"""Downgrade schema - no-op since upgrade is no-op."""
|
||||||
# ### commands auto generated by Alembic - please adjust! ###
|
pass
|
||||||
op.create_table('buylisting',
|
|
||||||
sa.Column('id', mysql.INTEGER(), autoincrement=True, nullable=False),
|
|
||||||
sa.Column('price', mysql.FLOAT(), nullable=False),
|
|
||||||
sa.Column('number_of_bedrooms', mysql.INTEGER(), autoincrement=False, nullable=False),
|
|
||||||
sa.Column('square_meters', mysql.FLOAT(), nullable=True),
|
|
||||||
sa.Column('agency', mysql.VARCHAR(length=255), nullable=True),
|
|
||||||
sa.Column('council_tax_band', mysql.VARCHAR(length=255), nullable=True),
|
|
||||||
sa.Column('longtitude', mysql.FLOAT(), nullable=False),
|
|
||||||
sa.Column('latitude', mysql.FLOAT(), nullable=False),
|
|
||||||
sa.Column('price_history_json', mysql.TEXT(), nullable=False),
|
|
||||||
sa.Column('listing_site', mysql.ENUM('RIGHTMOVE'), nullable=False),
|
|
||||||
sa.Column('last_seen', mysql.DATETIME(), nullable=False),
|
|
||||||
sa.Column('photo_thumbnail', mysql.VARCHAR(length=255), nullable=True),
|
|
||||||
sa.Column('floorplan_image_paths', mysql.JSON(), nullable=False),
|
|
||||||
sa.Column('additional_info', mysql.JSON(), nullable=False),
|
|
||||||
sa.Column('routing_info_json', mysql.TEXT(), nullable=True),
|
|
||||||
sa.Column('service_charge', mysql.FLOAT(), nullable=True),
|
|
||||||
sa.Column('lease_left', mysql.INTEGER(), autoincrement=False, nullable=True),
|
|
||||||
sa.PrimaryKeyConstraint('id'),
|
|
||||||
mysql_collate='utf8mb4_0900_ai_ci',
|
|
||||||
mysql_default_charset='utf8mb4',
|
|
||||||
mysql_engine='InnoDB'
|
|
||||||
)
|
|
||||||
op.create_index(op.f('ix_buylisting_square_meters'), 'buylisting', ['square_meters'], unique=False)
|
|
||||||
op.create_index(op.f('ix_buylisting_price'), 'buylisting', ['price'], unique=False)
|
|
||||||
op.create_index(op.f('ix_buylisting_number_of_bedrooms'), 'buylisting', ['number_of_bedrooms'], unique=False)
|
|
||||||
op.create_index(op.f('ix_buylisting_last_seen'), 'buylisting', ['last_seen'], unique=False)
|
|
||||||
op.create_table('rentlisting',
|
|
||||||
sa.Column('id', mysql.INTEGER(), autoincrement=True, nullable=False),
|
|
||||||
sa.Column('price', mysql.FLOAT(), nullable=False),
|
|
||||||
sa.Column('number_of_bedrooms', mysql.INTEGER(), autoincrement=False, nullable=False),
|
|
||||||
sa.Column('square_meters', mysql.FLOAT(), nullable=True),
|
|
||||||
sa.Column('agency', mysql.VARCHAR(length=255), nullable=True),
|
|
||||||
sa.Column('council_tax_band', mysql.VARCHAR(length=255), nullable=True),
|
|
||||||
sa.Column('longtitude', mysql.FLOAT(), nullable=False),
|
|
||||||
sa.Column('latitude', mysql.FLOAT(), nullable=False),
|
|
||||||
sa.Column('price_history_json', mysql.TEXT(), nullable=False),
|
|
||||||
sa.Column('listing_site', mysql.ENUM('RIGHTMOVE'), nullable=False),
|
|
||||||
sa.Column('last_seen', mysql.DATETIME(), nullable=False),
|
|
||||||
sa.Column('photo_thumbnail', mysql.VARCHAR(length=255), nullable=True),
|
|
||||||
sa.Column('floorplan_image_paths', mysql.JSON(), nullable=False),
|
|
||||||
sa.Column('additional_info', mysql.JSON(), nullable=False),
|
|
||||||
sa.Column('routing_info_json', mysql.TEXT(), nullable=True),
|
|
||||||
sa.Column('available_from', mysql.DATETIME(), nullable=True),
|
|
||||||
sa.Column('furnish_type', mysql.ENUM('FURNISHED', 'UNFURNISHED', 'PART_FURNISHED', 'ASK_LANDLORD', 'UNKNOWN'), nullable=False),
|
|
||||||
sa.PrimaryKeyConstraint('id'),
|
|
||||||
mysql_collate='utf8mb4_0900_ai_ci',
|
|
||||||
mysql_default_charset='utf8mb4',
|
|
||||||
mysql_engine='InnoDB'
|
|
||||||
)
|
|
||||||
op.create_index(op.f('ix_rentlisting_square_meters'), 'rentlisting', ['square_meters'], unique=False)
|
|
||||||
op.create_index(op.f('ix_rentlisting_price'), 'rentlisting', ['price'], unique=False)
|
|
||||||
op.create_index(op.f('ix_rentlisting_number_of_bedrooms'), 'rentlisting', ['number_of_bedrooms'], unique=False)
|
|
||||||
op.create_index(op.f('ix_rentlisting_last_seen'), 'rentlisting', ['last_seen'], unique=False)
|
|
||||||
op.create_table('user',
|
|
||||||
sa.Column('id', mysql.INTEGER(), autoincrement=True, nullable=False),
|
|
||||||
sa.Column('email', mysql.VARCHAR(length=255), nullable=False),
|
|
||||||
sa.Column('password', mysql.VARCHAR(length=255), nullable=False),
|
|
||||||
sa.PrimaryKeyConstraint('id'),
|
|
||||||
mysql_collate='utf8mb4_0900_ai_ci',
|
|
||||||
mysql_default_charset='utf8mb4',
|
|
||||||
mysql_engine='InnoDB'
|
|
||||||
)
|
|
||||||
op.create_index(op.f('ix_user_email'), 'user', ['email'], unique=True)
|
|
||||||
# ### end Alembic commands ###
|
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from rec.query import QueryParameters
|
from models.listing import QueryParameters
|
||||||
from repositories.listing_repository import ListingRepository
|
from repositories.listing_repository import ListingRepository
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -10,7 +10,7 @@ async def export_to_csv(
|
||||||
query_parameters: QueryParameters | None = None,
|
query_parameters: QueryParameters | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
listings = await repository.get_listings(query_parameters=query_parameters)
|
listings = await repository.get_listings(query_parameters=query_parameters)
|
||||||
ds = [*[listing.__dict__ for listing in listings]]
|
ds = [listing.__dict__ for listing in listings]
|
||||||
df = pd.DataFrame(ds)
|
df = pd.DataFrame(ds)
|
||||||
|
|
||||||
# read decisions on file
|
# read decisions on file
|
||||||
|
|
@ -22,37 +22,19 @@ async def export_to_csv(
|
||||||
drop_columns = ["_sa_instance_state", "additional_info"]
|
drop_columns = ["_sa_instance_state", "additional_info"]
|
||||||
df = df.drop(columns=drop_columns)
|
df = df.drop(columns=drop_columns)
|
||||||
|
|
||||||
# remove all entries where we didnt calculate transit time (probably due to a too far distance)
|
# fill in gap values for service charge and lease left for Excel filters
|
||||||
# df2 = df[df.travel_time_fastest.notna()]
|
if "service_charge" not in df.columns:
|
||||||
df2 = df
|
df.loc[:, "service_charge"] = -1
|
||||||
|
df.loc[:, "service_charge"] = df.service_charge.fillna(-1)
|
||||||
|
if "lease_left" not in df.columns:
|
||||||
|
df.loc[:, "lease_left"] = -1
|
||||||
|
df.loc[:, "lease_left"] = df.lease_left.fillna(-1)
|
||||||
|
if "square_meters" not in df.columns:
|
||||||
|
df.loc[:, "square_meters"] = -1
|
||||||
|
df.loc[:, "square_meters"] = df.square_meters.fillna(-1)
|
||||||
|
|
||||||
# drop columns
|
# Add price per sqm column
|
||||||
# dropcolumns = ['distance_per_transit', 'duration_static', 'distance']
|
df.loc[:, "price_per_sqm"] = df.price / df.square_meters
|
||||||
# s1 = df2['travel_time_fastest'].apply(pd.Series).drop(dropcolumns, axis=1)
|
|
||||||
# s1 = df2
|
|
||||||
|
|
||||||
# fill in gap values for service charge and lease left. This is for excel so we can use filters better there
|
df = df.sort_values(by=["price_per_sqm"], ascending=True)
|
||||||
if "service_charge" not in df2.columns:
|
df.to_csv(str(output_file), index=False)
|
||||||
df2.loc[:, "service_charge"] = -1
|
|
||||||
df2.loc[:, "service_charge"] = df2.service_charge.fillna(-1)
|
|
||||||
if "lease_left" not in df2.columns:
|
|
||||||
df2.loc[:, "lease_left"] = -1
|
|
||||||
df2.loc[:, "lease_left"] = df2.lease_left.fillna(-1)
|
|
||||||
if "square_meters" not in df2.columns:
|
|
||||||
df2.loc[:, "square_meters"] = -1
|
|
||||||
df2.loc[:, "square_meters"] = df2.square_meters.fillna(-1)
|
|
||||||
|
|
||||||
df3 = df2
|
|
||||||
# df3 = pd.concat([df2.drop(['travel_time_fastest', 'travel_time_second'], axis=1), s1], axis=1)
|
|
||||||
# df3.loc[:, 'duration'] = (df3.loc[:, ['duration']].min(axis=1) / 60).round()
|
|
||||||
df3.shape
|
|
||||||
df4 = df3
|
|
||||||
|
|
||||||
# df5 = df4[columns]
|
|
||||||
|
|
||||||
# Add some interesting columns
|
|
||||||
df4.loc[:, "price_per_sqm"] = df4.price / df4.square_meters
|
|
||||||
df5 = df4
|
|
||||||
|
|
||||||
df6 = df5.sort_values(by=["price_per_sqm"], ascending=True)
|
|
||||||
df6.to_csv(str(output_file), index=False)
|
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@ from dataclasses import dataclass
|
||||||
import json
|
import json
|
||||||
import pathlib
|
import pathlib
|
||||||
from typing import Any, List
|
from typing import Any, List
|
||||||
|
import warnings
|
||||||
from models.listing import ListingSite, PriceHistoryItem
|
from models.listing import ListingSite, PriceHistoryItem
|
||||||
from rec import floorplan, routing
|
from rec import floorplan, routing
|
||||||
import re
|
import re
|
||||||
|
|
@ -12,6 +13,12 @@ import datetime
|
||||||
|
|
||||||
@dataclass()
|
@dataclass()
|
||||||
class Listing:
|
class Listing:
|
||||||
|
"""Legacy Listing class for filesystem-based data access.
|
||||||
|
|
||||||
|
.. deprecated::
|
||||||
|
Use models.listing.RentListing or models.listing.BuyListing instead.
|
||||||
|
This class is kept for backwards compatibility with the populate_db command.
|
||||||
|
"""
|
||||||
identifier: int
|
identifier: int
|
||||||
_details_object: dict[str, Any] | None = None
|
_details_object: dict[str, Any] | None = None
|
||||||
_listing_object: dict[str, Any] | None = None
|
_listing_object: dict[str, Any] | None = None
|
||||||
|
|
@ -36,6 +43,14 @@ class Listing:
|
||||||
"council_tax_band",
|
"council_tax_band",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def __post_init__(self) -> None:
|
||||||
|
warnings.warn(
|
||||||
|
"data_access.Listing is deprecated. Use models.listing.RentListing "
|
||||||
|
"or models.listing.BuyListing instead.",
|
||||||
|
DeprecationWarning,
|
||||||
|
stacklevel=3,
|
||||||
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_all_listings(
|
def get_all_listings(
|
||||||
listing_paths: list[pathlib.Path],
|
listing_paths: list[pathlib.Path],
|
||||||
|
|
@ -144,39 +159,6 @@ class Listing:
|
||||||
# todo add check if return is image
|
# todo add check if return is image
|
||||||
return images
|
return images
|
||||||
|
|
||||||
def calculate_sqm_model(self):
|
|
||||||
objs = []
|
|
||||||
for floorplan_path in self.list_floorplans():
|
|
||||||
estimated_sqm, model_output, predictions = floorplan.calculate_model(
|
|
||||||
floorplan_path
|
|
||||||
)
|
|
||||||
objs.append(
|
|
||||||
{
|
|
||||||
"floorplan_path": str(floorplan_path),
|
|
||||||
"estimated_sqm": estimated_sqm,
|
|
||||||
"model_output": model_output,
|
|
||||||
"no_predictions": len(
|
|
||||||
predictions
|
|
||||||
), # cant serialize the predictions itself since its a tensor
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
with open(self.path_floorplan_model_json(), "w") as f:
|
|
||||||
json.dump(objs, f)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def sqm_model(self, recalculate=True) -> float:
|
|
||||||
if not self.path_floorplan_model_json().exists() or recalculate:
|
|
||||||
self.calculate_sqm_model()
|
|
||||||
|
|
||||||
with open(self.path_floorplan_json()) as f:
|
|
||||||
objs = json.load(f)
|
|
||||||
|
|
||||||
max_sqm = max(
|
|
||||||
[o["estimated_sqm"] for o in objs if o is None]
|
|
||||||
) # filter out Nones
|
|
||||||
return max_sqm
|
|
||||||
|
|
||||||
async def calculate_sqm_ocr(self, recalculate=True):
|
async def calculate_sqm_ocr(self, recalculate=True):
|
||||||
objs = []
|
objs = []
|
||||||
if self.path_floorplan_ocr_json().exists():
|
if self.path_floorplan_ocr_json().exists():
|
||||||
|
|
@ -405,63 +387,6 @@ class Listing:
|
||||||
def listing_site(self) -> ListingSite:
|
def listing_site(self) -> ListingSite:
|
||||||
return ListingSite.RIGHTMOVE # this class supports only right move
|
return ListingSite.RIGHTMOVE # this class supports only right move
|
||||||
|
|
||||||
async def dict_nicely(self):
|
|
||||||
travel_time_fastest = {}
|
|
||||||
travel_time_second = {}
|
|
||||||
if self.path_routing_json().exists():
|
|
||||||
with open(self.path_routing_json(), "r") as f:
|
|
||||||
travel_times = json.load(f)
|
|
||||||
for destination_mode in travel_times.keys():
|
|
||||||
destination_mode_clean = destination_mode.replace(" ", "_").replace(
|
|
||||||
",", "_"
|
|
||||||
)
|
|
||||||
destination, travel_mode = self.__from_routing_cache_key(
|
|
||||||
destination_mode
|
|
||||||
)
|
|
||||||
travel_time_fastest[destination_mode_clean] = self.travel_time(
|
|
||||||
destination, travel_mode
|
|
||||||
)[0]["duration"]
|
|
||||||
travel_time_second[destination_mode_clean] = self.travel_time(
|
|
||||||
destination, travel_mode
|
|
||||||
)[1]["duration"]
|
|
||||||
|
|
||||||
return {
|
|
||||||
"identifier": self.identifier,
|
|
||||||
"sqm_ocr": await self.sqm_ocr(),
|
|
||||||
"price": self.price,
|
|
||||||
"price_per_sqm": await self.price_per_sqm(),
|
|
||||||
"url": self.url,
|
|
||||||
"bedrooms": self.bedrooms,
|
|
||||||
"travel_time_fastest": ":".join(
|
|
||||||
sorted(
|
|
||||||
f"{dest} in {travel_mode//60}min"
|
|
||||||
for dest, travel_mode in travel_time_fastest.items()
|
|
||||||
)
|
|
||||||
),
|
|
||||||
"travel_time_second": ":".join(
|
|
||||||
sorted(
|
|
||||||
f"{dest} in {travel_mode//60}min"
|
|
||||||
for dest, travel_mode in travel_time_second.items()
|
|
||||||
)
|
|
||||||
),
|
|
||||||
"lease_left": self.leaseLeft,
|
|
||||||
"service_charge": self.serviceCharge,
|
|
||||||
"development": self.development,
|
|
||||||
"tenure_type": self.tenure_type,
|
|
||||||
"updated_days": self.updateDaysAgo,
|
|
||||||
"status": self.status,
|
|
||||||
"last_seen": self.last_seen,
|
|
||||||
"agency": self.agency,
|
|
||||||
"council_tax_band": self.councilTaxBand,
|
|
||||||
"photo_thumbnail": self.photoThumbnail,
|
|
||||||
"let_date_available": (
|
|
||||||
self.letDateAvailable.strftime("%d/%m/%Y")
|
|
||||||
if self.letDateAvailable
|
|
||||||
else "Ask agent"
|
|
||||||
),
|
|
||||||
"price_history": self.priceHistory,
|
|
||||||
}
|
|
||||||
|
|
||||||
def __routing_cache_key(
|
def __routing_cache_key(
|
||||||
self,
|
self,
|
||||||
dest_address: str,
|
dest_address: str,
|
||||||
|
|
|
||||||
|
|
@ -14,10 +14,13 @@ services:
|
||||||
interval: 5s
|
interval: 5s
|
||||||
timeout: 3s
|
timeout: 3s
|
||||||
retries: 5
|
retries: 5
|
||||||
|
networks:
|
||||||
|
- rec-network
|
||||||
|
|
||||||
mysql:
|
mysql:
|
||||||
image: mysql:9
|
image: mysql:9
|
||||||
container_name: rec-mysql
|
container_name: rec-mysql
|
||||||
|
hostname: mysql
|
||||||
ports:
|
ports:
|
||||||
- "3306:3306"
|
- "3306:3306"
|
||||||
environment:
|
environment:
|
||||||
|
|
@ -32,6 +35,9 @@ services:
|
||||||
interval: 10s
|
interval: 10s
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 5
|
retries: 5
|
||||||
|
start_period: 30s
|
||||||
|
networks:
|
||||||
|
- rec-network
|
||||||
|
|
||||||
app:
|
app:
|
||||||
build:
|
build:
|
||||||
|
|
@ -47,7 +53,7 @@ services:
|
||||||
- app_venv:/app/.venv
|
- app_venv:/app/.venv
|
||||||
environment:
|
environment:
|
||||||
- ENV=dev
|
- ENV=dev
|
||||||
- DB_CONNECTION_STRING=mysql://wrongmove:wrongmove@mysql:3306/wrongmove
|
- DB_CONNECTION_STRING=mysql+mysqldb://wrongmove:wrongmove@mysql:3306/wrongmove
|
||||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||||
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
||||||
- ROUTING_API_KEY=${ROUTING_API_KEY:-}
|
- ROUTING_API_KEY=${ROUTING_API_KEY:-}
|
||||||
|
|
@ -57,6 +63,8 @@ services:
|
||||||
mysql:
|
mysql:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
command: ["uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "5001", "--reload", "--reload-dir", "api", "--reload-dir", "services", "--reload-dir", "repositories", "--reload-dir", "models"]
|
command: ["uvicorn", "api.app:app", "--host", "0.0.0.0", "--port", "5001", "--reload", "--reload-dir", "api", "--reload-dir", "services", "--reload-dir", "repositories", "--reload-dir", "models"]
|
||||||
|
networks:
|
||||||
|
- rec-network
|
||||||
|
|
||||||
celery:
|
celery:
|
||||||
build:
|
build:
|
||||||
|
|
@ -68,7 +76,7 @@ services:
|
||||||
- app_venv:/app/.venv
|
- app_venv:/app/.venv
|
||||||
environment:
|
environment:
|
||||||
- ENV=dev
|
- ENV=dev
|
||||||
- DB_CONNECTION_STRING=mysql://wrongmove:wrongmove@mysql:3306/wrongmove
|
- DB_CONNECTION_STRING=mysql+mysqldb://wrongmove:wrongmove@mysql:3306/wrongmove
|
||||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||||
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
||||||
- ROUTING_API_KEY=${ROUTING_API_KEY:-}
|
- ROUTING_API_KEY=${ROUTING_API_KEY:-}
|
||||||
|
|
@ -79,6 +87,8 @@ services:
|
||||||
mysql:
|
mysql:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
command: ["celery", "-A", "celery_app", "worker", "--loglevel=info"]
|
command: ["celery", "-A", "celery_app", "worker", "--loglevel=info"]
|
||||||
|
networks:
|
||||||
|
- rec-network
|
||||||
|
|
||||||
celery-beat:
|
celery-beat:
|
||||||
build:
|
build:
|
||||||
|
|
@ -90,7 +100,7 @@ services:
|
||||||
- app_venv:/app/.venv
|
- app_venv:/app/.venv
|
||||||
environment:
|
environment:
|
||||||
- ENV=dev
|
- ENV=dev
|
||||||
- DB_CONNECTION_STRING=mysql://wrongmove:wrongmove@mysql:3306/wrongmove
|
- DB_CONNECTION_STRING=mysql+mysqldb://wrongmove:wrongmove@mysql:3306/wrongmove
|
||||||
- CELERY_BROKER_URL=redis://redis:6379/0
|
- CELERY_BROKER_URL=redis://redis:6379/0
|
||||||
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
- CELERY_RESULT_BACKEND=redis://redis:6379/0
|
||||||
- SCRAPE_SCHEDULES=${SCRAPE_SCHEDULES:-}
|
- SCRAPE_SCHEDULES=${SCRAPE_SCHEDULES:-}
|
||||||
|
|
@ -98,6 +108,12 @@ services:
|
||||||
- redis
|
- redis
|
||||||
- celery
|
- celery
|
||||||
command: ["celery", "-A", "celery_app", "beat", "--loglevel=info"]
|
command: ["celery", "-A", "celery_app", "beat", "--loglevel=info"]
|
||||||
|
networks:
|
||||||
|
- rec-network
|
||||||
|
|
||||||
|
networks:
|
||||||
|
rec-network:
|
||||||
|
driver: bridge
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
redis_data:
|
redis_data:
|
||||||
|
|
|
||||||
183
crawler/docs/BACKEND.md
Normal file
183
crawler/docs/BACKEND.md
Normal file
|
|
@ -0,0 +1,183 @@
|
||||||
|
# Real Estate Crawler - Backend Documentation
|
||||||
|
|
||||||
|
A property listing aggregator that scrapes Rightmove UK, extracts square meters via OCR, and calculates transit routes.
|
||||||
|
|
||||||
|
## Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Docker (recommended) - starts Redis, MySQL, API, and Celery
|
||||||
|
./start.sh
|
||||||
|
|
||||||
|
# Or run locally with Poetry
|
||||||
|
poetry install
|
||||||
|
./start.sh --local
|
||||||
|
```
|
||||||
|
|
||||||
|
API available at `http://localhost:5001`
|
||||||
|
|
||||||
|
## Dependencies
|
||||||
|
|
||||||
|
| Dependency | Purpose |
|
||||||
|
|------------|---------|
|
||||||
|
| Python 3.11+ | Runtime |
|
||||||
|
| Redis | Celery message broker |
|
||||||
|
| MySQL/SQLite | Database |
|
||||||
|
| Tesseract OCR | Floorplan text extraction |
|
||||||
|
| Docker | Containerized deployment |
|
||||||
|
|
||||||
|
### Python Packages (key)
|
||||||
|
- `fastapi` + `uvicorn` - HTTP API
|
||||||
|
- `celery` - Background tasks
|
||||||
|
- `sqlmodel` - ORM
|
||||||
|
- `pytesseract` + `opencv` - OCR
|
||||||
|
- `aiohttp` - Async HTTP client
|
||||||
|
|
||||||
|
## API Endpoints
|
||||||
|
|
||||||
|
### Health Check
|
||||||
|
```bash
|
||||||
|
curl http://localhost:5001/api/status
|
||||||
|
# {"status": "OK"}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Get Listings
|
||||||
|
```bash
|
||||||
|
curl -H "Authorization: Bearer $TOKEN" \
|
||||||
|
"http://localhost:5001/api/listing?limit=10"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Get Listings as GeoJSON
|
||||||
|
```bash
|
||||||
|
curl -H "Authorization: Bearer $TOKEN" \
|
||||||
|
"http://localhost:5001/api/listing_geojson?listing_type=RENT&min_bedrooms=2&max_price=3000"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Refresh Listings (async)
|
||||||
|
```bash
|
||||||
|
curl -X POST -H "Authorization: Bearer $TOKEN" \
|
||||||
|
"http://localhost:5001/api/refresh_listings?listing_type=RENT&min_bedrooms=2&max_bedrooms=3&min_price=2000&max_price=4000"
|
||||||
|
# {"task_id": "abc123", "message": "Task abc123 started"}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Check Task Status
|
||||||
|
```bash
|
||||||
|
curl -H "Authorization: Bearer $TOKEN" \
|
||||||
|
"http://localhost:5001/api/task_status?task_id=abc123"
|
||||||
|
# {"task_id": "abc123", "status": "SUCCESS", "result": "..."}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Get Districts
|
||||||
|
```bash
|
||||||
|
curl -H "Authorization: Bearer $TOKEN" \
|
||||||
|
"http://localhost:5001/api/get_districts"
|
||||||
|
# {"Westminster": "REGION^93965", "Camden": "REGION^93934", ...}
|
||||||
|
```
|
||||||
|
|
||||||
|
## CLI Commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Fetch listings from Rightmove
|
||||||
|
python main.py dump-listings -t rent --min-bedrooms 2 --max-price 4000
|
||||||
|
|
||||||
|
# Download floorplan images
|
||||||
|
python main.py dump-images
|
||||||
|
|
||||||
|
# Run OCR on floorplans
|
||||||
|
python main.py detect-floorplan
|
||||||
|
|
||||||
|
# Calculate transit routes
|
||||||
|
python main.py routing -d "10 Downing Street, London" -m TRANSIT -l 10
|
||||||
|
|
||||||
|
# Export to GeoJSON
|
||||||
|
python main.py export-immoweb -O output.geojson -t rent --min-bedrooms 2
|
||||||
|
|
||||||
|
# Export to CSV
|
||||||
|
python main.py export-csv -O output.csv -t rent
|
||||||
|
|
||||||
|
# List available districts
|
||||||
|
python main.py list-districts
|
||||||
|
```
|
||||||
|
|
||||||
|
## Query Parameters
|
||||||
|
|
||||||
|
| Parameter | Type | Description |
|
||||||
|
|-----------|------|-------------|
|
||||||
|
| `listing_type` | RENT/BUY | Property type |
|
||||||
|
| `min_bedrooms` | int | Minimum bedrooms |
|
||||||
|
| `max_bedrooms` | int | Maximum bedrooms |
|
||||||
|
| `min_price` | int | Minimum price |
|
||||||
|
| `max_price` | int | Maximum price |
|
||||||
|
| `min_sqm` | int | Minimum square meters |
|
||||||
|
| `district` | string | District name (repeatable) |
|
||||||
|
| `furnish_types` | string | FURNISHED/UNFURNISHED/PART_FURNISHED |
|
||||||
|
| `last_seen_days` | int | Only listings seen in last N days |
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
||||||
|
│ CLI │ │ HTTP API │ │ Celery │
|
||||||
|
│ (main.py) │ │ (api/app.py)│ │ Worker │
|
||||||
|
└──────┬──────┘ └──────┬──────┘ └──────┬──────┘
|
||||||
|
│ │ │
|
||||||
|
└───────────────────┼───────────────────┘
|
||||||
|
│
|
||||||
|
┌────────▼────────┐
|
||||||
|
│ Services │
|
||||||
|
│ (services/*.py) │
|
||||||
|
└────────┬────────┘
|
||||||
|
│
|
||||||
|
┌────────────┼────────────┐
|
||||||
|
│ │ │
|
||||||
|
┌──────▼──────┐ ┌───▼───┐ ┌──────▼──────┐
|
||||||
|
│ Repository │ │ Redis │ │ Rightmove │
|
||||||
|
│ (MySQL) │ │ │ │ API │
|
||||||
|
└─────────────┘ └───────┘ └─────────────┘
|
||||||
|
```
|
||||||
|
|
||||||
|
## Environment Variables
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Database
|
||||||
|
DB_CONNECTION_STRING=mysql://user:pass@localhost:3306/wrongmove
|
||||||
|
|
||||||
|
# Redis (Celery)
|
||||||
|
CELERY_BROKER_URL=redis://localhost:6379/0
|
||||||
|
CELERY_RESULT_BACKEND=redis://localhost:6379/0
|
||||||
|
|
||||||
|
# Google Maps (optional, for routing)
|
||||||
|
ROUTING_API_KEY=your_api_key
|
||||||
|
```
|
||||||
|
|
||||||
|
## Authentication
|
||||||
|
|
||||||
|
API endpoints (except `/api/status`) require JWT authentication via Authentik OIDC.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Get token from Authentik, then:
|
||||||
|
curl -H "Authorization: Bearer $TOKEN" http://localhost:5001/api/listing
|
||||||
|
```
|
||||||
|
|
||||||
|
## Project Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
├── main.py # CLI entry point
|
||||||
|
├── api/app.py # FastAPI application
|
||||||
|
├── services/ # Business logic (shared by CLI + API)
|
||||||
|
│ ├── listing_service.py
|
||||||
|
│ ├── export_service.py
|
||||||
|
│ ├── district_service.py
|
||||||
|
│ └── task_service.py
|
||||||
|
├── repositories/ # Database access
|
||||||
|
├── models/ # SQLModel entities
|
||||||
|
├── rec/ # Core logic (query, OCR, routing)
|
||||||
|
├── tasks/ # Celery background tasks
|
||||||
|
└── tests/ # Test suite
|
||||||
|
```
|
||||||
|
|
||||||
|
## Running Tests
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pytest tests/ -v --cov=.
|
||||||
|
mypy .
|
||||||
|
```
|
||||||
|
|
@ -12,130 +12,47 @@ import {
|
||||||
} from "@/components/ui/sidebar"
|
} from "@/components/ui/sidebar"
|
||||||
import * as React from "react"
|
import * as React from "react"
|
||||||
|
|
||||||
// This is sample data.
|
|
||||||
const data = {
|
const data = {
|
||||||
navMain: [
|
navMain: [
|
||||||
{
|
{
|
||||||
title: "Getting Started",
|
title: "Property Explorer",
|
||||||
url: "#",
|
url: "#",
|
||||||
items: [
|
items: [
|
||||||
{
|
{
|
||||||
title: "Installation",
|
title: "Map View",
|
||||||
url: "#",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
title: "Project Structure",
|
|
||||||
url: "#",
|
|
||||||
},
|
|
||||||
],
|
|
||||||
},
|
|
||||||
{
|
|
||||||
title: "Building Your Application",
|
|
||||||
url: "#",
|
|
||||||
items: [
|
|
||||||
{
|
|
||||||
title: "Routing",
|
|
||||||
url: "#",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
title: "Data Fetching",
|
|
||||||
url: "#",
|
url: "#",
|
||||||
isActive: true,
|
isActive: true,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
title: "Rendering",
|
title: "List View",
|
||||||
url: "#",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
title: "Caching",
|
|
||||||
url: "#",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
title: "Styling",
|
|
||||||
url: "#",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
title: "Optimizing",
|
|
||||||
url: "#",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
title: "Configuring",
|
|
||||||
url: "#",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
title: "Testing",
|
|
||||||
url: "#",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
title: "Authentication",
|
|
||||||
url: "#",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
title: "Deploying",
|
|
||||||
url: "#",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
title: "Upgrading",
|
|
||||||
url: "#",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
title: "Examples",
|
|
||||||
url: "#",
|
url: "#",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
title: "API Reference",
|
title: "Data Management",
|
||||||
url: "#",
|
url: "#",
|
||||||
items: [
|
items: [
|
||||||
{
|
{
|
||||||
title: "Components",
|
title: "Refresh Listings",
|
||||||
url: "#",
|
url: "#",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
title: "File Conventions",
|
title: "Active Tasks",
|
||||||
url: "#",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
title: "Functions",
|
|
||||||
url: "#",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
title: "next.config.js Options",
|
|
||||||
url: "#",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
title: "CLI",
|
|
||||||
url: "#",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
title: "Edge Runtime",
|
|
||||||
url: "#",
|
url: "#",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
title: "Architecture",
|
title: "Settings",
|
||||||
url: "#",
|
url: "#",
|
||||||
items: [
|
items: [
|
||||||
{
|
{
|
||||||
title: "Accessibility",
|
title: "Preferences",
|
||||||
url: "#",
|
url: "#",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
title: "Fast Refresh",
|
title: "Account",
|
||||||
url: "#",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
title: "Next.js Compiler",
|
|
||||||
url: "#",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
title: "Supported Browsers",
|
|
||||||
url: "#",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
title: "Turbopack",
|
|
||||||
url: "#",
|
url: "#",
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
|
@ -145,21 +62,19 @@ const data = {
|
||||||
|
|
||||||
export function AppSidebar({ ...props }: React.ComponentProps<typeof Sidebar>) {
|
export function AppSidebar({ ...props }: React.ComponentProps<typeof Sidebar>) {
|
||||||
return (
|
return (
|
||||||
// create closed by default
|
<Sidebar {...props}>
|
||||||
<Sidebar {...props} >
|
|
||||||
<SidebarHeader>
|
<SidebarHeader>
|
||||||
</SidebarHeader>
|
</SidebarHeader>
|
||||||
<SidebarContent>
|
<SidebarContent>
|
||||||
{/* We create a SidebarGroup for each parent. */}
|
|
||||||
{data.navMain.map((item) => (
|
{data.navMain.map((item) => (
|
||||||
<SidebarGroup key={item.title}>
|
<SidebarGroup key={item.title}>
|
||||||
<SidebarGroupLabel>{item.title}</SidebarGroupLabel>
|
<SidebarGroupLabel>{item.title}</SidebarGroupLabel>
|
||||||
<SidebarGroupContent>
|
<SidebarGroupContent>
|
||||||
<SidebarMenu>
|
<SidebarMenu>
|
||||||
{item.items.map((item) => (
|
{item.items.map((subItem) => (
|
||||||
<SidebarMenuItem key={item.title}>
|
<SidebarMenuItem key={subItem.title}>
|
||||||
<SidebarMenuButton asChild isActive={item.isActive}>
|
<SidebarMenuButton asChild isActive={subItem.isActive}>
|
||||||
<a href={item.url}>{item.title}</a>
|
<a href={subItem.url}>{subItem.title}</a>
|
||||||
</SidebarMenuButton>
|
</SidebarMenuButton>
|
||||||
</SidebarMenuItem>
|
</SidebarMenuItem>
|
||||||
))}
|
))}
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,6 @@
|
||||||
import { getUser } from "@/auth/authService";
|
|
||||||
import { zodResolver } from "@hookform/resolvers/zod";
|
import { zodResolver } from "@hookform/resolvers/zod";
|
||||||
import { DialogTitle } from "@radix-ui/react-dialog";
|
import { DialogTitle } from "@radix-ui/react-dialog";
|
||||||
import type { User } from "oidc-client-ts";
|
import { useState } from "react";
|
||||||
import { useEffect, useState } from "react";
|
|
||||||
import { useForm } from "react-hook-form";
|
import { useForm } from "react-hook-form";
|
||||||
import { z } from "zod";
|
import { z } from "zod";
|
||||||
import { Button } from "./ui/button";
|
import { Button } from "./ui/button";
|
||||||
|
|
@ -24,6 +22,12 @@ export enum ListingType {
|
||||||
BUY = 'BUY'
|
BUY = 'BUY'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export enum FurnishType {
|
||||||
|
FURNISHED = 'furnished',
|
||||||
|
PART_FURNISHED = 'partFurnished',
|
||||||
|
UNFURNISHED = 'unfurnished',
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
export interface ParameterValues {
|
export interface ParameterValues {
|
||||||
metric: Metric
|
metric: Metric
|
||||||
|
|
@ -33,30 +37,15 @@ export interface ParameterValues {
|
||||||
min_price?: number
|
min_price?: number
|
||||||
max_price?: number
|
max_price?: number
|
||||||
min_sqm?: number
|
min_sqm?: number
|
||||||
|
max_sqm?: number
|
||||||
|
min_price_per_sqm?: number
|
||||||
|
max_price_per_sqm?: number
|
||||||
last_seen_days?: number
|
last_seen_days?: number
|
||||||
available_from?: Date
|
available_from?: Date
|
||||||
district: string
|
district: string
|
||||||
|
furnish_types?: FurnishType[]
|
||||||
}
|
}
|
||||||
|
|
||||||
const fetchDistricts = async (user: User | null) => {
|
|
||||||
const accessToken = user?.access_token;
|
|
||||||
|
|
||||||
const response = await fetch('/api/get_districts',
|
|
||||||
{
|
|
||||||
method: 'GET',
|
|
||||||
headers: {
|
|
||||||
'Authorization': `Bearer ${accessToken}`, // Pass the token
|
|
||||||
'Content-Type': 'application/json',
|
|
||||||
},
|
|
||||||
}
|
|
||||||
);
|
|
||||||
if (!response.ok) {
|
|
||||||
throw new Error('Error: ' + response.status);
|
|
||||||
}
|
|
||||||
const data: Response = await response.json();
|
|
||||||
return data;
|
|
||||||
};
|
|
||||||
|
|
||||||
export function Parameters(
|
export function Parameters(
|
||||||
props: {
|
props: {
|
||||||
isOpen: boolean,
|
isOpen: boolean,
|
||||||
|
|
@ -69,15 +58,6 @@ export function Parameters(
|
||||||
} = useForm<ParameterValues>()
|
} = useForm<ParameterValues>()
|
||||||
const [action, setAction] = useState<'fetch-data' | 'visualize' | null>(null)
|
const [action, setAction] = useState<'fetch-data' | 'visualize' | null>(null)
|
||||||
const [availableFromRawInput, setAvailableFromRawInput] = useState("now");
|
const [availableFromRawInput, setAvailableFromRawInput] = useState("now");
|
||||||
const [_districts, setDistricts] = useState<string[]>([]);
|
|
||||||
|
|
||||||
useEffect(() => {
|
|
||||||
getUser().then(user => {
|
|
||||||
fetchDistricts(user).then(data => {
|
|
||||||
setDistricts(Object.keys(data));
|
|
||||||
})
|
|
||||||
})
|
|
||||||
}, []);
|
|
||||||
|
|
||||||
const formSchema = z.object({
|
const formSchema = z.object({
|
||||||
metric: z.nativeEnum(Metric, { required_error: "Metric is required" }),
|
metric: z.nativeEnum(Metric, { required_error: "Metric is required" }),
|
||||||
|
|
@ -177,29 +157,6 @@ export function Parameters(
|
||||||
</FormItem>
|
</FormItem>
|
||||||
)}
|
)}
|
||||||
/>
|
/>
|
||||||
{/* <FormField # listings don't have district stored as metadata; so only useful in rightmove querying
|
|
||||||
control={form.control}
|
|
||||||
name="district"
|
|
||||||
render={({ field }) => (
|
|
||||||
<FormItem className="flex flex-row items-center gap-4">
|
|
||||||
<FormLabel>District</FormLabel>
|
|
||||||
<Select onValueChange={field.onChange} defaultValue={field.value}>
|
|
||||||
<FormControl>
|
|
||||||
<SelectTrigger className="w-[180px]">
|
|
||||||
<SelectValue placeholder="District" />
|
|
||||||
</SelectTrigger>
|
|
||||||
</FormControl>
|
|
||||||
<SelectContent {...register('district')} >
|
|
||||||
{districts.map((district, index) => (
|
|
||||||
<SelectItem key={index} value={district}>{district}
|
|
||||||
</SelectItem>
|
|
||||||
))}
|
|
||||||
</SelectContent>
|
|
||||||
</Select>
|
|
||||||
<FormMessage />
|
|
||||||
</FormItem>
|
|
||||||
)}
|
|
||||||
/> */}
|
|
||||||
<FormField
|
<FormField
|
||||||
control={form.control}
|
control={form.control}
|
||||||
name="min_sqm"
|
name="min_sqm"
|
||||||
|
|
|
||||||
128
crawler/frontend/src/components/StatsBar.tsx
Normal file
128
crawler/frontend/src/components/StatsBar.tsx
Normal file
|
|
@ -0,0 +1,128 @@
|
||||||
|
import { BarChart3, MapPin, PoundSterling, Maximize2, List, Map as MapIcon } from 'lucide-react';
|
||||||
|
import { Button } from './ui/button';
|
||||||
|
import type { GeoJSONFeatureCollection, PropertyFeature } from '@/types';
|
||||||
|
|
||||||
|
export type ViewMode = 'map' | 'list' | 'split';
|
||||||
|
|
||||||
|
interface StatsBarProps {
|
||||||
|
listingData: GeoJSONFeatureCollection | null;
|
||||||
|
viewMode: ViewMode;
|
||||||
|
onViewModeChange: (mode: ViewMode) => void;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface ListingStats {
|
||||||
|
count: number;
|
||||||
|
avgPrice: number;
|
||||||
|
avgPricePerSqm: number;
|
||||||
|
avgSize: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
function calculateStats(data: GeoJSONFeatureCollection | null): ListingStats {
|
||||||
|
if (!data || data.features.length === 0) {
|
||||||
|
return { count: 0, avgPrice: 0, avgPricePerSqm: 0, avgSize: 0 };
|
||||||
|
}
|
||||||
|
|
||||||
|
const features = data.features;
|
||||||
|
const count = features.length;
|
||||||
|
|
||||||
|
const validPrices = features
|
||||||
|
.map((f: PropertyFeature) => f.properties.total_price)
|
||||||
|
.filter((p): p is number => typeof p === 'number' && p > 0);
|
||||||
|
|
||||||
|
const validPricesPerSqm = features
|
||||||
|
.map((f: PropertyFeature) => f.properties.qmprice)
|
||||||
|
.filter((p): p is number => typeof p === 'number' && p > 0);
|
||||||
|
|
||||||
|
const validSizes = features
|
||||||
|
.map((f: PropertyFeature) => f.properties.qm)
|
||||||
|
.filter((s): s is number => typeof s === 'number' && s > 0);
|
||||||
|
|
||||||
|
const avgPrice = validPrices.length > 0
|
||||||
|
? validPrices.reduce((a, b) => a + b, 0) / validPrices.length
|
||||||
|
: 0;
|
||||||
|
|
||||||
|
const avgPricePerSqm = validPricesPerSqm.length > 0
|
||||||
|
? validPricesPerSqm.reduce((a, b) => a + b, 0) / validPricesPerSqm.length
|
||||||
|
: 0;
|
||||||
|
|
||||||
|
const avgSize = validSizes.length > 0
|
||||||
|
? validSizes.reduce((a, b) => a + b, 0) / validSizes.length
|
||||||
|
: 0;
|
||||||
|
|
||||||
|
return { count, avgPrice, avgPricePerSqm, avgSize };
|
||||||
|
}
|
||||||
|
|
||||||
|
function formatCurrency(value: number): string {
|
||||||
|
if (value >= 1000) {
|
||||||
|
return `£${(value / 1000).toFixed(1)}k`;
|
||||||
|
}
|
||||||
|
return `£${Math.round(value)}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function StatsBar({ listingData, viewMode, onViewModeChange }: StatsBarProps) {
|
||||||
|
const stats = calculateStats(listingData);
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div className="flex items-center justify-between px-4 py-2 bg-muted/50 border-t text-sm">
|
||||||
|
{/* Stats */}
|
||||||
|
<div className="flex items-center gap-4 text-muted-foreground">
|
||||||
|
<div className="flex items-center gap-1.5">
|
||||||
|
<MapPin className="h-4 w-4" />
|
||||||
|
<span className="font-medium text-foreground">{stats.count.toLocaleString()}</span>
|
||||||
|
<span className="hidden sm:inline">listings</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{stats.avgPrice > 0 && (
|
||||||
|
<>
|
||||||
|
<div className="hidden md:flex items-center gap-1.5">
|
||||||
|
<PoundSterling className="h-4 w-4" />
|
||||||
|
<span>Avg: <span className="font-medium text-foreground">{formatCurrency(stats.avgPrice)}</span></span>
|
||||||
|
</div>
|
||||||
|
<div className="hidden lg:flex items-center gap-1.5">
|
||||||
|
<BarChart3 className="h-4 w-4" />
|
||||||
|
<span>Avg £/m²: <span className="font-medium text-foreground">{formatCurrency(stats.avgPricePerSqm)}</span></span>
|
||||||
|
</div>
|
||||||
|
<div className="hidden lg:flex items-center gap-1.5">
|
||||||
|
<Maximize2 className="h-4 w-4" />
|
||||||
|
<span>Avg: <span className="font-medium text-foreground">{Math.round(stats.avgSize)} m²</span></span>
|
||||||
|
</div>
|
||||||
|
</>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* View Mode Toggle */}
|
||||||
|
<div className="flex items-center gap-1 bg-background rounded-md border p-0.5">
|
||||||
|
<Button
|
||||||
|
variant={viewMode === 'map' ? 'secondary' : 'ghost'}
|
||||||
|
size="sm"
|
||||||
|
className="h-7 px-2"
|
||||||
|
onClick={() => onViewModeChange('map')}
|
||||||
|
>
|
||||||
|
<MapIcon className="h-4 w-4" />
|
||||||
|
<span className="hidden sm:inline ml-1">Map</span>
|
||||||
|
</Button>
|
||||||
|
<Button
|
||||||
|
variant={viewMode === 'list' ? 'secondary' : 'ghost'}
|
||||||
|
size="sm"
|
||||||
|
className="h-7 px-2"
|
||||||
|
onClick={() => onViewModeChange('list')}
|
||||||
|
>
|
||||||
|
<List className="h-4 w-4" />
|
||||||
|
<span className="hidden sm:inline ml-1">List</span>
|
||||||
|
</Button>
|
||||||
|
<Button
|
||||||
|
variant={viewMode === 'split' ? 'secondary' : 'ghost'}
|
||||||
|
size="sm"
|
||||||
|
className="h-7 px-2 hidden md:flex"
|
||||||
|
onClick={() => onViewModeChange('split')}
|
||||||
|
>
|
||||||
|
<div className="flex gap-0.5">
|
||||||
|
<div className="w-2 h-4 bg-current rounded-sm opacity-60" />
|
||||||
|
<div className="w-2 h-4 border border-current rounded-sm" />
|
||||||
|
</div>
|
||||||
|
<span className="hidden sm:inline ml-1">Split</span>
|
||||||
|
</Button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
}
|
||||||
47
crawler/frontend/src/components/StreamingProgressBar.tsx
Normal file
47
crawler/frontend/src/components/StreamingProgressBar.tsx
Normal file
|
|
@ -0,0 +1,47 @@
|
||||||
|
import { Loader2 } from 'lucide-react';
|
||||||
|
import type { StreamingProgress } from '@/services';
|
||||||
|
|
||||||
|
interface StreamingProgressBarProps {
|
||||||
|
progress: StreamingProgress | null;
|
||||||
|
isLoading: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
export function StreamingProgressBar({ progress, isLoading }: StreamingProgressBarProps) {
|
||||||
|
if (!isLoading) return null;
|
||||||
|
|
||||||
|
return (
|
||||||
|
<div className="absolute top-0 left-0 right-0 z-10 bg-background/95 backdrop-blur-sm border-b px-4 py-2">
|
||||||
|
<div className="flex items-center gap-3">
|
||||||
|
<Loader2 className="h-4 w-4 animate-spin text-primary" />
|
||||||
|
<div className="flex-1">
|
||||||
|
<div className="flex items-center justify-between text-sm">
|
||||||
|
<span className="font-medium">
|
||||||
|
{progress
|
||||||
|
? `Loading listings...`
|
||||||
|
: 'Loading...'}
|
||||||
|
</span>
|
||||||
|
{progress && (
|
||||||
|
<span className="text-muted-foreground">
|
||||||
|
{progress.count.toLocaleString()}
|
||||||
|
{progress.total ? ` / ${progress.total.toLocaleString()}` : ''} loaded
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
{progress && (
|
||||||
|
<div className="mt-1 h-1.5 w-full bg-primary/20 rounded-full overflow-hidden">
|
||||||
|
<div
|
||||||
|
className="h-full bg-primary transition-all duration-300 ease-out rounded-full"
|
||||||
|
style={{
|
||||||
|
width: progress.total
|
||||||
|
? `${Math.min((progress.count / progress.total) * 100, 100)}%`
|
||||||
|
: '100%',
|
||||||
|
animation: progress.total ? undefined : 'pulse 1.5s ease-in-out infinite',
|
||||||
|
}}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
}
|
||||||
56
crawler/frontend/src/components/ui/accordion.tsx
Normal file
56
crawler/frontend/src/components/ui/accordion.tsx
Normal file
|
|
@ -0,0 +1,56 @@
|
||||||
|
"use client"
|
||||||
|
|
||||||
|
import * as React from "react"
|
||||||
|
import * as AccordionPrimitive from "@radix-ui/react-accordion"
|
||||||
|
import { ChevronDown } from "lucide-react"
|
||||||
|
import { cn } from "@/lib/utils"
|
||||||
|
|
||||||
|
const Accordion = AccordionPrimitive.Root
|
||||||
|
|
||||||
|
const AccordionItem = React.forwardRef<
|
||||||
|
React.ComponentRef<typeof AccordionPrimitive.Item>,
|
||||||
|
React.ComponentPropsWithoutRef<typeof AccordionPrimitive.Item>
|
||||||
|
>(({ className, ...props }, ref) => (
|
||||||
|
<AccordionPrimitive.Item
|
||||||
|
ref={ref}
|
||||||
|
className={cn("border-b", className)}
|
||||||
|
{...props}
|
||||||
|
/>
|
||||||
|
))
|
||||||
|
AccordionItem.displayName = "AccordionItem"
|
||||||
|
|
||||||
|
const AccordionTrigger = React.forwardRef<
|
||||||
|
React.ComponentRef<typeof AccordionPrimitive.Trigger>,
|
||||||
|
React.ComponentPropsWithoutRef<typeof AccordionPrimitive.Trigger>
|
||||||
|
>(({ className, children, ...props }, ref) => (
|
||||||
|
<AccordionPrimitive.Header className="flex">
|
||||||
|
<AccordionPrimitive.Trigger
|
||||||
|
ref={ref}
|
||||||
|
className={cn(
|
||||||
|
"flex flex-1 items-center justify-between py-4 text-sm font-medium transition-all hover:underline text-left [&[data-state=open]>svg]:rotate-180",
|
||||||
|
className
|
||||||
|
)}
|
||||||
|
{...props}
|
||||||
|
>
|
||||||
|
{children}
|
||||||
|
<ChevronDown className="h-4 w-4 shrink-0 text-muted-foreground transition-transform duration-200" />
|
||||||
|
</AccordionPrimitive.Trigger>
|
||||||
|
</AccordionPrimitive.Header>
|
||||||
|
))
|
||||||
|
AccordionTrigger.displayName = AccordionPrimitive.Trigger.displayName
|
||||||
|
|
||||||
|
const AccordionContent = React.forwardRef<
|
||||||
|
React.ComponentRef<typeof AccordionPrimitive.Content>,
|
||||||
|
React.ComponentPropsWithoutRef<typeof AccordionPrimitive.Content>
|
||||||
|
>(({ className, children, ...props }, ref) => (
|
||||||
|
<AccordionPrimitive.Content
|
||||||
|
ref={ref}
|
||||||
|
className="overflow-hidden text-sm data-[state=closed]:animate-accordion-up data-[state=open]:animate-accordion-down"
|
||||||
|
{...props}
|
||||||
|
>
|
||||||
|
<div className={cn("pb-4 pt-0", className)}>{children}</div>
|
||||||
|
</AccordionPrimitive.Content>
|
||||||
|
))
|
||||||
|
AccordionContent.displayName = AccordionPrimitive.Content.displayName
|
||||||
|
|
||||||
|
export { Accordion, AccordionItem, AccordionTrigger, AccordionContent }
|
||||||
|
|
@ -1,66 +0,0 @@
|
||||||
import * as React from "react"
|
|
||||||
import { cva, type VariantProps } from "class-variance-authority"
|
|
||||||
|
|
||||||
import { cn } from "@/lib/utils"
|
|
||||||
|
|
||||||
const alertVariants = cva(
|
|
||||||
"relative w-full rounded-lg border px-4 py-3 text-sm grid has-[>svg]:grid-cols-[calc(var(--spacing)*4)_1fr] grid-cols-[0_1fr] has-[>svg]:gap-x-3 gap-y-0.5 items-start [&>svg]:size-4 [&>svg]:translate-y-0.5 [&>svg]:text-current",
|
|
||||||
{
|
|
||||||
variants: {
|
|
||||||
variant: {
|
|
||||||
default: "bg-card text-card-foreground",
|
|
||||||
destructive:
|
|
||||||
"text-destructive bg-card [&>svg]:text-current *:data-[slot=alert-description]:text-destructive/90",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
defaultVariants: {
|
|
||||||
variant: "default",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
function Alert({
|
|
||||||
className,
|
|
||||||
variant,
|
|
||||||
...props
|
|
||||||
}: React.ComponentProps<"div"> & VariantProps<typeof alertVariants>) {
|
|
||||||
return (
|
|
||||||
<div
|
|
||||||
data-slot="alert"
|
|
||||||
role="alert"
|
|
||||||
className={cn(alertVariants({ variant }), className)}
|
|
||||||
{...props}
|
|
||||||
/>
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
function AlertTitle({ className, ...props }: React.ComponentProps<"div">) {
|
|
||||||
return (
|
|
||||||
<div
|
|
||||||
data-slot="alert-title"
|
|
||||||
className={cn(
|
|
||||||
"col-start-2 line-clamp-1 min-h-4 font-medium tracking-tight",
|
|
||||||
className
|
|
||||||
)}
|
|
||||||
{...props}
|
|
||||||
/>
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
function AlertDescription({
|
|
||||||
className,
|
|
||||||
...props
|
|
||||||
}: React.ComponentProps<"div">) {
|
|
||||||
return (
|
|
||||||
<div
|
|
||||||
data-slot="alert-description"
|
|
||||||
className={cn(
|
|
||||||
"text-muted-foreground col-start-2 grid justify-items-start gap-1 text-sm [&_p]:leading-relaxed",
|
|
||||||
className
|
|
||||||
)}
|
|
||||||
{...props}
|
|
||||||
/>
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
export { Alert, AlertTitle, AlertDescription }
|
|
||||||
|
|
@ -1,46 +0,0 @@
|
||||||
import * as React from "react"
|
|
||||||
import { Slot } from "@radix-ui/react-slot"
|
|
||||||
import { cva, type VariantProps } from "class-variance-authority"
|
|
||||||
|
|
||||||
import { cn } from "@/lib/utils"
|
|
||||||
|
|
||||||
const badgeVariants = cva(
|
|
||||||
"inline-flex items-center justify-center rounded-md border px-2 py-0.5 text-xs font-medium w-fit whitespace-nowrap shrink-0 [&>svg]:size-3 gap-1 [&>svg]:pointer-events-none focus-visible:border-ring focus-visible:ring-ring/50 focus-visible:ring-[3px] aria-invalid:ring-destructive/20 dark:aria-invalid:ring-destructive/40 aria-invalid:border-destructive transition-[color,box-shadow] overflow-hidden",
|
|
||||||
{
|
|
||||||
variants: {
|
|
||||||
variant: {
|
|
||||||
default:
|
|
||||||
"border-transparent bg-primary text-primary-foreground [a&]:hover:bg-primary/90",
|
|
||||||
secondary:
|
|
||||||
"border-transparent bg-secondary text-secondary-foreground [a&]:hover:bg-secondary/90",
|
|
||||||
destructive:
|
|
||||||
"border-transparent bg-destructive text-white [a&]:hover:bg-destructive/90 focus-visible:ring-destructive/20 dark:focus-visible:ring-destructive/40 dark:bg-destructive/60",
|
|
||||||
outline:
|
|
||||||
"text-foreground [a&]:hover:bg-accent [a&]:hover:text-accent-foreground",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
defaultVariants: {
|
|
||||||
variant: "default",
|
|
||||||
},
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
function Badge({
|
|
||||||
className,
|
|
||||||
variant,
|
|
||||||
asChild = false,
|
|
||||||
...props
|
|
||||||
}: React.ComponentProps<"span"> &
|
|
||||||
VariantProps<typeof badgeVariants> & { asChild?: boolean }) {
|
|
||||||
const Comp = asChild ? Slot : "span"
|
|
||||||
|
|
||||||
return (
|
|
||||||
<Comp
|
|
||||||
data-slot="badge"
|
|
||||||
className={cn(badgeVariants({ variant }), className)}
|
|
||||||
{...props}
|
|
||||||
/>
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
export { Badge, badgeVariants }
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
import * as React from "react"
|
import * as React from "react"
|
||||||
import { Slot } from "@radix-ui/react-slot"
|
import { Slot } from "@radix-ui/react-slot"
|
||||||
import { ChevronRight, MoreHorizontal } from "lucide-react"
|
import { ChevronRight } from "lucide-react"
|
||||||
|
|
||||||
import { cn } from "@/lib/utils"
|
import { cn } from "@/lib/utils"
|
||||||
|
|
||||||
|
|
@ -80,24 +80,6 @@ function BreadcrumbSeparator({
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
function BreadcrumbEllipsis({
|
|
||||||
className,
|
|
||||||
...props
|
|
||||||
}: React.ComponentProps<"span">) {
|
|
||||||
return (
|
|
||||||
<span
|
|
||||||
data-slot="breadcrumb-ellipsis"
|
|
||||||
role="presentation"
|
|
||||||
aria-hidden="true"
|
|
||||||
className={cn("flex size-9 items-center justify-center", className)}
|
|
||||||
{...props}
|
|
||||||
>
|
|
||||||
<MoreHorizontal className="size-4" />
|
|
||||||
<span className="sr-only">More</span>
|
|
||||||
</span>
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
export {
|
export {
|
||||||
Breadcrumb,
|
Breadcrumb,
|
||||||
BreadcrumbList,
|
BreadcrumbList,
|
||||||
|
|
@ -105,5 +87,4 @@ export {
|
||||||
BreadcrumbLink,
|
BreadcrumbLink,
|
||||||
BreadcrumbPage,
|
BreadcrumbPage,
|
||||||
BreadcrumbSeparator,
|
BreadcrumbSeparator,
|
||||||
BreadcrumbEllipsis,
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
29
crawler/frontend/src/components/ui/checkbox.tsx
Normal file
29
crawler/frontend/src/components/ui/checkbox.tsx
Normal file
|
|
@ -0,0 +1,29 @@
|
||||||
|
"use client"
|
||||||
|
|
||||||
|
import * as React from "react"
|
||||||
|
import * as CheckboxPrimitive from "@radix-ui/react-checkbox"
|
||||||
|
import { Check } from "lucide-react"
|
||||||
|
import { cn } from "@/lib/utils"
|
||||||
|
|
||||||
|
const Checkbox = React.forwardRef<
|
||||||
|
React.ComponentRef<typeof CheckboxPrimitive.Root>,
|
||||||
|
React.ComponentPropsWithoutRef<typeof CheckboxPrimitive.Root>
|
||||||
|
>(({ className, ...props }, ref) => (
|
||||||
|
<CheckboxPrimitive.Root
|
||||||
|
ref={ref}
|
||||||
|
className={cn(
|
||||||
|
"peer h-4 w-4 shrink-0 rounded-sm border border-primary shadow focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:cursor-not-allowed disabled:opacity-50 data-[state=checked]:bg-primary data-[state=checked]:text-primary-foreground",
|
||||||
|
className
|
||||||
|
)}
|
||||||
|
{...props}
|
||||||
|
>
|
||||||
|
<CheckboxPrimitive.Indicator
|
||||||
|
className={cn("flex items-center justify-center text-current")}
|
||||||
|
>
|
||||||
|
<Check className="h-4 w-4" />
|
||||||
|
</CheckboxPrimitive.Indicator>
|
||||||
|
</CheckboxPrimitive.Root>
|
||||||
|
))
|
||||||
|
Checkbox.displayName = CheckboxPrimitive.Root.displayName
|
||||||
|
|
||||||
|
export { Checkbox }
|
||||||
34
crawler/frontend/src/components/ui/slider.tsx
Normal file
34
crawler/frontend/src/components/ui/slider.tsx
Normal file
|
|
@ -0,0 +1,34 @@
|
||||||
|
"use client"
|
||||||
|
|
||||||
|
import * as React from "react"
|
||||||
|
import * as SliderPrimitive from "@radix-ui/react-slider"
|
||||||
|
import { cn } from "@/lib/utils"
|
||||||
|
|
||||||
|
const Slider = React.forwardRef<
|
||||||
|
React.ComponentRef<typeof SliderPrimitive.Root>,
|
||||||
|
React.ComponentPropsWithoutRef<typeof SliderPrimitive.Root>
|
||||||
|
>(({ className, ...props }, ref) => (
|
||||||
|
<SliderPrimitive.Root
|
||||||
|
ref={ref}
|
||||||
|
className={cn(
|
||||||
|
"relative flex w-full touch-none select-none items-center",
|
||||||
|
className
|
||||||
|
)}
|
||||||
|
{...props}
|
||||||
|
>
|
||||||
|
<SliderPrimitive.Track className="relative h-1.5 w-full grow overflow-hidden rounded-full bg-primary/20">
|
||||||
|
<SliderPrimitive.Range className="absolute h-full bg-primary" />
|
||||||
|
</SliderPrimitive.Track>
|
||||||
|
{props.defaultValue?.map((_, index) => (
|
||||||
|
<SliderPrimitive.Thumb
|
||||||
|
key={index}
|
||||||
|
className="block h-4 w-4 rounded-full border border-primary/50 bg-background shadow transition-colors focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:pointer-events-none disabled:opacity-50"
|
||||||
|
/>
|
||||||
|
)) ?? (
|
||||||
|
<SliderPrimitive.Thumb className="block h-4 w-4 rounded-full border border-primary/50 bg-background shadow transition-colors focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:pointer-events-none disabled:opacity-50" />
|
||||||
|
)}
|
||||||
|
</SliderPrimitive.Root>
|
||||||
|
))
|
||||||
|
Slider.displayName = SliderPrimitive.Root.displayName
|
||||||
|
|
||||||
|
export { Slider }
|
||||||
|
|
@ -118,3 +118,30 @@
|
||||||
@apply bg-background text-foreground;
|
@apply bg-background text-foreground;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Accordion animations */
|
||||||
|
@keyframes accordion-down {
|
||||||
|
from {
|
||||||
|
height: 0;
|
||||||
|
}
|
||||||
|
to {
|
||||||
|
height: var(--radix-accordion-content-height);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@keyframes accordion-up {
|
||||||
|
from {
|
||||||
|
height: var(--radix-accordion-content-height);
|
||||||
|
}
|
||||||
|
to {
|
||||||
|
height: 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
.animate-accordion-down {
|
||||||
|
animation: accordion-down 0.2s ease-out;
|
||||||
|
}
|
||||||
|
|
||||||
|
.animate-accordion-up {
|
||||||
|
animation: accordion-up 0.2s ease-out;
|
||||||
|
}
|
||||||
|
|
|
||||||
62
crawler/frontend/src/services/apiClient.ts
Normal file
62
crawler/frontend/src/services/apiClient.ts
Normal file
|
|
@ -0,0 +1,62 @@
|
||||||
|
// Generic API client with authentication
|
||||||
|
|
||||||
|
import type { User } from 'oidc-client-ts';
|
||||||
|
import { ApiError } from '@/types';
|
||||||
|
|
||||||
|
export interface RequestOptions {
|
||||||
|
method?: 'GET' | 'POST' | 'PUT' | 'DELETE';
|
||||||
|
params?: Record<string, string | number | boolean | Date | undefined>;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Build query string from parameters object
|
||||||
|
*/
|
||||||
|
function buildQueryString(params: Record<string, string | number | boolean | Date | undefined>): string {
|
||||||
|
const queryString = new URLSearchParams();
|
||||||
|
|
||||||
|
for (const [key, value] of Object.entries(params)) {
|
||||||
|
if (value !== undefined && value !== null && value !== '') {
|
||||||
|
if (value instanceof Date) {
|
||||||
|
queryString.append(key, value.toISOString());
|
||||||
|
} else {
|
||||||
|
queryString.append(key, String(value));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return queryString.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Generic authenticated API request
|
||||||
|
*/
|
||||||
|
export async function apiRequest<T>(
|
||||||
|
user: User,
|
||||||
|
endpoint: string,
|
||||||
|
options: RequestOptions = {}
|
||||||
|
): Promise<T> {
|
||||||
|
const { method = 'GET', params } = options;
|
||||||
|
const accessToken = user.access_token;
|
||||||
|
|
||||||
|
let url = endpoint;
|
||||||
|
if (params) {
|
||||||
|
const queryString = buildQueryString(params);
|
||||||
|
if (queryString) {
|
||||||
|
url = `${endpoint}?${queryString}`;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const response = await fetch(url, {
|
||||||
|
method,
|
||||||
|
headers: {
|
||||||
|
Authorization: `Bearer ${accessToken}`,
|
||||||
|
'Content-Type': 'application/json',
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!response.ok) {
|
||||||
|
throw new ApiError(`Error: ${response.status}`, response.status);
|
||||||
|
}
|
||||||
|
|
||||||
|
return response.json() as Promise<T>;
|
||||||
|
}
|
||||||
54
crawler/frontend/src/services/listingService.ts
Normal file
54
crawler/frontend/src/services/listingService.ts
Normal file
|
|
@ -0,0 +1,54 @@
|
||||||
|
// Listing service for fetching and refreshing listings
|
||||||
|
|
||||||
|
import type { User } from 'oidc-client-ts';
|
||||||
|
import type { GeoJSONFeatureCollection, RefreshListingsResponse } from '@/types';
|
||||||
|
import type { ParameterValues } from '@/components/FilterPanel';
|
||||||
|
import { apiRequest } from './apiClient';
|
||||||
|
import { API_ENDPOINTS } from '@/constants';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Build listing query parameters from form values
|
||||||
|
*/
|
||||||
|
function buildListingParams(parameters: ParameterValues): Record<string, string | number | boolean | Date | undefined> {
|
||||||
|
return {
|
||||||
|
listing_type: parameters.listing_type,
|
||||||
|
min_bedrooms: parameters.min_bedrooms,
|
||||||
|
max_bedrooms: parameters.max_bedrooms,
|
||||||
|
max_price: parameters.max_price,
|
||||||
|
min_price: parameters.min_price,
|
||||||
|
min_sqm: parameters.min_sqm,
|
||||||
|
max_sqm: parameters.max_sqm,
|
||||||
|
min_price_per_sqm: parameters.min_price_per_sqm,
|
||||||
|
max_price_per_sqm: parameters.max_price_per_sqm,
|
||||||
|
last_seen_days: parameters.last_seen_days,
|
||||||
|
let_date_available_from: parameters.available_from,
|
||||||
|
district_names: parameters.district || undefined,
|
||||||
|
furnish_types: parameters.furnish_types?.join(',') || undefined,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fetch listing data as GeoJSON
|
||||||
|
*/
|
||||||
|
export async function fetchListingGeoJSON(
|
||||||
|
user: User,
|
||||||
|
parameters: ParameterValues
|
||||||
|
): Promise<GeoJSONFeatureCollection> {
|
||||||
|
return apiRequest<GeoJSONFeatureCollection>(user, API_ENDPOINTS.LISTING_GEOJSON, {
|
||||||
|
method: 'GET',
|
||||||
|
params: buildListingParams(parameters),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Trigger a listing refresh task
|
||||||
|
*/
|
||||||
|
export async function refreshListings(
|
||||||
|
user: User,
|
||||||
|
parameters: ParameterValues
|
||||||
|
): Promise<RefreshListingsResponse> {
|
||||||
|
return apiRequest<RefreshListingsResponse>(user, API_ENDPOINTS.REFRESH_LISTINGS, {
|
||||||
|
method: 'POST',
|
||||||
|
params: buildListingParams(parameters),
|
||||||
|
});
|
||||||
|
}
|
||||||
45
crawler/frontend/src/utils/mapUtils.ts
Normal file
45
crawler/frontend/src/utils/mapUtils.ts
Normal file
|
|
@ -0,0 +1,45 @@
|
||||||
|
// Map utility functions
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Deep clone an object using JSON serialization
|
||||||
|
*/
|
||||||
|
export function clone<T>(obj: T): T {
|
||||||
|
return JSON.parse(JSON.stringify(obj));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculate the value at a given percentile in a sorted array
|
||||||
|
* @param arr Sorted array of numbers
|
||||||
|
* @param p Percentile (0-1)
|
||||||
|
*/
|
||||||
|
export function percentile(arr: number[], p: number): number {
|
||||||
|
if (arr.length === 0) return 0;
|
||||||
|
if (typeof p !== 'number') throw new TypeError('p must be a number');
|
||||||
|
if (p <= 0) return arr[0];
|
||||||
|
if (p >= 1) return arr[arr.length - 1];
|
||||||
|
|
||||||
|
const index = arr.length * p;
|
||||||
|
const lower = Math.floor(index);
|
||||||
|
const upper = lower + 1;
|
||||||
|
const weight = index % 1;
|
||||||
|
|
||||||
|
if (upper >= arr.length) return arr[lower];
|
||||||
|
return arr[lower] * (1 - weight) + arr[upper] * weight;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert percentage-based color stops to value-based color stops
|
||||||
|
* @param colorStopsPerc Array of [percentage, color] tuples
|
||||||
|
* @param min Minimum value
|
||||||
|
* @param max Maximum value
|
||||||
|
*/
|
||||||
|
export function calculateColorStops(
|
||||||
|
colorStopsPerc: [number, string][],
|
||||||
|
min: number,
|
||||||
|
max: number
|
||||||
|
): [number, string][] {
|
||||||
|
return colorStopsPerc.map(([perc, color]) => [
|
||||||
|
min + (perc * (max - min)) / 100,
|
||||||
|
color,
|
||||||
|
]);
|
||||||
|
}
|
||||||
|
|
@ -1 +1 @@
|
||||||
{"root":["./src/App.tsx","./src/AppSidebar.tsx","./src/main.tsx","./src/vite-env.d.ts","./src/auth/authService.ts","./src/auth/config.ts","./src/components/ActiveQuery.tsx","./src/components/AlertError.tsx","./src/components/LoginModal.tsx","./src/components/Map.tsx","./src/components/Parameters.tsx","./src/components/ui/alert-dialog.tsx","./src/components/ui/alert.tsx","./src/components/ui/badge.tsx","./src/components/ui/breadcrumb.tsx","./src/components/ui/button.tsx","./src/components/ui/dialog.tsx","./src/components/ui/form.tsx","./src/components/ui/hover-card.tsx","./src/components/ui/input.tsx","./src/components/ui/label.tsx","./src/components/ui/progress.tsx","./src/components/ui/scroll-area.tsx","./src/components/ui/select.tsx","./src/components/ui/separator.tsx","./src/components/ui/sheet.tsx","./src/components/ui/sidebar.tsx","./src/components/ui/skeleton.tsx","./src/components/ui/tooltip.tsx","./src/hooks/use-mobile.ts","./src/lib/utils.ts"],"version":"5.8.3"}
|
{"root":["./src/App.tsx","./src/AppSidebar.tsx","./src/main.tsx","./src/vite-env.d.ts","./src/auth/authService.ts","./src/auth/config.ts","./src/auth/errors.ts","./src/components/ActiveQuery.tsx","./src/components/AlertError.tsx","./src/components/AuthCallback.tsx","./src/components/FilterPanel.tsx","./src/components/Header.tsx","./src/components/HealthIndicator.tsx","./src/components/ListView.tsx","./src/components/LoginModal.tsx","./src/components/Map.tsx","./src/components/Parameters.tsx","./src/components/PropertyCard.tsx","./src/components/Spinner.tsx","./src/components/StatsBar.tsx","./src/components/StreamingProgressBar.tsx","./src/components/TaskIndicator.tsx","./src/components/ui/DatePicker.tsx","./src/components/ui/accordion.tsx","./src/components/ui/alert-dialog.tsx","./src/components/ui/breadcrumb.tsx","./src/components/ui/button.tsx","./src/components/ui/calendar.tsx","./src/components/ui/checkbox.tsx","./src/components/ui/dialog.tsx","./src/components/ui/form.tsx","./src/components/ui/hover-card.tsx","./src/components/ui/input.tsx","./src/components/ui/label.tsx","./src/components/ui/popover.tsx","./src/components/ui/progress.tsx","./src/components/ui/scroll-area.tsx","./src/components/ui/select.tsx","./src/components/ui/separator.tsx","./src/components/ui/sheet.tsx","./src/components/ui/sidebar.tsx","./src/components/ui/skeleton.tsx","./src/components/ui/slider.tsx","./src/components/ui/tooltip.tsx","./src/constants/colorSchemes.ts","./src/constants/index.ts","./src/hooks/use-mobile.ts","./src/lib/utils.ts","./src/services/apiClient.ts","./src/services/healthService.ts","./src/services/index.ts","./src/services/listingService.ts","./src/services/streamingService.ts","./src/services/taskService.ts","./src/types/index.ts","./src/utils/mapUtils.ts"],"version":"5.8.3"}
|
||||||
|
|
@ -19,7 +19,8 @@ export default defineConfig({
|
||||||
allowedHosts: [
|
allowedHosts: [
|
||||||
env.DEV_HOST ?? 'localhost',
|
env.DEV_HOST ?? 'localhost',
|
||||||
// Add more hosts here
|
// Add more hosts here
|
||||||
'wrongmove.viktorbarzin.me'
|
'wrongmove.viktorbarzin.me',
|
||||||
|
'devvm.viktorbarzin.lan'
|
||||||
],
|
],
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
|
||||||
319
crawler/main.py
319
crawler/main.py
|
|
@ -1,28 +1,28 @@
|
||||||
|
"""CLI entry point for the Real Estate Crawler."""
|
||||||
import asyncio
|
import asyncio
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
|
from typing import Callable, ParamSpec, TypeVar
|
||||||
import click
|
import click
|
||||||
import importlib
|
|
||||||
|
|
||||||
from models.listing import FurnishType, ListingType, QueryParameters
|
from models.listing import FurnishType, ListingType, QueryParameters
|
||||||
from rec.districts import get_districts
|
|
||||||
from data_access import Listing
|
from data_access import Listing
|
||||||
import csv_exporter
|
|
||||||
from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode
|
from rec.routing import API_KEY_ENVIRONMENT_VARIABLE, TravelMode
|
||||||
from repositories.listing_repository import ListingRepository
|
from repositories.listing_repository import ListingRepository
|
||||||
from ui_exporter import export_immoweb as export_immoweb_ui
|
|
||||||
from functools import wraps
|
from functools import wraps
|
||||||
from database import engine
|
from database import engine
|
||||||
|
from services import (
|
||||||
|
listing_service,
|
||||||
|
export_service,
|
||||||
|
district_service,
|
||||||
|
)
|
||||||
|
|
||||||
|
P = ParamSpec("P")
|
||||||
|
R = TypeVar("R")
|
||||||
|
|
||||||
|
|
||||||
dump_listings_module = importlib.import_module("1_dump_listings")
|
def listing_filter_options(func: Callable[P, R]) -> Callable[P, R]:
|
||||||
dump_images_module = importlib.import_module("3_dump_images")
|
|
||||||
detect_floorplan_module = importlib.import_module("4_detect_floorplan")
|
|
||||||
routing_module = importlib.import_module("5_routing")
|
|
||||||
|
|
||||||
|
|
||||||
def listing_filter_options(func):
|
|
||||||
"""Decorator to add common options for filtering listings."""
|
"""Decorator to add common options for filtering listings."""
|
||||||
|
|
||||||
@click.option(
|
@click.option(
|
||||||
|
|
@ -45,7 +45,7 @@ def listing_filter_options(func):
|
||||||
"--max-bedrooms",
|
"--max-bedrooms",
|
||||||
default=10,
|
default=10,
|
||||||
help="Maximum number of bedrooms",
|
help="Maximum number of bedrooms",
|
||||||
type=click.IntRange(min=1, max=10), # Right move gets unhappy with >10
|
type=click.IntRange(min=1, max=10),
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
"--min-price",
|
"--min-price",
|
||||||
|
|
@ -57,13 +57,13 @@ def listing_filter_options(func):
|
||||||
"--max-price",
|
"--max-price",
|
||||||
default=999_999,
|
default=999_999,
|
||||||
help="Maximum price",
|
help="Maximum price",
|
||||||
type=click.IntRange(min=0), # 40k for renting
|
type=click.IntRange(min=0),
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
"--district",
|
"--district",
|
||||||
default=None,
|
default=None,
|
||||||
help="Districts to scrape",
|
help="Districts to scrape",
|
||||||
type=click.Choice(get_districts().keys(), case_sensitive=False),
|
type=click.Choice(district_service.get_district_names(), case_sensitive=False),
|
||||||
multiple=True,
|
multiple=True,
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
|
|
@ -95,17 +95,50 @@ def listing_filter_options(func):
|
||||||
type=int,
|
type=int,
|
||||||
)
|
)
|
||||||
@wraps(func)
|
@wraps(func)
|
||||||
def wrapper(*args, **kwargs):
|
def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
|
||||||
return func(*args, **kwargs)
|
return func(*args, **kwargs)
|
||||||
|
|
||||||
return wrapper
|
return wrapper
|
||||||
|
|
||||||
|
|
||||||
|
def build_query_parameters(
|
||||||
|
type: str,
|
||||||
|
district: list[str],
|
||||||
|
min_bedrooms: int,
|
||||||
|
max_bedrooms: int,
|
||||||
|
min_price: int,
|
||||||
|
max_price: int,
|
||||||
|
furnish_types: list[str],
|
||||||
|
available_from: datetime | None,
|
||||||
|
last_seen_days: int,
|
||||||
|
min_sqm: int | None = None,
|
||||||
|
radius: int = 0,
|
||||||
|
page_size: int = 500,
|
||||||
|
max_days_since_added: int = 14,
|
||||||
|
) -> QueryParameters:
|
||||||
|
"""Build QueryParameters from CLI options."""
|
||||||
|
return QueryParameters(
|
||||||
|
listing_type=ListingType[type],
|
||||||
|
district_names=set(district) if district else None,
|
||||||
|
min_bedrooms=min_bedrooms,
|
||||||
|
max_bedrooms=max_bedrooms,
|
||||||
|
min_price=min_price,
|
||||||
|
max_price=max_price,
|
||||||
|
furnish_types=[FurnishType[ft] for ft in furnish_types] if furnish_types else None,
|
||||||
|
let_date_available_from=available_from,
|
||||||
|
last_seen_days=last_seen_days,
|
||||||
|
min_sqm=min_sqm,
|
||||||
|
radius=radius,
|
||||||
|
page_size=page_size,
|
||||||
|
max_days_since_added=max_days_since_added,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@click.group()
|
@click.group()
|
||||||
@click.option(
|
@click.option(
|
||||||
"--data-dir",
|
"--data-dir",
|
||||||
default=pathlib.Path("data/rs/"),
|
default=pathlib.Path("data/rs/"),
|
||||||
help="Districts to scrape",
|
help="Data directory for storing listings",
|
||||||
type=click.Path(
|
type=click.Path(
|
||||||
writable=True,
|
writable=True,
|
||||||
file_okay=False,
|
file_okay=False,
|
||||||
|
|
@ -114,17 +147,18 @@ def listing_filter_options(func):
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def cli(ctx, data_dir: str):
|
def cli(ctx: click.Context, data_dir: str) -> None:
|
||||||
ctx.ensure_object(dict)
|
ctx.ensure_object(dict)
|
||||||
ctx.obj["data_dir"] = data_dir
|
ctx.obj["data_dir"] = pathlib.Path(data_dir)
|
||||||
|
ctx.obj["repository"] = ListingRepository(engine=engine)
|
||||||
|
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@listing_filter_options
|
@listing_filter_options
|
||||||
@click.option("--full", is_flag=True)
|
@click.option("--full", is_flag=True, help="Include images and floorplan detection")
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def dump_listings(
|
def dump_listings(
|
||||||
ctx: click.core.Context,
|
ctx: click.Context,
|
||||||
full: bool,
|
full: bool,
|
||||||
district: list[str],
|
district: list[str],
|
||||||
min_bedrooms: int,
|
min_bedrooms: int,
|
||||||
|
|
@ -136,58 +170,63 @@ def dump_listings(
|
||||||
available_from: datetime | None,
|
available_from: datetime | None,
|
||||||
last_seen_days: int,
|
last_seen_days: int,
|
||||||
min_sqm: int | None = None,
|
min_sqm: int | None = None,
|
||||||
):
|
) -> None:
|
||||||
data_dir: str = ctx.obj["data_dir"]
|
"""Fetch listings from Rightmove API."""
|
||||||
query_parameters = QueryParameters(
|
data_dir: pathlib.Path = ctx.obj["data_dir"]
|
||||||
listing_type=ListingType[type],
|
repository: ListingRepository = ctx.obj["repository"]
|
||||||
district_names=set(district),
|
|
||||||
|
query_parameters = build_query_parameters(
|
||||||
|
type=type,
|
||||||
|
district=district,
|
||||||
min_bedrooms=min_bedrooms,
|
min_bedrooms=min_bedrooms,
|
||||||
max_bedrooms=max_bedrooms,
|
max_bedrooms=max_bedrooms,
|
||||||
min_price=min_price,
|
min_price=min_price,
|
||||||
max_price=max_price,
|
max_price=max_price,
|
||||||
furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types],
|
furnish_types=furnish_types,
|
||||||
let_date_available_from=available_from,
|
available_from=available_from,
|
||||||
last_seen_days=last_seen_days,
|
last_seen_days=last_seen_days,
|
||||||
min_sqm=min_sqm,
|
min_sqm=min_sqm,
|
||||||
radius=0,
|
|
||||||
page_size=500,
|
|
||||||
max_days_since_added=14,
|
|
||||||
)
|
)
|
||||||
click.echo(
|
|
||||||
f"Running dump_listings for districts {district}, data dir {data_dir} and parameters: "
|
click.echo(f"Fetching listings with parameters: {query_parameters}")
|
||||||
f"{query_parameters}"
|
|
||||||
|
result = asyncio.run(
|
||||||
|
listing_service.refresh_listings(
|
||||||
|
repository,
|
||||||
|
query_parameters,
|
||||||
|
full=full,
|
||||||
|
async_mode=False,
|
||||||
|
)
|
||||||
)
|
)
|
||||||
data_dir_path = pathlib.Path(data_dir)
|
|
||||||
repository = ListingRepository(engine=engine)
|
click.echo(result.message)
|
||||||
if not full: # only listings
|
|
||||||
asyncio.run(
|
|
||||||
dump_listings_module.dump_listings(
|
|
||||||
query_parameters, repository, data_dir_path
|
|
||||||
)
|
|
||||||
)
|
|
||||||
else: # include images, floorplan detection etc.
|
|
||||||
asyncio.run(
|
|
||||||
dump_listings_module.dump_listings_full(
|
|
||||||
query_parameters, repository, data_dir_path
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def dump_images(ctx: click.core.Context):
|
def dump_images(ctx: click.Context) -> None:
|
||||||
data_dir = ctx.obj["data_dir"]
|
"""Download floorplan images for all listings."""
|
||||||
click.echo(f"Running dump_images for listings stored in {engine.url}")
|
data_dir: pathlib.Path = ctx.obj["data_dir"]
|
||||||
repository = ListingRepository(engine=engine)
|
repository: ListingRepository = ctx.obj["repository"]
|
||||||
asyncio.run(dump_images_module.dump_images(repository, image_base_path=data_dir))
|
|
||||||
|
click.echo(f"Downloading images to {data_dir}")
|
||||||
|
|
||||||
|
count = asyncio.run(listing_service.download_images(repository, data_dir))
|
||||||
|
|
||||||
|
click.echo(f"Processed {count} listings")
|
||||||
|
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def detect_floorplan(ctx: click.core.Context):
|
def detect_floorplan(ctx: click.Context) -> None:
|
||||||
click.echo(f"Running detect_floorplan for listings stored in {engine.url}")
|
"""Run OCR on floorplan images to detect square meters."""
|
||||||
repository = ListingRepository(engine=engine)
|
repository: ListingRepository = ctx.obj["repository"]
|
||||||
asyncio.run(detect_floorplan_module.detect_floorplan(repository))
|
|
||||||
|
click.echo("Running floorplan detection...")
|
||||||
|
|
||||||
|
count = asyncio.run(listing_service.detect_floorplans(repository))
|
||||||
|
|
||||||
|
click.echo(f"Processed {count} listings")
|
||||||
|
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
|
|
@ -202,10 +241,7 @@ def detect_floorplan(ctx: click.core.Context):
|
||||||
"--travel-mode",
|
"--travel-mode",
|
||||||
"-m",
|
"-m",
|
||||||
help="Travel mode for routing",
|
help="Travel mode for routing",
|
||||||
type=click.Choice(
|
type=click.Choice(TravelMode.__members__.keys(), case_sensitive=False),
|
||||||
TravelMode.__members__.keys(),
|
|
||||||
case_sensitive=False,
|
|
||||||
),
|
|
||||||
required=True,
|
required=True,
|
||||||
)
|
)
|
||||||
@click.option(
|
@click.option(
|
||||||
|
|
@ -213,65 +249,50 @@ def detect_floorplan(ctx: click.core.Context):
|
||||||
"-l",
|
"-l",
|
||||||
help="Limit the number of listings to process",
|
help="Limit the number of listings to process",
|
||||||
type=click.IntRange(min=1),
|
type=click.IntRange(min=1),
|
||||||
default=1, # by default limit to 1 to avoid accidental API usage
|
default=1,
|
||||||
)
|
)
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def routing(
|
def routing(
|
||||||
ctx: click.core.Context, destination_address: str, travel_mode: str, limit: int
|
ctx: click.Context,
|
||||||
):
|
destination_address: str,
|
||||||
data_dir = ctx.obj["data_dir"]
|
travel_mode: str,
|
||||||
click.echo(f"Running routing for the first {limit} listings in {data_dir}")
|
limit: int,
|
||||||
listing_paths = sorted(list(pathlib.Path(data_dir).glob("*/listing.json")))
|
) -> None:
|
||||||
listing_paths = listing_paths[:limit]
|
"""Calculate transit routes for listings."""
|
||||||
|
repository: ListingRepository = ctx.obj["repository"]
|
||||||
|
|
||||||
if os.environ.get(API_KEY_ENVIRONMENT_VARIABLE) is None:
|
if os.environ.get(API_KEY_ENVIRONMENT_VARIABLE) is None:
|
||||||
raise click.exceptions.MissingParameter(
|
raise click.ClickException(
|
||||||
f"{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set. "
|
f"{API_KEY_ENVIRONMENT_VARIABLE} environment variable is not set."
|
||||||
"Please set it to your API key for the routing service."
|
|
||||||
)
|
)
|
||||||
|
|
||||||
repository = ListingRepository(engine=engine)
|
click.echo(f"Calculating routes to '{destination_address}' for {limit} listings")
|
||||||
asyncio.run(
|
|
||||||
routing_module.calculate_route(
|
count = asyncio.run(
|
||||||
|
listing_service.calculate_routes(
|
||||||
repository,
|
repository,
|
||||||
destination_address,
|
destination_address,
|
||||||
# destination_address_coordinates,
|
travel_mode,
|
||||||
TravelMode[travel_mode],
|
|
||||||
limit=limit,
|
limit=limit,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
click.echo(f"Processed {count} listings")
|
||||||
|
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
# @click.option(
|
|
||||||
# "--columns",
|
|
||||||
# "-C",
|
|
||||||
# help="Columns to include in the CSV file",
|
|
||||||
# type=click.Choice(
|
|
||||||
# # csv_exporter.get_columns_from_listings(),
|
|
||||||
# [1],
|
|
||||||
# case_sensitive=False,
|
|
||||||
# ),
|
|
||||||
# multiple=True,
|
|
||||||
# default=Listing.ALL_COLUMNS,
|
|
||||||
# )
|
|
||||||
@click.option(
|
@click.option(
|
||||||
"--output-file",
|
"--output-file",
|
||||||
"-O",
|
"-O",
|
||||||
help="Path to the output CSV file",
|
help="Path to the output CSV file",
|
||||||
required=True,
|
required=True,
|
||||||
type=click.Path(
|
type=click.Path(writable=True, file_okay=True, dir_okay=False, resolve_path=True),
|
||||||
writable=True,
|
|
||||||
file_okay=True,
|
|
||||||
dir_okay=False,
|
|
||||||
resolve_path=True,
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
@click.pass_context
|
|
||||||
@listing_filter_options
|
@listing_filter_options
|
||||||
|
@click.pass_context
|
||||||
def export_csv(
|
def export_csv(
|
||||||
ctx: click.core.Context,
|
ctx: click.Context,
|
||||||
output_file: str,
|
output_file: str,
|
||||||
# columns: tuple[str],
|
|
||||||
district: list[str],
|
district: list[str],
|
||||||
min_bedrooms: int,
|
min_bedrooms: int,
|
||||||
max_bedrooms: int,
|
max_bedrooms: int,
|
||||||
|
|
@ -282,53 +303,48 @@ def export_csv(
|
||||||
available_from: datetime | None,
|
available_from: datetime | None,
|
||||||
last_seen_days: int,
|
last_seen_days: int,
|
||||||
min_sqm: int | None = None,
|
min_sqm: int | None = None,
|
||||||
):
|
) -> None:
|
||||||
# use model
|
"""Export listings to CSV file."""
|
||||||
data_dir = ctx.obj["data_dir"]
|
repository: ListingRepository = ctx.obj["repository"]
|
||||||
query_parameters = QueryParameters(
|
|
||||||
listing_type=ListingType[type],
|
query_parameters = build_query_parameters(
|
||||||
district_names=set(district),
|
type=type,
|
||||||
|
district=district,
|
||||||
min_bedrooms=min_bedrooms,
|
min_bedrooms=min_bedrooms,
|
||||||
max_bedrooms=max_bedrooms,
|
max_bedrooms=max_bedrooms,
|
||||||
min_price=min_price,
|
min_price=min_price,
|
||||||
max_price=max_price,
|
max_price=max_price,
|
||||||
furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types],
|
furnish_types=furnish_types,
|
||||||
let_date_available_from=available_from,
|
available_from=available_from,
|
||||||
last_seen_days=last_seen_days,
|
last_seen_days=last_seen_days,
|
||||||
min_sqm=min_sqm,
|
min_sqm=min_sqm,
|
||||||
)
|
)
|
||||||
click.echo(
|
|
||||||
f"Exporting data to {output_file} using {data_dir=} and query parameters: {query_parameters}"
|
click.echo(f"Exporting to {output_file}")
|
||||||
)
|
|
||||||
output_file_path = pathlib.Path(output_file)
|
result = asyncio.run(
|
||||||
repository = ListingRepository(engine=engine)
|
export_service.export_to_csv(
|
||||||
asyncio.run(
|
|
||||||
csv_exporter.export_to_csv(
|
|
||||||
repository,
|
repository,
|
||||||
output_file_path,
|
pathlib.Path(output_file),
|
||||||
# list(columns),
|
query_parameters,
|
||||||
query_parameters=query_parameters,
|
)
|
||||||
),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
click.echo(result.message)
|
||||||
|
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.option(
|
@click.option(
|
||||||
"--output-file",
|
"--output-file",
|
||||||
"-O",
|
"-O",
|
||||||
help="Path to the output immoweb file",
|
help="Path to the output GeoJSON file",
|
||||||
required=True,
|
required=True,
|
||||||
type=click.Path(
|
type=click.Path(writable=True, file_okay=True, dir_okay=False, resolve_path=True),
|
||||||
writable=True,
|
|
||||||
file_okay=True,
|
|
||||||
dir_okay=False,
|
|
||||||
resolve_path=True,
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
@listing_filter_options
|
@listing_filter_options
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def export_immoweb(
|
def export_immoweb(
|
||||||
ctx: click.core.Context,
|
ctx: click.Context,
|
||||||
output_file: str,
|
output_file: str,
|
||||||
district: list[str],
|
district: list[str],
|
||||||
min_bedrooms: int,
|
min_bedrooms: int,
|
||||||
|
|
@ -340,39 +356,62 @@ def export_immoweb(
|
||||||
available_from: datetime | None,
|
available_from: datetime | None,
|
||||||
last_seen_days: int,
|
last_seen_days: int,
|
||||||
min_sqm: int | None = None,
|
min_sqm: int | None = None,
|
||||||
):
|
) -> None:
|
||||||
query_parameters = QueryParameters(
|
"""Export listings to GeoJSON file for map visualization."""
|
||||||
listing_type=ListingType[type],
|
repository: ListingRepository = ctx.obj["repository"]
|
||||||
district_names=set(district),
|
|
||||||
|
query_parameters = build_query_parameters(
|
||||||
|
type=type,
|
||||||
|
district=district,
|
||||||
min_bedrooms=min_bedrooms,
|
min_bedrooms=min_bedrooms,
|
||||||
max_bedrooms=max_bedrooms,
|
max_bedrooms=max_bedrooms,
|
||||||
min_price=min_price,
|
min_price=min_price,
|
||||||
max_price=max_price,
|
max_price=max_price,
|
||||||
furnish_types=[FurnishType[furnish_type] for furnish_type in furnish_types],
|
furnish_types=furnish_types,
|
||||||
let_date_available_from=available_from,
|
available_from=available_from,
|
||||||
last_seen_days=last_seen_days,
|
last_seen_days=last_seen_days,
|
||||||
min_sqm=min_sqm,
|
min_sqm=min_sqm,
|
||||||
)
|
)
|
||||||
click.echo(
|
|
||||||
f"Exporting data to {output_file} for listings stored in {engine.url} that match the query parameters: {query_parameters}"
|
click.echo(f"Exporting to {output_file}")
|
||||||
|
|
||||||
|
result = asyncio.run(
|
||||||
|
export_service.export_to_geojson(
|
||||||
|
repository,
|
||||||
|
query_parameters=query_parameters,
|
||||||
|
output_path=pathlib.Path(output_file),
|
||||||
|
)
|
||||||
)
|
)
|
||||||
repository = ListingRepository(engine=engine)
|
|
||||||
asyncio.run(export_immoweb_ui(repository, output_file, query_parameters))
|
click.echo(result.message)
|
||||||
|
|
||||||
|
|
||||||
@cli.command()
|
@cli.command()
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def populate_db(
|
def populate_db(ctx: click.Context) -> None:
|
||||||
ctx: click.core.Context,
|
"""Populate database from filesystem data (legacy migration)."""
|
||||||
):
|
data_dir: pathlib.Path = ctx.obj["data_dir"]
|
||||||
data_dir = ctx.obj["data_dir"]
|
repository: ListingRepository = ctx.obj["repository"]
|
||||||
click.echo(f"Populating the database with data from {data_dir}")
|
|
||||||
repository = ListingRepository(engine=engine)
|
click.echo(f"Populating database from {data_dir}")
|
||||||
|
|
||||||
listings = Listing.get_all_listings(
|
listings = Listing.get_all_listings(
|
||||||
[path for path in pathlib.Path(data_dir).glob("*/listing.json")]
|
[path for path in data_dir.glob("*/listing.json")]
|
||||||
)
|
)
|
||||||
|
|
||||||
asyncio.run(repository.upsert_listings_legacy(listings))
|
asyncio.run(repository.upsert_listings_legacy(listings))
|
||||||
|
|
||||||
|
click.echo(f"Imported {len(listings)} listings")
|
||||||
|
|
||||||
|
|
||||||
|
@cli.command()
|
||||||
|
def list_districts() -> None:
|
||||||
|
"""List all available districts."""
|
||||||
|
districts = district_service.get_all_districts()
|
||||||
|
click.echo(f"Available districts ({len(districts)}):")
|
||||||
|
for name in sorted(districts.keys()):
|
||||||
|
click.echo(f" - {name}")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
cli()
|
cli()
|
||||||
|
|
|
||||||
|
|
@ -1,40 +0,0 @@
|
||||||
def record():
|
|
||||||
from rec.query import listing_query, detail_query
|
|
||||||
import json
|
|
||||||
|
|
||||||
page = 1
|
|
||||||
listing = listing_query(page, 2, 2, 5, 200000, 500000)
|
|
||||||
with open(
|
|
||||||
f"/Users/kadir/code/realestate/crawler/code/json/queries/listing{page}.json",
|
|
||||||
"w",
|
|
||||||
) as f:
|
|
||||||
json.dump(listing, f)
|
|
||||||
|
|
||||||
for prop in listing["properties"]:
|
|
||||||
identifier = prop["identifier"]
|
|
||||||
resp = detail_query(identifier)
|
|
||||||
# print(identifier, resp.status_code)
|
|
||||||
with open(
|
|
||||||
f"/Users/kadir/code/realestate/crawler/code/json/queries/detail_{identifier}.json",
|
|
||||||
"w",
|
|
||||||
) as f:
|
|
||||||
json.dump(resp, f)
|
|
||||||
|
|
||||||
|
|
||||||
def process():
|
|
||||||
import json
|
|
||||||
import pathlib
|
|
||||||
|
|
||||||
path = pathlib.Path("/Users/kadir/code/realestate/crawler/code/json/queries/")
|
|
||||||
|
|
||||||
detailjsons = list(path.glob("detail_*json"))
|
|
||||||
for file in detailjsons:
|
|
||||||
with open(file) as f:
|
|
||||||
js = json.load(f)
|
|
||||||
|
|
||||||
for floorplan in js["property"]["floorplans"]:
|
|
||||||
print(floorplan["url"])
|
|
||||||
|
|
||||||
|
|
||||||
# record()
|
|
||||||
process()
|
|
||||||
|
|
@ -5,7 +5,7 @@ from datetime import datetime, timedelta
|
||||||
import enum
|
import enum
|
||||||
import json
|
import json
|
||||||
from typing import Any, Dict, List
|
from typing import Any, Dict, List
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel, Field as PydanticField
|
||||||
from rec import routing
|
from rec import routing
|
||||||
from sqlmodel import JSON, TEXT, SQLModel, Field
|
from sqlmodel import JSON, TEXT, SQLModel, Field
|
||||||
|
|
||||||
|
|
@ -80,7 +80,10 @@ class Listing(SQLModel, table=False):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def is_removed(self) -> bool:
|
def is_removed(self) -> bool:
|
||||||
return not self.additional_info["property"]["visible"]
|
if not self.additional_info:
|
||||||
|
return False
|
||||||
|
property_info = self.additional_info.get("property", {})
|
||||||
|
return not property_info.get("visible", True)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def price_per_square_meter(self) -> float | None:
|
def price_per_square_meter(self) -> float | None:
|
||||||
|
|
@ -231,14 +234,16 @@ class ListingType(enum.StrEnum):
|
||||||
RENT = "RENT"
|
RENT = "RENT"
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
|
||||||
class QueryParameters(BaseModel):
|
class QueryParameters(BaseModel):
|
||||||
|
"""Query parameters for filtering listings."""
|
||||||
|
model_config = {"frozen": True}
|
||||||
|
|
||||||
listing_type: ListingType
|
listing_type: ListingType
|
||||||
min_bedrooms: int = 1
|
min_bedrooms: int = 1
|
||||||
max_bedrooms: int = 999
|
max_bedrooms: int = 999
|
||||||
min_price: int = 0
|
min_price: int = 0
|
||||||
max_price: int = 10_000_000
|
max_price: int = 10_000_000
|
||||||
district_names: set[str] = dataclasses.field(default_factory=set)
|
district_names: set[str] = PydanticField(default_factory=set)
|
||||||
radius: float = 0
|
radius: float = 0
|
||||||
page_size: int = 500 # items per page
|
page_size: int = 500 # items per page
|
||||||
max_days_since_added: int = 14 # for buy listings
|
max_days_since_added: int = 14 # for buy listings
|
||||||
|
|
|
||||||
36
crawler/poetry.lock
generated
36
crawler/poetry.lock
generated
|
|
@ -120,6 +120,22 @@ yarl = ">=1.17.0,<2.0"
|
||||||
[package.extras]
|
[package.extras]
|
||||||
speedups = ["Brotli ; platform_python_implementation == \"CPython\"", "aiodns (>=3.3.0)", "brotlicffi ; platform_python_implementation != \"CPython\""]
|
speedups = ["Brotli ; platform_python_implementation == \"CPython\"", "aiodns (>=3.3.0)", "brotlicffi ; platform_python_implementation != \"CPython\""]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aiohttp-socks"
|
||||||
|
version = "0.8.4"
|
||||||
|
description = "Proxy connector for aiohttp"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
groups = ["main"]
|
||||||
|
files = [
|
||||||
|
{file = "aiohttp_socks-0.8.4-py3-none-any.whl", hash = "sha256:74b21105634ed31d56ed6fee43701ca16218b53475e606d56950a4d17e8290ea"},
|
||||||
|
{file = "aiohttp_socks-0.8.4.tar.gz", hash = "sha256:6b611d4ce838e9cf2c2fed5e0dba447cc84824a6cba95dc5747606201da46cb4"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
aiohttp = ">=2.3.2"
|
||||||
|
python-socks = {version = ">=2.4.3,<3.0.0", extras = ["asyncio"]}
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aioresponses"
|
name = "aioresponses"
|
||||||
version = "0.7.8"
|
version = "0.7.8"
|
||||||
|
|
@ -4246,6 +4262,24 @@ files = [
|
||||||
{file = "python_multipart-0.0.20.tar.gz", hash = "sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13"},
|
{file = "python_multipart-0.0.20.tar.gz", hash = "sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "python-socks"
|
||||||
|
version = "2.8.0"
|
||||||
|
description = "Proxy (SOCKS4, SOCKS5, HTTP CONNECT) client for Python"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8.0"
|
||||||
|
groups = ["main"]
|
||||||
|
files = [
|
||||||
|
{file = "python_socks-2.8.0-py3-none-any.whl", hash = "sha256:57c24b416569ccea493a101d38b0c82ed54be603aa50b6afbe64c46e4a4e4315"},
|
||||||
|
{file = "python_socks-2.8.0.tar.gz", hash = "sha256:340f82778b20a290bdd538ee47492978d603dff7826aaf2ce362d21ad9ee6f1b"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
anyio = ["anyio (>=3.3.4,<5.0.0)"]
|
||||||
|
asyncio = ["async-timeout (>=4.0) ; python_version < \"3.11\""]
|
||||||
|
curio = ["curio (>=1.4)"]
|
||||||
|
trio = ["trio (>=0.24)"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pytz"
|
name = "pytz"
|
||||||
version = "2025.2"
|
version = "2025.2"
|
||||||
|
|
@ -6203,4 +6237,4 @@ type = ["pytest-mypy"]
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.1"
|
lock-version = "2.1"
|
||||||
python-versions = ">3.11"
|
python-versions = ">3.11"
|
||||||
content-hash = "10a74594d9f695ab1077ff992bcd012b93b174b25c3f2ca681d6308653abbd14"
|
content-hash = "6f9ce2af71a995db179aa4fb682e8a9ccde59566d14e26c7b0dbf4edc8d8e583"
|
||||||
|
|
|
||||||
|
|
@ -1,13 +0,0 @@
|
||||||
import requests
|
|
||||||
|
|
||||||
headers = {
|
|
||||||
"Host": "media.rightmove.co.uk",
|
|
||||||
# 'Accept-Encoding': 'gzip, deflate, br',
|
|
||||||
"User-Agent": "okhttp/4.10.0",
|
|
||||||
}
|
|
||||||
|
|
||||||
response = requests.get(
|
|
||||||
"https://media.rightmove.co.uk/47k/46001/138680705/46001_32532509_IMG_00_0000.jpeg",
|
|
||||||
headers=headers,
|
|
||||||
verify=False,
|
|
||||||
)
|
|
||||||
|
|
@ -1,67 +0,0 @@
|
||||||
import requests
|
|
||||||
|
|
||||||
headers = {
|
|
||||||
"Host": "api.rightmove.co.uk",
|
|
||||||
# 'Accept-Encoding': 'gzip, deflate, br',
|
|
||||||
"User-Agent": "okhttp/4.10.0",
|
|
||||||
"Connection": "close",
|
|
||||||
}
|
|
||||||
|
|
||||||
params = {
|
|
||||||
"locationIdentifier": "POSTCODE^4228216",
|
|
||||||
"channel": "BUY",
|
|
||||||
"page": "1",
|
|
||||||
"numberOfPropertiesPerPage": "25",
|
|
||||||
"radius": "3.0",
|
|
||||||
"sortBy": "distance",
|
|
||||||
"includeUnavailableProperties": "false",
|
|
||||||
"propertyTypes": "flat",
|
|
||||||
"mustHave": "newHome", # added manually later
|
|
||||||
"dontShow": "sharedOwnership,retirement",
|
|
||||||
"minPrice": "150000",
|
|
||||||
"maxPrice": "500000",
|
|
||||||
"minBedrooms": "2",
|
|
||||||
"maxBedrooms": "2",
|
|
||||||
"apiApplication": "ANDROID",
|
|
||||||
"appVersion": "3.70.0",
|
|
||||||
}
|
|
||||||
|
|
||||||
response = requests.get(
|
|
||||||
"https://api.rightmove.co.uk/api/property-listing",
|
|
||||||
params=params,
|
|
||||||
headers=headers,
|
|
||||||
verify=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
headers = {
|
|
||||||
"Host": "api.rightmove.co.uk",
|
|
||||||
# 'Accept-Encoding': 'gzip, deflate, br',
|
|
||||||
"User-Agent": "okhttp/4.10.0",
|
|
||||||
"Connection": "close",
|
|
||||||
}
|
|
||||||
|
|
||||||
params = {
|
|
||||||
"locationIdentifier": "POSTCODE^4228216",
|
|
||||||
"channel": "BUY",
|
|
||||||
"page": "2",
|
|
||||||
"numberOfPropertiesPerPage": "25",
|
|
||||||
"radius": "3.0",
|
|
||||||
"sortBy": "distance",
|
|
||||||
"includeUnavailableProperties": "false",
|
|
||||||
"propertyTypes": "flat",
|
|
||||||
"dontShow": "sharedOwnership,retirement",
|
|
||||||
"minPrice": "150000",
|
|
||||||
"maxPrice": "600000",
|
|
||||||
"minBedrooms": "2",
|
|
||||||
"maxBedrooms": "2",
|
|
||||||
"apiApplication": "ANDROID",
|
|
||||||
"appVersion": "3.70.0",
|
|
||||||
}
|
|
||||||
|
|
||||||
response = requests.get(
|
|
||||||
"https://api.rightmove.co.uk/api/property-listing",
|
|
||||||
params=params,
|
|
||||||
headers=headers,
|
|
||||||
verify=False,
|
|
||||||
)
|
|
||||||
|
|
@ -1,22 +0,0 @@
|
||||||
import requests
|
|
||||||
|
|
||||||
API_KEY = "AIzaSyBoBHzeQFgR7O-NlNsuHXQcC1B7ccEHpl8"
|
|
||||||
url = "https://maps.googleapis.com/maps/api/distancematrix/json"
|
|
||||||
origin = "51.5636306598907,-0.11061106079085892"
|
|
||||||
dest = "51.53836609846008,-0.12743940233824352"
|
|
||||||
|
|
||||||
params = {
|
|
||||||
"origins": origin,
|
|
||||||
"destinations": dest,
|
|
||||||
"key": API_KEY,
|
|
||||||
"departure_time": "", # timstamp, optional
|
|
||||||
"mode": "transit",
|
|
||||||
}
|
|
||||||
|
|
||||||
r = requests.get(url, params=params)
|
|
||||||
print(r.status_code)
|
|
||||||
|
|
||||||
print(r.json())
|
|
||||||
|
|
||||||
with open("code/json/routing_distancematrix.json", "w") as f:
|
|
||||||
f.write(r.text)
|
|
||||||
|
|
@ -1,83 +0,0 @@
|
||||||
import requests
|
|
||||||
from utils import nextMonday
|
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
API_KEY = "AIzaSyBoBHzeQFgR7O-NlNsuHXQcC1B7ccEHpl8"
|
|
||||||
url = "https://routes.googleapis.com/directions/v2:computeRoutes"
|
|
||||||
|
|
||||||
|
|
||||||
def travel_time(origin_lat: float, origin_lon: float, dest_lat: float, dest_lon: float):
|
|
||||||
monday9am = nextMonday()
|
|
||||||
|
|
||||||
header = {
|
|
||||||
"X-Goog-Api-Key": API_KEY,
|
|
||||||
"Content-Type": "application/json",
|
|
||||||
"X-Goog-FieldMask": "routes.distanceMeters,routes.duration,routes.staticDuration,routes.legs.steps.distanceMeters,routes.legs.steps.staticDuration,routes.legs.steps.travelMode",
|
|
||||||
}
|
|
||||||
|
|
||||||
body = {
|
|
||||||
"origin": {
|
|
||||||
"location": {"latLng": {"latitude": origin_lat, "longitude": origin_lon}}
|
|
||||||
},
|
|
||||||
"destination": {
|
|
||||||
"location": {"latLng": {"latitude": dest_lat, "longitude": dest_lon}}
|
|
||||||
},
|
|
||||||
"travelMode": "TRANSIT",
|
|
||||||
# "2023-10-15T15:01:23.045123456Z"
|
|
||||||
"departureTime": monday9am.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
|
|
||||||
"computeAlternativeRoutes": False,
|
|
||||||
# "routeModifiers": {
|
|
||||||
# "avoidTolls": false,
|
|
||||||
# "avoidHighways": false,
|
|
||||||
# "avoidFerries": false
|
|
||||||
# },
|
|
||||||
"languageCode": "en-US",
|
|
||||||
"units": "METRIC",
|
|
||||||
}
|
|
||||||
|
|
||||||
r = requests.post(url, json=body, headers=header)
|
|
||||||
if r.status_code == 200:
|
|
||||||
return r.json()
|
|
||||||
|
|
||||||
raise Exception(r.json())
|
|
||||||
|
|
||||||
|
|
||||||
def extract_time(d):
|
|
||||||
r = d["routes"][0]
|
|
||||||
print(r.keys())
|
|
||||||
distance = r["distanceMeters"]
|
|
||||||
duration = r["duration"]
|
|
||||||
duration_static = r["staticDuration"]
|
|
||||||
|
|
||||||
steps = r["legs"][0]["steps"]
|
|
||||||
# print(steps)
|
|
||||||
duration_per_transit = defaultdict(lambda: 0)
|
|
||||||
distance_per_transit = defaultdict(lambda: 0)
|
|
||||||
|
|
||||||
for step in steps:
|
|
||||||
duration_per_transit[step["travelMode"]] += int(
|
|
||||||
step["staticDuration"].strip("s")
|
|
||||||
)
|
|
||||||
distance_per_transit[step["travelMode"]] += step.get("distanceMeters", 0)
|
|
||||||
|
|
||||||
print(
|
|
||||||
f"dis {distance}, dur {duration}, duration per transit {dict(duration_per_transit)}, distance per transit {dict(distance_per_transit)}, duration_static {duration_static}"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
import json
|
|
||||||
|
|
||||||
with open("code/json/routing_routeapi.json", "r") as f:
|
|
||||||
d = json.load(f)
|
|
||||||
|
|
||||||
extract_time(d)
|
|
||||||
|
|
||||||
|
|
||||||
# if __name__ == "__main__":
|
|
||||||
# origin = 51.5635664310333, -0.1107173751570373 # home
|
|
||||||
# dest = 51.50475678313417, 0.04915321000190009 # london city airport
|
|
||||||
# d = travel_time(origin[0], origin[1], dest[0], dest[1])
|
|
||||||
# import json
|
|
||||||
# with open('code/json/routing_routeapi.json', 'w') as f:
|
|
||||||
# json.dump(d, f)
|
|
||||||
|
|
@ -1,20 +0,0 @@
|
||||||
import requests
|
|
||||||
|
|
||||||
headers = {
|
|
||||||
"Host": "api.rightmove.co.uk",
|
|
||||||
# 'Accept-Encoding': 'gzip, deflate, br',
|
|
||||||
"User-Agent": "okhttp/4.10.0",
|
|
||||||
"Connection": "close",
|
|
||||||
}
|
|
||||||
|
|
||||||
params = {
|
|
||||||
"apiApplication": "ANDROID",
|
|
||||||
"appVersion": "3.70.0",
|
|
||||||
}
|
|
||||||
|
|
||||||
response = requests.get(
|
|
||||||
"https://api.rightmove.co.uk/api/property/119578451",
|
|
||||||
params=params,
|
|
||||||
headers=headers,
|
|
||||||
verify=False,
|
|
||||||
)
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
def get_districts():
|
def get_districts() -> dict[str, str]:
|
||||||
return {
|
return {
|
||||||
"Barking and Dagenham": "REGION^61400",
|
"Barking and Dagenham": "REGION^61400",
|
||||||
"Barnet": "REGION^93929",
|
"Barnet": "REGION^93929",
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,12 @@
|
||||||
import re
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
import cv2
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
def inference(image_path):
|
def inference(image_path: str | Path) -> tuple[str, Any]:
|
||||||
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
|
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
|
||||||
|
|
||||||
image = Image.open(image_path)
|
image = Image.open(image_path)
|
||||||
|
|
@ -19,7 +21,7 @@ def inference(image_path):
|
||||||
return output, predictions
|
return output, predictions
|
||||||
|
|
||||||
|
|
||||||
def extract_total_sqm(input_str: str):
|
def extract_total_sqm(input_str: str) -> float | None:
|
||||||
sqmregex = r"(\d+\.?\d*) ?(sq ?m|sq. ?m)"
|
sqmregex = r"(\d+\.?\d*) ?(sq ?m|sq. ?m)"
|
||||||
matches = re.findall(sqmregex, input_str.lower())
|
matches = re.findall(sqmregex, input_str.lower())
|
||||||
sqms = [float(m[0]) for m in matches]
|
sqms = [float(m[0]) for m in matches]
|
||||||
|
|
@ -29,13 +31,13 @@ def extract_total_sqm(input_str: str):
|
||||||
return max(filtered)
|
return max(filtered)
|
||||||
|
|
||||||
|
|
||||||
def calculate_model(image_path):
|
def calculate_model(image_path: str | Path) -> tuple[float | None, str, Any]:
|
||||||
output, predictions_tensor = inference(image_path)
|
output, predictions_tensor = inference(image_path)
|
||||||
estimated_sqm = extract_total_sqm(output)
|
estimated_sqm = extract_total_sqm(output)
|
||||||
return estimated_sqm, output, predictions_tensor
|
return estimated_sqm, output, predictions_tensor
|
||||||
|
|
||||||
|
|
||||||
def improve_img_for_ocr(img: Image):
|
def improve_img_for_ocr(img: Image.Image) -> Image.Image:
|
||||||
img2 = np.array(img.convert("L"))
|
img2 = np.array(img.convert("L"))
|
||||||
cv2.resize(img2, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC)
|
cv2.resize(img2, None, fx=1.2, fy=1.2, interpolation=cv2.INTER_CUBIC)
|
||||||
thresh = cv2.adaptiveThreshold(
|
thresh = cv2.adaptiveThreshold(
|
||||||
|
|
@ -44,7 +46,7 @@ def improve_img_for_ocr(img: Image):
|
||||||
return Image.fromarray(thresh)
|
return Image.fromarray(thresh)
|
||||||
|
|
||||||
|
|
||||||
def calculate_ocr(image_path) -> tuple[float | None, str]:
|
def calculate_ocr(image_path: str | Path) -> tuple[float | None, str]:
|
||||||
import pytesseract
|
import pytesseract
|
||||||
|
|
||||||
img = Image.open(image_path)
|
img = Image.open(image_path)
|
||||||
|
|
|
||||||
41
crawler/rec/route_serializer.py
Normal file
41
crawler/rec/route_serializer.py
Normal file
|
|
@ -0,0 +1,41 @@
|
||||||
|
import json
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from models.listing import DestinationMode, Route, RouteLegStep
|
||||||
|
from rec import routing
|
||||||
|
|
||||||
|
|
||||||
|
class RouteSerializer:
|
||||||
|
@staticmethod
|
||||||
|
def serialize(route): ...
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def deserialize(route_data_json: str) -> dict[DestinationMode, List[Route]]:
|
||||||
|
json_data = json.loads(route_data_json)
|
||||||
|
destimation_routes = {}
|
||||||
|
for destination_mode_str, routes_json in json_data.items():
|
||||||
|
destination_mode = DestinationMode(
|
||||||
|
destination_address=json.loads(destination_mode_str)[
|
||||||
|
"destination_address"
|
||||||
|
],
|
||||||
|
travel_mode=routing.TravelMode(
|
||||||
|
json.loads(destination_mode_str)["travel_mode"]
|
||||||
|
),
|
||||||
|
)
|
||||||
|
parsed_route = json.loads(routes_json[0])
|
||||||
|
routes = [
|
||||||
|
Route(
|
||||||
|
legs=[
|
||||||
|
RouteLegStep(
|
||||||
|
distance_meters=step["distance_meters"],
|
||||||
|
duration_s=step["duration_s"],
|
||||||
|
travel_mode=routing.TravelMode(step["travel_mode"]),
|
||||||
|
)
|
||||||
|
for step in parsed_route["legs"]
|
||||||
|
],
|
||||||
|
distance_meters=parsed_route["distance_meters"],
|
||||||
|
duration_s=int(parsed_route["duration_s"]),
|
||||||
|
)
|
||||||
|
]
|
||||||
|
destimation_routes[destination_mode] = routes
|
||||||
|
return destimation_routes
|
||||||
41
crawler/services/__init__.py
Normal file
41
crawler/services/__init__.py
Normal file
|
|
@ -0,0 +1,41 @@
|
||||||
|
"""Services package for real estate crawler.
|
||||||
|
|
||||||
|
This package contains two layers of services:
|
||||||
|
|
||||||
|
## Low-level services (internal implementation):
|
||||||
|
- listing_fetcher: Fetches listing data from Rightmove API
|
||||||
|
- image_fetcher: Downloads floorplan images
|
||||||
|
- floorplan_detector: OCR-based square meter detection from floorplans
|
||||||
|
- route_calculator: Calculates transit routes using Google Maps API
|
||||||
|
|
||||||
|
## High-level services (use these in CLI and API):
|
||||||
|
- listing_service: Unified listing operations (get, refresh, download images, etc.)
|
||||||
|
- export_service: Export listings to CSV, GeoJSON
|
||||||
|
- district_service: District lookup and validation
|
||||||
|
- task_service: Background task management
|
||||||
|
"""
|
||||||
|
# Low-level services (internal)
|
||||||
|
from services.listing_fetcher import dump_listings, dump_listings_full
|
||||||
|
from services.image_fetcher import dump_images
|
||||||
|
from services.floorplan_detector import detect_floorplan
|
||||||
|
from services.route_calculator import calculate_route
|
||||||
|
|
||||||
|
# High-level services (CLI and API should use these)
|
||||||
|
from services import listing_service
|
||||||
|
from services import export_service
|
||||||
|
from services import district_service
|
||||||
|
from services import task_service
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
# Low-level
|
||||||
|
"dump_listings",
|
||||||
|
"dump_listings_full",
|
||||||
|
"dump_images",
|
||||||
|
"detect_floorplan",
|
||||||
|
"calculate_route",
|
||||||
|
# High-level
|
||||||
|
"listing_service",
|
||||||
|
"export_service",
|
||||||
|
"district_service",
|
||||||
|
"task_service",
|
||||||
|
]
|
||||||
38
crawler/services/district_service.py
Normal file
38
crawler/services/district_service.py
Normal file
|
|
@ -0,0 +1,38 @@
|
||||||
|
"""Unified district service - shared between CLI and HTTP API."""
|
||||||
|
from rec.districts import get_districts as _get_districts
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_districts() -> dict[str, str]:
    """Return every known district mapped to its region ID.

    Used by:
    - CLI: choices for the --district option
    - API: GET /api/get_districts

    Returns:
        Dictionary mapping district names to region IDs.
    """
    return _get_districts()
|
||||||
|
|
||||||
|
|
||||||
|
def get_district_names() -> list[str]:
    """Return the names of all known districts.

    Returns:
        List of district names, in the order the district mapping yields them.
    """
    return [name for name in _get_districts()]
|
||||||
|
|
||||||
|
|
||||||
|
def validate_districts(district_names: list[str]) -> tuple[bool, list[str]]:
    """Check that every entry in *district_names* is a known district.

    Args:
        district_names: Candidate district names to validate.

    Returns:
        Tuple of (all_valid, invalid_names); invalid_names preserves the
        input order of the unrecognized entries.
    """
    known = _get_districts()
    invalid = [name for name in district_names if name not in known]
    return not invalid, invalid
|
||||||
92
crawler/services/export_service.py
Normal file
92
crawler/services/export_service.py
Normal file
|
|
@ -0,0 +1,92 @@
|
||||||
|
"""Unified export service - shared between CLI and HTTP API.
|
||||||
|
|
||||||
|
This module provides export functionality for listings in various formats.
|
||||||
|
"""
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from models.listing import QueryParameters
|
||||||
|
from repositories.listing_repository import ListingRepository
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ExportResult:
|
||||||
|
"""Result of an export operation."""
|
||||||
|
success: bool
|
||||||
|
output_path: str | None # For file exports
|
||||||
|
data: Any | None # For in-memory exports (GeoJSON)
|
||||||
|
record_count: int
|
||||||
|
message: str
|
||||||
|
|
||||||
|
|
||||||
|
async def export_to_csv(
    repository: ListingRepository,
    output_path: Path,
    query_parameters: QueryParameters | None = None,
) -> ExportResult:
    """Export listings matching *query_parameters* to a CSV file.

    Used by:
    - CLI: export-csv
    - API: (could be added as download endpoint)

    Returns:
        ExportResult describing the written file and row count.
    """
    # Imported lazily to keep this module importable without the exporter.
    from csv_exporter import export_to_csv as _export_csv

    await _export_csv(repository, output_path, query_parameters)

    # Re-query with the same filter to report how many rows were exported.
    exported = await repository.get_listings(query_parameters=query_parameters)
    count = len(exported)
    return ExportResult(
        success=True,
        output_path=str(output_path),
        data=None,
        record_count=count,
        message=f"Exported {count} listings to {output_path}",
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def export_to_geojson(
    repository: ListingRepository,
    query_parameters: QueryParameters | None = None,
    output_path: Path | None = None,
    limit: int | None = None,
) -> ExportResult:
    """Export listings to GeoJSON format.

    Args:
        repository: Database repository
        query_parameters: Filtering parameters
        output_path: If provided, write to file. Otherwise return data.
        limit: Maximum number of listings to export

    Used by:
    - CLI: export-immoweb
    - API: GET /api/listing_geojson

    Returns:
        ExportResult: file-based when output_path is given, otherwise
        carrying the GeoJSON payload in ``data``.
    """
    # Imported lazily to keep this module importable without the exporter.
    from ui_exporter import export_immoweb

    geojson_data = await export_immoweb(
        repository,
        output_file=str(output_path) if output_path else None,
        query_parameters=query_parameters,
        limit=limit,
    )

    # The exporter may return None; treat that as zero features.
    feature_count = len(geojson_data.get("features", [])) if geojson_data else 0

    if output_path:
        return ExportResult(
            success=True,
            output_path=str(output_path),
            data=None,
            record_count=feature_count,
            message=f"Exported {feature_count} listings to {output_path}",
        )
    return ExportResult(
        success=True,
        output_path=None,
        data=geojson_data,
        record_count=feature_count,
        message=f"Generated GeoJSON with {feature_count} features",
    )
|
||||||
42
crawler/services/floorplan_detector.py
Normal file
42
crawler/services/floorplan_detector.py
Normal file
|
|
@ -0,0 +1,42 @@
|
||||||
|
"""Floorplan detector service - OCR-based square meter detection."""
|
||||||
|
import asyncio
|
||||||
|
from models import Listing
|
||||||
|
from rec import floorplan
|
||||||
|
from repositories.listing_repository import ListingRepository
|
||||||
|
from tqdm.asyncio import tqdm
|
||||||
|
import multiprocessing
|
||||||
|
|
||||||
|
|
||||||
|
async def detect_floorplan(repository: ListingRepository) -> None:
    """Detect square meters from floorplan images for all listings.

    Runs OCR in worker threads, bounded by a semaphore so the OCR work
    does not saturate every core, then upserts the updated listings.

    Args:
        repository: Database repository used to read and upsert listings.
    """
    listings = await repository.get_listings()
    # Use roughly a quarter of the cores, but never fewer than one:
    # cpu_count() // 4 is 0 on machines with fewer than 4 cores, and
    # asyncio.Semaphore(0) would block every worker forever.
    worker_count = max(1, multiprocessing.cpu_count() // 4)
    semaphore = asyncio.Semaphore(worker_count)

    results = await tqdm.gather(
        *[_calculate_sqm_ocr(listing, semaphore) for listing in listings]
    )
    # _calculate_sqm_ocr returns None for listings that already had a value.
    updated_listings = [listing for listing in results if listing is not None]
    await repository.upsert_listings(updated_listings)
|
||||||
|
|
||||||
|
|
||||||
|
async def _calculate_sqm_ocr(
    listing: Listing, semaphore: asyncio.Semaphore
) -> Listing | None:
    """Estimate square meters for one listing via OCR on its floorplans.

    Returns None when the listing already has a value (nothing to update);
    otherwise returns the listing with square_meters set to the largest
    OCR estimate found, or 0 when no estimate could be extracted.
    """
    if listing.square_meters is not None:
        return None

    estimates: list[float] = []
    for image_path in listing.floorplan_image_paths:
        async with semaphore:
            # OCR is CPU-bound; run it in a thread so the event loop stays free.
            estimate, _ = await asyncio.to_thread(floorplan.calculate_ocr, image_path)
            if estimate is not None:
                estimates.append(estimate)

    # try once, if we fail, keep as 0 (marks the listing as processed)
    listing.square_meters = max(estimates, default=0)
    return listing
|
||||||
55
crawler/services/image_fetcher.py
Normal file
55
crawler/services/image_fetcher.py
Normal file
|
|
@ -0,0 +1,55 @@
|
||||||
|
"""Image fetcher service - downloads floorplan images for listings."""
|
||||||
|
import asyncio
|
||||||
|
from pathlib import Path
|
||||||
|
import aiohttp
|
||||||
|
from repositories import ListingRepository
|
||||||
|
from tenacity import retry, stop_after_attempt, wait_random
|
||||||
|
from tqdm.asyncio import tqdm
|
||||||
|
|
||||||
|
from models import Listing
|
||||||
|
|
||||||
|
# Setting this too high either crashes rightmove or gets us blocked
|
||||||
|
semaphore = asyncio.Semaphore(5)
|
||||||
|
|
||||||
|
|
||||||
|
async def dump_images(
    repository: ListingRepository,
    image_base_path: Path = Path("data/rs/"),
) -> None:
    """Download floorplan images for every listing and persist updated paths."""
    listings = await repository.get_listings()
    results = await tqdm.gather(
        *[dump_images_for_listing(listing, image_base_path) for listing in listings]
    )
    # Only upsert listings that actually changed (non-None results).
    changed = [listing for listing in results if listing is not None]
    await repository.upsert_listings(changed)
|
||||||
|
|
||||||
|
|
||||||
|
@retry(wait=wait_random(min=1, max=2), stop=stop_after_attempt(3))
async def dump_images_for_listing(listing: Listing, base_path: Path) -> Listing | None:
    """Download all missing floorplan images for a single listing.

    Fix: the previous version returned after the first successfully
    downloaded image, silently skipping any remaining floorplans, and a 404
    on any image dropped the whole listing. Now every missing floorplan is
    attempted; 404s are skipped individually.

    Args:
        listing: Listing whose floorplans should be downloaded.
        base_path: Root directory under which images are stored.

    Returns:
        The updated listing if at least one new image was saved, otherwise
        None (nothing to upsert).

    Raises:
        Exception: on non-404 HTTP errors, so that tenacity retries.
    """
    all_floorplans = listing.additional_info.get("property", {}).get("floorplans", [])
    downloaded_any = False
    for floorplan in all_floorplans:
        url = floorplan["url"]
        picname = url.split("/")[-1]
        floorplan_path = Path(base_path, str(listing.id), "floorplans", picname)
        if floorplan_path.exists():
            # Already downloaded on a previous run.
            continue
        try:
            async with semaphore:
                async with aiohttp.ClientSession() as session:
                    async with session.get(url) as response:
                        if response.status == 404:
                            # Image gone upstream; skip it but keep the rest.
                            continue
                        if response.status != 200:
                            raise Exception(f"Error for {url}: {response.status}")
                        floorplan_path.parent.mkdir(parents=True, exist_ok=True)
                        with open(floorplan_path, "wb") as f:
                            f.write(await response.read())
            listing.floorplan_image_paths.append(str(floorplan_path))
            downloaded_any = True
        except Exception as e:
            tqdm.write(f"Error for {url}: {e}")
            raise e  # raise so that we retry it
    return listing if downloaded_any else None
|
||||||
168
crawler/services/listing_service.py
Normal file
168
crawler/services/listing_service.py
Normal file
|
|
@ -0,0 +1,168 @@
|
||||||
|
"""Unified listing service - shared between CLI and HTTP API.
|
||||||
|
|
||||||
|
This module provides the core business logic for listing operations.
|
||||||
|
Both the CLI (main.py) and HTTP API (api/app.py) should use these functions.
|
||||||
|
"""
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from models.listing import Listing, QueryParameters
|
||||||
|
from repositories.listing_repository import ListingRepository
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ListingResult:
    """Outcome of a read operation over listings."""

    listings: list[Listing]  # the matching listings
    total_count: int  # number of listings returned
    message: str | None = None  # optional human-readable note
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class RefreshResult:
|
||||||
|
"""Result of a refresh operation."""
|
||||||
|
task_id: str | None # None if run synchronously
|
||||||
|
new_listings_count: int
|
||||||
|
message: str
|
||||||
|
|
||||||
|
|
||||||
|
async def get_listings(
    repository: ListingRepository,
    query_parameters: QueryParameters | None = None,
    limit: int | None = None,
    only_ids: list[int] | None = None,
) -> ListingResult:
    """Fetch listings from the database with optional filtering.

    Used by:
    - CLI: export-csv, export-immoweb
    - API: GET /api/listing, GET /api/listing_geojson

    Returns:
        ListingResult wrapping the matching listings and their count.
    """
    rows = await repository.get_listings(
        query_parameters=query_parameters,
        limit=limit,
        only_ids=only_ids,
    )
    return ListingResult(listings=rows, total_count=len(rows))
|
||||||
|
|
||||||
|
|
||||||
|
async def refresh_listings(
    repository: ListingRepository,
    query_parameters: QueryParameters,
    full: bool = False,
    async_mode: bool = False,
    user_email: str | None = None,
) -> RefreshResult:
    """Refresh listings by fetching from the external API.

    Args:
        repository: Database repository
        query_parameters: Filtering parameters
        full: If True, also fetch images and run OCR
        async_mode: If True, run as background task and return task_id
        user_email: User email for tracking (API mode).
            NOTE(review): not used inside this function — confirm whether it
            should be forwarded to the background task.

    Used by:
    - CLI: dump-listings
    - API: POST /api/refresh_listings
    """
    if async_mode:
        # Imported lazily to avoid circular imports.
        from datetime import timedelta

        from tasks.listing_tasks import dump_listings_task

        # Drop the task if no worker picks it up within 10 minutes.
        task = dump_listings_task.apply_async(
            args=(query_parameters.model_dump_json(),),
            expires=datetime.now() + timedelta(minutes=10),
        )
        return RefreshResult(
            task_id=task.id,
            new_listings_count=0,
            message=f"Task {task.id} started",
        )

    # Synchronous path: fetch inline in this process.
    from services.listing_fetcher import dump_listings, dump_listings_full

    fetcher = dump_listings_full if full else dump_listings
    new_listings = await fetcher(query_parameters, repository)

    return RefreshResult(
        task_id=None,
        new_listings_count=len(new_listings),
        message=f"Fetched {len(new_listings)} new listings",
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def download_images(
    repository: ListingRepository,
    data_dir: Path = Path("data/rs/"),
) -> int:
    """Download floorplan images for all listings.

    Used by:
    - CLI: dump-images
    - API: (could be added)

    Returns:
        Number of listings processed
    """
    # Imported lazily to avoid circular imports.
    from services.image_fetcher import dump_images

    await dump_images(repository, image_base_path=data_dir)
    return len(await repository.get_listings())
|
||||||
|
|
||||||
|
|
||||||
|
async def detect_floorplans(
    repository: ListingRepository,
) -> int:
    """Run OCR on floorplan images to detect square meters.

    Used by:
    - CLI: detect-floorplan
    - API: (could be added)

    Returns:
        Number of listings processed
    """
    # Imported lazily to avoid circular imports.
    from services.floorplan_detector import detect_floorplan

    await detect_floorplan(repository)
    return len(await repository.get_listings())
|
||||||
|
|
||||||
|
|
||||||
|
async def calculate_routes(
    repository: ListingRepository,
    destination_address: str,
    travel_mode: str,
    limit: int | None = None,
) -> int:
    """Calculate transit routes for listings.

    Args:
        repository: Database repository
        destination_address: Destination to route each listing to
        travel_mode: Name of a rec.routing.TravelMode member
        limit: Maximum number of listings to process (None = all)

    Used by:
    - CLI: routing
    - API: (could be added)

    Returns:
        Number of listings processed
    """
    from services.route_calculator import calculate_route
    from rec.routing import TravelMode

    await calculate_route(
        repository,
        destination_address,
        TravelMode[travel_mode],
        limit=limit,
    )
    # Fix: `limit or 0` wrongly reported 0 when no limit was given even
    # though every listing was processed. Report the real count instead:
    # calculate_route caps its work at `limit` when one is supplied.
    listings = await repository.get_listings()
    if limit is None:
        return len(listings)
    return min(limit, len(listings))
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
|
"""Route calculator service - calculates transit routes using Google Maps API."""
|
||||||
from models.listing import DestinationMode, Route, RouteLegStep
|
from models.listing import DestinationMode, Route, RouteLegStep
|
||||||
from repositories.listing_repository import ListingRepository
|
from repositories.listing_repository import ListingRepository
|
||||||
from tqdm.asyncio import tqdm
|
from tqdm.asyncio import tqdm
|
||||||
|
|
@ -11,6 +12,7 @@ async def calculate_route(
|
||||||
travel_mode: routing.TravelMode,
|
travel_mode: routing.TravelMode,
|
||||||
limit: int | None = None,
|
limit: int | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
"""Calculate transit routes for listings to a destination."""
|
||||||
listings = await repository.get_listings()
|
listings = await repository.get_listings()
|
||||||
|
|
||||||
if limit is not None:
|
if limit is not None:
|
||||||
|
|
@ -30,6 +32,7 @@ async def calculate_route(
|
||||||
async def update_routing_info(
|
async def update_routing_info(
|
||||||
listing: Listing, destination_mode: DestinationMode
|
listing: Listing, destination_mode: DestinationMode
|
||||||
) -> Listing | None:
|
) -> Listing | None:
|
||||||
|
"""Update routing information for a single listing."""
|
||||||
if listing.routing_info.get(destination_mode) is not None:
|
if listing.routing_info.get(destination_mode) is not None:
|
||||||
# already calculated, do not recompute to save API calls
|
# already calculated, do not recompute to save API calls
|
||||||
return None
|
return None
|
||||||
|
|
@ -41,8 +44,7 @@ async def update_routing_info(
|
||||||
destination_mode.travel_mode,
|
destination_mode.travel_mode,
|
||||||
)
|
)
|
||||||
|
|
||||||
route_data = routes_data["routes"][0]
|
routes: list[Route] = []
|
||||||
routes = []
|
|
||||||
for route_data in routes_data["routes"]:
|
for route_data in routes_data["routes"]:
|
||||||
duration_s = int(route_data["duration"].split("s")[0])
|
duration_s = int(route_data["duration"].split("s")[0])
|
||||||
route = Route(
|
route = Route(
|
||||||
|
|
@ -61,47 +63,4 @@ async def update_routing_info(
|
||||||
listing.routing_info_json = listing.serialize_routing_info(
|
listing.routing_info_json = listing.serialize_routing_info(
|
||||||
{**listing.routing_info, **{destination_mode: routes}}
|
{**listing.routing_info, **{destination_mode: routes}}
|
||||||
)
|
)
|
||||||
return listing
|
return listing
|
||||||
|
|
||||||
|
|
||||||
# async def geocode_address(
|
|
||||||
# address: str,
|
|
||||||
# geocoding_cache: pathlib.Path,
|
|
||||||
# ) -> tuple[int, int]:
|
|
||||||
# cache = get_geocoding_cache(geocoding_cache)
|
|
||||||
# cached_results = cache.get(address)
|
|
||||||
# if cached_results is None:
|
|
||||||
# # resolve
|
|
||||||
# async with aiohttp.ClientSession() as session:
|
|
||||||
# async with session.get(
|
|
||||||
# ("https://maps.googleapis.com/maps/api/geocode/json"
|
|
||||||
# f"?address={address}"
|
|
||||||
# f"&key={API_KEY_ENVIRONMENT_VARIABLE}")) as response:
|
|
||||||
# if response.status != 200:
|
|
||||||
# raise Exception(
|
|
||||||
# f"Error {response.status} from geocoding API")
|
|
||||||
# cached_results = await response.json()
|
|
||||||
# with open(geocoding_cache, 'w') as f:
|
|
||||||
# json.dump({
|
|
||||||
# **{
|
|
||||||
# address: cached_results,
|
|
||||||
# },
|
|
||||||
# **cache
|
|
||||||
# }, f)
|
|
||||||
# # API format
|
|
||||||
# lat = cached_results["results"][0]["geometry"]["location"]["lat"]
|
|
||||||
# lng = cached_results["results"][0]["geometry"]["location"]["lng"]
|
|
||||||
# cache[address] = (lat, lng)
|
|
||||||
# with open(geocoding_cache, 'w') as f:
|
|
||||||
# json.dump(cache, f)
|
|
||||||
# return lat, lng
|
|
||||||
|
|
||||||
# def get_geocoding_cache(geocoding_cache: pathlib.Path) -> dict[str, Any]:
|
|
||||||
# try:
|
|
||||||
# with open(geocoding_cache, 'x') as f:
|
|
||||||
# json.dump({}, f)
|
|
||||||
# return {}
|
|
||||||
# except FileExistsError:
|
|
||||||
# pass # File already exists
|
|
||||||
# with open(geocoding_cache, 'r') as f:
|
|
||||||
# return json.load(f)
|
|
||||||
|
|
@ -11,9 +11,14 @@ import json
|
||||||
class TaskStatus:
|
class TaskStatus:
|
||||||
"""Status of a background task."""
|
"""Status of a background task."""
|
||||||
task_id: str
|
task_id: str
|
||||||
status: str # PENDING, STARTED, SUCCESS, FAILURE, REVOKED
|
status: str # PENDING, STARTED, SUCCESS, FAILURE, REVOKED, SKIPPED
|
||||||
result: Any | None
|
result: Any | None
|
||||||
progress: float | None # 0.0 to 1.0
|
progress: float | None # 0.0 to 1.0
|
||||||
|
processed: int | None # Number of items processed
|
||||||
|
total: int | None # Total number of items
|
||||||
|
message: str | None # Human-readable status message (e.g., "Fetching listings")
|
||||||
|
error: str | None # Error message if failed
|
||||||
|
traceback: str | None # Full traceback if failed
|
||||||
|
|
||||||
|
|
||||||
def get_task_status(task_id: str) -> TaskStatus:
|
def get_task_status(task_id: str) -> TaskStatus:
|
||||||
|
|
@ -33,21 +38,50 @@ def get_task_status(task_id: str) -> TaskStatus:
|
||||||
task_result = dump_listings_task.AsyncResult(task_id)
|
task_result = dump_listings_task.AsyncResult(task_id)
|
||||||
|
|
||||||
# Try to serialize result
|
# Try to serialize result
|
||||||
try:
|
result = None
|
||||||
result = json.loads(json.dumps(task_result.result))
|
error = None
|
||||||
except (TypeError, json.JSONDecodeError):
|
if task_result.failed():
|
||||||
result = str(task_result.result) if task_result.result else None
|
# Extract error message from failed task
|
||||||
|
error = str(task_result.result) if task_result.result else None
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
result = json.loads(json.dumps(task_result.result))
|
||||||
|
except (TypeError, json.JSONDecodeError):
|
||||||
|
result = str(task_result.result) if task_result.result else None
|
||||||
|
|
||||||
# Extract progress from task meta if available
|
# Extract traceback if available
|
||||||
|
task_traceback = task_result.traceback if task_result.failed() else None
|
||||||
|
|
||||||
|
# Extract progress, processed, total, and message from task meta
|
||||||
progress = None
|
progress = None
|
||||||
|
processed = None
|
||||||
|
total = None
|
||||||
|
message = None
|
||||||
|
|
||||||
if task_result.info and isinstance(task_result.info, dict):
|
if task_result.info and isinstance(task_result.info, dict):
|
||||||
progress = task_result.info.get("progress")
|
progress = task_result.info.get("progress")
|
||||||
|
processed = task_result.info.get("processed")
|
||||||
|
total = task_result.info.get("total")
|
||||||
|
# Use 'message' if available, fall back to 'reason' for SKIPPED tasks
|
||||||
|
message = task_result.info.get("message") or task_result.info.get("reason")
|
||||||
|
|
||||||
|
# For custom states (like "Fetching listings"), use the state as message
|
||||||
|
# if no message was provided in info
|
||||||
|
if not message and task_result.status not in (
|
||||||
|
"PENDING", "STARTED", "SUCCESS", "FAILURE", "REVOKED", "RETRY"
|
||||||
|
):
|
||||||
|
message = task_result.status
|
||||||
|
|
||||||
return TaskStatus(
|
return TaskStatus(
|
||||||
task_id=task_id,
|
task_id=task_id,
|
||||||
status=task_result.status,
|
status=task_result.status,
|
||||||
result=result,
|
result=result,
|
||||||
progress=progress,
|
progress=progress,
|
||||||
|
processed=processed,
|
||||||
|
total=total,
|
||||||
|
message=message,
|
||||||
|
error=error,
|
||||||
|
traceback=task_traceback,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
196
crawler/start.sh
196
crawler/start.sh
|
|
@ -1,55 +1,153 @@
|
||||||
#!/usr/bin/env bash
set -eu

# Real Estate Crawler - Development Server
# Usage:
#   ./start.sh          - Start with Docker (recommended)
#   ./start.sh --local  - Start locally (requires Poetry and dependencies)
#   ./start.sh --help   - Show help

show_help() {
    echo "Real Estate Crawler - Development Server"
    echo ""
    echo "Usage: ./start.sh [OPTIONS]"
    echo ""
    echo "Options:"
    echo "  (default)   Start all services with Docker Compose"
    echo "  --local     Run locally with Poetry (requires local deps)"
    echo "  --build     Rebuild Docker images before starting"
    echo "  --down      Stop and remove all containers"
    echo "  --logs      Follow logs from all services"
    echo "  --help      Show this help message"
    echo ""
    echo "Examples:"
    echo "  ./start.sh          # Start with Docker"
    echo "  ./start.sh --build  # Rebuild and start"
    echo "  ./start.sh --local  # Run locally with Poetry"
}

start_docker() {
    local build_flag=""
    if [[ "${1:-}" == "--build" ]]; then
        build_flag="--build"
    fi

    echo "🐳 Starting services with Docker Compose..."
    echo ""

    # Check if docker/podman is available
    if command -v docker &> /dev/null; then
        COMPOSE_CMD="docker compose"
    elif command -v podman-compose &> /dev/null; then
        COMPOSE_CMD="podman-compose"
    else
        echo "❌ Error: Neither docker nor podman-compose found."
        echo "   Install Docker: https://docs.docker.com/get-docker/"
        echo "   Or run locally: ./start.sh --local"
        exit 1
    fi

    $COMPOSE_CMD up $build_flag
}

stop_docker() {
    echo "🛑 Stopping all containers..."
    if command -v docker &> /dev/null; then
        docker compose down
    elif command -v podman-compose &> /dev/null; then
        podman-compose down
    fi
}

show_logs() {
    if command -v docker &> /dev/null; then
        docker compose logs -f
    elif command -v podman-compose &> /dev/null; then
        podman-compose logs -f
    fi
}

start_local() {
    echo "🛠️  Starting locally with Poetry..."
    echo ""

    # Check Poetry is available
    if ! command -v poetry &> /dev/null; then
        echo "❌ Error: Poetry not found."
        echo "   Install: curl -sSL https://install.python-poetry.org | python3 -"
        echo "   Or use Docker: ./start.sh"
        exit 1
    fi

    # Source .env if it exists
    if [[ -f .env ]]; then
        set -a
        source .env
        set +a
    fi

    ENV_MODE=${ENV:-"dev"}

    # Ensure Redis is running
    if ! nc -z localhost 6379 2>/dev/null; then
        echo "📦 Starting Redis container..."
        docker run -d --rm --name rec-redis-local -p 6379:6379 redis:latest || true
        sleep 2
    fi

    echo "✅ Redis OK"

    # Test celery connection
    poetry run python celery_app.py

    # Start Celery worker in background
    echo "🔧 Starting Celery worker..."
    if [[ "$ENV_MODE" == "dev" ]]; then
        poetry run celery -A celery_app worker --loglevel=info &
    else
        poetry run alembic upgrade head
        poetry run celery -A celery_app worker --beat --loglevel=info &
    fi
    CELERY_PID=$!

    cleanup() {
        echo ""
        echo "🛑 Stopping Celery worker (PID: $CELERY_PID)..."
        kill "$CELERY_PID" 2>/dev/null || true
        wait "$CELERY_PID" 2>/dev/null || true
    }
    trap cleanup EXIT SIGINT SIGTERM

    # Start uvicorn
    echo "🚀 Starting API server on http://localhost:5001"
    echo ""
    poetry run uvicorn api.app:app --host 0.0.0.0 --port 5001 --reload
}

# Parse arguments
case "${1:-}" in
    --help|-h)
        show_help
        ;;
    --local)
        start_local
        ;;
    --down)
        stop_docker
        ;;
    --logs)
        show_logs
        ;;
    --build)
        start_docker --build
        ;;
    "")
        start_docker
        ;;
    *)
        echo "❌ Unknown option: $1"
        echo ""
        show_help
        exit 1
        ;;
esac
||||||
|
|
|
||||||
1
crawler/tests/__init__.py
Normal file
1
crawler/tests/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
# Tests package
|
||||||
186
crawler/tests/conftest.py
Normal file
186
crawler/tests/conftest.py
Normal file
|
|
@ -0,0 +1,186 @@
|
||||||
|
"""Shared pytest fixtures for the test suite."""
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import AsyncGenerator, Generator
|
||||||
|
import pytest
|
||||||
|
from sqlalchemy import Engine
|
||||||
|
from sqlmodel import SQLModel, Session, create_engine
|
||||||
|
from httpx import ASGITransport, AsyncClient
|
||||||
|
|
||||||
|
from models.listing import (
|
||||||
|
BuyListing,
|
||||||
|
FurnishType,
|
||||||
|
ListingSite,
|
||||||
|
RentListing,
|
||||||
|
Listing,
|
||||||
|
)
|
||||||
|
from repositories.listing_repository import ListingRepository
|
||||||
|
from api.auth import User
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def in_memory_engine() -> Generator[Engine, None, None]:
    """Yield a throwaway in-memory SQLite engine with all tables created."""
    engine = create_engine(
        "sqlite:///:memory:",
        echo=False,
        # Allow use from threads other than the one that created the engine.
        connect_args={"check_same_thread": False},
    )
    SQLModel.metadata.create_all(engine)
    yield engine
    # Teardown: drop everything so state never leaks between tests.
    SQLModel.metadata.drop_all(engine)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def listing_repository(in_memory_engine: Engine) -> ListingRepository:
    """Build a ListingRepository backed by the in-memory test engine."""
    return ListingRepository(engine=in_memory_engine)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_rent_listing() -> RentListing:
    """Build a single, fully-populated RentListing for tests."""
    return RentListing(
        id=12345678,
        price=2500.0,
        number_of_bedrooms=2,
        square_meters=65.0,
        agency="Test Agency",
        council_tax_band="C",
        longitude=-0.1276,
        latitude=51.5074,
        price_history_json="[]",
        listing_site=ListingSite.RIGHTMOVE,
        last_seen=datetime.now(),
        photo_thumbnail="https://example.com/photo.jpg",
        floorplan_image_paths=[],
        additional_info={"property": {"visible": True}},
        routing_info_json=None,
        furnish_type=FurnishType.FURNISHED,
        available_from=datetime.now(),
    )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_buy_listing() -> BuyListing:
    """Build a single, fully-populated BuyListing for tests."""
    return BuyListing(
        id=87654321,
        price=450000.0,
        number_of_bedrooms=3,
        square_meters=95.0,
        agency="Test Estate Agents",
        council_tax_band="D",
        longitude=-0.1180,
        latitude=51.5100,
        price_history_json="[]",
        listing_site=ListingSite.RIGHTMOVE,
        last_seen=datetime.now(),
        photo_thumbnail="https://example.com/buy_photo.jpg",
        floorplan_image_paths=[],
        additional_info={"property": {"visible": True}},
        routing_info_json=None,
        service_charge=1500.0,
        lease_left=90,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def sample_rent_listings() -> list[RentListing]:
    """Provide three RentListings spanning price/size/furnishing for filter tests."""
    now = datetime.now()
    # (id, price, bedrooms, sqm, agency, tax band, lon, lat, furnish type)
    specs = [
        (1, 1500.0, 1, 40.0, "Agency A", "B", -0.1, 51.5, FurnishType.FURNISHED),
        (2, 2000.0, 2, 55.0, "Agency B", "C", -0.12, 51.51, FurnishType.UNFURNISHED),
        (3, 3000.0, 3, 80.0, "Agency C", "D", -0.14, 51.52, FurnishType.FURNISHED),
    ]
    return [
        RentListing(
            id=listing_id,
            price=price,
            number_of_bedrooms=bedrooms,
            square_meters=sqm,
            agency=agency,
            council_tax_band=tax_band,
            longitude=lon,
            latitude=lat,
            price_history_json="[]",
            listing_site=ListingSite.RIGHTMOVE,
            last_seen=now,
            photo_thumbnail=None,
            floorplan_image_paths=[],
            additional_info={"property": {"visible": True}},
            routing_info_json=None,
            furnish_type=furnish,
            available_from=now,
        )
        for listing_id, price, bedrooms, sqm, agency, tax_band, lon, lat, furnish in specs
    ]
@pytest.fixture
def mock_user() -> User:
    """Provide a stand-in authenticated user for API tests."""
    return User(sub="test-user-id", email="test@example.com", name="Test User")
@pytest.fixture
async def async_client(
    in_memory_engine: Engine, mock_user: User
) -> AsyncGenerator[AsyncClient, None]:
    """Yield an httpx AsyncClient wired to the app with authentication mocked.

    Requests made through the returned client behave as if authenticated
    as ``mock_user``.  The auth override is removed after the test so later
    tests see the real dependency.
    """
    from api.app import app
    from api.auth import get_current_user

    # Make every request appear authenticated as `mock_user`.
    app.dependency_overrides[get_current_user] = lambda: mock_user

    # Patch the engine used by the repository
    # NOTE(review): this block is currently a no-op — `original_engine` is
    # never used afterwards and the repository engine is never actually
    # swapped for `in_memory_engine`; confirm whether the swap was intended.
    original_engine = None
    try:
        from database import engine as db_engine
        original_engine = db_engine
    except Exception:
        # Best-effort: `database` may not be importable in all test setups.
        pass

    # ASGITransport calls the app in-process; no real network socket.
    transport = ASGITransport(app=app)
    async with AsyncClient(transport=transport, base_url="http://test") as client:
        yield client

    # Clean up dependency overrides so other tests get real auth.
    app.dependency_overrides.clear()
||||||
1
crawler/tests/integration/__init__.py
Normal file
1
crawler/tests/integration/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
# Integration tests package
|
||||||
180
crawler/tests/integration/test_api.py
Normal file
180
crawler/tests/integration/test_api.py
Normal file
|
|
@ -0,0 +1,180 @@
|
||||||
|
"""Integration tests for API endpoints."""
|
||||||
|
from unittest.mock import AsyncMock, patch
|
||||||
|
import pytest
|
||||||
|
from httpx import AsyncClient
|
||||||
|
|
||||||
|
from api.auth import User
|
||||||
|
|
||||||
|
|
||||||
|
class TestStatusEndpoint:
    """Tests for the /api/status endpoint."""

    async def test_status_endpoint_returns_ok(
        self, async_client: AsyncClient
    ) -> None:
        """The health-check endpoint reports OK with a 200."""
        result = await async_client.get("/api/status")
        assert result.status_code == 200
        payload = result.json()
        assert payload == {"status": "OK"}
class TestListingEndpoint:
    """Tests for the /api/listing endpoint."""

    async def test_listing_endpoint_requires_auth(self) -> None:
        """Without credentials the endpoint must reject the request."""
        from api.app import app
        from httpx import ASGITransport, AsyncClient

        # Drop any auth overrides installed elsewhere so the real
        # authentication dependency is exercised.
        app.dependency_overrides.clear()

        async with AsyncClient(
            transport=ASGITransport(app=app), base_url="http://test"
        ) as client:
            result = await client.get("/api/listing")
            # Should return 401 or 403 without valid auth
            assert result.status_code in (401, 403)

    async def test_listing_endpoint_with_auth(
        self, async_client: AsyncClient
    ) -> None:
        """With mocked auth the endpoint returns a listings payload."""
        # Repository stubbed out: the endpoint shape is what is under test.
        repo_patch = patch(
            "api.app.ListingRepository.get_listings",
            new_callable=AsyncMock,
            return_value=[],
        )
        with repo_patch:
            result = await async_client.get("/api/listing")
        assert result.status_code == 200
        assert "listings" in result.json()
class TestListingGeoJsonEndpoint:
    """Tests for the /api/listing_geojson endpoint."""

    async def test_listing_geojson_requires_auth(self) -> None:
        """Unauthenticated geojson requests must be rejected."""
        from api.app import app
        from httpx import ASGITransport, AsyncClient

        # Remove mocked auth so the real dependency runs.
        app.dependency_overrides.clear()

        async with AsyncClient(
            transport=ASGITransport(app=app), base_url="http://test"
        ) as client:
            result = await client.get(
                "/api/listing_geojson",
                params={"listing_type": "RENT"},
            )
            # Should return 401 or 403 without valid auth
            assert result.status_code in (401, 403)

    async def test_listing_geojson_with_filters(
        self, async_client: AsyncClient
    ) -> None:
        """Filter query parameters are accepted and a FeatureCollection returned."""
        export_patch = patch(
            "api.app.export_immoweb",
            new_callable=AsyncMock,
            return_value={"type": "FeatureCollection", "features": []},
        )
        with export_patch:
            result = await async_client.get(
                "/api/listing_geojson",
                params={
                    "listing_type": "RENT",
                    "min_bedrooms": 2,
                    "max_bedrooms": 3,
                    "min_price": 1500,
                    "max_price": 3000,
                },
            )
        assert result.status_code == 200
        assert result.json()["type"] == "FeatureCollection"
class TestGetDistrictsEndpoint:
    """Tests for the /api/get_districts endpoint."""

    async def test_get_districts_requires_auth(self) -> None:
        """Unauthenticated requests must be rejected."""
        from api.app import app
        from httpx import ASGITransport, AsyncClient

        # Remove mocked auth so the real dependency runs.
        app.dependency_overrides.clear()

        async with AsyncClient(
            transport=ASGITransport(app=app), base_url="http://test"
        ) as client:
            result = await client.get("/api/get_districts")
            # Should return 401 or 403 without valid auth
            assert result.status_code in (401, 403)

    async def test_get_districts_returns_dict(
        self, async_client: AsyncClient
    ) -> None:
        """The endpoint returns a mapping containing well-known districts."""
        result = await async_client.get("/api/get_districts")
        assert result.status_code == 200
        districts = result.json()
        assert isinstance(districts, dict)
        # Spot-check a few districts that must always be present.
        for expected in ("London", "Westminster", "Camden"):
            assert expected in districts

    async def test_get_districts_values_are_region_ids(
        self, async_client: AsyncClient
    ) -> None:
        """Every district value is a Rightmove REGION^... identifier."""
        result = await async_client.get("/api/get_districts")
        data = result.json()
        for district_name, region_id in data.items():
            assert region_id.startswith("REGION^"), (
                f"District {district_name} has invalid region ID: {region_id}"
            )
class TestRefreshListingsEndpoint:
    """Tests for the /api/refresh_listings endpoint."""

    async def test_refresh_listings_requires_auth(self) -> None:
        """Unauthenticated refresh requests must be rejected."""
        from api.app import app
        from httpx import ASGITransport, AsyncClient

        # Remove mocked auth so the real dependency runs.
        app.dependency_overrides.clear()

        async with AsyncClient(
            transport=ASGITransport(app=app), base_url="http://test"
        ) as client:
            result = await client.post(
                "/api/refresh_listings",
                params={"listing_type": "RENT"},
            )
            # Should return 401 or 403 without valid auth
            assert result.status_code in (401, 403)
class TestTaskStatusEndpoint:
    """Tests for the /api/task_status endpoint."""

    async def test_task_status_requires_auth(self) -> None:
        """Unauthenticated task-status requests must be rejected."""
        from api.app import app
        from httpx import ASGITransport, AsyncClient

        # Remove mocked auth so the real dependency runs.
        app.dependency_overrides.clear()

        async with AsyncClient(
            transport=ASGITransport(app=app), base_url="http://test"
        ) as client:
            result = await client.get(
                "/api/task_status",
                params={"task_id": "test-task-id"},
            )
            # Should return 401 or 403 without valid auth
            assert result.status_code in (401, 403)
299
crawler/tests/test_listing_geojson.py
Normal file
299
crawler/tests/test_listing_geojson.py
Normal file
|
|
@ -0,0 +1,299 @@
|
||||||
|
"""Tests for the listing_geojson API endpoint and QueryParameters parsing."""
|
||||||
|
import json
|
||||||
|
import pytest
|
||||||
|
from datetime import datetime
|
||||||
|
from unittest.mock import patch, MagicMock, AsyncMock
|
||||||
|
|
||||||
|
|
||||||
|
class TestQueryParametersModel:
    """Test QueryParameters model directly."""

    def test_datetime_parsing_z_suffix(self):
        """A trailing 'Z' (UTC) suffix must parse into a datetime."""
        from models.listing import QueryParameters, ListingType

        parsed = QueryParameters(
            listing_type=ListingType.RENT,
            let_date_available_from="2026-02-01T11:33:01.248Z",
        )
        assert parsed.let_date_available_from is not None
        assert parsed.let_date_available_from.year == 2026

    def test_datetime_parsing_offset(self):
        """An explicit +00:00 offset must parse into a datetime."""
        from models.listing import QueryParameters, ListingType

        parsed = QueryParameters(
            listing_type=ListingType.RENT,
            let_date_available_from="2026-02-01T11:33:01.248+00:00",
        )
        assert parsed.let_date_available_from is not None

    def test_defaults_work(self):
        """Omitted filter fields fall back to the model defaults."""
        from models.listing import QueryParameters, ListingType

        parsed = QueryParameters(listing_type=ListingType.RENT)
        assert parsed.min_bedrooms == 1
        assert parsed.max_bedrooms == 999
        assert parsed.min_price == 0
        assert parsed.max_price == 10_000_000
        assert parsed.district_names == set()
        assert parsed.let_date_available_from is None

    def test_full_frontend_params(self):
        """The complete set of parameters the frontend sends is accepted."""
        from models.listing import QueryParameters, ListingType

        parsed = QueryParameters(
            listing_type=ListingType.RENT,
            min_bedrooms=1,
            max_bedrooms=3,
            max_price=3000,
            min_price=2000,
            min_sqm=50,
            last_seen_days=28,
            let_date_available_from="2026-02-01T11:19:22.072Z",
        )
        assert parsed.listing_type == ListingType.RENT
        assert parsed.min_bedrooms == 1
        assert parsed.max_bedrooms == 3
        assert parsed.min_sqm == 50
class TestGetQueryParametersDependency:
    """Test the get_query_parameters FastAPI dependency."""

    def test_parses_datetime_correctly(self):
        """The dependency accepts an already-parsed datetime value."""
        from api.app import get_query_parameters
        from models.listing import ListingType

        result = get_query_parameters(
            listing_type=ListingType.RENT,
            let_date_available_from=datetime(2026, 2, 1, 11, 33, 1),
        )
        assert result.let_date_available_from is not None

    def test_defaults_applied(self):
        """Omitted arguments receive the documented defaults."""
        from api.app import get_query_parameters
        from models.listing import ListingType

        result = get_query_parameters(listing_type=ListingType.RENT)
        assert result.min_bedrooms == 1
        assert result.max_bedrooms == 999
class TestListingGeoJsonEndpoint:
    """Test the /api/listing_geojson endpoint."""

    @pytest.fixture
    def client(self):
        """Create test client with mocked auth."""
        from fastapi.testclient import TestClient
        from api.app import app, get_current_user
        from api.auth import User

        # Every request through this client is authenticated as a test user.
        async def mock_auth():
            return User(email="test@example.com", name="Test User")

        app.dependency_overrides[get_current_user] = mock_auth
        yield TestClient(app)
        app.dependency_overrides.clear()

    @pytest.fixture
    def mock_export(self):
        """Stub the export service with a single-feature collection."""
        with patch("api.app.export_service.export_to_geojson") as stub:
            stub.return_value = MagicMock(
                data={"type": "FeatureCollection", "features": [{"type": "Feature"}]}
            )
            yield stub

    def test_minimal_params_no_422(self, client, mock_export):
        """Only listing_type set: request must not be rejected with 422."""
        resp = client.get("/api/listing_geojson?listing_type=RENT")
        assert resp.status_code != 422, f"Got 422: {resp.json()}"

    def test_with_datetime_z_suffix_no_422(self, client, mock_export):
        """A Z-suffixed datetime query value must be accepted."""
        url = (
            "/api/listing_geojson?listing_type=RENT"
            "&let_date_available_from=2026-02-01T11:33:01.248Z"
        )
        resp = client.get(url)
        assert resp.status_code != 422, f"Got 422: {resp.json()}"

    def test_full_frontend_params_no_422(self, client, mock_export):
        """All parameters the frontend sends together must be accepted."""
        url = (
            "/api/listing_geojson?listing_type=RENT"
            "&min_bedrooms=1&max_bedrooms=3"
            "&max_price=3000&min_price=2000"
            "&min_sqm=50&last_seen_days=28"
            "&let_date_available_from=2026-02-01T11:19:22.072Z"
        )
        resp = client.get(url)
        assert resp.status_code != 422, f"Got 422: {resp.json()}"

    def test_returns_geojson_structure(self, client, mock_export):
        """A successful request yields a GeoJSON FeatureCollection."""
        resp = client.get("/api/listing_geojson?listing_type=RENT")
        assert resp.status_code == 200
        body = resp.json()
        assert "type" in body
        assert body["type"] == "FeatureCollection"
        assert "features" in body
class TestStreamingEndpoint:
    """Test the /api/listing_geojson/stream endpoint (NDJSON streaming).

    The stream protocol is one JSON object per line: a ``metadata`` line
    first, one or more ``batch`` lines carrying GeoJSON features, and a
    final ``complete`` line with the total count.
    """

    @pytest.fixture
    def client(self):
        """Create a TestClient with authentication mocked out."""
        from fastapi.testclient import TestClient
        from api.app import app
        from api.auth import get_current_user, User

        async def mock_auth():
            return User(sub="test-id", email="test@example.com", name="Test User")

        app.dependency_overrides[get_current_user] = mock_auth
        yield TestClient(app)
        app.dependency_overrides.clear()

    @pytest.fixture
    def mock_repository(self):
        """Mock the repository: three rows, covering null sqm/agency/thumbnail."""
        with patch("api.app.ListingRepository") as MockRepo:
            mock_instance = MagicMock()
            mock_instance.count_listings.return_value = 3
            mock_instance.stream_listings_optimized.return_value = iter([
                {
                    'id': 1,
                    'price': 2000.0,
                    'number_of_bedrooms': 2,
                    'square_meters': 50.0,
                    'longitude': -0.1,
                    'latitude': 51.5,
                    'photo_thumbnail': 'https://example.com/1.jpg',
                    'last_seen': datetime.now(),
                    'agency': 'Test Agency',
                    'price_history_json': '[]',
                    'available_from': datetime.now(),
                },
                {
                    'id': 2,
                    'price': 2500.0,
                    'number_of_bedrooms': 2,
                    'square_meters': 60.0,
                    'longitude': -0.12,
                    'latitude': 51.51,
                    'photo_thumbnail': 'https://example.com/2.jpg',
                    'last_seen': datetime.now(),
                    'agency': 'Test Agency 2',
                    'price_history_json': '[]',
                    'available_from': None,
                },
                {
                    # Deliberately sparse row: null sqm/thumbnail/agency plus a
                    # non-empty price history, to exercise the edge cases.
                    'id': 3,
                    'price': 3000.0,
                    'number_of_bedrooms': 3,
                    'square_meters': None,
                    'longitude': -0.14,
                    'latitude': 51.52,
                    'photo_thumbnail': None,
                    'last_seen': datetime.now(),
                    'agency': None,
                    'price_history_json': '[{"first_seen": "2026-01-01", "last_seen": "2026-01-15", "price": 2800}]',
                    'available_from': None,
                },
            ])
            MockRepo.return_value = mock_instance
            yield mock_instance

    def test_streaming_returns_ndjson(self, client, mock_repository):
        """The streaming endpoint responds with the NDJSON content type."""
        response = client.get("/api/listing_geojson/stream?listing_type=RENT&limit=10")
        assert response.status_code == 200
        assert response.headers["content-type"] == "application/x-ndjson"

    def test_streaming_metadata_includes_total_expected(self, client, mock_repository):
        """The first streamed line is metadata with the expected total count."""
        response = client.get("/api/listing_geojson/stream?listing_type=RENT&limit=10")
        lines = response.text.strip().split("\n")
        assert len(lines) >= 1

        metadata = json.loads(lines[0])
        assert metadata["type"] == "metadata"
        assert "total_expected" in metadata
        assert metadata["total_expected"] == 3
        assert "batch_size" in metadata

    def test_streaming_returns_batches_and_complete(self, client, mock_repository):
        """The stream is metadata, then batch line(s), then a complete line."""
        response = client.get("/api/listing_geojson/stream?listing_type=RENT&limit=10")
        lines = response.text.strip().split("\n")

        messages = [json.loads(line) for line in lines]

        # First should be metadata
        assert messages[0]["type"] == "metadata"

        # Should have at least one batch
        batch_messages = [m for m in messages if m["type"] == "batch"]
        assert len(batch_messages) >= 1

        # Last should be complete
        assert messages[-1]["type"] == "complete"
        assert "total" in messages[-1]

    def test_streaming_features_have_correct_structure(self, client, mock_repository):
        """Streamed features are valid GeoJSON Points with the expected properties."""
        response = client.get("/api/listing_geojson/stream?listing_type=RENT&batch_size=10&limit=10")
        lines = response.text.strip().split("\n")
        messages = [json.loads(line) for line in lines]

        batch_messages = [m for m in messages if m["type"] == "batch"]
        assert len(batch_messages) >= 1

        features = batch_messages[0]["features"]
        assert len(features) > 0

        feature = features[0]
        assert feature["type"] == "Feature"
        assert "properties" in feature
        assert "geometry" in feature
        assert feature["geometry"]["type"] == "Point"
        assert "coordinates" in feature["geometry"]

        # Check properties the map UI relies on.
        props = feature["properties"]
        assert "total_price" in props
        assert "rooms" in props
        assert "url" in props
        assert "last_seen" in props

    def test_streaming_handles_null_square_meters(self, client, mock_repository):
        """A null square_meters streams as null qm/qmprice without errors."""
        response = client.get("/api/listing_geojson/stream?listing_type=RENT&batch_size=10&limit=10")
        assert response.status_code == 200

        lines = response.text.strip().split("\n")
        messages = [json.loads(line) for line in lines]

        # Find the feature for listing id=3 (the row with null square_meters).
        found = False
        for msg in messages:
            if msg["type"] == "batch":
                for feature in msg["features"]:
                    if feature["properties"]["url"].endswith("/3"):
                        found = True
                        assert feature["properties"]["qm"] is None
                        assert feature["properties"]["qmprice"] is None
        # Bug fix: previously this loop passed vacuously when listing 3 was
        # never streamed; require that the target feature was actually seen.
        assert found, "feature for listing id=3 not found in stream"
1
crawler/tests/unit/__init__.py
Normal file
1
crawler/tests/unit/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
# Unit tests package
|
||||||
343
crawler/tests/unit/test_models.py
Normal file
343
crawler/tests/unit/test_models.py
Normal file
|
|
@ -0,0 +1,343 @@
|
||||||
|
"""Unit tests for Listing models."""
|
||||||
|
from datetime import datetime
|
||||||
|
import json
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from models.listing import (
|
||||||
|
BuyListing,
|
||||||
|
FurnishType,
|
||||||
|
ListingSite,
|
||||||
|
PriceHistoryItem,
|
||||||
|
RentListing,
|
||||||
|
Listing,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestListing:
    """Tests for the base Listing model.

    The original tests each repeated the full 18-field RentListing
    constructor; they now share ``_make_rent_listing`` so each test states
    only the fields it actually varies.
    """

    @staticmethod
    def _make_rent_listing(**overrides) -> RentListing:
        """Build a valid RentListing, applying keyword overrides to sane defaults."""
        fields: dict = dict(
            id=1,
            price=2000.0,
            number_of_bedrooms=2,
            square_meters=50.0,
            agency="Test",
            council_tax_band="C",
            longitude=0.0,
            latitude=0.0,
            price_history_json="[]",
            listing_site=ListingSite.RIGHTMOVE,
            last_seen=datetime.now(),
            photo_thumbnail=None,
            floorplan_image_paths=[],
            additional_info={"property": {"visible": True}},
            routing_info_json=None,
            furnish_type=FurnishType.FURNISHED,
            available_from=None,
        )
        fields.update(overrides)
        return RentListing(**fields)

    def test_price_per_square_meter_calculation(self) -> None:
        """price / square_meters is computed for a normal listing."""
        listing = self._make_rent_listing(price=2000.0, square_meters=50.0)
        assert listing.price_per_square_meter == 40.0

    def test_price_per_square_meter_none_when_no_sqm(self) -> None:
        """price_per_square_meter is None when square_meters is None."""
        listing = self._make_rent_listing(square_meters=None)
        assert listing.price_per_square_meter is None

    def test_price_per_square_meter_none_when_sqm_zero(self) -> None:
        """price_per_square_meter is None when square_meters is 0 (no div-by-zero)."""
        listing = self._make_rent_listing(square_meters=0.0)
        assert listing.price_per_square_meter is None

    def test_url_property(self) -> None:
        """url points at the Rightmove property page for the listing id."""
        listing = self._make_rent_listing(id=123456789)
        assert listing.url == "https://www.rightmove.co.uk/properties/123456789"

    def test_is_removed_property_visible(self) -> None:
        """is_removed is False while the property is marked visible."""
        listing = self._make_rent_listing(
            additional_info={"property": {"visible": True}}
        )
        assert listing.is_removed is False

    def test_is_removed_property_not_visible(self) -> None:
        """is_removed is True once the property is no longer visible."""
        listing = self._make_rent_listing(
            additional_info={"property": {"visible": False}}
        )
        assert listing.is_removed is True
class TestPriceHistory:
    """Tests for price history serialization/deserialization."""

    def test_price_history_serialization_roundtrip(self) -> None:
        """Serialized history must deserialize back to the same entries."""
        moment = datetime.now()
        history = [
            PriceHistoryItem(first_seen=moment, last_seen=moment, price=2000.0),
            PriceHistoryItem(first_seen=moment, last_seen=moment, price=2100.0),
        ]

        # Serialize to the JSON string stored on the listing.
        encoded = Listing.serialize_price_history(history)
        assert isinstance(encoded, str)

        # Create a listing carrying the serialized history.
        listing = RentListing(
            id=1,
            price=2100.0,
            number_of_bedrooms=2,
            square_meters=50.0,
            agency="Test",
            council_tax_band="C",
            longitude=0.0,
            latitude=0.0,
            price_history_json=encoded,
            listing_site=ListingSite.RIGHTMOVE,
            last_seen=moment,
            photo_thumbnail=None,
            floorplan_image_paths=[],
            additional_info={"property": {"visible": True}},
            routing_info_json=None,
            furnish_type=FurnishType.FURNISHED,
            available_from=None,
        )

        # Deserialize and check both entries survived, in order.
        decoded = listing.price_history
        assert len(decoded) == 2
        assert decoded[0].price == 2000.0
        assert decoded[1].price == 2100.0

    def test_price_history_empty(self) -> None:
        """An empty price_history_json string decodes to an empty list."""
        listing = RentListing(
            id=1,
            price=2000.0,
            number_of_bedrooms=2,
            square_meters=50.0,
            agency="Test",
            council_tax_band="C",
            longitude=0.0,
            latitude=0.0,
            price_history_json="",
            listing_site=ListingSite.RIGHTMOVE,
            last_seen=datetime.now(),
            photo_thumbnail=None,
            floorplan_image_paths=[],
            additional_info={"property": {"visible": True}},
            routing_info_json=None,
            furnish_type=FurnishType.FURNISHED,
            available_from=None,
        )
        assert listing.price_history == []

    def test_price_history_item_to_dict(self) -> None:
        """to_dict() emits the price plus ISO-formatted timestamps."""
        moment = datetime.now()
        entry = PriceHistoryItem(first_seen=moment, last_seen=moment, price=2500.0)
        as_dict = entry.to_dict()
        assert as_dict["price"] == 2500.0
        assert as_dict["first_seen"] == moment.isoformat()
        assert as_dict["last_seen"] == moment.isoformat()
class TestRentListing:
|
||||||
|
"""Tests specific to RentListing model."""
|
||||||
|
|
||||||
|
def test_rent_listing_has_furnish_type(self) -> None:
|
||||||
|
"""Test that RentListing has furnish_type field."""
|
||||||
|
listing = RentListing(
|
||||||
|
id=1,
|
||||||
|
price=2000.0,
|
||||||
|
number_of_bedrooms=2,
|
||||||
|
square_meters=50.0,
|
||||||
|
agency="Test",
|
||||||
|
council_tax_band="C",
|
||||||
|
longitude=0.0,
|
||||||
|
latitude=0.0,
|
||||||
|
price_history_json="[]",
|
||||||
|
listing_site=ListingSite.RIGHTMOVE,
|
||||||
|
last_seen=datetime.now(),
|
||||||
|
photo_thumbnail=None,
|
||||||
|
floorplan_image_paths=[],
|
||||||
|
additional_info={"property": {"visible": True}},
|
||||||
|
routing_info_json=None,
|
||||||
|
furnish_type=FurnishType.PART_FURNISHED,
|
||||||
|
available_from=None,
|
||||||
|
)
|
||||||
|
assert listing.furnish_type == FurnishType.PART_FURNISHED
|
||||||
|
|
||||||
|
def test_rent_listing_has_available_from(self) -> None:
|
||||||
|
"""Test that RentListing has available_from field."""
|
||||||
|
now = datetime.now()
|
||||||
|
listing = RentListing(
|
||||||
|
id=1,
|
||||||
|
price=2000.0,
|
||||||
|
number_of_bedrooms=2,
|
||||||
|
square_meters=50.0,
|
||||||
|
agency="Test",
|
||||||
|
council_tax_band="C",
|
||||||
|
longitude=0.0,
|
||||||
|
latitude=0.0,
|
||||||
|
price_history_json="[]",
|
||||||
|
listing_site=ListingSite.RIGHTMOVE,
|
||||||
|
last_seen=now,
|
||||||
|
photo_thumbnail=None,
|
||||||
|
floorplan_image_paths=[],
|
||||||
|
additional_info={"property": {"visible": True}},
|
||||||
|
routing_info_json=None,
|
||||||
|
furnish_type=FurnishType.FURNISHED,
|
||||||
|
available_from=now,
|
||||||
|
)
|
||||||
|
assert listing.available_from == now
|
||||||
|
|
||||||
|
|
||||||
|
class TestBuyListing:
|
||||||
|
"""Tests specific to BuyListing model."""
|
||||||
|
|
||||||
|
def test_buy_listing_has_service_charge(self) -> None:
|
||||||
|
"""Test that BuyListing has service_charge field."""
|
||||||
|
listing = BuyListing(
|
||||||
|
id=1,
|
||||||
|
price=450000.0,
|
||||||
|
number_of_bedrooms=3,
|
||||||
|
square_meters=95.0,
|
||||||
|
agency="Test",
|
||||||
|
council_tax_band="D",
|
||||||
|
longitude=0.0,
|
||||||
|
latitude=0.0,
|
||||||
|
price_history_json="[]",
|
||||||
|
listing_site=ListingSite.RIGHTMOVE,
|
||||||
|
last_seen=datetime.now(),
|
||||||
|
photo_thumbnail=None,
|
||||||
|
floorplan_image_paths=[],
|
||||||
|
additional_info={"property": {"visible": True}},
|
||||||
|
routing_info_json=None,
|
||||||
|
service_charge=2500.0,
|
||||||
|
lease_left=85,
|
||||||
|
)
|
||||||
|
assert listing.service_charge == 2500.0
|
||||||
|
|
||||||
|
def test_buy_listing_has_lease_left(self) -> None:
|
||||||
|
"""Test that BuyListing has lease_left field."""
|
||||||
|
listing = BuyListing(
|
||||||
|
id=1,
|
||||||
|
price=450000.0,
|
||||||
|
number_of_bedrooms=3,
|
||||||
|
square_meters=95.0,
|
||||||
|
agency="Test",
|
||||||
|
council_tax_band="D",
|
||||||
|
longitude=0.0,
|
||||||
|
latitude=0.0,
|
||||||
|
price_history_json="[]",
|
||||||
|
listing_site=ListingSite.RIGHTMOVE,
|
||||||
|
last_seen=datetime.now(),
|
||||||
|
photo_thumbnail=None,
|
||||||
|
floorplan_image_paths=[],
|
||||||
|
additional_info={"property": {"visible": True}},
|
||||||
|
routing_info_json=None,
|
||||||
|
service_charge=None,
|
||||||
|
lease_left=120,
|
||||||
|
)
|
||||||
|
assert listing.lease_left == 120
|
||||||
74
crawler/tests/unit/test_redis_lock.py
Normal file
74
crawler/tests/unit/test_redis_lock.py
Normal file
|
|
@ -0,0 +1,74 @@
|
||||||
|
"""Unit tests for Redis distributed lock."""
|
||||||
|
from unittest import mock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from utils.redis_lock import redis_lock, get_redis_client
|
||||||
|
|
||||||
|
|
||||||
|
class TestRedisLock:
|
||||||
|
"""Tests for redis_lock context manager."""
|
||||||
|
|
||||||
|
@mock.patch("utils.redis_lock.get_redis_client")
|
||||||
|
def test_lock_acquired_successfully(self, mock_get_client):
|
||||||
|
"""Test lock acquisition when no other lock exists."""
|
||||||
|
mock_client = mock.MagicMock()
|
||||||
|
mock_client.set.return_value = True
|
||||||
|
mock_get_client.return_value = mock_client
|
||||||
|
|
||||||
|
with redis_lock("test_lock") as acquired:
|
||||||
|
assert acquired is True
|
||||||
|
|
||||||
|
mock_client.set.assert_called_once_with("lock:test_lock", "1", nx=True, ex=3600 * 4)
|
||||||
|
mock_client.delete.assert_called_once_with("lock:test_lock")
|
||||||
|
|
||||||
|
@mock.patch("utils.redis_lock.get_redis_client")
|
||||||
|
def test_lock_not_acquired(self, mock_get_client):
|
||||||
|
"""Test lock not acquired when another lock exists."""
|
||||||
|
mock_client = mock.MagicMock()
|
||||||
|
mock_client.set.return_value = None # Redis returns None when nx=True fails
|
||||||
|
mock_get_client.return_value = mock_client
|
||||||
|
|
||||||
|
with redis_lock("test_lock") as acquired:
|
||||||
|
assert acquired is False
|
||||||
|
|
||||||
|
mock_client.set.assert_called_once_with("lock:test_lock", "1", nx=True, ex=3600 * 4)
|
||||||
|
# Should NOT call delete since we didn't acquire the lock
|
||||||
|
mock_client.delete.assert_not_called()
|
||||||
|
|
||||||
|
@mock.patch("utils.redis_lock.get_redis_client")
|
||||||
|
def test_lock_released_on_exception(self, mock_get_client):
|
||||||
|
"""Test lock is released even when exception occurs."""
|
||||||
|
mock_client = mock.MagicMock()
|
||||||
|
mock_client.set.return_value = True
|
||||||
|
mock_get_client.return_value = mock_client
|
||||||
|
|
||||||
|
with pytest.raises(ValueError):
|
||||||
|
with redis_lock("test_lock") as acquired:
|
||||||
|
assert acquired is True
|
||||||
|
raise ValueError("Test error")
|
||||||
|
|
||||||
|
# Lock should still be released
|
||||||
|
mock_client.delete.assert_called_once_with("lock:test_lock")
|
||||||
|
|
||||||
|
@mock.patch("utils.redis_lock.get_redis_client")
|
||||||
|
def test_custom_timeout(self, mock_get_client):
|
||||||
|
"""Test lock with custom timeout."""
|
||||||
|
mock_client = mock.MagicMock()
|
||||||
|
mock_client.set.return_value = True
|
||||||
|
mock_get_client.return_value = mock_client
|
||||||
|
|
||||||
|
with redis_lock("test_lock", timeout=300) as acquired:
|
||||||
|
assert acquired is True
|
||||||
|
|
||||||
|
mock_client.set.assert_called_once_with("lock:test_lock", "1", nx=True, ex=300)
|
||||||
|
|
||||||
|
@mock.patch("utils.redis_lock.redis")
|
||||||
|
def test_get_redis_client_uses_broker_url(self, mock_redis):
|
||||||
|
"""Test Redis client is created from CELERY_BROKER_URL."""
|
||||||
|
with mock.patch.dict("os.environ", {"CELERY_BROKER_URL": "redis://testhost:1234/5"}):
|
||||||
|
get_redis_client()
|
||||||
|
|
||||||
|
mock_redis.from_url.assert_called_once_with(
|
||||||
|
"redis://testhost:1234/5", decode_responses=True
|
||||||
|
)
|
||||||
227
crawler/tests/unit/test_repository.py
Normal file
227
crawler/tests/unit/test_repository.py
Normal file
|
|
@ -0,0 +1,227 @@
|
||||||
|
"""Unit tests for ListingRepository."""
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
import pytest
|
||||||
|
from sqlalchemy import Engine
|
||||||
|
|
||||||
|
from models.listing import (
|
||||||
|
FurnishType,
|
||||||
|
ListingType,
|
||||||
|
QueryParameters,
|
||||||
|
RentListing,
|
||||||
|
)
|
||||||
|
from repositories.listing_repository import ListingRepository
|
||||||
|
|
||||||
|
|
||||||
|
class TestListingRepository:
|
||||||
|
"""Tests for ListingRepository methods."""
|
||||||
|
|
||||||
|
async def test_get_listings_empty_db(
|
||||||
|
self, listing_repository: ListingRepository
|
||||||
|
) -> None:
|
||||||
|
"""Test that get_listings returns empty list for empty database."""
|
||||||
|
listings = await listing_repository.get_listings()
|
||||||
|
assert listings == []
|
||||||
|
|
||||||
|
async def test_get_listings_returns_inserted_listings(
|
||||||
|
self,
|
||||||
|
listing_repository: ListingRepository,
|
||||||
|
sample_rent_listing: RentListing,
|
||||||
|
) -> None:
|
||||||
|
"""Test that get_listings returns listings that were inserted."""
|
||||||
|
await listing_repository.upsert_listings([sample_rent_listing])
|
||||||
|
listings = await listing_repository.get_listings()
|
||||||
|
assert len(listings) == 1
|
||||||
|
assert listings[0].id == sample_rent_listing.id
|
||||||
|
|
||||||
|
async def test_upsert_listings_creates_new(
|
||||||
|
self,
|
||||||
|
listing_repository: ListingRepository,
|
||||||
|
sample_rent_listing: RentListing,
|
||||||
|
) -> None:
|
||||||
|
"""Test that upsert_listings creates new listings."""
|
||||||
|
result = await listing_repository.upsert_listings([sample_rent_listing])
|
||||||
|
assert len(result) == 1
|
||||||
|
assert result[0].id == sample_rent_listing.id
|
||||||
|
|
||||||
|
# Verify it's in the database
|
||||||
|
listings = await listing_repository.get_listings()
|
||||||
|
assert len(listings) == 1
|
||||||
|
|
||||||
|
async def test_upsert_listings_updates_existing(
|
||||||
|
self,
|
||||||
|
listing_repository: ListingRepository,
|
||||||
|
sample_rent_listing: RentListing,
|
||||||
|
) -> None:
|
||||||
|
"""Test that upsert_listings updates existing listings."""
|
||||||
|
# Insert initial listing
|
||||||
|
await listing_repository.upsert_listings([sample_rent_listing])
|
||||||
|
|
||||||
|
# Update the listing
|
||||||
|
sample_rent_listing.price = 3000.0
|
||||||
|
await listing_repository.upsert_listings([sample_rent_listing])
|
||||||
|
|
||||||
|
# Verify update
|
||||||
|
listings = await listing_repository.get_listings()
|
||||||
|
assert len(listings) == 1
|
||||||
|
assert listings[0].price == 3000.0
|
||||||
|
|
||||||
|
async def test_mark_seen_updates_timestamp(
|
||||||
|
self,
|
||||||
|
listing_repository: ListingRepository,
|
||||||
|
sample_rent_listing: RentListing,
|
||||||
|
) -> None:
|
||||||
|
"""Test that mark_seen updates the last_seen timestamp."""
|
||||||
|
# Set an old timestamp
|
||||||
|
old_time = datetime.now() - timedelta(days=7)
|
||||||
|
sample_rent_listing.last_seen = old_time
|
||||||
|
await listing_repository.upsert_listings([sample_rent_listing])
|
||||||
|
|
||||||
|
# Mark as seen
|
||||||
|
await listing_repository.mark_seen(sample_rent_listing.id)
|
||||||
|
|
||||||
|
# Verify timestamp was updated
|
||||||
|
listings = await listing_repository.get_listings()
|
||||||
|
assert len(listings) == 1
|
||||||
|
assert listings[0].last_seen > old_time
|
||||||
|
|
||||||
|
async def test_mark_seen_nonexistent_listing(
|
||||||
|
self, listing_repository: ListingRepository
|
||||||
|
) -> None:
|
||||||
|
"""Test that mark_seen handles nonexistent listings gracefully."""
|
||||||
|
# Should not raise an exception
|
||||||
|
await listing_repository.mark_seen(999999)
|
||||||
|
|
||||||
|
async def test_get_listings_with_only_ids(
|
||||||
|
self,
|
||||||
|
listing_repository: ListingRepository,
|
||||||
|
sample_rent_listings: list[RentListing],
|
||||||
|
) -> None:
|
||||||
|
"""Test that get_listings filters by only_ids."""
|
||||||
|
await listing_repository.upsert_listings(sample_rent_listings)
|
||||||
|
|
||||||
|
# Request only specific IDs
|
||||||
|
listings = await listing_repository.get_listings(only_ids=[1, 3])
|
||||||
|
assert len(listings) == 2
|
||||||
|
listing_ids = [l.id for l in listings]
|
||||||
|
assert 1 in listing_ids
|
||||||
|
assert 3 in listing_ids
|
||||||
|
assert 2 not in listing_ids
|
||||||
|
|
||||||
|
async def test_get_listings_with_limit(
|
||||||
|
self,
|
||||||
|
listing_repository: ListingRepository,
|
||||||
|
sample_rent_listings: list[RentListing],
|
||||||
|
) -> None:
|
||||||
|
"""Test that get_listings respects limit parameter."""
|
||||||
|
await listing_repository.upsert_listings(sample_rent_listings)
|
||||||
|
|
||||||
|
listings = await listing_repository.get_listings(limit=2)
|
||||||
|
assert len(listings) == 2
|
||||||
|
|
||||||
|
|
||||||
|
class TestListingRepositoryFilters:
|
||||||
|
"""Tests for ListingRepository query parameter filtering."""
|
||||||
|
|
||||||
|
async def test_filter_by_bedrooms(
|
||||||
|
self,
|
||||||
|
listing_repository: ListingRepository,
|
||||||
|
sample_rent_listings: list[RentListing],
|
||||||
|
) -> None:
|
||||||
|
"""Test filtering by bedroom count."""
|
||||||
|
await listing_repository.upsert_listings(sample_rent_listings)
|
||||||
|
|
||||||
|
query_params = QueryParameters(
|
||||||
|
listing_type=ListingType.RENT,
|
||||||
|
min_bedrooms=2,
|
||||||
|
max_bedrooms=2,
|
||||||
|
)
|
||||||
|
listings = await listing_repository.get_listings(query_parameters=query_params)
|
||||||
|
assert len(listings) == 1
|
||||||
|
assert listings[0].number_of_bedrooms == 2
|
||||||
|
|
||||||
|
async def test_filter_by_price_range(
|
||||||
|
self,
|
||||||
|
listing_repository: ListingRepository,
|
||||||
|
sample_rent_listings: list[RentListing],
|
||||||
|
) -> None:
|
||||||
|
"""Test filtering by price range."""
|
||||||
|
await listing_repository.upsert_listings(sample_rent_listings)
|
||||||
|
|
||||||
|
query_params = QueryParameters(
|
||||||
|
listing_type=ListingType.RENT,
|
||||||
|
min_price=1800,
|
||||||
|
max_price=2500,
|
||||||
|
)
|
||||||
|
listings = await listing_repository.get_listings(query_parameters=query_params)
|
||||||
|
assert len(listings) == 1
|
||||||
|
assert listings[0].price == 2000.0
|
||||||
|
|
||||||
|
async def test_filter_by_min_sqm(
|
||||||
|
self,
|
||||||
|
listing_repository: ListingRepository,
|
||||||
|
sample_rent_listings: list[RentListing],
|
||||||
|
) -> None:
|
||||||
|
"""Test filtering by minimum square meters."""
|
||||||
|
await listing_repository.upsert_listings(sample_rent_listings)
|
||||||
|
|
||||||
|
query_params = QueryParameters(
|
||||||
|
listing_type=ListingType.RENT,
|
||||||
|
min_sqm=60,
|
||||||
|
)
|
||||||
|
listings = await listing_repository.get_listings(query_parameters=query_params)
|
||||||
|
assert len(listings) == 1
|
||||||
|
assert listings[0].square_meters == 80.0
|
||||||
|
|
||||||
|
async def test_filter_by_furnish_type(
|
||||||
|
self,
|
||||||
|
listing_repository: ListingRepository,
|
||||||
|
sample_rent_listings: list[RentListing],
|
||||||
|
) -> None:
|
||||||
|
"""Test filtering by furnish type."""
|
||||||
|
await listing_repository.upsert_listings(sample_rent_listings)
|
||||||
|
|
||||||
|
query_params = QueryParameters(
|
||||||
|
listing_type=ListingType.RENT,
|
||||||
|
furnish_types=[FurnishType.UNFURNISHED],
|
||||||
|
)
|
||||||
|
listings = await listing_repository.get_listings(query_parameters=query_params)
|
||||||
|
assert len(listings) == 1
|
||||||
|
assert listings[0].furnish_type == FurnishType.UNFURNISHED
|
||||||
|
|
||||||
|
async def test_filter_by_last_seen_days(
|
||||||
|
self,
|
||||||
|
listing_repository: ListingRepository,
|
||||||
|
sample_rent_listings: list[RentListing],
|
||||||
|
) -> None:
|
||||||
|
"""Test filtering by last_seen_days."""
|
||||||
|
# Make one listing old
|
||||||
|
sample_rent_listings[0].last_seen = datetime.now() - timedelta(days=30)
|
||||||
|
await listing_repository.upsert_listings(sample_rent_listings)
|
||||||
|
|
||||||
|
query_params = QueryParameters(
|
||||||
|
listing_type=ListingType.RENT,
|
||||||
|
last_seen_days=7,
|
||||||
|
)
|
||||||
|
listings = await listing_repository.get_listings(query_parameters=query_params)
|
||||||
|
# Only 2 should be recent enough
|
||||||
|
assert len(listings) == 2
|
||||||
|
|
||||||
|
async def test_combined_filters(
|
||||||
|
self,
|
||||||
|
listing_repository: ListingRepository,
|
||||||
|
sample_rent_listings: list[RentListing],
|
||||||
|
) -> None:
|
||||||
|
"""Test combining multiple filters."""
|
||||||
|
await listing_repository.upsert_listings(sample_rent_listings)
|
||||||
|
|
||||||
|
query_params = QueryParameters(
|
||||||
|
listing_type=ListingType.RENT,
|
||||||
|
min_bedrooms=1,
|
||||||
|
max_bedrooms=2,
|
||||||
|
min_price=1000,
|
||||||
|
max_price=2500,
|
||||||
|
furnish_types=[FurnishType.FURNISHED, FurnishType.UNFURNISHED],
|
||||||
|
)
|
||||||
|
listings = await listing_repository.get_listings(query_parameters=query_params)
|
||||||
|
# Should match listings with 1-2 bedrooms in price range
|
||||||
|
assert len(listings) == 2
|
||||||
293
crawler/tests/unit/test_schedule_config.py
Normal file
293
crawler/tests/unit/test_schedule_config.py
Normal file
|
|
@ -0,0 +1,293 @@
|
||||||
|
"""Unit tests for schedule configuration."""
|
||||||
|
import os
|
||||||
|
from unittest import mock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from pydantic import ValidationError
|
||||||
|
|
||||||
|
from config.schedule_config import ScheduleConfig, SchedulesConfig
|
||||||
|
from models.listing import FurnishType, ListingType
|
||||||
|
|
||||||
|
|
||||||
|
class TestScheduleConfig:
|
||||||
|
"""Tests for ScheduleConfig model."""
|
||||||
|
|
||||||
|
def test_basic_creation_with_defaults(self):
|
||||||
|
"""Test creating a schedule with minimal required fields."""
|
||||||
|
schedule = ScheduleConfig(name="Test Schedule", listing_type=ListingType.RENT)
|
||||||
|
|
||||||
|
assert schedule.name == "Test Schedule"
|
||||||
|
assert schedule.enabled is True
|
||||||
|
assert schedule.minute == "0"
|
||||||
|
assert schedule.hour == "2"
|
||||||
|
assert schedule.day_of_week == "*"
|
||||||
|
assert schedule.listing_type == ListingType.RENT
|
||||||
|
assert schedule.min_bedrooms == 1
|
||||||
|
assert schedule.max_bedrooms == 999
|
||||||
|
assert schedule.min_price == 0
|
||||||
|
assert schedule.max_price == 10_000_000
|
||||||
|
assert schedule.district_names == []
|
||||||
|
assert schedule.furnish_types is None
|
||||||
|
|
||||||
|
def test_full_creation(self):
|
||||||
|
"""Test creating a schedule with all fields specified."""
|
||||||
|
schedule = ScheduleConfig(
|
||||||
|
name="Full Schedule",
|
||||||
|
enabled=False,
|
||||||
|
minute="30",
|
||||||
|
hour="4",
|
||||||
|
day_of_week="1,3,5",
|
||||||
|
listing_type=ListingType.BUY,
|
||||||
|
min_bedrooms=2,
|
||||||
|
max_bedrooms=3,
|
||||||
|
min_price=400000,
|
||||||
|
max_price=800000,
|
||||||
|
district_names=["Westminster", "Camden"],
|
||||||
|
furnish_types=["furnished", "unfurnished"],
|
||||||
|
)
|
||||||
|
|
||||||
|
assert schedule.name == "Full Schedule"
|
||||||
|
assert schedule.enabled is False
|
||||||
|
assert schedule.minute == "30"
|
||||||
|
assert schedule.hour == "4"
|
||||||
|
assert schedule.day_of_week == "1,3,5"
|
||||||
|
assert schedule.listing_type == ListingType.BUY
|
||||||
|
assert schedule.min_bedrooms == 2
|
||||||
|
assert schedule.max_bedrooms == 3
|
||||||
|
assert schedule.min_price == 400000
|
||||||
|
assert schedule.max_price == 800000
|
||||||
|
assert schedule.district_names == ["Westminster", "Camden"]
|
||||||
|
assert schedule.furnish_types == ["furnished", "unfurnished"]
|
||||||
|
|
||||||
|
def test_to_query_parameters(self):
|
||||||
|
"""Test conversion to QueryParameters."""
|
||||||
|
schedule = ScheduleConfig(
|
||||||
|
name="Test",
|
||||||
|
listing_type=ListingType.RENT,
|
||||||
|
min_bedrooms=2,
|
||||||
|
max_bedrooms=3,
|
||||||
|
min_price=2000,
|
||||||
|
max_price=4000,
|
||||||
|
district_names=["Westminster"],
|
||||||
|
furnish_types=["furnished"],
|
||||||
|
)
|
||||||
|
|
||||||
|
params = schedule.to_query_parameters()
|
||||||
|
|
||||||
|
assert params.listing_type == ListingType.RENT
|
||||||
|
assert params.min_bedrooms == 2
|
||||||
|
assert params.max_bedrooms == 3
|
||||||
|
assert params.min_price == 2000
|
||||||
|
assert params.max_price == 4000
|
||||||
|
assert params.district_names == {"Westminster"}
|
||||||
|
assert params.furnish_types == [FurnishType.FURNISHED]
|
||||||
|
|
||||||
|
def test_to_query_parameters_no_furnish_types(self):
|
||||||
|
"""Test conversion when furnish_types is None."""
|
||||||
|
schedule = ScheduleConfig(
|
||||||
|
name="Test",
|
||||||
|
listing_type=ListingType.BUY,
|
||||||
|
)
|
||||||
|
|
||||||
|
params = schedule.to_query_parameters()
|
||||||
|
|
||||||
|
assert params.furnish_types is None
|
||||||
|
|
||||||
|
|
||||||
|
class TestCronValidation:
|
||||||
|
"""Tests for cron field validation."""
|
||||||
|
|
||||||
|
# Valid minute values
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"minute",
|
||||||
|
[
|
||||||
|
"0",
|
||||||
|
"59",
|
||||||
|
"*",
|
||||||
|
"*/5",
|
||||||
|
"*/15",
|
||||||
|
"0,15,30,45",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_valid_minute(self, minute: str):
|
||||||
|
"""Test valid minute values are accepted."""
|
||||||
|
schedule = ScheduleConfig(
|
||||||
|
name="Test", listing_type=ListingType.RENT, minute=minute
|
||||||
|
)
|
||||||
|
assert schedule.minute == minute
|
||||||
|
|
||||||
|
# Invalid minute values
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"minute",
|
||||||
|
[
|
||||||
|
"60",
|
||||||
|
"-1",
|
||||||
|
"abc",
|
||||||
|
"*/0",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_invalid_minute(self, minute: str):
|
||||||
|
"""Test invalid minute values are rejected."""
|
||||||
|
with pytest.raises(ValidationError):
|
||||||
|
ScheduleConfig(name="Test", listing_type=ListingType.RENT, minute=minute)
|
||||||
|
|
||||||
|
# Valid hour values
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"hour",
|
||||||
|
[
|
||||||
|
"0",
|
||||||
|
"23",
|
||||||
|
"*",
|
||||||
|
"*/6",
|
||||||
|
"0,6,12,18",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_valid_hour(self, hour: str):
|
||||||
|
"""Test valid hour values are accepted."""
|
||||||
|
schedule = ScheduleConfig(
|
||||||
|
name="Test", listing_type=ListingType.RENT, hour=hour
|
||||||
|
)
|
||||||
|
assert schedule.hour == hour
|
||||||
|
|
||||||
|
# Invalid hour values
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"hour",
|
||||||
|
[
|
||||||
|
"24",
|
||||||
|
"-1",
|
||||||
|
"abc",
|
||||||
|
"*/0",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_invalid_hour(self, hour: str):
|
||||||
|
"""Test invalid hour values are rejected."""
|
||||||
|
with pytest.raises(ValidationError):
|
||||||
|
ScheduleConfig(name="Test", listing_type=ListingType.RENT, hour=hour)
|
||||||
|
|
||||||
|
# Valid day_of_week values
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"day_of_week",
|
||||||
|
[
|
||||||
|
"0",
|
||||||
|
"6",
|
||||||
|
"*",
|
||||||
|
"1,3,5",
|
||||||
|
"*/2",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_valid_day_of_week(self, day_of_week: str):
|
||||||
|
"""Test valid day_of_week values are accepted."""
|
||||||
|
schedule = ScheduleConfig(
|
||||||
|
name="Test", listing_type=ListingType.RENT, day_of_week=day_of_week
|
||||||
|
)
|
||||||
|
assert schedule.day_of_week == day_of_week
|
||||||
|
|
||||||
|
# Invalid day_of_week values
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"day_of_week",
|
||||||
|
[
|
||||||
|
"7",
|
||||||
|
"-1",
|
||||||
|
"abc",
|
||||||
|
"*/0",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_invalid_day_of_week(self, day_of_week: str):
|
||||||
|
"""Test invalid day_of_week values are rejected."""
|
||||||
|
with pytest.raises(ValidationError):
|
||||||
|
ScheduleConfig(
|
||||||
|
name="Test", listing_type=ListingType.RENT, day_of_week=day_of_week
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestSchedulesConfig:
|
||||||
|
"""Tests for SchedulesConfig container."""
|
||||||
|
|
||||||
|
def test_from_env_empty(self):
|
||||||
|
"""Test loading from empty environment variable."""
|
||||||
|
with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": ""}, clear=False):
|
||||||
|
config = SchedulesConfig.from_env()
|
||||||
|
assert config.schedules == []
|
||||||
|
|
||||||
|
def test_from_env_missing(self):
|
||||||
|
"""Test loading when environment variable is not set."""
|
||||||
|
with mock.patch.dict(os.environ, {}, clear=True):
|
||||||
|
# Ensure SCRAPE_SCHEDULES is not set
|
||||||
|
os.environ.pop("SCRAPE_SCHEDULES", None)
|
||||||
|
config = SchedulesConfig.from_env()
|
||||||
|
assert config.schedules == []
|
||||||
|
|
||||||
|
def test_from_env_valid_single(self):
|
||||||
|
"""Test loading a single valid schedule."""
|
||||||
|
json_config = '[{"name":"Daily RENT","listing_type":"RENT","hour":"2"}]'
|
||||||
|
with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": json_config}):
|
||||||
|
config = SchedulesConfig.from_env()
|
||||||
|
|
||||||
|
assert len(config.schedules) == 1
|
||||||
|
assert config.schedules[0].name == "Daily RENT"
|
||||||
|
assert config.schedules[0].listing_type == ListingType.RENT
|
||||||
|
assert config.schedules[0].hour == "2"
|
||||||
|
|
||||||
|
def test_from_env_valid_multiple(self):
|
||||||
|
"""Test loading multiple valid schedules."""
|
||||||
|
json_config = """[
|
||||||
|
{"name":"Daily RENT","listing_type":"RENT","hour":"2"},
|
||||||
|
{"name":"Daily BUY","listing_type":"BUY","hour":"4","enabled":false}
|
||||||
|
]"""
|
||||||
|
with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": json_config}):
|
||||||
|
config = SchedulesConfig.from_env()
|
||||||
|
|
||||||
|
assert len(config.schedules) == 2
|
||||||
|
assert config.schedules[0].name == "Daily RENT"
|
||||||
|
assert config.schedules[0].enabled is True
|
||||||
|
assert config.schedules[1].name == "Daily BUY"
|
||||||
|
assert config.schedules[1].enabled is False
|
||||||
|
|
||||||
|
def test_from_env_invalid_json(self):
|
||||||
|
"""Test error on invalid JSON."""
|
||||||
|
with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": "not json"}):
|
||||||
|
with pytest.raises(ValueError, match="Invalid JSON"):
|
||||||
|
SchedulesConfig.from_env()
|
||||||
|
|
||||||
|
def test_from_env_not_array(self):
|
||||||
|
"""Test error when JSON is not an array."""
|
||||||
|
with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": '{"name":"test"}'}):
|
||||||
|
with pytest.raises(ValueError, match="must be a JSON array"):
|
||||||
|
SchedulesConfig.from_env()
|
||||||
|
|
||||||
|
def test_from_env_invalid_schedule(self):
|
||||||
|
"""Test error when schedule validation fails."""
|
||||||
|
# Missing required listing_type
|
||||||
|
json_config = '[{"name":"Invalid"}]'
|
||||||
|
with mock.patch.dict(os.environ, {"SCRAPE_SCHEDULES": json_config}):
|
||||||
|
with pytest.raises(ValidationError):
|
||||||
|
SchedulesConfig.from_env()
|
||||||
|
|
||||||
|
def test_get_enabled_schedules(self):
|
||||||
|
"""Test filtering to only enabled schedules."""
|
||||||
|
config = SchedulesConfig(
|
||||||
|
schedules=[
|
||||||
|
ScheduleConfig(name="Enabled", listing_type=ListingType.RENT, enabled=True),
|
||||||
|
ScheduleConfig(name="Disabled", listing_type=ListingType.BUY, enabled=False),
|
||||||
|
ScheduleConfig(name="Also Enabled", listing_type=ListingType.RENT, enabled=True),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
enabled = config.get_enabled_schedules()
|
||||||
|
|
||||||
|
assert len(enabled) == 2
|
||||||
|
assert enabled[0].name == "Enabled"
|
||||||
|
assert enabled[1].name == "Also Enabled"
|
||||||
|
|
||||||
|
def test_get_enabled_schedules_all_disabled(self):
|
||||||
|
"""Test when all schedules are disabled."""
|
||||||
|
config = SchedulesConfig(
|
||||||
|
schedules=[
|
||||||
|
ScheduleConfig(name="Disabled1", listing_type=ListingType.RENT, enabled=False),
|
||||||
|
ScheduleConfig(name="Disabled2", listing_type=ListingType.BUY, enabled=False),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
enabled = config.get_enabled_schedules()
|
||||||
|
|
||||||
|
assert len(enabled) == 0
|
||||||
4
crawler/utils/__init__.py
Normal file
4
crawler/utils/__init__.py
Normal file
|
|
@ -0,0 +1,4 @@
|
||||||
|
"""Utility modules."""
|
||||||
|
from utils.redis_lock import redis_lock
|
||||||
|
|
||||||
|
__all__ = ["redis_lock"]
|
||||||
50
crawler/utils/redis_lock.py
Normal file
50
crawler/utils/redis_lock.py
Normal file
|
|
@ -0,0 +1,50 @@
|
||||||
|
"""Redis-based distributed locking for task coordination."""
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from typing import Generator
|
||||||
|
|
||||||
|
import redis
|
||||||
|
|
||||||
|
logger = logging.getLogger("uvicorn.error")
|
||||||
|
|
||||||
|
|
||||||
|
def get_redis_client() -> redis.Redis:
|
||||||
|
"""Get Redis client from Celery broker URL."""
|
||||||
|
broker_url = os.getenv("CELERY_BROKER_URL", "redis://localhost:6379/0")
|
||||||
|
return redis.from_url(broker_url, decode_responses=True)
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def redis_lock(
|
||||||
|
lock_name: str, timeout: int = 3600 * 4
|
||||||
|
) -> Generator[bool, None, None]:
|
||||||
|
"""Distributed lock using Redis.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
lock_name: Unique name for the lock
|
||||||
|
timeout: Lock expiration time in seconds (default: 4 hours)
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
bool: True if lock was acquired, False otherwise
|
||||||
|
|
||||||
|
Example:
|
||||||
|
with redis_lock("scrape_listings") as acquired:
|
||||||
|
if not acquired:
|
||||||
|
logger.warning("Another scrape is already running")
|
||||||
|
return
|
||||||
|
# ... do work ...
|
||||||
|
"""
|
||||||
|
client = get_redis_client()
|
||||||
|
lock_key = f"lock:{lock_name}"
|
||||||
|
|
||||||
|
# Try to acquire the lock
|
||||||
|
acquired = client.set(lock_key, "1", nx=True, ex=timeout)
|
||||||
|
|
||||||
|
try:
|
||||||
|
yield bool(acquired)
|
||||||
|
finally:
|
||||||
|
# Release the lock only if we acquired it
|
||||||
|
if acquired:
|
||||||
|
client.delete(lock_key)
|
||||||
|
logger.info(f"Released lock: {lock_name}")
|
||||||
Loading…
Add table
Add a link
Reference in a new issue