fixing bugs, adding properties for querying and analysis
This commit is contained in:
parent
d108bf11ee
commit
de2639f9c3
6 changed files with 2848 additions and 6 deletions
6
crawler/.ipynb_checkpoints/exploration-checkpoint.ipynb
Normal file
6
crawler/.ipynb_checkpoints/exploration-checkpoint.ipynb
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
{
|
||||||
|
"cells": [],
|
||||||
|
"metadata": {},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
25
crawler/.ipynb_checkpoints/pyproject-checkpoint.toml
Normal file
25
crawler/.ipynb_checkpoints/pyproject-checkpoint.toml
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
[tool.poetry]
|
||||||
|
name = "rec"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = ""
|
||||||
|
authors = ["Kadir Tugan <git@k8n.dev>"]
|
||||||
|
|
||||||
|
[tool.poetry.dependencies]
|
||||||
|
python = ">3.11"
|
||||||
|
SQLAlchemy = "^2.0.23"
|
||||||
|
requests = "^2.31.0"
|
||||||
|
cachetools = "^5.3.2"
|
||||||
|
diskcache = "^5.6.3"
|
||||||
|
tqdm = "^4.66.2"
|
||||||
|
pillow = "^10.2.0"
|
||||||
|
torch = "^2.2.1"
|
||||||
|
numpy = "^1.26.4"
|
||||||
|
transformers = "^4.38.2"
|
||||||
|
pytesseract = "^0.3.10"
|
||||||
|
jupyterlab = "^4.1.4"
|
||||||
|
|
||||||
|
[tool.poetry.dev-dependencies]
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["poetry-core>=1.0.0"]
|
||||||
|
build-backend = "poetry.core.masonry.api"
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import json
|
import json
|
||||||
import pathlib
|
import pathlib
|
||||||
from typing import List
|
from typing import List, Dict
|
||||||
from rec import floorplan
|
from rec import floorplan
|
||||||
|
|
||||||
_DATA_DIR = pathlib.Path('data/rs/')
|
_DATA_DIR = pathlib.Path('data/rs/')
|
||||||
|
|
@ -9,6 +9,7 @@ _DATA_DIR = pathlib.Path('data/rs/')
|
||||||
@dataclass()
|
@dataclass()
|
||||||
class Listing():
|
class Listing():
|
||||||
identifier: int
|
identifier: int
|
||||||
|
_cached: Dict = None
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_all_listings() -> List['Listing']:
|
def get_all_listings() -> List['Listing']:
|
||||||
|
|
@ -70,7 +71,7 @@ class Listing():
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def sqm_model(self, recalculate=True):
|
def sqm_model(self, recalculate=True):
|
||||||
if recalculate and not self.path_floorplan_model_json().exists():
|
if not self.path_floorplan_model_json().exists() or recalculate:
|
||||||
self.calculate_sqm_model()
|
self.calculate_sqm_model()
|
||||||
|
|
||||||
with open(self.path_floorplan_json()) as f:
|
with open(self.path_floorplan_json()) as f:
|
||||||
|
|
@ -93,16 +94,49 @@ class Listing():
|
||||||
json.dump(objs, f)
|
json.dump(objs, f)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def sqm_ocr(self, recalculate=True):
|
def sqm_ocr(self, recalculate=False):
|
||||||
if recalculate and not self.path_floorplan_ocr_json().exists():
|
if not self.path_floorplan_ocr_json().exists() or recalculate:
|
||||||
self.calculate_sqm_ocr()
|
self.calculate_sqm_ocr()
|
||||||
|
|
||||||
with open(self.path_floorplan_ocr_json()) as f:
|
with open(self.path_floorplan_ocr_json()) as f:
|
||||||
objs = json.load(f)
|
objs = json.load(f)
|
||||||
|
|
||||||
max_sqm = max([o['estimated_sqm'] for o in objs if o is None]) # filter out Nones
|
|
||||||
|
sqms = [o['estimated_sqm'] for o in objs if o['estimated_sqm'] is not None]
|
||||||
|
if len(sqms) == 0:
|
||||||
|
return None
|
||||||
|
max_sqm = max(sqms)
|
||||||
return max_sqm
|
return max_sqm
|
||||||
|
|
||||||
|
@property
|
||||||
|
def url(self):
|
||||||
|
return f'https://www.rightmove.co.uk/properties/{self.identifier}'
|
||||||
|
|
||||||
|
@property
|
||||||
|
def detailobject(self):
|
||||||
|
if self._cached is None:
|
||||||
|
with open(self.path_detail_json()) as f:
|
||||||
|
self._cached = json.load(f)
|
||||||
|
return self._cached
|
||||||
|
|
||||||
|
@property
|
||||||
|
def price(self) -> float:
|
||||||
|
return self.detailobject['property']['price']
|
||||||
|
|
||||||
|
@property
|
||||||
|
def price_per_sqm(self) -> float:
|
||||||
|
if self.sqm_ocr is None:
|
||||||
|
return None
|
||||||
|
return self.price / self.sqm_ocr
|
||||||
|
|
||||||
|
def dict_nicely(self):
|
||||||
|
return {
|
||||||
|
'sqm_ocr': self.sqm_ocr,
|
||||||
|
'price': self.price,
|
||||||
|
'price_per_sqm': self.price_per_sqm,
|
||||||
|
'url': self.url,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
1030
crawler/exploration.ipynb
Normal file
1030
crawler/exploration.ipynb
Normal file
File diff suppressed because it is too large
Load diff
1747
crawler/poetry.lock
generated
1747
crawler/poetry.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@ -16,6 +16,8 @@ torch = "^2.2.1"
|
||||||
numpy = "^1.26.4"
|
numpy = "^1.26.4"
|
||||||
transformers = "^4.38.2"
|
transformers = "^4.38.2"
|
||||||
pytesseract = "^0.3.10"
|
pytesseract = "^0.3.10"
|
||||||
|
jupyterlab = "^4.1.4"
|
||||||
|
pandas = "^2.2.1"
|
||||||
|
|
||||||
[tool.poetry.dev-dependencies]
|
[tool.poetry.dev-dependencies]
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue