fixing bugs, adding properties for querying and analysis

This commit is contained in:
Kadir 2024-03-11 09:44:37 +00:00
parent d108bf11ee
commit de2639f9c3
6 changed files with 2848 additions and 6 deletions

View file

@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@ -0,0 +1,25 @@
[tool.poetry]
name = "rec"
version = "0.1.0"
description = ""
authors = ["Kadir Tugan <git@k8n.dev>"]
[tool.poetry.dependencies]
python = ">3.11"
SQLAlchemy = "^2.0.23"
requests = "^2.31.0"
cachetools = "^5.3.2"
diskcache = "^5.6.3"
tqdm = "^4.66.2"
pillow = "^10.2.0"
torch = "^2.2.1"
numpy = "^1.26.4"
transformers = "^4.38.2"
pytesseract = "^0.3.10"
jupyterlab = "^4.1.4"
[tool.poetry.dev-dependencies]
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

View file

@ -1,7 +1,7 @@
from dataclasses import dataclass from dataclasses import dataclass
import json import json
import pathlib import pathlib
from typing import List from typing import List, Dict
from rec import floorplan from rec import floorplan
_DATA_DIR = pathlib.Path('data/rs/') _DATA_DIR = pathlib.Path('data/rs/')
@ -9,6 +9,7 @@ _DATA_DIR = pathlib.Path('data/rs/')
@dataclass() @dataclass()
class Listing(): class Listing():
identifier: int identifier: int
_cached: Dict = None
@staticmethod @staticmethod
def get_all_listings() -> List['Listing']: def get_all_listings() -> List['Listing']:
@ -70,7 +71,7 @@ class Listing():
@property @property
def sqm_model(self, recalculate=True): def sqm_model(self, recalculate=True):
if recalculate and not self.path_floorplan_model_json().exists(): if not self.path_floorplan_model_json().exists() or recalculate:
self.calculate_sqm_model() self.calculate_sqm_model()
with open(self.path_floorplan_json()) as f: with open(self.path_floorplan_json()) as f:
@ -93,16 +94,49 @@ class Listing():
json.dump(objs, f) json.dump(objs, f)
@property @property
def sqm_ocr(self, recalculate=True): def sqm_ocr(self, recalculate=False):
if recalculate and not self.path_floorplan_ocr_json().exists(): if not self.path_floorplan_ocr_json().exists() or recalculate:
self.calculate_sqm_ocr() self.calculate_sqm_ocr()
with open(self.path_floorplan_ocr_json()) as f: with open(self.path_floorplan_ocr_json()) as f:
objs = json.load(f) objs = json.load(f)
max_sqm = max([o['estimated_sqm'] for o in objs if o is None]) # filter out Nones
sqms = [o['estimated_sqm'] for o in objs if o['estimated_sqm'] is not None]
if len(sqms) == 0:
return None
max_sqm = max(sqms)
return max_sqm return max_sqm
@property
def url(self):
    """Return the rightmove.co.uk property page URL built from ``self.identifier``."""
    listing_id = self.identifier
    return f'https://www.rightmove.co.uk/properties/{listing_id}'
@property
def detailobject(self):
    """Return the parsed detail JSON for this listing, loading it lazily.

    On first access the file at ``self.path_detail_json()`` is read and the
    parsed result is stored in ``self._cached``; later accesses return that
    stored object without reading the file again.
    """
    if self._cached is None:
        # JSON text is UTF-8 by specification; be explicit so decoding does
        # not depend on the platform's locale default.
        with open(self.path_detail_json(), encoding='utf-8') as f:
            self._cached = json.load(f)
    return self._cached
@property
def price(self) -> float:
    """Return the ``['property']['price']`` entry of ``self.detailobject``."""
    property_section = self.detailobject['property']
    return property_section['price']
@property
def price_per_sqm(self) -> float:
if self.sqm_ocr is None:
return None
return self.price / self.sqm_ocr
def dict_nicely(self):
    """Return a plain dict with this listing's headline values.

    The keys are ``sqm_ocr``, ``price``, ``price_per_sqm`` and ``url``, in
    that order; each value is read from the attribute of the same name.
    """
    summary_fields = ('sqm_ocr', 'price', 'price_per_sqm', 'url')
    return {field: getattr(self, field) for field in summary_fields}

1030
crawler/exploration.ipynb Normal file

File diff suppressed because it is too large Load diff

1747
crawler/poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@ -16,6 +16,8 @@ torch = "^2.2.1"
numpy = "^1.26.4" numpy = "^1.26.4"
transformers = "^4.38.2" transformers = "^4.38.2"
pytesseract = "^0.3.10" pytesseract = "^0.3.10"
jupyterlab = "^4.1.4"
pandas = "^2.2.1"
[tool.poetry.dev-dependencies] [tool.poetry.dev-dependencies]