fixing bugs, adding properties for querying and analysis

This commit is contained in:
Kadir 2024-03-11 09:44:37 +00:00
parent d108bf11ee
commit de2639f9c3
6 changed files with 2848 additions and 6 deletions

View file

@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}

View file

@ -0,0 +1,25 @@
[tool.poetry]
name = "rec"
version = "0.1.0"
description = ""
authors = ["Kadir Tugan <git@k8n.dev>"]
[tool.poetry.dependencies]
python = ">3.11"
SQLAlchemy = "^2.0.23"
requests = "^2.31.0"
cachetools = "^5.3.2"
diskcache = "^5.6.3"
tqdm = "^4.66.2"
pillow = "^10.2.0"
torch = "^2.2.1"
numpy = "^1.26.4"
transformers = "^4.38.2"
pytesseract = "^0.3.10"
jupyterlab = "^4.1.4"
[tool.poetry.dev-dependencies]
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

View file

@ -1,7 +1,7 @@
from dataclasses import dataclass
import json
import pathlib
from typing import List
from typing import List, Dict
from rec import floorplan
_DATA_DIR = pathlib.Path('data/rs/')
@ -9,6 +9,7 @@ _DATA_DIR = pathlib.Path('data/rs/')
@dataclass()
class Listing():
identifier: int
_cached: Dict = None
@staticmethod
def get_all_listings() -> List['Listing']:
@ -70,7 +71,7 @@ class Listing():
@property
def sqm_model(self, recalculate=True):
if recalculate and not self.path_floorplan_model_json().exists():
if not self.path_floorplan_model_json().exists() or recalculate:
self.calculate_sqm_model()
with open(self.path_floorplan_json()) as f:
@ -93,16 +94,49 @@ class Listing():
json.dump(objs, f)
@property
def sqm_ocr(self, recalculate=False):
    """Return the largest OCR-estimated area stored for this listing.

    The estimates are read from the JSON file at
    ``self.path_floorplan_ocr_json()``.  When that file does not exist,
    or when ``recalculate`` is truthy, ``self.calculate_sqm_ocr()`` is
    called first to (re)generate it.

    NOTE: this is a property, so plain attribute access (``x.sqm_ocr``)
    always uses ``recalculate=False``; the argument can only be supplied
    by calling the underlying function (``Listing.sqm_ocr.fget``).

    Returns:
        The maximum non-null ``estimated_sqm`` value, or ``None`` when
        the file contains no usable estimate.
    """
    if not self.path_floorplan_ocr_json().exists() or recalculate:
        self.calculate_sqm_ocr()

    # JSON is UTF-8 by specification; do not rely on the locale default.
    with open(self.path_floorplan_ocr_json(), encoding='utf-8') as f:
        objs = json.load(f)

    # Skip null entries as well as entries whose estimate is null, so a
    # floorplan that produced no result cannot raise a TypeError here.
    sqms = [
        o['estimated_sqm']
        for o in objs
        if o is not None and o['estimated_sqm'] is not None
    ]
    # default=None covers the "no estimate at all" case without a
    # separate emptiness check.
    return max(sqms, default=None)
@property
def url(self):
    """Build the rightmove.co.uk property page URL from ``self.identifier``."""
    listing_url = f'https://www.rightmove.co.uk/properties/{self.identifier}'
    return listing_url
@property
def detailobject(self):
    """Return the parsed detail JSON for this listing, loading it once.

    On first access the file at ``self.path_detail_json()`` is parsed and
    the result is stored in ``self._cached``; later accesses return that
    same object without touching the file again.
    """
    if self._cached is None:
        # Read explicitly as UTF-8 (the JSON interchange encoding) so the
        # result does not depend on the platform's locale encoding.
        with open(self.path_detail_json(), encoding='utf-8') as f:
            self._cached = json.load(f)
    return self._cached
@property
def price(self) -> float:
    """Return the ``price`` entry of the ``property`` section of the detail object."""
    property_section = self.detailobject['property']
    return property_section['price']
@property
def price_per_sqm(self) -> float:
if self.sqm_ocr is None:
return None
return self.price / self.sqm_ocr
def dict_nicely(self):
    """Return a dict of this listing's ``sqm_ocr``, ``price``, ``price_per_sqm`` and ``url``."""
    # Attribute names double as the output keys; order is preserved.
    field_names = ('sqm_ocr', 'price', 'price_per_sqm', 'url')
    return {name: getattr(self, name) for name in field_names}

1030
crawler/exploration.ipynb Normal file

File diff suppressed because it is too large Load diff

1747
crawler/poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@ -16,6 +16,8 @@ torch = "^2.2.1"
numpy = "^1.26.4"
transformers = "^4.38.2"
pytesseract = "^0.3.10"
jupyterlab = "^4.1.4"
pandas = "^2.2.1"
[tool.poetry.dev-dependencies]