Flatten repo structure: move crawler/ to root, remove vqa/ and immoweb/
The crawler subdirectory was the only active project. Moving it to the repo root simplifies paths and removes the unnecessary nesting. The vqa/ and immoweb/ directories were legacy/unused and have been removed. Updated .drone.yml, .gitignore, .claude/ docs, and skills to reflect the new flat structure.
This commit is contained in:
parent
e2247be700
commit
eafbc1ac52
221 changed files with 70 additions and 146140 deletions
40
csv_exporter.py
Normal file
40
csv_exporter.py
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
from pathlib import Path
|
||||
import pandas as pd
|
||||
from models.listing import QueryParameters
|
||||
from repositories.listing_repository import ListingRepository
|
||||
|
||||
|
||||
async def export_to_csv(
    repository: ListingRepository,
    output_file: Path,
    query_parameters: QueryParameters | None = None,
    *,
    decisions_path: Path | str = "data/decisions.json",
) -> None:
    """Export the listings returned by ``repository`` to a CSV file.

    Each listing's instance attributes become one row. The rows get a
    ``decision`` column looked up from the decisions file, sentinel values
    for missing numeric fields, and a derived ``price_per_sqm`` column. They
    are then written to ``output_file`` sorted by ascending price per square
    metre.

    Args:
        repository: Source of listings; only its awaitable
            ``get_listings(query_parameters=...)`` method is used.
        output_file: Destination path of the CSV file.
        query_parameters: Passed through unchanged to
            ``repository.get_listings``.
        decisions_path: JSON file read with ``pandas.read_json``. Defaults
            to the path that used to be hard-coded, which is relative to the
            current working directory.

    Raises:
        Whatever ``pandas.read_json`` raises when ``decisions_path`` cannot
        be read or parsed; those errors are not handled here.
    """
    listings = await repository.get_listings(query_parameters=query_parameters)
    df = pd.DataFrame([vars(listing) for listing in listings])

    if df.empty:
        # No rows means no "id"/"price" columns to work with; write an empty
        # export instead of failing on the column accesses below.
        df.to_csv(output_file, index=False)
        return

    # Attach the recorded decision for each listing id.
    # NOTE(review): pd.read_json returns a DataFrame by default, so
    # decisions.get(listing_id) is a column lookup that yields None when no
    # column carries that label. Confirm this matches the layout of the
    # decisions file (typ="series" may be what was intended).
    decisions = pd.read_json(decisions_path)
    df["decision"] = df["id"].apply(lambda listing_id: decisions.get(listing_id))

    # Drop ORM bookkeeping and the free-form info column. errors="ignore"
    # keeps the export working when a listing does not carry one of them.
    df = df.drop(columns=["_sa_instance_state", "additional_info"], errors="ignore")

    # Fill gaps with a -1 sentinel so the columns stay filterable in Excel.
    for column in ("service_charge", "lease_left", "square_meters"):
        if column not in df.columns:
            df[column] = -1
        df[column] = df[column].fillna(-1)

    # Price per square metre is only meaningful for a known, positive area.
    # Masking the rest to NaN keeps the -1 sentinel (or a zero area) from
    # producing negative or infinite values that would sort to the top.
    area = df["square_meters"]
    df["price_per_sqm"] = (df["price"] / area).where(area > 0)

    # Cheapest first; rows without a usable price per sqm go to the end.
    df = df.sort_values(by="price_per_sqm", ascending=True, na_position="last")
    df.to_csv(output_file, index=False)
|
||||
Loading…
Add table
Add a link
Reference in a new issue