{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "38e8690a-f6f7-4e14-a657-f20605477afd", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/kadir/code/realestate/crawler/venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "from data_access import Listing\n", "import pandas as pd" ] }, { "cell_type": "markdown", "id": "cfe2ab03-3204-4fd8-b76a-a734f6b87d75", "metadata": {}, "source": [ "### Fetch previous decisions" ] }, { "cell_type": "code", "execution_count": 2, "id": "424501ab-ecc6-42f5-b87e-b0d2871bdc74", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/wl/kx43lvyn6yv7lq988gwrkq_m0000gn/T/ipykernel_85865/3290399543.py:3: FutureWarning: The behavior of 'to_datetime' with 'unit' when parsing strings is deprecated. In a future version, strings will be parsed as datetime strings, matching the behavior without a 'unit'. To retain the old behavior, explicitly cast ints or floats to numeric type before calling to_datetime.\n", " decisions = pd.read_json(decisions_path)\n", "/var/folders/wl/kx43lvyn6yv7lq988gwrkq_m0000gn/T/ipykernel_85865/3290399543.py:3: FutureWarning: The behavior of 'to_datetime' with 'unit' when parsing strings is deprecated. In a future version, strings will be parsed as datetime strings, matching the behavior without a 'unit'. 
To retain the old behavior, explicitly cast ints or floats to numeric type before calling to_datetime.\n", " decisions = pd.read_json(decisions_path)\n" ] } ], "source": [ "# read decisions on file\n", "decisions_path = 'data/decisions.json'\n", "decisions = pd.read_json(decisions_path)" ] }, { "cell_type": "code", "execution_count": null, "id": "ed170ba4-700a-4e0e-9950-a80765cd751c", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 3, "id": "354237e5-a07e-4c0b-a775-223d5f701f2c", "metadata": {}, "outputs": [], "source": [ "# Merge decisions pasted from the clipboard into the ones loaded from file.\n", "# Reading the clipboard is best effort: if it is empty, unparseable or lacks\n", "# the expected columns there is simply nothing new to merge.\n", "try:\n", "    newdecisions = pd.read_clipboard()\n", "    newdecisions = newdecisions.loc[newdecisions.decision.notna(), ['identifier', 'decision']]\n", "except Exception as exc:  # not a bare except: keep kernel interrupts working\n", "    print(f'No new decisions read from clipboard: {exc!r}')\n", "    newdecisions = pd.DataFrame()\n", "else:\n", "    # Newly pasted decisions replace stored ones with the same identifier.\n", "    decisions = decisions[~decisions.identifier.isin(newdecisions.identifier)]\n", "    # ignore_index gives the combined frame a unique index; to_json with its\n", "    # default orient raises ValueError on duplicate index labels.\n", "    decisions = pd.concat([decisions, newdecisions], ignore_index=True)\n", "    # Not guarded by the except above, so a failed save surfaces instead of\n", "    # being silently swallowed.\n", "    decisions.to_json(decisions_path)\n", "    print(decisions.shape)\n", "    # A bare expression inside a block is not rendered; display it explicitly.\n", "    display(decisions.head())" ] }, { "cell_type": "code", "execution_count": 4, "id": "86224a20-53e1-403c-8d9f-71b9a9df750c", "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "output:\n", "{145699277: 'n',\n", " 144642851: 'n',\n", " 145394765: 'n',\n", " 145418669: 'removed',\n", " 143205230: 'n',\n", " 140628560: 'eigentlich geil',\n", " ...\n", "}\n", "\"\"\"\n", "decisions = decisions.set_index('identifier').decision.to_dict()" ] }, { "cell_type": "code", "execution_count": null, "id": "ec257220-f170-41b8-9f9d-b8ef61512acf", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 5, "id": "6dbd25bd-802d-4953-83c3-f01640174353", "metadata": {}, "outputs": [], "source": [ "# Use if we want to skip at the bottom\n", "# decisions = {}" ] }, { "cell_type": "markdown", "id": "7c1ee5eb-1000-4ced-983c-df47fb6ceae8", "metadata": {}, "source": [ "### Get all data prepped for sheets" ] }, { "cell_type":
"code", "execution_count": null, "id": "f20bddee-1e7c-4c46-a17a-c7bb6c13f30c", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 6, "id": "b1101088-9613-465f-81fd-79801e0202b8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "18508\n" ] } ], "source": [ "ls = Listing.get_all_listings()\n", "filtered = []\n", "for l in ls:\n", " last_seen = l.last_seen\n", " if last_seen is None or last_seen > 30:\n", " continue\n", " filtered.append(l)\n", "print(len(filtered))" ] }, { "cell_type": "code", "execution_count": 7, "id": "63e61601-7e3f-4d58-89f6-1794e4868cc3", "metadata": {}, "outputs": [], "source": [ "ds = [l.dict_nicely() for l in filtered]" ] }, { "cell_type": "code", "execution_count": 8, "id": "1c222721-f426-42c0-9ac5-badc1f7a2034", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | identifier | \n", "sqm_ocr | \n", "price | \n", "price_per_sqm | \n", "url | \n", "bedrooms | \n", "travel_time_fastest | \n", "travel_time_second | \n", "lease_left | \n", "service_charge | \n", "development | \n", "tenure_type | \n", "updated_days | \n", "status | \n", "last_seen | \n", "decision | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "101369066 | \n", "NaN | \n", "875000.0 | \n", "NaN | \n", "https://www.rightmove.co.uk/properties/101369066 | \n", "3 | \n", "{'duration': 2252, 'distance': 7140, 'duration... | \n", "{'duration': 2465, 'distance': 7502, 'duration... | \n", "0.0 | \n", "NaN | \n", "False | \n", "Share of Freehold | \n", "12 | \n", "None | \n", "0 | \n", "None | \n", "
| 1 | \n", "105484772 | \n", "45.7 | \n", "325000.0 | \n", "7111.597374 | \n", "https://www.rightmove.co.uk/properties/105484772 | \n", "1 | \n", "{'duration': 1983, 'distance': 10095, 'duratio... | \n", "{'duration': 2043, 'distance': 10083, 'duratio... | \n", "104.0 | \n", "641.53 | \n", "False | \n", "Leasehold | \n", "36 | \n", "None | \n", "0 | \n", "None | \n", "
| 2 | \n", "105827126 | \n", "58.5 | \n", "950000.0 | \n", "16239.316239 | \n", "https://www.rightmove.co.uk/properties/105827126 | \n", "1 | \n", "{'duration': 2478, 'distance': 9584, 'duration... | \n", "{'duration': 2478, 'distance': 9584, 'duration... | \n", "NaN | \n", "NaN | \n", "True | \n", "Leasehold | \n", "2 | \n", "None | \n", "0 | \n", "None | \n", "
| 3 | \n", "105836849 | \n", "NaN | \n", "400000.0 | \n", "NaN | \n", "https://www.rightmove.co.uk/properties/105836849 | \n", "3 | \n", "{'duration': 2565, 'distance': 14070, 'duratio... | \n", "{'duration': 2565, 'distance': 14070, 'duratio... | \n", "NaN | \n", "NaN | \n", "False | \n", "Leasehold | \n", "393 | \n", "None | \n", "20 | \n", "None | \n", "
| 4 | \n", "108102476 | \n", "53.7 | \n", "515000.0 | \n", "9590.316574 | \n", "https://www.rightmove.co.uk/properties/108102476 | \n", "1 | \n", "{'duration': 1266, 'distance': 4042, 'duration... | \n", "{'duration': 1861, 'distance': 4548, 'duration... | \n", "104.0 | \n", "NaN | \n", "False | \n", "Leasehold | \n", "16 | \n", "None | \n", "0 | \n", "None | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 18503 | \n", "94206080 | \n", "49.6 | \n", "899000.0 | \n", "18125.000000 | \n", "https://www.rightmove.co.uk/properties/94206080 | \n", "1 | \n", "{'duration': 1125, 'distance': 4637, 'duration... | \n", "{'duration': 1125, 'distance': 4641, 'duration... | \n", "NaN | \n", "NaN | \n", "True | \n", "Leasehold | \n", "256 | \n", "None | \n", "0 | \n", "None | \n", "
| 18504 | \n", "94206329 | \n", "NaN | \n", "700000.0 | \n", "NaN | \n", "https://www.rightmove.co.uk/properties/94206329 | \n", "1 | \n", "{'duration': 2172, 'distance': 12497, 'duratio... | \n", "{'duration': 2112, 'distance': 12497, 'duratio... | \n", "NaN | \n", "NaN | \n", "False | \n", "Leasehold | \n", "256 | \n", "None | \n", "20 | \n", "None | \n", "
| 18505 | \n", "94508306 | \n", "94.0 | \n", "1000000.0 | \n", "10638.297872 | \n", "https://www.rightmove.co.uk/properties/94508306 | \n", "2 | \n", "{'duration': 1046, 'distance': 2193, 'duration... | \n", "{'duration': 1046, 'distance': 2193, 'duration... | \n", "977.0 | \n", "NaN | \n", "False | \n", "Leasehold | \n", "149 | \n", "None | \n", "0 | \n", "None | \n", "
| 18506 | \n", "95975483 | \n", "NaN | \n", "800000.0 | \n", "NaN | \n", "https://www.rightmove.co.uk/properties/95975483 | \n", "2 | \n", "{'duration': 2281, 'distance': 7262, 'duration... | \n", "{'duration': 2815, 'distance': 5607, 'duration... | \n", "999.0 | \n", "0.00 | \n", "False | \n", "Leasehold | \n", "3 | \n", "None | \n", "0 | \n", "None | \n", "
| 18507 | \n", "96773996 | \n", "70.8 | \n", "1000000.0 | \n", "14124.293785 | \n", "https://www.rightmove.co.uk/properties/96773996 | \n", "2 | \n", "{'duration': 1608, 'distance': 8301, 'duration... | \n", "{'duration': 1608, 'distance': 8301, 'duration... | \n", "992.0 | \n", "4716.36 | \n", "True | \n", "Leasehold | \n", "227 | \n", "None | \n", "20 | \n", "None | \n", "
18508 rows × 16 columns
\n", "| \n", " | identifier | \n", "sqm_ocr | \n", "price | \n", "price_per_sqm | \n", "url | \n", "bedrooms | \n", "lease_left | \n", "service_charge | \n", "development | \n", "tenure_type | \n", "updated_days | \n", "last_seen | \n", "decision | \n", "duration | \n", "initial_walk_duration | \n", "duration_per_transit | \n", "number_of_transit_stops | \n", "
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", "101369066 | \n", "-1.0 | \n", "875000.0 | \n", "NaN | \n", "https://www.rightmove.co.uk/properties/101369066 | \n", "3 | \n", "0.0 | \n", "-1.00 | \n", "False | \n", "Share of Freehold | \n", "12 | \n", "0 | \n", "None | \n", "38 | \n", "142 | \n", "{'WALK': 797, 'TRANSIT': 1227} | \n", "2 | \n", "
| 1 | \n", "105484772 | \n", "45.7 | \n", "325000.0 | \n", "7111.597374 | \n", "https://www.rightmove.co.uk/properties/105484772 | \n", "1 | \n", "104.0 | \n", "641.53 | \n", "False | \n", "Leasehold | \n", "36 | \n", "0 | \n", "None | \n", "33 | \n", "372 | \n", "{'WALK': 609, 'TRANSIT': 1109} | \n", "2 | \n", "