From 6a43a7f485e9fdb3fe8992e731f7c3582af98ddf Mon Sep 17 00:00:00 2001 From: Kadir Date: Mon, 1 Apr 2024 15:09:13 +0200 Subject: [PATCH] adding tasks and updating exploration notebook --- crawler/{TASKS => TASKS.md} | 4 + crawler/exploration.ipynb | 285 ++++++++++++++++++++++++++++++++---- 2 files changed, 258 insertions(+), 31 deletions(-) rename crawler/{TASKS => TASKS.md} (90%) diff --git a/crawler/TASKS b/crawler/TASKS.md similarity index 90% rename from crawler/TASKS rename to crawler/TASKS.md index 404674a..9043e91 100644 --- a/crawler/TASKS +++ b/crawler/TASKS.md @@ -1,3 +1,7 @@ +## Extra + +- [ ] The routing is now expensive. I could simplify it by finding the walking distance to the nearest trainstations with overpass turbo and then have a routing map between stations. + - [ ] Partition query further as each query can listing query can only grab a 1000 entries at most. If the query is too broad, it will fail afterwards. - District: City of London, totalAvailableResults: 60 diff --git a/crawler/exploration.ipynb b/crawler/exploration.ipynb index 371bb0f..a98dc25 100644 --- a/crawler/exploration.ipynb +++ b/crawler/exploration.ipynb @@ -1615,15 +1615,12 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 17, "id": "577dcd5f-07df-4d61-b837-a90db59e3ed5", "metadata": {}, "outputs": [], "source": [ - "class PropertyType(enum.Enum):\n", - " def __str__(self):\n", - " return str(self.value)\n", - " \n", + "class PropertyType(enum.StrEnum): \n", " BUNGALOW= \"bungalow\"\n", " DETACHED= \"detached\"\n", " FLAT= \"flat\"\n", @@ -1635,47 +1632,273 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 19, "id": "50cc2eb3-1c3b-49b8-86a3-73dd2d151a61", "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "sequence item 0: expected str instance, PropertyType found", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - 
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[13], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m x \u001b[38;5;241m=\u001b[39m [PropertyType\u001b[38;5;241m.\u001b[39mBUNGALOW,PropertyType\u001b[38;5;241m.\u001b[39mLAND]\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m,\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\n", - "\u001b[0;31mTypeError\u001b[0m: sequence item 0: expected str instance, PropertyType found" - ] - } - ], - "source": [ - "x = [PropertyType.BUNGALOW,PropertyType.LAND]\n", - "','.join(x)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "09c02626-8159-4a39-8a8e-21d2fd241301", - "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[, ]" + "'bungalow,park-home'" ] }, - "execution_count": 14, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], + "source": [ + "x = [PropertyType.BUNGALOW,PropertyType.PARK_HOME]\n", + "','.join(x)" + ] + }, + { + "cell_type": "markdown", + "id": "87ead853-8a71-4de9-98d1-f4f2673a5592", + "metadata": {}, "source": [ "x" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "862e9e52-53fa-4bf9-8e31-7847481d45be", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e284aefd-c280-4d82-935c-969b022b6bbc", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04bb61d5-cba7-4739-9568-b00342c1b636", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "a73fba2d-afeb-4194-8421-eff8e84a14e9", + "metadata": {}, + "source": [ + "# Typeahead" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "61844fe2-408d-4b89-995f-c31110a850f6", + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "{'key': 'CAMD',\n", + " 'term': 'CAMD',\n", + " 'typeAheadLocations': [{'displayName': 'Camden, North West London',\n", + " 'locationIdentifier': 'REGION^85261',\n", + " 'normalisedSearchTerm': 'CAMDEN NORTH WEST LONDON'},\n", + " {'displayName': 'Camden (London Borough)',\n", + " 'locationIdentifier': 'REGION^93941',\n", + " 'normalisedSearchTerm': 'CAMDEN LONDON BOROUGH'},\n", + " {'displayName': 'Camden Town, North West London',\n", + " 'locationIdentifier': 'REGION^85262',\n", + " 'normalisedSearchTerm': 'CAMDEN TOWN NORTH WEST LONDON'},\n", + " {'displayName': 'Camden Town Station',\n", + " 'locationIdentifier': 'STATION^1712',\n", + " 'normalisedSearchTerm': 'CAMDEN TOWN STATION'},\n", + " {'displayName': 'Camden Road Station',\n", + " 'locationIdentifier': 'STATION^1709',\n", + " 'normalisedSearchTerm': 'CAMDEN ROAD STATION'},\n", + " {'displayName': 'Camden Town, Gosport, Hampshire',\n", + " 'locationIdentifier': 'REGION^76577',\n", + " 'normalisedSearchTerm': 'CAMDEN TOWN GOSPORT HAMPSHIRE'},\n", + " {'displayName': 'Camderry, Omagh, County Tyrone, Northern Ireland',\n", + " 'locationIdentifier': 'REGION^73327',\n", + " 'normalisedSearchTerm': 'CAMDERRY OMAGH COUNTY TYRONE NORTHERN IRELAND'}],\n", + " 'isComplete': True}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import requests\n", + "\n", + "cookies = {\n", + " 'permuserid': '240330LSEXEOANG04Q2VA3OZCIQ8TTSQ',\n", + " 'TS019c0ed0': '012f990cd3494097746bc0b10b8d61bc6237319024e80701e0e8d735bd7d62a792529823c3164c771f223b0cade5ec9ae4b7fc001c',\n", + " 'beta_optin': 'N:36:-1',\n", + " 'RM_Register': 'C',\n", + " 'JSESSIONID': '0BE8E261D81387C9BC530DB1A5F28955',\n", + " 'svr': '3111',\n", + " 'permuserid': '240330LSEXEOANG04Q2VA3OZCIQ8TTSQ',\n", + " 'TS01ec61d1': '012f990cd3161dd68e4ed69b9d64f7d4de2356c651edef96201cc7facd11b28ef9338596a2022bc2ed7a56f09c5dee5aa1711de2d1',\n", + " 'rmsessionid': 
'7bc54ce6-da97-42cf-8719-4e3e9c53e276',\n", + " 'TS01821201': '012f990cd35255a563a541cfe06e4a774c129628165af71838ccdb7a17919672962514804459a0d9d9a90fe7b8feeec66145e30b98',\n", + " 'TS01826437': '012f990cd3161dd68e4ed69b9d64f7d4de2356c651edef96201cc7facd11b28ef9338596a2022bc2ed7a56f09c5dee5aa1711de2d1',\n", + " 'TPCmaxPrice': '800000',\n", + " 'TS01a07bd2': '012f990cd3161dd68e4ed69b9d64f7d4de2356c651edef96201cc7facd11b28ef9338596a2022bc2ed7a56f09c5dee5aa1711de2d1',\n", + " 'TPCminPrice': '700000',\n", + "}\n", + "\n", + "headers = {\n", + " 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0',\n", + " 'Accept': 'application/json, text/javascript',\n", + " 'Accept-Language': 'en-GB,en;q=0.5',\n", + " # 'Accept-Encoding': 'gzip, deflate, br',\n", + " 'Referer': 'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E87515&maxBedrooms=3&minBedrooms=1&maxPrice=800000&minPrice=750001&propertyTypes=&includeSSTC=false&mustHave=&dontShow=retirement%2CsharedOwnership&furnishTypes=&keywords=',\n", + " 'X-Correlation-Text-Val': 'source=search',\n", + " 'Content-Type': 'application/x-www-form-urlencoded',\n", + " 'traceparent': '00-d3020142d839bf1ef2b172fa596acea8-605693e1c4c5cbb5-00',\n", + " 'DNT': '1',\n", + " 'Sec-GPC': '1',\n", + " 'Connection': 'keep-alive',\n", + " # 'Cookie': 'permuserid=240330LSEXEOANG04Q2VA3OZCIQ8TTSQ; TS019c0ed0=012f990cd3494097746bc0b10b8d61bc6237319024e80701e0e8d735bd7d62a792529823c3164c771f223b0cade5ec9ae4b7fc001c; beta_optin=N:36:-1; RM_Register=C; JSESSIONID=0BE8E261D81387C9BC530DB1A5F28955; svr=3111; permuserid=240330LSEXEOANG04Q2VA3OZCIQ8TTSQ; TS01ec61d1=012f990cd3161dd68e4ed69b9d64f7d4de2356c651edef96201cc7facd11b28ef9338596a2022bc2ed7a56f09c5dee5aa1711de2d1; rmsessionid=7bc54ce6-da97-42cf-8719-4e3e9c53e276; TS01821201=012f990cd35255a563a541cfe06e4a774c129628165af71838ccdb7a17919672962514804459a0d9d9a90fe7b8feeec66145e30b98; 
TS01826437=012f990cd3161dd68e4ed69b9d64f7d4de2356c651edef96201cc7facd11b28ef9338596a2022bc2ed7a56f09c5dee5aa1711de2d1; TPCmaxPrice=800000; TS01a07bd2=012f990cd3161dd68e4ed69b9d64f7d4de2356c651edef96201cc7facd11b28ef9338596a2022bc2ed7a56f09c5dee5aa1711de2d1; TPCminPrice=700000',\n", + " 'Sec-Fetch-Dest': 'empty',\n", + " 'Sec-Fetch-Mode': 'cors',\n", + " 'Sec-Fetch-Site': 'same-origin',\n", + "}\n", + "\n", + "response = requests.get('https://www.rightmove.co.uk/typeAhead/uknostreet/CA/MD/EN/', cookies=cookies, headers=headers)\n", + "response.json()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "eba907d0-680d-4374-a56e-95bd4abf0053", + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from textwrap import wrap\n", + "\n", + "def query_loc_identifier(name: str)->str:\n", + " \"\"\"\n", + " Query the Rightmove typeahead endpoint for `name` (upper-cased, then split by textwrap.wrap into 2-character chunks joined with '/') and return the decoded JSON payload (a dict with 'typeAheadLocations'), despite the ->str annotation.\n", + " \"\"\"\n", + " name = name.upper()\n", + " name = '/'.join(wrap(name,2))\n", + " \n", + " headers = {\n", + " 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:123.0) Gecko/20100101 Firefox/123.0',\n", + " }\n", + " \n", + " response = requests.get(f'https://www.rightmove.co.uk/typeAhead/uknostreet/{name}', headers=headers)\n", + " return response.json()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d185e013-9beb-4e57-9d8b-a830e14339c3", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cabe5d3f-ad0f-49c2-9fbf-686539a05bd1", + "metadata": {}, + "outputs": [], + "source": [ + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "ba500fe6-fb18-466e-a697-403d28181674", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "56c49b50-ff31-4785-9088-45ff5a39545e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'Greenwich': 'REGION^61226', 'Hillingdon': 'REGION^93959', 'Ealing': 
'REGION^93947', 'Richmond upon Thames': 'REGION^61415', 'Sutton': 'REGION^93974', 'Wandsworth': 'REGION^93977', 'Camden': 'REGION^93941', 'Enfield': 'REGION^93950', 'Croydon': 'REGION^93944', 'Hackney': 'REGION^93953', 'Kingston upon Thames': 'REGION^93968', 'Kensington and Chelsea': 'REGION^61229', 'Bromley': 'REGION^93938', 'Brent': 'REGION^93935', 'Waltham Forest': 'REGION^61232', 'Southwark': 'REGION^61518', 'Harrow': 'REGION^93956', 'Lewisham': 'REGION^61413', 'Barnet': 'REGION^93929', 'Islington': 'REGION^93965', 'Haringey': 'REGION^61227', 'Lambeth': 'REGION^93971', 'Westminster': '', 'Tower Hamlets': 'REGION^61417', 'Havering': 'REGION^61228', 'Barking and Dagenham': 'REGION^61400', 'Hammersmith and Fulham': 'REGION^61407', 'Bexley': 'REGION^93932', 'Redbridge': 'REGION^61537', 'Newham': 'REGION^61231', 'Merton': 'REGION^61414', 'Hounslow': 'REGION^93962'}\n" + ] + } + ], + "source": [ + "{\n", + " \"City of London\": \"REGION^61224\",\n", + " \"Greenwich\": \"REGION^61226\",\n", + " \"Hillingdon\": \"REGION^93959\",\n", + " \"Ealing\": \"REGION^93947\",\n", + " \"Richmond upon Thames\": \"REGION^61415\",\n", + " \"Sutton\": \"REGION^93974\",\n", + " \"Wandsworth\": \"REGION^93977\",\n", + " \"Camden\": \"REGION^93941\",\n", + " \"Enfield\": \"REGION^93950\",\n", + " \"Croydon\": \"REGION^93944\",\n", + " \"Hackney\": \"REGION^93953\",\n", + " \"Kingston upon Thames\": \"REGION^93968\",\n", + " \"Kensington and Chelsea\": \"REGION^61229\",\n", + " \"Bromley\": \"REGION^93938\",\n", + " \"Brent\": \"REGION^93935\",\n", + " \"Waltham Forest\": \"REGION^61232\",\n", + " \"Southwark\": \"REGION^61518\",\n", + " \"Harrow\": \"REGION^93956\",\n", + " \"Lewisham\": \"REGION^61413\",\n", + " \"Barnet\": \"REGION^93929\",\n", + " \"Islington\": \"REGION^93965\",\n", + " \"Haringey\": \"REGION^61227\",\n", + " \"Lambeth\": \"REGION^93971\",\n", + " \"Westminster\": \"REGION^93980\",\n", + " \"Tower Hamlets\": \"REGION^61417\",\n", + " \"Havering\": 
\"REGION^61228\",\n", + " \"Barking and Dagenham\": \"REGION^61400\",\n", + " \"Hammersmith and Fulham\": \"REGION^61407\",\n", + " \"Bexley\": \"REGION^93932\",\n", + " \"Redbridge\": \"REGION^61537\",\n", + " \"Newham\": \"REGION^61231\",\n", + " \"Merton\": \"REGION^61414\",\n", + " \"Hounslow\": \"REGION^93962\",\n", + "}\n", + "\n", + "bor_to_locid = {}\n", + "\n", + "for borough in boroughs:\n", + " bor_to_locid[borough] = ''\n", + " d = query_loc_identifier(borough)\n", + " locs = d['typeAheadLocations']\n", + " filtered = [l for l in locs if 'Borough' in l['displayName']]\n", + " if len(filtered)>1:\n", + " print(f\"{borough} has more entries: {len(filtered)}!\")\n", + " if filtered:\n", + " bor_to_locid[borough] = filtered[0]['locationIdentifier']\n", + "\n", + "# NOTE: Westminster (City of) -- presumably listed without 'Borough' in its display name, so the filter above leaves it empty; TODO confirm\n", + "print(bor_to_locid)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "157e20a2-d137-45b8-802b-948fa8e04ba3", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {