diff --git a/crawler/5_routing.py b/crawler/5_routing.py index 88263ca..439cb0f 100644 --- a/crawler/5_routing.py +++ b/crawler/5_routing.py @@ -1,10 +1,21 @@ from data_access import Listing from tqdm import tqdm +from geopy.distance import geodesic listings = Listing.get_all_listings() BROCK_STREET_LAT_LONG = 51.52570434674584, -0.13956495005056113 -for listing in tqdm(listings): +# reduce listings to everything within 7 miles +filtered_listings = [] +for listing in listings: + miles = geodesic(BROCK_STREET_LAT_LONG, (listing.latitude, listing.longitude)).miles + if miles <= 7: + filtered_listings.append(listing) + +print(f"Filtered listings from {len(listings)} to {len(filtered_listings)}") + + +for listing in tqdm(filtered_listings): lat, long = BROCK_STREET_LAT_LONG listing.calculate_route(lat, long, recalculate=False) traveltime = listing.travel_time[0] diff --git a/crawler/exploration.ipynb b/crawler/exploration.ipynb index a98dc25..3a63d40 100644 --- a/crawler/exploration.ipynb +++ b/crawler/exploration.ipynb @@ -1656,40 +1656,53 @@ "cell_type": "markdown", "id": "87ead853-8a71-4de9-98d1-f4f2673a5592", "metadata": {}, + "source": [] + }, + { + "cell_type": "markdown", + "id": "98f8e950-2a3b-4856-aa62-3bc758e2fd42", + "metadata": {}, "source": [ - "x" + "# Find out the proper radius we want to use" ] }, { "cell_type": "code", - "execution_count": null, - "id": "862e9e52-53fa-4bf9-8e31-7847481d45be", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e284aefd-c280-4d82-935c-969b022b6bbc", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 50, "id": "04bb61d5-cba7-4739-9568-b00342c1b636", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Filtered listings from 32271 to 15804\n" + ] + } + ], + "source": [ + "from data_access import Listing\n", + "from geopy.distance import geodesic\n", + "\n", + "listings = Listing.get_all_listings()\n", + "BROCK_STREET_LAT_LONG = 51.52570434674584, -0.13956495005056113\n", + "\n", + "# reduce listings to everything within 7 miles\n", + "filtered_listings = []\n", + "for listing in listings:\n", + " miles = geodesic(BROCK_STREET_LAT_LONG, (listing.latitude, listing.longitude)).miles\n", + " if miles <= 7:\n", + " filtered_listings.append(listing)\n", + "\n", + "print(f\"Filtered listings from {len(listings)} to {len(filtered_listings)}\")" + ] }, { "cell_type": "markdown", "id": "a73fba2d-afeb-4194-8421-eff8e84a14e9", "metadata": {}, "source": [ - "# Typeahead" + "# Typeahead / fetch all boroughs" ] }, { diff --git a/crawler/poetry.lock b/crawler/poetry.lock index 745fbab..b35908d 100644 --- a/crawler/poetry.lock +++ b/crawler/poetry.lock @@ -575,6 +575,40 @@ smb = ["smbprotocol"] ssh = ["paramiko"] tqdm = ["tqdm"] +[[package]] +name = "geographiclib" +version = "2.0" +description = "The geodesic routines from GeographicLib" +optional = false +python-versions = ">=3.7" +files = [ + {file = "geographiclib-2.0-py3-none-any.whl", hash = "sha256:6b7225248e45ff7edcee32becc4e0a1504c606ac5ee163a5656d482e0cd38734"}, + {file = "geographiclib-2.0.tar.gz", hash = "sha256:f7f41c85dc3e1c2d3d935ec86660dc3b2c848c83e17f9a9e51ba9d5146a15859"}, +] + +[[package]] +name = "geopy" +version = "2.4.1" +description = "Python Geocoding Toolbox" +optional = false +python-versions = ">=3.7" +files = [ + {file = "geopy-2.4.1-py3-none-any.whl", hash = "sha256:ae8b4bc5c1131820f4d75fce9d4aaaca0c85189b3aa5d64c3dcaf5e3b7b882a7"}, + {file = "geopy-2.4.1.tar.gz", hash = "sha256:50283d8e7ad07d89be5cb027338c6365a32044df3ae2556ad3f52f4840b3d0d1"}, +] + +[package.dependencies] +geographiclib = ">=1.52,<3" + +[package.extras] +aiohttp = ["aiohttp"] +dev = ["coverage", "flake8 (>=5.0,<5.1)", "isort (>=5.10.0,<5.11.0)", "pytest (>=3.10)", "pytest-asyncio (>=0.17)", "readme-renderer", "sphinx (<=4.3.2)", "sphinx-issues", "sphinx-rtd-theme (>=0.5.0)"] +dev-docs = ["readme-renderer", "sphinx (<=4.3.2)", "sphinx-issues", "sphinx-rtd-theme (>=0.5.0)"] +dev-lint = ["flake8 (>=5.0,<5.1)", "isort (>=5.10.0,<5.11.0)"] +dev-test = ["coverage", "pytest (>=3.10)", "pytest-asyncio (>=0.17)", "sphinx (<=4.3.2)"] +requests = ["requests (>=2.16.2)", "urllib3 (>=1.24.2)"] +timezone = ["pytz"] + [[package]] name = "greenlet" version = "3.0.3" @@ -3215,4 +3249,4 @@ test = ["websockets"] [metadata] lock-version = "2.0" python-versions = ">3.11" -content-hash = "a095775d567e8a3540d2a351a8f2ab13daef7021eb3ec1570bdc3f075a1df166" +content-hash = "3ae843dbf76bd6b91052d1ea782eb3dbd0921acf7c166c4396c9ae7b7fd66cae" diff --git a/crawler/pyproject.toml b/crawler/pyproject.toml index fa11939..6495ce5 100644 --- a/crawler/pyproject.toml +++ b/crawler/pyproject.toml @@ -18,6 +18,7 @@ transformers = "^4.38.2" pytesseract = "^0.3.10" jupyterlab = "^4.1.4" pandas = "^2.2.1" +geopy = "^2.4.1" [tool.poetry.dev-dependencies]