diff --git a/crawler/1_dump_listings.py b/crawler/1_dump_listings.py index 8d5517a..8be24ac 100644 --- a/crawler/1_dump_listings.py +++ b/crawler/1_dump_listings.py @@ -1,21 +1,38 @@ from rec.query import listing_query +from rec.districts import get_districts import pathlib import json from data_access import Listing -d = listing_query(1, 3, 3, 15, 0, 800000, max_days_since_added=7) folder = pathlib.Path("data/rs/") +districts = get_districts() -for i in range(1, 10000): - try: - print(f"page {i}") - d = listing_query(i, 3, 3, 15, 0, 800000, max_days_since_added=1) - except: - break +for district, locid in districts.items(): + print("#### District:", district) + for i in range(1, 41): + try: + d = listing_query( + page=i, + min_bedrooms=1, + max_bedrooms=3, + radius=0, + min_price=0, + max_price=800000, + location_id=locid, + ) + except Exception as e: + print(e) + break + if i == 1: + print("totalAvailableResults: ", d["totalAvailableResults"]) + if len(d["properties"]) == 0: + break + print(f"page {i}", end=", ", flush=True) - for property in d["properties"]: - identifier = property["identifier"] + for property in d["properties"]: + identifier = property["identifier"] - listing = Listing(identifier) - with open(listing.path_listing_json(), "w") as f: - json.dump(property, f) + listing = Listing(identifier) + with open(listing.path_listing_json(), "w") as f: + json.dump(property, f) + print() # break line as we used end=, above. diff --git a/crawler/TASKS b/crawler/TASKS index e909ff9..404674a 100644 --- a/crawler/TASKS +++ b/crawler/TASKS @@ -1,19 +1,35 @@ +- [ ] Partition the query further, as each listing query can only grab 1000 entries at most. If the query is too broad, it will fail afterwards. 
-- [ ] Update capability of the database -- [ ] Check if the entry already exists in the database - - -- [ ] Crawl single entry -- [ ] Download pictures and map to database entry - - - - -# Distance measuring -- [ ] route api to find shortest path apartments -- [ ] Switch to distance matrix api. Evaluate if its the right one. - -# sqm measure -- [ ] OCR to detect the area of the apartment - - +- District: City of London, totalAvailableResults: 60 +- District: Greenwich, totalAvailableResults: 1371 +- District: Hillingdon, totalAvailableResults: 1026 +- District: Ealing, totalAvailableResults: 1736 +- District: Richmond upon Thames, totalAvailableResults: 819 +- District: Sutton, totalAvailableResults: 664 +- District: Wandsworth, totalAvailableResults: 1824 +- District: Camden, totalAvailableResults: 801 +- District: Enfield, totalAvailableResults: 1056 +- District: Croydon, totalAvailableResults: 1865 +- District: Hackney, totalAvailableResults: 840 +- District: Kingston upon Thames, totalAvailableResults: 685 +- District: Kensington and Chelsea, totalAvailableResults: 658 +- District: Bromley, totalAvailableResults: 1341 +- District: Brent, totalAvailableResults: 1332 +- District: Waltham Forest, totalAvailableResults: 763 +- District: Southwark, totalAvailableResults: 1460 +- District: Harrow, totalAvailableResults: 948 +- District: Lewisham, totalAvailableResults: 1192 +- District: Barnet, totalAvailableResults: 1683 +- District: Islington, totalAvailableResults: 766 +- District: Haringey, totalAvailableResults: 795 +- District: Lambeth, totalAvailableResults: 1626 +- District: Westminster, totalAvailableResults: 1130 +- District: Tower Hamlets, totalAvailableResults: 2213 +- District: Havering, totalAvailableResults: 863 +- District: Barking and Dagenham, totalAvailableResults: 485 +- District: Hammersmith and Fulham, totalAvailableResults: 1038 +- District: Bexley, totalAvailableResults: 803 +- District: Redbridge, totalAvailableResults: 720 +- District: 
Newham, totalAvailableResults: 1306 +- District: Merton, totalAvailableResults: 873 +- District: Hounslow, totalAvailableResults: 1096