-
Notifications
You must be signed in to change notification settings - Fork 103
/
Copy pathimmowelt.py
108 lines (91 loc) · 3.92 KB
/
immowelt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
"""
This is an example web scraper for immowelt.de.
To run this scraper set env variable $SCRAPFLY_KEY with your scrapfly API key:
$ export SCRAPFLY_KEY="your key from https://scrapfly.io/dashboard"
"""
import os
import json
import math
from typing import Dict, List
from pathlib import Path
from loguru import logger as log
from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse
# Module-wide Scrapfly client. The API key is read from the SCRAPFLY_KEY
# environment variable; a missing variable raises KeyError at import time.
SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])
# Keyword arguments unpacked into every ScrapeConfig created in this file.
BASE_CONFIG = {
# bypass web scraping blocking
"asp": True,
# set the proxy country to Germany ("DE")
"country": "DE"
}
# "results" directory next to this file; created at import time if it is missing.
output = Path(__file__).parent / "results"
output.mkdir(exist_ok=True)
def parse_property_pages(response: ScrapeApiResponse) -> Dict:
    """Parse listing data from a property page.

    Reads the JSON stored in the page's ``__NEXT_DATA__`` script tag and
    returns the ``props.pageProps.classified`` object after dropping the keys
    that only matter to the web app.

    Args:
        response: scrape result whose ``selector`` exposes the page HTML.

    Returns:
        The listing data as a dictionary.

    Raises:
        KeyError: if the embedded JSON lacks the expected nesting.
        ValueError: if the script content is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        TypeError: raised by ``json.loads`` when the script tag is absent
            (assuming ``.get()`` returns ``None`` in that case - TODO confirm).
    """
    selector = response.selector
    # extract the data from the script tag
    next_data = selector.xpath("//script[@id='__NEXT_DATA__']/text()").get()
    next_data = json.loads(next_data)["props"]["pageProps"]["classified"]
    # remove web app related keys; a plain loop, since this is done purely
    # for its side effect and nothing should be collected
    for key in ("metadata", "tracking", "advertising", "seo", "defaultBackToSearch"):
        next_data.pop(key, None)
    return next_data
async def scrape_properties(urls: List[str]) -> List[Dict]:
    """Scrape listing data from property pages.

    Args:
        urls: property page URLs to scrape.

    Returns:
        One parsed listing dictionary per page that could be parsed. Pages
        that fail to parse are skipped with a warning instead of aborting
        the whole run.
    """
    # add all property pages to a scraping list
    to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
    properties = []
    # scrape all property pages concurrently
    async for response in SCRAPFLY.concurrent_scrape(to_scrape):
        try:
            data = parse_property_pages(response)
        except Exception:
            # Best-effort: skip pages that cannot be parsed. Catching
            # ``Exception`` rather than using a bare ``except`` lets task
            # cancellation and KeyboardInterrupt propagate as they should.
            log.warning("expired property page")
        else:
            properties.append(data)
    log.success(f"scraped {len(properties)} property listings")
    return properties
def find_json_objects(text: str, decoder=json.JSONDecoder()):
    """Find JSON objects in text, and generate decoded JSON data.

    Scans ``text`` for ``{`` characters and tries to decode a JSON object at
    each one. On success the decoded value is yielded and scanning resumes
    right after it; on failure scanning resumes one character further, so
    objects nested inside malformed outer text are still found.

    Args:
        text: arbitrary text that may contain embedded JSON objects.
        decoder: decoder used for parsing; the shared default instance is
            only read from, never mutated.

    Yields:
        Each successfully decoded JSON object, in order of appearance.
    """
    pos = 0
    while True:
        match = text.find("{", pos)
        if match == -1:
            break
        try:
            # Decode in place from ``match``: avoids copying the tail of the
            # text for every candidate brace. The returned end index is
            # absolute (relative to ``text``), not to ``match``.
            result, end = decoder.raw_decode(text, match)
        except ValueError:
            pos = match + 1
        else:
            yield result
            pos = end
def parse_search_pages(response: ScrapeApiResponse) -> Dict:
    """Parse search data from the script tag embedded in a search page.

    The page carries its data as ``window["__UFRN_FETCHER__"]=JSON.parse("...")``;
    the wrapper is stripped, the string escapes are decoded and the resulting
    JSON is parsed.

    Args:
        response: scrape result whose ``selector`` exposes the page HTML.

    Returns:
        A dictionary with two keys: ``search_data`` (list of the values of
        ``classifiedsData``) and ``max_pages`` (``totalCount`` divided by 30,
        rounded up).

    Raises:
        AttributeError: if no matching script tag is found (assuming
            ``.get()`` returns ``None`` in that case - TODO confirm).
        KeyError: if the decoded JSON lacks the expected structure.
        ValueError: if the decoded script content is not valid JSON.
    """
    selector = response.selector
    script = selector.xpath('//script[contains(text(),"classified-serp-init-data")]/text()').get()
    script = script.replace('window["__UFRN_FETCHER__"]=JSON.parse("', '').replace('");', '')
    # Resolve backslash escapes without mangling literal non-ASCII text:
    # ``unicode_escape`` decodes its input as Latin-1, so a plain UTF-8
    # ``encode()`` would turn e.g. umlauts into mojibake. Encoding to Latin-1
    # with ``backslashreplace`` keeps every character intact (anything outside
    # Latin-1 becomes a \uXXXX escape that is decoded right back).
    script = script.encode("latin-1", "backslashreplace").decode("unicode_escape")
    data = json.loads(script)['data']['classified-serp-init-data']['pageProps']
    search_data = list(data['classifiedsData'].values())
    # 30 is presumably the number of results per search page - verify against the site
    max_pages = math.ceil(data['totalCount'] / 30)
    return {'search_data': search_data, 'max_pages': max_pages}
async def scrape_search(url: str, max_scrape_pages: int = None) -> List[Dict]:
    """Scrape search results across the paginated search pages.

    The first page is scraped on its own to learn the total page count; the
    remaining pages are then scraped concurrently.

    Args:
        url: search page URL; a ``page`` query parameter is appended to it
            for pages after the first.
        max_scrape_pages: optional cap on the number of pages to scrape.
            ``None`` (or 0) means all available pages.

    Returns:
        The search result entries collected from every scraped page.
    """
    log.info("scraping first search page")
    first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
    data = parse_search_pages(first_page)
    search_data = data['search_data']
    max_pages = data['max_pages']
    # get the total number of pages to scrape
    if max_scrape_pages and max_scrape_pages < max_pages:
        max_pages = max_scrape_pages
    # never report a negative count when the search returned no results
    remaining_pages = max(max_pages - 1, 0)
    log.info(f"scraping search pagination, remaining ({remaining_pages}) more pages")
    # start a query string if the URL has none yet, otherwise extend it
    separator = "&" if "?" in url else "?"
    # scrape the remaining search pages concurrently
    to_scrape = [
        ScrapeConfig(f"{url}{separator}page={page}", **BASE_CONFIG)
        for page in range(2, max_pages + 1)
    ]
    async for response in SCRAPFLY.concurrent_scrape(to_scrape):
        search_data.extend(parse_search_pages(response)['search_data'])
    log.success(f"scraped {len(search_data)} properties from search")
    return search_data