from bs4 import BeautifulSoup as bs
import cfscrape
import argparse
import json
import logging
import os
import re

# Base URL of the site being scraped
base_url = "https://www.spitogatos.gr"

# Regex patterns for pulling structured fields out of each listing tile
location_pattern = re.compile(r'\((.*?)\)')  # area name inside parentheses
title_pattern = re.compile(r'\d+τ\.μ\.')     # size in square metres (τ.μ.)
price_pattern = re.compile(r'\d+')           # first run of digits in the price
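
# Quick illustration of what the patterns capture, on hypothetical listing
# strings ("Διαμέρισμα" is Greek for "apartment"):
#   location_pattern on "Διαμέρισμα (Κολωνάκι)" -> group(1) == "Κολωνάκι"
#   title_pattern    on "Διαμέρισμα 85τ.μ."     -> "85τ.μ."
#   price_pattern    on "250.000 €"             -> "250" (only the first digit
#       run, so prices with thousands separators are truncated at the dot)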

def scrape_properties(url):
    print(url)
    # cfscrape wraps requests and solves Cloudflare's anti-bot challenge
    scraper = cfscrape.create_scraper()
    response = scraper.get(url)
    if response.status_code != 200:
        print(f"Error: Received status code {response.status_code}")
        return None
    soup = bs(response.text, "html.parser")
    listings = soup.find_all('article', class_='ordered-element')
    properties = {}
    for listing in listings:
        title = listing.find('h3', class_='tile__title').text.strip()
        location = listing.find('h3', class_='tile__location').text.strip()
        price = listing.find('p', class_='price__text').text.strip()
        href = listing.find('a', class_='tile__link')["href"]
        # Apply the regexes to extract the structured fields
        listing_url = f"{base_url}{href}"
        location_match = re.search(location_pattern, location).group(1)
        title_match = re.search(title_pattern, title).group()
        price_match = re.search(price_pattern, price).group()
        properties[listing_url] = {
            "price": price_match,
            "title": title_match,
            "location": location_match,
        }
    # Write this page's listings to samples/<derived-filename>.json
    os.makedirs("samples", exist_ok=True)
    json_string = json.dumps(properties, indent=4, ensure_ascii=False)
    sample_file_name = f"samples/{url_to_filename(url)}"
    with open(sample_file_name, "w", encoding="utf-8") as output_file:
        output_file.write(json_string)
    return properties
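
# For illustration, each samples/*.json file comes out shaped roughly like the
# sketch below (hypothetical listing URL and values; the real fields are taken
# verbatim from the listing tiles):
#
#   {
#       "https://www.spitogatos.gr/aggelia/12345": {
#           "price": "250",
#           "title": "85τ.μ.",
#           "location": "Κολωνάκι"
#       }
#   }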

def log_print(message):
    # Thin logging helper; logging must be configured (e.g. via
    # logging.basicConfig) before these messages become visible.
    logging.info(message)


def url_to_filename(url):
    """Turn a results-page URL into a safe, flat .json filename."""
    url = url.replace('https://www.spitogatos.gr/', '')
    url = re.sub(r'[^a-zA-Z0-9-_./]+', '_', url)  # replace unsafe characters
    url = url.strip('_')
    url = re.sub(r'_{2,}', '_', url)              # collapse repeated underscores
    url = url.replace('/', '-')                   # flatten path separators
    return url + '.json'
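
# Worked example (hypothetical search URL; the transformation itself is exact):
#   url_to_filename("https://www.spitogatos.gr/pwliseis-katoikies/athina/selida_2")
#   -> "pwliseis-katoikies-athina-selida_2.json"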

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Scrape properties from multiple result pages.')
    parser.add_argument('url', type=str, help='Base search URL, without the /selida_N page suffix')
    parser.add_argument('start_page', type=int, help='First page number to scrape')
    parser.add_argument('end_page', type=int, help='Last page number to scrape (inclusive)')
    args = parser.parse_args()
    for i in range(args.start_page, args.end_page + 1):
        # spitogatos.gr paginates search results as .../selida_<n>
        # ("selida" is Greek for "page")
        scrape_properties(f"{args.url}/selida_{i}")
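
# Example invocation (the search path below is illustrative; substitute a real
# spitogatos.gr results URL):
#
#   python spitogatosscraper.py "https://www.spitogatos.gr/pwliseis-katoikies/athina" 1 3
#
# This fetches .../selida_1 through .../selida_3 and writes one JSON file per
# page under samples/.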