Scrapper.py
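"""
Crawler/scraper for Coventry University's PurePortal.

Collects staff profile URLs from the persons listing (up to max_profiles),
writes them to Authors_URL.txt, then visits each profile and scrapes the
publication title, URL, date and CU author, saving the results to
scraper_results.json.
"""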
#import os # Module for interacting with the operating system
import time # Module for time-related operations
import ujson # Module for working with JSON data
from random import randint # Module for generating random numbers
from typing import Dict, List, Any # Type hinting imports
import requests # Library for making HTTP requests
from bs4 import BeautifulSoup # Library for parsing HTML data
from selenium import webdriver # Library for browser automation
from selenium.common.exceptions import NoSuchElementException # Exception for missing elements
from webdriver_manager.chrome import ChromeDriverManager # Driver manager for Chrome (we use a Chromium-based browser)
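# NOTE: the find_element_by_* / find_elements_by_* calls below rely on the Selenium 3.x API;
# Selenium 4+ removed them in favour of driver.find_element(By.CSS_SELECTOR, ...).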
# Delete files if present
# try:
#     os.remove('Authors_URL.txt')
#     os.remove('scraper_results.json')
# except OSError:
#     pass
def write_authors(list1, file_name):
    # Function to write authors' URLs to a file
    with open(file_name, 'w', encoding='utf-8') as f:
        for i in range(0, len(list1)):
            f.write(list1[i] + '\n')
def initCrawlerScraper(seed, max_profiles=500):
    # Initialize driver for Chrome
    webOpt = webdriver.ChromeOptions()
    webOpt.add_experimental_option('excludeSwitches', ['enable-logging'])
    webOpt.add_argument('--ignore-certificate-errors')
    webOpt.add_argument('--incognito')
    webOpt.headless = True
    driver = webdriver.Chrome(ChromeDriverManager().install(), options=webOpt)

    driver.get(seed) # Start with the original link
    Links = [] # Array with pureportal profiles URL
    pub_data = [] # To store publication information for each pureportal profile
    nextLink = driver.find_element_by_css_selector(".nextLink").is_enabled() # Check if the next page link is enabled
    print("Crawler has begun...")
    while nextLink:
        page = driver.page_source
        # XML parser to parse each URL
        bs = BeautifulSoup(page, "lxml") # Parse the page source using BeautifulSoup
        # Extract the exact profile URL by splitting the string into a list
        for link in bs.findAll('a', class_='link person'):
            url = str(link)[str(link).find('https://pureportal.coventry.ac.uk/en/persons/'):].split('"')
            Links.append(url[0])
        # Click on Next button to visit next page
        try:
            if driver.find_element_by_css_selector(".nextLink"):
                element = driver.find_element_by_css_selector(".nextLink")
                driver.execute_script("arguments[0].click();", element)
            else:
                nextLink = False
        except NoSuchElementException:
            break
        # Stop if the maximum number of profiles is reached
        if len(Links) >= max_profiles:
            break
print("Crawler has found ", len(Links), " pureportal profiles")
write_authors(Links, 'Authors_URL.txt') # Write the authors' URLs to a file
print("Scraping publication data for ", len(Links), " pureportal profiles...")
count = 0
    for link in Links:
        # Visit each link to get data
        time.sleep(1) # Delay of 1 second before requesting the next profile
        driver.get(link)
        try:
            if driver.find_elements_by_css_selector(".portal_link.btn-primary.btn-large"):
                element = driver.find_elements_by_css_selector(".portal_link.btn-primary.btn-large")
                for a in element:
                    if "research output".lower() in a.text.lower():
                        driver.execute_script("arguments[0].click();", a)
                        driver.get(driver.current_url)
                        # Get name of Author
                        name = driver.find_element_by_css_selector("div[class='header person-details']>h1")
                        r = requests.get(driver.current_url)
                        # Parse all the data via BeautifulSoup
                        soup = BeautifulSoup(r.content, 'lxml')
                        # Extracting publication name, publication url, date and CU Authors
                        table = soup.find('ul', attrs={'class': 'list-results'})
                        if table is not None:
                            for row in table.findAll('div', attrs={'class': 'result-container'}):
                                data = {}
                                data['name'] = row.h3.a.text
                                data['pub_url'] = row.h3.a['href']
                                date = row.find("span", class_="date")
                                rowitem = row.find_all(['div'])
                                span = row.find_all(['span'])
                                data['cu_author'] = name.text
                                data['date'] = date.text
                                print("Publication Name :", row.h3.a.text)
                                print("Publication URL :", row.h3.a['href'])
                                print("CU Author :", name.text)
                                print("Date :", date.text)
                                print("\n")
                                pub_data.append(data)
            else:
                # Get name of Author
                name = driver.find_element_by_css_selector("div[class='header person-details']>h1")
                r = requests.get(link)
                # Parse all the data via BeautifulSoup
                soup = BeautifulSoup(r.content, 'lxml')
                # Extracting publication name, publication url, date and CU Authors
                table = soup.find('div', attrs={'class': 'relation-list relation-list-publications'})
                if table is not None:
                    for row in table.findAll('div', attrs={'class': 'result-container'}):
                        data = {}
                        data["name"] = row.h3.a.text
                        data['pub_url'] = row.h3.a['href']
                        date = row.find("span", class_="date")
                        rowitem = row.find_all(['div'])
                        span = row.find_all(['span'])
                        data['cu_author'] = name.text
                        data['date'] = date.text
                        print("Publication Name :", row.h3.a.text)
                        print("Publication URL :", row.h3.a['href'])
                        print("CU Author :", name.text)
                        print("Date :", date.text)
                        print("\n")
                        pub_data.append(data)
        except Exception:
            continue
print("Crawler has scrapped data for ", len(pub_data), " pureportal publications")
driver.quit()
# Writing all the scraped results in a file with JSON format
with open('scraper_results.json', 'w') as f:
ujson.dump(pub_data, f)
initCrawlerScraper('https://pureportal.coventry.ac.uk/en/organisations/coventry-university/persons/', max_profiles=500)
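# A minimal sketch of how the saved results could be read back for indexing
# (assumes the run above has produced scraper_results.json):
#
#     with open('scraper_results.json', 'r', encoding='utf-8') as f:
#         publications = ujson.load(f)
#     print(len(publications), "publications loaded")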