-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWebScrape.py
62 lines (47 loc) · 2.47 KB
/
WebScrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Advanced-search URL with every filter left blank, so the listing page
# returns all missing-person cases on The Charley Project.
URL = "https://charleyproject.org/case-searches/advanced-search?first-name=&middle-name=&last-name=&suffix=&sex=&missing-since=&missing-from-city=&missing-from-state=&classification=&date-of-birth=&age=&height-and-weight=&distinguishing-chars=&clothing-jewelry-desc=&medical-conditions=&details-of-disappearance=&investigating-agency=&source-information="
#URL = "https://charleyproject.org/case-searches/alphabetical-cases?letter=U" #Smaller Scale Test

CSV_PATH = "Test.csv"   # output file (same name the original script used)
SAVE_EVERY = 100        # checkpoint interval, in cases
TIMEOUT = 30            # seconds; without this a stalled socket hangs the run forever


def _field(info_column, index, label):
    """Return the text of the index-th <li> in *info_column*.

    Strips the leading field *label* (e.g. "Missing From") and any tab
    characters, matching the original per-field cleanup.  Returns "" if the
    <li> is absent so one sparse case page cannot crash the whole run.
    """
    items = info_column.find_all("li")
    if index >= len(items):
        return ""
    return items[index].text.replace(label, "").replace("\t", "").strip()


def scrape_case(session, url):
    """Fetch one case page and return its fields as a dict.

    Raises requests.RequestException on network failure and
    AttributeError/IndexError if the page layout is unexpected; the caller
    decides whether to skip or abort.
    """
    page = session.get(url, timeout=TIMEOUT)
    article = BeautifulSoup(page.content, "html.parser").find("article")
    # Hoisted: the original re-ran this find()/find_all() chain for every field.
    columns = article.find(id="case-top").find_all(class_="column")
    info = columns[1]
    img = columns[0].find_all("li")[0].find("img")
    # NOTE: the original stored the "Missing From" <li> under the 'Location'
    # column and the "Missing Since" <li> under 'Missing From'; that mapping
    # is preserved here so the output CSV is unchanged.
    return {
        "Name": article.find(class_="title entry-title is-1").text,
        "Location": _field(info, 1, "Missing From"),
        "Missing From": _field(info, 0, "Missing Since"),
        "Sex": _field(info, 3, "Sex"),
        "Race": _field(info, 4, "Race"),
        "Image": img.get("src") if img is not None else "",
    }


def _save(records):
    """Write the accumulated records to the checkpoint CSV."""
    pd.DataFrame(records).to_csv(CSV_PATH)


def main():
    """Scrape every case linked from the listing page into Test.csv."""
    session = requests.Session()  # reuse one connection across hundreds of requests
    listing = session.get(URL, timeout=TIMEOUT)
    soup = BeautifulSoup(listing.content, "html.parser")
    cases = soup.find(id="case-container").find_all(class_="case")

    records = []
    for i, case in enumerate(cases):
        link = case.find("a").get("href")
        try:
            records.append(scrape_case(session, link))
        except (requests.RequestException, AttributeError, IndexError) as exc:
            # One malformed or unreachable page should not abort hours of work.
            print(f"skipping {link}: {exc}")
            continue
        # Show progress and checkpoint to CSV every SAVE_EVERY cases.
        if i % SAVE_EVERY == 0:
            print(i)
            _save(records)

    # After the loop finishes, save one final time.
    _save(records)


if __name__ == "__main__":
    main()