# metaDataCollector.py
from newspaper import Article
import newspaper
from nltk.corpus import stopwords
from pymongo import MongoClient
"""
Collect meta information from a given url, add it to the database
Step 1: identify if the url is of a news source or an article
information to collect example
url:
news_source: "the economist"
authors:
text:
title:
publish_date:
image:
keywords:
summary:
collected_by:
"""
class MetaDataCollector:
    def __init__(self, url):
        """
        Initialize an instance of metadata collection for the given URL.
        :param url: URL of a news article or of a news source's root page
        """
# connection to database
client = MongoClient()
self.db = client['true-news']
self.documents = self.db.documents
# error tracking
self.error = False
self.error_msg = ""
self.news_source_url = url
self.url = url
self.authors = None
self.title = ""
self.text = ""
self.published = None
self.images = None
self.keywords = ""
self.summary = ""
self.news_source = ""
        # Split the URL into its path components; empty pieces (e.g. from a
        # trailing slash) are discarded before classifying the URL.
        url_split = [part for part in self.url.split('://')[-1].split('/') if part]
        if len(url_split) < 1:
            self.error = True
            self.error_msg = "URL not in recognizable format"
        elif len(url_split) > 1:  # URL is an article
            self.news_source = url_split[0]
            if not self.already_visited():
                self.newspaper_article()
        else:  # URL is the root page of a news source
            self.news_source = url_split[0]
            if not self.already_visited(True):
                self.newspaper_newsource()
if self.error:
print(self.error_msg)
    def already_visited(self, news_source=False):
        """
        Return True if the URL is already in the database, and load all of its
        metadata from the database.
        :param news_source: boolean: if True, check whether the news source has
                            already been visited; otherwise check the article
        :return: boolean
        """
        if news_source:
            cursor = self.db.newsSource.find_one({'news_source': self.news_source})
            if cursor:
                return True
            # News source not seen before: record it in the database
            news_source_entry = {
                "reliability": None,  # No automated way to rank reliability at the moment
                "reliability_source": None,  # link for manual research
                "url": self.news_source_url,
                "news_source": self.news_source
            }
            try:
                self.db.newsSource.insert_one(news_source_entry)
            except Exception:
                self.error = True
                self.error_msg = "Failed to add news source to database"
        else:
            self.db_doc = self.documents.find_one({"url": self.url})
            if self.db_doc:
                self.title = self.db_doc["title"]
                self.authors = self.db_doc["authors"]
                self.text = self.db_doc["text"]
                self.published = self.db_doc["publish-date"]
                # try:
                #     self.images = self.db_doc["images"]
                # except KeyError:
                #     print("This article is known but is missing the images parameter")
                try:
                    self.keywords = self.db_doc["keywords"]
                except KeyError:
                    print("This article is known but is missing the keywords parameter")
                try:
                    self.summary = self.db_doc["summary"]
                except KeyError:
                    print("This article is known but is missing the summary parameter")
                return True
        return False
def newspaper_article(self):
"""
Use newspaper3k to collect
-author
-text
-title
-publish_date
-image
:return:
"""
self.article = Article(self.url)
if self.newspaper_download_and_parse():
# Meta Collection
self.authors = self.article.authors
self.published = self.article.publish_date
self.text = self.article.text
self.images = self.article.images
self.title = self.article.title
else:
return self.error_msg
if self.newspaper_nlp():
self.keywords = self.article.keywords
self.summary = self.article.summary
self.add_to_database()
    def newspaper_newsource(self):
        """
        Build the news source with newspaper3k and collect metadata for every
        article found on it.
        :return:
        """
news_source = newspaper.build(self.news_source_url, language='en')
# Number of articles that have not been scraped on this news source
size = news_source.size()
for article in news_source.articles:
self.url = article.url
self.newspaper_article()
    def add_to_database(self):
        """
        Add the collected article metadata to the documents collection.
        :return:
        """
        document = {
            "authors": self.authors,
            "title": self.title,
            "text": self.text,
            "url": self.url,
            "publish-date": self.published,
            # "images": self.images,
            "keywords": self.keywords,
            "summary": self.summary
        }
        # Add article to the article database
        try:
            self.documents.insert_one(document)
        except Exception:
            self.error = True
            self.error_msg = "Failed to add article '" + self.title + "' to database"
# Helper Functions
    def newspaper_download_and_parse(self):
        """
        Helper function: download and parse the article with newspaper3k.
        :return: boolean: True on success, False if either step fails
        """
        # Download Article
        try:
            self.article.download()
        except Exception:
            self.error = True
            self.error_msg = "Article failed to download"
            return False
        # Parse Article
        try:
            self.article.parse()
        except Exception:
            self.error = True
            self.error_msg = "Article failed to parse"
            return False
        return True
    def newspaper_nlp(self):
        """
        Helper function: run newspaper3k's keyword and summary extraction.
        :return: boolean: True on success, False otherwise
        """
        # Natural Language Processing on Article
        try:
            self.article.nlp()
        except Exception:
            self.error = True
            self.error_msg = "Natural Language Processing failed on Article"
            return False
        return True
    def nltk_keywords_stopwords_removal(self):
        """
        Helper function: remove English stopwords from the collected keywords.
        :return:
        """
        # Membership tests against a set are O(1), so build the stopword set once
        # instead of rescanning the stopword list for every keyword.
        stop_words = set(stopwords.words('english'))
        self.keywords = [word for word in self.keywords if word not in stop_words]
# MetaDataCollector("https://www.foxnews.com/")
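
# Minimal usage sketch (assumes a local MongoDB instance on the default port and
# that the newspaper3k and nltk stopwords data are installed). The article URL
# below is hypothetical and only illustrates the article code path.
if __name__ == "__main__":
    # A root URL crawls the whole news source via newspaper_newsource()
    MetaDataCollector("https://www.foxnews.com/")
    # An article URL collects just that page via newspaper_article()
    MetaDataCollector("https://www.foxnews.com/politics/some-article")  # hypothetical URL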