-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathworkers.py
40 lines (34 loc) · 1.44 KB
/
workers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import pandas as pd
def count_words(args):
    """
    Count the occurrences of (word, POS tag) pairs in one article, keeping only
    the pairs whose POS tag is in the filter, and collect the article metadata.

    Parameters:
        args: a tuple of:
            row : pd dataframe row (or any mapping) providing the keys
                  'content', 'url', 'date' and 'index'
            pos_filter (list) : POS tags to retain for the word count

    Returns:
        Counter: Counter object from collections, keyed by (word, pos), whose
                 value is the number of occurrences of that pair in the article.
                 The '.' token is never included, even if its tag is in
                 pos_filter.
        df: one-row DataFrame, indexed by row['index'], with the columns
            'url', 'year', 'month', 'sentence count' and 'word count'.
            'sentence count' is the number of '.' tokens; 'word count' is the
            total number of tokens produced by the tokenizer (punctuation
            tokens included).
    """
    row, pos_filter = args

    # Build the tag lookup once so each per-token membership test is O(1)
    # instead of a linear scan over the filter list.
    allowed_tags = frozenset(pos_filter)

    # Tokenize the article text, then attach a POS tag to every token.
    tokens = word_tokenize(row['content'])
    tagged_tokens = nltk.pos_tag(tokens)

    sentence_count = 0
    filtered_content = []
    for word, pos in tagged_tokens:
        if word == '.':
            # A full-stop token is treated as the end of a sentence.
            sentence_count += 1
        elif pos in allowed_tags:
            filtered_content.append((word, pos))

    # Every token counts as a word, including punctuation tokens.
    word_count = len(tagged_tokens)

    # Year and month are taken by position from the date string
    # (presumably formatted like 'YYYY-MM-DD...'; verify against caller).
    date = row['date']
    df = pd.DataFrame(columns=['url', 'year', 'month', 'sentence count', 'word count'])
    df.loc[row['index']] = [row['url'], date[0:4], date[5:7], sentence_count, word_count]

    return (Counter(filtered_content), df)