-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathworkers2.py
60 lines (48 loc) · 2.46 KB
/
workers2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import pandas as pd
def pos_tag(content, pos_filter, sentence_terminators=(".",)):
    """
    Tokenize and POS-tag a text, count its sentences and tokens, and keep only the tokens whose POS tag is in the filter
    Parameters:
        content : text to analyse; it is passed as-is to nltk's word_tokenize
        pos_filter (list) : List of POS tags (as produced by nltk.pos_tag) to retain
        sentence_terminators : collection of tokens that each mark the end of one sentence.
            Defaults to ('.',), i.e. only full stops are counted; pass e.g. ('.', '?', '!')
            to also count questions and exclamations.
    Returns:
        sentence_count (int): number of tokens equal to one of the sentence terminators
        word_count (int): total number of tokens, punctuation and terminators included
        filtered_content (list): (word, pos) tuples, in text order, whose pos is in pos_filter
    """
    tokens = word_tokenize(content)
    # Perform POS tagging
    tagged_tokens = nltk.pos_tag(tokens)

    sentence_count = 0
    filtered_content = []
    for word, pos in tagged_tokens:
        if word in sentence_terminators:
            # A terminator only bumps the sentence count; it is never added to
            # the filtered output, even if its tag is listed in pos_filter.
            sentence_count += 1
        elif pos in pos_filter:
            filtered_content.append((word, pos))

    # Every tagged token counts as a "word".
    word_count = len(tagged_tokens)
    return sentence_count, word_count, filtered_content
def count_words_2(args):
    """
    Count (word, pos) occurrences in the original and in the modified content of one article,
    keeping only the words whose POS tag is included in the filter list
    Parameters:
        args: which is a tuple of:
            row : pd dataframe row (or any mapping) providing the keys 'index', 'url',
                'content orig' and 'content mod'
            pos_filter (list) : List of POS tags to retain for the word count
    Returns:
        a tuple of:
            Counter: (word,pos) -> number of occurrences in the original content
            Counter: (word,pos) -> number of occurrences in the modified content
            df: one-row dataframe, indexed by row['index'], holding the url plus the
                # of sentences and # of words of both the original and the modified content
    """
    row, pos_filter = args

    # Tag and filter both versions of the article with the same POS filter
    sentence_count_orig, word_count_orig, filtered_content_orig = pos_tag(row['content orig'], pos_filter)
    sentence_count_mod, word_count_mod, filtered_content_mod = pos_tag(row['content mod'], pos_filter)

    # Single-row metadata frame, keyed by the article's own index value
    df = pd.DataFrame(columns=['url', 'sentence count orig', 'word count orig', 'sentence count mod', 'word count mod'])
    df.loc[row['index']] = [row['url'], sentence_count_orig, word_count_orig, sentence_count_mod, word_count_mod]

    return (Counter(filtered_content_orig), Counter(filtered_content_mod), df)