-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtext_analysis.py
88 lines (73 loc) · 2.77 KB
/
text_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import os
def word_count(files_folder, list_words):
    """
    Count, for each word, how many lines of the folder's files contain it.

    A line is counted once per word even when the word occurs several
    times in that line (the check is a substring test on the whole line).

    Params:
        files_folder: path to the folder to scan; only regular files located
            directly inside it are read, as UTF-8 text
        list_words: list of words to be counted, or a single string of
            comma-separated words
    Outputs:
        result: Dict with key of word, value of the number of lines that
            contain it. Words that never appear are absent from the dict.
    """
    if isinstance(list_words, str):
        list_words = list_words.split(",")
    result = {}
    # os.listdir() also returns sub-directories; passing one to open() raises,
    # so keep regular files only.
    candidates = (os.path.join(files_folder, name) for name in os.listdir(files_folder))
    list_files = [path for path in candidates if os.path.isfile(path)]
    num_lines = 0
    # Iter through files
    for file_path in list_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            # Iter through lines lazily instead of loading the whole file
            for line in f:
                num_lines += 1
                # Iter through each word
                for word in list_words:
                    if word in line:
                        result[word] = result.get(word, 0) + 1
    print(f"Finished -> Through {len(list_files)} files, {num_lines} lines")
    return result
def word_combine(files_folder, list_combine):
    """
    Find, per file, a line in which every word of a combination appears.

    Params:
        files_folder: path to the folder to scan; only regular files located
            directly inside it are read, as UTF-8 text
        list_combine: list of words that must all occur in the same line, or
            a single string of comma-separated words
    Outputs:
        result: Dict with key of file name (not the full path), value of the
            matching line exactly as read, line ending included. When several
            lines of one file match, the last one is kept. Files without a
            matching line are absent from the dict.
    """
    if isinstance(list_combine, str):
        list_combine = list_combine.split(",")
    result = {}
    # os.listdir() also returns sub-directories; passing one to open() raises,
    # so keep regular files only.
    list_files = [
        name
        for name in os.listdir(files_folder)
        if os.path.isfile(os.path.join(files_folder, name))
    ]
    num_lines = 0
    # Iter through files
    for file_name in list_files:
        file_path = os.path.join(files_folder, file_name)
        with open(file_path, 'r', encoding='utf-8') as f:
            # Iter through lines lazily instead of loading the whole file
            for line in f:
                num_lines += 1
                # Keep the line only when every word of the combination is in it
                if all(word in line for word in list_combine):
                    result[file_name] = line
    print(f"Finished -> Through {len(list_files)} files, {num_lines} lines")
    return result
if __name__ == '__main__':
    # Demo run over the news folder. The path is built with os.path.join so it
    # resolves on every OS (a literal "data\news" only works on Windows).
    news_folder = os.path.join("data", "news")

    # Count the lines mentioning each country name.
    list_word_countries = ['朝鲜', '美国', '台湾', '巴西', '以色列', '乌克兰', '意大利']
    result_count = word_count(files_folder=news_folder, list_words=list_word_countries)
    print(result_count)

    # Find files having a line that mentions both words together.
    list_word_combine = ['美国', '元首']
    result_combine = word_combine(files_folder=news_folder, list_combine=list_word_combine)
    print(f"combine count: {len(result_combine)}")
    if result_combine:
        # result_combine is keyed by file name, so indexing it with 0 raises
        # KeyError; show the line stored for the first file instead.
        print(next(iter(result_combine.values())))