-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutils.py
69 lines (59 loc) · 2.32 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# -*- coding: utf-8 -*-
import jieba
import re
stopwords = []
with open("data/comsentiment/stopwords.txt", "r", encoding="utf8") as f:
for w in f:
stopwords.append(w.strip())
def load_corpus(path):
"""
加载语料库
"""
data = []
with open(path, "r", encoding="utf8") as f:
for line in f:
[_, seniment, content] = line.split(",", 2)
content = processing(content)
data.append((content, int(seniment)))
return data
def load_corpus_bert(path):
"""
加载语料库
"""
data = []
with open(path, "r", encoding="utf8") as f:
for line in f:
[_, seniment, content] = line.split(",", 2)
content = processing_bert(content)
data.append((content, int(seniment)))
return data
def processing(text):
"""
数据预处理, 可以根据自己的需求进行重载
"""
# 数据清洗部分
text = re.sub("\{%.+?%\}", " ", text) # 去除 {%xxx%} (地理定位, 微博话题等)
text = re.sub("@.+?( |$)", " ", text) # 去除 @xxx (用户名)
text = re.sub("【.+?】", " ", text) # 去除 【xx】 (里面的内容通常都不是用户自己写的)
text = re.sub("\u200b", " ", text) # '\u200b'是这个数据集中的一个bad case, 不用特别在意
# 分词
words = [w for w in jieba.lcut(text) if w.isalpha()]
# 对否定词`不`做特殊处理: 与其后面的词进行拼接
while "不" in words:
index = words.index("不")
if index == len(words) - 1:
break
words[index: index+2] = ["".join(words[index: index+2])] # 列表切片赋值的酷炫写法
# 用空格拼接成字符串
result = " ".join(words)
return result
def processing_bert(text):
"""
数据预处理, 可以根据自己的需求进行重载
"""
# 数据清洗部分
text = re.sub("\{%.+?%\}", " ", text) # 去除 {%xxx%} (地理定位, 微博话题等)
text = re.sub("@.+?( |$)", " ", text) # 去除 @xxx (用户名)
text = re.sub("【.+?】", " ", text) # 去除 【xx】 (里面的内容通常都不是用户自己写的)
text = re.sub("\u200b", " ", text) # '\u200b'是这个数据集中的一个bad case, 不用特别在意
return text