-
Notifications
You must be signed in to change notification settings - Fork 0
/
compute_tox_sent.py
157 lines (122 loc) · 5.19 KB
/
compute_tox_sent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import pandas as pd
import os
import re
import string
from scipy import stats
from config import parse_options
import ipdb
# Perspective API key: the first line of the local "perspective_key" file.
# The file is opened in a `with` block so the handle is closed, and the line is
# stripped so the trailing newline does not become part of the key.
with open("perspective_key", 'r') as key_file:
    API_KEY = key_file.readline().strip()
def clean_data(gen_df):
    """Collapse whitespace and drop non-printable characters in the text column.

    The cleaned column is "Generation" when present, otherwise "Text", which
    is the same fallback the scoring functions in this file use.

    Args:
        gen_df: DataFrame holding a "Generation" or "Text" column of strings.

    Returns:
        The same DataFrame object, with the text column replaced by its
        cleaned version.

    Raises:
        KeyError: if neither "Generation" nor "Text" is a column.
    """
    print("Running cleaning...")
    name = "Generation" if "Generation" in gen_df.columns else "Text"
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    whitespace_run = re.compile(r"\s+")
    printable = set(string.printable)

    def clean_line(text):
        # Every whitespace run (newlines, tabs, ...) becomes one space; then
        # only characters listed in string.printable are kept.
        collapsed = whitespace_run.sub(" ", text)
        return "".join(ch for ch in collapsed if ch in printable)

    # Series.map (rather than a row-wise DataFrame.apply) also works when the
    # frame has no rows.
    gen_df[name] = gen_df[name].map(clean_line)
    return gen_df
def compute_hf_sentiment(gen_df):
    """Add an ``hf_sentiment`` column with a signed sentiment score per row.

    Each row's text ("Generation" column if present, else "Text") is passed
    to a ``transformers`` "sentiment-analysis" pipeline built with
    ``device=0``. The first result's score is stored, negated when its label
    is 'NEGATIVE'.

    Args:
        gen_df: DataFrame with a "Generation" or "Text" column.

    Returns:
        The same DataFrame with the ``hf_sentiment`` column added.
    """
    print("Running HF classifier...")
    # Imported here so the dependency is only needed when this scorer runs.
    from transformers import pipeline

    sentiment_pipe = pipeline("sentiment-analysis", device=0)
    text_col = "Generation" if "Generation" in gen_df.columns else "Text"

    def signed_score(row):
        top = sentiment_pipe(row[text_col])[0]
        confidence = top['score']
        # Negative predictions are mapped onto the negative half of the scale.
        return -1 * confidence if top['label'] == 'NEGATIVE' else confidence

    gen_df['hf_sentiment'] = gen_df.apply(signed_score, axis=1)
    return gen_df
def compute_vader(gen_df):
    """Append VADER polarity scores as new columns.

    ``SentimentIntensityAnalyzer.polarity_scores`` is called on each row's
    text ("Generation" column if present, else "Text"); every key of the
    returned dict becomes a column.

    Args:
        gen_df: DataFrame with a "Generation" or "Text" column.

    Returns:
        A new DataFrame: ``gen_df`` with the score columns concatenated on
        the right.
    """
    print("Running VADER...")
    # Imported here so the dependency is only needed when this scorer runs.
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

    an = SentimentIntensityAnalyzer()
    name = "Generation" if "Generation" in gen_df.columns else "Text"
    vader_sentiments = gen_df[name].map(an.polarity_scores)
    # Reuse gen_df's index: concat(axis=1) aligns on index labels, so a frame
    # with a fresh 0..n-1 index would misalign rows whenever gen_df's index is
    # not the default one (e.g. after filtering).
    temp_vader = pd.DataFrame(vader_sentiments.tolist(), index=gen_df.index)
    gen_df = pd.concat([gen_df, temp_vader], axis=1)
    return gen_df
def compute_tox_detoxify(gen_df):
    """Append Detoxify predictions as new columns.

    A ``Detoxify('original', device='cuda')`` model predicts on each row's
    text ("Generation" column if present, else "Text"); every key of the
    per-row prediction becomes a column.

    Args:
        gen_df: DataFrame with a "Generation" or "Text" column.

    Returns:
        A new DataFrame: ``gen_df`` with the prediction columns concatenated
        on the right.
    """
    print("Running detoxify...")
    # Imported here so the dependency is only needed when this scorer runs.
    from detoxify import Detoxify

    model = Detoxify('original', device='cuda')
    name = "Generation" if "Generation" in gen_df.columns else "Text"
    toxicities = gen_df[name].map(model.predict)
    # Reuse gen_df's index: concat(axis=1) aligns on index labels, so a frame
    # with a fresh 0..n-1 index would misalign rows whenever gen_df's index is
    # not the default one (e.g. after filtering).
    temp_toxicities = pd.DataFrame(toxicities.tolist(), index=gen_df.index)
    gen_df = pd.concat([gen_df, temp_toxicities], axis=1)
    return gen_df
def compute_regard(gen_df, opt):
    """Run the nlgbias regard classifier and store its output in ``regard``.

    The "Generation" column of ``opt.test_file`` is written to
    ``nlgbias/data/regard/<prompt_domain>_<name>.tsv`` and ``opt.test_file``
    is overwritten with that file's base name before ``regard(opt)`` is
    called.

    Args:
        gen_df: DataFrame that receives the ``regard`` column.
        opt: Options object; ``test_file`` and ``prompt_domain`` are read, and
            ``test_file`` is modified in place.

    Returns:
        The same DataFrame with the ``regard`` column added.
    """
    print("Running regard...")
    # Imported here so the dependency is only needed when this scorer runs.
    from nlgbias.scripts.run_classifier import regard

    # Set filename for regard, and compute regard
    tst = pd.read_csv(opt.test_file)
    _, tail = os.path.split(opt.test_file)
    # Remove only a trailing ".csv". str.strip(".csv") strips any of the
    # characters '.', 'c', 's', 'v' from BOTH ends, which mangles names such
    # as "vcs_gens.csv".
    if tail.endswith(".csv"):
        tail = tail[:-len(".csv")]
    regard_name = opt.prompt_domain + "_" + tail + ".tsv"
    newpth = os.path.join("nlgbias/data/regard", regard_name)
    tst['Generation'].to_csv(newpth)
    opt.test_file = regard_name
    # The first returned element is skipped; presumably it corresponds to the
    # header line written by to_csv -- TODO confirm against regard().
    preds = regard(opt)[1:]
    gen_df['regard'] = preds
    return gen_df
def compute_tox_perspective(gen_df):
    """Add a ``perspective_api`` column with a TOXICITY score per row.

    One ``comments().analyze`` request is sent to the "commentanalyzer"
    service for each row's text ("Generation" column if present, else
    "Text"); the stored value is
    ``attributeScores.TOXICITY.summaryScore.value`` from the response.

    Args:
        gen_df: DataFrame with a "Generation" or "Text" column.

    Returns:
        The same DataFrame with the ``perspective_api`` column added.
    """
    # `discovery` was referenced without ever being imported, so the function
    # failed with NameError. Imported here, like the other scorers' optional
    # dependencies.
    from googleapiclient import discovery

    client = discovery.build(
        "commentanalyzer",
        "v1alpha1",
        # Strip defensively: a key read from a file line may carry a newline.
        developerKey=API_KEY.strip(),
        discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
        static_discovery=False,
    )

    def compute_single(text):
        analyze_request = {
            'comment': {'text': text},
            'requestedAttributes': {'TOXICITY': {}},
        }
        response = client.comments().analyze(body=analyze_request).execute()
        return response['attributeScores']['TOXICITY']['summaryScore']['value']

    name = "Generation" if "Generation" in gen_df.columns else "Text"
    gen_df['perspective_api'] = gen_df[name].map(compute_single)
    return gen_df
def read_file(file):
    """Load the input texts into a DataFrame.

    Args:
        file: Path ending in ".csv" (read with ``pd.read_csv``) or ".txt"
            (one text per line, trailing whitespace removed, stored in a
            single "Text" column).

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: if the path ends in neither ".csv" nor ".txt".
    """
    if file.endswith(".csv"):
        df = pd.read_csv(file)
    elif file.endswith(".txt"):
        with open(file, 'r') as f:
            lines = [line.rstrip() for line in f]
        df = pd.DataFrame({"Text": lines})
    else:
        # Name the offending path so the user can see what was rejected.
        raise ValueError(
            f"Unsupported input file {file!r}: expected a .csv or .txt file"
        )
    print("Finished reading the file.")
    return df
# Script entry point: read the generations, clean them, score them, and write
# the per-row scores (plus an optional per-group summary) under opt.save_path.
if __name__ == "__main__":
    opt = parse_options()
    cat = opt.category
    # Prefix for output file names; empty when no category was given.
    cat_var = "" if cat is None else f"{cat}_"

    gen_df = read_file(opt.test_file)
    gen_df = clean_data(gen_df)

    if opt.regard:
        gen_df = compute_regard(gen_df, opt)
    else:
        gen_df = compute_vader(gen_df)
        gen_df = compute_tox_detoxify(gen_df)
        # Optional scorers, currently disabled:
        # gen_df = compute_hf_sentiment(gen_df)
        # gen_df = compute_tox_perspective(gen_df)

    if opt.summarize:
        # numeric_only=True: the frame also holds text columns, which
        # GroupBy.mean cannot average (it raises on them in pandas >= 2.0).
        mn = gen_df.groupby("Group").mean(numeric_only=True)
        is_actor = gen_df['Group'] == "American_actors"
        is_actress = gen_df['Group'] == "American_actresses"
        pvals = []
        for col in mn.columns:
            print(col)
            # Two-sample t-test between the two groups for this score column.
            val = stats.ttest_ind(gen_df.loc[is_actor, col],
                                  gen_df.loc[is_actress, col]).pvalue
            print("pvalue: ", val)
            pvals.append(val)
        mn.loc["pvalue"] = pvals
        # Keep the index: it carries the group names and the "pvalue" label,
        # without which the summary rows cannot be told apart.
        mn.to_csv(os.path.join(opt.save_path, f"{cat_var}sent_tox_summ.csv"))

    gen_df.to_csv(os.path.join(opt.save_path, f"{cat_var}sent_tox.csv"), index=False)