-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1.data.py
122 lines (115 loc) · 3.92 KB
/
1.data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# %%
from collections import Counter

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import preprocessing
# %%
# Load the training pairs from train.csv in the working directory.
data = pd.read_csv('train.csv')
# %%
print(f'Number of data points in train set {data.shape[0]}')
# %%
# Column names, dtypes and non-null counts.
data.info()
# %%
# Row count per is_duplicate value. groupby sorts its keys, so the labels
# below assume is_duplicate is 0/1 with 0 meaning "not duplicate" -- TODO confirm.
duplicate = data.groupby('is_duplicate')['id'].count()
pie_labels = ['Non-Duplicate', 'Duplicate']
plt.figure(figsize=(5, 5))
plt.pie(duplicate, labels=pie_labels, autopct='%1.1f%%')
plt.title('Duplicate vs Non-Duplicate values')
plt.show()
# %%
# Every question id from both sides of the pairs, one entry per occurrence.
questions = data['qid1'].tolist() + data['qid2'].tolist()
# %%
# store maps question id -> number of times it occurs across qid1 and qid2.
# Counter replaces the hand-written counting loop (and its bare `except: pass`,
# which could only hide real errors). The result is converted back to a plain
# dict so that looking up an unknown id still raises KeyError instead of
# silently returning 0.
# The former `values = np.unique(questions)` / `values = []` pair was dropped:
# the first result was overwritten immediately and the name was never read.
store = dict(Counter(questions))
# %%
# Largest occurrence count held by any single question id.
max_n = max(store.values())
# %%
# Number of question ids that occur more than once.
counts = sum(1 for occurrences in store.values() if occurrences > 1)
print(f'number of unique questions {len(store)}')
print(f'maximum time a question appears more then once {max_n}')
print(f'Total number question appear more then once {counts}')
# %%
# store is a dict mapping each question id to its frequency.
# counts is the number of question ids that appear more than once.
plt.bar(['unique', 'repeated'], [len(store), counts], color=['r', 'b'])
plt.xlabel('Types of questions')
plt.ylabel('Frequency')
# show must be called; the bare attribute reference `plt.show` did nothing
# when the file was run as a script.
plt.show()
# %%
# Rows whose (qid1, qid2) combination was already seen in an earlier row.
repeated_pair_mask = data.duplicated(subset=['qid1', 'qid2'])
duplicates = data[repeated_pair_mask]
print(f'Number of duplicate rows {duplicates.shape[0]}')
# %%
# %%
# Distribution of per-question occurrence counts. The y axis is logarithmic
# because most questions occur only a handful of times.
plt.figure(figsize=(20, 10))
# Materialize the dict view so hist receives a plain sequence of numbers.
plt.hist(list(store.values()), bins=170)
plt.yscale('log')
plt.xlabel('Frequency of questions')
plt.ylabel('Number of questions')
# show must be called; the bare attribute reference `plt.show` did nothing
# when the file was run as a script.
plt.show()
# %%
# Missing values per column (displayed when executed as an interactive cell).
data.isnull().sum()
# %%
# Index labels of every row that has at least one missing value.
# axis is passed by keyword: the positional form `any(1)` is deprecated in
# pandas 1.x and raises TypeError from pandas 2.0 on.
null_data = data[data.isnull().any(axis=1)].index.tolist()
# %%
null_data
# %%
# Remove those rows; the remaining rows keep their original index labels.
data = data.drop(null_data)
# %%
# Re-check: every column should now report zero missing values.
data.isnull().sum()
# %%
"""
freq_q1 = Frequency of qid1 (number of times it occurs in qid1 and qid2 combined)
freq_q2 = Frequency of qid2 (number of times it occurs in qid1 and qid2 combined)
len_q1 = Length of question1 in characters
len_q2 = Length of question2 in characters
words_q1 = Number of words in question1
words_q2 = Number of words in question2
common_words = Number of common unique words in question1 and question2
total_words = Total number of words in question1 + total number of words in question2
freq_sum = Sum of the frequencies of qid1 and qid2
freq_dif = Absolute difference of the frequencies of qid1 and qid2
word_share = common_words / total_words
"""
# %%
def matcher(a, b):
    """Return how many distinct tokens two strings have in common.

    Each string is lower-cased, stripped of surrounding whitespace and split
    on single space characters; the result is the size of the intersection
    of the two token sets.

    Because the split is on exactly ``' '``, runs of consecutive spaces (and
    the empty string) produce empty-string tokens, and those take part in the
    comparison like any other token.
    """
    tokens_a = set(a.lower().strip().split(' '))
    tokens_b = set(b.lower().strip().split(' '))
    return len(tokens_a & tokens_b)
def simple_feat(data, *, freq=None, out_path="simple_features.csv"):
    """Add basic frequency and length features to a question-pair frame.

    The frame is modified in place and also returned. Columns added:

    * ``freq_q1`` / ``freq_q2``   -- occurrence count of qid1 / qid2 in ``freq``
    * ``len_q1`` / ``len_q2``     -- character length of question1 / question2
    * ``words_q1`` / ``words_q2`` -- number of single-space-separated tokens
    * ``common_words``            -- ``matcher(question1, question2)``
    * ``total_words``             -- ``words_q1 + words_q2``
    * ``word_share``              -- ``common_words / total_words``
    * ``freq_sum``                -- ``freq_q1 + freq_q2``
    * ``freq_dif``                -- ``abs(freq_q1 - freq_q2)``

    Args:
        data: DataFrame with ``qid1``, ``qid2``, ``question1`` and
            ``question2`` columns; the question columns must hold strings.
        freq: Mapping from question id to occurrence count. Defaults to the
            module-level ``store`` table.
        out_path: Where the augmented frame is written as CSV (without the
            index). Pass ``None` to skip writing.

    Returns:
        The same ``data`` object, with the feature columns added.

    Raises:
        KeyError: If a question id in ``data`` is missing from ``freq``.
    """
    if freq is None:
        # Fall back to the frequency table built earlier in this script.
        freq = store

    data['freq_q1'] = [freq[qid] for qid in data['qid1']]
    data['freq_q2'] = [freq[qid] for qid in data['qid2']]

    data['len_q1'] = [len(text) for text in data['question1']]
    data['len_q2'] = [len(text) for text in data['question2']]

    data['words_q1'] = [len(text.split(' ')) for text in data['question1']]
    data['words_q2'] = [len(text.split(' ')) for text in data['question2']]

    data['common_words'] = [
        matcher(q1, q2)
        for q1, q2 in zip(data['question1'], data['question2'])
    ]
    data['total_words'] = data['words_q1'] + data['words_q2']
    # total_words is at least 2 (str.split(' ') never returns an empty list),
    # so this division cannot hit zero.
    data['word_share'] = data['common_words'] / data['total_words']

    data['freq_sum'] = data['freq_q1'] + data['freq_q2']
    # Absolute difference, as the feature description in this file specifies;
    # a signed difference would change sign when the two questions of a pair
    # are swapped.
    data['freq_dif'] = (data['freq_q1'] - data['freq_q2']).abs()

    if out_path is not None:
        data.to_csv(out_path, index=False)
    return data
# %%
# Adds the feature columns to data in place and writes simple_features.csv.
simple_feat(data)
data.info()
# %%
# Shortest and longest question, in characters, on each side of the pairs.
q1_lengths = data['len_q1']
q2_lengths = data['len_q2']
print('Question with minimum length in question1', min(q1_lengths))
print('Question with minimum length in question2', min(q2_lengths))
print('Question with maximum length in question1', max(q1_lengths))
print('Question with maximum length in question2', max(q2_lengths))
# %%