-
Notifications
You must be signed in to change notification settings - Fork 27
/
adasyn.py
239 lines (193 loc) · 8.66 KB
/
adasyn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import warnings
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_array, check_random_state
from collections import Counter
__author__ = 'stavrianos'
# Link to paper: bit.ly/22KgAnP
class ADASYN(object):
    """
    Oversampler implementing the ADASYN algorithm (He et al., 2008),
    exposing the scikit-learn style API: fit, transform and fit_transform.

    Synthetic samples are generated for each minority class whose size,
    relative to the majority class, falls below ``imb_threshold``. The
    amount generated per minority example is weighted by how many of its
    k nearest neighbors belong to a *different* class (harder examples
    get more synthetic data), as defined by equations [1]-[5] of the paper.
    """

    def __init__(self,
                 ratio=0.5,
                 imb_threshold=0.5,
                 k=5,
                 random_state=None,
                 verbose=True):
        """
        :ratio:
            Fraction of the majority/minority population gap to fill.
            For a minority class of size m and majority of size M, roughly
            (M - m) * ratio synthetic examples are generated for it.
        :imb_threshold:
            Imbalance-ratio threshold that allows/denies oversampling.
            A class is oversampled only if its size divided by the
            majority size is <= imb_threshold.
        :k:
            Number of nearest neighbors used (the kNN query asks for k+1
            because the example itself is always returned as a neighbor).
        :random_state:
            Seed (or RandomState) for reproducible sample generation.
        :verbose:
            If True, progress messages are printed to the terminal.

        Extra instance variables set by fit()/oversample():
        :self.X:, :self.y:          data to be oversampled
        :self.clstats:              per-class populations
        :self.unique_classes_:      set of class labels
        :self.maj_class_:           label of the majority class
        :self.random_state_:        validated RandomState instance
        :self.num_new:              number of synthetic samples generated
        :self.index_new:            indices of synthetic samples in the output
        """
        self.ratio = ratio
        self.imb_threshold = imb_threshold
        self.k = k
        self.random_state = random_state
        self.verbose = verbose
        self.clstats = {}
        self.num_new = 0
        self.index_new = []

    def fit(self, X, y):
        """
        Validate the data, compute per-class populations and identify the
        majority class. Stores everything as instance variables.
        """
        self.X = check_array(X)
        self.y = np.array(y).astype(np.int64)
        self.random_state_ = check_random_state(self.random_state)
        self.unique_classes_ = set(self.y)

        # Reset all per-fit state so refitting the same instance does not
        # accumulate counts from a previous call.
        self.num_new = 0
        self.index_new = []
        self.clstats = {element: 0 for element in self.unique_classes_}

        # Count occurrences of each class
        for element in self.y:
            self.clstats[element] += 1

        # Find majority class (label with the largest population)
        self.maj_class_ = max(self.clstats, key=self.clstats.get)

        if self.verbose:
            print(
                'Majority class is %s and total number of classes is %s'
                % (self.maj_class_, len(self.unique_classes_)))
        return self

    def transform(self, X, y):
        """
        Apply the ADASYN oversampling transformation to the fitted data
        and return the oversampled X, y (synthetic samples first, then
        the original data). Requires fit() to have been called.
        """
        # BUG FIX: the original computed the synthetic samples but returned
        # None, contradicting its documented contract.
        self.new_X, self.new_y = self.oversample()
        self.new_X = np.concatenate((self.new_X, self.X), axis=0)
        self.new_y = np.concatenate((self.new_y, self.y), axis=0)
        return self.new_X, self.new_y

    def fit_transform(self, X, y):
        """
        Fit the data and return the transformed (oversampled) version.
        """
        self.fit(X, y)
        return self.transform(X, y)

    def generate_samples(self, x, knns, knnLabels, cl):
        """
        Generate gi synthetic examples for every minority example
        (equation [5] of the paper).

        :x:          indices (into self.X) of the minority examples
        :knns:       kNN index matrix for those examples (k+1 columns;
                     column 0 is the example itself)
        :knnLabels:  class labels corresponding to knns
        :cl:         the minority class label being oversampled
        :returns:    (new_data, new_labels) as numpy arrays
        """
        new_data = []
        new_labels = []
        for ind, elem in enumerate(x):
            # Neighbors of the example that belong to the minority class.
            # kneighbors() returns the example itself as neighbor 0, so we
            # skip it with [1:]. (The original sliced [1:-1], which also
            # discarded the k-th genuine neighbor — off-by-one.)
            min_knns = [ele for index, ele in enumerate(knns[ind][1:])
                        if knnLabels[ind][index + 1] == cl]

            if not min_knns:
                continue

            for _ in range(int(self.gi[ind])):
                # Pick a random minority neighbor. randint's upper bound is
                # exclusive, matching the deprecated random_integers(0, n-1).
                randi = self.random_state_.randint(0, len(min_knns))
                # l is a random interpolation factor in [0, 1)
                l = self.random_state_.random_sample()
                # self.X[min_knns[randi]] is Xzi in equation [5]
                si = self.X[elem] + \
                    (self.X[min_knns[randi]] - self.X[elem]) * l
                new_data.append(si)
                new_labels.append(self.y[elem])
                self.num_new += 1

        return np.asarray(new_data), np.asarray(new_labels)

    def oversample(self):
        """
        Preliminary calculations before generation of synthetic samples.
        Calculates and stores as instance variables: imb_degree (d), G,
        ri, gi as defined by equations [1], [2], [3], [4] of the paper,
        then delegates to generate_samples(). Returns the synthetic
        samples only (originals are concatenated by transform()).
        """
        if not hasattr(self, 'unique_classes_'):
            raise RuntimeError("You need to fit() before applying transform(),"
                               " or simply fit_transform()")

        # Placeholder first row so np.concatenate has a 2-D seed; it is
        # stripped before returning.
        int_X = np.zeros([1, self.X.shape[1]])
        int_y = np.zeros([1])

        # Iterate through all classes to determine which should be
        # oversampled and to what extent.
        for cl in self.unique_classes_:
            # Imbalance degree (equation [1]) compared to the threshold
            imb_degree = float(self.clstats[cl]) / \
                self.clstats[self.maj_class_]
            if imb_degree > self.imb_threshold:
                if self.verbose:
                    print('Class %s is within imbalance threshold' % cl)
                continue

            # G (equation [2]): number of synthetic examples to produce
            # for the current minority class.
            self.G = (self.clstats[self.maj_class_] - self.clstats[cl]) \
                * self.ratio

            # ADASYN is built upon Euclidean distance, sklearn's default
            # (p=2). k+1 neighbors because the query point is included.
            self.nearest_neighbors_ = NearestNeighbors(n_neighbors=self.k + 1)
            self.nearest_neighbors_.fit(self.X)

            # Indices of the current minority class examples
            minx = [ind for ind in range(len(self.X)) if self.y[ind] == cl]

            # kNN indices for every minority example
            knn = self.nearest_neighbors_.kneighbors(
                self.X[minx], return_distance=False)

            # Labels of each example's neighbors, to count how many belong
            # to a different class than the one being oversampled.
            knnLabels = self.y[knn.ravel()].reshape(knn.shape)
            tempdi = [Counter(i) for i in knnLabels]

            # ri (equation [3]): fraction of the k neighbors that belong to
            # a different class. The query point itself is class cl, so it
            # cancels out of (sum - count[cl]).
            self.ri = np.array(
                [(sum(i.values()) - i[cl]) / float(self.k) for i in tempdi])

            # Normalize ri into a density distribution (sum(ri) = 1),
            # guarding against the all-zero case.
            if np.sum(self.ri):
                self.ri = self.ri / np.sum(self.ri)

            # gi (equation [4]): synthetic examples per minority instance,
            # rounded to the nearest integer.
            self.gi = np.rint(self.ri * self.G)

            inter_X, inter_y = self.generate_samples(
                minx, knn, knnLabels, cl)

            # If no samples were generated, skip concatenation entirely.
            if len(inter_X):
                int_X = np.concatenate((int_X, inter_X), axis=0)
            if len(inter_y):
                int_y = np.concatenate((int_y, inter_y), axis=0)

        # index_new holds the indices of the artificial examples, which sit
        # at the beginning of the arrays returned by transform().
        self.index_new = list(range(self.num_new))

        # Strip only the zero placeholder row. (The original returned
        # [1:-1], silently dropping the last synthetic sample.)
        return int_X[1:], int_y[1:]