-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathHVDM.py
92 lines (78 loc) · 3.73 KB
/
HVDM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import numpy as np
from distython import VDM
class HVDM(VDM):
    def __init__(self, X, y_ix, cat_ix, nan_equivalents=(np.nan, 0), normalised="variance"):
        """ Heterogeneous Value Difference Metric
        Distance metric class which initializes the parameters
        used in the hvdm() function.
        Parameters
        ----------
        X : array-like of shape = [n_rows, n_features]
            Dataset that will be used with HVDM. Needs to be provided
            here because minimum and maximum values from numerical
            columns have to be extracted
        y_ix : int array-like, list of shape [1]
            Single element array with indices for the categorical output variable
            If y is numerical it should be converted to categorical (if it makes sense)
        cat_ix : array-like of shape = [cat_columns_number]
            List containing categorical feature indices
        nan_equivalents : array-like of shape = [x]
            List containing missing value indicators; defaults to a tuple
            (immutable) so the default is never shared mutable state
        normalised : string
            Normalises the Euclidean distance function for numerical variables
            Can be set as "std" (4 * standard deviation). Any other value
            uses the column range (max - min)
        Returns
        -------
        None
        """
        # Initialize the VDM object (fits the conditional probabilities
        # used for categorical columns)
        super().__init__(X, y_ix, cat_ix)
        self.nan_eqvs = nan_equivalents
        self.cat_ix = cat_ix
        # All column indices; numerical columns are derived by set difference
        self.col_ix = list(range(X.shape[1]))
        # Get the normalization scheme for numerical variables
        if normalised == "std":
            self.range = 4 * np.nanstd(X, axis=0)
        else:
            self.range = np.nanmax(X, axis=0) - np.nanmin(X, axis=0)
        # Guard against zero-width (constant) columns: without this,
        # hvdm() divides by zero and produces nan/inf distances.
        # With range set to 1, identical values correctly yield distance 0.
        self.range = np.where(self.range == 0, 1.0, self.range)

    def hvdm(self, x, y):
        """ Heterogeneous Value Difference Metric
        Distance metric function which calculates the distance
        between two instances. Handles heterogeneous data and missing values.
        For categorical variables, it uses the conditional probability
        that the output class is 'c' given that attribute 'a' has value 'n'.
        For numerical variables, it uses a normalized Euclidean distance.
        It can be used as a custom defined function for distance metrics
        in Scikit-Learn
        Parameters
        ----------
        x : array-like of shape = [n_features]
            First instance
        y : array-like of shape = [n_features]
            Second instance
        Returns
        -------
        result: float
            Returns the result of the distance metrics function
            (sum of squared per-feature distances; see note below)
        """
        # Initialise results array
        results_array = np.zeros(x.shape)
        # Get indices for missing values, if any
        # (a value is "missing" if it matches a nan-equivalent or is NaN)
        nan_x_ix = np.flatnonzero(np.logical_or(np.isin(x, self.nan_eqvs), np.isnan(x)))
        nan_y_ix = np.flatnonzero(np.logical_or(np.isin(y, self.nan_eqvs), np.isnan(y)))
        nan_ix = np.unique(np.concatenate((nan_x_ix, nan_y_ix)))
        # Missing values contribute the maximum distance of 1
        results_array[nan_ix] = 1
        # Get categorical indices without missing values elements
        cat_ix = np.setdiff1d(self.cat_ix, nan_ix)
        # Calculate the distance for categorical elements via VDM
        results_array[cat_ix] = super().vdm(x, y, nan_ix)[cat_ix]
        # Get numerical indices without missing values elements
        num_ix = np.setdiff1d(self.col_ix, self.cat_ix)
        num_ix = np.setdiff1d(num_ix, nan_ix)
        # Calculate the normalized distance for numerical elements
        # (self.range has no zeros — guarded in __init__)
        results_array[num_ix] = np.abs(x[num_ix] - y[num_ix]) / self.range[num_ix]
        # Return the final result
        # Square root is not computed in practice
        # As it doesn't change similarity between instances
        return np.sum(np.square(results_array))