-
Notifications
You must be signed in to change notification settings - Fork 1
/
ks_test.py
45 lines (37 loc) · 1.52 KB
/
ks_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
"""
Implement Kolmogorov–Smirnov test that can be used to compare two non-parametric variables
https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test
Phong D. Le - [email protected]
"""
import numpy as np
from scipy.stats import iqr, tstd
def KSTest(X, Y):
"""
Calculate Kolmogorov–Smirnov statistic of two samples
Formula: D = max(abs(CDF(X) - CDF(Y)))
"""
# Calculate the number of bins using Sturges' rule
# Formula: bins = ceil(1 + log2(n))
bins1 = np.ceil(1 + np.log2(len(X))).astype('int')
bins2 = np.ceil(1 + np.log2(len(Y))).astype('int')
# Number of bins will be the average of two bins
bins = np.ceil((bins1 + bins2)/2).astype('int')
# Split data of two variables in the same range and the same number of bins
amax = max(np.max(X), np.max(Y))
amin = min(np.min(X), np.min(Y))
histX, _ = np.histogram(X, bins=bins, range=[amin, amax])
histY, _ = np.histogram(Y, bins=bins, range=[amin, amax])
# Get the probability of bins
px = histX / histX.sum()
py = histY / histY.sum()
# Get cumulative distribution probability
cdfX = np.cumsum(px)
cdfY = np.cumsum(py)
return max(abs(cdfX - cdfY))
if __name__ == '__main__':
# X, a random Gaussian distribution
X = np.random.normal(0, 100, 1000)
# Y, a random bivariate Gaussian distribution
Y = np.concatenate([np.random.normal(0, 50, 500),
np.random.normal(200, 70, 500)])
print('Kolmogorov–Smirnov test: D(X,Y) = {}'.format(KSTest(X, Y)))