Skip to content

Commit

Permalink
fixed multiple warnings, improved conover friedman test, linted and improved code
Browse files Browse the repository at this point in the history
  • Loading branch information
maximtrp committed Oct 26, 2024
1 parent d018ba3 commit 65e9b6b
Show file tree
Hide file tree
Showing 5 changed files with 420 additions and 338 deletions.
6 changes: 3 additions & 3 deletions scikit_posthocs/_omnibus.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def test_mackwolfe(
return (np.nan, np.nan)

Rij = x[_val_col].rank()
n = cast(Series, x.groupby(_group_col)[_val_col].count())
n = cast(Series, x.groupby(_group_col, observed=False)[_val_col].count())

def _fn(Ri, Rj):
return np.sum(Ri.apply(lambda x: Rj[Rj > x].size))
Expand Down Expand Up @@ -243,7 +243,7 @@ def test_osrt(

x.sort_values(by=[_group_col], ascending=True, inplace=True)
groups = np.unique(x[_group_col])
x_grouped = x.groupby(_group_col)[_val_col]
x_grouped = x.groupby(_group_col, observed=False)[_val_col]

xi = x_grouped.mean()
ni = x_grouped.count()
Expand All @@ -257,7 +257,7 @@ def test_osrt(
for i in range(k):
for j in range(ni.iloc[i]):
c += 1
sigma2 += (x[_val_col].iat[c] - xi[i]) ** 2.0 / df
sigma2 += (x[_val_col].iloc[c] - xi.iloc[i]) ** 2.0 / df

sigma = np.sqrt(sigma2)

Expand Down
91 changes: 45 additions & 46 deletions scikit_posthocs/_outliers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,8 @@


def outliers_iqr(
x: Union[List, np.ndarray],
ret: str = 'filtered',
coef: float = 1.5) -> np.ndarray:
x: Union[List, np.ndarray], ret: str = "filtered", coef: float = 1.5
) -> np.ndarray:
"""Simple detection of potential outliers based on interquartile range
(IQR). Data that lie within the lower and upper limits are considered
non-outliers. The lower limit is the number that lies 1.5 IQRs below
Expand Down Expand Up @@ -56,20 +55,19 @@ def outliers_iqr(
ll = q1 - iqr * coef
ul = q3 + iqr * coef

if ret == 'indices':
if ret == "indices":
return np.where((arr > ll) & (arr < ul))[0]
elif ret == 'outliers':
elif ret == "outliers":
return arr[(arr < ll) | (arr > ul)]
elif ret == 'outliers_indices':
elif ret == "outliers_indices":
return np.where((arr < ll) | (arr > ul))[0]
else:
return x[(x > ll) & (x < ul)]


def outliers_grubbs(
x: Union[List, np.ndarray],
hypo: bool = False,
alpha: float = 0.05) -> Union[np.ndarray, bool]:
x: Union[List, np.ndarray], hypo: bool = False, alpha: float = 0.05
) -> Union[np.ndarray, bool]:
"""Grubbs' Test for Outliers [1]_. This is the two-sided version
of the test. The null hypothesis implies that there are no outliers
in the data set.
Expand Down Expand Up @@ -113,10 +111,10 @@ def outliers_grubbs(
ind = np.argmax(np.abs(arr - np.mean(arr)))
G = val / np.std(arr, ddof=1)
N = len(arr)
result = G > (N-1) / np.sqrt(N) *\
np.sqrt(
(t.ppf(1-alpha/(2*N), N-2) ** 2) /
(N - 2 + t.ppf(1-alpha/(2*N), N-2) ** 2))
result = G > (N - 1) / np.sqrt(N) * np.sqrt(
(t.ppf(1 - alpha / (2 * N), N - 2) ** 2)
/ (N - 2 + t.ppf(1 - alpha / (2 * N), N - 2) ** 2)
)

if hypo:
return result
Expand All @@ -128,10 +126,8 @@ def outliers_grubbs(


def outliers_tietjen(
x: Union[List, np.ndarray],
k: int,
hypo: bool = False,
alpha: float = 0.05) -> Union[np.ndarray, bool]:
x: Union[List, np.ndarray], k: int, hypo: bool = False, alpha: float = 0.05
) -> Union[np.ndarray, bool]:
"""Tietjen-Moore test [1]_ to detect multiple outliers in a univariate
data set that follows an approximately normal distribution.
The Tietjen-Moore test [2]_ is a generalization of the Grubbs' test to
Expand Down Expand Up @@ -213,11 +209,12 @@ def tietjen(x_, k_):


def outliers_gesd(
x: Union[List, np.ndarray],
outliers: int = 5,
hypo: bool = False,
report: bool = False,
alpha: float = 0.05) -> np.ndarray:
x: Union[List, np.ndarray],
outliers: int = 5,
hypo: bool = False,
report: bool = False,
alpha: float = 0.05,
) -> np.ndarray:
"""The generalized (Extreme Studentized Deviate) ESD test is used
to detect one or more outliers in a univariate data set that follows
an approximately normal distribution [1]_.
Expand Down Expand Up @@ -303,7 +300,6 @@ def outliers_gesd(
ls = ((n - nol - 1) * t_ppr) / np.sqrt((df + t_ppr**2) * (n - nol))

for i in np.arange(outliers):

abs_d = np.abs(data_proc - np.mean(data_proc))

# R-value calculation
Expand All @@ -312,32 +308,35 @@ def outliers_gesd(

# Masked values
lms = ms[-1] if len(ms) > 0 else []
ms.append(
lms + np.where(data == data_proc[np.argmax(abs_d)])[0].tolist())
ms.append(lms + np.where(data == data_proc[np.argmax(abs_d)])[0].tolist())

# Remove the observation that maximizes |xi − xmean|
data_proc = np.delete(data_proc, np.argmax(abs_d))

if report:

report = ["H0: no outliers in the data",
"Ha: up to " + str(outliers) + " outliers in the data",
"Significance level: α = " + str(alpha),
"Reject H0 if Ri > Critical Value (λi)", "",
"Summary Table for Two-Tailed Test",
"---------------------------------------",
" Exact Test Critical",
" Number of Statistic Value, λi",
"Outliers, i Value, Ri {:5.3g} %".format(100*alpha),
"---------------------------------------"]

for i, (r, l) in enumerate(zip(rs, ls)):
report.append('{: >11s}'.format(str(i+1)) +
'{: >15s}'.format(str(np.round(r, 3))) +
'{: >13s}'.format(str(np.round(l, 3))) +
(" *" if r > l else ""))

print("\n".join(report))
report_str = [
"H0: no outliers in the data",
"Ha: up to " + str(outliers) + " outliers in the data",
"Significance level: α = " + str(alpha),
"Reject H0 if Ri > Critical Value (λi)",
"",
"Summary Table for Two-Tailed Test",
"---------------------------------------",
" Exact Test Critical",
" Number of Statistic Value, λi",
"Outliers, i Value, Ri {:5.3g} %".format(100 * alpha),
"---------------------------------------",
]

for i, (stat, crit_val) in enumerate(zip(rs, ls)):
report_str.append(
"{: >11s}".format(str(i + 1))
+ "{: >15s}".format(str(np.round(stat, 3)))
+ "{: >13s}".format(str(np.round(crit_val, 3)))
+ (" *" if stat > crit_val else "")
)

print("\n".join(report_str))

# Remove masked values
# for which the test statistic is greater
Expand All @@ -349,8 +348,8 @@ def outliers_gesd(
data[ms[np.max(np.where(rs > ls))]] = True
# rearrange data so mask is in same order as incoming data
data = np.vstack((data, np.arange(0, data.shape[0])[argsort_index]))
data = data[0, data.argsort()[1, ]]
data = data.astype('bool')
data = data[0, data.argsort()[1,]]
data = data.astype("bool")
else:
data = np.delete(data, ms[np.max(np.where(rs > ls))])

Expand Down
Loading

0 comments on commit 65e9b6b

Please sign in to comment.