Skip to content

Commit

Permalink
Major refactor (#33)
Browse files Browse the repository at this point in the history
* Do not call kcandidates if no hr is used

* Adapted changelog

* Early return for no hubness reduction

* Avoid NotFittedError

* Fixed index order

* Improve no hubness

* Remove float transformation

* Started refactoring

* Fixed some inconsistencies

* More detailed analysis

* Fix import

* Set only fit target flag

* Major refactor and simplification

* Fixed some doc stuff
  • Loading branch information
dobraczka authored Dec 22, 2023
1 parent 9f4b15c commit 95c6852
Show file tree
Hide file tree
Showing 26 changed files with 433 additions and 961 deletions.
11 changes: 9 additions & 2 deletions kiez/analysis/estimation.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def _calc_skewness_truncnorm(k_occurrence: np.ndarray) -> float:
----------
k_occurrence: ndarray
Reverse nearest neighbor count for each object.
Returns
-------
skew_truncnorm
Expand All @@ -62,7 +63,8 @@ def _calc_skewness_truncnorm(k_occurrence: np.ndarray) -> float:
def _calc_gini_index(
k_occurrence: np.ndarray, limiting="memory", verbose: int = 0
) -> float:
"""Hubness measure; Gini index
"""Hubness measure; Gini index.
Parameters
----------
k_occurrence: ndarray
Expand All @@ -73,6 +75,7 @@ def _calc_gini_index(
otherwise use naive implementation (slow, low memory usage)
verbose: int
control verbosity
Returns
-------
gini_index
Expand Down Expand Up @@ -103,6 +106,7 @@ def _calc_robinhood_index(k_occurrence: np.ndarray) -> float:
----------
k_occurrence: ndarray
Reverse nearest neighbor count for each object.
Returns
-------
robinhood_index
Expand Down Expand Up @@ -135,6 +139,7 @@ def _calc_atkinson_index(k_occurrence: np.ndarray, eps: float = 0.5) -> float:
Reverse nearest neighbor count for each object.
eps: float
'Income' weight. Turns the index into a normative measure.
Returns
-------
atkinson_index
Expand All @@ -156,6 +161,7 @@ def _calc_antihub_occurrence(k_occurrence: np.ndarray) -> Tuple[np.ndarray, floa
----------
k_occurrence: ndarray
Reverse nearest neighbor count for each object.
Returns
-------
antihubs, antihub_occurrence
Expand All @@ -180,6 +186,7 @@ def _calc_hub_occurrence(
Number of queries (or objects in a test set)
hub_size: float
Factor to determine hubs
Returns
-------
hubs, hub_occurrence
Expand All @@ -201,7 +208,7 @@ def hubness_score(
return_value: str = "all_but_gini",
store_k_occurrence: bool = False,
) -> Union[float, dict]:
"""Calculates hubness scores from given neighbor indices
"""Calculate hubness scores from given neighbor indices.
Utilizes findings from [1]_ and [2]_.
Expand Down
4 changes: 0 additions & 4 deletions kiez/hubness_reduction/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
# -*- coding: utf-8 -*-
# SPDX-License-Identifier: BSD-3-Clause
# adapted from skhubness: https://github.com/VarIr/scikit-hubness/

from class_resolver import ClassResolver

from .base import HubnessReduction, NoHubnessReduction
Expand Down
98 changes: 77 additions & 21 deletions kiez/hubness_reduction/base.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,97 @@
# -*- coding: utf-8 -*-
# adapted from skhubness
# SPDX-License-Identifier: BSD-3-Clause

import warnings
from abc import ABC, abstractmethod
from typing import Optional, Tuple

import numpy as np

from ..neighbors import NNAlgorithm


class HubnessReduction(ABC):
"""Base class for hubness reduction."""

@abstractmethod
def __init__(self, **kwargs):
pass
def __init__(self, nn_algo: NNAlgorithm, verbose: int = 0, **kwargs):
self.nn_algo = nn_algo
self.verbose = verbose
if nn_algo.n_candidates == 1:
raise ValueError(
"Cannot perform hubness reduction with a single candidate per query!"
)

@abstractmethod
def fit(
self, neigh_dist, neigh_ind, source, target, assume_sorted, *args, **kwargs
):
def _fit(self, neigh_dist, neigh_ind, source, target):
pass # pragma: no cover

def fit(self, source, target=None):
self.nn_algo.fit(source, target)
if target is None:
target = source
neigh_dist_t_to_s, neigh_ind_t_to_s = self.nn_algo.kneighbors(
k=self.nn_algo.n_candidates,
query=target,
s_to_t=False,
return_distance=True,
)
self._fit(
neigh_dist_t_to_s,
neigh_ind_t_to_s,
source,
target,
)

@abstractmethod
def transform(self, neigh_dist, neigh_ind, query, assume_sorted, *args, **kwargs):
def transform(self, neigh_dist, neigh_ind, query) -> Tuple[np.ndarray, np.ndarray]:
pass # pragma: no cover

def _set_k_if_needed(self, k: Optional[int] = None) -> int:
if k is None:
warnings.warn(
f"No k supplied, setting to n_candidates = {self.nn_algo.n_candidates}"
)
return self.nn_algo.n_candidates
if k > self.nn_algo.n_candidates:
warnings.warn(
"k > n_candidates supplied! Setting to n_candidates ="
f" {self.nn_algo.n_candidates}"
)
return self.nn_algo.n_candidates
return k

class NoHubnessReduction(HubnessReduction):
"""Compatibility class for neighbor search without hubness reduction."""
def kneighbors(self, k: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]:
n_neighbors = self._set_k_if_needed(k)
# First obtain candidate neighbors
query_dist, query_ind = self.nn_algo.kneighbors(
query=None, k=self.nn_algo.n_candidates, return_distance=True
)

def __init__(self, **kwargs):
super().__init__(**kwargs)
# Second, reduce hubness
hubness_reduced_query_dist, query_ind = self.transform(
query_dist,
query_ind,
self.nn_algo.source_,
)
# Third, sort hubness reduced candidate neighbors to get the final k neighbors
kth = np.arange(n_neighbors)
mask = np.argpartition(hubness_reduced_query_dist, kth=kth)[:, :n_neighbors]
hubness_reduced_query_dist = np.take_along_axis(
hubness_reduced_query_dist, mask, axis=1
)
query_ind = np.take_along_axis(query_ind, mask, axis=1)
return hubness_reduced_query_dist, query_ind

def fit(self, *args, **kwargs):

class NoHubnessReduction(HubnessReduction):
"""Base class for hubness reduction."""

def _fit(self, neigh_dist, neigh_ind, source, target):
pass # pragma: no cover

def __repr__(self):
return "NoHubnessReduction"
def fit(self, source, target=None):
self.nn_algo.fit(source, target, only_fit_target=True)

def transform(
self, neigh_dist, neigh_ind, query, assume_sorted=True, *args, **kwargs
):
def transform(self, neigh_dist, neigh_ind, query) -> Tuple[np.ndarray, np.ndarray]:
return neigh_dist, neigh_ind

def kneighbors(self, k: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]:
n_neighbors = self._set_k_if_needed(k)
return self.nn_algo.kneighbors(query=None, k=n_neighbors, return_distance=True)
81 changes: 7 additions & 74 deletions kiez/hubness_reduction/csls.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,6 @@ class CSLS(HubnessReduction):
Uses the formula presented in [1]_.
Parameters
----------
k: int, default = 5
Number of neighbors to consider for mean distance of k-nearest neighbors
verbose: int, default= 0
Verbosity level
References
----------
.. [1] Lample, G., Conneau, A., Ranzato, M., Denoyer, L., & Jégou, H. (2018)
Expand All @@ -31,23 +24,15 @@ class CSLS(HubnessReduction):
https://openreview.net/forum?id=H196sainb
"""

def __init__(self, k: int = 5, verbose: int = 0, *args, **kwargs):
super().__init__(**kwargs)
self.k = k
self.verbose = verbose

def __repr__(self):
return f"{self.__class__.__name__}(k={self.k}, verbose = {self.verbose})"
return f"{self.__class__.__name__}(verbose = {self.verbose})"

def fit(
def _fit(
self,
neigh_dist,
neigh_ind,
source=None,
target=None,
assume_sorted=None,
*args,
**kwargs,
) -> CSLS:
"""Fit the model using target, neigh_dist, and neigh_ind as training data.
Expand All @@ -62,54 +47,21 @@ def fit(
ignored
target
ignored
assume_sorted: bool, default=True #noqa: DAR103
Assume input matrices are sorted according to neigh_dist.
If False, these are sorted here.
*args
Ignored
**kwargs
Ignored
Returns
-------
CSLS
Fitted CSLS
Raises
------
ValueError
If self.k < 0
TypeError
If self.k not int
"""
# Check equal number of rows and columns
check_consistent_length(neigh_ind, neigh_dist)
check_consistent_length(neigh_ind.T, neigh_dist.T)
try:
if self.k <= 0:
raise ValueError(f"Expected k > 0. Got {self.k}")
except TypeError as exc:
raise TypeError(f"Expected k: int > 0. Got {self.k}") from exc

# increment to include the k-th element in slicing
k = self.k + 1

if assume_sorted:
self.r_dist_train_ = neigh_dist[:, :k]
self.r_ind_train_ = neigh_ind[:, :k]
else:
kth = np.arange(self.k)
mask = np.argpartition(neigh_dist, kth=kth)[:, :k]
self.r_dist_train_ = np.take_along_axis(neigh_dist, mask, axis=1)
self.r_ind_train_ = np.take_along_axis(neigh_ind, mask, axis=1)
self.r_dist_train_ = neigh_dist
self.r_ind_train_ = neigh_ind
return self

def transform(
self,
neigh_dist,
neigh_ind,
query,
assume_sorted: bool = True,
*args,
**kwargs,
) -> Tuple[np.ndarray, np.ndarray]:
"""Transform distance between test and training data with CSLS.
Expand All @@ -122,17 +74,12 @@ def transform(
Neighbor indices corresponding to the values in neigh_dist
query
Ignored
assume_sorted: bool
ignored
*args
Ignored
**kwargs
Ignored
Returns
-------
hub_reduced_dist, neigh_ind
CSLS distances, and corresponding neighbor indices
Notes
-----
The returned distances are NOT sorted! If you use this class directly,
Expand All @@ -142,22 +89,8 @@ def transform(

n_test, n_indexed = neigh_dist.shape

if n_indexed == 1:
warnings.warn(
"Cannot perform hubness reduction with a single neighbor per query. "
"Skipping hubness reduction, and returning untransformed distances."
)
return neigh_dist, neigh_ind

k = self.k

# Find average distances to the k nearest neighbors
if assume_sorted:
r_dist_test = neigh_dist[:, :k]
else:
kth = np.arange(self.k)
mask = np.argpartition(neigh_dist, kth=kth)[:, :k]
r_dist_test = np.take_along_axis(neigh_dist, mask, axis=1)
r_dist_test = neigh_dist

hub_reduced_dist = np.empty_like(neigh_dist)

Expand Down
Loading

0 comments on commit 95c6852

Please sign in to comment.