Skip to content

Commit

Permalink
Major refactor (#33)
Browse files Browse the repository at this point in the history
* Do not call kcandidates if no hr is used

* Adapted changelog

* Early return for no hubness reduction

* Avoid NotFittedError

* Fixed index order

* Improve no hubness

* Remove float transformation

* Started refactoring

* Fixed some inconsistencies

* More detailed analysis

* Fix import

* Set only fit target flag

* Major refactor and simplification

* Fixed some doc stuff
  • Loading branch information
dobraczka authored Dec 22, 2023
1 parent 9f4b15c commit 95c6852
Show file tree
Hide file tree
Showing 26 changed files with 433 additions and 961 deletions.
11 changes: 9 additions & 2 deletions kiez/analysis/estimation.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def _calc_skewness_truncnorm(k_occurrence: np.ndarray) -> float:
----------
k_occurrence: ndarray
Reverse nearest neighbor count for each object.
Returns
-------
skew_truncnorm
Expand All @@ -62,7 +63,8 @@ def _calc_skewness_truncnorm(k_occurrence: np.ndarray) -> float:
def _calc_gini_index(
k_occurrence: np.ndarray, limiting="memory", verbose: int = 0
) -> float:
"""Hubness measure; Gini index
"""Hubness measure; Gini index.
Parameters
----------
k_occurrence: ndarray
Expand All @@ -73,6 +75,7 @@ def _calc_gini_index(
otherwise use naive implementation (slow, low memory usage)
verbose: int
control verbosity
Returns
-------
gini_index
Expand Down Expand Up @@ -103,6 +106,7 @@ def _calc_robinhood_index(k_occurrence: np.ndarray) -> float:
----------
k_occurrence: ndarray
Reverse nearest neighbor count for each object.
Returns
-------
robinhood_index
Expand Down Expand Up @@ -135,6 +139,7 @@ def _calc_atkinson_index(k_occurrence: np.ndarray, eps: float = 0.5) -> float:
Reverse nearest neighbor count for each object.
eps: float
'Income' weight. Turns the index into a normative measure.
Returns
-------
atkinson_index
Expand All @@ -156,6 +161,7 @@ def _calc_antihub_occurrence(k_occurrence: np.ndarray) -> Tuple[np.ndarray, floa
----------
k_occurrence: ndarray
Reverse nearest neighbor count for each object.
Returns
-------
antihubs, antihub_occurrence
Expand All @@ -180,6 +186,7 @@ def _calc_hub_occurrence(
Number of queries (or objects in a test set)
hub_size: float
Factor to determine hubs
Returns
-------
hubs, hub_occurrence
Expand All @@ -201,7 +208,7 @@ def hubness_score(
return_value: str = "all_but_gini",
store_k_occurrence: bool = False,
) -> Union[float, dict]:
"""Calculates hubness scores from given neighbor indices
"""Calculate hubness scores from given neighbor indices.
Utilizes findings from [1]_ and [2]_.
Expand Down
4 changes: 0 additions & 4 deletions kiez/hubness_reduction/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
# -*- coding: utf-8 -*-
# SPDX-License-Identifier: BSD-3-Clause
# adapted from skhubness: https://github.com/VarIr/scikit-hubness/

from class_resolver import ClassResolver

from .base import HubnessReduction, NoHubnessReduction
Expand Down
98 changes: 77 additions & 21 deletions kiez/hubness_reduction/base.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,97 @@
# -*- coding: utf-8 -*-
# adapted from skhubness
# SPDX-License-Identifier: BSD-3-Clause

import warnings
from abc import ABC, abstractmethod
from typing import Optional, Tuple

import numpy as np

from ..neighbors import NNAlgorithm


class HubnessReduction(ABC):
"""Base class for hubness reduction."""

@abstractmethod
def __init__(self, **kwargs):
pass
def __init__(self, nn_algo: NNAlgorithm, verbose: int = 0, **kwargs):
self.nn_algo = nn_algo
self.verbose = verbose
if nn_algo.n_candidates == 1:
raise ValueError(
"Cannot perform hubness reduction with a single candidate per query!"
)

@abstractmethod
def fit(
self, neigh_dist, neigh_ind, source, target, assume_sorted, *args, **kwargs
):
def _fit(self, neigh_dist, neigh_ind, source, target):
pass # pragma: no cover

def fit(self, source, target=None):
self.nn_algo.fit(source, target)
if target is None:
target = source
neigh_dist_t_to_s, neigh_ind_t_to_s = self.nn_algo.kneighbors(
k=self.nn_algo.n_candidates,
query=target,
s_to_t=False,
return_distance=True,
)
self._fit(
neigh_dist_t_to_s,
neigh_ind_t_to_s,
source,
target,
)

@abstractmethod
def transform(self, neigh_dist, neigh_ind, query, assume_sorted, *args, **kwargs):
def transform(self, neigh_dist, neigh_ind, query) -> Tuple[np.ndarray, np.ndarray]:
pass # pragma: no cover

def _set_k_if_needed(self, k: Optional[int] = None) -> int:
if k is None:
warnings.warn(
f"No k supplied, setting to n_candidates = {self.nn_algo.n_candidates}"
)
return self.nn_algo.n_candidates
if k > self.nn_algo.n_candidates:
warnings.warn(
"k > n_candidates supplied! Setting to n_candidates ="
f" {self.nn_algo.n_candidates}"
)
return self.nn_algo.n_candidates
return k

class NoHubnessReduction(HubnessReduction):
"""Compatibility class for neighbor search without hubness reduction."""
def kneighbors(self, k: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]:
n_neighbors = self._set_k_if_needed(k)
# First obtain candidate neighbors
query_dist, query_ind = self.nn_algo.kneighbors(
query=None, k=self.nn_algo.n_candidates, return_distance=True
)

def __init__(self, **kwargs):
super().__init__(**kwargs)
# Second, reduce hubness
hubness_reduced_query_dist, query_ind = self.transform(
query_dist,
query_ind,
self.nn_algo.source_,
)
# Third, sort hubness reduced candidate neighbors to get the final k neighbors
kth = np.arange(n_neighbors)
mask = np.argpartition(hubness_reduced_query_dist, kth=kth)[:, :n_neighbors]
hubness_reduced_query_dist = np.take_along_axis(
hubness_reduced_query_dist, mask, axis=1
)
query_ind = np.take_along_axis(query_ind, mask, axis=1)
return hubness_reduced_query_dist, query_ind

def fit(self, *args, **kwargs):

class NoHubnessReduction(HubnessReduction):
"""Base class for hubness reduction."""

def _fit(self, neigh_dist, neigh_ind, source, target):
pass # pragma: no cover

def __repr__(self):
return "NoHubnessReduction"
def fit(self, source, target=None):
self.nn_algo.fit(source, target, only_fit_target=True)

def transform(
self, neigh_dist, neigh_ind, query, assume_sorted=True, *args, **kwargs
):
def transform(self, neigh_dist, neigh_ind, query) -> Tuple[np.ndarray, np.ndarray]:
return neigh_dist, neigh_ind

def kneighbors(self, k: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]:
n_neighbors = self._set_k_if_needed(k)
return self.nn_algo.kneighbors(query=None, k=n_neighbors, return_distance=True)
81 changes: 7 additions & 74 deletions kiez/hubness_reduction/csls.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,6 @@ class CSLS(HubnessReduction):
Uses the formula presented in [1]_.
Parameters
----------
k: int, default = 5
Number of neighbors to consider for mean distance of k-nearest neighbors
verbose: int, default= 0
Verbosity level
References
----------
.. [1] Lample, G., Conneau, A., Ranzato, M., Denoyer, L., & Jégou, H. (2018)
Expand All @@ -31,23 +24,15 @@ class CSLS(HubnessReduction):
https://openreview.net/forum?id=H196sainb
"""

def __init__(self, k: int = 5, verbose: int = 0, *args, **kwargs):
super().__init__(**kwargs)
self.k = k
self.verbose = verbose

def __repr__(self):
return f"{self.__class__.__name__}(k={self.k}, verbose = {self.verbose})"
return f"{self.__class__.__name__}(verbose = {self.verbose})"

def fit(
def _fit(
self,
neigh_dist,
neigh_ind,
source=None,
target=None,
assume_sorted=None,
*args,
**kwargs,
) -> CSLS:
"""Fit the model using target, neigh_dist, and neigh_ind as training data.
Expand All @@ -62,54 +47,21 @@ def fit(
ignored
target
ignored
assume_sorted: bool, default=True #noqa: DAR103
Assume input matrices are sorted according to neigh_dist.
If False, these are sorted here.
*args
Ignored
**kwargs
Ignored
Returns
-------
CSLS
Fitted CSLS
Raises
------
ValueError
If self.k < 0
TypeError
If self.k not int
"""
# Check equal number of rows and columns
check_consistent_length(neigh_ind, neigh_dist)
check_consistent_length(neigh_ind.T, neigh_dist.T)
try:
if self.k <= 0:
raise ValueError(f"Expected k > 0. Got {self.k}")
except TypeError as exc:
raise TypeError(f"Expected k: int > 0. Got {self.k}") from exc

# increment to include the k-th element in slicing
k = self.k + 1

if assume_sorted:
self.r_dist_train_ = neigh_dist[:, :k]
self.r_ind_train_ = neigh_ind[:, :k]
else:
kth = np.arange(self.k)
mask = np.argpartition(neigh_dist, kth=kth)[:, :k]
self.r_dist_train_ = np.take_along_axis(neigh_dist, mask, axis=1)
self.r_ind_train_ = np.take_along_axis(neigh_ind, mask, axis=1)
self.r_dist_train_ = neigh_dist
self.r_ind_train_ = neigh_ind
return self

def transform(
self,
neigh_dist,
neigh_ind,
query,
assume_sorted: bool = True,
*args,
**kwargs,
) -> Tuple[np.ndarray, np.ndarray]:
"""Transform distance between test and training data with CSLS.
Expand All @@ -122,17 +74,12 @@ def transform(
Neighbor indices corresponding to the values in neigh_dist
query
Ignored
assume_sorted: bool
ignored
*args
Ignored
**kwargs
Ignored
Returns
-------
hub_reduced_dist, neigh_ind
CSLS distances, and corresponding neighbor indices
Notes
-----
The returned distances are NOT sorted! If you use this class directly,
Expand All @@ -142,22 +89,8 @@ def transform(

n_test, n_indexed = neigh_dist.shape

if n_indexed == 1:
warnings.warn(
"Cannot perform hubness reduction with a single neighbor per query. "
"Skipping hubness reduction, and returning untransformed distances."
)
return neigh_dist, neigh_ind

k = self.k

# Find average distances to the k nearest neighbors
if assume_sorted:
r_dist_test = neigh_dist[:, :k]
else:
kth = np.arange(self.k)
mask = np.argpartition(neigh_dist, kth=kth)[:, :k]
r_dist_test = np.take_along_axis(neigh_dist, mask, axis=1)
r_dist_test = neigh_dist

hub_reduced_dist = np.empty_like(neigh_dist)

Expand Down
Loading

0 comments on commit 95c6852

Please sign in to comment.