From 95c68523809130b989efd6e2106d96e0a4377c9b Mon Sep 17 00:00:00 2001 From: Daniel Obraczka Date: Fri, 22 Dec 2023 10:05:43 +0100 Subject: [PATCH] Major refactor (#33) * Do not call kcandidates if no hr is used * Adapted changelog * Early return for no hubness reduction * Avoid NotFittedError * Fixed index order * Improve no hubness * Remove float transformation * Started refactoring * Fixed some inconsistencies * More detailed analysis * Fix import * Set only fit target flat * Major refactor and simplification * Fixed some doc stuff --- kiez/analysis/estimation.py | 11 +- kiez/hubness_reduction/__init__.py | 4 - kiez/hubness_reduction/base.py | 98 ++++++-- kiez/hubness_reduction/csls.py | 81 +------ kiez/hubness_reduction/dis_sim.py | 94 ++------ kiez/hubness_reduction/local_scaling.py | 73 +----- kiez/hubness_reduction/mutual_proximity.py | 49 +--- kiez/io/temp_file_handling.py | 2 +- kiez/kiez.py | 143 +++--------- kiez/neighbors/approximate/faiss.py | 131 ++--------- .../exact/sklearn_nearest_neighbors.py | 3 +- kiez/neighbors/neighbor_algorithm_base.py | 20 +- kiez/neighbors/util.py | 6 +- noxfile.py | 50 +++- pyproject.toml | 2 +- tests/conftest.py | 11 + .../test_mutual_proximity.py | 25 -- tests/hubness_reduction/test_wrong_inputs.py | 21 ++ tests/neighbors/test_alignment.py | 178 --------------- tests/neighbors/test_annoy.py | 47 ++-- tests/neighbors/test_base.py | 11 +- tests/neighbors/test_faiss.py | 12 +- tests/neighbors/test_hnsw.py | 42 +--- tests/neighbors/test_nng.py | 60 ++--- tests/neighbors/test_sklearn.py | 4 +- tests/test_kiez.py | 216 +++++++----------- 26 files changed, 433 insertions(+), 961 deletions(-) create mode 100644 tests/conftest.py delete mode 100644 tests/hubness_reduction/test_mutual_proximity.py create mode 100644 tests/hubness_reduction/test_wrong_inputs.py delete mode 100644 tests/neighbors/test_alignment.py diff --git a/kiez/analysis/estimation.py b/kiez/analysis/estimation.py index b75d41c..22f92d9 100644 --- 
a/kiez/analysis/estimation.py +++ b/kiez/analysis/estimation.py @@ -45,6 +45,7 @@ def _calc_skewness_truncnorm(k_occurrence: np.ndarray) -> float: ---------- k_occurrence: ndarray Reverse nearest neighbor count for each object. + Returns ------- skew_truncnorm @@ -62,7 +63,8 @@ def _calc_skewness_truncnorm(k_occurrence: np.ndarray) -> float: def _calc_gini_index( k_occurrence: np.ndarray, limiting="memory", verbose: int = 0 ) -> float: - """Hubness measure; Gini index + """Hubness measure; Gini index. + Parameters ---------- k_occurrence: ndarray @@ -73,6 +75,7 @@ def _calc_gini_index( otherwise use naive implementation (slow, low memory usage) verbose: int control verbosity + Returns ------- gini_index @@ -103,6 +106,7 @@ def _calc_robinhood_index(k_occurrence: np.ndarray) -> float: ---------- k_occurrence: ndarray Reverse nearest neighbor count for each object. + Returns ------- robinhood_index @@ -135,6 +139,7 @@ def _calc_atkinson_index(k_occurrence: np.ndarray, eps: float = 0.5) -> float: Reverse nearest neighbor count for each object. eps: float 'Income' weight. Turns the index into a normative measure. + Returns ------- atkinson_index @@ -156,6 +161,7 @@ def _calc_antihub_occurrence(k_occurrence: np.ndarray) -> Tuple[np.ndarray, floa ---------- k_occurrence: ndarray Reverse nearest neighbor count for each object. + Returns ------- antihubs, antihub_occurrence @@ -180,6 +186,7 @@ def _calc_hub_occurrence( Number of queries (or objects in a test set) hub_size: float Factor to determine hubs + Returns ------- hubs, hub_occurrence @@ -201,7 +208,7 @@ def hubness_score( return_value: str = "all_but_gini", store_k_occurrence: bool = False, ) -> Union[float, dict]: - """Calculates hubness scores from given neighbor indices + """Calculate hubness scores from given neighbor indices. Utilizes findings from [1]_ and [2]_. 
diff --git a/kiez/hubness_reduction/__init__.py b/kiez/hubness_reduction/__init__.py index bc90944..1740fba 100644 --- a/kiez/hubness_reduction/__init__.py +++ b/kiez/hubness_reduction/__init__.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- -# SPDX-License-Identifier: BSD-3-Clause -# adapted from skhubness: https://github.com/VarIr/scikit-hubness/ - from class_resolver import ClassResolver from .base import HubnessReduction, NoHubnessReduction diff --git a/kiez/hubness_reduction/base.py b/kiez/hubness_reduction/base.py index 87c81b7..9040384 100644 --- a/kiez/hubness_reduction/base.py +++ b/kiez/hubness_reduction/base.py @@ -1,41 +1,97 @@ -# -*- coding: utf-8 -*- -# adapted from skhubness -# SPDX-License-Identifier: BSD-3-Clause - +import warnings from abc import ABC, abstractmethod +from typing import Optional, Tuple + +import numpy as np + +from ..neighbors import NNAlgorithm class HubnessReduction(ABC): """Base class for hubness reduction.""" - @abstractmethod - def __init__(self, **kwargs): - pass + def __init__(self, nn_algo: NNAlgorithm, verbose: int = 0, **kwargs): + self.nn_algo = nn_algo + self.verbose = verbose + if nn_algo.n_candidates == 1: + raise ValueError( + "Cannot perform hubness reduction with a single candidate per query!" 
+ ) @abstractmethod - def fit( - self, neigh_dist, neigh_ind, source, target, assume_sorted, *args, **kwargs - ): + def _fit(self, neigh_dist, neigh_ind, source, target): pass # pragma: no cover + def fit(self, source, target=None): + self.nn_algo.fit(source, target) + if target is None: + target = source + neigh_dist_t_to_s, neigh_ind_t_to_s = self.nn_algo.kneighbors( + k=self.nn_algo.n_candidates, + query=target, + s_to_t=False, + return_distance=True, + ) + self._fit( + neigh_dist_t_to_s, + neigh_ind_t_to_s, + source, + target, + ) + @abstractmethod - def transform(self, neigh_dist, neigh_ind, query, assume_sorted, *args, **kwargs): + def transform(self, neigh_dist, neigh_ind, query) -> Tuple[np.ndarray, np.ndarray]: pass # pragma: no cover + def _set_k_if_needed(self, k: Optional[int] = None) -> int: + if k is None: + warnings.warn( + f"No k supplied, setting to n_candidates = {self.nn_algo.n_candidates}" + ) + return self.nn_algo.n_candidates + if k > self.nn_algo.n_candidates: + warnings.warn( + "k > n_candidates supplied! 
Setting to n_candidates =" + f" {self.nn_algo.n_candidates}" + ) + return self.nn_algo.n_candidates + return k -class NoHubnessReduction(HubnessReduction): - """Compatibility class for neighbor search without hubness reduction.""" + def kneighbors(self, k: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]: + n_neighbors = self._set_k_if_needed(k) + # First obtain candidate neighbors + query_dist, query_ind = self.nn_algo.kneighbors( + query=None, k=self.nn_algo.n_candidates, return_distance=True + ) - def __init__(self, **kwargs): - super().__init__(**kwargs) + # Second, reduce hubness + hubness_reduced_query_dist, query_ind = self.transform( + query_dist, + query_ind, + self.nn_algo.source_, + ) + # Third, sort hubness reduced candidate neighbors to get the final k neighbors + kth = np.arange(n_neighbors) + mask = np.argpartition(hubness_reduced_query_dist, kth=kth)[:, :n_neighbors] + hubness_reduced_query_dist = np.take_along_axis( + hubness_reduced_query_dist, mask, axis=1 + ) + query_ind = np.take_along_axis(query_ind, mask, axis=1) + return hubness_reduced_query_dist, query_ind - def fit(self, *args, **kwargs): + +class NoHubnessReduction(HubnessReduction): + """Base class for hubness reduction.""" + + def _fit(self, neigh_dist, neigh_ind, source, target): pass # pragma: no cover - def __repr__(self): - return "NoHubnessReduction" + def fit(self, source, target=None): + self.nn_algo.fit(source, target, only_fit_target=True) - def transform( - self, neigh_dist, neigh_ind, query, assume_sorted=True, *args, **kwargs - ): + def transform(self, neigh_dist, neigh_ind, query) -> Tuple[np.ndarray, np.ndarray]: return neigh_dist, neigh_ind + + def kneighbors(self, k: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]: + n_neighbors = self._set_k_if_needed(k) + return self.nn_algo.kneighbors(query=None, k=n_neighbors, return_distance=True) diff --git a/kiez/hubness_reduction/csls.py b/kiez/hubness_reduction/csls.py index eeb6eea..c252f77 100644 --- 
a/kiez/hubness_reduction/csls.py +++ b/kiez/hubness_reduction/csls.py @@ -15,13 +15,6 @@ class CSLS(HubnessReduction): Uses the formula presented in [1]_. - Parameters - ---------- - k: int, default = 5 - Number of neighbors to consider for mean distance of k-nearest neighbors - verbose: int, default= 0 - Verbosity level - References ---------- .. [1] Lample, G., Conneau, A., Ranzato, M., Denoyer, L., & Jégou, H. (2018) @@ -31,23 +24,15 @@ class CSLS(HubnessReduction): https://openreview.net/forum?id=H196sainb """ - def __init__(self, k: int = 5, verbose: int = 0, *args, **kwargs): - super().__init__(**kwargs) - self.k = k - self.verbose = verbose - def __repr__(self): - return f"{self.__class__.__name__}(k={self.k}, verbose = {self.verbose})" + return f"{self.__class__.__name__}(verbose = {self.verbose})" - def fit( + def _fit( self, neigh_dist, neigh_ind, source=None, target=None, - assume_sorted=None, - *args, - **kwargs, ) -> CSLS: """Fit the model using target, neigh_dist, and neigh_ind as training data. @@ -62,44 +47,14 @@ def fit( ignored target ignored - assume_sorted: bool, default=True #noqa: DAR103 - Assume input matrices are sorted according to neigh_dist. - If False, these are sorted here. - *args - Ignored - **kwargs - Ignored + Returns ------- CSLS Fitted CSLS - Raises - ------ - ValueError - If self.k < 0 - TypeError - If self.k not int """ - # Check equal number of rows and columns - check_consistent_length(neigh_ind, neigh_dist) - check_consistent_length(neigh_ind.T, neigh_dist.T) - try: - if self.k <= 0: - raise ValueError(f"Expected k > 0. Got {self.k}") - except TypeError as exc: - raise TypeError(f"Expected k: int > 0. 
Got {self.k}") from exc - - # increment to include the k-th element in slicing - k = self.k + 1 - - if assume_sorted: - self.r_dist_train_ = neigh_dist[:, :k] - self.r_ind_train_ = neigh_ind[:, :k] - else: - kth = np.arange(self.k) - mask = np.argpartition(neigh_dist, kth=kth)[:, :k] - self.r_dist_train_ = np.take_along_axis(neigh_dist, mask, axis=1) - self.r_ind_train_ = np.take_along_axis(neigh_ind, mask, axis=1) + self.r_dist_train_ = neigh_dist + self.r_ind_train_ = neigh_ind return self def transform( @@ -107,9 +62,6 @@ def transform( neigh_dist, neigh_ind, query, - assume_sorted: bool = True, - *args, - **kwargs, ) -> Tuple[np.ndarray, np.ndarray]: """Transform distance between test and training data with CSLS. @@ -122,17 +74,12 @@ def transform( Neighbor indices corresponding to the values in neigh_dist query Ignored - assume_sorted: bool - ignored - *args - Ignored - **kwargs - Ignored Returns ------- hub_reduced_dist, neigh_ind CSLS distances, and corresponding neighbor indices + Notes ----- The returned distances are NOT sorted! If you use this class directly, @@ -142,22 +89,8 @@ def transform( n_test, n_indexed = neigh_dist.shape - if n_indexed == 1: - warnings.warn( - "Cannot perform hubness reduction with a single neighbor per query. " - "Skipping hubness reduction, and returning untransformed distances." 
- ) - return neigh_dist, neigh_ind - - k = self.k - # Find average distances to the k nearest neighbors - if assume_sorted: - r_dist_test = neigh_dist[:, :k] - else: - kth = np.arange(self.k) - mask = np.argpartition(neigh_dist, kth=kth)[:, :k] - r_dist_test = np.take_along_axis(neigh_dist, mask, axis=1) + r_dist_test = neigh_dist hub_reduced_dist = np.empty_like(neigh_dist) diff --git a/kiez/hubness_reduction/dis_sim.py b/kiez/hubness_reduction/dis_sim.py index ce8240d..49849ba 100644 --- a/kiez/hubness_reduction/dis_sim.py +++ b/kiez/hubness_reduction/dis_sim.py @@ -22,12 +22,11 @@ class DisSimLocal(HubnessReduction): Parameters ---------- - k: int, default = 5 - Number of neighbors to consider for the local centroids squared: bool, default = True DisSimLocal operates on squared Euclidean distances. If True, return (quasi) squared Euclidean distances; if False, return (quasi) Eucldean distances. + References ---------- .. [1] Hara K, Suzuki I, Kobayashi K, Fukumizu K, Radovanović M (2016) @@ -36,23 +35,35 @@ class DisSimLocal(HubnessReduction): https://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/viewPaper/12055 """ - def __init__(self, k: int = 5, squared: bool = True, *args, **kwargs): - super().__init__() - self.k = k + def __init__(self, squared: bool = True, **kwargs): + super().__init__(**kwargs) self.squared = squared + if self.nn_algo.metric in ["euclidean", "minkowski"]: + self.squared = False + if hasattr(self.nn_algo, "p"): + if self.nn_algo.p != 2: + raise ValueError( + "DisSimLocal only supports squared Euclidean distances. If" + " the provided NNAlgorithm has a `p` parameter it must be" + f" set to p=2. Now it is p={self.nn_algo.p}" + ) + elif self.nn_algo.metric in ["sqeuclidean"]: + self.squared = True + else: + raise ValueError( + "DisSimLocal only supports squared Euclidean distances, not" + f" metric={self.nn_algo.metric}." 
+ ) def __repr__(self): - return f"{self.__class__.__name__}(k={self.k}, squared = {self.squared})" + return f"{self.__class__.__name__}(squared = {self.squared})" - def fit( + def _fit( self, neigh_dist: np.ndarray, neigh_ind: np.ndarray, source: np.ndarray, target: np.ndarray, - assume_sorted: bool = True, - *args, - **kwargs, ) -> DisSimLocal: """Fit the model using target, neigh_dist, and neigh_ind as training data. @@ -69,47 +80,14 @@ def fit( target: np.ndarray, shape (n_samples, n_features) Target embedding, where n_samples is the number of vectors, and n_features their dimensionality (number of features). - assume_sorted: bool, default=True #noqa: DAR103 - Assume input matrices are sorted according to neigh_dist. - If False, these are sorted here. - *args: Ignored - Ignored - **kwargs: Ignored - Ignored + Returns ------- DisSimLocal Fitted DisSimLocal - Raises - ------ - ValueError - If self.k < 0 - TypeError - If self.k not int """ - # Check equal number of rows and columns - check_consistent_length(neigh_ind, neigh_dist) - check_consistent_length(neigh_ind.T, neigh_dist.T) - try: - if self.k <= 0: - raise ValueError(f"Expected k > 0. Got {self.k}") - except TypeError as exc: - raise TypeError(f"Expected k: int > 0. Got {self.k}") from exc - - k = self.k - if k > neigh_ind.shape[1]: - warnings.warn( - "Neighborhood parameter k larger than provided neighbors in" - f" neigh_dist, neigh_ind. Will reduce to k={neigh_ind.shape[1]}." 
- ) - k = neigh_ind.shape[1] - # Calculate local neighborhood centroids among the training points - if assume_sorted: - knn = neigh_ind[:, :k] - else: - mask = np.argpartition(neigh_dist, kth=k - 1)[:, :k] - knn = np.take_along_axis(neigh_ind, mask, axis=1) + knn = neigh_ind centroids = source[knn].mean(axis=1) dist_to_cent = row_norms(target - centroids, squared=True) @@ -117,7 +95,6 @@ def fit( self.target_ = target self.target_centroids_ = centroids self.target_dist_to_centroids_ = dist_to_cent - return self def transform( @@ -125,9 +102,6 @@ def transform( neigh_dist: np.ndarray, neigh_ind: np.ndarray, query: np.ndarray, - assume_sorted: bool = True, - *args, - **kwargs, ) -> Tuple[np.ndarray, np.ndarray]: """Transform distance between test and training data with DisSimLocal. @@ -141,13 +115,12 @@ def transform( query: np.ndarray, shape (n_query, n_features) Query entities that were used to obtain neighbors If none is provided use source that was provided in fit step - assume_sorted: bool - ignored Returns ------- hub_reduced_dist, neigh_ind DisSimLocal distances, and corresponding neighbor indices + Notes ----- The returned distances are NOT sorted! If you use this class directly, @@ -157,27 +130,10 @@ def transform( self, ["target_", "target_centroids_", "target_dist_to_centroids_"], ) - if query is None: - query = self.source_ - n_test, n_indexed = neigh_dist.shape - if n_indexed == 1: - warnings.warn( - "Cannot perform hubness reduction with a single neighbor per query. " - "Skipping hubness reduction, and returning untransformed distances." - ) - return neigh_dist, neigh_ind - - k = self.k - if k > neigh_ind.shape[1]: - warnings.warn( - "Neighborhood parameter k larger than provided neighbors in" - f" neigh_dist, neigh_ind. Will reduce to k={neigh_ind.shape[1]}." 
- ) - k = neigh_ind.shape[1] - # Calculate local neighborhood centroids for source objects among target objects + k = neigh_ind.shape[1] mask = np.argpartition(neigh_dist, kth=k - 1) for i, ind in enumerate(neigh_ind): neigh_dist[i, :] = euclidean_distances( diff --git a/kiez/hubness_reduction/local_scaling.py b/kiez/hubness_reduction/local_scaling.py index b4a28a8..03fb7b2 100644 --- a/kiez/hubness_reduction/local_scaling.py +++ b/kiez/hubness_reduction/local_scaling.py @@ -29,6 +29,7 @@ class LocalScaling(HubnessReduction): - 'nicdm' rescales distances using a statistic over distances to k neighbors verbose: int, default = 0 If verbose > 0, show progress bar. + References ---------- .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). @@ -36,33 +37,26 @@ class LocalScaling(HubnessReduction): Learning Research, 13(1), 2871–2902. """ - def __init__( - self, k: int = 5, method: str = "standard", verbose: int = 0, **kwargs - ): + def __init__(self, method: str = "standard", **kwargs): super().__init__(**kwargs) - self.k = k self.method = method.lower() if self.method not in ["ls", "standard", "nicdm"]: raise ValueError( f"Internal: Invalid method {self.method}. Try 'ls' or 'nicdm'." ) - self.verbose = verbose def __repr__(self): return ( - f"{self.__class__.__name__}(k={self.k}, method = {self.method}, verbose =" + f"{self.__class__.__name__}(method = {self.method}, verbose =" f" {self.verbose})" ) - def fit( + def _fit( self, neigh_dist, neigh_ind, source, target, - assume_sorted: bool = True, - *args, - **kwargs, ) -> LocalScaling: """Fit the model using neigh_dist and neigh_ind as training data. @@ -77,34 +71,14 @@ def fit( Ignored target Ignored - assume_sorted: bool, default = True #noqa: DAR103 - Assume input matrices are sorted according to neigh_dist. - If False, these are sorted here. 
- *args - Ignored - **kwargs - Ignored + Returns ------- LocalScaling Fitted LocalScaling """ - # Check equal number of rows and columns - check_consistent_length(neigh_ind, neigh_dist) - check_consistent_length(neigh_ind.T, neigh_dist.T) - - # increment to include the k-th element in slicing - k = self.k + 1 - - # Find distances to the k-th neighbor (standard LS) or the k neighbors (NICDM) - if assume_sorted: - self.r_dist_t_to_s_ = neigh_dist[:, :k] - self.r_ind_t_to_s_ = neigh_ind[:, :k] - else: - kth = np.arange(self.k) - mask = np.argpartition(neigh_dist, kth=kth)[:, :k] - self.r_dist_t_to_s_ = np.take_along_axis(neigh_dist, mask, axis=1) - self.r_ind_t_to_s_ = np.take_along_axis(neigh_ind, mask, axis=1) + self.r_dist_t_to_s_ = neigh_dist + self.r_ind_t_to_s_ = neigh_ind return self def transform( @@ -112,9 +86,6 @@ def transform( neigh_dist, neigh_ind, query=None, - assume_sorted: bool = True, - *args, - **kwargs, ) -> Tuple[np.ndarray, np.ndarray]: """Transform distance between test and training data with Mutual Proximity. @@ -127,18 +98,17 @@ def transform( Neighbor indices corresponding to the values in neigh_dist query Ignored - assume_sorted: bool, default = True #noqa: DAR103 - Assume input matrices are sorted according to neigh_dist. - If False, these are partitioned here. - NOTE: The returned matrices are never sorted. + Returns ------- hub_reduced_dist, neigh_ind Local scaling distances, and corresponding neighbor indices + Raises ------ ValueError If wrong self.method was supplied + Notes ----- The returned distances are NOT sorted! If you use this class directly, @@ -148,23 +118,8 @@ def transform( n_test, n_indexed = neigh_dist.shape - if n_indexed == 1: - warnings.warn( - "Cannot perform hubness reduction with a single neighbor per query. " - "Skipping hubness reduction, and returning untransformed distances." 
- ) - return neigh_dist, neigh_ind - - # increment to include the k-th element in slicing - k = self.k + 1 - # Find distances to the k-th neighbor (standard LS) or the k neighbors (NICDM) - if assume_sorted: - r_dist_s_to_t = neigh_dist[:, :k] - else: - kth = np.arange(self.k) - mask = np.argpartition(neigh_dist, kth=kth)[:, :k] - r_dist_s_to_t = np.take_along_axis(neigh_dist, mask, axis=1) + r_dist_s_to_t = neigh_dist # Calculate LS or NICDM hub_reduced_dist = np.empty_like(neigh_dist) @@ -178,11 +133,7 @@ def transform( ) # Perform standard local scaling... - if self.method not in ["ls", "standard", "nicdm"]: - raise ValueError( - f"Internal: Invalid method {self.method}. Try 'ls' or 'nicdm'." - ) - elif self.method in ["ls", "standard"]: + if self.method in ["ls", "standard"]: r_t_to_s = self.r_dist_t_to_s_[:, -1] r_s_to_t = r_dist_s_to_t[:, -1] for i in range_n_test: diff --git a/kiez/hubness_reduction/mutual_proximity.py b/kiez/hubness_reduction/mutual_proximity.py index be3f950..358b474 100644 --- a/kiez/hubness_reduction/mutual_proximity.py +++ b/kiez/hubness_reduction/mutual_proximity.py @@ -31,6 +31,7 @@ class MutualProximity(HubnessReduction): - 'empiric' or 'exact' model distances with the empiric distributions (slow) verbose: int, default = 0 If verbose > 0, show progress bar. + References ---------- .. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012). @@ -38,18 +39,17 @@ class MutualProximity(HubnessReduction): Learning Research, 13(1), 2871–2902. """ - def __init__(self, method: str = "normal", verbose: int = 0, **kwargs): + def __init__(self, method: str = "normal", **kwargs): super().__init__(**kwargs) if method not in ["exact", "empiric", "normal", "gaussi"]: raise ValueError( f'Mutual proximity method "{method}" not recognized. Try "normal"' ' or "empiric".' 
) - if method in ["exact", "empiric"]: + elif method in ["exact", "empiric"]: self.method = "empiric" elif method in ["normal", "gaussi"]: self.method = "normal" - self.verbose = verbose def __repr__(self): return ( @@ -57,15 +57,12 @@ def __repr__(self): f" {self.verbose})" ) - def fit( + def _fit( self, neigh_dist, neigh_ind, source, target, - assume_sorted=None, - *args, - **kwargs, ) -> MutualProximity: """Fit the model using neigh_dist and neigh_ind as training data. @@ -80,10 +77,9 @@ def fit( Ignored target Ignored - assume_sorted - Ignored + Returns - ------ + ------- MutualProximity Raises @@ -91,17 +87,9 @@ def fit( ValueError If self.method is unknown """ - # Check equal number of rows and columns - check_consistent_length(neigh_ind, neigh_dist) - check_consistent_length(neigh_ind.T, neigh_dist.T) - check_array(neigh_dist, force_all_finite=False) - check_array(neigh_ind) - self.n_train = neigh_dist.shape[0] - if self.method not in ["normal", "empiric"]: - raise ValueError(f"Internal: Invalid method {self.method}.") - elif self.method == "empiric": + if self.method == "empiric": self.neigh_dist_t_to_s_ = neigh_dist self.neigh_ind_t_to_s_ = neigh_ind elif self.method == "normal": @@ -109,9 +97,7 @@ def fit( self.sd_t_to_s_ = np.nanstd(neigh_dist, axis=1, ddof=0) return self - def transform( - self, neigh_dist, neigh_ind, query, assume_sorted=None, *args, **kwargs - ): + def transform(self, neigh_dist, neigh_ind, query): """Transform distance between test and training data with Mutual Proximity. Parameters @@ -123,16 +109,17 @@ def transform( Neighbor indices corresponding to the values in neigh_dist query Ignored - assume_sorted - Ignored + Returns ------- hub_reduced_dist, neigh_ind Mutual Proximity distances, and corresponding neighbor indices + Raises ------ ValueError if self.method is unknown + Notes ----- The returned distances are NOT sorted! 
If you use this class directly, @@ -148,18 +135,8 @@ def transform( ], all_or_any=any, ) - check_array(neigh_dist, force_all_finite="allow-nan") - check_array(neigh_ind) - n_test, n_indexed = neigh_dist.shape - if n_indexed == 1: - warnings.warn( - "Cannot perform hubness reduction with a single neighbor per query. " - "Skipping hubness reduction, and returning untransformed distances." - ) - return neigh_dist, neigh_ind - hub_reduced_dist = np.empty_like(neigh_dist) # Show progress in hubness reduction loop @@ -171,9 +148,7 @@ def transform( ) # Calculate MP with independent Gaussians - if self.method not in ["normal", "empiric"]: - raise ValueError(f"Internal: Invalid method {self.method}.") - elif self.method == "normal": + if self.method == "normal": mu_t_to_s = self.mu_t_to_s_ sd_t_to_s_ = self.sd_t_to_s_ for i in range_n_test: diff --git a/kiez/io/temp_file_handling.py b/kiez/io/temp_file_handling.py index bc3f883..83829d3 100644 --- a/kiez/io/temp_file_handling.py +++ b/kiez/io/temp_file_handling.py @@ -19,7 +19,7 @@ def create_tempfile_preferably_in_dir( For example, this is useful to try to save into /dev/shm. 
Parameters - --------- + ---------- suffix: str suffix of tempfile prefix: str diff --git a/kiez/kiez.py b/kiez/kiez.py index 08dacc5..9e23f2d 100644 --- a/kiez/kiez.py +++ b/kiez/kiez.py @@ -1,21 +1,20 @@ -# -*- coding: utf-8 -*- -# SPDX-License-Identifier: BSD-3-Clause from __future__ import annotations import json +import warnings from pathlib import Path from typing import Any, Dict, Optional, Tuple, Union import numpy as np from class_resolver import HintOrType -from kiez.hubness_reduction import DisSimLocal, hubness_reduction_resolver +from kiez.hubness_reduction import hubness_reduction_resolver from kiez.hubness_reduction.base import HubnessReduction, NoHubnessReduction from kiez.neighbors import NNAlgorithm, nn_algorithm_resolver class Kiez: - """Performs hubness reduced nearest neighbor search for entity alignment + """Performs hubness reduced nearest neighbor search for entity alignment. Use the given algorithm to :meth:`fit` the data and calculate the :meth:`kneighbors`. @@ -79,38 +78,47 @@ class Kiez: def __init__( self, - n_neighbors: int = 5, + n_candidates: int = 10, algorithm: HintOrType[NNAlgorithm] = None, algorithm_kwargs: Optional[Dict[str, Any]] = None, hubness: HintOrType[HubnessReduction] = None, hubness_kwargs: Optional[Dict[str, Any]] = None, ): - if not np.issubdtype(type(n_neighbors), np.integer): + if not np.issubdtype(type(n_candidates), np.integer): raise TypeError( - f"n_neighbors does not take {type(n_neighbors)} value, enter" + f"n_neighbors does not take {type(n_candidates)} value, enter" " integer value" ) - elif n_neighbors <= 0: - raise ValueError(f"Expected n_neighbors > 0. Got {n_neighbors}") - self.n_neighbors = n_neighbors - if algorithm is None and algorithm_kwargs is None: - algorithm_kwargs = {"n_candidates": n_neighbors} + elif n_candidates <= 0: + raise ValueError(f"Expected n_candidates > 0. 
Got {n_candidates}") + if algorithm_kwargs is None: + algorithm_kwargs = {"n_candidates": n_candidates} + elif "n_candidates" not in algorithm_kwargs: + algorithm_kwargs["n_candidates"] = n_candidates if algorithm is None: try: - self.algorithm = nn_algorithm_resolver.make("Faiss", algorithm_kwargs) + algorithm = nn_algorithm_resolver.make("Faiss", algorithm_kwargs) except ImportError: - self.algorithm = nn_algorithm_resolver.make( - "SklearnNN", algorithm_kwargs - ) + algorithm = nn_algorithm_resolver.make("SklearnNN", algorithm_kwargs) else: - self.algorithm = nn_algorithm_resolver.make(algorithm, algorithm_kwargs) - assert self.algorithm + algorithm = nn_algorithm_resolver.make(algorithm, algorithm_kwargs) + assert algorithm + if hubness_kwargs is None: + hubness_kwargs = dict() + hubness_kwargs["nn_algo"] = algorithm self.hubness = hubness_reduction_resolver.make(hubness, hubness_kwargs) - self._check_algorithm_hubness_compatibility() + + @property + def algorithm(self): + return self.hubness.nn_algo + + @algorithm.setter + def algorithm(self, value): + self.hubness.nn_algo = value def __repr__(self): return ( - f"Kiez(n_neighbors: {self.n_neighbors}, algorithm: {self.algorithm}," + f"Kiez(algorithm: {self.algorithm}," f" hubness: {self.hubness})" f" {self.algorithm._describe_source_target_fitted()}" ) @@ -121,43 +129,8 @@ def from_path(cls, path: Union[str, Path]) -> Kiez: with open(path) as file: return cls(**json.load(file)) - def _kcandidates( - self, query_points, *, s_to_t=True, k=None, return_distance=True - ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: - if k is None: - k = self.algorithm.n_candidates - - # The number of candidates must not be less than the number of neighbors used downstream - if k < self.n_neighbors: - k = self.n_neighbors - return self.algorithm.kneighbors( - k=k, - query=query_points, - s_to_t=s_to_t, - return_distance=return_distance, - ) - - def _check_algorithm_hubness_compatibility(self): - if isinstance(self.hubness, 
DisSimLocal): - if self.algorithm.metric in ["euclidean", "minkowski"]: - self.hubness.squared = False - if hasattr(self.algorithm, "p"): - if self.algorithm.p != 2: - raise ValueError( - "DisSimLocal only supports squared Euclidean distances. If" - " the provided NNAlgorithm has a `p` parameter it must be" - f" set to p=2. Now it is p={self.algorithm.p}" - ) - elif self.algorithm.metric in ["sqeuclidean"]: - self.hubness.squared = True - else: - raise ValueError( - "DisSimLocal only supports squared Euclidean distances, not" - f" metric={self.algorithm.metric}." - ) - def fit(self, source, target=None) -> Kiez: - """Fits the algorithm and hubness reduction method + """Fits the algorithm and hubness reduction method. Parameters ---------- @@ -171,40 +144,20 @@ def fit(self, source, target=None) -> Kiez: Kiez Fitted kiez instance """ - self.algorithm.fit(source, target) - if target is None: - target = source - if not isinstance(self.hubness, NoHubnessReduction): - neigh_dist_t_to_s, neigh_ind_t_to_s = self._kcandidates( - target, - s_to_t=False, - k=self.algorithm.n_candidates, - return_distance=True, - ) - self.hubness.fit( - neigh_dist_t_to_s, - neigh_ind_t_to_s, - source, - target, - assume_sorted=False, - ) + self.hubness.fit(source, target) return self def kneighbors( self, - source_query_points=None, - k=None, + k: Optional[int] = None, return_distance=True, ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]: - """Retrieves the k-nearest neighbors using the supplied nearest neighbor algorithm and hubness reduction method. + """Retrieve the k-nearest neighbors using the supplied nearest neighbor algorithm and hubness reduction method. 
Parameters ---------- - source_query_points : matrix of shape (n_samples, n_features), default = None - subset of source entity embeddings - if `None` all source entities are used for querying - k : int, default = None - number of nearest neighbors to search for + k : Optional[int], default = None + k-nearest neighbors, if None is set to number of n_candidates return_distance : bool, default = True Whether to return distances If `False` only indices are returned @@ -217,33 +170,7 @@ def kneighbors( neigh_ind : ndarray of shape (n_queries, n_neighbors) Indices of the nearest points in the population matrix. """ - # function loosely adapted from skhubness: https://github.com/VarIr/scikit-hubness - - if k is None: - n_neighbors = self.n_neighbors - else: - n_neighbors = k - # First obtain candidate neighbors - query_dist, query_ind = self._kcandidates( - source_query_points, return_distance=True - ) - query_dist = np.atleast_2d(query_dist) - query_ind = np.atleast_2d(query_ind) - - # Second, reduce hubness - hubness_reduced_query_dist, query_ind = self.hubness.transform( - query_dist, - query_ind, - source_query_points, - assume_sorted=True, - ) - # Third, sort hubness reduced candidate neighbors to get the final k neighbors - kth = np.arange(n_neighbors) - mask = np.argpartition(hubness_reduced_query_dist, kth=kth)[:, :n_neighbors] - hubness_reduced_query_dist = np.take_along_axis( - hubness_reduced_query_dist, mask, axis=1 - ) - query_ind = np.take_along_axis(query_ind, mask, axis=1) + hubness_reduced_query_dist, query_ind = self.hubness.kneighbors(k) if return_distance: result = hubness_reduced_query_dist, query_ind diff --git a/kiez/neighbors/approximate/faiss.py b/kiez/neighbors/approximate/faiss.py index f95e1fa..72d0be9 100644 --- a/kiez/neighbors/approximate/faiss.py +++ b/kiez/neighbors/approximate/faiss.py @@ -10,19 +10,12 @@ import faiss except ImportError: # pragma: no cover faiss = None -try: - import autofaiss -except ImportError: # pragma: no cover - 
autofaiss = None class Faiss(NNAlgorithm): """Wrapper for `faiss` library. Faiss implements a number of (A)NN algorithms and enables the use of GPUs. - If it is installed and you let it, kiez utilizes the `autofaiss` package to - find the appropriate indexing structure and optimizes the hyperparameters of - the algorithm Parameters ---------- @@ -51,14 +44,8 @@ class Faiss(NNAlgorithm): >>> k_inst = Kiez(algorithm="Faiss") >>> k_inst.fit(source, target) - get info about selected indices - - >>> k_inst.algorithm.source_index_infos["index_key"] - 'HNSW15' - >>> k_inst = Kiez(algorithm="Faiss",algorithm_kwargs={"metric":"euclidean","index_key":"Flat"}) - supply hyperparameters for indexing algorithm >>> k_inst = Kiez(algorithm="Faiss",algorithm_kwargs={"index_key":"HNSW32","index_param":"efSearch=16383"}) @@ -66,7 +53,6 @@ class Faiss(NNAlgorithm): Notes ----- For details about configuring faiss consult their wiki: https://github.com/facebookresearch/faiss/wiki - For details about autofaiss see their documentation: https://criteo.github.io/autofaiss/ """ valid_metrics = ["l2", "euclidean"] @@ -76,7 +62,7 @@ def __init__( self, n_candidates: int = 5, metric: str = "l2", - index_key: Optional[str] = None, + index_key: str = "Flat", index_param: Optional[str] = None, use_gpu: bool = False, verbose: int = logging.WARNING, @@ -97,118 +83,37 @@ def __init__( else: self.space = metric super().__init__(n_candidates=n_candidates, metric=metric, n_jobs=None) - use_auto_tune = autofaiss is not None - # check index string - if index_key: - try: - faiss.index_factory(1, index_key) - except RuntimeError as exc: - raise ValueError( - f'Could not parse index "{index_key}".\n Please consult the faiss' - " wiki to create a correct instruction:" - " https://github.com/facebookresearch/faiss/wiki/The-index-factory" - ) from exc - # user seems to know what they want so no tuning - if index_param or index_key == "Flat": - use_auto_tune = False - elif index_param: - warnings.warn( - "Index 
key not set but hyperparameter given. Are you sure this is" - " intended?" - ) - else: - # no index and no hyperparams so check - # if autofaiss is available - if autofaiss is None: # pragma: no cover - warnings.warn( - "Please install the `autofaiss` package, to enable automatic index" - " selection.\nYou can install `autofaiss` via: pip install" - " autofaiss\n Will use `Flat` index for now, but there are probably" - " better choices..." - ) - use_auto_tune = False self.index_key = index_key self.index_param = index_param - self.use_auto_tune = use_auto_tune self.use_gpu = use_gpu - self.index_infos = None self.verbose = verbose - def _source_target_repr(self, is_source: bool): - ret_str = f"{self.__class__.__name__}(n_candidates={self.n_candidates},metric={self.metric}," - if is_source: - ret_str += ( - f"index_key={self.source_index_key}," - f" index_param={{{self.source_index_param}}}," - ) - else: - ret_str += ( - f"index_key={self.target_index_key}," - f" index_param={{{self.target_index_param}}}," - ) - ret_str += f"use_auto_tune={self.use_auto_tune}, use_gpu={self.use_gpu})" - return ret_str - def __repr__(self): - if hasattr(self, "source_index_key") and hasattr(self, "target_index_key"): - ret_str = ( - f"Source: {self._source_target_repr(True)}, " - f"Target: {self._source_target_repr(False)}" - ) - elif hasattr(self, "source_index_key"): - ret_str = f"{self._source_target_repr(True)}" - else: - ret_str = ( - f"{self.__class__.__name__}(n_candidates={self.n_candidates}," - + f"metric={self.metric}," - + f"index_key={self.index_key}," - + f"index_param={{{self.index_param}}}," - + f"use_auto_tune={self.use_auto_tune}," - + f"use_gpu={self.use_gpu})" - ) - return ret_str - - def _to_float32(self, data): - if not data.dtype == "float32": - return data.astype("float32") - return data + return ( + f"{self.__class__.__name__}(n_candidates={self.n_candidates}," + + f"metric={self.metric}," + + f"index_key={self.index_key}," + + 
f"index_param={{{self.index_param}}}," + + f"use_gpu={self.use_gpu})" + ) def _fit(self, data, is_source: bool): dim = data.shape[1] - if self.use_auto_tune: - index, index_infos = autofaiss.build_index( - self._to_float32(data), - index_key=self.index_key, - index_param=self.index_param, - metric_type=self.space, - save_on_disk=False, - use_gpu=self.use_gpu, - verbose=self.verbose, - ) - if is_source: - self.source_index_key = index_infos["index_key"] - self.source_index_param = index_infos["index_param"] - self.source_index_infos = index_infos - else: - self.target_index_key = index_infos["index_key"] - self.target_index_param = index_infos["index_param"] - self.target_index_infos = index_infos - else: - index = faiss.index_factory(dim, self.index_key) - params = faiss.ParameterSpace() - if self.use_gpu: - index = faiss.index_cpu_to_all_gpus(index) - params = faiss.GpuParameterSpace() - if self.index_param is not None: - params.set_index_parameters(index, self.index_param) - index.add(self._to_float32(data)) + index = faiss.index_factory(dim, self.index_key) + params = faiss.ParameterSpace() + if self.use_gpu: + index = faiss.index_cpu_to_all_gpus(index) + params = faiss.GpuParameterSpace() + if self.index_param is not None: + params.set_index_parameters(index, self.index_param) + index.add(data) return index def _kneighbors(self, k, query, index, return_distance, is_self_querying): if is_self_querying: - dist, ind = index.search(self._to_float32(self.source_), k) + dist, ind = index.search(self.source_, k) else: - dist, ind = index.search(self._to_float32(query), k) + dist, ind = index.search(query, k) if return_distance: if self.metric == "euclidean": dist = np.sqrt(dist) diff --git a/kiez/neighbors/exact/sklearn_nearest_neighbors.py b/kiez/neighbors/exact/sklearn_nearest_neighbors.py index 55378c8..86a1c5b 100644 --- a/kiez/neighbors/exact/sklearn_nearest_neighbors.py +++ b/kiez/neighbors/exact/sklearn_nearest_neighbors.py @@ -5,7 +5,7 @@ class 
SklearnNN(NNAlgorithm): - """Wrapper for scikit learn's NearestNeighbors class + """Wrapper for scikit learn's NearestNeighbors class. Parameters ---------- @@ -40,6 +40,7 @@ class SklearnNN(NNAlgorithm): The number of parallel jobs to run for neighbors search. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. + Notes ----- See also scikit learn's guide: https://scikit-learn.org/stable/modules/neighbors.html#unsupervised-neighbors diff --git a/kiez/neighbors/neighbor_algorithm_base.py b/kiez/neighbors/neighbor_algorithm_base.py index 6f59e53..0e5f397 100644 --- a/kiez/neighbors/neighbor_algorithm_base.py +++ b/kiez/neighbors/neighbor_algorithm_base.py @@ -32,7 +32,12 @@ def valid_metrics(self): def _fit(self, data, is_source: bool): pass # pragma: no cover - def fit(self, source: np.ndarray, target: np.ndarray = None): + def fit( + self, + source: np.ndarray, + target: np.ndarray = None, + only_fit_target: bool = False, + ): """Indexes the given data using the underlying algorithm Parameters @@ -41,6 +46,10 @@ def fit(self, source: np.ndarray, target: np.ndarray = None): embeddings of source entities target : matrix of shape (m_samples, n_features) embeddings of target entities or None in a single-source use case + only_fit_target : bool + If true only indexes target. 
Will lead to problems later with many + hubness reduction methods and should mainly be used for search + without hubness reduction Raises ------ @@ -59,8 +68,11 @@ def fit(self, source: np.ndarray, target: np.ndarray = None): f" but got source.shape: {source.shape} and target.shape:" f" {target.shape}" ) - self.source_index = self._fit(source, True) - self.target_index = self._fit(target, False) + if only_fit_target: + self.target_index = self._fit(target, True) + else: + self.source_index = self._fit(source, True) + self.target_index = self._fit(target, False) self.source_ = source self.target_ = target @@ -82,7 +94,7 @@ def _kneighbors(self, query, k, index, return_distance, is_self_querying): pass # pragma: no cover def kneighbors(self, query=None, k=None, s_to_t=True, return_distance=True): - check_is_fitted(self, ["source_index", "target_index"]) + check_is_fitted(self, ["source_index", "target_index"], all_or_any=any) k = self.n_candidates if k is None else k is_self_querying = query is None and self.source_equals_target diff --git a/kiez/neighbors/util.py b/kiez/neighbors/util.py index cbd34cd..56ce333 100644 --- a/kiez/neighbors/util.py +++ b/kiez/neighbors/util.py @@ -3,14 +3,14 @@ from kiez.neighbors import NNAlgorithm, nn_algorithm_resolver -def available_ann_algorithms() -> List[Type[NNAlgorithm]]: - """Get available approximate nearest neighbor algorithms +def available_nn_algorithms() -> List[Type[NNAlgorithm]]: + """Get available (approximate) nearest neighbor algorithms Returns ------- algorithms: List[Type[NNAlgorithm]] A tuple of available algorithms """ - possible = ["NMSLIB", "NNG", "Annoy", "Faiss"] + possible = ["NMSLIB", "NNG", "Annoy", "Faiss", "SklearnNN"] available = [] for ann in possible: try: diff --git a/noxfile.py b/noxfile.py index 2efcff3..3884ff6 100644 --- a/noxfile.py +++ b/noxfile.py @@ -8,7 +8,15 @@ def tests(session: Session) -> None: session.install(".") session.install("pytest") session.install("pytest-cov") - 
session.run("pytest", *args) + session.run( + "coverage", + "run", + "--source=kiez", + "--data-file=.coverage.base", + "-m", + "pytest", + *args + ) @nox_session(python="3.10", venv_backend="conda") @@ -21,7 +29,15 @@ def test_faiss(session: Session) -> None: session.install("autofaiss") session.install("pytest") session.install("pytest-cov") - session.run("pytest", *args) + session.run( + "coverage", + "run", + "--source=kiez", + "--data-file=.coverage.faiss", + "-m", + "pytest", + *args + ) @session(python="3.10") @@ -30,7 +46,15 @@ def test_ngt(session: Session) -> None: session.install(".[ngt]") session.install("pytest") session.install("pytest-cov") - session.run("pytest", *args) + session.run( + "coverage", + "run", + "--source=kiez", + "--data-file=.coverage.ngt", + "-m", + "pytest", + *args + ) @session(python="3.10") @@ -39,7 +63,15 @@ def test_nmslib(session: Session) -> None: session.install(".[nmslib]") session.install("pytest") session.install("pytest-cov") - session.run("pytest", *args) + session.run( + "coverage", + "run", + "--source=kiez", + "--data-file=.coverage.nmslib", + "-m", + "pytest", + *args + ) @session(python="3.10") @@ -48,7 +80,15 @@ def test_annoy(session: Session) -> None: session.install(".[annoy]") session.install("pytest") session.install("pytest-cov") - session.run("pytest", *args) + session.run( + "coverage", + "run", + "--source=kiez", + "--data-file=.coverage.annoy", + "-m", + "pytest", + *args + ) locations = ["kiez", "tests", "noxfile.py"] diff --git a/pyproject.toml b/pyproject.toml index 28c708f..0706cf4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,7 @@ requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" [tool.flake8] -ignore = "E203, E266, E501, W503, F403, F401, B950, B905" +ignore = "E203, E266, E501, W503, F403, F401, B950, B905, C408" max-line-length = 88 max-complexity = 18 select = "B,C,E,F,W,T4,B9" diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 
index 0000000..5c7cfc3 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,11 @@ +import numpy as np +import pytest + + +@pytest.fixture(scope="session", autouse=True) +def source_target(request): + rng = np.random.RandomState(42) + n_samples = 20 + n_samples2 = 50 + n_features = 5 + return rng.rand(n_samples, n_features), rng.rand(n_samples2, n_features) diff --git a/tests/hubness_reduction/test_mutual_proximity.py b/tests/hubness_reduction/test_mutual_proximity.py deleted file mode 100644 index bf1eb75..0000000 --- a/tests/hubness_reduction/test_mutual_proximity.py +++ /dev/null @@ -1,25 +0,0 @@ -import numpy as np -import pytest -from numpy.testing import assert_array_equal - -from kiez import Kiez -from kiez.hubness_reduction import MutualProximity - -rng = np.random.RandomState(2) - - -def test_wrong_input(): - with pytest.raises(ValueError) as exc_info: - MutualProximity(method="wrong") - assert "not recognized" in str(exc_info.value) - - -def test_sqeuclidean(n_samples=20, n_features=5): - source = rng.rand(n_samples, n_features) - target = rng.rand(n_samples, n_features) - k_inst = Kiez(hubness=MutualProximity()) - k_inst.fit(source, target) - ndist, nind = k_inst.kneighbors(k=1) - out_dist, out_nind = k_inst.hubness.transform(ndist, nind, None) - assert_array_equal(ndist, out_dist) - assert_array_equal(nind, out_nind) diff --git a/tests/hubness_reduction/test_wrong_inputs.py b/tests/hubness_reduction/test_wrong_inputs.py new file mode 100644 index 0000000..f88a908 --- /dev/null +++ b/tests/hubness_reduction/test_wrong_inputs.py @@ -0,0 +1,21 @@ +import numpy as np +import pytest +from numpy.testing import assert_array_equal + +from kiez import Kiez +from kiez.hubness_reduction import LocalScaling, MutualProximity +from kiez.neighbors import SklearnNN + +rng = np.random.RandomState(2) + + +def test_wrong_input_mp(): + with pytest.raises(ValueError) as exc_info: + MutualProximity(nn_algo=SklearnNN(), method="wrong") + assert "not recognized" in 
str(exc_info.value) + + +def test_wrong_input_ls(): + with pytest.raises(ValueError) as exc_info: + LocalScaling(nn_algo=SklearnNN(), method="wrong") + assert "Invalid" in str(exc_info.value) diff --git a/tests/neighbors/test_alignment.py b/tests/neighbors/test_alignment.py deleted file mode 100644 index fe6ac14..0000000 --- a/tests/neighbors/test_alignment.py +++ /dev/null @@ -1,178 +0,0 @@ -import numpy as np -import pytest -from numpy.testing import assert_array_almost_equal, assert_array_equal - -from kiez import Kiez -from kiez.hubness_reduction import CSLS, DisSimLocal, LocalScaling, MutualProximity -from kiez.neighbors import Annoy, Faiss, SklearnNN -from kiez.neighbors.util import available_ann_algorithms - -P = (1, 3, 4, np.inf, 2) # Euclidean last, for tests against approx NN -rng = np.random.RandomState(2) -APPROXIMATE_ALGORITHMS = available_ann_algorithms() - -MP = [MutualProximity(method=method) for method in ["normal", "empiric"]] -LS = [LocalScaling(method=method) for method in ["standard", "nicdm"]] -DSL = [DisSimLocal(squared=val) for val in [True, False]] -HUBNESS = [None, CSLS(), *MP, *LS, *DSL] - - -@pytest.mark.parametrize("hubness", HUBNESS) -def test_alignment_source_equals_target( - hubness, - n_samples=20, - n_features=5, - n_query_pts=10, - n_neighbors=5, -): - source = rng.rand(n_samples, n_features) - query = rng.rand(n_query_pts, n_features) - exactalgos = [ - SklearnNN(n_candidates=n_neighbors, algorithm=algo) - for algo in ["auto", "kd_tree", "ball_tree", "brute"] - ] - - if Faiss in APPROXIMATE_ALGORITHMS: - exactalgos.append( - Faiss(n_candidates=n_neighbors, metric="euclidean", index_key="Flat") - ) - - for p in P: - results = [] - results_nodist = [] - - for algo in exactalgos: - if hubness == "dsl" and p != 2: - with pytest.raises(ValueError): - align = Kiez( - n_neighbors=n_neighbors, algorithm=algo, hubness=hubness - ) - continue - align = Kiez(n_neighbors=n_neighbors, algorithm=algo, hubness=hubness) - align.fit(source) - 
results.append( - align.kneighbors(source_query_points=query, return_distance=True) - ) - results_nodist.append( - align.kneighbors(source_query_points=query, return_distance=False) - ) - for i in range(len(results) - 1): - assert_array_almost_equal(results_nodist[i], results[i][1]) - assert_array_almost_equal(results[i][0], results[i + 1][0], decimal=3) - assert_array_almost_equal(results[i][1], results[i + 1][1]) - # Test approximate NN against exact NN with Euclidean distances - assert p == 2, f"Internal: last parameter p={p}, should have been 2" - - ann_algos = [ - algo_cls(n_candidates=n_neighbors, metric="euclidean") - for algo_cls in APPROXIMATE_ALGORITHMS - ] - for algo in ann_algos: - align = Kiez( - n_neighbors=n_neighbors, - algorithm=algo, - hubness=hubness, - ) - align.fit(source) - results_approx = align.kneighbors( - source_query_points=query, return_distance=True - ) - results_approx_nodist = align.kneighbors( - source_query_points=query, return_distance=False - ) - assert_array_equal(results_approx_nodist, results_approx[1]) - if isinstance(algo, Annoy): # quite imprecise - assert_array_almost_equal(results_approx[0], results[1][0], decimal=0) - for i in range(len(results_approx[1])): - assert np.intersect1d(results_approx[1][i], results[1][1][i]).size >= 1 - else: - assert_array_almost_equal(results_approx[0], results[1][0], decimal=3) - for ra, r in zip(results_approx[1], results[1][1]): - assert set(ra) == set(r) - - -@pytest.mark.parametrize("hubness", HUBNESS) -def test_alignment( - hubness, - n_samples=20, - n_features=5, - n_query_pts=10, - n_neighbors=5, -): - source = rng.rand(n_query_pts, n_features) - target = rng.rand(n_samples, n_features) - - exactalgos = [ - SklearnNN(n_candidates=n_neighbors, algorithm=algo) - for algo in ["auto", "kd_tree", "ball_tree", "brute"] - ] - if Faiss in APPROXIMATE_ALGORITHMS: - exactalgos.append( - Faiss(n_candidates=n_neighbors, metric="euclidean", index_key="Flat") - ) - - for p in P: - results = [] - 
results_nodist = [] - - for algo in exactalgos: - if hubness == "dsl" and p != 2: - with pytest.raises(ValueError): - align = Kiez( - n_neighbors=n_neighbors, algorithm=algo, hubness=hubness - ) - continue - align = Kiez(n_neighbors=n_neighbors, algorithm=algo, hubness=hubness) - align.fit(source, target) - results.append(align.kneighbors(return_distance=True)) - results_nodist.append(align.kneighbors(return_distance=False)) - for i in range(len(results) - 1): - try: - assert_array_almost_equal(results_nodist[i], results[i][1]) - assert_array_almost_equal(results[i][0], results[i + 1][0]) - assert_array_almost_equal(results[i][1], results[i + 1][1]) - except AssertionError as error: - # empiric mp with ball tree can give slightly different results - # because slight differences in distance provided by ball_tree - if not ( - isinstance(hubness, MutualProximity) and hubness.method == "empiric" - ): - raise error - # Test approximate NN against exact NN with Euclidean distances - assert p == 2, f"Internal: last parameter p={p}, should have been 2" - ann_algos = [ - algo_cls(n_candidates=n_neighbors, metric="euclidean") - for algo_cls in APPROXIMATE_ALGORITHMS - ] - for algo in ann_algos: - align = Kiez( - n_neighbors=n_neighbors, - algorithm=algo, - hubness=hubness, - ) - align.fit(source, target) - results_approx = align.kneighbors( - source_query_points=source, return_distance=True - ) - results_approx_nodist = align.kneighbors( - source_query_points=source, return_distance=False - ) - assert_array_equal(results_approx_nodist, results_approx[1]) - if isinstance(algo, Annoy): # quite imprecise - assert_array_almost_equal(results_approx[0], results[1][0], decimal=0) - for i in range(len(results_approx[1])): - try: - assert ( - np.intersect1d(results_approx[1][i], results[1][1][i]).size >= 1 - ), f"{algo} failed with {hubness}" - except AssertionError as error: - # empiric mp with ball tree can give slightly different results - # because slight differences in distance 
provided by ball_tree - if not ( - isinstance(hubness, MutualProximity) - and hubness.method == "empiric" - ): - raise error - else: - for ra, r in zip(results_approx[1], results[1][1]): - assert set(ra) == set(r), f"{algo} failed with {hubness}" diff --git a/tests/neighbors/test_annoy.py b/tests/neighbors/test_annoy.py index af86564..76511ad 100644 --- a/tests/neighbors/test_annoy.py +++ b/tests/neighbors/test_annoy.py @@ -3,10 +3,10 @@ from numpy.testing import assert_array_equal from kiez.neighbors import Annoy -from kiez.neighbors.util import available_ann_algorithms +from kiez.neighbors.util import available_nn_algorithms -APPROXIMATE_ALGORITHMS = available_ann_algorithms() -if Annoy not in APPROXIMATE_ALGORITHMS: +NN_ALGORITHMS = available_nn_algorithms() +if Annoy not in NN_ALGORITHMS: skip = True else: skip = False @@ -30,8 +30,8 @@ def test_minkowski_metric(): @pytest.mark.skipif(skip, reason=skip_reason) -def test_self_query(n_samples=20, n_features=5, n_neighbors=5): - source = rng.rand(n_samples, n_features) +def test_self_query(source_target, n_neighbors=5): + source, _ = source_target annoy = Annoy(n_candidates=n_neighbors) annoy.fit(source, source) d, i = annoy.kneighbors() @@ -40,49 +40,28 @@ def test_self_query(n_samples=20, n_features=5, n_neighbors=5): @pytest.mark.skipif(skip, reason=skip_reason) -def test_query(tmp_path, n_samples=20, n_features=5, n_neighbors=5): - source = rng.rand(n_samples, n_features) - target = rng.rand(n_samples, n_features) +def test_query(tmp_path, source_target, n_neighbors=5): + source, target = source_target annoy = Annoy(n_candidates=n_neighbors, metric="euclidean") annoy.fit(source, target) - d, i = annoy.kneighbors( - query=source[ - :5, - ] - ) + d, i = annoy.kneighbors() annoy2 = Annoy(n_candidates=n_neighbors, metric="minkowski") annoy2.fit(source, target) - i2 = annoy2.kneighbors( - query=source[ - :5, - ], - return_distance=False, - ) + i2 = annoy2.kneighbors(return_distance=False) assert_array_equal(i, i2) 
annoy3 = Annoy(n_candidates=n_neighbors, mmap_dir=str(tmp_path)) annoy3.fit(source, target) - i3 = annoy3.kneighbors( - query=source[ - :5, - ], - return_distance=False, - ) + i3 = annoy3.kneighbors(return_distance=False) assert_array_equal(i, i3) annoy4 = Annoy(n_candidates=n_neighbors, mmap_dir=None) annoy4.fit(source, target) - i4 = annoy4.kneighbors( - query=source[ - :5, - ], - return_distance=False, - ) + i4 = annoy4.kneighbors(return_distance=False) assert_array_equal(i, i4) @pytest.mark.skipif(skip, reason=skip_reason) -def test_inner_kneighbors(tmp_path, n_samples=20, n_features=5, n_neighbors=5): - source = rng.rand(n_samples, n_features) - target = rng.rand(n_samples, n_features) +def test_inner_kneighbors(tmp_path, source_target, n_neighbors=5): + source, target = source_target annoy = Annoy(n_candidates=n_neighbors) annoy.fit(source, target) with pytest.raises(AssertionError) as exc_info: diff --git a/tests/neighbors/test_base.py b/tests/neighbors/test_base.py index 4ca55ff..7812b8b 100644 --- a/tests/neighbors/test_base.py +++ b/tests/neighbors/test_base.py @@ -2,17 +2,16 @@ import pytest from kiez.neighbors import NMSLIB, NNG, Annoy, Faiss, SklearnNN -from kiez.neighbors.util import available_ann_algorithms +from kiez.neighbors.util import available_nn_algorithms -APPROXIMATE_ALGORITHMS = available_ann_algorithms() -ALGORITHMS = [*APPROXIMATE_ALGORITHMS, SklearnNN] +NN_ALGORITHMS = available_nn_algorithms() rng = np.random.RandomState(2) -@pytest.mark.parametrize("algo_cls", ALGORITHMS) -def test_str_rep(algo_cls, n_samples=20, n_features=5): - source = rng.rand(n_samples, n_features) +@pytest.mark.parametrize("algo_cls", NN_ALGORITHMS) +def test_str_rep(algo_cls, source_target): + source, _ = source_target algo = algo_cls() assert "is unfitted" in str(algo._describe_source_target_fitted()) algo.fit(source, source) diff --git a/tests/neighbors/test_faiss.py b/tests/neighbors/test_faiss.py index 417ac6a..014f1ac 100644 --- 
a/tests/neighbors/test_faiss.py +++ b/tests/neighbors/test_faiss.py @@ -3,10 +3,10 @@ from numpy.testing import assert_array_equal from kiez.neighbors import Faiss -from kiez.neighbors.util import available_ann_algorithms +from kiez.neighbors.util import available_nn_algorithms -APPROXIMATE_ALGORITHMS = available_ann_algorithms() -if Faiss not in APPROXIMATE_ALGORITHMS: +NN_ALGORITHMS = available_nn_algorithms() +if Faiss not in NN_ALGORITHMS: skip = True else: skip = False @@ -14,10 +14,8 @@ @pytest.mark.skipif(skip, reason="Faiss not installed") @pytest.mark.parametrize("single_source", [True, False]) -def test_different_instantiations(single_source): - rng = np.random.RandomState(2) - source = rng.rand(50, 100) - target = rng.rand(50, 100) +def test_different_instantiations(single_source, source_target): + source, target = source_target for same_config in [ ( {"metric": "l2"}, diff --git a/tests/neighbors/test_hnsw.py b/tests/neighbors/test_hnsw.py index 8b7768d..539ddcf 100644 --- a/tests/neighbors/test_hnsw.py +++ b/tests/neighbors/test_hnsw.py @@ -3,10 +3,10 @@ from numpy.testing import assert_array_equal from kiez.neighbors import NMSLIB -from kiez.neighbors.util import available_ann_algorithms +from kiez.neighbors.util import available_nn_algorithms -APPROXIMATE_ALGORITHMS = available_ann_algorithms() -if NMSLIB not in APPROXIMATE_ALGORITHMS: +NN_ALGORITHMS = available_nn_algorithms() +if NMSLIB not in NN_ALGORITHMS: skip = True else: skip = False @@ -23,44 +23,24 @@ def test_wrong_metric(): @pytest.mark.skipif(skip, reason=skip_reason) -def test_sqeuclidean(n_samples=20, n_features=5, n_neighbors=5): - source = rng.rand(n_samples, n_features) - target = rng.rand(n_samples, n_features) +def test_sqeuclidean(source_target, n_neighbors=5): + source, target = source_target hnsw1 = NMSLIB(n_candidates=n_neighbors, metric="sqeuclidean") hnsw1.fit(source, target) - d, i = hnsw1.kneighbors( - query=source[ - :5, - ] - ) + d, i = hnsw1.kneighbors() hnsw2 = 
NMSLIB(n_candidates=n_neighbors) hnsw2.fit(source, target) - i2 = hnsw2.kneighbors( - query=source[ - :5, - ], - return_distance=False, - ) + i2 = hnsw2.kneighbors(return_distance=False) assert_array_equal(i, i2) @pytest.mark.skipif(skip, reason=skip_reason) -def test_cosine(n_samples=20, n_features=5, n_neighbors=5): - source = rng.rand(n_samples, n_features) - target = rng.rand(n_samples, n_features) +def test_cosine(source_target, n_neighbors=5): + source, target = source_target hnsw1 = NMSLIB(n_candidates=n_neighbors, metric="cosine") hnsw1.fit(source, target) - d, i = hnsw1.kneighbors( - query=source[ - :5, - ] - ) + d, i = hnsw1.kneighbors() hnsw2 = NMSLIB(n_candidates=n_neighbors, metric="cosinesimil") hnsw2.fit(source, target) - i2 = hnsw2.kneighbors( - query=source[ - :5, - ], - return_distance=False, - ) + i2 = hnsw2.kneighbors(return_distance=False) assert_array_equal(i, i2) diff --git a/tests/neighbors/test_nng.py b/tests/neighbors/test_nng.py index bc65ac4..1cba671 100644 --- a/tests/neighbors/test_nng.py +++ b/tests/neighbors/test_nng.py @@ -3,10 +3,10 @@ from numpy.testing import assert_array_equal from kiez.neighbors import NNG -from kiez.neighbors.util import available_ann_algorithms +from kiez.neighbors.util import available_nn_algorithms -APPROXIMATE_ALGORITHMS = available_ann_algorithms() -if NNG not in APPROXIMATE_ALGORITHMS: +NN_ALGORITHMS = available_nn_algorithms() +if NNG not in NN_ALGORITHMS: skip = True else: skip = False @@ -24,8 +24,8 @@ def test_wrong_metric(): @pytest.mark.skipif(skip, reason=skip_reason) -def test_wrong_dir(n_samples=20, n_features=5): - source = rng.rand(n_samples, n_features) +def test_wrong_dir(source_target): + source, _ = source_target with pytest.raises(TypeError) as exc_info: nng = NNG(index_dir=1) nng.fit(source) @@ -33,26 +33,24 @@ def test_wrong_dir(n_samples=20, n_features=5): @pytest.mark.skipif(skip, reason=skip_reason) -def test_right_dir(tmp_path, n_samples=20, n_features=5): - source = 
rng.rand(n_samples, n_features) - target = rng.rand(n_samples, n_features) +def test_right_dir(tmp_path, source_target): + source, target = source_target nng = NNG(index_dir=str(tmp_path)) nng.fit(source, target) assert nng is not None @pytest.mark.skipif(skip, reason=skip_reason) -def test_none_dir(n_samples=20, n_features=5): - source = rng.rand(n_samples, n_features) - target = rng.rand(n_samples, n_features) +def test_none_dir(source_target): + source, target = source_target nng = NNG(index_dir=None) nng.fit(source, target) assert nng is not None @pytest.mark.skipif(skip, reason=skip_reason) -def test_self_query(n_samples=20, n_features=5, n_neighbors=5): - source = rng.rand(n_samples, n_features) +def test_self_query(source_target, n_neighbors=5): + source, _ = source_target nng = NNG(index_dir=None, n_candidates=n_neighbors, epsilon=0.00001) nng.fit(source, source) d, i = nng.kneighbors() @@ -61,42 +59,22 @@ def test_self_query(n_samples=20, n_features=5, n_neighbors=5): @pytest.mark.skipif(skip, reason=skip_reason) -def test_query(n_samples=20, n_features=5, n_neighbors=5): - source = rng.rand(n_samples, n_features) - target = rng.rand(n_samples, n_features) +def test_query(source_target, n_neighbors=5): + source, target = source_target nng = NNG(index_dir=None, n_candidates=n_neighbors, epsilon=0.00001) nng.fit(source, target) - d, i = nng.kneighbors( - query=source[ - :5, - ] - ) - i2 = nng.kneighbors( - query=source[ - :5, - ], - return_distance=False, - ) + d, i = nng.kneighbors() + i2 = nng.kneighbors(return_distance=False) assert_array_equal(i, i2) @pytest.mark.skipif(skip, reason=skip_reason) -def test_sqeuclidean(n_samples=20, n_features=5, n_neighbors=5): - source = rng.rand(n_samples, n_features) - target = rng.rand(n_samples, n_features) +def test_sqeuclidean(source_target, n_neighbors=5): + source, target = source_target nng1 = NNG(index_dir=None, n_candidates=n_neighbors, metric="sqeuclidean") nng1.fit(source, target) - d, i = nng1.kneighbors( - 
query=source[ - :5, - ] - ) + d, i = nng1.kneighbors() nng2 = NNG(index_dir=None, n_candidates=n_neighbors) nng2.fit(source, target) - i2 = nng2.kneighbors( - query=source[ - :5, - ], - return_distance=False, - ) + i2 = nng2.kneighbors(return_distance=False) assert_array_equal(i, i2) diff --git a/tests/neighbors/test_sklearn.py b/tests/neighbors/test_sklearn.py index 8154736..8aec387 100644 --- a/tests/neighbors/test_sklearn.py +++ b/tests/neighbors/test_sklearn.py @@ -6,8 +6,8 @@ rng = np.random.RandomState(2) -def test_self_query(n_samples=20, n_features=5, n_neighbors=5): - source = rng.rand(n_samples, n_features) +def test_self_query(source_target, n_neighbors=5): + source, _ = source_target sklearnnn = SklearnNN() sklearnnn.fit(source, source) d, i = sklearnnn.kneighbors() diff --git a/tests/test_kiez.py b/tests/test_kiez.py index 367e20e..78bafe2 100644 --- a/tests/test_kiez.py +++ b/tests/test_kiez.py @@ -1,174 +1,124 @@ import pathlib from unittest import mock -import numpy as np import pytest -from numpy.testing import assert_array_equal -from sklearn.neighbors import NearestNeighbors from kiez import Kiez -from kiez.hubness_reduction import ( - DisSimLocal, - HubnessReduction, - LocalScaling, - NoHubnessReduction, -) +from kiez.hubness_reduction import HubnessReduction, LocalScaling from kiez.neighbors import NMSLIB, NNAlgorithm, SklearnNN -from kiez.neighbors.util import available_ann_algorithms +from kiez.neighbors.util import available_nn_algorithms + +NN_ALGORITHMS = available_nn_algorithms() + +MP = [("MutualProximity", dict(method=method)) for method in ["normal", "empiric"]] +LS = [("LocalScaling", dict(method=method)) for method in ["standard", "nicdm"]] +DSL = [("DisSimLocal", dict(squared=val)) for val in [True, False]] +HUBNESS_AND_KWARGS = [(None, {}), ("CSLS", {}), *MP, *LS, *DSL] -APPROXIMATE_ALGORITHMS = available_ann_algorithms() HERE = pathlib.Path(__file__).parent.resolve() -rng = np.random.RandomState(2) - - -class 
CustomHubness(HubnessReduction): - """Test class to make sure user created classes work""" - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - def fit(self, *args, **kwargs): - pass # pragma: no cover - - def __repr__(self): - return "NoHubnessReduction" - - def transform( - self, - neigh_dist, - neigh_ind, - query, - assume_sorted=True, - return_distance=True, - *args, - **kwargs, - ): - if return_distance: - return neigh_dist, neigh_ind - else: - return neigh_ind - - -class CustomAlgorithm(NNAlgorithm): - """Test class to make sure user created classes work""" - - valid_metrics = ["minkowski"] - - def __init__( - self, - n_candidates=5, - algorithm="auto", - leaf_size=30, - metric="minkowski", - p=2, - metric_params=None, - n_jobs=None, - ): - super().__init__(n_candidates=n_candidates, metric=metric, n_jobs=n_jobs) - self.algorithm = algorithm - self.leaf_size = leaf_size - self.p = p - self.metric_params = metric_params - - def _fit(self, data, is_source: bool): - nn = NearestNeighbors( - n_neighbors=self.n_candidates, - algorithm=self.algorithm, - leaf_size=self.leaf_size, - metric=self.metric, - p=self.p, - metric_params=self.metric_params, - n_jobs=self.n_jobs, - ) - nn.fit(data) - return nn - - def _kneighbors(self, k, query, index, return_distance, is_self_querying): - if is_self_querying: - return index.kneighbors( - X=None, n_neighbors=k, return_distance=return_distance - ) - return index.kneighbors(X=query, n_neighbors=k, return_distance=return_distance) - - -def test_hubness_resolver(n_samples=20, n_features=5): - source = rng.rand(n_samples, n_features) - target = rng.rand(n_samples, n_features) - res = [] - for algo in [ - SklearnNN(), - SklearnNN, - "SklearnNN", - CustomAlgorithm, - CustomAlgorithm(), - ]: - for hub in [ - NoHubnessReduction(), - NoHubnessReduction, - None, - "NoHubnessReduction", - CustomHubness, - CustomHubness(), - ]: - k_inst = Kiez(algorithm=algo, hubness=hub) - k_inst.fit(source, target) - 
res.append(k_inst.kneighbors(source, k=1)) - for i in range(len(res) - 1): - assert_array_equal(res[i][0], res[i + 1][0]) - assert_array_equal(res[i][1], res[i + 1][1]) - - -def test_wrong_kcandidates(n_samples=20, n_features=5): - source = rng.rand(n_samples, n_features) - target = rng.rand(n_samples, n_features) - k_inst = Kiez() - k_inst.fit(source, target) - nn_ind = k_inst._kcandidates(source, k=1, return_distance=False) - assert nn_ind.shape == (20, 5) -def test_non_default_kneighbors(n_samples=20, n_features=5): - source = rng.rand(n_samples, n_features) - target = rng.rand(n_samples, n_features) - k_inst = Kiez() +def test_no_hub(source_target): + source, target = source_target + n_cand = 10 + k_inst = Kiez(n_candidates=n_cand) k_inst.fit(source, target) - nn_ind = k_inst.kneighbors(source, k=1, return_distance=False) - assert nn_ind.shape == (20, 1) + # check only created target index + assert not hasattr(k_inst.algorithm, "source_index") + k_inst.algorithm = SklearnNN() + assert "f{k_inst}" + assert ( + Kiez( + n_candidates=n_cand, + algorithm="SklearnNN", + algorithm_kwargs=dict(metric="minkowski"), + ).algorithm.n_candidates + == n_cand + ) + + +def assert_different_neighbors(k_inst, n_cand): + dist, neigh = k_inst.kneighbors() + assert neigh.shape[1] == n_cand + assert dist.shape[1] == n_cand + + neigh = k_inst.kneighbors(return_distance=False) + assert neigh.shape[1] == n_cand + + dist, neigh = k_inst.kneighbors(k=1) + assert neigh.shape[1] == 1 + assert dist.shape[1] == 1 + + dist, neigh = k_inst.kneighbors(k=20) + assert neigh.shape[1] == n_cand + assert dist.shape[1] == n_cand + + +@pytest.mark.parametrize("algo", NN_ALGORITHMS) +def test_algo_resolver(source_target, algo, n_cand=5): + source, target = source_target + k_inst = Kiez(algorithm=algo, n_candidates=n_cand) + k_inst.fit(source, target) + assert_different_neighbors(k_inst, n_cand) + + +@pytest.mark.parametrize("hub,hubkwargs", HUBNESS_AND_KWARGS) +def test_hubness_resolver(hub, hubkwargs, 
source_target, n_cand=5): + source, target = source_target + k_inst = Kiez( + algorithm="SklearnNN", + n_candidates=n_cand, + hubness=hub, + hubness_kwargs=hubkwargs, + ) + assert f"{k_inst}" is not None + k_inst.fit(source, target) + assert_different_neighbors(k_inst, n_cand) + k_inst.fit(source, None) + assert_different_neighbors(k_inst, n_cand) + with pytest.raises(ValueError) as exc_info: + k_inst = Kiez( + algorithm="SklearnNN", + n_candidates=1, + hubness=hub, + hubness_kwargs=hubkwargs, + ) + assert "Cannot" in str(exc_info.value) -def test_n_neighbors_wrong(): +def test_n_candidates_wrong(): with pytest.raises(ValueError) as exc_info: - Kiez(n_neighbors=-1) + Kiez(n_candidates=-1) assert "Expected" in str(exc_info.value) -def test_n_neighbors_wrong_type(): +def test_n_candidates_wrong_type(): with pytest.raises(TypeError) as exc_info: - Kiez(n_neighbors="1") + Kiez(n_candidates="1") assert "does not" in str(exc_info.value) def test_dis_sim_local_wrong(): with pytest.raises(ValueError) as exc_info: - Kiez(algorithm=SklearnNN(p=1), hubness=DisSimLocal()) + Kiez(algorithm=SklearnNN(p=1), hubness="DisSimLocal") assert "only supports" in str(exc_info.value) def test_dis_sim_local_wrong_metric(): with pytest.raises(ValueError) as exc_info: - Kiez(algorithm=SklearnNN(metric="cosine"), hubness=DisSimLocal()) + Kiez(algorithm=SklearnNN(metric="cosine"), hubness="DisSimLocal") assert "only supports" in str(exc_info.value) def test_dis_sim_local_squaring(): - if NMSLIB in APPROXIMATE_ALGORITHMS: - k_inst = Kiez(algorithm=NMSLIB(metric="sqeuclidean"), hubness=DisSimLocal()) + if NMSLIB in NN_ALGORITHMS: + k_inst = Kiez(algorithm=NMSLIB(metric="sqeuclidean"), hubness="DisSimLocal") assert k_inst.hubness.squared def test_from_config(): - if NMSLIB in APPROXIMATE_ALGORITHMS: + if NMSLIB in NN_ALGORITHMS: path = HERE.joinpath("example_conf.json") kiez = Kiez.from_path(path) assert kiez.hubness is not None