Fix MP and cleanup
dobraczka committed Jan 11, 2024
1 parent 0e96685 commit 6ccb674
Showing 19 changed files with 275 additions and 92 deletions.
23 changes: 13 additions & 10 deletions kiez/hubness_reduction/base.py
@@ -1,6 +1,6 @@
import warnings
from abc import ABC, abstractmethod
from typing import Optional, Tuple
from typing import Optional, Tuple, TypeVar

import numpy as np

@@ -11,23 +11,26 @@
except ImportError:
torch = None

T = TypeVar("T")


class HubnessReduction(ABC):
"""Base class for hubness reduction."""

def __init__(self, nn_algo: NNAlgorithm, verbose: int = 0, **kwargs):
self.nn_algo = nn_algo
self.verbose = verbose
self._use_torch = False
if nn_algo.n_candidates == 1:
raise ValueError(
"Cannot perform hubness reduction with a single candidate per query!"
)

@abstractmethod
def _fit(self, neigh_dist, neigh_ind, source, target):
def _fit(self, neigh_dist: T, neigh_ind: T, source: T, target: T):
pass # pragma: no cover

def fit(self, source, target=None):
def fit(self, source: T, target: Optional[T] = None):
self.nn_algo.fit(source, target)
if target is None:
target = source
@@ -37,6 +40,8 @@ def fit(self, source, target=None):
s_to_t=False,
return_distance=True,
)
if torch and isinstance(neigh_dist_t_to_s, torch.Tensor):
self._use_torch = True
self._fit(
neigh_dist_t_to_s,
neigh_ind_t_to_s,
@@ -45,7 +50,7 @@ def fit(self, source, target=None):
)

@abstractmethod
def transform(self, neigh_dist, neigh_ind, query) -> Tuple[np.ndarray, np.ndarray]:
def transform(self, neigh_dist, neigh_ind, query) -> Tuple[T, T]:
pass # pragma: no cover

def _set_k_if_needed(self, k: Optional[int] = None) -> int:
@@ -65,9 +70,7 @@ def _set_k_if_needed(self, k: Optional[int] = None) -> int:
return k

@staticmethod
def _sort(
hubness_reduced_query_dist, query_ind, n_neighbors: int
) -> tuple[np.ndarray, np.ndarray]:
def _sort(hubness_reduced_query_dist, query_ind, n_neighbors: int) -> Tuple[T, T]:
if torch and isinstance(hubness_reduced_query_dist, torch.Tensor):
mask = torch.argsort(hubness_reduced_query_dist)[:, :n_neighbors]
hubness_reduced_query_dist = torch.take_along_dim(
@@ -83,7 +86,7 @@ def _sort(
query_ind = np.take_along_axis(query_ind, mask, axis=1)
return hubness_reduced_query_dist, query_ind

def kneighbors(self, k: Optional[int] = None) -> tuple[np.ndarray, np.ndarray]:
def kneighbors(self, k: Optional[int] = None) -> Tuple[T, T]:
n_neighbors = self._set_k_if_needed(k)
# First obtain candidate neighbors
query_dist, query_ind = self.nn_algo.kneighbors(
@@ -111,9 +114,9 @@ def _fit(self, neigh_dist, neigh_ind, source, target):
def fit(self, source, target=None):
self.nn_algo.fit(source, target, only_fit_target=True)

def transform(self, neigh_dist, neigh_ind, query) -> Tuple[np.ndarray, np.ndarray]:
def transform(self, neigh_dist, neigh_ind, query) -> Tuple[T, T]:
return neigh_dist, neigh_ind

def kneighbors(self, k: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]:
def kneighbors(self, k: Optional[int] = None) -> Tuple[T, T]:
n_neighbors = self._set_k_if_needed(k)
return self.nn_algo.kneighbors(query=None, k=n_neighbors, return_distance=True)
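Note: the base class now remembers whether the fitted data are torch tensors (_use_torch), and _sort picks the matching backend: torch.argsort/torch.take_along_dim for tensors, np.argsort/np.take_along_axis otherwise. A minimal standalone sketch of that dispatch pattern, with illustrative names rather than the library's exact code:

import numpy as np

try:
    import torch
except ImportError:
    torch = None


def sort_neighbors(dist, ind, n_neighbors):
    # keep the n_neighbors smallest distances per row, for torch tensors or numpy arrays
    if torch is not None and isinstance(dist, torch.Tensor):
        mask = torch.argsort(dist)[:, :n_neighbors]
        return (
            torch.take_along_dim(dist, mask, dim=1),
            torch.take_along_dim(ind, mask, dim=1),
        )
    mask = np.argsort(dist)[:, :n_neighbors]
    return (
        np.take_along_axis(dist, mask, axis=1),
        np.take_along_axis(ind, mask, axis=1),
    )


dist = np.array([[0.9, 0.1, 0.5], [0.3, 0.8, 0.2]])
ind = np.array([[7, 3, 5], [1, 4, 2]])
print(sort_neighbors(dist, ind, 2))  # ([[0.1, 0.5], [0.2, 0.3]], [[3, 5], [2, 1]])

Keeping the branch in one place avoids repeating the torch-versus-numpy check in every subclass.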
7 changes: 5 additions & 2 deletions kiez/hubness_reduction/csls.py
@@ -1,8 +1,11 @@
import numpy as np
from typing import Tuple, TypeVar

from sklearn.utils.validation import check_is_fitted

from .base import HubnessReduction

T = TypeVar("T")


class CSLS(HubnessReduction):
"""Hubness reduction with Cross-domain similarity local scaling.
@@ -56,7 +59,7 @@ def transform(
neigh_dist,
neigh_ind,
query,
) -> tuple[np.ndarray, np.ndarray]:
) -> Tuple[T, T]:
"""Transform distance between test and training data with CSLS.
Parameters
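Note: apart from the annotations, CSLS itself is unchanged here. For context, the standard CSLS formulation (Conneau et al.) discounts hub points by subtracting each point's average similarity to its k nearest cross-domain neighbors; a rough numpy sketch of that textbook formula, not necessarily the exact distance-based transform kiez applies:

import numpy as np


def csls_similarity(sim, k=10):
    # sim: (n_source, n_target) cosine-similarity matrix
    # r_source: mean similarity of each source point to its k most similar targets
    r_source = np.sort(sim, axis=1)[:, -k:].mean(axis=1, keepdims=True)
    # r_target: mean similarity of each target point to its k most similar sources
    r_target = np.sort(sim, axis=0)[-k:, :].mean(axis=0, keepdims=True)
    # standard CSLS: reward mutually close pairs, penalize hubs
    return 2 * sim - r_source - r_target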
10 changes: 7 additions & 3 deletions kiez/hubness_reduction/dis_sim.py
@@ -1,13 +1,17 @@
# SPDX-License-Identifier: BSD-3-Clause
# adapted from skhubness: https://github.com/VarIr/scikit-hubness/

from typing import Tuple, TypeVar

import numpy as np
from sklearn.metrics import euclidean_distances
from sklearn.utils.extmath import row_norms
from sklearn.utils.validation import check_is_fitted

from .base import HubnessReduction

T = TypeVar("T")

_DESIRED_P_VALUE = 2
_MINIMUM_DIST = 0.0

@@ -90,7 +94,7 @@ def _fit(
# Calculate local neighborhood centroids among the training points
knn = neigh_ind
centroids = source[knn].mean(axis=1)
if torch and isinstance(centroids, torch.Tensor):
if self._use_torch:
# see https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/utils/extmath.py#L87C21-L87C48
X = target - centroids
dist_to_cent = torch.einsum("ij,ij->i", X, X)
@@ -108,7 +112,7 @@ def transform(
neigh_dist,
neigh_ind,
query,
) -> tuple[np.ndarray, np.ndarray]:
) -> Tuple[T, T]:
"""Transform distance between test and training data with DisSimLocal.
Parameters
@@ -137,7 +141,7 @@ def transform(
["target_", "target_centroids_", "target_dist_to_centroids_"],
)
# Calculate local neighborhood centroids for source objects among target objects
if torch and isinstance(neigh_ind, torch.Tensor):
if self._use_torch:
# pairwise squared euclidean distance between each query vector and knn
# unsqueeze to enable batching
hub_reduced_dist = (
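Note: in the torch branch, sklearn's row_norms(X, squared=True) is replaced by torch.einsum("ij,ij->i", X, X), which likewise yields the squared Euclidean norm of every row. A small sanity check of that identity (illustrative only, assuming torch and scikit-learn are installed):

import numpy as np
import torch
from sklearn.utils.extmath import row_norms

X_np = np.random.rand(4, 3)
X_t = torch.from_numpy(X_np)

squared_np = row_norms(X_np, squared=True)      # numpy path via scikit-learn
squared_t = torch.einsum("ij,ij->i", X_t, X_t)  # torch path used in the diff

# both compute sum_j X[i, j]**2 for every row i
assert np.allclose(squared_np, squared_t.numpy())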
26 changes: 17 additions & 9 deletions kiez/hubness_reduction/local_scaling.py
@@ -1,11 +1,15 @@
# SPDX-License-Identifier: BSD-3-Clause
# adapted from skhubness: https://github.com/VarIr/scikit-hubness/

from typing import Tuple, TypeVar

import numpy as np
from sklearn.utils.validation import check_is_fitted

from .base import HubnessReduction

T = TypeVar("T")

try:
import torch
except ImportError:
@@ -79,12 +83,22 @@ def _fit(
self.r_ind_t_to_s_ = neigh_ind
return self

def _exp(self, inner_exp):
if self._use_torch:
return torch.exp(inner_exp)
return np.exp(inner_exp)

def _sqrt(self, value):
if self._use_torch:
return torch.sqrt(value)
return np.sqrt(value)

def transform(
self,
neigh_dist,
neigh_ind,
query=None,
) -> tuple[np.ndarray, np.ndarray]:
) -> Tuple[T, T]:
"""Transform distance between test and training data with Mutual Proximity.
Parameters
@@ -122,20 +136,14 @@ def transform(
r_t_to_s = self.r_dist_t_to_s_[:, -1]
r_s_to_t = r_dist_s_to_t[:, -1].reshape(-1, 1)
inner_exp = -1 * neigh_dist**2 / (r_s_to_t * r_t_to_s[neigh_ind])
if torch and isinstance(inner_exp, torch.Tensor):
exp = torch.exp(inner_exp)
else:
exp = np.exp(inner_exp)
exp = self._exp(inner_exp)
hub_reduced_dist = 1.0 - exp
# ...or use non-iterative contextual dissimilarity measure
elif self.method == "nicdm":
r_t_to_s = self.r_dist_t_to_s_.mean(axis=1)
r_s_to_t = r_dist_s_to_t.mean(axis=1).reshape(-1, 1)
inner_sqrt = r_s_to_t * r_t_to_s[neigh_ind]
if torch and isinstance(inner_sqrt, torch.Tensor):
sqrt = torch.sqrt(inner_sqrt)
else:
sqrt = np.sqrt(inner_sqrt)
sqrt = self._sqrt(inner_sqrt)
hub_reduced_dist = neigh_dist / sqrt

# Return the hubness reduced distances
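Note: the repeated torch-versus-numpy branches are folded into _exp and _sqrt helpers keyed on _use_torch, so the two scaling formulas stay readable. A numpy-only sketch of those formulas as they appear in the diff, with hypothetical argument names:

import numpy as np


def local_scaling(neigh_dist, r_source, r_target, method="ls"):
    # neigh_dist: (n_query, k) raw distances to candidate neighbors
    # r_source:   (n_query, 1) local scale of each query point
    # r_target:   (n_query, k) local scale of each candidate neighbor
    if method == "ls":
        # local scaling: 1 - exp(-d^2 / (r_source * r_target))
        return 1.0 - np.exp(-(neigh_dist**2) / (r_source * r_target))
    if method == "nicdm":
        # non-iterative contextual dissimilarity measure: d / sqrt(r_source * r_target)
        return neigh_dist / np.sqrt(r_source * r_target)
    raise ValueError(f"unknown method: {method!r}")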
53 changes: 32 additions & 21 deletions kiez/hubness_reduction/mutual_proximity.py
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: BSD-3-Clause
# adapted from skhubness: https://github.com/VarIr/scikit-hubness/

import warnings
from typing import Tuple, TypeVar

import numpy as np
from scipy import stats
@@ -10,6 +10,8 @@

from .base import HubnessReduction

T = TypeVar("T")

try:
import torch
from torch.distributions.normal import Normal
@@ -90,25 +92,38 @@ def _fit(
self.n_train = neigh_dist.shape[0]

if self.method == "empiric":
if torch and isinstance(neigh_dist, torch.Tensor):
warnings.warn(
"No Torch implementation for `method=empiric`. Will cast to and return numpy arrays!",
stacklevel=2,
)
neigh_dist = neigh_dist.cpu().numpy()
neigh_ind = neigh_ind.cpu().numpy()
self.neigh_dist_t_to_s_ = neigh_dist
self.neigh_ind_t_to_s_ = neigh_ind
elif self.method == "normal":
if torch and isinstance(neigh_dist, torch.Tensor):
if self._use_torch:
self.mu_t_to_s_ = torch.nanmean(neigh_dist, axis=1)
self.sd_t_to_s_ = torch.std(neigh_dist, axis=1)
else:
self.mu_t_to_s_ = np.nanmean(neigh_dist, axis=1)
self.sd_t_to_s_ = np.nanstd(neigh_dist, axis=1)
return self

def transform(self, neigh_dist, neigh_ind, query):
def _zeros(self, value):
if self._use_torch:
return torch.zeros(value)
return np.zeros(value)

def _empty_like(self, value):
if self._use_torch:
return torch.empty_like(value)
return np.empty_like(value)

def _sum(self, value, axis):
if self._use_torch:
return torch.sum(value, axis=axis)
return np.sum(value, axis=axis)

def _numel(self, value):
if self._use_torch:
return value.numel()
return value.size

def transform(self, neigh_dist, neigh_ind, query) -> Tuple[T, T]:
"""Transform distance between test and training data with Mutual Proximity.
Parameters
@@ -151,7 +166,7 @@ def transform(self, neigh_dist, neigh_ind, query):
if self.method == "normal":
mu_t_to_s = self.mu_t_to_s_
sd_t_to_s_ = self.sd_t_to_s_
if torch and isinstance(neigh_dist, torch.Tensor):
if self._use_torch:
mu = torch.nanmean(neigh_dist, axis=1).reshape(-1, 1)
sd = torch.std(neigh_dist, axis=1).reshape(-1, 1)
p1 = 1 - Normal(mu, sd).cdf(neigh_dist)
@@ -168,12 +183,8 @@ def transform(self, neigh_dist, neigh_ind, query):
hub_reduced_dist = 1 - p1 * p2
# Calculate MP empiric (slow)
elif self.method == "empiric":
if torch and isinstance(neigh_dist, torch.Tensor):
# already fired warning during fit
neigh_dist = neigh_dist.cpu().numpy()
neigh_ind = neigh_ind.cpu().numpy()
query = query.cpu().numpy()
hub_reduced_dist = np.empty_like(neigh_dist)
hub_reduced_dist = self._empty_like(neigh_dist)
n_test, n_indexed = neigh_dist.shape
# Show progress in hubness reduction loop
disable_tqdm = not self.verbose
@@ -185,10 +196,10 @@ def transform(self, neigh_dist, neigh_ind, query):

max_ind = max(self.neigh_ind_t_to_s_.max(), neigh_ind.max())
for i in range_n_test:
d_i = neigh_dist[i, :][np.newaxis, :] # broadcasted afterwards
d_j = np.zeros((d_i.size, n_indexed))
d_i = neigh_dist[i, :][None, :] # broadcasted afterwards
d_j = self._zeros((self._numel(d_i), n_indexed))
for j in range(n_indexed):
tmp = np.zeros(max_ind + 1) + (
tmp = self._zeros(max_ind + 1) + (
self.neigh_dist_t_to_s_[neigh_ind[i, j], -1] + 1e-6
)
tmp[
@@ -197,7 +208,7 @@ def transform(self, neigh_dist, neigh_ind, query):
d_j[j, :] = tmp[neigh_ind[i]]
d = d_i.T
hub_reduced_dist[i, :] = 1.0 - (
np.sum((d_i > d) & (d_j > d), axis=1) / n_indexed
self._sum((d_i > d) & (d_j > d), axis=1) / n_indexed
)

# Return the hubness reduced distances
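Note: the new _zeros, _empty_like, _sum and _numel helpers let the "empiric" method run directly on torch tensors instead of casting to numpy, while the "normal" method keeps its Gaussian model. A numpy/scipy-only sketch of the normal-model step, mirroring the formula in the diff (hypothetical function and argument names):

import numpy as np
from scipy import stats


def mutual_proximity_normal(neigh_dist, neigh_ind, mu_t_to_s, sd_t_to_s):
    # neigh_dist, neigh_ind: (n_query, k) candidate distances and indices
    # mu_t_to_s, sd_t_to_s: per-target mean/std of target-to-source distances (from fit)
    mu = np.nanmean(neigh_dist, axis=1).reshape(-1, 1)
    sd = np.nanstd(neigh_dist, axis=1).reshape(-1, 1)
    # probability that a random distance on the query side exceeds d
    p1 = 1.0 - stats.norm.cdf(neigh_dist, mu, sd)
    # same probability seen from each neighbor's side, using statistics learned in fit
    p2 = 1.0 - stats.norm.cdf(neigh_dist, mu_t_to_s[neigh_ind], sd_t_to_s[neigh_ind])
    # mutual proximity distance: small only if both sides agree the pair is close
    return 1.0 - p1 * p2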
30 changes: 23 additions & 7 deletions kiez/kiez.py
@@ -1,8 +1,6 @@
from __future__ import annotations

import json
from pathlib import Path
from typing import Any, Optional, Union
from typing import Any, Literal, Optional, Tuple, TypeVar, Union, overload

import numpy as np
from class_resolver import HintOrType
@@ -11,6 +9,8 @@
from kiez.hubness_reduction.base import HubnessReduction
from kiez.neighbors import NNAlgorithm, nn_algorithm_resolver

T = TypeVar("T")


class Kiez:
"""Performs hubness reduced nearest neighbor search for entity alignment.
@@ -124,12 +124,12 @@ def __repr__(self):
)

@classmethod
def from_path(cls, path: Union[str, Path]) -> Kiez:
def from_path(cls, path: Union[str, Path]) -> "Kiez":
"""Load a Kiez instance from configuration in a JSON file, based on its path."""
with open(path) as file:
return cls(**json.load(file))

def fit(self, source, target=None) -> Kiez:
def fit(self, source: T, target: Optional[T] = None) -> "Kiez":
"""Fits the algorithm and hubness reduction method.
Parameters
@@ -147,11 +147,27 @@ def fit(self, source, target=None) -> Kiez:
self.hubness.fit(source, target)
return self

@overload
def kneighbors(
self,
k: Optional[int] = None,
return_distance: Literal[True] = True,
) -> Tuple[T, T]:
...

@overload
def kneighbors(
self,
k: Optional[int] = None,
return_distance: Literal[False] = False,
) -> Any:
...

def kneighbors(
self,
k: Optional[int] = None,
return_distance=True,
) -> Union[np.ndarray, tuple[np.ndarray, np.ndarray]]:
return_distance: bool = True,
) -> Union[T, Tuple[T, T]]:
"""Retrieve the k-nearest neighbors using the supplied nearest neighbor algorithm and hubness reduction method.
Parameters
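Note: the @overload declarations tie the return type of kneighbors to the literal value of return_distance, so a type checker sees a (distances, indices) pair by default and a single result when return_distance=False. A condensed sketch of that pattern outside kiez (illustrative class and shapes):

from typing import Literal, Optional, Tuple, overload

import numpy as np


class Index:
    @overload
    def kneighbors(
        self, k: Optional[int] = None, return_distance: Literal[True] = True
    ) -> Tuple[np.ndarray, np.ndarray]:
        ...

    @overload
    def kneighbors(
        self, k: Optional[int] = None, return_distance: Literal[False] = False
    ) -> np.ndarray:
        ...

    def kneighbors(self, k=None, return_distance=True):
        # toy implementation: random distances, sorted indices
        dist = np.random.rand(2, k or 5)
        ind = np.argsort(dist, axis=1)
        return (dist, ind) if return_distance else ind


dist, ind = Index().kneighbors(3)                        # checker infers a tuple
ind_only = Index().kneighbors(3, return_distance=False)  # checker infers one array

At runtime only the final definition exists; the overloads are erased and serve the type checker.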
(diffs for the remaining 13 changed files not shown)
