Enable torch (#35)

* Enable torch * Move to device * Change skip reason * Set use_gpu flag to true * Add torch_utils import * Add float conversion * Fix float conversion in test * Remove for loop in CSLS * Remove more for loops * Enable torch sorting and adapt test * Use tolerance * Adapt tolerance * Adapt local scaling for torch * Forgot variable rename * Adapt Mutual proximity * Use indices not dist * Add print * Adapted fit in MP * Use std for torch * Remove ddof * Adapt tolerance * Add torch support for dissimlocal * Change metric in test * Change metric everywhere * Use torch for euclidean if needed for faiss * Use torch row_norm * Remove unnecessary variable switch * Set knn * Add prints * Fixed DisSimLocal Torch bug * Refactor code deduplication DisSimLocal * Fix MP and cleanup * Cleanup incompatible type hints for py3.8 and remove autofaiss mentions * Added possibility to show available NN and HR * Adapted Changelog
dobraczka · Jan 11, 2024 · 23f2667 · 23f2667
1 parent e97e28e
commit 23f2667
Show file tree

Hide file tree

Showing 25 changed files with 697 additions and 171 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 
+## [0.5.0] - 2024-01-11
+
+### Added
+
+- Support for torch, when using Faiss
+- More metrics for Faiss
+
+### Changed
+
+- Simplified the `kneighbors` API: queries can no longer be supplied, since they need to come from the source anyway
+
+### Removed
+
+- Autofaiss support was removed
+
+### Fixed
+
+- Several efficiency problems when not using Hubness Reduction were addressed
+
 ## [0.4.4] - 2023-10-18
 
 ### Changed

diff --git a/README.md b/README.md
@@ -31,7 +31,6 @@ If you have a GPU you can make kiez faster by installing [faiss](https://github.
 conda env create -n kiez-faiss python=3.10
 conda activate kiez-faiss
 conda install -c pytorch -c nvidia faiss-gpu=1.7.4 mkl=2021 blas=1.0=mkl
-pip install autofaiss
 pip install kiez
 ```
 
@@ -66,16 +65,36 @@ rng = np.random.RandomState(0)
 source = rng.rand(100,50)
 target = rng.rand(100,50)
 # prepare algorithm and hubness reduction
-from kiez.neighbors import Faiss
-faiss = Faiss(n_candidates=10)
-from kiez.hubness_reduction import CSLS
-hr = CSLS()
+algo_kwargs = {"n_candidates": 10}
+k_inst = Kiez(n_neighbors=5, algorithm="Faiss", algorithm_kwargs=algo_kwargs, hubness="CSLS")
 # fit and get neighbors
-k_inst = Kiez(n_neighbors=5, algorithm=faiss, hubness=hr)
 k_inst.fit(source, target)
 nn_dist, nn_ind = k_inst.kneighbors()
 ```
 
+## Torch Support
+Beginning with version 0.5.0, torch tensors can be used when `Faiss` is the NN library:
+
+```python
+
+    from kiez import Kiez
+    import torch
+    source = torch.randn((100,10))
+    target = torch.randn((200,10))
+    k_inst = Kiez(algorithm="Faiss", hubness="CSLS")
+    k_inst.fit(source, target)
+    nn_dist, nn_ind = k_inst.kneighbors()
+```
+
+You can also utilize tensors on the GPU:
+
+```python
+
+    k_inst = Kiez(algorithm="Faiss", algorithm_kwargs={"use_gpu":True}, hubness="CSLS")
+    k_inst.fit(source.cuda(), target.cuda())
+    nn_dist, nn_ind = k_inst.kneighbors()
+```
+
 ## Documentation
 You can find more documentation on [readthedocs](https://kiez.readthedocs.io)
 

diff --git a/docs/index.rst b/docs/index.rst
@@ -57,7 +57,6 @@ If you have a GPU you can make kiez faster by installing `faiss <https://github.
     conda env create -n kiez-faiss python=3.10
     conda activate kiez-faiss
     conda install -c pytorch -c nvidia faiss-gpu=1.7.4 mkl=2021 blas=1.0=mkl
-    pip install autofaiss
     pip install kiez
 
 For more information see their `installation instructions <https://github.com/facebookresearch/faiss/blob/main/INSTALL.md>`_.

diff --git a/docs/source/installation.rst b/docs/source/installation.rst
@@ -17,7 +17,6 @@ If you have a GPU you can make kiez faster by installing `faiss <https://github.
     conda env create -n kiez-faiss python=3.10
     conda activate kiez-faiss
     conda install -c pytorch -c nvidia faiss-gpu=1.7.4 mkl=2021 blas=1.0=mkl
-    pip install autofaiss
     pip install kiez
 
 For more information see their `installation instructions <https://github.com/facebookresearch/faiss/blob/main/INSTALL.md>`_.

diff --git a/kiez/hubness_reduction/base.py b/kiez/hubness_reduction/base.py
@@ -1,28 +1,36 @@
 import warnings
 from abc import ABC, abstractmethod
-from typing import Optional, Tuple
+from typing import Optional, Tuple, TypeVar
 
 import numpy as np
 
 from ..neighbors import NNAlgorithm
 
+try:
+    import torch
+except ImportError:
+    torch = None
+
+T = TypeVar("T")
+
 
 class HubnessReduction(ABC):
     """Base class for hubness reduction."""
 
     def __init__(self, nn_algo: NNAlgorithm, verbose: int = 0, **kwargs):
         self.nn_algo = nn_algo
         self.verbose = verbose
+        self._use_torch = False
         if nn_algo.n_candidates == 1:
             raise ValueError(
                 "Cannot perform hubness reduction with a single candidate per query!"
             )
 
     @abstractmethod
-    def _fit(self, neigh_dist, neigh_ind, source, target):
+    def _fit(self, neigh_dist: T, neigh_ind: T, source: T, target: T):
         pass  # pragma: no cover
 
-    def fit(self, source, target=None):
+    def fit(self, source: T, target: Optional[T] = None):
         self.nn_algo.fit(source, target)
         if target is None:
             target = source
@@ -32,6 +40,8 @@ def fit(self, source, target=None):
             s_to_t=False,
             return_distance=True,
         )
+        if torch and isinstance(neigh_dist_t_to_s, torch.Tensor):
+            self._use_torch = True
         self._fit(
             neigh_dist_t_to_s,
             neigh_ind_t_to_s,
@@ -40,7 +50,7 @@ def fit(self, source, target=None):
         )
 
     @abstractmethod
-    def transform(self, neigh_dist, neigh_ind, query) -> Tuple[np.ndarray, np.ndarray]:
+    def transform(self, neigh_dist, neigh_ind, query) -> Tuple[T, T]:
         pass  # pragma: no cover
 
     def _set_k_if_needed(self, k: Optional[int] = None) -> int:
@@ -59,7 +69,24 @@ def _set_k_if_needed(self, k: Optional[int] = None) -> int:
             return self.nn_algo.n_candidates
         return k
 
-    def kneighbors(self, k: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]:
+    @staticmethod
+    def _sort(hubness_reduced_query_dist, query_ind, n_neighbors: int) -> Tuple[T, T]:
+        if torch and isinstance(hubness_reduced_query_dist, torch.Tensor):
+            mask = torch.argsort(hubness_reduced_query_dist)[:, :n_neighbors]
+            hubness_reduced_query_dist = torch.take_along_dim(
+                hubness_reduced_query_dist, mask, dim=1
+            )
+            query_ind = torch.take_along_dim(query_ind, mask, dim=1)
+        else:
+            kth = np.arange(n_neighbors)
+            mask = np.argpartition(hubness_reduced_query_dist, kth=kth)[:, :n_neighbors]
+            hubness_reduced_query_dist = np.take_along_axis(
+                hubness_reduced_query_dist, mask, axis=1
+            )
+            query_ind = np.take_along_axis(query_ind, mask, axis=1)
+        return hubness_reduced_query_dist, query_ind
+
+    def kneighbors(self, k: Optional[int] = None) -> Tuple[T, T]:
         n_neighbors = self._set_k_if_needed(k)
         # First obtain candidate neighbors
         query_dist, query_ind = self.nn_algo.kneighbors(
@@ -73,13 +100,9 @@ def kneighbors(self, k: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]:
             self.nn_algo.source_,
         )
         # Third, sort hubness reduced candidate neighbors to get the final k neighbors
-        kth = np.arange(n_neighbors)
-        mask = np.argpartition(hubness_reduced_query_dist, kth=kth)[:, :n_neighbors]
-        hubness_reduced_query_dist = np.take_along_axis(
-            hubness_reduced_query_dist, mask, axis=1
+        return HubnessReduction._sort(
+            hubness_reduced_query_dist, query_ind, n_neighbors
         )
-        query_ind = np.take_along_axis(query_ind, mask, axis=1)
-        return hubness_reduced_query_dist, query_ind
 
 
 class NoHubnessReduction(HubnessReduction):
@@ -91,9 +114,9 @@ def _fit(self, neigh_dist, neigh_ind, source, target):
     def fit(self, source, target=None):
         self.nn_algo.fit(source, target, only_fit_target=True)
 
-    def transform(self, neigh_dist, neigh_ind, query) -> Tuple[np.ndarray, np.ndarray]:
+    def transform(self, neigh_dist, neigh_ind, query) -> Tuple[T, T]:
         return neigh_dist, neigh_ind
 
-    def kneighbors(self, k: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]:
+    def kneighbors(self, k: Optional[int] = None) -> Tuple[T, T]:
         n_neighbors = self._set_k_if_needed(k)
         return self.nn_algo.kneighbors(query=None, k=n_neighbors, return_distance=True)
diff --git a/kiez/hubness_reduction/csls.py b/kiez/hubness_reduction/csls.py
@@ -1,11 +1,11 @@
-from __future__ import annotations
+from typing import Tuple, TypeVar
 
-import numpy as np
 from sklearn.utils.validation import check_is_fitted
-from tqdm.auto import tqdm
 
 from .base import HubnessReduction
 
+T = TypeVar("T")
+
 
 class CSLS(HubnessReduction):
     """Hubness reduction with Cross-domain similarity local scaling.
@@ -30,7 +30,7 @@ def _fit(
         neigh_ind,
         source=None,
         target=None,
-    ) -> CSLS:
+    ) -> "CSLS":
         """Fit the model using target, neigh_dist, and neigh_ind as training data.
 
         Parameters
@@ -59,7 +59,7 @@ def transform(
         neigh_dist,
         neigh_ind,
         query,
-    ) -> tuple[np.ndarray, np.ndarray]:
+    ) -> Tuple[T, T]:
         """Transform distance between test and training data with CSLS.
 
         Parameters
@@ -84,27 +84,13 @@ def transform(
         """
         check_is_fitted(self, "r_dist_train_")
 
-        n_test, n_indexed = neigh_dist.shape
-
         # Find average distances to the k nearest neighbors
         r_dist_test = neigh_dist
 
-        hub_reduced_dist = np.empty_like(neigh_dist)
-
-        # Optionally show progress of local scaling loop
-        disable_tqdm = not self.verbose
-        range_n_test = tqdm(
-            range(n_test),
-            desc="CSLS",
-            disable=disable_tqdm,
-        )
-
         r_train = self.r_dist_train_.mean(axis=1)
-        r_test = r_dist_test.mean(axis=1)
-        for i in range_n_test:
-            hub_reduced_dist[i, :] = (
-                2 * neigh_dist[i] - r_test[i] - r_train[neigh_ind[i]]
-            )
+        r_test = r_dist_test.mean(axis=1).reshape(-1, 1)
+
+        hub_reduced_dist = 2 * neigh_dist - r_test - r_train[neigh_ind]
         # Return the hubness reduced distances
         # These must be sorted downstream
         return hub_reduced_dist, neigh_ind