Fix MP and cleanup
dobraczka committed Jan 11, 2024
1 parent 0e96685 commit 6ccb674
Showing 19 changed files with 275 additions and 92 deletions.
23 changes: 13 additions & 10 deletions kiez/hubness_reduction/base.py
@@ -1,6 +1,6 @@
import warnings
from abc import ABC, abstractmethod
from typing import Optional, Tuple
from typing import Optional, Tuple, TypeVar

import numpy as np

@@ -11,23 +11,26 @@
except ImportError:
torch = None

T = TypeVar("T")


class HubnessReduction(ABC):
"""Base class for hubness reduction."""

def __init__(self, nn_algo: NNAlgorithm, verbose: int = 0, **kwargs):
self.nn_algo = nn_algo
self.verbose = verbose
self._use_torch = False
if nn_algo.n_candidates == 1:
raise ValueError(
"Cannot perform hubness reduction with a single candidate per query!"
)

@abstractmethod
def _fit(self, neigh_dist, neigh_ind, source, target):
def _fit(self, neigh_dist: T, neigh_ind: T, source: T, target: T):
pass # pragma: no cover

def fit(self, source, target=None):
def fit(self, source: T, target: Optional[T] = None):
self.nn_algo.fit(source, target)
if target is None:
target = source
@@ -37,6 +40,8 @@ def fit(self, source, target=None):
s_to_t=False,
return_distance=True,
)
if torch and isinstance(neigh_dist_t_to_s, torch.Tensor):
self._use_torch = True
self._fit(
neigh_dist_t_to_s,
neigh_ind_t_to_s,
@@ -45,7 +50,7 @@ def fit(self, source, target=None):
)

@abstractmethod
def transform(self, neigh_dist, neigh_ind, query) -> Tuple[np.ndarray, np.ndarray]:
def transform(self, neigh_dist, neigh_ind, query) -> Tuple[T, T]:
pass # pragma: no cover

def _set_k_if_needed(self, k: Optional[int] = None) -> int:
@@ -65,9 +70,7 @@ def _set_k_if_needed(self, k: Optional[int] = None) -> int:
return k

@staticmethod
def _sort(
hubness_reduced_query_dist, query_ind, n_neighbors: int
) -> tuple[np.ndarray, np.ndarray]:
def _sort(hubness_reduced_query_dist, query_ind, n_neighbors: int) -> Tuple[T, T]:
if torch and isinstance(hubness_reduced_query_dist, torch.Tensor):
mask = torch.argsort(hubness_reduced_query_dist)[:, :n_neighbors]
hubness_reduced_query_dist = torch.take_along_dim(
@@ -83,7 +86,7 @@ def _sort(
query_ind = np.take_along_axis(query_ind, mask, axis=1)
return hubness_reduced_query_dist, query_ind

def kneighbors(self, k: Optional[int] = None) -> tuple[np.ndarray, np.ndarray]:
def kneighbors(self, k: Optional[int] = None) -> Tuple[T, T]:
n_neighbors = self._set_k_if_needed(k)
# First obtain candidate neighbors
query_dist, query_ind = self.nn_algo.kneighbors(
@@ -111,9 +114,9 @@ def _fit(self, neigh_dist, neigh_ind, source, target):
def fit(self, source, target=None):
self.nn_algo.fit(source, target, only_fit_target=True)

def transform(self, neigh_dist, neigh_ind, query) -> Tuple[np.ndarray, np.ndarray]:
def transform(self, neigh_dist, neigh_ind, query) -> Tuple[T, T]:
return neigh_dist, neigh_ind

def kneighbors(self, k: Optional[int] = None) -> Tuple[np.ndarray, np.ndarray]:
def kneighbors(self, k: Optional[int] = None) -> Tuple[T, T]:
n_neighbors = self._set_k_if_needed(k)
return self.nn_algo.kneighbors(query=None, k=n_neighbors, return_distance=True)
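Note: the base class now remembers whether the fitted data are torch tensors (_use_torch), and _sort picks the matching backend: torch.argsort/torch.take_along_dim for tensors, np.argsort/np.take_along_axis otherwise. A minimal standalone sketch of that dispatch pattern, with illustrative names rather than the library's exact code:

import numpy as np

try:
    import torch
except ImportError:
    torch = None


def sort_neighbors(dist, ind, n_neighbors):
    # keep the n_neighbors smallest distances per row, for torch tensors or numpy arrays
    if torch is not None and isinstance(dist, torch.Tensor):
        mask = torch.argsort(dist)[:, :n_neighbors]
        return (
            torch.take_along_dim(dist, mask, dim=1),
            torch.take_along_dim(ind, mask, dim=1),
        )
    mask = np.argsort(dist)[:, :n_neighbors]
    return (
        np.take_along_axis(dist, mask, axis=1),
        np.take_along_axis(ind, mask, axis=1),
    )


dist = np.array([[0.9, 0.1, 0.5], [0.3, 0.8, 0.2]])
ind = np.array([[7, 3, 5], [1, 4, 2]])
print(sort_neighbors(dist, ind, 2))  # ([[0.1, 0.5], [0.2, 0.3]], [[3, 5], [2, 1]])

Keeping the branch in one place avoids repeating the torch-versus-numpy check in every subclass.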
7 changes: 5 additions & 2 deletions kiez/hubness_reduction/csls.py
@@ -1,8 +1,11 @@
import numpy as np
from typing import Tuple, TypeVar

from sklearn.utils.validation import check_is_fitted

from .base import HubnessReduction

T = TypeVar("T")


class CSLS(HubnessReduction):
"""Hubness reduction with Cross-domain similarity local scaling.
@@ -56,7 +59,7 @@ def transform(
neigh_dist,
neigh_ind,
query,
) -> tuple[np.ndarray, np.ndarray]:
) -> Tuple[T, T]:
"""Transform distance between test and training data with CSLS.
Parameters
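Note: apart from the annotations, CSLS itself is unchanged here. For context, the standard CSLS formulation (Conneau et al.) discounts hub points by subtracting each point's average similarity to its k nearest cross-domain neighbors; a rough numpy sketch of that textbook formula, not necessarily the exact distance-based transform kiez applies:

import numpy as np


def csls_similarity(sim, k=10):
    # sim: (n_source, n_target) cosine-similarity matrix
    # r_source: mean similarity of each source point to its k most similar targets
    r_source = np.sort(sim, axis=1)[:, -k:].mean(axis=1, keepdims=True)
    # r_target: mean similarity of each target point to its k most similar sources
    r_target = np.sort(sim, axis=0)[-k:, :].mean(axis=0, keepdims=True)
    # standard CSLS: reward mutually close pairs, penalize hubs
    return 2 * sim - r_source - r_target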
10 changes: 7 additions & 3 deletions kiez/hubness_reduction/dis_sim.py
@@ -1,13 +1,17 @@
# SPDX-License-Identifier: BSD-3-Clause
# adapted from skhubness: https://github.com/VarIr/scikit-hubness/

from typing import Tuple, TypeVar

import numpy as np
from sklearn.metrics import euclidean_distances
from sklearn.utils.extmath import row_norms
from sklearn.utils.validation import check_is_fitted

from .base import HubnessReduction

T = TypeVar("T")

_DESIRED_P_VALUE = 2
_MINIMUM_DIST = 0.0

@@ -90,7 +94,7 @@ def _fit(
# Calculate local neighborhood centroids among the training points
knn = neigh_ind
centroids = source[knn].mean(axis=1)
if torch and isinstance(centroids, torch.Tensor):
if self._use_torch:
# see https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/utils/extmath.py#L87C21-L87C48
X = target - centroids
dist_to_cent = torch.einsum("ij,ij->i", X, X)
@@ -108,7 +112,7 @@ def transform(
neigh_dist,
neigh_ind,
query,
) -> tuple[np.ndarray, np.ndarray]:
) -> Tuple[T, T]:
"""Transform distance between test and training data with DisSimLocal.
Parameters
@@ -137,7 +141,7 @@ def transform(
["target_", "target_centroids_", "target_dist_to_centroids_"],
)
# Calculate local neighborhood centroids for source objects among target objects
if torch and isinstance(neigh_ind, torch.Tensor):
if self._use_torch:
# pairwise squared euclidean distance between each query vector and knn
# unsqueeze to enable batching
hub_reduced_dist = (
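Note: in the torch branch, sklearn's row_norms(X, squared=True) is replaced by torch.einsum("ij,ij->i", X, X), which likewise yields the squared Euclidean norm of every row. A small sanity check of that identity (illustrative only, assuming torch and scikit-learn are installed):

import numpy as np
import torch
from sklearn.utils.extmath import row_norms

X_np = np.random.rand(4, 3)
X_t = torch.from_numpy(X_np)

squared_np = row_norms(X_np, squared=True)      # numpy path via scikit-learn
squared_t = torch.einsum("ij,ij->i", X_t, X_t)  # torch path used in the diff

# both compute sum_j X[i, j]**2 for every row i
assert np.allclose(squared_np, squared_t.numpy())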
26 changes: 17 additions & 9 deletions kiez/hubness_reduction/local_scaling.py
@@ -1,11 +1,15 @@
# SPDX-License-Identifier: BSD-3-Clause
# adapted from skhubness: https://github.com/VarIr/scikit-hubness/

from typing import Tuple, TypeVar

import numpy as np
from sklearn.utils.validation import check_is_fitted

from .base import HubnessReduction

T = TypeVar("T")

try:
import torch
except ImportError:
@@ -79,12 +83,22 @@ def _fit(
self.r_ind_t_to_s_ = neigh_ind
return self

def _exp(self, inner_exp):
if self._use_torch:
return torch.exp(inner_exp)
return np.exp(inner_exp)

def _sqrt(self, value):
if self._use_torch:
return torch.sqrt(value)
return np.sqrt(value)

def transform(
self,
neigh_dist,
neigh_ind,
query=None,
) -> tuple[np.ndarray, np.ndarray]:
) -> Tuple[T, T]:
"""Transform distance between test and training data with Mutual Proximity.
Parameters
@@ -122,20 +136,14 @@ def transform(
r_t_to_s = self.r_dist_t_to_s_[:, -1]
r_s_to_t = r_dist_s_to_t[:, -1].reshape(-1, 1)
inner_exp = -1 * neigh_dist**2 / (r_s_to_t * r_t_to_s[neigh_ind])
if torch and isinstance(inner_exp, torch.Tensor):
exp = torch.exp(inner_exp)
else:
exp = np.exp(inner_exp)
exp = self._exp(inner_exp)
hub_reduced_dist = 1.0 - exp
# ...or use non-iterative contextual dissimilarity measure
elif self.method == "nicdm":
r_t_to_s = self.r_dist_t_to_s_.mean(axis=1)
r_s_to_t = r_dist_s_to_t.mean(axis=1).reshape(-1, 1)
inner_sqrt = r_s_to_t * r_t_to_s[neigh_ind]
if torch and isinstance(inner_sqrt, torch.Tensor):
sqrt = torch.sqrt(inner_sqrt)
else:
sqrt = np.sqrt(inner_sqrt)
sqrt = self._sqrt(inner_sqrt)
hub_reduced_dist = neigh_dist / sqrt

# Return the hubness reduced distances
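Note: the repeated torch-versus-numpy branches are folded into _exp and _sqrt helpers keyed on _use_torch, so the two scaling formulas stay readable. A numpy-only sketch of those formulas as they appear in the diff, with hypothetical argument names:

import numpy as np


def local_scaling(neigh_dist, r_source, r_target, method="ls"):
    # neigh_dist: (n_query, k) raw distances to candidate neighbors
    # r_source:   (n_query, 1) local scale of each query point
    # r_target:   (n_query, k) local scale of each candidate neighbor
    if method == "ls":
        # local scaling: 1 - exp(-d^2 / (r_source * r_target))
        return 1.0 - np.exp(-(neigh_dist**2) / (r_source * r_target))
    if method == "nicdm":
        # non-iterative contextual dissimilarity measure: d / sqrt(r_source * r_target)
        return neigh_dist / np.sqrt(r_source * r_target)
    raise ValueError(f"unknown method: {method!r}")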
53 changes: 32 additions & 21 deletions kiez/hubness_reduction/mutual_proximity.py
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: BSD-3-Clause
# adapted from skhubness: https://github.com/VarIr/scikit-hubness/

import warnings
from typing import Tuple, TypeVar

import numpy as np
from scipy import stats
@@ -10,6 +10,8 @@

from .base import HubnessReduction

T = TypeVar("T")

try:
import torch
from torch.distributions.normal import Normal
@@ -90,25 +92,38 @@ def _fit(
self.n_train = neigh_dist.shape[0]

if self.method == "empiric":
if torch and isinstance(neigh_dist, torch.Tensor):
warnings.warn(
"No Torch implementation for `method=empiric`. Will cast to and return numpy arrays!",
stacklevel=2,
)
neigh_dist = neigh_dist.cpu().numpy()
neigh_ind = neigh_ind.cpu().numpy()
self.neigh_dist_t_to_s_ = neigh_dist
self.neigh_ind_t_to_s_ = neigh_ind
elif self.method == "normal":
if torch and isinstance(neigh_dist, torch.Tensor):
if self._use_torch:
self.mu_t_to_s_ = torch.nanmean(neigh_dist, axis=1)
self.sd_t_to_s_ = torch.std(neigh_dist, axis=1)
else:
self.mu_t_to_s_ = np.nanmean(neigh_dist, axis=1)
self.sd_t_to_s_ = np.nanstd(neigh_dist, axis=1)
return self

def transform(self, neigh_dist, neigh_ind, query):
def _zeros(self, value):
if self._use_torch:
return torch.zeros(value)
return np.zeros(value)

def _empty_like(self, value):
if self._use_torch:
return torch.empty_like(value)
return np.empty_like(value)

def _sum(self, value, axis):
if self._use_torch:
return torch.sum(value, axis=axis)
return np.sum(value, axis=axis)

def _numel(self, value):
if self._use_torch:
return value.numel()
return value.size

def transform(self, neigh_dist, neigh_ind, query) -> Tuple[T, T]:
"""Transform distance between test and training data with Mutual Proximity.
Parameters
@@ -151,7 +166,7 @@ def transform(self, neigh_dist, neigh_ind, query):
if self.method == "normal":
mu_t_to_s = self.mu_t_to_s_
sd_t_to_s_ = self.sd_t_to_s_
if torch and isinstance(neigh_dist, torch.Tensor):
if self._use_torch:
mu = torch.nanmean(neigh_dist, axis=1).reshape(-1, 1)
sd = torch.std(neigh_dist, axis=1).reshape(-1, 1)
p1 = 1 - Normal(mu, sd).cdf(neigh_dist)
@@ -168,12 +183,8 @@ def transform(self, neigh_dist, neigh_ind, query):
hub_reduced_dist = 1 - p1 * p2
# Calculate MP empiric (slow)
elif self.method == "empiric":
if torch and isinstance(neigh_dist, torch.Tensor):
# already fired warning during fit
neigh_dist = neigh_dist.cpu().numpy()
neigh_ind = neigh_ind.cpu().numpy()
query = query.cpu().numpy()
hub_reduced_dist = np.empty_like(neigh_dist)
hub_reduced_dist = self._empty_like(neigh_dist)
n_test, n_indexed = neigh_dist.shape
# Show progress in hubness reduction loop
disable_tqdm = not self.verbose
@@ -185,10 +196,10 @@ def transform(self, neigh_dist, neigh_ind, query):

max_ind = max(self.neigh_ind_t_to_s_.max(), neigh_ind.max())
for i in range_n_test:
d_i = neigh_dist[i, :][np.newaxis, :] # broadcasted afterwards
d_j = np.zeros((d_i.size, n_indexed))
d_i = neigh_dist[i, :][None, :] # broadcasted afterwards
d_j = self._zeros((self._numel(d_i), n_indexed))
for j in range(n_indexed):
tmp = np.zeros(max_ind + 1) + (
tmp = self._zeros(max_ind + 1) + (
self.neigh_dist_t_to_s_[neigh_ind[i, j], -1] + 1e-6
)
tmp[
@@ -197,7 +208,7 @@ def transform(self, neigh_dist, neigh_ind, query):
d_j[j, :] = tmp[neigh_ind[i]]
d = d_i.T
hub_reduced_dist[i, :] = 1.0 - (
np.sum((d_i > d) & (d_j > d), axis=1) / n_indexed
self._sum((d_i > d) & (d_j > d), axis=1) / n_indexed
)

# Return the hubness reduced distances
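Note: the new _zeros, _empty_like, _sum and _numel helpers let the "empiric" method run directly on torch tensors instead of casting to numpy, while the "normal" method keeps its Gaussian model. A numpy/scipy-only sketch of the normal-model step, mirroring the formula in the diff (hypothetical function and argument names):

import numpy as np
from scipy import stats


def mutual_proximity_normal(neigh_dist, neigh_ind, mu_t_to_s, sd_t_to_s):
    # neigh_dist, neigh_ind: (n_query, k) candidate distances and indices
    # mu_t_to_s, sd_t_to_s: per-target mean/std of target-to-source distances (from fit)
    mu = np.nanmean(neigh_dist, axis=1).reshape(-1, 1)
    sd = np.nanstd(neigh_dist, axis=1).reshape(-1, 1)
    # probability that a random distance on the query side exceeds d
    p1 = 1.0 - stats.norm.cdf(neigh_dist, mu, sd)
    # same probability seen from each neighbor's side, using statistics learned in fit
    p2 = 1.0 - stats.norm.cdf(neigh_dist, mu_t_to_s[neigh_ind], sd_t_to_s[neigh_ind])
    # mutual proximity distance: small only if both sides agree the pair is close
    return 1.0 - p1 * p2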
30 changes: 23 additions & 7 deletions kiez/kiez.py
@@ -1,8 +1,6 @@
from __future__ import annotations

import json
from pathlib import Path
from typing import Any, Optional, Union
from typing import Any, Literal, Optional, Tuple, TypeVar, Union, overload

import numpy as np
from class_resolver import HintOrType
@@ -11,6 +9,8 @@
from kiez.hubness_reduction.base import HubnessReduction
from kiez.neighbors import NNAlgorithm, nn_algorithm_resolver

T = TypeVar("T")


class Kiez:
"""Performs hubness reduced nearest neighbor search for entity alignment.
@@ -124,12 +124,12 @@ def __repr__(self):
)

@classmethod
def from_path(cls, path: Union[str, Path]) -> Kiez:
def from_path(cls, path: Union[str, Path]) -> "Kiez":
"""Load a Kiez instance from configuration in a JSON file, based on its path."""
with open(path) as file:
return cls(**json.load(file))

def fit(self, source, target=None) -> Kiez:
def fit(self, source: T, target: Optional[T] = None) -> "Kiez":
"""Fits the algorithm and hubness reduction method.
Parameters
@@ -147,11 +147,27 @@ def fit(self, source, target=None) -> Kiez:
self.hubness.fit(source, target)
return self

@overload
def kneighbors(
self,
k: Optional[int] = None,
return_distance: Literal[True] = True,
) -> Tuple[T, T]:
...

@overload
def kneighbors(
self,
k: Optional[int] = None,
return_distance: Literal[False] = False,
) -> Any:
...

def kneighbors(
self,
k: Optional[int] = None,
return_distance=True,
) -> Union[np.ndarray, tuple[np.ndarray, np.ndarray]]:
return_distance: bool = True,
) -> Union[T, Tuple[T, T]]:
"""Retrieve the k-nearest neighbors using the supplied nearest neighbor algorithm and hubness reduction method.
Parameters
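Note: the @overload declarations tie the return type of kneighbors to the literal value of return_distance, so a type checker sees a (distances, indices) pair by default and a single result when return_distance=False. A condensed sketch of that pattern outside kiez (illustrative class and shapes):

from typing import Literal, Optional, Tuple, overload

import numpy as np


class Index:
    @overload
    def kneighbors(
        self, k: Optional[int] = None, return_distance: Literal[True] = True
    ) -> Tuple[np.ndarray, np.ndarray]:
        ...

    @overload
    def kneighbors(
        self, k: Optional[int] = None, return_distance: Literal[False] = False
    ) -> np.ndarray:
        ...

    def kneighbors(self, k=None, return_distance=True):
        # toy implementation: random distances, sorted indices
        dist = np.random.rand(2, k or 5)
        ind = np.argsort(dist, axis=1)
        return (dist, ind) if return_distance else ind


dist, ind = Index().kneighbors(3)                        # checker infers a tuple
ind_only = Index().kneighbors(3, return_distance=False)  # checker infers one array

At runtime only the final definition exists; the overloads are erased and serve the type checker.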
(diffs for the remaining 13 changed files not shown)
