Skip to content

Commit

Permalink
Switch to ruff, include new rules and adapt code (#34)
Browse files Browse the repository at this point in the history
* Switch to ruff and adapt

* More linting rules and adaptations

* Even more linting and adjustments

* Remove unnecessary test
  • Loading branch information
dobraczka authored Jan 2, 2024
1 parent 95c6852 commit e97e28e
Show file tree
Hide file tree
Showing 33 changed files with 750 additions and 1,064 deletions.
29 changes: 22 additions & 7 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,24 @@
repos:
- repo: local
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
hooks:
- id: nox
name: Nox
entry: nox -rs lint --
language: system
types: [python]
require_serial: true
- id: check-added-large-files
- id: check-case-conflict
- id: check-merge-conflict
- id: check-symlinks
- id: check-yaml
- id: debug-statements
- id: end-of-file-fixer
- id: mixed-line-ending
- id: requirements-txt-fixer
- id: trailing-whitespace
- repo: https://github.com/tox-dev/pyproject-fmt
rev: "1.3.0"
hooks:
- id: pyproject-fmt
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.3
hooks:
- id: ruff
args: ["--fix", "--show-fixes"]
- id: ruff-format
5 changes: 3 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@
import os
import sys

from kiez import __version__
from sphinx.ext.autodoc import between

from kiez import __version__

sys.path.insert(0, os.path.abspath("."))


Expand All @@ -30,7 +31,7 @@ def setup(app):
# -- Project information -----------------------------------------------------

project = "kiez"
copyright = "2021, Daniel Obraczka"
copyright = "2021, Daniel Obraczka" # noqa: A001
author = "Daniel Obraczka"

# The full version, including alpha/beta/rc tags
Expand Down
5 changes: 2 additions & 3 deletions docs/source/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,10 @@ You can also get other specific libraries with e.g.:
Other options to get specific libraries are ``nmslib``, ``annoy``, ``ngt``. However, faiss is the recommended library, which provides the most accurate and fastest results.


To build kiez from source use `poetry <https://python-poetry.org/>`_
To build kiez from source use `poetry <https://python-poetry.org/>`_

.. code-block:: bash
git clone git@github.com:dobraczka/kiez.git
git clone git@github.com:dobraczka/kiez.git
cd kiez
poetry install
1 change: 0 additions & 1 deletion kiez/analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*-
# SPDX-License-Identifier: BSD-3-Clause

from .estimation import hubness_score
Expand Down
46 changes: 15 additions & 31 deletions kiez/analysis/estimation.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,14 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# SPDX-License-Identifier: BSD-3-Clause
# adapted from skhubness: https://github.com/VarIr/scikit-hubness/
"""
Estimate hubness in datasets
"""
"""Estimate hubness in datasets."""


from __future__ import annotations

import logging
import warnings
from typing import Optional, Tuple, Union
from typing import Optional, Union

import numpy as np
from scipy import stats
Expand All @@ -34,6 +31,8 @@
"k_occurrence",
]

_SPACE_LIMIT = 10000


def _calc_skewness_truncnorm(k_occurrence: np.ndarray) -> float:
"""Hubness measure; corrected for non-negativity of k-occurrence.
Expand All @@ -56,8 +55,7 @@ def _calc_skewness_truncnorm(k_occurrence: np.ndarray) -> float:
k_occurrence_std = k_occurrence.std(ddof=1)
a = (clip_left - k_occurrence_mean) / k_occurrence_std
b = (clip_right - k_occurrence_mean) / k_occurrence_std
skew_truncnorm = stats.truncnorm(a, b).moment(3)
return skew_truncnorm
return stats.truncnorm(a, b).moment(3)


def _calc_gini_index(
Expand Down Expand Up @@ -151,7 +149,7 @@ def _calc_atkinson_index(k_occurrence: np.ndarray, eps: float = 0.5) -> float:
return float(1.0 - 1.0 / k_occurrence.mean() * term)


def _calc_antihub_occurrence(k_occurrence: np.ndarray) -> Tuple[np.ndarray, float]:
def _calc_antihub_occurrence(k_occurrence: np.ndarray) -> tuple[np.ndarray, float]:
"""Proportion of antihubs in data set.
Antihubs are objects that are never among the nearest neighbors
Expand All @@ -173,7 +171,7 @@ def _calc_antihub_occurrence(k_occurrence: np.ndarray) -> Tuple[np.ndarray, floa

def _calc_hub_occurrence(
k: int, k_occurrence: np.ndarray, n_test: int, hub_size: float = 2
) -> Tuple[np.ndarray, float]:
) -> tuple[np.ndarray, float]:
"""Proportion of nearest neighbor slots occupied by hubs.
Parameters
Expand Down Expand Up @@ -202,8 +200,6 @@ def hubness_score(
*,
k: Optional[int] = None,
hub_size: float = 2.0,
shuffle_equal: bool = True,
random_state=None,
verbose: int = 0,
return_value: str = "all_but_gini",
store_k_occurrence: bool = False,
Expand All @@ -222,16 +218,6 @@ def hubness_score(
number of k for k-nearest neighbor
hub_size : float
Hubs are defined as objects with k-occurrence > hub_size * k.
shuffle_equal : bool
If true shuffle neighbors with identical distances
to avoid artifact hubness.
NOTE: This is especially useful for secondary distance measures
with a finite number of possible values
random_state: int, RandomState instance or None, optional
If int, random_state is the seed used by the random number generator;
If RandomState instance, random_state is the random number generator;
If None, the random number generator is the RandomState instance used
by `np.random`.
verbose : int
Level of output messages
return_value : str
Expand Down Expand Up @@ -288,12 +274,11 @@ def hubness_score(
k_neighbors = nn_ind.copy()
if k is None:
k = nn_ind.shape[1]
else:
if k < k_neighbors.shape[1]:
k_neighbors = k_neighbors[:, :k]
elif k > k_neighbors.shape[1]:
k = nn_ind.shape[1]
warnings.warn(f"k > nn_ind.shape[1], k will be set to {k}")
elif k < k_neighbors.shape[1]:
k_neighbors = k_neighbors[:, :k]
elif k > k_neighbors.shape[1]:
k = nn_ind.shape[1]
warnings.warn(f"k > nn_ind.shape[1], k will be set to {k}", stacklevel=2)
assert k is not None

# Negative indices can occur, when ANN does not find enough neighbors,
Expand All @@ -317,7 +302,7 @@ def hubness_score(

# Gini index
if return_value in ["gini", "all"]:
limiting = "space" if k_occurrence.shape[0] > 10_000 else "time"
limiting = "space" if k_occurrence.shape[0] > _SPACE_LIMIT else "time"
gini_index = _calc_gini_index(k_occurrence, limiting, verbose=verbose)
else:
gini_index = np.nan
Expand Down Expand Up @@ -360,8 +345,7 @@ def hubness_score(
hubness_measures["k_occurrence"] = k_occurrence
if return_value == "all":
return hubness_measures
elif return_value == "all_but_gini":
if return_value == "all_but_gini":
del hubness_measures["gini"]
return hubness_measures
else:
return hubness_measures[return_value]
return hubness_measures[return_value]
6 changes: 2 additions & 4 deletions kiez/evaluate/eval_metrics.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
"""
Calculate evaluation metrics such as hits@k
"""
"""Calculate evaluation metrics such as hits@k."""
from typing import Any, Dict, List, Union

import numpy as np
Expand All @@ -27,7 +25,7 @@ def hits(
gold: Dict[Any, Any], # source -> target
k=None,
) -> Dict[int, float]:
"""Show hits@k
"""Show hits@k.
Parameters
----------
Expand Down
6 changes: 4 additions & 2 deletions kiez/hubness_reduction/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,15 @@ def transform(self, neigh_dist, neigh_ind, query) -> Tuple[np.ndarray, np.ndarra
def _set_k_if_needed(self, k: Optional[int] = None) -> int:
if k is None:
warnings.warn(
f"No k supplied, setting to n_candidates = {self.nn_algo.n_candidates}"
f"No k supplied, setting to n_candidates = {self.nn_algo.n_candidates}",
stacklevel=2,
)
return self.nn_algo.n_candidates
if k > self.nn_algo.n_candidates:
warnings.warn(
"k > n_candidates supplied! Setting to n_candidates ="
f" {self.nn_algo.n_candidates}"
f" {self.nn_algo.n_candidates}",
stacklevel=2,
)
return self.nn_algo.n_candidates
return k
Expand Down
7 changes: 2 additions & 5 deletions kiez/hubness_reduction/csls.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
from __future__ import annotations

import warnings
from typing import Tuple

import numpy as np
from sklearn.utils.validation import check_consistent_length, check_is_fitted
from sklearn.utils.validation import check_is_fitted
from tqdm.auto import tqdm

from .base import HubnessReduction
Expand Down Expand Up @@ -62,7 +59,7 @@ def transform(
neigh_dist,
neigh_ind,
query,
) -> Tuple[np.ndarray, np.ndarray]:
) -> tuple[np.ndarray, np.ndarray]:
"""Transform distance between test and training data with CSLS.
Parameters
Expand Down
28 changes: 13 additions & 15 deletions kiez/hubness_reduction/dis_sim.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,18 @@
# -*- coding: utf-8 -*-
# SPDX-License-Identifier: BSD-3-Clause
# adapted from skhubness: https://github.com/VarIr/scikit-hubness/

from __future__ import annotations

import warnings
from typing import Tuple

import numpy as np
from sklearn.metrics import euclidean_distances
from sklearn.utils.extmath import row_norms
from sklearn.utils.validation import check_consistent_length, check_is_fitted
from sklearn.utils.validation import check_is_fitted

from .base import HubnessReduction

_DESIRED_P_VALUE = 2
_MINIMUM_DIST = 0.0


class DisSimLocal(HubnessReduction):
"""Hubness reduction with DisSimLocal.
Expand All @@ -31,7 +30,7 @@ class DisSimLocal(HubnessReduction):
----------
.. [1] Hara K, Suzuki I, Kobayashi K, Fukumizu K, Radovanović M (2016)
Flattening the density gradient for eliminating spatial centrality to reduce hubness.
In: Proceedings of the 30th AAAI conference on artificial intelligence, pp 16591665.
In: Proceedings of the 30th AAAI conference on artificial intelligence, pp 1659-1665.
https://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/viewPaper/12055
"""

Expand All @@ -40,13 +39,12 @@ def __init__(self, squared: bool = True, **kwargs):
self.squared = squared
if self.nn_algo.metric in ["euclidean", "minkowski"]:
self.squared = False
if hasattr(self.nn_algo, "p"):
if self.nn_algo.p != 2:
raise ValueError(
"DisSimLocal only supports squared Euclidean distances. If"
" the provided NNAlgorithm has a `p` parameter it must be"
f" set to p=2. Now it is p={self.nn_algo.p}"
)
if hasattr(self.nn_algo, "p") and self.nn_algo.p != _DESIRED_P_VALUE:
raise ValueError(
"DisSimLocal only supports squared Euclidean distances. If"
" the provided NNAlgorithm has a `p` parameter it must be"
f" set to p=2. Now it is p={self.nn_algo.p}"
)
elif self.nn_algo.metric in ["sqeuclidean"]:
self.squared = True
else:
Expand Down Expand Up @@ -102,7 +100,7 @@ def transform(
neigh_dist: np.ndarray,
neigh_ind: np.ndarray,
query: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray]:
) -> tuple[np.ndarray, np.ndarray]:
"""Transform distance between test and training data with DisSimLocal.
Parameters
Expand Down Expand Up @@ -156,7 +154,7 @@ def transform(
# certain scikit-learn routines (e.g. in metric='precomputed' usages).
# We, therefore, shift dissimilarities to non-negative values, if necessary.
min_dist = hub_reduced_dist.min()
if min_dist < 0.0:
if min_dist < _MINIMUM_DIST:
hub_reduced_dist += -min_dist

# Return Euclidean or squared Euclidean distances?
Expand Down
12 changes: 4 additions & 8 deletions kiez/hubness_reduction/local_scaling.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,10 @@
# -*- coding: utf-8 -*-
# SPDX-License-Identifier: BSD-3-Clause
# adapted from skhubness: https://github.com/VarIr/scikit-hubness/

from __future__ import annotations

import warnings
from typing import Tuple

import numpy as np
from sklearn.utils.validation import check_consistent_length, check_is_fitted
from sklearn.utils.validation import check_is_fitted
from tqdm.auto import tqdm

from .base import HubnessReduction
Expand All @@ -34,7 +30,7 @@ class LocalScaling(HubnessReduction):
----------
.. [1] Schnitzer, D., Flexer, A., Schedl, M., & Widmer, G. (2012).
Local and global scaling reduce hubs in space. The Journal of Machine
Learning Research, 13(1), 28712902.
Learning Research, 13(1), 2871-2902.
"""

def __init__(self, method: str = "standard", **kwargs):
Expand Down Expand Up @@ -86,7 +82,7 @@ def transform(
neigh_dist,
neigh_ind,
query=None,
) -> Tuple[np.ndarray, np.ndarray]:
) -> tuple[np.ndarray, np.ndarray]:
"""Transform distance between test and training data with Mutual Proximity.
Parameters
Expand Down Expand Up @@ -146,7 +142,7 @@ def transform(
r_s_to_t = r_dist_s_to_t.mean(axis=1)
for i in range_n_test:
hub_reduced_dist[i, :] = neigh_dist[i] / np.sqrt(
(r_s_to_t[i] * r_t_to_s[neigh_ind[i]])
r_s_to_t[i] * r_t_to_s[neigh_ind[i]]
)

# Return the hubness reduced distances
Expand Down
Loading

0 comments on commit e97e28e

Please sign in to comment.