Skip to content

Commit ce7d156

Browse files
authored
Merge branch 'skrub-data:main' into chore-fix-issue1729
2 parents f67ac8f + 85c1c06 commit ce7d156

File tree

7 files changed

+87
-46
lines changed

7 files changed

+87
-46
lines changed

CHANGES.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,13 @@ New features
1414

1515
Changes
1616
-------
17+
- The :class:`StringEncoder` now exposes the ``vocabulary`` parameter of the underlying
18+
:class:`TfidfVectorizer`.
19+
  :pr:`1819` by :user:`Eloi Massoulié <emassoulie>`.
20+
21+
22+
- :func:`compute_ngram_distance` has been renamed to :func:`_compute_ngram_distance` and is now a private function.
23+
:pr:`1838` by :user:`Siddharth Baleja <siddharthbaleja>`.
1724

1825
Bugfixes
1926
--------

examples/0050_deduplication.py

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -148,33 +148,6 @@
148148
# If we want to adapt the translation table, we can
149149
# modify it manually.
150150

151-
###############################################################################
152-
# Visualizing string pair-wise distance between names
153-
# ---------------------------------------------------
154-
#
155-
# Below, we use a heatmap to visualize the pairwise-distance between medication
156-
# names. A darker color means that two medication names are closer together
157-
# (i.e. more similar), a lighter color means a larger distance.
158-
#
159-
160-
from scipy.spatial.distance import squareform
161-
162-
from skrub import compute_ngram_distance
163-
164-
ngram_distances = compute_ngram_distance(unique_examples)
165-
square_distances = squareform(ngram_distances)
166-
167-
import seaborn as sns
168-
169-
fig, ax = plt.subplots(figsize=(14, 12))
170-
sns.heatmap(
171-
square_distances, yticklabels=unique_examples, xticklabels=unique_examples, ax=ax
172-
)
173-
plt.show()
174-
175-
###############################################################################
176-
# We have three clusters appearing - the original medication
177-
# names and their misspellings that form a cluster around them.
178151

179152
###############################################################################
180153
# Conclusion

skrub/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
y,
3030
)
3131
from ._datetime_encoder import DatetimeEncoder
32-
from ._deduplicate import compute_ngram_distance, deduplicate
32+
from ._deduplicate import deduplicate
3333
from ._drop_uninformative import DropUninformative
3434
from ._fuzzy_join import fuzzy_join
3535
from ._gap_encoder import GapEncoder
@@ -77,7 +77,7 @@
7777
"Cleaner",
7878
"DropUninformative",
7979
"deduplicate",
80-
"compute_ngram_distance",
8181
"ToCategorical",
8282
"to_datetime",
8383
"AggJoiner",

skrub/_deduplicate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from sklearn.metrics import silhouette_score
1313

1414

15-
def compute_ngram_distance(
15+
def _compute_ngram_distance(
1616
unique_words,
1717
ngram_range=(2, 4),
1818
analyzer="char_wb",
@@ -260,7 +260,7 @@ def deduplicate(
260260
9 white 9 white
261261
"""
262262
unique_words, counts = np.unique(X, return_counts=True)
263-
distance_mat = compute_ngram_distance(
263+
distance_mat = _compute_ngram_distance(
264264
unique_words, ngram_range=ngram_range, analyzer=analyzer
265265
)
266266

skrub/_string_encoder.py

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,11 @@ class StringEncoder(TransformerMixin, SingleColumnTransformer):
6161
Used during randomized svd. Pass an int for reproducible results across
6262
multiple function calls.
6363
64+
vocabulary : Mapping or iterable, default=None
65+
Only used when ``vectorizer="tfidf"``: the vocabulary passed to the underlying vectorizer.
66+
Either a Mapping (e.g., a dict) where keys are terms and values are
67+
indices in the feature matrix, or an iterable over terms.
68+
6469
Attributes
6570
----------
6671
input_name_ : str
@@ -131,13 +136,15 @@ def __init__(
131136
analyzer="char_wb",
132137
stop_words=None,
133138
random_state=None,
139+
vocabulary=None,
134140
):
135141
self.n_components = n_components
136142
self.vectorizer = vectorizer
137143
self.ngram_range = ngram_range
138144
self.analyzer = analyzer
139145
self.stop_words = stop_words
140146
self.random_state = random_state
147+
self.vocabulary = vocabulary
141148

142149
def fit_transform(self, X, y=None):
143150
"""Fit the encoder and transform a column.
@@ -165,21 +172,29 @@ def fit_transform(self, X, y=None):
165172
ngram_range=self.ngram_range,
166173
analyzer=self.analyzer,
167174
stop_words=self.stop_words,
175+
vocabulary=self.vocabulary,
168176
)
169177
elif self.vectorizer == "hashing":
170-
self.vectorizer_ = Pipeline(
171-
[
172-
(
173-
"hashing",
174-
HashingVectorizer(
175-
ngram_range=self.ngram_range,
176-
analyzer=self.analyzer,
177-
stop_words=self.stop_words,
178+
if self.vocabulary is not None:
179+
raise ValueError(
180+
"Custom vocabulary passed to StringEncoder, unsupported by "
181+
"HashingVectorizer. Rerun without a 'vocabulary' parameter."
182+
)
183+
else:
184+
self.vectorizer_ = Pipeline(
185+
[
186+
(
187+
"hashing",
188+
HashingVectorizer(
189+
ngram_range=self.ngram_range,
190+
analyzer=self.analyzer,
191+
stop_words=self.stop_words,
192+
),
178193
),
179-
),
180-
("tfidf", TfidfTransformer()),
181-
]
182-
)
194+
("tfidf", TfidfTransformer()),
195+
]
196+
)
197+
183198
else:
184199
raise ValueError(
185200
f"Unknown vectorizer {self.vectorizer}. Options are 'tfidf' or"

skrub/tests/test_deduplicate.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77
from sklearn.utils._testing import assert_array_equal, skip_if_no_parallel
88

99
from skrub._deduplicate import (
10+
_compute_ngram_distance,
1011
_create_spelling_correction,
1112
_guess_clusters,
12-
compute_ngram_distance,
1313
deduplicate,
1414
)
1515
from skrub.datasets import make_deduplication_data
@@ -60,7 +60,7 @@ def test_deduplicate(
6060

6161
def test_compute_ngram_distance():
6262
words = np.array(["aac", "aaa", "aaab", "aaa", "aaab", "aaa", "aaab", "aaa"])
63-
distance = compute_ngram_distance(words)
63+
distance = _compute_ngram_distance(words)
6464
distance = squareform(distance)
6565
assert distance.shape[0] == words.shape[0]
6666
assert np.allclose(np.diag(distance), 0)
@@ -70,7 +70,7 @@ def test_compute_ngram_distance():
7070

7171
def test__guess_clusters():
7272
words = np.array(["aac", "aaa", "aaab", "aaa", "aaab", "aaa", "aaab", "aaa"])
73-
distance = compute_ngram_distance(words)
73+
distance = _compute_ngram_distance(words)
7474
Z = linkage(distance, method="average")
7575
n_clusters = _guess_clusters(Z, distance)
7676
assert n_clusters == len(np.unique(words))

skrub/tests/test_string_encoder.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,3 +310,49 @@ def test_zero_padding_in_feature_names_out(df_module, n_components, expected_col
310310
feature_names = encoder.get_feature_names_out()
311311

312312
assert feature_names[: len(expected_columns)] == expected_columns
313+
314+
315+
def test_vocabulary_parameter(df_module):
316+
voc = {
317+
"this": 5,
318+
"is": 1,
319+
"simple": 3,
320+
"example": 0,
321+
"this is": 6,
322+
"is simple": 2,
323+
"simple example": 4,
324+
}
325+
encoder = StringEncoder(n_components=2, vocabulary=voc)
326+
pipeline = Pipeline(
327+
[
328+
(
329+
"tfidf",
330+
TfidfVectorizer(ngram_range=(3, 4), analyzer="char_wb", vocabulary=voc),
331+
),
332+
("tsvd", TruncatedSVD()),
333+
]
334+
)
335+
X = df_module.make_column(
336+
"col",
337+
["this is a sentence", "this simple example is simple", "other words", ""],
338+
)
339+
340+
enc_out = encoder.fit_transform(X)
341+
pipe_out = pipeline.fit_transform(X)
342+
pipe_out /= scaling_factor(pipe_out)
343+
344+
assert encoder.vectorizer_.vocabulary_ == voc
345+
assert_almost_equal(enc_out, pipe_out)
346+
347+
348+
def test_vocabulary_on_hashing_vectorizer(df_module):
349+
voc = {
350+
"this": 5,
351+
}
352+
encoder = StringEncoder(vocabulary=voc, vectorizer="hashing")
353+
with pytest.raises(ValueError, match="Custom vocabulary passed to StringEncoder"):
354+
X = df_module.make_column(
355+
"col",
356+
["this is a sentence", "this simple example is simple", "other words", ""],
357+
)
358+
encoder.fit_transform(X)

0 commit comments

Comments
 (0)