35
35
from dask .utils import M
36
36
from tqdm import tqdm
37
37
38
- from nemo_curator ._compat import MINHASH_PERMUTED_AVAILABLE
38
+ from nemo_curator ._compat import MINHASH_DEPRECATED_API , MINHASH_PERMUTED_AVAILABLE
39
39
from nemo_curator .datasets import DocumentDataset
40
40
from nemo_curator .log import create_logger
41
41
from nemo_curator .modules .config import FuzzyDuplicatesConfig
@@ -98,15 +98,17 @@ def __init__(
98
98
"""
99
99
self .num_hashes = num_hashes
100
100
self .char_ngram = char_ngrams
101
- if MINHASH_PERMUTED_AVAILABLE :
101
+ if MINHASH_DEPRECATED_API :
102
+ self .seeds = self .generate_seeds (n_seeds = self .num_hashes , seed = seed )
103
+ else :
102
104
self .seeds = self .generate_hash_permutation_seeds (
103
105
bit_width = 64 if use_64bit_hash else 32 ,
104
106
n_permutations = self .num_hashes ,
105
107
seed = seed ,
106
108
)
107
- else :
108
- self .seeds = self .generate_seeds (n_seeds = self .num_hashes , seed = seed )
109
+
109
110
self .minhash_method = self .minhash64 if use_64bit_hash else self .minhash32
111
+
110
112
self .id_field = id_field
111
113
self .text_field = text_field
112
114
@@ -171,7 +173,7 @@ def minhash32(
171
173
if not isinstance (ser , cudf .Series ):
172
174
raise TypeError ("Expected data of type cudf.Series" )
173
175
174
- if not MINHASH_PERMUTED_AVAILABLE :
176
+ if MINHASH_DEPRECATED_API :
175
177
warnings .warn (
176
178
"Using an outdated minhash implementation, please update to cuDF version 24.12 "
177
179
"or later for improved performance. "
@@ -184,9 +186,14 @@ def minhash32(
184
186
seeds_a = cudf .Series (seeds [:, 0 ], dtype = "uint32" )
185
187
seeds_b = cudf .Series (seeds [:, 1 ], dtype = "uint32" )
186
188
187
- return ser .str .minhash_permuted (
188
- a = seeds_a , b = seeds_b , seed = seeds [0 ][0 ], width = char_ngram
189
- )
189
+ if MINHASH_PERMUTED_AVAILABLE :
190
+ return ser .str .minhash_permuted (
191
+ a = seeds_a , b = seeds_b , seed = seeds [0 ][0 ], width = char_ngram
192
+ )
193
+ else :
194
+ return ser .str .minhash (
195
+ a = seeds_a , b = seeds_b , seed = seeds [0 ][0 ], width = char_ngram
196
+ )
190
197
191
198
def minhash64 (
192
199
self , ser : cudf .Series , seeds : np .ndarray , char_ngram : int
@@ -196,7 +203,7 @@ def minhash64(
196
203
"""
197
204
if not isinstance (ser , cudf .Series ):
198
205
raise TypeError ("Expected data of type cudf.Series" )
199
- if not MINHASH_PERMUTED_AVAILABLE :
206
+ if MINHASH_DEPRECATED_API :
200
207
warnings .warn (
201
208
"Using an outdated minhash implementation, please update to cuDF version 24.12 "
202
209
"or later for improved performance. "
@@ -209,9 +216,14 @@ def minhash64(
209
216
seeds_a = cudf .Series (seeds [:, 0 ], dtype = "uint64" )
210
217
seeds_b = cudf .Series (seeds [:, 1 ], dtype = "uint64" )
211
218
212
- return ser .str .minhash64_permuted (
213
- a = seeds_a , b = seeds_b , seed = seeds [0 ][0 ], width = char_ngram
214
- )
219
+ if MINHASH_PERMUTED_AVAILABLE :
220
+ return ser .str .minhash64_permuted (
221
+ a = seeds_a , b = seeds_b , seed = seeds [0 ][0 ], width = char_ngram
222
+ )
223
+ else :
224
+ return ser .str .minhash64 (
225
+ a = seeds_a , b = seeds_b , seed = seeds [0 ][0 ], width = char_ngram
226
+ )
215
227
216
228
def __call__ (self , dataset : DocumentDataset ) -> Union [str , DocumentDataset ]:
217
229
"""
0 commit comments