@@ -61,6 +61,11 @@ class StringEncoder(TransformerMixin, SingleColumnTransformer):
6161 Used during randomized svd. Pass an int for reproducible results across
6262 multiple function calls.
6363
64+ vocabulary : Mapping or iterable, default=None
65+ Vocabulary passed to the vectorizer when ``vectorizer="tfidf"``.
66+ Either a Mapping (e.g., a dict) where keys are terms and values are
67+ indices in the feature matrix, or an iterable over terms. Must be left to None when ``vectorizer="hashing"``, otherwise a ValueError is raised.
68+
6469 Attributes
6570 ----------
6671 input_name_ : str
@@ -131,13 +136,15 @@ def __init__(
131136 analyzer = "char_wb" ,
132137 stop_words = None ,
133138 random_state = None ,
139+ vocabulary = None ,
134140 ):
135141 self .n_components = n_components
136142 self .vectorizer = vectorizer
137143 self .ngram_range = ngram_range
138144 self .analyzer = analyzer
139145 self .stop_words = stop_words
140146 self .random_state = random_state
147+ self .vocabulary = vocabulary
141148
142149 def fit_transform (self , X , y = None ):
143150 """Fit the encoder and transform a column.
@@ -165,21 +172,29 @@ def fit_transform(self, X, y=None):
165172 ngram_range = self .ngram_range ,
166173 analyzer = self .analyzer ,
167174 stop_words = self .stop_words ,
175+ vocabulary = self .vocabulary ,
168176 )
169177 elif self .vectorizer == "hashing" :
170- self .vectorizer_ = Pipeline (
171- [
172- (
173- "hashing" ,
174- HashingVectorizer (
175- ngram_range = self .ngram_range ,
176- analyzer = self .analyzer ,
177- stop_words = self .stop_words ,
178+ if self .vocabulary is not None :
179+ raise ValueError (
180+ "Custom vocabulary passed to StringEncoder, unsupported by "
181+ "HashingVectorizer. Rerun without a 'vocabulary' parameter."
182+ )
183+ else :
184+ self .vectorizer_ = Pipeline (
185+ [
186+ (
187+ "hashing" ,
188+ HashingVectorizer (
189+ ngram_range = self .ngram_range ,
190+ analyzer = self .analyzer ,
191+ stop_words = self .stop_words ,
192+ ),
178193 ),
179- ),
180- ( "tfidf" , TfidfTransformer ()),
181- ]
182- )
194+ ( "tfidf" , TfidfTransformer () ),
195+ ]
196+ )
197+
183198 else :
184199 raise ValueError (
185200 f"Unknown vectorizer { self .vectorizer } . Options are 'tfidf' or"
0 commit comments