Commit 71f9599

Merge pull request #765 from snipsco/release/0.19.3
Release 0.19.3
2 parents: 04087e1 + 4b850fb

18 files changed: +162 -538 lines

CHANGELOG.md (+8)

```diff
@@ -1,6 +1,13 @@
 # Changelog
 All notable changes to this project will be documented in this file.
 
+## [0.19.3] - 2019-03-05
+### Fixed
+- Issue with intent classification reducing classification accuracy
+- Issue resulting in a mutation of the CRFSlotFillerConfig
+- Wrong required resources of the `DeterministicIntentParser`
+- Issue with non ASCII characters when using the parsing CLI with Python2
+
 ## [0.19.2] - 2019-02-11
 ### Fixed
 - Fix an issue regarding the way builtin entities were handled by the `CRFSlotFiller`
@@ -236,6 +243,7 @@ several commands.
 - Fix compiling issue with `bindgen` dependency when installing from source
 - Fix issue in `CRFSlotFiller` when handling builtin entities
 
+[0.19.3]: https://github.com/snipsco/snips-nlu/compare/0.19.2...0.19.3
 [0.19.2]: https://github.com/snipsco/snips-nlu/compare/0.19.1...0.19.2
 [0.19.1]: https://github.com/snipsco/snips-nlu/compare/0.19.0...0.19.1
 [0.19.0]: https://github.com/snipsco/snips-nlu/compare/0.18.0...0.19.0
```

snips_nlu/__about__.py (+1, -1)

```diff
@@ -11,7 +11,7 @@
 
 __license__ = "Apache License, Version 2.0"
 
-__version__ = "0.19.2"
+__version__ = "0.19.3"
 __model_version__ = "0.19.0"
 
 __download_url__ = "https://github.com/snipsco/snips-nlu-language-resources/releases/download"
```

snips_nlu/cli/inference.py (+3)

```diff
@@ -18,6 +18,7 @@
 )
 def parse(training_path, query, verbose=False):
     """Load a trained NLU engine and play with its parsing API interactively"""
+    from builtins import str
     if verbose:
         set_nlu_logger(logging.DEBUG)
 
@@ -29,6 +30,8 @@ def parse(training_path, query, verbose=False):
 
     while True:
         query = input("Enter a query (type 'q' to quit): ").strip()
+        if not isinstance(query, str):
+            query = query.decode("utf-8")
        if query == "q":
             break
         print_parsing_result(engine, query)
```
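
The extra `isinstance` guard is what fixes the non-ASCII changelog entry: under Python 2 the interactive prompt hands back a byte string, while `builtins.str` (the `future` backport, stdlib `builtins` on Python 3) is the unicode type. A minimal sketch of the same pattern, with `read_query` as a hypothetical stand-in for the prompt handling:

```python
# Minimal sketch of the decode guard: on Python 2, `builtins.str` is the
# unicode type, so raw byte strings fail the isinstance check and get
# decoded explicitly; on Python 3 the check is effectively a no-op.
from builtins import str


def read_query(raw):
    # `raw` stands in for whatever input() returned (bytes on Python 2)
    if not isinstance(raw, str):
        raw = raw.decode("utf-8")
    return raw.strip()


assert read_query(b"caf\xc3\xa9 nearby ") == "café nearby"
assert read_query("café nearby") == "café nearby"
```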

snips_nlu/cli/utils.py (+7, -3)

```diff
@@ -7,6 +7,7 @@
 from enum import Enum, unique
 
 import requests
+from semantic_version import Version
 
 import snips_nlu
 from snips_nlu import __about__
@@ -71,13 +72,16 @@ def get_json(url, desc):
 
 def get_compatibility():
     version = __about__.__version__
+    semver_version = Version(version)
+    minor_version = "%d.%d" % (semver_version.major, semver_version.minor)
     table = get_json(__about__.__compatibility__, "Compatibility table")
-    compatibility = table["snips-nlu"]
-    if version not in compatibility:
+    nlu_table = table["snips-nlu"]
+    compatibility = nlu_table.get(version, nlu_table.get(minor_version))
+    if compatibility is None:
         pretty_print("No compatible resources found for version %s" % version,
                      title="Resources compatibility error", exits=1,
                      level=PrettyPrintLevel.ERROR)
-    return compatibility[version]
+    return compatibility
 
 
 def get_resources_version(resource_fullname, resource_alias, compatibility):
```
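
The effect of the new lookup: an exact version entry in the compatibility table still wins, but a release like 0.19.3 can now fall back to a `0.19` minor-series entry instead of erroring out. A sketch under an assumed table shape (the real hosted table may differ):

```python
# Sketch of the fallback resolution; the table contents are hypothetical.
from semantic_version import Version


def resolve_compatibility(nlu_table, version):
    semver_version = Version(version)
    minor_version = "%d.%d" % (semver_version.major, semver_version.minor)
    # Exact match first, then the minor-series entry, else None
    return nlu_table.get(version, nlu_table.get(minor_version))


table = {"0.19": {"en": "0.2.0"}}  # hypothetical minor-series entry
assert resolve_compatibility(table, "0.19.3") == {"en": "0.2.0"}
assert resolve_compatibility(table, "0.20.0") is None
```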

snips_nlu/common/log_utils.py (+3)

```diff
@@ -1,3 +1,6 @@
+from __future__ import unicode_literals
+
+from builtins import str
 from datetime import datetime
 from functools import wraps
 
```

snips_nlu/data_augmentation.py (+1, -1)

```diff
@@ -88,7 +88,7 @@ def get_intent_entities(dataset, intent_name):
         for chunk in utterance[DATA]:
             if ENTITY in chunk:
                 intent_entities.add(chunk[ENTITY])
-    return intent_entities
+    return sorted(intent_entities)
 
 
 def num_queries_to_generate(dataset, intent_name, min_utterances):
```
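
This one-liner is part of the classification-accuracy fix: iteration order over a plain `set` of entity names can vary between interpreter runs (string hashing is randomized on Python 3), so anything seeded downstream saw a different entity order each time. `sorted()` pins the order:

```python
# Set iteration order can vary between runs; sorting makes it stable.
intent_entities = {"snips/datetime", "room", "activity"}
assert sorted(intent_entities) == ["activity", "room", "snips/datetime"]
```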

snips_nlu/dataset/validation.py (+1, -1)

```diff
@@ -128,7 +128,7 @@ def _validate_and_format_custom_entity(entity, queries_entities, language,
     validate_type(entity[AUTOMATICALLY_EXTENSIBLE], bool,
                   object_label="automatically_extensible")
     validate_type(entity[DATA], list, object_label="entity data")
-    validate_type(entity[MATCHING_STRICTNESS], float,
+    validate_type(entity[MATCHING_STRICTNESS], (float, int),
                   object_label="matching_strictness")
 
     formatted_entity = dict()
```
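
Assuming `validate_type` defers to an `isinstance`-style check, passing a tuple of types lets an integer `matching_strictness` (for example `1` instead of `1.0`) validate, which is how such values naturally appear in hand-written JSON datasets:

```python
# isinstance accepts a tuple of types, which the relaxed check relies on
assert isinstance(0.5, (float, int))
assert isinstance(1, (float, int))   # a plain int strictness now passes
assert not isinstance("1", (float, int))
```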

snips_nlu/intent_classifier/featurizer.py (+38, -68)

```diff
@@ -17,8 +17,7 @@
     json_string, fitted_required, replace_entities_with_placeholders,
     check_persisted_path)
 from snips_nlu.constants import (
-    DATA, END, ENTITY, ENTITY_KIND, LANGUAGE, NGRAM, RES_MATCH_RANGE,
-    RES_VALUE, START, TEXT, ENTITIES)
+    DATA, ENTITY, ENTITY_KIND, LANGUAGE, NGRAM, TEXT, ENTITIES)
 from snips_nlu.dataset import get_text_from_chunks, validate_and_format_dataset
 from snips_nlu.entity_parser.builtin_entity_parser import (
     is_builtin_entity)
@@ -264,7 +263,7 @@ def fit(self, x, dataset):
         self._init_vectorizer(self._language)
         self.builtin_entity_scope = set(
             e for e in dataset[ENTITIES] if is_builtin_entity(e))
-        preprocessed_data = self._preprocess(x, training=True)
+        preprocessed_data = self._preprocess(x)
         utterances = [
             self._enrich_utterance(u, builtin_ents, custom_ents, w_clusters)
             for u, builtin_ents, custom_ents, w_clusters
@@ -296,7 +295,7 @@ def fit_transform(self, x, dataset):
         self._init_vectorizer(self._language)
         self.builtin_entity_scope = set(
             e for e in dataset[ENTITIES] if is_builtin_entity(e))
-        preprocessed_data = self._preprocess(x, training=True)
+        preprocessed_data = self._preprocess(x)
         utterances = [
             self._enrich_utterance(u, builtin_ents, custom_ents, w_clusters)
             for u, builtin_ents, custom_ents, w_clusters
@@ -330,31 +329,30 @@ def transform(self, x):
             for data in zip(*self._preprocess(x))]
         return self._tfidf_vectorizer.transform(utterances)
 
-    def _preprocess(self, utterances, training=False):
+    def _preprocess(self, utterances):
         normalized_utterances = deepcopy(utterances)
         for u in normalized_utterances:
-            for chunk in u[DATA]:
+            nb_chunks = len(u[DATA])
+            for i, chunk in enumerate(u[DATA]):
                 chunk[TEXT] = _normalize_stem(
                     chunk[TEXT], self.language, self.resources,
                     self.config.use_stemming)
-
-        if training:
-            builtin_ents, custom_ents = zip(
-                *[_entities_from_utterance(u) for u in utterances])
-        else:
-            # Extract builtin entities on unormalized utterances
-            builtin_ents = [
-                self.builtin_entity_parser.parse(
-                    get_text_from_chunks(u[DATA]),
-                    self.builtin_entity_scope, use_cache=True)
-                for u in utterances
-            ]
-            # Extract builtin entities on normalized utterances
-            custom_ents = [
-                self.custom_entity_parser.parse(
-                    get_text_from_chunks(u[DATA]), use_cache=True)
-                for u in normalized_utterances
-            ]
+                if i < nb_chunks - 1:
+                    chunk[TEXT] += " "
+
+        # Extract builtin entities on unormalized utterances
+        builtin_ents = [
+            self.builtin_entity_parser.parse(
+                get_text_from_chunks(u[DATA]),
+                self.builtin_entity_scope, use_cache=True)
+            for u in utterances
+        ]
+        # Extract builtin entities on normalized utterances
+        custom_ents = [
+            self.custom_entity_parser.parse(
+                get_text_from_chunks(u[DATA]), use_cache=True)
+            for u in normalized_utterances
+        ]
         if self.config.word_clusters_name:
             # Extract world clusters on unormalized utterances
             original_utterances_text = [get_text_from_chunks(u[DATA])
@@ -582,7 +580,7 @@ def fit(self, x, dataset):
         self.builtin_entity_scope = set(
             e for e in dataset[ENTITIES] if is_builtin_entity(e))
 
-        preprocessed = self._preprocess(list(x), training=True)
+        preprocessed = self._preprocess(list(x))
         utterances = [
             self._enrich_utterance(utterance, builtin_ents, custom_ent)
             for utterance, builtin_ents, custom_ent in zip(*preprocessed)]
@@ -648,7 +646,7 @@ def transform(self, x):
         Raises:
             NotTrained: when the vectorizer is not fitted
         """
-        preprocessed = self._preprocess(x, training=False)
+        preprocessed = self._preprocess(x)
         utterances = [
             self._enrich_utterance(utterance, builtin_ents, custom_ent)
             for utterance, builtin_ents, custom_ent in zip(*preprocessed)]
@@ -661,24 +659,20 @@ def transform(self, x):
 
         return x_coo.tocsr()
 
-    def _preprocess(self, x, training=False):
-        if training:
-            builtin_ents, custom_ents = zip(
-                *[_entities_from_utterance(u) for u in x])
-        else:
-            # Extract all entities on unnormalized data
-            builtin_ents = [
-                self.builtin_entity_parser.parse(
-                    get_text_from_chunks(u[DATA]),
-                    self.builtin_entity_scope,
-                    use_cache=True
-                ) for u in x
-            ]
-            custom_ents = [
-                self.custom_entity_parser.parse(
-                    get_text_from_chunks(u[DATA]), use_cache=True)
-                for u in x
-            ]
+    def _preprocess(self, x):
+        # Extract all entities on unnormalized data
+        builtin_ents = [
+            self.builtin_entity_parser.parse(
+                get_text_from_chunks(u[DATA]),
+                self.builtin_entity_scope,
+                use_cache=True
+            ) for u in x
+        ]
+        custom_ents = [
+            self.custom_entity_parser.parse(
+                get_text_from_chunks(u[DATA]), use_cache=True)
+            for u in x
+        ]
         return x, builtin_ents, custom_ents
 
     def _extract_word_pairs(self, utterance):
@@ -805,27 +799,3 @@ def _get_word_cluster_features(query_tokens, clusters_name, resources):
         if cluster is not None:
             cluster_features.append(cluster)
     return cluster_features
-
-
-def _entities_from_utterance(utterance):
-    builtin_ents = []
-    custom_ents = []
-    current_ix = 0
-    for chunk in utterance[DATA]:
-        text = chunk[TEXT]
-        text_length = len(text)
-        if ENTITY in chunk:
-            ent = {
-                ENTITY_KIND: chunk[ENTITY],
-                RES_VALUE: text,
-                RES_MATCH_RANGE: {
-                    START: current_ix,
-                    END: current_ix + text_length
-                }
-            }
-            if is_builtin_entity(ent[ENTITY_KIND]):
-                builtin_ents.append(ent)
-            else:
-                custom_ents.append(ent)
-        current_ix += text_length
-    return builtin_ents, custom_ents
```
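
Two things happen in this file. First, the `training=True` branch (and the `_entities_from_utterance` helper it relied on) is gone, so entities are now obtained from the entity parsers at training time exactly as at prediction time; the earlier mismatch between annotation-derived training features and parser-derived runtime features is a plausible source of the accuracy regression the changelog mentions. Second, a separating space is re-appended to every non-final chunk so that joining the normalized chunks does not glue adjacent words together. A simplified sketch of that second point (chunk shapes reduced to bare strings):

```python
# Simplified illustration: normalization can strip the whitespace that
# separated chunks, so joining them corrupts the text the parsers see.
def join_chunks(chunks):
    return "".join(chunks)  # stands in for get_text_from_chunks


normalized = ["book a flight", "tomorrow"]  # separating space lost
assert join_chunks(normalized) == "book a flighttomorrow"  # unparseable

padded = [c + " " for c in normalized[:-1]] + normalized[-1:]
assert join_chunks(padded) == "book a flight tomorrow"
```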

snips_nlu/intent_classifier/log_reg_classifier.py (+1, -2)

```diff
@@ -87,13 +87,12 @@ def fit(self, dataset):
 
         none_class = max(classes)
         try:
-            self.featurizer = self.featurizer.fit(
+            x = self.featurizer.fit_transform(
                 dataset, utterances, classes, none_class)
         except _EmptyDatasetUtterancesError:
             self.featurizer = None
             return self
 
-        x = self.featurizer.transform(utterances)
         alpha = get_regularization_factor(dataset)
         self.classifier = SGDClassifier(random_state=random_state,
                                         alpha=alpha, **LOG_REG_ARGS)
```
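
Besides moving the featurization inside the `try` block (so an empty dataset short-circuits before any transform), this swaps the two-step `fit` then `transform` for a single `fit_transform`, the usual scikit-learn idiom to avoid preprocessing the same utterances twice:

```python
# Sketch of the fit/transform economy with a stock scikit-learn vectorizer.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["turn on the light", "what will the weather be like"]
x_twice = TfidfVectorizer().fit(docs).transform(docs)  # two passes over docs
x_once = TfidfVectorizer().fit_transform(docs)         # one pass, same result
assert (x_twice != x_once).nnz == 0
```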

snips_nlu/intent_classifier/log_reg_classifier_utils.py (+2, -18)

```diff
@@ -9,7 +9,7 @@
 import numpy as np
 from future.utils import iteritems, itervalues
 
-from snips_nlu.constants import (DATA, ENTITIES, ENTITY, INTENTS, TEXT,
+from snips_nlu.constants import (DATA, ENTITY, INTENTS, TEXT,
                                  UNKNOWNWORD, UTTERANCES)
 from snips_nlu.data_augmentation import augment_utterances
 from snips_nlu.dataset import get_text_from_chunks
@@ -108,22 +108,6 @@ def add_unknown_word_to_utterances(utterances, replacement_string,
     return new_utterances
 
 
-def get_dataset_specific_noise(dataset, resources):
-    """Return a noise list that excludes the dataset entity values"""
-    entities_values = set()
-    for ent_name, ent in iteritems(dataset[ENTITIES]):
-        if is_builtin_entity(ent_name):
-            continue
-        for k, v in iteritems(ent[UTTERANCES]):
-            entities_values.add(k)
-            entities_values.add(v)
-    original_noise = get_noise(resources)
-    specific_noise = [n for n in original_noise if n not in entities_values]
-    if not specific_noise:  # Avoid returning an empty noise
-        return original_noise
-    return specific_noise
-
-
 def build_training_data(dataset, language, data_augmentation_config, resources,
                         random_state):
     # Create class mapping
@@ -164,7 +148,7 @@ def build_training_data(dataset, language, data_augmentation_config, resources,
     )
 
     # Adding noise
-    noise = get_dataset_specific_noise(dataset, resources)
+    noise = get_noise(resources)
     noisy_utterances = generate_noise_utterances(
         augmented_utterances, noise, len(intents), data_augmentation_config,
         language, random_state)
```

snips_nlu/intent_parser/deterministic_intent_parser.py (+2, -1)

```diff
@@ -127,6 +127,7 @@ def fit(self, dataset, force_retrain=True):
         """Fits the intent parser with a valid Snips dataset"""
         logger.info("Fitting deterministic parser...")
         dataset = validate_and_format_dataset(dataset)
+        self.load_resources_if_needed(dataset[LANGUAGE])
         self.fit_builtin_entity_parser_if_needed(dataset)
         self.fit_custom_entity_parser_if_needed(dataset)
         self.language = dataset[LANGUAGE]
@@ -313,7 +314,7 @@ def _get_matching_result(self, text, processed_text, regex, intent,
         for group_name in found_result.groupdict():
             ref_group_name = group_name
             if "_" in group_name:
-                ref_group_name = group_name[:(len(group_name) - 2)]
+                ref_group_name = group_name.split("_")[0]
             slot_name = self.group_names_to_slot_names[ref_group_name]
             entity = self.slot_names_to_entities[intent][slot_name]
             rng = (found_result.start(group_name),
```
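
The second hunk fixes suffix stripping on regex group names. Slot capture groups carry an underscore-separated index, and the old slicing always removed exactly two characters, which only works while the index has a single digit. A sketch of the difference (the group naming is assumed from the `"_" in group_name` check above):

```python
# Old vs new suffix stripping; "group_12" is where the slicing broke.
def old_ref_name(group_name):
    return group_name[:(len(group_name) - 2)]


def new_ref_name(group_name):
    return group_name.split("_")[0]


assert old_ref_name("group_7") == "group"    # fine for single digits
assert old_ref_name("group_12") == "group_"  # stray "_" breaks the lookup
assert new_ref_name("group_12") == "group"
```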

snips_nlu/pipeline/configs/intent_parser.py (+3, -2)

```diff
@@ -1,7 +1,7 @@
 from __future__ import unicode_literals
 
 from snips_nlu.common.from_dict import FromDict
-from snips_nlu.constants import CUSTOM_ENTITY_PARSER_USAGE
+from snips_nlu.constants import CUSTOM_ENTITY_PARSER_USAGE, STOP_WORDS
 from snips_nlu.entity_parser import CustomEntityParserUsage
 from snips_nlu.pipeline.configs import ProcessingUnitConfig
 from snips_nlu.resources import merge_required_resources
@@ -84,7 +84,8 @@ def unit_name(self):
 
     def get_required_resources(self):
         return {
-            CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS
+            CUSTOM_ENTITY_PARSER_USAGE: CustomEntityParserUsage.WITHOUT_STEMS,
+            STOP_WORDS: self.ignore_stop_words
         }
 
     def to_dict(self):
```
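
This is the "wrong required resources" fix from the changelog: the `DeterministicIntentParser` strips stop words when `ignore_stop_words` is set, but its config never declared the stop-words resource, so a fitted engine could end up without it. A hedged sketch of the observable change (the `"stop_words"` key is assumed to be the value of the `STOP_WORDS` constant):

```python
# Sketch: the config now advertises the stop-words requirement.
from snips_nlu.pipeline.configs import DeterministicIntentParserConfig

config = DeterministicIntentParserConfig(ignore_stop_words=True)
resources = config.get_required_resources()
assert resources["stop_words"] is True  # key name assumed from STOP_WORDS
```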

snips_nlu/slot_filler/crf_slot_filler.py (+4)

```diff
@@ -7,6 +7,7 @@
 import shutil
 import tempfile
 from builtins import range
+from copy import deepcopy
 from pathlib import Path
 
 from future.utils import iteritems
@@ -48,6 +49,9 @@ class CRFSlotFiller(SlotFiller):
     def __init__(self, config=None, **shared):
         """The CRF slot filler can be configured by passing a
         :class:`.CRFSlotFillerConfig`"""
+        # The CRFSlotFillerConfig must be deep-copied as it is mutated when
+        # fitting the feature factories
+        config = deepcopy(config)
         super(CRFSlotFiller, self).__init__(config, **shared)
         self.crf_model = None
         self.features_factories = [
```
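
This is the config-mutation fix: without the copy, several units built from one `CRFSlotFillerConfig` share nested feature-factory structures, and fitting the first silently rewrites the config seen by the others. A sketch of the aliasing, with plain dicts standing in for the config object:

```python
# Aliasing demo: mutating shared nested state vs. working on a deep copy.
from copy import deepcopy

shared = {"feature_factories": [{"args": {}}]}
alias = shared  # what happens without the fix
alias["feature_factories"][0]["args"]["language"] = "en"  # "fit" mutates args
assert shared["feature_factories"][0]["args"] == {"language": "en"}

pristine = {"feature_factories": [{"args": {}}]}
copied = deepcopy(pristine)  # what the fix does in __init__
copied["feature_factories"][0]["args"]["language"] = "en"
assert pristine["feature_factories"][0]["args"] == {}
```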

snips_nlu/tests/test_crf_slot_filler.py (+1)

```diff
@@ -97,6 +97,7 @@ def test_should_get_sub_builtin_slots(self):
         - find me something from [start](9am) to [end](12pm)
         - I need a break from [start](2pm) until [end](4pm)
         - Can you suggest something from [start](april 4th) until [end](april 6th) ?
+        - find an activity from [start](6pm) to [end](8pm)
         - Book me a trip from [start](this friday) to [end](next tuesday)""")
         dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
         config = CRFSlotFillerConfig(random_seed=42)
```
