Merge pull request #287 from snipsco/hotfix/feature-extraction

Hotfix/feature extraction
snipsco · Jun 13, 2017 · 2449657 · 2449657
2 parents a651b16 + 1e04edd
commit 2449657
Show file tree

Hide file tree

Showing 3 changed files with 6 additions and 8 deletions.
diff --git a/snips_nlu/__version__ b/snips_nlu/__version__
@@ -1 +1 @@
-0.8.4
+0.8.5
diff --git a/snips_nlu/resources.py b/snips_nlu/resources.py
@@ -88,11 +88,9 @@ def load_clusters():
                 with io.open(path, encoding="utf8") as f:
                     _word_clusters[name] = dict()
                     for l in f:
-                        split = l.rstrip().lower().split("\t")
-                        normalized = " ".join(
-                            [t.value for t in tokenize(split[0])])
+                        split = l.rstrip().split("\t")
                         if len(split) == 2:
-                            _word_clusters[name][normalized] = split[1]
+                            _word_clusters[name][split[0]] = split[1]
 
 
 def get_word_clusters(language):

diff --git a/snips_nlu/slot_filler/feature_functions.py b/snips_nlu/slot_filler/feature_functions.py
@@ -100,7 +100,7 @@ def get_ngram_fn(n, use_stemming, language_code=None,
     def ngram(tokens, token_index):
         max_len = len(tokens)
         end = token_index + n
-        if 0 <= token_index < max_len and 0 < end <= max_len:
+        if 0 <= token_index < max_len and end <= max_len:
             if gazetteer is None:
                 if use_stemming:
                     return " ".join(t.stem.lower()
@@ -113,7 +113,7 @@ def ngram(tokens, token_index):
                 for t in tokens[token_index:end]:
                     lowered = t.stem.lower() if use_stemming else \
                         t.value.lower()
-                    words.append(lowered if t.value.lower() in gazetteer
+                    words.append(lowered if lowered in gazetteer
                                  else "rare_word")
                 return " ".join(words)
         return None
@@ -128,7 +128,7 @@ def get_shape_ngram_fn(n):
     def shape_ngram(tokens, token_index):
         max_len = len(tokens)
         end = token_index + n
-        if 0 <= token_index < max_len and 0 <= end < max_len:
+        if 0 <= token_index < max_len and end <= max_len:
             return " ".join(get_shape(t.value)
                             for t in tokens[token_index:end])
         return None