Commit 6a4a578

Merge pull request #295 from snipsco/develop

Release 0.8.6

ClemDoum authored Jun 14, 2017
2 parents 2449657 + 242c127 commit 6a4a578
Showing 7 changed files with 239 additions and 44 deletions.
42 changes: 17 additions & 25 deletions README.md
@@ -1,4 +1,4 @@
# Snips NLU (0.8.2)
# Snips NLU (0.8.6)

[![Build Status](https://jenkins2.snips.ai/buildStatus/icon?job=SDK/snips-nlu/master)](https://jenkins2.snips.ai/job/SDK/job/snips-nlu/view/Branches/job/master)

@@ -8,22 +8,8 @@ Python wheels of the `snips-nlu` package can be found on the nexus repository at

You will need to be signed in to access the repo.

You'll need the tensorflow C library installed; here are commands to install it on various OSs:

```bash
$ brew install libtensorflow # macOS
$ yaourt -S tensorflow # Arch Linux
$ DEB=libtensorflow_1.0.1-snips-2_`dpkg --print-architecture`.deb \
&& wget https://s3.amazonaws.com/snips/tensorflow-deb/$DEB \
&& sudo dpkg -i $DEB # Ubuntu
```

## Development

### Dependencies

You'll need the tensorflow C library installed; refer to the previous section to install it.

### Installation
Create a virtual env:

@@ -68,17 +54,13 @@ The NLU Engine can be initialized in two ways:

- Or you can load an already trained engine:
```python
engine = SnipsNLUEngine.load_from(
language='en',
customs=customs_dict,
builtin_path='path/to/builtin'
)
engine = SnipsNLUEngine.from_dict(engine_as_dict)
```

### Serialization
The NLU Engine exposes an API for persisting the engine as a dictionary:
```python
engine_dict = engine.to_dict()
engine_as_dict = engine.to_dict()
```
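
Taken together with `from_dict` above, a full persistence round-trip might look like the sketch below; the `json` storage and the import path are illustrative assumptions, and a trained `engine` is assumed to exist:
```python
import json

from snips_nlu import SnipsNLUEngine  # import path assumed for illustration

# Persist the trained engine to disk as JSON
engine_as_dict = engine.to_dict()
with open("engine.json", "w") as f:
    json.dump(engine_as_dict, f)

# Later, restore it without retraining
with open("engine.json") as f:
    engine = SnipsNLUEngine.from_dict(json.load(f))
```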

### Parsing
@@ -93,14 +75,24 @@ engine_dict = engine.to_dict()
# }
# "slots": [
# {
# "value": "on",
# "value": {
# "kind": "Custom",
# "value": "on"
# },
# "raw_value": "on",
# "range": [5, 7],
# "slot_name": "on_off",
# "entity": "on_off"
# "slot_name": "light_on_off",
# },
# {
# "value": "kitchen",
# "value": {
# "kind": "Custom",
# "value": "kitchen"
# },
# "raw_value": "kitchen",
# "range": [25, 32],
# "slot_name": "room",
# "entity": "light",
# "slot_name": "light_room",
# }
# ]
# }
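For context, a sketch of the call that yields the output above. The input text is reconstructed from the slot ranges ([5, 7] covers "on", [25, 32] covers "kitchen") and is an assumption, since the diff collapses the line that contains it:
```python
parsing = engine.parse("Turn on the light in the kitchen")
print(parsing["slots"][0]["raw_value"])  # "on"
print(parsing["slots"][1]["value"])      # {"kind": "Custom", "value": "kitchen"}
```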
2 changes: 1 addition & 1 deletion snips_nlu/__version__
@@ -1 +1 @@
0.8.5
0.8.6
9 changes: 8 additions & 1 deletion snips_nlu/dataset.py
@@ -84,14 +84,19 @@ def validate_and_format_custom_entity(entity):
validate_type(entity[USE_SYNONYMS], bool)
validate_type(entity[AUTOMATICALLY_EXTENSIBLE], bool)
validate_type(entity[DATA], list)
valid_entity_data = []
for entry in entity[DATA]:
validate_type(entry, dict)
validate_keys(entry, [VALUE, SYNONYMS],
object_label="entity entry")
if len(entry[VALUE]) == 0:
continue
entry[SYNONYMS] = [s for s in entry[SYNONYMS] if len(s) > 0]
validate_type(entry[SYNONYMS], list)
if entry[VALUE] not in entry[SYNONYMS]:
entry[SYNONYMS].append(entry[VALUE])

valid_entity_data.append(entry)
entity[DATA] = valid_entity_data
return entity


@@ -123,6 +128,8 @@ def filter_dataset(dataset, engine_type=None, min_utterances=0):


def add_entity_value_if_missing(value, entity):
if len(value) == 0:
return
if entity[USE_SYNONYMS]:
entity_values = set(v for entry in entity[DATA] for v in
entry[SYNONYMS] + [entry[VALUE]])
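In effect, the new validation drops entries whose value is empty, strips empty synonyms, and ensures each value is listed among its own synonyms. A standalone sketch of that behaviour, with the module's constants (`VALUE`, `SYNONYMS`, `DATA`) replaced by literal keys:
```python
entity = {
    "use_synonyms": True,
    "automatically_extensible": False,
    "data": [
        {"value": "entity 1", "synonyms": [""]},  # empty synonym is stripped
        {"value": "", "synonyms": []},            # empty value: entry dropped
    ],
}

valid_entity_data = []
for entry in entity["data"]:
    if len(entry["value"]) == 0:
        continue
    entry["synonyms"] = [s for s in entry["synonyms"] if len(s) > 0]
    if entry["value"] not in entry["synonyms"]:
        entry["synonyms"].append(entry["value"])
    valid_entity_data.append(entry)
entity["data"] = valid_entity_data

assert entity["data"] == [{"value": "entity 1", "synonyms": ["entity 1"]}]
```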
11 changes: 6 additions & 5 deletions snips_nlu/intent_parser/regex_intent_parser.py
@@ -98,6 +98,7 @@ def get_joined_entity_utterances(dataset):
else:
utterances = [entry[VALUE] for entry in entity[DATA]]
utterances_patterns = [regex_escape(e) for e in utterances]
utterances_patterns = [p for p in utterances_patterns if len(p) > 0]
joined_entity_utterances[entity_name] = r"|".join(
sorted(utterances_patterns, key=len, reverse=True))
return joined_entity_utterances
@@ -127,12 +128,12 @@ def get_builtin_entity_name(entity_label):
return "%%%s%%" % "".join(tokenize_light(entity_label)).upper()


def preprocess_builtin_entities(utterance):
def preprocess_builtin_entities(utterance, language):
new_utterance = deepcopy(utterance)
for i, chunk in enumerate(utterance[DATA]):
if ENTITY in chunk and is_builtin_entity(chunk[ENTITY]):
new_utterance[DATA][i][TEXT] = get_builtin_entity_name(
chunk[ENTITY])
_, processed_chunk_text = replace_builtin_entities(chunk[TEXT],
language)
new_utterance[DATA][i][TEXT] = processed_chunk_text
return new_utterance


@@ -200,7 +201,7 @@ def fit(self, dataset, intents=None):
if intent_name not in intents_to_train:
self.regexes_per_intent[intent_name] = []
continue
utterances = [preprocess_builtin_entities(u)
utterances = [preprocess_builtin_entities(u, self.language)
for u in intent[UTTERANCES]]
regexes, self.group_names_to_slot_names = generate_regexes(
utterances, joined_entity_utterances,
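Filtering empty utterance patterns matters because an empty alternative makes the joined regex match the empty string at every position. A minimal sketch of the joining step, assuming `regex_escape` behaves like Python's `re.escape`:
```python
import re

utterances = ["living room", "kitchen", ""]
patterns = [re.escape(u) for u in utterances]
patterns = [p for p in patterns if len(p) > 0]  # drop empty patterns

# Longest alternatives first, so "living room" wins over shorter prefixes
joined = r"|".join(sorted(patterns, key=len, reverse=True))
assert re.match(joined, "kitchen") is not None
assert re.match(joined, "") is None  # would match if "" had been kept
```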
7 changes: 3 additions & 4 deletions snips_nlu/slot_filler/features_utils.py
@@ -101,7 +101,6 @@ def initial_string_from_tokens(tokens):
current_index = t.end
return s


LOWER_REGEX = re.compile(r"^[^A-Z]+$")
UPPER_REGEX = re.compile(r"^[^a-z]+$")
TITLE_REGEX = re.compile(r"^[A-Z][^A-Z]+$")
LOWER_REGEX = re.compile(r"^[a-z]+$")
UPPER_REGEX = re.compile(r"^[A-Z]+$")
TITLE_REGEX = re.compile(r"^[A-Z][a-z]+$")
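
The regex rewrite tightens the casing features: the old negated classes also matched digits and punctuation. A quick standalone check of the difference, reusing the old and new `LOWER_REGEX` definitions from this diff:
```python
import re

OLD_LOWER_REGEX = re.compile(r"^[^A-Z]+$")  # old: anything but uppercase
NEW_LOWER_REGEX = re.compile(r"^[a-z]+$")   # new: lowercase letters only

assert OLD_LOWER_REGEX.match("hello") and NEW_LOWER_REGEX.match("hello")
assert OLD_LOWER_REGEX.match("123!")          # false positive under the old pattern
assert NEW_LOWER_REGEX.match("123!") is None  # rejected by the new pattern
```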
112 changes: 110 additions & 2 deletions snips_nlu/tests/test_dataset.py
@@ -139,7 +139,7 @@ def test_should_format_dataset_by_adding_synonyms(self):
"data": [
{
"value": "entity 1",
"synonyms": []
"synonyms": ["entity 2"]
}
],
"use_synonyms": True,
@@ -157,7 +157,7 @@ def test_should_format_dataset_by_adding_synonyms(self):
"data": [
{
"value": "entity 1",
"synonyms": ["entity 1"]
"synonyms": ["entity 2", "entity 1"]
}
],
"use_synonyms": True,
@@ -455,6 +455,114 @@ def test_should_not_require_data_for_builtin_entities(self):
except:
self.fail("Could not validate dataset")

def test_should_remove_empty_entities_value_and_empty_synonyms(self):
# Given
dataset = {
"intents": {
"intent1": {
"utterances": [
{
"data": [
{
"text": "this is ",
},
{
"text": "",
"entity": "entity1",
"slot_name": "slot1"
}
]
},
{
"data": [
{
"text": "this is ",
},
{
"text": "entity 1",
"entity": "entity1",
"slot_name": "slot1"
}
]
}
],
"engineType": CUSTOM_ENGINE
}
},
"entities": {
"entity1": {
"data": [
{
"value": "entity 1",
"synonyms": [""]
},
{
"value": "",
"synonyms": []
}
],
"use_synonyms": False,
"automatically_extensible": False
}
},
"language": "en",
"snips_nlu_version": "0.0.1"
}

expected_dataset = {
"intents": {
"intent1": {
"utterances": [
{
"data": [
{
"text": "this is ",
},
{
"text": "",
"entity": "entity1",
"slot_name": "slot1"
}
]
},
{
"data": [
{
"text": "this is ",
},
{
"text": "entity 1",
"entity": "entity1",
"slot_name": "slot1"
}
]
}
],
"engineType": CUSTOM_ENGINE
}
},
"entities": {
"entity1": {
"data": [
{
"value": "entity 1",
"synonyms": ["entity 1"]
}
],
"use_synonyms": False,
"automatically_extensible": False
}
},
"language": "en",
"snips_nlu_version": "0.0.1"
}

# When
dataset = validate_and_format_dataset(dataset)

# Then
self.assertEqual(dataset, expected_dataset)


if __name__ == '__main__':
unittest.main()