Commit 6a4a578

Merge pull request #295 from snipsco/develop

Release 0.8.6

ClemDoum authored Jun 14, 2017
2 parents 2449657 + 242c127 commit 6a4a578
Showing 7 changed files with 239 additions and 44 deletions.
42 changes: 17 additions & 25 deletions README.md
@@ -1,4 +1,4 @@
# Snips NLU (0.8.2)
# Snips NLU (0.8.6)

[![Build Status](https://jenkins2.snips.ai/buildStatus/icon?job=SDK/snips-nlu/master)](https://jenkins2.snips.ai/job/SDK/job/snips-nlu/view/Branches/job/master)

@@ -8,22 +8,8 @@ Python wheels of the `snips-nlu` package can be found on the nexus repository at

You will need to be signed in to access the repo.

You'll need the tensorflow C library installed; here are commands to install it on various OSs:

```bash
$ brew install libtensorflow # macOS
$ yaourt -S tensorflow # Arch Linux
$ DEB=libtensorflow_1.0.1-snips-2_`dpkg --print-architecture`.deb \
&& wget https://s3.amazonaws.com/snips/tensorflow-deb/$DEB \
&& sudo dpkg -i $DEB # Ubuntu
```

## Development

### Dependencies

You'll need the tensorflow C library installed; refer to the previous section to install it.

### Installation
Create a virtual env:

@@ -68,17 +54,13 @@ The NLU Engine can be initialized in two ways:

- Or you can load an already trained engine:
```python
engine = SnipsNLUEngine.load_from(
language='en',
customs=customs_dict,
builtin_path='path/to/builtin'
)
engine = SnipsNLUEngine.from_dict(engine_as_dict)
```

### Serialization
The NLU Engine exposes an API for persisting the engine as a dictionary:
```python
engine_dict = engine.to_dict()
engine_as_dict = engine.to_dict()
```
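
Taken together with `from_dict` above, a full persistence round-trip might look like the sketch below; the `json` storage and the import path are illustrative assumptions, and a trained `engine` is assumed to exist:
```python
import json

from snips_nlu import SnipsNLUEngine  # import path assumed for illustration

# Persist the trained engine to disk as JSON
engine_as_dict = engine.to_dict()
with open("engine.json", "w") as f:
    json.dump(engine_as_dict, f)

# Later, restore it without retraining
with open("engine.json") as f:
    engine = SnipsNLUEngine.from_dict(json.load(f))
```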

### Parsing
@@ -93,14 +75,24 @@ engine_dict = engine.to_dict()
# }
# "slots": [
# {
# "value": "on",
# "value": {
# "kind": "Custom",
# "value": "on"
# },
# "raw_value": "on",
# "range": [5, 7],
# "slot_name": "on_off",
# "entity": "on_off"
# "slot_name": "light_on_off",
# },
# {
# "value": "kitchen",
# "value": {
# "kind": "Custom",
# "value": "kitchen"
# },
# "raw_value": "kitchen",
# "range": [25, 32],
# "slot_name": "room",
# "entity": "light",
# "slot_name": "light_room",
# }
# ]
# }
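For context, a sketch of the call that yields the output above. The input text is reconstructed from the slot ranges ([5, 7] covers "on", [25, 32] covers "kitchen") and is an assumption, since the diff collapses the line that contains it:
```python
parsing = engine.parse("Turn on the light in the kitchen")
print(parsing["slots"][0]["raw_value"])  # "on"
print(parsing["slots"][1]["value"])      # {"kind": "Custom", "value": "kitchen"}
```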
2 changes: 1 addition & 1 deletion snips_nlu/__version__
@@ -1 +1 @@
0.8.5
0.8.6
9 changes: 8 additions & 1 deletion snips_nlu/dataset.py
@@ -84,14 +84,19 @@ def validate_and_format_custom_entity(entity):
validate_type(entity[USE_SYNONYMS], bool)
validate_type(entity[AUTOMATICALLY_EXTENSIBLE], bool)
validate_type(entity[DATA], list)
valid_entity_data = []
for entry in entity[DATA]:
validate_type(entry, dict)
validate_keys(entry, [VALUE, SYNONYMS],
object_label="entity entry")
if len(entry[VALUE]) == 0:
continue
entry[SYNONYMS] = [s for s in entry[SYNONYMS] if len(s) > 0]
validate_type(entry[SYNONYMS], list)
if entry[VALUE] not in entry[SYNONYMS]:
entry[SYNONYMS].append(entry[VALUE])

valid_entity_data.append(entry)
entity[DATA] = valid_entity_data
return entity


@@ -123,6 +128,8 @@ def filter_dataset(dataset, engine_type=None, min_utterances=0):


def add_entity_value_if_missing(value, entity):
if len(value) == 0:
return
if entity[USE_SYNONYMS]:
entity_values = set(v for entry in entity[DATA] for v in
entry[SYNONYMS] + [entry[VALUE]])
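In effect, the new validation drops entries whose value is empty, strips empty synonyms, and ensures each value is listed among its own synonyms. A standalone sketch of that behaviour, with the module's constants (`VALUE`, `SYNONYMS`, `DATA`) replaced by literal keys:
```python
entity = {
    "use_synonyms": True,
    "automatically_extensible": False,
    "data": [
        {"value": "entity 1", "synonyms": [""]},  # empty synonym is stripped
        {"value": "", "synonyms": []},            # empty value: entry dropped
    ],
}

valid_entity_data = []
for entry in entity["data"]:
    if len(entry["value"]) == 0:
        continue
    entry["synonyms"] = [s for s in entry["synonyms"] if len(s) > 0]
    if entry["value"] not in entry["synonyms"]:
        entry["synonyms"].append(entry["value"])
    valid_entity_data.append(entry)
entity["data"] = valid_entity_data

assert entity["data"] == [{"value": "entity 1", "synonyms": ["entity 1"]}]
```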
11 changes: 6 additions & 5 deletions snips_nlu/intent_parser/regex_intent_parser.py
@@ -98,6 +98,7 @@ def get_joined_entity_utterances(dataset):
else:
utterances = [entry[VALUE] for entry in entity[DATA]]
utterances_patterns = [regex_escape(e) for e in utterances]
utterances_patterns = [p for p in utterances_patterns if len(p) > 0]
joined_entity_utterances[entity_name] = r"|".join(
sorted(utterances_patterns, key=len, reverse=True))
return joined_entity_utterances
@@ -127,12 +128,12 @@ def get_builtin_entity_name(entity_label):
return "%%%s%%" % "".join(tokenize_light(entity_label)).upper()


def preprocess_builtin_entities(utterance):
def preprocess_builtin_entities(utterance, language):
new_utterance = deepcopy(utterance)
for i, chunk in enumerate(utterance[DATA]):
if ENTITY in chunk and is_builtin_entity(chunk[ENTITY]):
new_utterance[DATA][i][TEXT] = get_builtin_entity_name(
chunk[ENTITY])
_, processed_chunk_text = replace_builtin_entities(chunk[TEXT],
language)
new_utterance[DATA][i][TEXT] = processed_chunk_text
return new_utterance


@@ -200,7 +201,7 @@ def fit(self, dataset, intents=None):
if intent_name not in intents_to_train:
self.regexes_per_intent[intent_name] = []
continue
utterances = [preprocess_builtin_entities(u)
utterances = [preprocess_builtin_entities(u, self.language)
for u in intent[UTTERANCES]]
regexes, self.group_names_to_slot_names = generate_regexes(
utterances, joined_entity_utterances,
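Filtering empty utterance patterns matters because an empty alternative makes the joined regex match the empty string at every position. A minimal sketch of the joining step, assuming `regex_escape` behaves like Python's `re.escape`:
```python
import re

utterances = ["living room", "kitchen", ""]
patterns = [re.escape(u) for u in utterances]
patterns = [p for p in patterns if len(p) > 0]  # drop empty patterns

# Longest alternatives first, so "living room" wins over shorter prefixes
joined = r"|".join(sorted(patterns, key=len, reverse=True))
assert re.match(joined, "kitchen") is not None
assert re.match(joined, "") is None  # would match if "" had been kept
```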
7 changes: 3 additions & 4 deletions snips_nlu/slot_filler/features_utils.py
@@ -101,7 +101,6 @@ def initial_string_from_tokens(tokens):
current_index = t.end
return s


LOWER_REGEX = re.compile(r"^[^A-Z]+$")
UPPER_REGEX = re.compile(r"^[^a-z]+$")
TITLE_REGEX = re.compile(r"^[A-Z][^A-Z]+$")
LOWER_REGEX = re.compile(r"^[a-z]+$")
UPPER_REGEX = re.compile(r"^[A-Z]+$")
TITLE_REGEX = re.compile(r"^[A-Z][a-z]+$")
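
The regex rewrite tightens the casing features: the old negated classes also matched digits and punctuation. A quick standalone check of the difference, reusing the old and new `LOWER_REGEX` definitions from this diff:
```python
import re

OLD_LOWER_REGEX = re.compile(r"^[^A-Z]+$")  # old: anything but uppercase
NEW_LOWER_REGEX = re.compile(r"^[a-z]+$")   # new: lowercase letters only

assert OLD_LOWER_REGEX.match("hello") and NEW_LOWER_REGEX.match("hello")
assert OLD_LOWER_REGEX.match("123!")          # false positive under the old pattern
assert NEW_LOWER_REGEX.match("123!") is None  # rejected by the new pattern
```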
112 changes: 110 additions & 2 deletions snips_nlu/tests/test_dataset.py
@@ -139,7 +139,7 @@ def test_should_format_dataset_by_adding_synonyms(self):
"data": [
{
"value": "entity 1",
"synonyms": []
"synonyms": ["entity 2"]
}
],
"use_synonyms": True,
@@ -157,7 +157,7 @@ def test_should_format_dataset_by_adding_synonyms(self):
"data": [
{
"value": "entity 1",
"synonyms": ["entity 1"]
"synonyms": ["entity 2", "entity 1"]
}
],
"use_synonyms": True,
@@ -455,6 +455,114 @@ def test_should_not_require_data_for_builtin_entities(self):
except:
self.fail("Could not validate dataset")

def test_should_remove_empty_entities_value_and_empty_synonyms(self):
# Given
dataset = {
"intents": {
"intent1": {
"utterances": [
{
"data": [
{
"text": "this is ",
},
{
"text": "",
"entity": "entity1",
"slot_name": "slot1"
}
]
},
{
"data": [
{
"text": "this is ",
},
{
"text": "entity 1",
"entity": "entity1",
"slot_name": "slot1"
}
]
}
],
"engineType": CUSTOM_ENGINE
}
},
"entities": {
"entity1": {
"data": [
{
"value": "entity 1",
"synonyms": [""]
},
{
"value": "",
"synonyms": []
}
],
"use_synonyms": False,
"automatically_extensible": False
}
},
"language": "en",
"snips_nlu_version": "0.0.1"
}

expected_dataset = {
"intents": {
"intent1": {
"utterances": [
{
"data": [
{
"text": "this is ",
},
{
"text": "",
"entity": "entity1",
"slot_name": "slot1"
}
]
},
{
"data": [
{
"text": "this is ",
},
{
"text": "entity 1",
"entity": "entity1",
"slot_name": "slot1"
}
]
}
],
"engineType": CUSTOM_ENGINE
}
},
"entities": {
"entity1": {
"data": [
{
"value": "entity 1",
"synonyms": ["entity 1"]
}
],
"use_synonyms": False,
"automatically_extensible": False
}
},
"language": "en",
"snips_nlu_version": "0.0.1"
}

# When
dataset = validate_and_format_dataset(dataset)

# Then
self.assertEqual(dataset, expected_dataset)


if __name__ == '__main__':
unittest.main()