Commit 64f36cef authored by Jeroen F.J. Laros's avatar Jeroen F.J. Laros

Merge branch 'iter' into 'master'

Iter

See merge request !1
parents 71c8b85c 19352723
# Trie implementation using nested dictionaries
This library provides a [trie](https://en.wikipedia.org/wiki/Trie)
implementation using nested dictionaries. Apart from the basic operations, a
number of functions for *approximate matching* are implemented.
## Installation
Via [pypi](https://pypi.python.org/pypi/dict-trie):
pip install dict-trie
From source:
git clone https://git.lumc.nl/j.f.j.laros/dict-trie.git
cd dict-trie
pip install .
## Usage
The library provides the `Trie` class. Full documentation can be found
[here](https://git.lumc.nl/j.f.j.laros/dict-trie).
### Basic operations
Initialisation of the trie is done via the constructor by providing a list of
words.
```python
>>> from dict_trie import Trie
>>>
>>> trie = Trie(['abc', 'te', 'test'])
```
Alternatively, an empty trie can be made to which words can be added with the
`add` function.
```python
>>> trie = Trie()
>>> trie.add('abc')
>>> trie.add('te')
>>> trie.add('test')
```
Membership can be tested with the `in` operator.
```python
>>> 'abc' in trie
True
```
Test whether a prefix is present by using the `has_prefix` function.
```python
>>> trie.has_prefix('ab')
True
```
Remove a word from the trie with the `remove` function. This function returns
`False` if the word was not in the trie.
```python
>>> trie.remove('abc')
True
>>> 'abc' in trie
False
>>> trie.remove('abc')
False
```
Iterate over all words in a trie.
```python
>>> list(trie)
['abc', 'te', 'test']
```
### Approximate matching
A trie can be used to efficiently find a word that is similar to a query word.
This is implemented via a number of functions that search for a word, allowing
a given number of mismatches. These functions are divided into two families, one
using the Hamming distance which only allows substitutions, the other using the
Levenshtein distance which allows substitutions, insertions and deletions.
To find a word that has at most Hamming distance 2 to the word 'abe', the
`hamming` function is used.
```python
>>> trie = Trie(['abc', 'aaa', 'ccc'])
>>> trie.hamming('abe', 2)
'aaa'
```
To get all words that have at most Hamming distance 2 to the word 'abe', the
`all_hamming` function is used. This function returns a generator.
```python
>>> list(trie.all_hamming('abe', 2))
['aaa', 'abc']
```
In order to find a word that is closest to the query word, the `best_hamming`
function is used. In this case a word with distance 1 is returned.
```python
>>> trie.best_hamming('abe', 2)
'abc'
```
The functions `levenshtein`, `all_levenshtein` and `best_levenshtein` are used
in a similar way.
### Other functionalities
A trie can be populated with all words of a fixed length over an alphabet by
using the `fill` function.
```python
>>> trie = Trie()
>>> trie.fill(('a', 'b'), 2)
>>> list(trie)
['aa', 'ab', 'ba', 'bb']
```
The trie data structure can be accessed via the `root` member variable.
```python
>>> trie.root
{'a': {'a': {'': {}}, 'b': {'': {}}}, 'b': {'a': {'': {}}, 'b': {'': {}}}}
>>> trie.root.keys()
['a', 'b']
```
""" """dict-trie: Basic implementation of a trie.
dict-trie: Basic implementation of a trie.
Copyright (c) 2017 Leiden University Medical Center <humgen@lumc.nl> Copyright (c) 2017 Leiden University Medical Center <humgen@lumc.nl>
......
def _hamming(path, node, word, distance): def _add(root, word):
"""Add a word to the trie.
:arg dict root: Root of the trie.
:arg str word: A word.
""" """
Find the first path in the trie that is within a certain hamming distance node = root
of {word}. Note that this does not necessarily the one with the smallest
distance. for char in word:
if char not in node:
node[char] = {}
node = node[char]
node[''] = {}
def _find(root, word):
"""Find the node after following the path in the trie given by {word}.
:arg dict root: Root of the trie.
:arg str word: A word.
:returns dict: The node if found, {} otherwise.
"""
node = root
for char in word:
if char not in node:
return {}
node = node[char]
return node
def _remove(node, word):
"""Remove a word from a trie.
:arg dict node: Current node.
:arg str word: Word to be removed.
:returns bool:
"""
if not word:
if '' in node:
node.pop('')
return True
return False
car, cdr = word[0], word[1:]
if car not in node:
return False
result = _remove(node[car], cdr)
if result:
if not node[car]:
node.pop(car)
return result
def _iterate(path, node):
"""Convert a trie into a list.
:arg str path: Path taken so far to reach the current node.
:arg dict node: Current node.
:returns iter: All words in the trie.
"""
if '' in node:
yield path
for char in node:
for result in _iterate(path + char, node[char]):
yield result
def _fill(node, alphabet, length):
"""Make a full trie using the characters in {alphabet}.
:arg dict node: Current node.
:arg tuple alphabet: Used alphabet.
:arg int length: Length of the words to be generated.
:returns iter: Trie containing all words of length {length} over alphabet
{alphabet}.
"""
if not length:
node[''] = {}
return
for char in alphabet:
node[char] = {}
_fill(node[char], alphabet, length - 1)
def _hamming(path, node, word, distance):
"""Find all paths in the trie that are within a certain hamming distance of
{word}.
:arg str path: Path taken so far to reach the current node. :arg str path: Path taken so far to reach the current node.
:arg dict node: Current node. :arg dict node: Current node.
:arg str word: Query word. :arg str word: Query word.
:arg int distance: Amount of errors we can still make. :arg int distance: Amount of allowed errors.
:returns str: A word in the trie that has Hamming distance of at most :returns iter: All word in the trie that have Hamming distance of at most
{distance} to {word}. {distance} to {word}.
""" """
if distance < 0: if distance < 0:
return '' return
if not word: if not word:
return path if '' in node else '' if '' in node:
yield path
return
car, cdr = word[0], word[1:] car, cdr = word[0], word[1:]
for char in node: for char in node:
result = _hamming( for result in _hamming(
path + char, node[char], cdr, distance - int(char != car)) path + char, node[char], cdr, distance - int(char != car)):
if result: yield result
return result
return ''
def _levenshtein(path, node, word, distance): def _levenshtein(path, node, word, distance):
""" """Find all paths in the trie that are within a certain Levenshtein
distance of {word}.
:arg str path: Path taken so far to reach the current node.
:arg dict node: Current node.
:arg str word: Query word.
:arg int distance: Amount of allowed errors.
:returns iter: All words in the trie that have Levenshtein distance of at most
{distance} to {word}.
""" """
if distance < 0: if distance < 0:
return '' return
if not word: if not word:
return path if '' in node else '' if '' in node:
yield path
car, cdr = word[0], word[1:] car, cdr = '', ''
else:
car, cdr = word[0], word[1:]
# Deletion. # Deletion.
result = _levenshtein(path, node, cdr, distance - 1) for result in _levenshtein(path, node, cdr, distance - 1):
if result: yield result
return result
for char in node: for char in node:
# Substitution and insertion. # Substitution.
result = ( if car:
_levenshtein( for result in _levenshtein(
path + char, node[char], cdr, distance - int(char != car)) or path + char, node[char], cdr, distance - int(char != car)):
_levenshtein(path + char, node[char], word, distance - 1)) yield result
if result: # Insertion.
return result for result in _levenshtein(
path + char, node[char], word, distance - 1):
return '' yield result
class Trie(object): class Trie(object):
def __init__(self, words): def __init__(self, words=None):
""" """Initialise the class.
Initialise the class.
:arg list words: List of words. :arg list words: List of words.
""" """
self.root = {} self.root = {}
self._build(words) if words:
for word in words:
def _build(self, words): self.add(word)
"""
Build the trie.
:arg list words: List of words.
"""
for word in words:
self.add(word)
def _find(self, word):
"""
Find the node after following the path in the trie given by {word}.
:arg str word: A word.
:returns dict: The node if found, {} otherwise.
"""
node = self.root
for char in word:
if char not in node:
return {}
node = node[char]
return node
def __contains__(self, word): def __contains__(self, word):
return '' in self._find(word) return '' in _find(self.root, word)
def __iter__(self):
return _iterate('', self.root)
def add(self, word): def add(self, word):
""" _add(self.root, word)
Add a word to the trie.
:arg str word: A word. def remove(self, word):
""" return _remove(self.root, word)
node = self.root
for char in word: def has_prefix(self, word):
if char not in node: return _find(self.root, word) != {}
node[char] = {}
node = node[char]
node[''] = {} def fill(self, alphabet, length):
_fill(self.root, alphabet, length)
def has_prefix(self, word): def all_hamming(self, word, distance):
return self._find(word) != {} return _hamming('', self.root, word, distance)
def hamming(self, word, distance): def hamming(self, word, distance):
return _hamming('', self.root, word, distance) try:
return self.all_hamming(word, distance).next()
except StopIteration:
return ''
def best_hamming(self, word, distance): def best_hamming(self, word, distance):
""" """Find the best match with {word} in the trie.
Find the best match with {word} in the trie.
:arg str word: Query word. :arg str word: Query word.
:arg int distance: Amount of errors we can still make. :arg int distance: Maximum allowed distance.
:returns str: Best match with {word}. :returns str: Best match with {word}.
""" """
if word in self: if _find(self.root, word):
return word return word
for i in range(1, distance + 1): for i in range(1, distance + 1):
...@@ -134,19 +213,24 @@ class Trie(object): ...@@ -134,19 +213,24 @@ class Trie(object):
return '' return ''
def levenshtein(self, word, distance): def all_levenshtein(self, word, distance):
return _levenshtein('', self.root, word, distance) return _levenshtein('', self.root, word, distance)
def levenshtein(self, word, distance):
try:
return self.all_levenshtein(word, distance).next()
except StopIteration:
return ''
def best_levenshtein(self, word, distance): def best_levenshtein(self, word, distance):
""" """Find the best match with {word} in the trie.
Find the best match with {word} in the trie.
:arg str word: Query word. :arg str word: Query word.
:arg int distance: Amount of errors we can still make. :arg int distance: Maximum allowed distance.
:returns str: Best match with {word}. :returns str: Best match with {word}.
""" """
if word in self: if _find(self.root, word):
return word return word
for i in range(1, distance + 1): for i in range(1, distance + 1):
......
""" """Tests for the trie library.
Tests for the trie library.
""" """
#from __future__ import ( #from __future__ import (
# absolute_import, division, print_function, unicode_literals) # absolute_import, division, print_function, unicode_literals)
...@@ -11,6 +10,9 @@ class TestTrie(object): ...@@ -11,6 +10,9 @@ class TestTrie(object):
def setup(self): def setup(self):
self._trie = Trie(['abc', 'abd', 'test', 'te']) self._trie = Trie(['abc', 'abd', 'test', 'te'])
def test_empty(self):
assert Trie().root == {}
def test_root(self): def test_root(self):
assert self._trie.root == { assert self._trie.root == {
'a': { 'a': {
...@@ -54,6 +56,44 @@ class TestTrie(object): ...@@ -54,6 +56,44 @@ class TestTrie(object):
def test_prefix_order(self): def test_prefix_order(self):
assert Trie(['test', 'te']).root == Trie(['te', 'test']).root assert Trie(['test', 'te']).root == Trie(['te', 'test']).root
def test_add(self):
self._trie.add('abx')
assert 'abx' in self._trie
def test_remove_present(self):
assert self._trie.remove('test')
assert 'test' not in self._trie
assert 'te' in self._trie
def test_remove_prefix_present(self):
assert self._trie.remove('te')
assert 'te' not in self._trie
assert 'test' in self._trie
def test_remove_absent(self):
assert not self._trie.remove('xxxx')
def test_remove_prefix_absent(self):
assert not self._trie.remove('ab')
def test_iter(self):
assert list(self._trie) == ['abc', 'abd', 'te', 'test']
def test_fill(self):
trie = Trie()
trie.fill(('a', 'b'), 3)
assert list(trie) == [
'aaa', 'aab', 'aba', 'abb', 'baa', 'bab', 'bba', 'bbb']
def test_all_hamming_1_perfect(self):
assert list(self._trie.all_hamming('abc', 1)) == ['abc', 'abd']
def test_all_hamming_1_not_perfect(self):
assert list(self._trie.all_hamming('abx', 1)) == ['abc', 'abd']
def test_all_hamming_1_no_match(self):
assert not list(self._trie.all_hamming('xbx', 1))
def test_hamming_0_no_prefix(self): def test_hamming_0_no_prefix(self):
assert self._trie.hamming('ab', 0) == '' assert self._trie.hamming('ab', 0) == ''
...@@ -105,6 +145,9 @@ class TestTrie(object): ...@@ -105,6 +145,9 @@ class TestTrie(object):
def test_best_hamming_match(self): def test_best_hamming_match(self):
assert self._trie.best_hamming('abd', 1) == 'abd' assert self._trie.best_hamming('abd', 1) == 'abd'
def test_all_levenshtein_1_not_perfect(self):
assert list(self._trie.all_levenshtein('tes', 1)) == ['te', 'test']
def test_levenshtein_0_match_1(self): def test_levenshtein_0_match_1(self):
assert self._trie.levenshtein('abc', 0) == 'abc' assert self._trie.levenshtein('abc', 0) == 'abc'
...@@ -117,5 +160,8 @@ class TestTrie(object): ...@@ -117,5 +160,8 @@ class TestTrie(object):
def test_levenshtein_1_del(self): def test_levenshtein_1_del(self):
assert self._trie.levenshtein('ac', 1) == 'abc' assert self._trie.levenshtein('ac', 1) == 'abc'
def test_levenshtein_1_prefix(self):
assert self._trie.levenshtein('ab', 1) == 'abc'
def test_levenshtein_1_ins(self): def test_levenshtein_1_ins(self):
assert self._trie.levenshtein('abbc', 1) == 'abc' assert self._trie.levenshtein('abbc', 1) == 'abc'
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment