Commit 64f36cef authored by Jeroen F.J. Laros

Merge branch 'iter' into 'master'

Iter

See merge request !1
parents 71c8b85c 19352723
# Trie implementation using nested dictionaries
This library provides a [trie](https://en.wikipedia.org/wiki/Trie)
implementation using nested dictionaries. Apart from the basic operations, a
number of functions for *approximate matching* are implemented.
## Installation
Via [pypi](https://pypi.python.org/pypi/dict-trie):
pip install dict-trie
From source:
git clone https://git.lumc.nl/j.f.j.laros/dict-trie.git
cd dict-trie
pip install .
## Usage
The library provides the `Trie` class. Full documentation can be found
[here](https://git.lumc.nl/j.f.j.laros/dict-trie).
### Basic operations
Initialisation of the trie is done via the constructor by providing a list of
words.
```python
>>> from dict_trie import Trie
>>>
>>> trie = Trie(['abc', 'te', 'test'])
```
Alternatively, an empty trie can be made to which words can be added with the
`add` function.
```python
>>> trie = Trie()
>>> trie.add('abc')
>>> trie.add('te')
>>> trie.add('test')
```
Membership can be tested with the `in` operator.
```python
>>> 'abc' in trie
True
```
Test whether a prefix is present by using the `has_prefix` function.
```python
>>> trie.has_prefix('ab')
True
```
Remove a word from the trie with the `remove` function. This function returns
`False` if the word was not in the trie.
```python
>>> trie.remove('abc')
True
>>> 'abc' in trie
False
>>> trie.remove('abc')
False
```
Iterate over all words in a trie.
```python
>>> list(trie)
['abc', 'te', 'test']
```
### Approximate matching
A trie can be used to efficiently find a word that is similar to a query word.
This is implemented via a number of functions that search for a word, allowing
a given number of mismatches. These functions are divided into two families:
one uses the Hamming distance, which only allows substitutions; the other uses
the Levenshtein distance, which allows substitutions, insertions and deletions.
To find a word that has at most Hamming distance 2 to the word 'abe', the
`hamming` function is used.
```python
>>> trie = Trie(['abc', 'aaa', 'ccc'])
>>> trie.hamming('abe', 2)
'aaa'
```
To get all words that have at most Hamming distance 2 to the word 'abe', the
`all_hamming` function is used. This function returns a generator.
```python
>>> list(trie.all_hamming('abe', 2))
['aaa', 'abc']
```
In order to find a word that is closest to the query word, the `best_hamming`
function is used. In this case a word with distance 1 is returned.
```python
>>> trie.best_hamming('abe', 2)
'abc'
```
The functions `levenshtein`, `all_levenshtein` and `best_levenshtein` are used
in a similar way.
### Other functionalities
A trie can be populated with all words of a fixed length over an alphabet by
using the `fill` function.
```python
>>> trie = Trie()
>>> trie.fill(('a', 'b'), 2)
>>> list(trie)
['aa', 'ab', 'ba', 'bb']
```
The trie data structure can be accessed via the `root` member variable.
```python
>>> trie.root
{'a': {'a': {'': {}}, 'b': {'': {}}}, 'b': {'a': {'': {}}, 'b': {'': {}}}}
>>> trie.root.keys()
['a', 'b']
```
"""
dict-trie: Basic implementation of a trie.
"""dict-trie: Basic implementation of a trie.
Copyright (c) 2017 Leiden University Medical Center <humgen@lumc.nl>
......
def _add(root, word):
    """Add a word to the trie.

    :arg dict root: Root of the trie.
    :arg str word: A word.
    """
    node = root
    for char in word:
        # Descend along the word, creating missing child nodes on the way.
        node = node.setdefault(char, {})

    # The empty-string key marks the end of a word.
    node[''] = {}
def _find(root, word):
    """Walk the trie along the characters of {word}.

    :arg dict root: Root of the trie.
    :arg str word: A word.

    :returns dict: The node reached at the end of the path, or an empty
        dictionary when the path leaves the trie.
    """
    current = root
    for symbol in word:
        try:
            current = current[symbol]
        except KeyError:
            return {}
    return current
def _remove(node, word):
    """Remove a word from a trie.

    After the end-of-word marker is removed, every node on the path that has
    become empty is pruned, so no dangling branches are left behind.

    The path is walked iteratively, so the length of {word} is not bounded by
    the interpreter's recursion limit and no substrings are copied.

    :arg dict node: Current node.
    :arg str word: Word to be removed.

    :returns bool: True if the word was removed, False if it was not in the
        trie.
    """
    # Remember (parent, char) for every step so the path can be pruned later.
    trail = []
    for char in word:
        if char not in node:
            return False
        trail.append((node, char))
        node = node[char]

    # The path exists, but it is only a word if the end marker is present.
    if '' not in node:
        return False
    node.pop('')

    # Prune bottom-up. Once a node still has content, all of its ancestors
    # are non-empty as well, so we can stop.
    for parent, char in reversed(trail):
        if parent[char]:
            break
        parent.pop(char)

    return True
def _iterate(path, node):
    """Iterate over all words stored below a node.

    Words are produced lazily in depth-first order, following the key order
    of the nested dictionaries. A word is yielded before any longer word it
    is a prefix of.

    :arg str path: Path taken so far to reach the current node.
    :arg dict node: Current node.

    :returns iter: All words in the trie.
    """
    # An explicit stack replaces recursion, so deep tries do not run into the
    # interpreter's recursion limit.
    pending = [(path, node)]
    while pending:
        prefix, current = pending.pop()
        if '' in current:
            yield prefix
        # Push children in reverse so they are popped in dictionary order.
        # The end-of-word marker ('') is not a real child and is skipped.
        pending.extend(
            (prefix + char, current[char])
            for char in reversed(list(current)) if char)
def _fill(node, alphabet, length):
    """Make a full trie using the characters in {alphabet}.

    All words of length {length} over {alphabet} are added below {node}. The
    node is modified in place and nothing is returned. Words that are already
    stored below {node} are kept.

    :arg dict node: Current node.
    :arg tuple alphabet: Used alphabet.
    :arg int length: Length of the words to be generated.

    :raises ValueError: If {length} is negative.
    """
    if length < 0:
        # Without this guard a negative length would never reach zero and
        # the recursion would not terminate.
        raise ValueError('length must be non-negative')

    if not length:
        node[''] = {}
        return

    for char in alphabet:
        # Reuse an existing child instead of replacing it, so previously
        # stored words under this character survive.
        _fill(node.setdefault(char, {}), alphabet, length - 1)
def _hamming(path, node, word, distance):
    """Find all paths in the trie that are within a certain Hamming distance
    of {word}.

    Only substitutions are considered, so every result has the same length
    as {word}.

    :arg str path: Path taken so far to reach the current node.
    :arg dict node: Current node.
    :arg str word: Query word.
    :arg int distance: Amount of allowed errors.

    :returns iter: All words in the trie that have Hamming distance of at
        most {distance} to {word}.
    """
    if distance < 0:
        # Error budget exhausted on this branch.
        return

    if not word:
        # The whole query is consumed; the path is a match only if a word
        # ends exactly here.
        if '' in node:
            yield path
        return

    car, cdr = word[0], word[1:]
    for char in node:
        if not char:
            # The end-of-word marker is not a character to match against.
            continue
        # A mismatching character costs one error.
        for result in _hamming(
                path + char, node[char], cdr, distance - int(char != car)):
            yield result
def _levenshtein(path, node, word, distance):
    """Find all paths in the trie that are within a certain Levenshtein
    distance of {word}.

    Substitutions, insertions and deletions are considered. A word can be
    yielded more than once when several edit paths lead to it.

    :arg str path: Path taken so far to reach the current node.
    :arg dict node: Current node.
    :arg str word: Query word.
    :arg int distance: Amount of allowed errors.

    :returns iter: All words in the trie that have Levenshtein distance of
        at most {distance} to {word}.
    """
    if distance < 0:
        # Error budget exhausted on this branch.
        return

    if not word:
        if '' in node:
            yield path
        # Nothing left to consume, but insertions may still extend the path.
        car, cdr = '', ''
    else:
        car, cdr = word[0], word[1:]

    # Deletion: drop the first character of the query, stay on this node.
    for result in _levenshtein(path, node, cdr, distance - 1):
        yield result

    for char in node:
        if not char:
            # The end-of-word marker is not a character to match against.
            continue

        # Substitution (free when the characters agree). Only possible while
        # there is still a query character to consume.
        if car:
            for result in _levenshtein(
                    path + char, node[char], cdr, distance - int(char != car)):
                yield result

        # Insertion: follow the trie without consuming the query.
        for result in _levenshtein(
                path + char, node[char], word, distance - 1):
            yield result
class Trie(object):
def __init__(self, words):
"""
Initialise the class.
def __init__(self, words=None):
"""Initialise the class.
:arg list words: List of words.
"""
self.root = {}
self._build(words)
def _build(self, words):
"""
Build the trie.
:arg list words: List of words.
"""
for word in words:
self.add(word)
def _find(self, word):
"""
Find the node after following the path in the trie given by {word}.
:arg str word: A word.
:returns dict: The node if found, {} otherwise.
"""
node = self.root
for char in word:
if char not in node:
return {}
node = node[char]
return node
if words:
for word in words:
self.add(word)
def __contains__(self, word):
    """Test whether {word} is stored in the trie.

    :arg str word: A word.

    :returns bool: True if {word} is in the trie, False otherwise.
    """
    # A path is only a word when the end-of-word marker ('') is present on
    # the node it leads to.
    return '' in _find(self.root, word)
def __iter__(self):
    """Iterate over all words in the trie, starting from the root."""
    words = _iterate('', self.root)
    return words
def add(self, word):
    """Add a word to the trie.

    :arg str word: A word.
    """
    _add(self.root, word)

def remove(self, word):
    """Remove a word from the trie.

    :arg str word: Word to be removed.

    :returns bool: The result of `_remove` on the root node.
    """
    return _remove(self.root, word)

def has_prefix(self, word):
    """Test whether a path given by {word} exists in the trie.

    :arg str word: A prefix.

    :returns bool: True if following {word} from the root ends in a
        non-empty node, False otherwise.
    """
    return _find(self.root, word) != {}

def fill(self, alphabet, length):
    """Populate the trie with all words of a fixed length over an alphabet.

    :arg tuple alphabet: Used alphabet.
    :arg int length: Length of the words to be generated.
    """
    _fill(self.root, alphabet, length)
def all_hamming(self, word, distance):
    """Search the trie, starting at the root, with `_hamming`.

    :arg str word: Query word.
    :arg int distance: Amount of allowed errors.

    :returns iter: The results produced by `_hamming`.
    """
    matches = _hamming('', self.root, word, distance)
    return matches
def hamming(self, word, distance):
return _hamming('', self.root, word, distance)
try:
return self.all_hamming(word, distance).next()
except StopIteration:
return ''
def best_hamming(self, word, distance):
"""
Find the best match with {word} in the trie.
"""Find the best match with {word} in the trie.
:arg str word: Query word.
:arg int distance: Amount of errors we can still make.
:arg int distance: Maximum allowed distance.
:returns str: Best match with {word}.
"""
if word in self:
if _find(self.root, word):
return word
for i in range(1, distance + 1):
......@@ -134,19 +213,24 @@ class Trie(object):
return ''
def all_levenshtein(self, word, distance):
    """Search the trie, starting at the root, with `_levenshtein`.

    :arg str word: Query word.
    :arg int distance: Amount of allowed errors.

    :returns iter: The results produced by `_levenshtein`.
    """
    return _levenshtein('', self.root, word, distance)
def levenshtein(self, word, distance):
try:
return self.all_levenshtein(word, distance).next()
except StopIteration:
return ''
def best_levenshtein(self, word, distance):
"""
Find the best match with {word} in the trie.
"""Find the best match with {word} in the trie.
:arg str word: Query word.
:arg int distance: Amount of errors we can still make.
:arg int distance: Maximum allowed distance.
:returns str: Best match with {word}.
"""
if word in self:
if _find(self.root, word):
return word
for i in range(1, distance + 1):
......
"""
Tests for the trie library.
"""Tests for the trie library.
"""
#from __future__ import (
# absolute_import, division, print_function, unicode_literals)
......@@ -11,6 +10,9 @@ class TestTrie(object):
def setup(self):
self._trie = Trie(['abc', 'abd', 'test', 'te'])
def test_empty(self):
assert Trie().root == {}
def test_root(self):
assert self._trie.root == {
'a': {
......@@ -54,6 +56,44 @@ class TestTrie(object):
def test_prefix_order(self):
assert Trie(['test', 'te']).root == Trie(['te', 'test']).root
def test_add(self):
self._trie.add('abx')
assert 'abx' in self._trie
def test_remove_present(self):
assert self._trie.remove('test')
assert 'test' not in self._trie
assert 'te' in self._trie
def test_remove_prefix_present(self):
assert self._trie.remove('te')
assert 'te' not in self._trie
assert 'test' in self._trie
def test_remove_absent(self):
assert not self._trie.remove('xxxx')
def test_remove_prefix_absent(self):
assert not self._trie.remove('ab')
def test_iter(self):
assert list(self._trie) == ['abc', 'abd', 'te', 'test']
def test_fill(self):
trie = Trie()
trie.fill(('a', 'b'), 3)
assert list(trie) == [
'aaa', 'aab', 'aba', 'abb', 'baa', 'bab', 'bba', 'bbb']
def test_all_hamming_1_perfect(self):
assert list(self._trie.all_hamming('abc', 1)) == ['abc', 'abd']
def test_all_hamming_1_not_perfect(self):
assert list(self._trie.all_hamming('abx', 1)) == ['abc', 'abd']
def test_all_hamming_1_no_match(self):
assert not list(self._trie.all_hamming('xbx', 1))
def test_hamming_0_no_prefix(self):
assert self._trie.hamming('ab', 0) == ''
......@@ -105,6 +145,9 @@ class TestTrie(object):
def test_best_hamming_match(self):
assert self._trie.best_hamming('abd', 1) == 'abd'
def test_all_levenshtein_1_not_perfect(self):
assert list(self._trie.all_levenshtein('tes', 1)) == ['te', 'test']
def test_levenshtein_0_match_1(self):
assert self._trie.levenshtein('abc', 0) == 'abc'
......@@ -117,5 +160,8 @@ class TestTrie(object):
def test_levenshtein_1_del(self):
assert self._trie.levenshtein('ac', 1) == 'abc'
def test_levenshtein_1_prefix(self):
assert self._trie.levenshtein('ab', 1) == 'abc'
def test_levenshtein_1_ins(self):
assert self._trie.levenshtein('abbc', 1) == 'abc'
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment