diff --git a/dict_trie/dict_trie.py b/dict_trie/dict_trie.py index 1503d0b47e0f57cced753f82a83423be90d3774e..ea9fb4b8d44e70b23eb673001c631467a3aa4383 100644 --- a/dict_trie/dict_trie.py +++ b/dict_trie/dict_trie.py @@ -1,3 +1,6 @@ +import itertools + + def _add(root, word): """Add a word to the trie. @@ -93,7 +96,7 @@ def _fill(node, alphabet, length): _fill(node[char], alphabet, length - 1) -def _hamming(path, node, word, distance): +def _hamming(path, node, word, distance, cigar): """Find all paths in the trie that are within a certain hamming distance of {word}. @@ -109,17 +112,24 @@ def _hamming(path, node, word, distance): return if not word: if '' in node: - yield path + yield (path, distance, cigar) return car, cdr = word[0], word[1:] for char in node: + if char == car: + penalty = 0 + operation = '=' + else: + penalty = 1 + operation = 'X' for result in _hamming( - path + char, node[char], cdr, distance - int(char != car)): + path + char, node[char], cdr, distance - penalty, + cigar + operation): yield result -def _levenshtein(path, node, word, distance): +def _levenshtein(path, node, word, distance, cigar): """Find all paths in the trie that are within a certain Levenshtein distance of {word}. @@ -135,24 +145,31 @@ def _levenshtein(path, node, word, distance): return if not word: if '' in node: - yield path + yield (path, distance, cigar) car, cdr = '', '' else: car, cdr = word[0], word[1:] # Deletion. - for result in _levenshtein(path, node, cdr, distance - 1): + for result in _levenshtein(path, node, cdr, distance - 1, cigar + 'D'): yield result for char in node: # Substitution. if car: + if char == car: + penalty = 0 + operation = '=' + else: + penalty = 1 + operation = 'X' for result in _levenshtein( - path + char, node[char], cdr, distance - int(char != car)): + path + char, node[char], cdr, distance - penalty, + cigar + operation): yield result # Insertion. for result in _levenshtein( - path + char, node[char], word, distance - 1): + path + char, node[char], word, distance - 1, cigar + 'I'): yield result @@ -186,8 +203,14 @@ class Trie(object): def fill(self, alphabet, length): _fill(self.root, alphabet, length) + def all_hamming_(self, word, distance): + return itertools.imap( + lambda x: (x[0], distance - x[1], x[2]), + _hamming('', self.root, word, distance, '')) + def all_hamming(self, word, distance): - return _hamming('', self.root, word, distance) + return itertools.imap( + lambda x: x[0], _hamming('', self.root, word, distance, '')) def hamming(self, word, distance): try: @@ -213,8 +236,14 @@ class Trie(object): return '' + def all_levenshtein_(self, word, distance): + return itertools.imap( + lambda x: (x[0], distance - x[1], x[2]), + _levenshtein('', self.root, word, distance, '')) + def all_levenshtein(self, word, distance): - return _levenshtein('', self.root, word, distance) + return itertools.imap( + lambda x: x[0], _levenshtein('', self.root, word, distance, '')) def levenshtein(self, word, distance): try: diff --git a/tests/test_trie.py b/tests/test_trie.py index 74e76496a598287667232207547d386fcfb1bca5..eac0bb1fc52733f60155e4ebfef64efbd918f98d 100644 --- a/tests/test_trie.py +++ b/tests/test_trie.py @@ -165,3 +165,12 @@ class TestTrie(object): def test_levenshtein_1_ins(self): assert self._trie.levenshtein('abbc', 1) == 'abc' + + def test_all_hamming__2(self): + assert list(self._trie.all_hamming_('acb', 2)) == [ + ('abc', 2, '=XX'), ('abd', 2, '=XX')] + + def test_all_levenshtein__2(self): + assert list(self._trie.all_levenshtein_('acb', 2)) == [ + ('abc', 2, '=D=I'), ('abd', 2, '=D=I'), ('abc', 2, '=XX'), + ('abd', 2, '=XX'), ('abc', 2, '=I=D')]