Commit bbb84ae2 authored by Jeroen F.J. Laros

Added more informative output to the approximate matching algorithms.

parent 64f36cef
import itertools
def _add(root, word): def _add(root, word):
"""Add a word to the trie. """Add a word to the trie.
...@@ -93,7 +96,7 @@ def _fill(node, alphabet, length): ...@@ -93,7 +96,7 @@ def _fill(node, alphabet, length):
_fill(node[char], alphabet, length - 1) _fill(node[char], alphabet, length - 1)
def _hamming(path, node, word, distance): def _hamming(path, node, word, distance, cigar):
"""Find all paths in the trie that are within a certain hamming distance of """Find all paths in the trie that are within a certain hamming distance of
{word}. {word}.
...@@ -109,17 +112,24 @@ def _hamming(path, node, word, distance): ...@@ -109,17 +112,24 @@ def _hamming(path, node, word, distance):
return return
if not word: if not word:
if '' in node: if '' in node:
yield path yield (path, distance, cigar)
return return
car, cdr = word[0], word[1:] car, cdr = word[0], word[1:]
for char in node: for char in node:
if char == car:
penalty = 0
operation = '='
else:
penalty = 1
operation = 'X'
for result in _hamming( for result in _hamming(
path + char, node[char], cdr, distance - int(char != car)): path + char, node[char], cdr, distance - penalty,
cigar + operation):
yield result yield result
def _levenshtein(path, node, word, distance): def _levenshtein(path, node, word, distance, cigar):
"""Find all paths in the trie that are within a certain Levenshtein """Find all paths in the trie that are within a certain Levenshtein
distance of {word}. distance of {word}.
...@@ -135,24 +145,31 @@ def _levenshtein(path, node, word, distance): ...@@ -135,24 +145,31 @@ def _levenshtein(path, node, word, distance):
return return
if not word: if not word:
if '' in node: if '' in node:
yield path yield (path, distance, cigar)
car, cdr = '', '' car, cdr = '', ''
else: else:
car, cdr = word[0], word[1:] car, cdr = word[0], word[1:]
# Deletion. # Deletion.
for result in _levenshtein(path, node, cdr, distance - 1): for result in _levenshtein(path, node, cdr, distance - 1, cigar + 'D'):
yield result yield result
for char in node: for char in node:
# Substitution. # Substitution.
if car: if car:
if char == car:
penalty = 0
operation = '='
else:
penalty = 1
operation = 'X'
for result in _levenshtein( for result in _levenshtein(
path + char, node[char], cdr, distance - int(char != car)): path + char, node[char], cdr, distance - penalty,
cigar + operation):
yield result yield result
# Insertion. # Insertion.
for result in _levenshtein( for result in _levenshtein(
path + char, node[char], word, distance - 1): path + char, node[char], word, distance - 1, cigar + 'I'):
yield result yield result
...@@ -186,8 +203,14 @@ class Trie(object): ...@@ -186,8 +203,14 @@ class Trie(object):
def fill(self, alphabet, length): def fill(self, alphabet, length):
_fill(self.root, alphabet, length) _fill(self.root, alphabet, length)
def all_hamming_(self, word, distance):
    """Find all paths in the trie that are within a certain hamming
    distance of {word}, including details about each match.

    Every result is a tuple (path, mismatches, cigar). `_hamming` yields the
    part of the distance budget that was left unused, so the number of
    mismatches is {distance} minus that remainder. The cigar string is built
    from one operation per character: '=' for a match and 'X' for a
    substitution.
    """
    # A generator expression keeps the result lazy without relying on
    # itertools.imap, which is only available in Python 2.
    return (
        (path, distance - remaining, cigar)
        for path, remaining, cigar in _hamming(
            '', self.root, word, distance, ''))
def all_hamming(self, word, distance): def all_hamming(self, word, distance):
return _hamming('', self.root, word, distance) return itertools.imap(
lambda x: x[0], _hamming('', self.root, word, distance, ''))
def hamming(self, word, distance): def hamming(self, word, distance):
try: try:
...@@ -213,8 +236,14 @@ class Trie(object): ...@@ -213,8 +236,14 @@ class Trie(object):
return '' return ''
def all_levenshtein_(self, word, distance):
    """Find all paths in the trie that are within a certain Levenshtein
    distance of {word}, including details about each match.

    Every result is a tuple (path, edits, cigar). `_levenshtein` yields the
    part of the distance budget that was left unused, so the number of edits
    is {distance} minus that remainder. The cigar string is built from one
    operation per step: '=' for a match, 'X' for a substitution, 'D' when a
    character of {word} is skipped and 'I' when a character from the trie is
    added. The same path can be reported more than once when several
    alignments fit within the budget.
    """
    # A generator expression keeps the result lazy without relying on
    # itertools.imap, which is only available in Python 2.
    return (
        (path, distance - remaining, cigar)
        for path, remaining, cigar in _levenshtein(
            '', self.root, word, distance, ''))
def all_levenshtein(self, word, distance): def all_levenshtein(self, word, distance):
return _levenshtein('', self.root, word, distance) return itertools.imap(
lambda x: x[0], _levenshtein('', self.root, word, distance, ''))
def levenshtein(self, word, distance): def levenshtein(self, word, distance):
try: try:
......
...@@ -165,3 +165,12 @@ class TestTrie(object): ...@@ -165,3 +165,12 @@ class TestTrie(object):
def test_levenshtein_1_ins(self): def test_levenshtein_1_ins(self):
assert self._trie.levenshtein('abbc', 1) == 'abc' assert self._trie.levenshtein('abbc', 1) == 'abc'
def test_all_hamming__2(self):
    """Detailed hamming search at distance 2 reports path, distance and
    cigar for every hit."""
    expected = [('abc', 2, '=XX'), ('abd', 2, '=XX')]
    observed = list(self._trie.all_hamming_('acb', 2))
    assert observed == expected
def test_all_levenshtein__2(self):
    """Detailed Levenshtein search at distance 2 reports every alignment,
    so the same path may show up with different cigars."""
    expected = [
        ('abc', 2, '=D=I'),
        ('abd', 2, '=D=I'),
        ('abc', 2, '=XX'),
        ('abd', 2, '=XX'),
        ('abc', 2, '=I=D'),
    ]
    observed = list(self._trie.all_levenshtein_('acb', 2))
    assert observed == expected
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment