dict_trie.py 3.7 KB
Newer Older
1 2
def _hamming(path, node, word, distance):
    """
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
3 4 5 6
    Find the first path in the trie that is within a certain hamming distance
    of {word}. Note that this does not necessarily the one with the smallest
    distance.

7 8 9 10 11
    :arg str path: Path taken so far to reach the current node.
    :arg dict node: Current node.
    :arg str word: Query word.
    :arg int distance: Amount of errors we can still make.

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
12 13
    :returns str: A word in the trie that has Hamming distance of at most
        {distance} to {word}.
14 15 16 17
    """
    if distance < 0:
        return ''
    if not word:
18
        return path if '' in node else ''
19 20 21 22 23 24 25 26 27 28 29

    car, cdr = word[0], word[1:]
    for char in node:
        result = _hamming(
            path + char, node[char], cdr, distance - int(char != car))
        if result:
            return result

    return ''


30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
def _levenshtein(path, node, word, distance):
    """
    """
    if distance < 0:
        return ''
    if not word:
        return path if '' in node else ''

    car, cdr = word[0], word[1:]

    # Deletion.
    result = _levenshtein(path, node, cdr, distance - 1)
    if result:
        return result

    for char in node:
        # Substitution and insertion.
        result = (
            _levenshtein(
                path + char, node[char], cdr, distance - int(char != car)) or
            _levenshtein(path + char, node[char], word, distance - 1))
        if result:
            return result

    return ''
55 56


57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
class Trie(object):
    def __init__(self, words):
        """
        Initialise the class.

        :arg list words: List of words.
        """
        self.root = {}

        self._build(words)

    def _build(self, words):
        """
        Build the trie.

        :arg list words: List of words.
        """
        for word in words:
            self.add(word)

    def _find(self, word):
        """
        Find the node after following the path in the trie given by {word}.

        :arg str word: A word.

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
83
        :returns dict: The node if found, {} otherwise.
84 85 86
        """
        node = self.root

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
87 88
        for char in word:
            if char not in node:
89
                return {}
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
90
            node = node[char]
91 92 93 94

        return node

    def __contains__(self, word):
95
        return '' in self._find(word)
96 97 98 99

    def add(self, word):
        """
        Add a word to the trie.
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
100 101

        :arg str word: A word.
102 103 104
        """
        node = self.root

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
105 106 107 108
        for char in word:
            if char not in node:
                node[char] = {}
            node = node[char]
109 110 111 112

        node[''] = {}

    def has_prefix(self, word):
113
        return self._find(word) != {}
114 115 116

    def hamming(self, word, distance):
        return _hamming('', self.root, word, distance)
117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136

    def best_hamming(self, word, distance):
        """
        Find the best match with {word} in the trie.

        :arg str word: Query word.
        :arg int distance: Amount of errors we can still make.

        :returns str: Best match with {word}.
        """
        if word in self:
            return word

        for i in range(1, distance + 1):
            result = self.hamming(word, i)
            if result:
                return result

        return ''

137 138
    def levenshtein(self, word, distance):
        return _levenshtein('', self.root, word, distance)
139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157

    def best_levenshtein(self, word, distance):
        """
        Find the best match with {word} in the trie.

        :arg str word: Query word.
        :arg int distance: Amount of errors we can still make.

        :returns str: Best match with {word}.
        """
        if word in self:
            return word

        for i in range(1, distance + 1):
            result = self.levenshtein(word, i)
            if result:
                return result

        return ''