dict_trie.py 7.36 KB
Newer Older
1 2 3
import itertools


Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
4
def _add(root, word, count):
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
5
    """Add a word to the trie.
6

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
7 8
    :arg dict root: Root of the trie.
    :arg str word: A word.
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
9
    :arg int count: Multiplicity of `word`.
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
10 11
    """
    node = root
12

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
13 14 15 16 17
    for char in word:
        if char not in node:
            node[char] = {}
        node = node[char]

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
18 19 20
    if '' not in node:
        node[''] = {'count': 0}
    node['']['count'] += count
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
21 22 23 24 25 26 27 28 29


def _find(root, word):
    """Find the node after following the path in the trie given by {word}.

    :arg dict root: Root of the trie.
    :arg str word: A word.

    :returns dict: The node if found, {} otherwise.
30
    """
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
31 32 33 34 35 36
    node = root

    for char in word:
        if char not in node:
            return {}
        node = node[char]
37

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
38
    return node
39 40


Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
41
def _remove(node, word, count):
42 43 44 45
    """Remove a word from a trie.

    :arg dict node: Current node.
    :arg str word: Word to be removed.
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
46
    :arg int count: Multiplicity of `word`, force remove if this is -1.
47

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
48
    :returns bool: True if the last occurrence of `word` is removed.
49 50 51
    """
    if not word:
        if '' in node:
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
52 53 54 55
            node['']['count'] -= count
            if node['']['count'] < 1 or count == -1:
                node.pop('')
                return True
56 57 58 59 60 61
        return False

    car, cdr = word[0], word[1:]
    if car not in node:
        return False

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
62
    result = _remove(node[car], cdr, count)
63 64 65 66 67 68 69
    if result:
        if not node[car]:
            node.pop(car)

    return result


Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
70
def _iterate(path, node):
71 72 73 74 75 76 77 78 79 80
    """Convert a trie into a list.

    :arg str path: Path taken so far to reach the current node.
    :arg dict node: Current node.

    :returns iter: All words in the trie.
    """
    if '' in node:
        yield path

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
81
    for char in node:
82 83 84
        if char:
            for result in _iterate(path + char, node[char]):
                yield result
85 86


Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
def _fill(node, alphabet, length):
    """Make a full trie using the characters in {alphabet}.

    :arg dict node: Current node.
    :arg tuple alphabet: Used alphabet.
    :arg int length: Length of the words to be generated.

    :returns iter: Trie containing all words of length {length} over alphabet
        {alphabet}.
    """
    if not length:
        node[''] = {}
        return

    for char in alphabet:
        node[char] = {}
        _fill(node[char], alphabet, length - 1)


106
def _hamming(path, node, word, distance, cigar):
107 108
    """Find all paths in the trie that are within a certain hamming distance of
    {word}.
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
109

110 111 112
    :arg str path: Path taken so far to reach the current node.
    :arg dict node: Current node.
    :arg str word: Query word.
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
113
    :arg int distance: Amount of allowed errors.
114

115
    :returns iter: All word in the trie that have Hamming distance of at most
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
116
        {distance} to {word}.
117 118
    """
    if distance < 0:
119
        return
120
    if not word:
121
        if '' in node:
122
            yield (path, distance, cigar)
123
        return
124 125 126

    car, cdr = word[0], word[1:]
    for char in node:
127 128 129 130 131 132 133 134 135 136 137
        if char:
            if char == car:
                penalty = 0
                operation = '='
            else:
                penalty = 1
                operation = 'X'
            for result in _hamming(
                    path + char, node[char], cdr, distance - penalty,
                    cigar + operation):
                yield result
138 139


140
def _levenshtein(path, node, word, distance, cigar):
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
141 142 143 144 145 146 147 148 149 150
    """Find all paths in the trie that are within a certain Levenshtein
    distance of {word}.

    :arg str path: Path taken so far to reach the current node.
    :arg dict node: Current node.
    :arg str word: Query word.
    :arg int distance: Amount of allowed errors.

    :returns iter: All word in the trie that have Hamming distance of at most
        {distance} to {word}.
151 152
    """
    if distance < 0:
153
        return
154
    if not word:
155
        if '' in node:
156
            yield (path, distance, cigar)
157 158 159
        car, cdr = '', ''
    else:
        car, cdr = word[0], word[1:]
160 161

    # Deletion.
162
    for result in _levenshtein(path, node, cdr, distance - 1, cigar + 'D'):
163
        yield result
164 165

    for char in node:
166 167 168 169 170 171 172 173 174 175 176 177 178 179
        if char:
            # Substitution.
            if car:
                if char == car:
                    penalty = 0
                    operation = '='
                else:
                    penalty = 1
                    operation = 'X'
                for result in _levenshtein(
                        path + char, node[char], cdr, distance - penalty,
                        cigar + operation):
                    yield result
            # Insertion.
180
            for result in _levenshtein(
181
                    path + char, node[char], word, distance - 1, cigar + 'I'):
182
                yield result
183 184


185
class Trie(object):
186
    def __init__(self, words=None):
Jeroen F.J. Laros's avatar
PEP8.  
Jeroen F.J. Laros committed
187
        """Initialise the class.
188 189 190 191 192

        :arg list words: List of words.
        """
        self.root = {}

193
        if words:
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
194 195
            for word in words:
                self.add(word)
196 197

    def __contains__(self, word):
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
198
        return '' in _find(self.root, word)
199

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
200 201
    def __iter__(self):
        return _iterate('', self.root)
202

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
203 204
    def add(self, word, count=1):
        _add(self.root, word, count)
205

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
206 207 208 209 210 211 212 213
    def get(self, word):
        node = _find(self.root, word)
        if '' in node:
            return node['']
        return None

    def remove(self, word, count=1):
        return _remove(self.root, word, count)
214

215
    def has_prefix(self, word):
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
216
        return _find(self.root, word) != {}
217

218 219 220
    def fill(self, alphabet, length):
        _fill(self.root, alphabet, length)

221 222 223 224 225
    def all_hamming_(self, word, distance):
        return itertools.imap(
            lambda x: (x[0], distance - x[1], x[2]),
            _hamming('', self.root, word, distance, ''))

226
    def all_hamming(self, word, distance):
227 228
        return itertools.imap(
            lambda x: x[0], _hamming('', self.root, word, distance, ''))
229

230 231 232 233 234 235
    def hamming(self, word, distance):
        try:
            return self.all_hamming(word, distance).next()
        except StopIteration:
            return ''

236
    def best_hamming(self, word, distance):
Jeroen F.J. Laros's avatar
PEP8.  
Jeroen F.J. Laros committed
237
        """Find the best match with {word} in the trie.
238 239

        :arg str word: Query word.
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
240
        :arg int distance: Maximum allowed distance.
241 242 243

        :returns str: Best match with {word}.
        """
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
244
        if _find(self.root, word):
245 246 247 248 249 250 251 252 253
            return word

        for i in range(1, distance + 1):
            result = self.hamming(word, i)
            if result:
                return result

        return ''

254 255 256 257 258
    def all_levenshtein_(self, word, distance):
        return itertools.imap(
            lambda x: (x[0], distance - x[1], x[2]),
            _levenshtein('', self.root, word, distance, ''))

259
    def all_levenshtein(self, word, distance):
260 261
        return itertools.imap(
            lambda x: x[0], _levenshtein('', self.root, word, distance, ''))
262

263 264 265 266 267 268
    def levenshtein(self, word, distance):
        try:
            return self.all_levenshtein(word, distance).next()
        except StopIteration:
            return ''

269
    def best_levenshtein(self, word, distance):
Jeroen F.J. Laros's avatar
PEP8.  
Jeroen F.J. Laros committed
270
        """Find the best match with {word} in the trie.
271 272

        :arg str word: Query word.
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
273
        :arg int distance: Maximum allowed distance.
274 275 276

        :returns str: Best match with {word}.
        """
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
277
        if _find(self.root, word):
278 279 280 281 282 283 284 285
            return word

        for i in range(1, distance + 1):
            result = self.levenshtein(word, i)
            if result:
                return result

        return ''