dict_trie.py 7.56 KB
Newer Older
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
1 2 3 4
import sys


if sys.version_info.major < 3:
5
    from itertools import imap as map
6 7


Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
8
def _add(root, word, count):
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
9
    """Add a word to a trie.
10

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
11 12
    :arg dict root: Root of the trie.
    :arg str word: A word.
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
13
    :arg int count: Multiplicity of `word`.
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
14 15
    """
    node = root
16

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
17 18 19 20 21
    for char in word:
        if char not in node:
            node[char] = {}
        node = node[char]

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
22
    if '' not in node:
23 24
        node[''] = 0
    node[''] += count
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
25 26 27


def _find(root, word):
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
28
    """Find the node after following the path in a trie given by {word}.
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
29 30 31 32 33

    :arg dict root: Root of the trie.
    :arg str word: A word.

    :returns dict: The node if found, {} otherwise.
34
    """
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
35 36 37 38 39 40
    node = root

    for char in word:
        if char not in node:
            return {}
        node = node[char]
41

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
42
    return node
43 44


Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
45
def _remove(node, word, count):
46 47 48 49
    """Remove a word from a trie.

    :arg dict node: Current node.
    :arg str word: Word to be removed.
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
50
    :arg int count: Multiplicity of `word`, force remove if this is -1.
51

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
52
    :returns bool: True if the last occurrence of `word` is removed.
53 54 55
    """
    if not word:
        if '' in node:
56 57
            node[''] -= count
            if node[''] < 1 or count == -1:
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
58 59
                node.pop('')
                return True
60 61 62 63 64 65
        return False

    car, cdr = word[0], word[1:]
    if car not in node:
        return False

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
66
    result = _remove(node[car], cdr, count)
67 68 69 70 71 72 73
    if result:
        if not node[car]:
            node.pop(car)

    return result


74
def _iterate(path, node, unique):
75 76 77 78
    """Convert a trie into a list.

    :arg str path: Path taken so far to reach the current node.
    :arg dict node: Current node.
79
    :arg bool unique: Do not list multiplicities.
80

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
81
    :returns iter: All words in a trie.
82 83
    """
    if '' in node:
84
        if not unique:
85
            for _ in range(1, node['']):
86
                yield path
87 88
        yield path

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
89
    for char in node:
90
        if char:
91
            for result in _iterate(path + char, node[char], unique):
92
                yield result
93 94


Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
95 96 97 98 99 100 101 102 103 104 105
def _fill(node, alphabet, length):
    """Make a full trie using the characters in {alphabet}.

    :arg dict node: Current node.
    :arg tuple alphabet: Used alphabet.
    :arg int length: Length of the words to be generated.

    :returns iter: Trie containing all words of length {length} over alphabet
        {alphabet}.
    """
    if not length:
106
        node[''] = 1
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
107 108 109 110 111 112 113
        return

    for char in alphabet:
        node[char] = {}
        _fill(node[char], alphabet, length - 1)


114
def _hamming(path, node, word, distance, cigar):
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
115
    """Find all paths in a trie that are within a certain hamming distance of
116
    {word}.
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
117

118 119 120
    :arg str path: Path taken so far to reach the current node.
    :arg dict node: Current node.
    :arg str word: Query word.
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
121
    :arg int distance: Amount of allowed errors.
122

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
123
    :returns iter: All word in a trie that have Hamming distance of at most
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
124
        {distance} to {word}.
125 126
    """
    if distance < 0:
127
        return
128
    if not word:
129
        if '' in node:
130
            yield (path, distance, cigar)
131
        return
132 133 134

    car, cdr = word[0], word[1:]
    for char in node:
135 136 137 138 139 140 141 142 143 144 145
        if char:
            if char == car:
                penalty = 0
                operation = '='
            else:
                penalty = 1
                operation = 'X'
            for result in _hamming(
                    path + char, node[char], cdr, distance - penalty,
                    cigar + operation):
                yield result
146 147


148
def _levenshtein(path, node, word, distance, cigar):
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
149
    """Find all paths in a trie that are within a certain Levenshtein
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
150 151 152 153 154 155 156
    distance of {word}.

    :arg str path: Path taken so far to reach the current node.
    :arg dict node: Current node.
    :arg str word: Query word.
    :arg int distance: Amount of allowed errors.

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
157
    :returns iter: All word in a trie that have Hamming distance of at most
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
158
        {distance} to {word}.
159 160
    """
    if distance < 0:
161
        return
162
    if not word:
163
        if '' in node:
164
            yield (path, distance, cigar)
165 166 167
        car, cdr = '', ''
    else:
        car, cdr = word[0], word[1:]
168 169

    # Deletion.
170
    for result in _levenshtein(path, node, cdr, distance - 1, cigar + 'D'):
171
        yield result
172 173

    for char in node:
174 175 176 177 178 179 180 181 182 183 184 185 186 187
        if char:
            # Substitution.
            if car:
                if char == car:
                    penalty = 0
                    operation = '='
                else:
                    penalty = 1
                    operation = 'X'
                for result in _levenshtein(
                        path + char, node[char], cdr, distance - penalty,
                        cigar + operation):
                    yield result
            # Insertion.
188
            for result in _levenshtein(
189
                    path + char, node[char], word, distance - 1, cigar + 'I'):
190
                yield result
191 192


193
class Trie(object):
194
    def __init__(self, words=None):
Jeroen F.J. Laros's avatar
PEP8.  
Jeroen F.J. Laros committed
195
        """Initialise the class.
196 197 198 199 200

        :arg list words: List of words.
        """
        self.root = {}

201
        if words:
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
202 203
            for word in words:
                self.add(word)
204 205

    def __contains__(self, word):
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
206
        return '' in _find(self.root, word)
207

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
208
    def __iter__(self):
209 210 211 212
        return _iterate('', self.root, True)

    def list(self, unique=True):
        return _iterate('', self.root, unique)
213

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
214 215
    def add(self, word, count=1):
        _add(self.root, word, count)
216

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
217 218 219 220 221
    def get(self, word):
        node = _find(self.root, word)
        if '' in node:
            return node['']
        return None
222

Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
223 224
    def remove(self, word, count=1):
        return _remove(self.root, word, count)
225

226
    def has_prefix(self, word):
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
227
        return _find(self.root, word) != {}
228

229 230 231
    def fill(self, alphabet, length):
        _fill(self.root, alphabet, length)

232
    def all_hamming_(self, word, distance):
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
233
        return map(
234 235 236
            lambda x: (x[0], distance - x[1], x[2]),
            _hamming('', self.root, word, distance, ''))

237
    def all_hamming(self, word, distance):
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
238
        return map(
239
            lambda x: x[0], _hamming('', self.root, word, distance, ''))
240

241 242
    def hamming(self, word, distance):
        try:
243
            return next(self.all_hamming(word, distance))
244 245 246
        except StopIteration:
            return ''

247
    def best_hamming(self, word, distance):
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
248
        """Find the best match with {word} in a trie.
249 250

        :arg str word: Query word.
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
251
        :arg int distance: Maximum allowed distance.
252 253 254

        :returns str: Best match with {word}.
        """
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
255
        if _find(self.root, word):
256 257 258 259 260 261 262 263 264
            return word

        for i in range(1, distance + 1):
            result = self.hamming(word, i)
            if result:
                return result

        return ''

265
    def all_levenshtein_(self, word, distance):
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
266
        return map(
267 268 269
            lambda x: (x[0], distance - x[1], x[2]),
            _levenshtein('', self.root, word, distance, ''))

270
    def all_levenshtein(self, word, distance):
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
271
        return map(
272
            lambda x: x[0], _levenshtein('', self.root, word, distance, ''))
273

274 275
    def levenshtein(self, word, distance):
        try:
276
            return next(self.all_levenshtein(word, distance))
277 278 279
        except StopIteration:
            return ''

280
    def best_levenshtein(self, word, distance):
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
281
        """Find the best match with {word} in a trie.
282 283

        :arg str word: Query word.
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
284
        :arg int distance: Maximum allowed distance.
285 286 287

        :returns str: Best match with {word}.
        """
Jeroen F.J. Laros's avatar
Jeroen F.J. Laros committed
288
        if _find(self.root, word):
289 290 291 292 293 294 295 296
            return word

        for i in range(1, distance + 1):
            result = self.levenshtein(word, i)
            if result:
                return result

        return ''