lib.py 73.2 KB
Newer Older
jhoogenboom's avatar
jhoogenboom committed
1
2
#!/usr/bin/env python

3
#
4
# Copyright (C) 2017 Jerry Hoogenboom
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#
# This file is part of FDSTools, data analysis tools for Next
# Generation Sequencing of forensic DNA markers.
#
# FDSTools is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# FDSTools is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with FDSTools.  If not, see <http://www.gnu.org/licenses/>.
#

23
import re, sys, argparse, random, itertools, textwrap, glob
jhoogenboom's avatar
jhoogenboom committed
24
#import numpy as np  # Imported only when calling nnls()
jhoogenboom's avatar
jhoogenboom committed
25
26

from ConfigParser import RawConfigParser, MissingSectionHeaderError
27
from StringIO import StringIO
28
from errno import EPIPE
jhoogenboom's avatar
jhoogenboom committed
29
30
31

# Patterns that match entire sequences.
PAT_SEQ_RAW = re.compile("^[ACGT]*$")
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
32
PAT_SEQ_IUPAC = re.compile("^[ACGTUWSMKRYBDHVNacgtuwsmkrybdhvn]*$")
jhoogenboom's avatar
jhoogenboom committed
33
PAT_SEQ_TSSV = re.compile("^(?:[ACGT]+\(\d+\))*$")
34
PAT_SEQ_ALLELENAME_STR = re.compile(  # First line: n_ACT[m] or alias.
jhoogenboom's avatar
jhoogenboom committed
35
    "^(?:(?:(?:CE)?-?\d+(?:\.\d+)?_(?:[ACGT]+\[\d+\])*)|((?!_).+?))"
jhoogenboom's avatar
jhoogenboom committed
36
    "(?:_[-+]\d+(?:\.1)?(?P<a>(?:(?<=\.1)-)|(?<!\.1)[ACGT]+)>"  # _+3A>
37
        "(?!(?P=a))(?:[ACGT]+|-))*$")  # Portion of variants after '>'.
38
39
40
PAT_SEQ_ALLELENAME_SNP = re.compile(
    "^REF$|^(?:(?:(?<=^)|(?<!^) )"  # 'REF' or space-separated variants.
    "\d+(?:\.1)?(?P<a>(?:(?<=\.1)-)|(?<!\.1)[ACGT]+)>"
41
        "(?!(?P=a))(?:[ACGT]+|-))+$")  # Portion of variants after '>'.
42
43
PAT_SEQ_ALLELENAME_MT = re.compile(
    "^REF$|^(?:(?:(?<=^)|(?<!^) )"  # 'REF' or space-separated variants.
44
    "(?:-?\d+\.\d+[ACGT]|(?P<a>[ACGT])?\d+(?(a)(?!(?P=a)))(?:[ACGT-]|del)))+$")
jhoogenboom's avatar
jhoogenboom committed
45

46
# Patterns that match blocks of TSSV-style sequences and allele names.
jhoogenboom's avatar
jhoogenboom committed
47
48
PAT_TSSV_BLOCK = re.compile("([ACGT]+)\((\d+)\)")
PAT_ALLELENAME_BLOCK = re.compile("([ACGT]+)\[(\d+)\]")
49
PAT_ALIAS = re.compile("^(?!_).+$")
jhoogenboom's avatar
jhoogenboom committed
50

51
52
53
54
# Patterns that match a single variant.
PAT_VARIANT_STR = re.compile(
    "^(?P<pos>[-+]\d+)(?:\.(?P<ins>1))?"
    "(?P<old>(?:(?<=\.1)-)|(?<!\.1)[ACGT]+)>"
55
    "(?!(?P=old))(?P<new>[ACGT]+|-)$")
56
57
58
PAT_VARIANT_SNP = re.compile(
    "^(?P<pos>\d+)(?:\.(?P<ins>1))?"
    "(?P<old>(?:(?<=\.1)-)|(?<!\.1)[ACGT]+)>"
59
    "(?!(?P=old))(?P<new>[ACGT]+|-)$")
60
61
62
63
PAT_VARIANT_MT = re.compile(
    "^(?P<old>(?P<a>[ACGT])|-?)"
    "(?P<pos>\d+)(?(a)|(?:\.(?P<ins>\d+))?)"
    "(?P<new>[ACGT-]|del)$")
jhoogenboom's avatar
jhoogenboom committed
64
65

# Patterns that match (parts of) an STR definition.
66
PAT_STR_DEF = re.compile("^(?:(?:(?<=^)|(?<!^)\s+)[ACGT]+\s+\d+\s+\d+)*$")
jhoogenboom's avatar
jhoogenboom committed
67
68
69
PAT_STR_DEF_BLOCK = re.compile("([ACGT]+)\s+(\d+)\s+(\d+)")

# Pattern to split a comma-, semicolon-, or space-separated list.
70
PAT_SPLIT = re.compile("\s*[,; \t]\s*")
71
72
PAT_SPLIT_QUOTED = re.compile(
    r""""((?:\\"|[^"])*)"|'((?:\\'|[^'])*)'|(\S+)""")
73
74
75
76

# Pattern that matches a chromosome name/number.
PAT_CHROMOSOME = re.compile(
    "^(?:[Cc][Hh][Rr](?:[Oo][Mm])?)?([1-9XYM]|1\d|2[0-2])$")
jhoogenboom's avatar
jhoogenboom committed
77

jhoogenboom's avatar
jhoogenboom committed
78
79
# Default regular expression to capture sample tags in file names.
# This is the default of the -e command line option.
jhoogenboom's avatar
jhoogenboom committed
80
DEF_TAG_EXPR = "^(.*?)(?:\.[^.]+)?$"
jhoogenboom's avatar
jhoogenboom committed
81
82
83
84
85

# Default formatting template to write sample tags.
# This is the default of the -f command line option.
DEF_TAG_FORMAT = "\\1"

86
87
88
89
90
# Default formatting template to construct output file names for batch
# processing.  \1 and \2 refer to sample tag and tool name.
# This is the default for the -o command line option with batch support.
DEF_OUTFILE_FORMAT = "\\1-\\2.out"

jhoogenboom's avatar
jhoogenboom committed
91
92
93
94
95
96
# IUPAC Table of complementary bases.
COMPL = {"A": "T", "T": "A", "U": "A", "G": "C", "C": "G", "R": "Y", "Y": "R",
         "K": "M", "M": "K", "B": "V", "V": "B", "D": "H", "H": "D",
         "a": "t", "t": "a", "u": "a", "g": "c", "c": "g", "r": "y", "y": "r",
         "k": "m", "m": "k", "b": "v", "v": "b", "d": "h", "h": "d"}

Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
97
98
99
100
101
102
103
104
105
106
107
# IUPAC Table of ambiguous bases.
IUPAC = {"A": "A",   "C": "C",   "G": "G",   "T": "T",  "U": "T",  "W": "AT",
         "S": "CG",  "M": "AC",  "K": "GT",  "R": "AG", "Y": "CT", "B": "CGT",
         "D": "AGT", "H": "ACT", "V": "ACG", "N": "ACGT",
         "a": ("A", ""), "c": ("C", ""), "g": ("G", ""), "t": ("T", ""),
         "u": ("T", ""), "w": ("A", "T", ""), "s": ("C", "G", ""),
         "m": ("A", "C", ""), "k": ("G", "T", ""), "r": ("A", "G", ""),
         "y": ("C", "T", ""), "b": ("C", "G", "T", ""),
         "d": ("A", "G", "T", ""), "h": ("A", "C", "T", ""),
         "v": ("A", "C", "G", ""), "n": ("A", "C", "G", "T", "")}

108
109
110
# Special values that may appear in the place of a sequence.
SEQ_SPECIAL_VALUES = ("No data", "Other sequences")

111
112
113
114
# TextWrapper object for formatting help texts in generated INI files.
INI_COMMENT = textwrap.TextWrapper(width=79, initial_indent="; ",
    subsequent_indent="; ", break_on_hyphens=False)

jhoogenboom's avatar
jhoogenboom committed
115

116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
def get_genome_pos(location, x, invert=False):
    """Get the genome position of the x-th base in a sequence."""
    if invert:
        offset = 0
        for i in range(1, len(location)):
            if i % 2:
                # Starting position.
                pos = location[i]
            elif pos <= x <= location[i]:
                # x is in the current range
                break
            else:
                offset += location[i]-pos+1
        else:
            if len(location) % 2:
                raise ValueError("Position %i is outside sequence range" % x)
        return offset + x - pos
    else:
        for i in range(1, len(location)):
            if i % 2:
                # Starting position.
                pos = location[i]
            elif location[i]-pos < x:
                # x is after this ending position
                x -= location[i]-pos+1
            else:
                # x is before this ending position
                break
        return pos + x
#get_genome_pos


148
def call_variants(template, sequence, location=("?", 1), cache=True,
jhoogenboom's avatar
jhoogenboom committed
149
150
151
                  debug=False):
    """
    Perform a global alignment of sequence to template and return a
152
153
154
    list of variants detected.  The format (nomenclature) of the
    returned variants depends on the location argument.

155
156
157
158
    If location is a tuple ("chromosome name", position), with any
    integer for the position, all variants are given as substitutions in
    the form posX>Y.  Insertions and deletions are written as pos.1->Y
    and posX>-, respectively.  The given position is that of the first
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
159
    base in the template.  With the location set to "suffix", a plus
160
161
162
    sign is prepended to position numbers and the first base in the
    template is pos=1.  With location set to "prefix", a minus sign is
    prepended and bases are counted from right to left instead.
163
164
165
166
167
168

    If location is a tuple ("M", position) with any integer for the
    position, variants are written following the mtDNA nomenclature
    guidelines.  The given position is that of the first base in the
    template.

jhoogenboom's avatar
jhoogenboom committed
169
170
171
172
173
174
175
176
    By default, the results of this function are cached.  Set cache to
    False to suppress caching the result and reduce memory usage.

    Setting debug to True will cause the alignment matrices to be
    printed to sys.stdout.  Be aware that they can be quite large.
    """
    # Saving the results in a cache to avoid repeating alignments.
    try:
177
        return call_variants.cache[template, sequence, location]
jhoogenboom's avatar
jhoogenboom committed
178
    except KeyError:
179
        cache_key = location
jhoogenboom's avatar
jhoogenboom committed
180
181
182
183
184

    row_offset = len(template) + 1
    matrix_match = [0] * row_offset * (len(sequence)+1)
    matrix_gap1 = [-sys.maxint-1] * row_offset * (len(sequence)+1)
    matrix_gap2 = [-sys.maxint-1] * row_offset * (len(sequence)+1)
185
186
187
188
189
190
191
192
193
194
195
196
197
    matrix_direction = [0] * row_offset * (len(sequence)+1)

    # Matrix and arrow enum constants.
    M_MATCH = 0
    M_GAP1 = 1
    M_GAP2 = 2
    A_MATCH = 1
    A_HORZ_O = 2
    A_HORZ_E = 4
    A_VERT_O = 8
    A_VERT_E = 16

    # Settings.
jhoogenboom's avatar
jhoogenboom committed
198
    MATCH_SCORE = 1
199
200
201
    MISMATCH_SCORE = -3
    GAP_OPEN_SCORE = -7
    GAP_EXTEND_SCORE = -2
202
203
204
205
206
207
208
209
    variant_format = "%i%s>%s"

    if location == "prefix":
        location = ("prefix", -len(template))
    elif location == "suffix":
        # Include plus signs for position numbers.
        variant_format = "%+i%s>%s"
        location = ("suffix", 1)
210
    elif type(location) != tuple or len(location) < 2:
211
        raise ValueError("Unknown location %r. It should be 'prefix', "
212
213
            "'suffix', or a tuple (chromosome, position [, endpos])" %
            location)
214
215
216
217
218
    elif location[0] == "M":
        MATCH_SCORE = 1
        MISMATCH_SCORE = -1
        GAP_OPEN_SCORE = -2
        GAP_EXTEND_SCORE = -1
jhoogenboom's avatar
jhoogenboom committed
219
220
221
222
223
224
225
226
227
228
229

    for i in range(len(matrix_match)):
        x = i % row_offset
        y = i / row_offset

        # Initialisation of first row and column.
        if x == 0 or y == 0:
            if x != 0:
                # Top row.
                matrix_gap1[i] = GAP_OPEN_SCORE + GAP_EXTEND_SCORE * (x - 1)
                matrix_match[i] = matrix_gap1[i]
230
                matrix_direction[i] = A_HORZ_E | (A_HORZ_O if x == 1 else 0)
jhoogenboom's avatar
jhoogenboom committed
231
232
233
234
            elif y != 0:
                # Left column.
                matrix_gap2[i] = GAP_OPEN_SCORE + GAP_EXTEND_SCORE * (y - 1)
                matrix_match[i] = matrix_gap2[i]
235
236
237
238
                matrix_direction[i] = A_VERT_E | (A_VERT_O if y == 1 else 0)
            else:
                # Top left corner.
                matrix_direction[i] = A_MATCH
jhoogenboom's avatar
jhoogenboom committed
239
240
241
242
243
244
245
            continue

        if template[x-1] == sequence[y-1]:
            match = MATCH_SCORE
        else:
            match = MISMATCH_SCORE

246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
        options_gap1 = (
            matrix_match[i-1] + GAP_OPEN_SCORE,
            matrix_gap1[i-1] + GAP_EXTEND_SCORE)
        matrix_gap1[i] = max(options_gap1)
        if options_gap1[0] > options_gap1[1]:
            matrix_direction[i] |= A_HORZ_O  # Must exit M_GAP1 here.

        options_gap2 = (
            matrix_match[i-row_offset] + GAP_OPEN_SCORE,
            matrix_gap2[i-row_offset] + GAP_EXTEND_SCORE)
        matrix_gap2[i] = max(options_gap2)
        if options_gap2[0] > options_gap2[1]:
            matrix_direction[i] |= A_VERT_O  # Must exit M_GAP2 here.

        options = (
jhoogenboom's avatar
jhoogenboom committed
261
262
263
            matrix_match[i-1-row_offset] + match,
            matrix_gap1[i],
            matrix_gap2[i])
264
265
266
267
268
269
270
        matrix_match[i] = max(options)
        if options[0] == matrix_match[i]:
            matrix_direction[i] |= A_MATCH  # Can stay in M_MATCH here.
        if options[1] == matrix_match[i]:
            matrix_direction[i] |= A_HORZ_E  # Can enter M_GAP1 here.
        if options[2] == matrix_match[i]:
            matrix_direction[i] |= A_VERT_E  # Can enter M_GAP2 here.
jhoogenboom's avatar
jhoogenboom committed
271
272
273
274
275
276
277
278
279
280
281

    if debug:
        print("GAP1")
        for i in range(0, len(matrix_gap1), row_offset):
            print(("%5i" * row_offset) % tuple(matrix_gap1[i:i+row_offset]))
        print("GAP2")
        for i in range(0, len(matrix_gap2), row_offset):
            print(("%5i" * row_offset) % tuple(matrix_gap2[i:i+row_offset]))
        print("Match")
        for i in range(0, len(matrix_match), row_offset):
            print(("%5i" * row_offset) % tuple(matrix_match[i:i+row_offset]))
282
283
284
285
286
287
288
289
290
291
        print("FLAGS")
        for i in range(0, len(matrix_direction), row_offset):
            print(("%5s|" * row_offset) % tuple("".join([
                "h" if x & A_HORZ_O else " ",
                "H" if x & A_HORZ_E else " ",
                "D" if x & A_MATCH else " ",
                "V" if x & A_VERT_E else " ",
                "v" if x & A_VERT_O else " "
            ]) for x in matrix_direction[i:i+row_offset]))
        print("Traceback")
jhoogenboom's avatar
jhoogenboom committed
292
293
294
295
296
297
298


    # Backtracking.
    variants = []
    variant_template = 0
    variant_sequence = 0
    i = len(matrix_match) - 1
299
    in_matrix = M_MATCH  # May change before first step.
jhoogenboom's avatar
jhoogenboom committed
300
301
302
    while i >= 0:
        x = i % row_offset
        y = i / row_offset
303
304
305
306
307
308
309
310
311
312
313
        if debug:
            print("(%i, %i)" % (x,y))

        if in_matrix == M_MATCH:
            # Make gaps as soon as possible (pushed right).
            if matrix_direction[i] & A_HORZ_E:
                in_matrix = M_GAP1
            elif matrix_direction[i] & A_VERT_E:
                in_matrix = M_GAP2
            elif not (matrix_direction[i] & A_MATCH):
                raise ValueError(
314
315
                    "Alignment error: Dead route! (This is a bug.) [%s,%s]" %
                    (template,sequence))
316
317

        if in_matrix == M_GAP1:
jhoogenboom's avatar
jhoogenboom committed
318
319
            # Go horizontally.  Deletion.
            variant_template += 1
320
321
322
            if matrix_direction[i] & A_HORZ_O:
                # End of gap, go diagonally after this.
                in_matrix = M_MATCH
jhoogenboom's avatar
jhoogenboom committed
323
324
325
            i -= 1
            continue

326
        if in_matrix == M_GAP2:
jhoogenboom's avatar
jhoogenboom committed
327
328
            # Go vertically.  Insertion.
            variant_sequence += 1
329
330
331
            if matrix_direction[i] & A_VERT_O:
                # End of gap, go diagonally after this.
                in_matrix = M_MATCH
jhoogenboom's avatar
jhoogenboom committed
332
333
334
335
            i -= row_offset
            continue

        # Go diagonally.  Either match or mismatch.
336
337
338
339
340
341
        if i != 0 and template[x - 1] != sequence[y - 1]:
            # Start/extend mismatch.
            variant_template += 1
            variant_sequence += 1

        else:
jhoogenboom's avatar
jhoogenboom committed
342
343
            # Match.  Flush variants.
            if variant_template or variant_sequence:
344
345
346
347
348
349
                if location[0] == "M":
                    # MtDNA variants are one-base-at-a-time.
                    for j in range(
                            max(variant_template, variant_sequence)-1, -1, -1):
                        variants.append("%s%i%s%s" % (
                            template[x+j] if j < variant_template else "",#"-",
350
351
                            get_genome_pos(
                                location, x + min(j, variant_template-1)),
352
353
                            ".%i" % (j-variant_template+1)
                                if j >= variant_template else "",
354
                            sequence[y+j] if j < variant_sequence else "del"))
355
                elif variant_template == 0:
jhoogenboom's avatar
jhoogenboom committed
356
                    # Insertions: "-131.1->C" instead of "-130->C".
357
                    variants.append(variant_format % (
358
                        get_genome_pos(location, x - 1),
359
                        ".1-",
jhoogenboom's avatar
jhoogenboom committed
360
361
                        sequence[y:y+variant_sequence]))
                else:
362
                    variants.append(variant_format % (
363
                        get_genome_pos(location, x),
jhoogenboom's avatar
jhoogenboom committed
364
365
366
367
368
369
                        template[x:x+variant_template],
                        sequence[y:y+variant_sequence] or "-"))
                variant_template = 0
                variant_sequence = 0
        i -= 1 + row_offset

370
371
    # Variants were called from right to left.  Reverse their order.
    if location[0] != "prefix":
jhoogenboom's avatar
jhoogenboom committed
372
373
374
375
        variants.reverse()

    # Store the result in the cache.
    if cache:
376
        call_variants.cache[template, sequence, cache_key] = variants
jhoogenboom's avatar
jhoogenboom committed
377
378
379
380
381
    return variants
#call_variants
call_variants.cache = {}


382
def mutate_sequence(seq, variants, location=None):
jhoogenboom's avatar
jhoogenboom committed
383
    """Apply the given variants to the given sequence."""
384
    if type(location) != tuple or len(location) < 2:
385
        pattern = PAT_VARIANT_STR
386
        location = (None, 0)
387
388
    elif location[0] == "M":
        pattern = PAT_VARIANT_MT
389
        location = (location[0], location[1]-1) + tuple(location[2:])
390
391
    else:
        pattern = PAT_VARIANT_SNP
392
        location = (location[0], location[1]-1) + tuple(location[2:])
393
394

    seq = [[]] + [[base] for base in seq]
jhoogenboom's avatar
jhoogenboom committed
395
    for variant in variants:
396
        vm = pattern.match(variant)
jhoogenboom's avatar
jhoogenboom committed
397
398
        if vm is None:
            raise ValueError("Unrecognised variant '%s'" % variant)
399
400
401
402
        pos = int(vm.group("pos"))
        ins = int(vm.group("ins") or 0)
        old = vm.group("old")
        new = vm.group("new")
jhoogenboom's avatar
jhoogenboom committed
403
404
        if old == "-":
            old = ""
405
        if new == "-" or new == "del":
jhoogenboom's avatar
jhoogenboom committed
406
407
            new = ""
        if pos < 0:
408
            pos += len(seq)
409
410
        pos = get_genome_pos(location, pos, True)
        if pos < 0 or (pos == 0 and not ins) or pos >= len(seq):
411
            raise ValueError(
412
413
                "Position of variant '%s' is outside sequence range" %
                    (variant))
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
        if (not ins and old and old != "".join("".join(x[:1])
                for x in seq[pos:pos+len(old)])):
            raise ValueError(
                "Incorrect original sequence in variant '%s'; should be '%s'!"
                % (variant, "".join("".join(x[:1])
                    for x in seq[pos:pos+len(old)])))
        elif not ins and not old:
            # MtDNA substitution with reference base omitted.
            old = "".join("".join(x[:1]) for x in seq[pos:pos+len(new)])
        if not ins:
            # Remove old bases, retaining those inserted between/after.
            seq[pos:pos+len(old)] = [
                [""] + x[1:] for x in seq[pos:pos+len(old)]]
            # Place new entirely in the position of the first old base.
            seq[pos][0] = new
jhoogenboom's avatar
jhoogenboom committed
429
        else:
430
431
432
433
434
            # Insert new exactly ins positions after pos.
            while len(seq[pos]) <= ins:
                seq[pos].append("")
            seq[pos][ins] = new
    return "".join("".join(x) for x in seq)
jhoogenboom's avatar
jhoogenboom committed
435
436
437
#mutate_sequence


Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
def iupac_expand_ambiguous(seq):
    """Return generator for all possible values of ambiguous seq."""
    return ("".join(x) for x in itertools.product(
        *((b for b in IUPAC[a]) for a in seq)))
#iupac_expand_ambiguous


def iupac_pattern_ambiguous(seq):
    """Return regex pattern that matches the ambiguous seq."""
    return "".join(
        (("[%s]" if len(IUPAC[x.upper()]) > 1 else "%s") +
        ("?" if x.islower() else "")) % IUPAC[x.upper()] for x in seq)
#iupac_expand_ambiguous


453
def parse_library(libfile, stream=False):
jhoogenboom's avatar
jhoogenboom committed
454
    try:
455
        if not stream:
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
            libfile = sys.stdin if libfile == "-" else open(libfile, "r")
        if libfile == sys.stdin:
            # Can't seek on pipes, so read it into a buffer first.
            libfile = StringIO(sys.stdin.read())
        try:
            library = parse_library_ini(libfile)
            if not stream:
                libfile.close()
            return library
        except MissingSectionHeaderError:
            # Not an ini file.
            pass
        libfile.seek(0)
        library = parse_library_tsv(libfile)
        if not stream and libfile != sys.stdin:
471
472
            libfile.close()
        return library
473
474
    except ValueError as err:
        raise argparse.ArgumentTypeError(err)
jhoogenboom's avatar
jhoogenboom committed
475
476
477
478
479
480
481
482
483
484
485
486
487
488
#parse_library


def parse_library_tsv(handle):
    """
    Parse a TSSV library file (tab-separated values format).

    The provided file should contain at least four columns: marker name,
    left flanking sequence, right flanking sequence, and STR definition.

    Return a nested dict with top-level keys "flanks" and "regex".
    """
    library = {
      "flanks": {},
489
      "regex": {},
490
      "blocks_middle": {}
jhoogenboom's avatar
jhoogenboom committed
491
492
    }
    for line in handle:
jhoogenboom's avatar
jhoogenboom committed
493
        line = [x.strip() for x in line.rstrip("\r\n").split("\t")]
jhoogenboom's avatar
jhoogenboom committed
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
        if line == [""]:
            continue
        if len(line) < 4:
            raise ValueError(
                "Invalid library file: encountered line with %i columns, "
                "need at least 4" % len(line))
        marker = line[0]
        if PAT_SEQ_RAW.match(line[1]) is None:
            raise ValueError("Flanking sequence '%s' of marker %s is invalid" %
                             (line[1], marker))
        if PAT_SEQ_RAW.match(line[2]) is None:
            raise ValueError("Flanking sequence '%s' of marker %s is invalid" %
                             (line[2], marker))
        if PAT_STR_DEF.match(line[3]) is None:
            raise ValueError("STR definition '%s' of marker %s is invalid" %
                             (line[3], marker))
        library["flanks"][marker] = line[1:3]
511
512
513
        library["blocks_middle"][marker] = [
            (block[0], int(block[1]), int(block[2])) for block in
                PAT_STR_DEF_BLOCK.findall(line[3])]
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
514
        # NOTE: Libconvert tool expects "(seq){num,num}" blocks ONLY!
515
        library["regex"][marker] = re.compile(
516
517
518
            "".join(("^", "".join(
                "(%s){%s,%s}" % x for x in PAT_STR_DEF_BLOCK.findall(line[3])),
                "$")))
jhoogenboom's avatar
jhoogenboom committed
519
520
521
522
523
524
525
526
527
528
    return library
#parse_library_tsv


def parse_library_ini(handle):
    library = {
      "flanks": {},
      "prefix": {},
      "suffix": {},
      "regex": {},
529
      "blocks_middle": {},
530
531
      "nostr_reference": {},
      "genome_position": {},
532
533
      "length_adjust": {},
      "block_length": {},
534
      "max_expected_copies": {},
535
      "expected_length": {},
jhoogenboom's avatar
jhoogenboom committed
536
537
538
539
540
541
542
543
544
545
      "aliases": {}
    }
    markers = set()

    ini = RawConfigParser()
    ini.optionxform = str
    ini.readfp(handle)
    for section in ini.sections():
        for marker in ini.options(section):
            value = ini.get(section, marker)
546
547
            section_low = section.lower()
            if section_low == "flanks":
jhoogenboom's avatar
jhoogenboom committed
548
549
550
551
552
553
554
555
556
557
558
559
                values = PAT_SPLIT.split(value)
                if len(values) != 2:
                    raise ValueError(
                        "For marker %s, %i flanking sequences were given,"
                        "need exactly 2" % (marker, len(values)))
                for value in values:
                    if PAT_SEQ_RAW.match(value) is None:
                        raise ValueError(
                            "Flanking sequence '%s' of marker %s is invalid" %
                            (value, marker))
                library["flanks"][marker] = values
                markers.add(marker)
560
            elif section_low == "prefix":
561
562
563
                if marker in library["nostr_reference"]:
                    raise ValueError(
                        "A prefix was defined for non-STR marker %s" % marker)
jhoogenboom's avatar
jhoogenboom committed
564
                values = PAT_SPLIT.split(value)
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
565
566
567
                for i in range(len(values)):
                    if (not i and PAT_SEQ_RAW.match(values[i]) is None or
                            i and PAT_SEQ_IUPAC.match(values[i]) is None):
jhoogenboom's avatar
jhoogenboom committed
568
569
                        raise ValueError(
                            "Prefix sequence '%s' of marker %s is invalid" %
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
570
                            (values[i], marker))
571
                library["prefix"][marker] = values
jhoogenboom's avatar
jhoogenboom committed
572
                markers.add(marker)
573
            elif section_low == "suffix":
574
575
576
                if marker in library["nostr_reference"]:
                    raise ValueError(
                        "A suffix was defined for non-STR marker %s" % marker)
jhoogenboom's avatar
jhoogenboom committed
577
                values = PAT_SPLIT.split(value)
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
578
579
580
                for i in range(len(values)):
                    if (not i and PAT_SEQ_RAW.match(values[i]) is None or
                            i and PAT_SEQ_IUPAC.match(values[i]) is None):
jhoogenboom's avatar
jhoogenboom committed
581
582
                        raise ValueError(
                            "Suffix sequence '%s' of marker %s is invalid" %
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
583
                            (values[i], marker))
584
                library["suffix"][marker] = values
jhoogenboom's avatar
jhoogenboom committed
585
                markers.add(marker)
586
587
588
589
590
591
592
            elif section_low == "genome_position":
                values = PAT_SPLIT.split(value)
                chromosome = PAT_CHROMOSOME.match(values[0])
                if chromosome is None:
                    raise ValueError(
                        "Invalid chromosome '%s' for marker %s." %
                        (values[0], marker))
593
594
595
596
597
598
599
600
601
602
603
604
605
                pos = [chromosome.group(1)]
                for i in range(1, len(values)):
                    try:
                        pos.append(int(values[i]))
                    except:
                        raise ValueError(
                            "Position '%s' of marker %s is not a valid integer"
                            % (values[i], marker))
                    if not i % 2 and pos[-2] >= pos[-1]:
                        raise ValueError(
                            "End position %i of marker %s must be higher than "
                            "corresponding start position %i" %
                            (pos[-1], marker, pos[-2]))
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
606
607
                if len(values) == 1:
                    pos.append(1)
608
                library["genome_position"][marker] = tuple(pos)
609
                markers.add(marker)
610
            elif section_low == "length_adjust":
jhoogenboom's avatar
jhoogenboom committed
611
612
613
614
615
616
617
618
                try:
                    value = int(value)
                except:
                    raise ValueError(
                        "Length adjustment '%s' of marker %s is not a valid "
                        "integer" % (value, marker))
                library["length_adjust"][marker] = value
                markers.add(marker)
619
            elif section_low == "block_length":
jhoogenboom's avatar
jhoogenboom committed
620
621
622
623
624
625
626
627
                try:
                    value = int(value)
                except:
                    raise ValueError(
                        "Block length '%s' of marker %s is not a valid integer"
                        % (value, marker))
                library["block_length"][marker] = value
                markers.add(marker)
628
629
630
631
632
633
634
635
636
637
            elif section_low == "max_expected_copies":
                try:
                    value = int(value)
                except:
                    raise ValueError(
                        "Maximum number of expected copies '%s' of marker %s "
                        "is not a valid integer" % (value, marker))
                library["max_expected_copies"][marker] = value
                markers.add(marker)
            elif section_low == "aliases":
jhoogenboom's avatar
jhoogenboom committed
638
639
640
641
642
                values = PAT_SPLIT.split(value)
                if len(values) != 3:
                    raise ValueError("Alias %s does not have 3 values, but %i"
                                     % (marker, len(values)))
                if PAT_SEQ_RAW.match(values[1]) is None:
643
644
645
646
647
648
649
                    raise ValueError(
                        "Alias sequence '%s' of alias %s is invalid" %
                        (values[1], marker))
                if PAT_ALIAS.match(values[2]) is None:
                    raise ValueError(
                        "Allele name '%s' of alias %s is invalid" %
                        (values[2], marker))
jhoogenboom's avatar
jhoogenboom committed
650
651
652
653
654
655
                library["aliases"][marker] = {
                    "marker": values[0],
                    "sequence": values[1],
                    "name": values[2]
                }
                markers.add(marker)
656
            elif section_low == "repeat":
657
658
659
660
                if marker in library["nostr_reference"]:
                    raise ValueError(
                        "Marker %s was encountered in both [repeat] and "
                        "[no_repeat] sections" % marker)
jhoogenboom's avatar
jhoogenboom committed
661
662
663
664
665
666
                if PAT_STR_DEF.match(value) is None:
                    raise ValueError(
                        "STR definition '%s' of marker %s is invalid" %
                        (value, marker))
                library["regex"][marker] = value
                markers.add(marker)
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
            elif section_low == "no_repeat":
                if marker in library["regex"]:
                    raise ValueError(
                        "Marker %s was encountered in both [repeat] and "
                        "[no_repeat] sections" % marker)
                if marker in library["prefix"] or marker in library["suffix"]:
                    raise ValueError(
                        "A prefix or suffix was defined for non-STR marker %s"
                        % marker)
                if PAT_SEQ_RAW.match(value) is None:
                    raise ValueError(
                        "Reference sequence '%s' of marker %s is invalid" %
                        (value, marker))
                library["nostr_reference"][marker] = value
                markers.add(marker)
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
            elif section_low == "expected_allele_length":
                values = PAT_SPLIT.split(value)
                try:
                    min_length = int(values[0])
                except:
                    raise ValueError(
                        "Minimum expected allele length '%s' of marker %s "
                        "is not a valid integer" % (values[0], marker))
                if len(values) > 1:
                    try:
                        max_length = int(values[1])
                    except:
                        raise ValueError(
                            "Maximum expected allele length '%s' of marker %s "
                            "is not a valid integer" % (values[1], marker))
                else:
                    max_length = sys.maxint
                library["expected_length"][marker] = (min_length, max_length)
                markers.add(marker)
701
702
703
704
705
706
707
708

    # Sanity check: prohibit prefix/suffix for aliases of non-STRs.
    for alias in library["aliases"]:
        if library["aliases"][alias]["marker"] in library["nostr_reference"] \
                and (alias in library["prefix"] or alias in library["suffix"]):
            raise ValueError(
                "A prefix or suffix was defined for alias %s of non-STR "
                "marker %s" % (alias, library["aliases"][alias]["marker"]))
jhoogenboom's avatar
jhoogenboom committed
709

710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
    # Sanity check: end position of marker should reflect ref length.
    for marker in library["genome_position"]:
        if marker not in library["nostr_reference"]:
            continue
        pos = library["genome_position"][marker]
        reflength = len(library["nostr_reference"][marker])
        length = 0
        for i in range(2, len(pos), 2):
            length += pos[i] - pos[i-1] + 1
        if reflength < length or (len(pos) % 2 and reflength != length):
            raise ValueError(
                "Length of reference sequence of marker %s is %i bases, but "
                "genome positions add up to %i bases" %
                (marker, reflength, length))

jhoogenboom's avatar
jhoogenboom committed
725
726
727
    # Compile regular expressions.
    for marker in markers:
        parts = []
728
        blocksm = []
jhoogenboom's avatar
jhoogenboom committed
729
        if marker in library["prefix"]:
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
730
731
732
733
734
735
736
            parts += ("(%s){0,1}" % iupac_pattern_ambiguous(x)
                for x in library["prefix"][marker])
        elif (marker in library["aliases"] and
                library["aliases"][marker]["marker"] in library["prefix"]):
            parts += ("(%s){0,1}" % iupac_pattern_ambiguous(x)
                for x in library["prefix"][
                    library["aliases"][marker]["marker"]])
jhoogenboom's avatar
jhoogenboom committed
737
        if marker in library["aliases"]:
738
            blocksm.append((library["aliases"][marker]["sequence"], 0, 1))
jhoogenboom's avatar
jhoogenboom committed
739
        elif marker in library["regex"]:
740
741
            blocksm += [(block[0], int(block[1]), int(block[2])) for block in
                        PAT_STR_DEF_BLOCK.findall(library["regex"][marker])]
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
742
        parts += ("(%s){%s,%s}" % x for x in blocksm)
jhoogenboom's avatar
jhoogenboom committed
743
        if marker in library["suffix"]:
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
744
745
746
747
748
749
750
            parts += ("(%s){0,1}" % iupac_pattern_ambiguous(x)
                for x in library["suffix"][marker])
        elif (marker in library["aliases"] and
                library["aliases"][marker]["marker"] in library["suffix"]):
            parts += ("(%s){0,1}" % iupac_pattern_ambiguous(x)
                for x in library["suffix"][
                    library["aliases"][marker]["marker"]])
jhoogenboom's avatar
jhoogenboom committed
751
752
753
        if parts:
            library["regex"][marker] = re.compile(
                "".join(["^"] + parts + ["$"]))
754
755
        if blocksm:
            library["blocks_middle"][marker] = blocksm
jhoogenboom's avatar
jhoogenboom committed
756
757
    return library
#parse_library_ini
jhoogenboom's avatar
jhoogenboom committed
758
759


760
def load_profiles(profilefile, library=None):
761
    # TODO: To be replaced with load_profiles_new (less memory).
jhoogenboom's avatar
jhoogenboom committed
762
    column_names = profilefile.readline().rstrip("\r\n").split("\t")
763
764
    if column_names == [""]:
        return {}  # Empty file.
765
766
767
    (colid_marker, colid_allele, colid_sequence, colid_fmean, colid_rmean,
     colid_tool) = get_column_ids(column_names, "marker", "allele", "sequence",
        "fmean", "rmean", "tool")
jhoogenboom's avatar
jhoogenboom committed
768
769
770
771
772
773
774
775
776
777
778
779
780

    profiles = {}
    for line in profilefile:
        line = line.rstrip("\r\n").split("\t")
        if line == [""]:
            continue
        marker = line[colid_marker]
        if marker not in profiles:
            profiles[marker] = {
                "m": set(),  # To be replaced by its length below.
                "n": set(),  # To be replaced by its length below.
                "seqs": [],
                "forward": {},  # To be replaced by a list below.
781
782
                "reverse": {},  # To be replaced by a list below.
                "tool": {}  # To be replaced by a list below.
jhoogenboom's avatar
jhoogenboom committed
783
784
785
786
787
788
789
790
791
792
793
794
                }
        allele = ensure_sequence_format(line[colid_allele], "raw",
            library=library, marker=marker)
        sequence = ensure_sequence_format(line[colid_sequence], "raw",
            library=library, marker=marker)
        if (allele, sequence) in profiles[marker]["forward"]:
            raise ValueError(
                "Invalid background noise profiles file: encountered "
                "multiple values for marker '%s' allele '%s' sequence '%s'" %
                (marker, allele, sequence))
        profiles[marker]["forward"][allele,sequence] = float(line[colid_fmean])
        profiles[marker]["reverse"][allele,sequence] = float(line[colid_rmean])
795
        profiles[marker]["tool"][allele, sequence] = line[colid_tool]
jhoogenboom's avatar
jhoogenboom committed
796
797
798
799
800
801
802
803
804
805
        profiles[marker]["m"].update((allele, sequence))
        profiles[marker]["n"].add(allele)

    # Check completeness and reorder true alleles.
    for marker in profiles:
        profiles[marker]["seqs"] = list(profiles[marker]["n"]) + \
            list(profiles[marker]["m"]-profiles[marker]["n"])
        profiles[marker]["n"] = len(profiles[marker]["n"])
        profiles[marker]["m"] = len(profiles[marker]["m"])
        newprofiles = {"forward": [], "reverse": []}
806
        tools = []
jhoogenboom's avatar
jhoogenboom committed
807
808
809
810
        for i in range(profiles[marker]["n"]):
            allele = profiles[marker]["seqs"][i]
            for direction in newprofiles:
                newprofiles[direction].append([0] * profiles[marker]["m"])
811
            tools.append([""] * profiles[marker]["m"])
jhoogenboom's avatar
jhoogenboom committed
812
813
814
815
816
817
            for j in range(profiles[marker]["m"]):
                sequence = profiles[marker]["seqs"][j]
                if (allele, sequence) in profiles[marker]["forward"]:
                    for direction in newprofiles:
                        newprofiles[direction][i][j] = \
                            profiles[marker][direction][allele, sequence]
818
                    tools[i][j] = profiles[marker]["tool"][allele, sequence]
jhoogenboom's avatar
jhoogenboom committed
819
820
        profiles[marker]["forward"] = newprofiles["forward"]
        profiles[marker]["reverse"] = newprofiles["reverse"]
821
        profiles[marker]["tool"] = tools
jhoogenboom's avatar
jhoogenboom committed
822
823

    return profiles
824
#load_profiles
825
826


827
828
829
def load_profiles_new(profilefile, library=None):
    # TODO, rename this to load_profiles to complete transition.
    column_names = profilefile.readline().rstrip("\r\n").split("\t")
830
831
    if column_names == [""]:
        return {}  # Empty file.
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
    (colid_marker, colid_allele, colid_sequence, colid_fmean, colid_rmean,
     colid_tool) = get_column_ids(column_names, "marker", "allele", "sequence",
        "fmean", "rmean", "tool")

    profiles = {}
    for line in profilefile:
        line = line.rstrip("\r\n").split("\t")
        if line == [""]:
            continue
        marker = line[colid_marker]
        if marker not in profiles:
            profiles[marker] = {}
        allele = ensure_sequence_format(line[colid_allele], "raw",
            library=library, marker=marker)
        sequence = ensure_sequence_format(line[colid_sequence], "raw",
            library=library, marker=marker)
        if allele not in profiles[marker]:
            profiles[marker][allele] = {}
        elif sequence in profiles[marker][allele]:
            raise ValueError(
                "Invalid background noise profiles file: encountered "
                "multiple values for marker '%s' allele '%s' sequence '%s'" %
                (marker, allele, sequence))
        profiles[marker][allele][sequence] = {
            "forward": float(line[colid_fmean]),
            "reverse": float(line[colid_rmean]),
            "tool": line[colid_tool]}

    return profiles
#load_profiles_new


864
def pattern_longest_match(pattern, subject):
865
    """Return the longest match of the pattern in the subject string."""
866
867
868
869
870
    # FIXME, this function tries only one match at each position in the
    # sequence, which is not neccessarily the longest match at that
    # position. For now, we'll search the reverse sequence as well.
    # Re-implement this without regular expressions to test all options.
    reverse = False
871
872
    match = None
    pos = 0
873
    pat = re.compile("".join("(%s){%i,%i}" % x for x in pattern))
874
    while pos < len(subject):
875
        m = pat.search(subject, pos)
876
877
878
879
880
        if m is None:
            break
        if match is None or m.end()-m.start() > match.end()-match.start():
            match = m
        pos = m.start() + 1
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929

    # Try to find a longer match from the other end.
    if match is not None:
        subject = reverse_complement(subject)
        pos = 0
        pat = re.compile("".join(
            "(%s){%i,%i}" % (reverse_complement(x[0]), x[1], x[2])
            for x in reversed(pattern)))
        while pos < len(subject):
            m = pat.search(subject, pos)
            if m is None:
                break
            if m.end()-m.start() > match.end()-match.start():
                match = m
                reverse = True
            pos = m.start() + 1

    # Extract the blocks from the match.
    match = [] if match is None or not match.group() else reduce(
        lambda x, i: (
            x[0] + [match.group(i)]*((match.end(i)-x[1])/len(match.group(i))),
            match.end(i)) if match.group(i) else x,
        range(1, match.lastindex+1), ([], match.start()))[0]

    # Return the match in the same sequence orientation as the input.
    return map(reverse_complement, reversed(match)) if reverse else match
#pattern_longest_match


def pattern_longest_match_veryslow(pattern, subject):
    """Return the longest match of the pattern in the subject string."""
    longest = 0
    the_match = []
    # Generate all possible matching sequences for this pattern.
    #print("Finding match of pattern %r to sequence %s" % (pattern, subject))
    for matching_blocks in itertools.product(*(
            [[block[0]]*i for i in range(block[1], block[2]+1)]
            for block in pattern)):
        matching = itertools.chain.from_iterable(matching_blocks)
        matching_seq = "".join(matching)
        matching_len = len(matching_seq)
        if matching_len <= longest:
            continue
        if matching_seq in subject:
            longest = matching_len
            the_match = matching
    #print("Found match covering %i/%i bases" % (longest, len(subject)))
    return the_match
#pattern_longest_match_veryslow
930
931


jhoogenboom's avatar
jhoogenboom committed
932
933
934
def detect_sequence_format(seq):
    """Return format of seq.  One of 'raw', 'tssv', or 'allelename'."""
    if not seq:
jhoogenboom's avatar
jhoogenboom committed
935
        raise ValueError("Empty sequence")
936
    if seq in SEQ_SPECIAL_VALUES:
937
938
        # Special case.
        return False
jhoogenboom's avatar
jhoogenboom committed
939
940
941
942
    if PAT_SEQ_RAW.match(seq):
        return 'raw'
    if PAT_SEQ_TSSV.match(seq):
        return 'tssv'
943
944
    if PAT_SEQ_ALLELENAME_STR.match(seq) or PAT_SEQ_ALLELENAME_MT.match(seq) \
            or PAT_SEQ_ALLELENAME_SNP.match(seq):
jhoogenboom's avatar
jhoogenboom committed
945
        return 'allelename'
jhoogenboom's avatar
jhoogenboom committed
946
    raise ValueError("Unrecognised sequence format")
jhoogenboom's avatar
jhoogenboom committed
947
948
949
#detect_sequence_format


jhoogenboom's avatar
jhoogenboom committed
950
def convert_sequence_tssv_raw(seq):
jhoogenboom's avatar
jhoogenboom committed
951
952
    return "".join(block[0] * int(block[1])
                   for block in PAT_TSSV_BLOCK.findall(seq))
jhoogenboom's avatar
jhoogenboom committed
953
954
955
956
957
958
959
960
#convert_sequence_tssv_raw


def convert_sequence_raw_tssv(seq, library, marker, return_alias=False):
    # Try to match this marker's pattern, or any of its aliases.
    match = None
    if "aliases" in library:
        for alias in library["aliases"]:
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
961
            if library["aliases"][alias]["marker"] == marker:
jhoogenboom's avatar
jhoogenboom committed
962
963
964
965
966
967
                match = library["regex"][alias].match(seq)
                if match is not None:
                    marker = alias
                    break
    if match is None and marker in library["regex"]:
        match = library["regex"][marker].match(seq)
968
    if match is not None:
jhoogenboom's avatar
jhoogenboom committed
969
970
        parts = ((match.group(i), match.end(i)) for i in range(1, 1 if
            match.lastindex is None else match.lastindex+1) if match.group(i))
jhoogenboom's avatar
jhoogenboom committed
971

972
    # Use heuristics if the sequence does not match the pattern.
jhoogenboom's avatar
jhoogenboom committed
973
    else:
974
975
976
977

        # Find explictily provided prefix and/or suffix if present.
        pre_suf = ["", ""]
        if "prefix" in library and marker in library["prefix"]:
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
978
979
            for prefix in (x for x in library["prefix"][marker]
                    for x in iupac_expand_ambiguous(x)):
980
981
982
983
984
                if seq.startswith(prefix):
                    pre_suf[0] = prefix
                    seq = seq[len(prefix):]
                    break
        if "suffix" in library and marker in library["suffix"]:
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
985
986
            for suffix in (x for x in library["suffix"][marker]
                    for x in iupac_expand_ambiguous(x)):
987
988
989
990
991
992
                if seq.endswith(suffix):
                    pre_suf[1] = suffix
                    seq = seq[:-len(suffix)]
                    break

        # Find longest match of middle pattern.
993
        middle = [(seq, len(pre_suf[0])+len(seq))] if seq else []
994
995
996
997
        if middle and marker in library["blocks_middle"]:
            match = pattern_longest_match(library["blocks_middle"][marker],seq)
            matched = "".join(match)
            if matched:
998
999
1000
1001
1002
1003

                # If this allele does not match the prefix of this
                # marker, but the canonical prefix of the marker ends
                # with the same sequence as the start of our match, we
                # move that portion of the match into the prefix.
                # Then, we do the same thing with the suffix.
1004
1005
1006
1007
1008
                middle = match
                match_start = seq.index(matched)
                match_end = match_start + len(matched)
                start = match_start
                end = match_end
1009
1010
1011
1012
                modified = False
                if (not pre_suf[0] and "prefix" in library
                        and marker in library["prefix"]):
                    ref = library["prefix"][marker][0]
1013
1014
1015
1016
1017
1018
1019
1020
1021
                    if start != len(ref):
                        i = min(len(ref), len(matched))
                        while i > 0:
                            if ref.endswith(matched[:i]):
                                start += i
                                matched = matched[i:]
                                modified = True
                                break
                            i -= 1
1022
1023
1024
                if (not pre_suf[1] and "suffix" in library
                        and marker in library["suffix"]):
                    ref = library["suffix"][marker][0]
1025
1026
1027
1028
1029
1030
1031
1032
1033
                    if len(seq) - end != len(ref):
                        i = min(len(ref), len(matched))
                        while i > 0:
                            if ref.startswith(matched[-i:]):
                                end -= i
                                matched = matched[:-i]
                                modified = True
                                break
                            i -= 1
1034
                if modified:
1035
1036
                    from_start = start - match_start
                    from_end = match_end - end
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
                    while from_start:
                        if from_start < len(middle[0]):
                            middle[0] = middle[0][from_start:]
                            break
                        else:
                            from_start -= len(middle[0])
                            middle = middle[1:]
                    while from_end:
                        if from_end < len(middle[-1]):
                            middle[-1] = middle[-1][:-from_end]
                            break
                        else:
                            from_end -= len(middle[-1])
                            middle = middle[:-1]
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
                if start:
                    if pre_suf[0]:
                        middle.insert(0, seq[:start])
                        matched = seq[:start] + matched
                    else:
                        pre_suf[0] = seq[:start]
                if end < len(seq):
                    if pre_suf[1]:
                        middle.append(seq[end:])
                        matched = matched + seq[end:]
                    else:
                        pre_suf[1] = seq[end:]
1063
1064
1065
1066
1067
                if middle:
                    middle = reduce(
                        lambda x, y: (x[:-1] if x[-1][0] == y else x) +
                            [(y, x[-1][1]+len(y))], middle[1:],
                            [(middle[0],
1068
                              len(middle[0])+len(pre_suf[0]))])
1069
                seq = matched
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084

        # Now construct parts.
        parts = []
        if pre_suf[0]:
            parts.append((pre_suf[0], len(pre_suf[0])))
        parts += middle
        if pre_suf[1]:
            parts.append((pre_suf[1], sum(map(len,pre_suf))+len(seq)))

    seq = reduce(
        lambda a, b: (a[0] + "%s(%i)" % (b[0], (b[1]-a[1])/len(b[0])), b[1]),
        reduce(
            lambda x, y: x[:-1] + [y] if x[-1][0] == y[0] else x + [y],
            parts,
            [("", 0)]))[0]
jhoogenboom's avatar
jhoogenboom committed
1085
1086
1087
1088
1089
1090
    return (seq, marker) if return_alias else seq
#convert_sequence_raw_tssv


def convert_sequence_allelename_tssv(seq, library, marker):
    # Check whether there is an alias for this sequence.
1091
    alias_of = None
jhoogenboom's avatar
jhoogenboom committed
1092
1093
1094
1095
1096
    if "aliases" in library:
        for alias in library["aliases"]:
            if library["aliases"][alias]["marker"] == marker and (
                    seq == library["aliases"][alias]["name"] or
                    seq.startswith(library["aliases"][alias]["name"] + "_")):
1097
                alias_of = marker
jhoogenboom's avatar
jhoogenboom committed
1098
1099
1100
1101
1102
1103
1104
                marker = alias
                seq = "".join([
                    "0_",
                    library["aliases"][alias]["sequence"] + "[1]",
                    seq[len(library["aliases"][alias]["name"]):]])
                break

1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
    nameformat = None
    if PAT_SEQ_ALLELENAME_MT.match(seq) is not None:
        nameformat = "MtDNA"
    elif PAT_SEQ_ALLELENAME_SNP.match(seq) is not None:
        nameformat = "SNP"
    if nameformat is not None:
        # MtDNA and SNP markers.
        try:
            reference = library["nostr_reference"][marker]
        except KeyError:
            raise ValueError(
                "%s allele '%s' found for marker %s, but "
                "no reference sequence was found in the library" %
                (nameformat, seq, marker))
        if seq == "REF":
            return reference + "(1)"
        return mutate_sequence(reference, seq.split(),
            library["genome_position"].get(marker,
                ("M" if nameformat == "MtDNA" else "", 1))) + "(1)"

    # Note: aliases of mtDNA and SNP markers end up here as well.
    # It should NOT look like an alias now, however.
    match = PAT_SEQ_ALLELENAME_STR.match(seq)
1128
1129
1130
    if match is None or match.group(1) is not None:
        raise ValueError("Invalid allele name '%s' encountered!" % seq)

jhoogenboom's avatar
jhoogenboom committed
1131
1132
1133
    allele = seq.split("_")

    # Get and mutate prefix and suffix.
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
    prefix = ""
    suffix = ""
    if "prefix" in library:
        if marker in library["prefix"]:
            prefix = library["prefix"][marker][0]
        elif alias_of is not None and alias_of in library["prefix"]:
            prefix = library["prefix"][alias_of][0]
    if "suffix" in library:
        if marker in library["suffix"]:
            suffix = library["suffix"][marker][0]
        elif alias_of is not None and alias_of in library["suffix"]:
            suffix = library["suffix"][alias_of][0]
jhoogenboom's avatar
jhoogenboom committed
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
    variants = [[], []]
    for variant in allele[2:]:
        if variant[0] == "-":
            if not prefix:
                raise ValueError("Encountered prefix variant '%s', but marker "
                                 "'%s' has no prefix!" % (variant, marker))
            variants[0].append(variant)
        elif variant[0] == "+":
            if not suffix:
                raise ValueError("Encountered suffix variant '%s', but marker "
                                 "'%s' has no suffix!" % (variant, marker))
            variants[1].append(variant)
        else:
            raise ValueError("Unrecognised variant '%s'" % variant)
    if variants[0]:
        prefix = mutate_sequence(prefix, variants[0])
    if variants[1]:
        suffix = mutate_sequence(suffix, variants[1])

    blocks = []
    if prefix:
        blocks.append((prefix, 1))
    for block in PAT_ALLELENAME_BLOCK.findall(allele[1]):
        blocks.append((block[0], int(block[1])))
    if suffix:
        blocks.append((suffix, 1))
jhoogenboom's avatar
jhoogenboom committed
1172
    return "".join("%s(%i)" % block for block in blocks)
jhoogenboom's avatar
jhoogenboom committed
1173
1174
1175
1176
1177
1178
#convert_sequence_allelename_tssv


def convert_sequence_raw_allelename(seq, library, marker):
    # We actually convert raw->allelename via TSSV format.
    seq, alias = convert_sequence_raw_tssv(seq, library, marker, True)
1179
1180
    blocks = PAT_TSSV_BLOCK.findall(seq)

1181
    if "nostr_reference" in library and marker in library["nostr_reference"]:
1182
1183
1184
        # Handle non-STR markers here.
        if alias != marker:
            return library["aliases"][alias]["name"]
1185
1186
1187
        if not blocks:
            # Oh dear, empty sequence... Primer dimer?
            blocks = (("",),)
1188
1189
1190
1191
        if library["nostr_reference"][marker] == blocks[0][0]:
            return "REF"
        return " ".join(
            call_variants(library["nostr_reference"][marker], blocks[0][0],
1192
                library["genome_position"].get(marker, ("?", 1))))
1193

1194
1195
1196
    # Find prefix and suffix.
    prefix = suffix = this_prefix = this_suffix = ""
    remaining_blocks = len(blocks)
1197
1198
1199
1200
1201
    if "prefix" in library:
        if alias in library["prefix"]:
            prefix = library["prefix"][alias][0]
        elif marker in library["prefix"]:
            prefix = library["prefix"][marker][0]
jhoogenboom's avatar
jhoogenboom committed
1202
        if prefix and remaining_blocks > 0 and blocks[0][1] == "1":
1203
            remaining_blocks -= 1
1204
1205
1206
1207
1208
    if "suffix" in library:
        if alias in library["suffix"]:
            suffix = library["suffix"][alias][0]
        elif marker in library["suffix"]:
            suffix = library["suffix"][marker][0]
jhoogenboom's avatar
jhoogenboom committed
1209
        if suffix and remaining_blocks > 0 and blocks[-1][1] == "1":
1210
            remaining_blocks -= 1
1211
1212
1213
1214
1215
1216
    if remaining_blocks > 0 and prefix and blocks[0][1] == "1":
        this_prefix = blocks[0][0]
        blocks = blocks[1:]
    if remaining_blocks > 0 and suffix and blocks[-1][1] == "1":
        this_suffix = blocks[-1][0]
        blocks = blocks[:-1]
jhoogenboom's avatar
jhoogenboom committed
1217
1218
1219
1220

    # Generate prefix/suffix variants.
    length = 0
    variants = []
1221
    if prefix != this_prefix:
1222
        variants += call_variants(prefix, this_prefix, "prefix")
1223
1224
        length += len(this_prefix) - len(prefix)
    if suffix != this_suffix:
1225
        variants += call_variants(suffix, this_suffix, "suffix")
1226
        length += len(this_suffix) - len(suffix)
jhoogenboom's avatar
jhoogenboom committed
1227
1228
1229

    # We are ready to return the allele name of aliases.
    if alias != marker:
1230
        return "_".join([library["aliases"][alias]["name"]] + variants)
jhoogenboom's avatar
jhoogenboom committed
1231
1232
1233

    # Compute CE allele number for the other alleles.
    # TODO: perhaps produce a more intelligent name if there is exactly
1234
1235
    #       1 alias with the same length, or only 1 alias sequence is
    #       contained somewhere within the allele.
jhoogenboom's avatar
jhoogenboom committed
1236
    blocknames = []
1237
1238
    blocksize = library.get("block_length", {}).get(marker, 4)
    length -= library.get("length_adjust", {}).get(marker, 0)
jhoogenboom's avatar
jhoogenboom committed
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
    for block in blocks:
        blocknames.append("%s[%s]" % (block[0], block[1]))
        length += len(block[0]) * int(block[1])

    allelename = "CE" + str(length / blocksize)
    if length % blocksize:
        allelename += "." + str(length % blocksize)
    return "_".join([allelename, "".join(blocknames)] + variants)
#convert_sequence_raw_allelename


def ensure_sequence_format(seq, to_format, from_format=None, library=None,
1251
                           marker=None):
jhoogenboom's avatar
jhoogenboom committed
1252
    """Convert seq to 'raw', 'tssv', or 'allelename' format."""
1253
    if seq in SEQ_SPECIAL_VALUES:
1254
        # Special case.