lib.py 73.2 KB
Newer Older
jhoogenboom's avatar
jhoogenboom committed
1 2
#!/usr/bin/env python

3
#
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
4
# Copyright (C) 2019 Jerry Hoogenboom
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
#
# This file is part of FDSTools, data analysis tools for Next
# Generation Sequencing of forensic DNA markers.
#
# FDSTools is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# FDSTools is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with FDSTools.  If not, see <http://www.gnu.org/licenses/>.
#

23
import re, sys, argparse, random, itertools, textwrap, glob
jhoogenboom's avatar
jhoogenboom committed
24
#import numpy as np  # Imported only when calling nnls()
jhoogenboom's avatar
jhoogenboom committed
25 26

from ConfigParser import RawConfigParser, MissingSectionHeaderError
27
from StringIO import StringIO
28
from errno import EPIPE
jhoogenboom's avatar
jhoogenboom committed
29 30 31

# Patterns that match entire sequences.
PAT_SEQ_RAW = re.compile("^[ACGT]*$")
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
32
PAT_SEQ_IUPAC = re.compile("^[ACGTUWSMKRYBDHVNacgtuwsmkrybdhvn]*$")
jhoogenboom's avatar
jhoogenboom committed
33
PAT_SEQ_TSSV = re.compile("^(?:[ACGT]+\(\d+\))*$")
34
PAT_SEQ_ALLELENAME_STR = re.compile(  # First line: n_ACT[m] or alias.
jhoogenboom's avatar
jhoogenboom committed
35
    "^(?:(?:(?:CE)?-?\d+(?:\.\d+)?_(?:[ACGT]+\[\d+\])*)|((?!_).+?))"
jhoogenboom's avatar
jhoogenboom committed
36
    "(?:_[-+]\d+(?:\.1)?(?P<a>(?:(?<=\.1)-)|(?<!\.1)[ACGT]+)>"  # _+3A>
37
        "(?!(?P=a))(?:[ACGT]+|-))*$")  # Portion of variants after '>'.
38 39 40
PAT_SEQ_ALLELENAME_SNP = re.compile(
    "^REF$|^(?:(?:(?<=^)|(?<!^) )"  # 'REF' or space-separated variants.
    "\d+(?:\.1)?(?P<a>(?:(?<=\.1)-)|(?<!\.1)[ACGT]+)>"
41
        "(?!(?P=a))(?:[ACGT]+|-))+$")  # Portion of variants after '>'.
42 43
PAT_SEQ_ALLELENAME_MT = re.compile(
    "^REF$|^(?:(?:(?<=^)|(?<!^) )"  # 'REF' or space-separated variants.
44
    "(?:-?\d+\.\d+[ACGT]|(?P<a>[ACGT])?\d+(?(a)(?!(?P=a)))(?:[ACGT-]|del)))+$")
jhoogenboom's avatar
jhoogenboom committed
45

46
# Patterns that match blocks of TSSV-style sequences and allele names.
jhoogenboom's avatar
jhoogenboom committed
47 48
PAT_TSSV_BLOCK = re.compile("([ACGT]+)\((\d+)\)")
PAT_ALLELENAME_BLOCK = re.compile("([ACGT]+)\[(\d+)\]")
49
PAT_ALIAS = re.compile("^(?!_).+$")
jhoogenboom's avatar
jhoogenboom committed
50

51 52 53 54
# Patterns that match a single variant.
PAT_VARIANT_STR = re.compile(
    "^(?P<pos>[-+]\d+)(?:\.(?P<ins>1))?"
    "(?P<old>(?:(?<=\.1)-)|(?<!\.1)[ACGT]+)>"
55
    "(?!(?P=old))(?P<new>[ACGT]+|-)$")
56 57 58
PAT_VARIANT_SNP = re.compile(
    "^(?P<pos>\d+)(?:\.(?P<ins>1))?"
    "(?P<old>(?:(?<=\.1)-)|(?<!\.1)[ACGT]+)>"
59
    "(?!(?P=old))(?P<new>[ACGT]+|-)$")
60 61 62 63
PAT_VARIANT_MT = re.compile(
    "^(?P<old>(?P<a>[ACGT])|-?)"
    "(?P<pos>\d+)(?(a)|(?:\.(?P<ins>\d+))?)"
    "(?P<new>[ACGT-]|del)$")
jhoogenboom's avatar
jhoogenboom committed
64 65

# Patterns that match (parts of) an STR definition.
66
PAT_STR_DEF = re.compile("^(?:(?:(?<=^)|(?<!^)\s+)[ACGT]+\s+\d+\s+\d+)*$")
jhoogenboom's avatar
jhoogenboom committed
67 68 69
PAT_STR_DEF_BLOCK = re.compile("([ACGT]+)\s+(\d+)\s+(\d+)")

# Pattern to split a comma-, semicolon-, or space-separated list.
70
PAT_SPLIT = re.compile("\s*[,; \t]\s*")
71 72
PAT_SPLIT_QUOTED = re.compile(
    r""""((?:\\"|[^"])*)"|'((?:\\'|[^'])*)'|(\S+)""")
73 74 75 76

# Pattern that matches a chromosome name/number.
PAT_CHROMOSOME = re.compile(
    "^(?:[Cc][Hh][Rr](?:[Oo][Mm])?)?([1-9XYM]|1\d|2[0-2])$")
jhoogenboom's avatar
jhoogenboom committed
77

jhoogenboom's avatar
jhoogenboom committed
78 79
# Default regular expression to capture sample tags in file names.
# This is the default of the -e command line option.
jhoogenboom's avatar
jhoogenboom committed
80
DEF_TAG_EXPR = "^(.*?)(?:\.[^.]+)?$"
jhoogenboom's avatar
jhoogenboom committed
81 82 83 84 85

# Default formatting template to write sample tags.
# This is the default of the -f command line option.
DEF_TAG_FORMAT = "\\1"

86 87 88 89 90
# Default formatting template to construct output file names for batch
# processing.  \1 and \2 refer to sample tag and tool name.
# This is the default for the -o command line option with batch support.
DEF_OUTFILE_FORMAT = "\\1-\\2.out"

jhoogenboom's avatar
jhoogenboom committed
91 92 93 94 95 96
# IUPAC Table of complementary bases.
COMPL = {"A": "T", "T": "A", "U": "A", "G": "C", "C": "G", "R": "Y", "Y": "R",
         "K": "M", "M": "K", "B": "V", "V": "B", "D": "H", "H": "D",
         "a": "t", "t": "a", "u": "a", "g": "c", "c": "g", "r": "y", "y": "r",
         "k": "m", "m": "k", "b": "v", "v": "b", "d": "h", "h": "d"}

Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
97 98 99 100 101 102 103 104 105 106 107
# IUPAC Table of ambiguous bases.
IUPAC = {"A": "A",   "C": "C",   "G": "G",   "T": "T",  "U": "T",  "W": "AT",
         "S": "CG",  "M": "AC",  "K": "GT",  "R": "AG", "Y": "CT", "B": "CGT",
         "D": "AGT", "H": "ACT", "V": "ACG", "N": "ACGT",
         "a": ("A", ""), "c": ("C", ""), "g": ("G", ""), "t": ("T", ""),
         "u": ("T", ""), "w": ("A", "T", ""), "s": ("C", "G", ""),
         "m": ("A", "C", ""), "k": ("G", "T", ""), "r": ("A", "G", ""),
         "y": ("C", "T", ""), "b": ("C", "G", "T", ""),
         "d": ("A", "G", "T", ""), "h": ("A", "C", "T", ""),
         "v": ("A", "C", "G", ""), "n": ("A", "C", "G", "T", "")}

108 109 110
# Special values that may appear in the place of a sequence.
SEQ_SPECIAL_VALUES = ("No data", "Other sequences")

111 112 113 114
# TextWrapper object for formatting help texts in generated INI files.
INI_COMMENT = textwrap.TextWrapper(width=79, initial_indent="; ",
    subsequent_indent="; ", break_on_hyphens=False)

jhoogenboom's avatar
jhoogenboom committed
115

116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
def get_genome_pos(location, x, invert=False):
    """Get the genome position of the x-th base in a sequence."""
    if invert:
        offset = 0
        for i in range(1, len(location)):
            if i % 2:
                # Starting position.
                pos = location[i]
            elif pos <= x <= location[i]:
                # x is in the current range
                break
            else:
                offset += location[i]-pos+1
        else:
            if len(location) % 2:
                raise ValueError("Position %i is outside sequence range" % x)
        return offset + x - pos
    else:
        for i in range(1, len(location)):
            if i % 2:
                # Starting position.
                pos = location[i]
            elif location[i]-pos < x:
                # x is after this ending position
                x -= location[i]-pos+1
            else:
                # x is before this ending position
                break
        return pos + x
#get_genome_pos


148
def call_variants(template, sequence, location=("?", 1), cache=True,
jhoogenboom's avatar
jhoogenboom committed
149 150 151
                  debug=False):
    """
    Perform a global alignment of sequence to template and return a
152 153 154
    list of variants detected.  The format (nomenclature) of the
    returned variants depends on the location argument.

155 156 157 158
    If location is a tuple ("chromosome name", position), with any
    integer for the position, all variants are given as substitutions in
    the form posX>Y.  Insertions and deletions are written as pos.1->Y
    and posX>-, respectively.  The given position is that of the first
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
159
    base in the template.  With the location set to "suffix", a plus
160 161 162
    sign is prepended to position numbers and the first base in the
    template is pos=1.  With location set to "prefix", a minus sign is
    prepended and bases are counted from right to left instead.
163 164 165 166 167 168

    If location is a tuple ("M", position) with any integer for the
    position, variants are written following the mtDNA nomenclature
    guidelines.  The given position is that of the first base in the
    template.

jhoogenboom's avatar
jhoogenboom committed
169 170 171 172 173 174 175 176
    By default, the results of this function are cached.  Set cache to
    False to suppress caching the result and reduce memory usage.

    Setting debug to True will cause the alignment matrices to be
    printed to sys.stdout.  Be aware that they can be quite large.
    """
    # Saving the results in a cache to avoid repeating alignments.
    try:
177
        return call_variants.cache[template, sequence, location]
jhoogenboom's avatar
jhoogenboom committed
178
    except KeyError:
179
        cache_key = location
jhoogenboom's avatar
jhoogenboom committed
180 181 182 183 184

    row_offset = len(template) + 1
    matrix_match = [0] * row_offset * (len(sequence)+1)
    matrix_gap1 = [-sys.maxint-1] * row_offset * (len(sequence)+1)
    matrix_gap2 = [-sys.maxint-1] * row_offset * (len(sequence)+1)
185 186 187 188 189 190 191 192 193 194 195 196 197
    matrix_direction = [0] * row_offset * (len(sequence)+1)

    # Matrix and arrow enum constants.
    M_MATCH = 0
    M_GAP1 = 1
    M_GAP2 = 2
    A_MATCH = 1
    A_HORZ_O = 2
    A_HORZ_E = 4
    A_VERT_O = 8
    A_VERT_E = 16

    # Settings.
jhoogenboom's avatar
jhoogenboom committed
198
    MATCH_SCORE = 1
199 200 201
    MISMATCH_SCORE = -3
    GAP_OPEN_SCORE = -7
    GAP_EXTEND_SCORE = -2
202 203 204 205 206 207 208 209
    variant_format = "%i%s>%s"

    if location == "prefix":
        location = ("prefix", -len(template))
    elif location == "suffix":
        # Include plus signs for position numbers.
        variant_format = "%+i%s>%s"
        location = ("suffix", 1)
210
    elif type(location) != tuple or len(location) < 2:
211
        raise ValueError("Unknown location %r. It should be 'prefix', "
212 213
            "'suffix', or a tuple (chromosome, position [, endpos])" %
            location)
214 215 216 217 218
    elif location[0] == "M":
        MATCH_SCORE = 1
        MISMATCH_SCORE = -1
        GAP_OPEN_SCORE = -2
        GAP_EXTEND_SCORE = -1
jhoogenboom's avatar
jhoogenboom committed
219 220 221 222 223 224 225 226 227 228 229

    for i in range(len(matrix_match)):
        x = i % row_offset
        y = i / row_offset

        # Initialisation of first row and column.
        if x == 0 or y == 0:
            if x != 0:
                # Top row.
                matrix_gap1[i] = GAP_OPEN_SCORE + GAP_EXTEND_SCORE * (x - 1)
                matrix_match[i] = matrix_gap1[i]
230
                matrix_direction[i] = A_HORZ_E | (A_HORZ_O if x == 1 else 0)
jhoogenboom's avatar
jhoogenboom committed
231 232 233 234
            elif y != 0:
                # Left column.
                matrix_gap2[i] = GAP_OPEN_SCORE + GAP_EXTEND_SCORE * (y - 1)
                matrix_match[i] = matrix_gap2[i]
235 236 237 238
                matrix_direction[i] = A_VERT_E | (A_VERT_O if y == 1 else 0)
            else:
                # Top left corner.
                matrix_direction[i] = A_MATCH
jhoogenboom's avatar
jhoogenboom committed
239 240 241 242 243 244 245
            continue

        if template[x-1] == sequence[y-1]:
            match = MATCH_SCORE
        else:
            match = MISMATCH_SCORE

246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
        options_gap1 = (
            matrix_match[i-1] + GAP_OPEN_SCORE,
            matrix_gap1[i-1] + GAP_EXTEND_SCORE)
        matrix_gap1[i] = max(options_gap1)
        if options_gap1[0] > options_gap1[1]:
            matrix_direction[i] |= A_HORZ_O  # Must exit M_GAP1 here.

        options_gap2 = (
            matrix_match[i-row_offset] + GAP_OPEN_SCORE,
            matrix_gap2[i-row_offset] + GAP_EXTEND_SCORE)
        matrix_gap2[i] = max(options_gap2)
        if options_gap2[0] > options_gap2[1]:
            matrix_direction[i] |= A_VERT_O  # Must exit M_GAP2 here.

        options = (
jhoogenboom's avatar
jhoogenboom committed
261 262 263
            matrix_match[i-1-row_offset] + match,
            matrix_gap1[i],
            matrix_gap2[i])
264 265 266 267 268 269 270
        matrix_match[i] = max(options)
        if options[0] == matrix_match[i]:
            matrix_direction[i] |= A_MATCH  # Can stay in M_MATCH here.
        if options[1] == matrix_match[i]:
            matrix_direction[i] |= A_HORZ_E  # Can enter M_GAP1 here.
        if options[2] == matrix_match[i]:
            matrix_direction[i] |= A_VERT_E  # Can enter M_GAP2 here.
jhoogenboom's avatar
jhoogenboom committed
271 272 273 274 275 276 277 278 279 280 281

    if debug:
        print("GAP1")
        for i in range(0, len(matrix_gap1), row_offset):
            print(("%5i" * row_offset) % tuple(matrix_gap1[i:i+row_offset]))
        print("GAP2")
        for i in range(0, len(matrix_gap2), row_offset):
            print(("%5i" * row_offset) % tuple(matrix_gap2[i:i+row_offset]))
        print("Match")
        for i in range(0, len(matrix_match), row_offset):
            print(("%5i" * row_offset) % tuple(matrix_match[i:i+row_offset]))
282 283 284 285 286 287 288 289 290 291
        print("FLAGS")
        for i in range(0, len(matrix_direction), row_offset):
            print(("%5s|" * row_offset) % tuple("".join([
                "h" if x & A_HORZ_O else " ",
                "H" if x & A_HORZ_E else " ",
                "D" if x & A_MATCH else " ",
                "V" if x & A_VERT_E else " ",
                "v" if x & A_VERT_O else " "
            ]) for x in matrix_direction[i:i+row_offset]))
        print("Traceback")
jhoogenboom's avatar
jhoogenboom committed
292 293 294 295 296 297 298


    # Backtracking.
    variants = []
    variant_template = 0
    variant_sequence = 0
    i = len(matrix_match) - 1
299
    in_matrix = M_MATCH  # May change before first step.
jhoogenboom's avatar
jhoogenboom committed
300 301 302
    while i >= 0:
        x = i % row_offset
        y = i / row_offset
303 304 305 306 307 308 309 310 311 312 313
        if debug:
            print("(%i, %i)" % (x,y))

        if in_matrix == M_MATCH:
            # Make gaps as soon as possible (pushed right).
            if matrix_direction[i] & A_HORZ_E:
                in_matrix = M_GAP1
            elif matrix_direction[i] & A_VERT_E:
                in_matrix = M_GAP2
            elif not (matrix_direction[i] & A_MATCH):
                raise ValueError(
314 315
                    "Alignment error: Dead route! (This is a bug.) [%s,%s]" %
                    (template,sequence))
316 317

        if in_matrix == M_GAP1:
jhoogenboom's avatar
jhoogenboom committed
318 319
            # Go horizontally.  Deletion.
            variant_template += 1
320 321 322
            if matrix_direction[i] & A_HORZ_O:
                # End of gap, go diagonally after this.
                in_matrix = M_MATCH
jhoogenboom's avatar
jhoogenboom committed
323 324 325
            i -= 1
            continue

326
        if in_matrix == M_GAP2:
jhoogenboom's avatar
jhoogenboom committed
327 328
            # Go vertically.  Insertion.
            variant_sequence += 1
329 330 331
            if matrix_direction[i] & A_VERT_O:
                # End of gap, go diagonally after this.
                in_matrix = M_MATCH
jhoogenboom's avatar
jhoogenboom committed
332 333 334 335
            i -= row_offset
            continue

        # Go diagonally.  Either match or mismatch.
336 337 338 339 340 341
        if i != 0 and template[x - 1] != sequence[y - 1]:
            # Start/extend mismatch.
            variant_template += 1
            variant_sequence += 1

        else:
jhoogenboom's avatar
jhoogenboom committed
342 343
            # Match.  Flush variants.
            if variant_template or variant_sequence:
344 345 346 347 348 349
                if location[0] == "M":
                    # MtDNA variants are one-base-at-a-time.
                    for j in range(
                            max(variant_template, variant_sequence)-1, -1, -1):
                        variants.append("%s%i%s%s" % (
                            template[x+j] if j < variant_template else "",#"-",
350 351
                            get_genome_pos(
                                location, x + min(j, variant_template-1)),
352 353
                            ".%i" % (j-variant_template+1)
                                if j >= variant_template else "",
354
                            sequence[y+j] if j < variant_sequence else "del"))
355
                elif variant_template == 0:
jhoogenboom's avatar
jhoogenboom committed
356
                    # Insertions: "-131.1->C" instead of "-130->C".
357
                    variants.append(variant_format % (
358
                        get_genome_pos(location, x - 1),
359
                        ".1-",
jhoogenboom's avatar
jhoogenboom committed
360 361
                        sequence[y:y+variant_sequence]))
                else:
362
                    variants.append(variant_format % (
363
                        get_genome_pos(location, x),
jhoogenboom's avatar
jhoogenboom committed
364 365 366 367 368 369
                        template[x:x+variant_template],
                        sequence[y:y+variant_sequence] or "-"))
                variant_template = 0
                variant_sequence = 0
        i -= 1 + row_offset

370 371
    # Variants were called from right to left.  Reverse their order.
    if location[0] != "prefix":
jhoogenboom's avatar
jhoogenboom committed
372 373 374 375
        variants.reverse()

    # Store the result in the cache.
    if cache:
376
        call_variants.cache[template, sequence, cache_key] = variants
jhoogenboom's avatar
jhoogenboom committed
377 378 379 380 381
    return variants
#call_variants
call_variants.cache = {}


382
def mutate_sequence(seq, variants, location=None):
jhoogenboom's avatar
jhoogenboom committed
383
    """Apply the given variants to the given sequence."""
384
    if type(location) != tuple or len(location) < 2:
385
        pattern = PAT_VARIANT_STR
386
        location = (None, 0)
387 388
    elif location[0] == "M":
        pattern = PAT_VARIANT_MT
389
        location = (location[0], location[1]-1) + tuple(location[2:])
390 391
    else:
        pattern = PAT_VARIANT_SNP
392
        location = (location[0], location[1]-1) + tuple(location[2:])
393 394

    seq = [[]] + [[base] for base in seq]
jhoogenboom's avatar
jhoogenboom committed
395
    for variant in variants:
396
        vm = pattern.match(variant)
jhoogenboom's avatar
jhoogenboom committed
397 398
        if vm is None:
            raise ValueError("Unrecognised variant '%s'" % variant)
399 400 401 402
        pos = int(vm.group("pos"))
        ins = int(vm.group("ins") or 0)
        old = vm.group("old")
        new = vm.group("new")
jhoogenboom's avatar
jhoogenboom committed
403 404
        if old == "-":
            old = ""
405
        if new == "-" or new == "del":
jhoogenboom's avatar
jhoogenboom committed
406 407
            new = ""
        if pos < 0:
408
            pos += len(seq)
409 410
        pos = get_genome_pos(location, pos, True)
        if pos < 0 or (pos == 0 and not ins) or pos >= len(seq):
411
            raise ValueError(
412 413
                "Position of variant '%s' is outside sequence range" %
                    (variant))
414 415 416 417 418 419 420 421 422 423 424 425 426 427 428
        if (not ins and old and old != "".join("".join(x[:1])
                for x in seq[pos:pos+len(old)])):
            raise ValueError(
                "Incorrect original sequence in variant '%s'; should be '%s'!"
                % (variant, "".join("".join(x[:1])
                    for x in seq[pos:pos+len(old)])))
        elif not ins and not old:
            # MtDNA substitution with reference base omitted.
            old = "".join("".join(x[:1]) for x in seq[pos:pos+len(new)])
        if not ins:
            # Remove old bases, retaining those inserted between/after.
            seq[pos:pos+len(old)] = [
                [""] + x[1:] for x in seq[pos:pos+len(old)]]
            # Place new entirely in the position of the first old base.
            seq[pos][0] = new
jhoogenboom's avatar
jhoogenboom committed
429
        else:
430 431 432 433 434
            # Insert new exactly ins positions after pos.
            while len(seq[pos]) <= ins:
                seq[pos].append("")
            seq[pos][ins] = new
    return "".join("".join(x) for x in seq)
jhoogenboom's avatar
jhoogenboom committed
435 436 437
#mutate_sequence


Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
438 439 440 441 442 443 444 445 446 447 448 449 450 451 452
def iupac_expand_ambiguous(seq):
    """Return generator for all possible values of ambiguous seq."""
    return ("".join(x) for x in itertools.product(
        *((b for b in IUPAC[a]) for a in seq)))
#iupac_expand_ambiguous


def iupac_pattern_ambiguous(seq):
    """Return regex pattern that matches the ambiguous seq."""
    return "".join(
        (("[%s]" if len(IUPAC[x.upper()]) > 1 else "%s") +
        ("?" if x.islower() else "")) % IUPAC[x.upper()] for x in seq)
#iupac_expand_ambiguous


453
def parse_library(libfile, stream=False):
jhoogenboom's avatar
jhoogenboom committed
454
    try:
455
        if not stream:
456 457 458 459 460 461 462 463 464 465 466 467 468 469 470
            libfile = sys.stdin if libfile == "-" else open(libfile, "r")
        if libfile == sys.stdin:
            # Can't seek on pipes, so read it into a buffer first.
            libfile = StringIO(sys.stdin.read())
        try:
            library = parse_library_ini(libfile)
            if not stream:
                libfile.close()
            return library
        except MissingSectionHeaderError:
            # Not an ini file.
            pass
        libfile.seek(0)
        library = parse_library_tsv(libfile)
        if not stream and libfile != sys.stdin:
471 472
            libfile.close()
        return library
473 474
    except ValueError as err:
        raise argparse.ArgumentTypeError(err)
jhoogenboom's avatar
jhoogenboom committed
475 476 477 478 479 480 481 482 483 484 485 486 487 488
#parse_library


def parse_library_tsv(handle):
    """
    Parse a TSSV library file (tab-separated values format).

    The provided file should contain at least four columns: marker name,
    left flanking sequence, right flanking sequence, and STR definition.

    Return a nested dict with top-level keys "flanks" and "regex".
    """
    library = {
      "flanks": {},
489
      "regex": {},
490
      "blocks_middle": {}
jhoogenboom's avatar
jhoogenboom committed
491 492
    }
    for line in handle:
jhoogenboom's avatar
jhoogenboom committed
493
        line = [x.strip() for x in line.rstrip("\r\n").split("\t")]
jhoogenboom's avatar
jhoogenboom committed
494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510
        if line == [""]:
            continue
        if len(line) < 4:
            raise ValueError(
                "Invalid library file: encountered line with %i columns, "
                "need at least 4" % len(line))
        marker = line[0]
        if PAT_SEQ_RAW.match(line[1]) is None:
            raise ValueError("Flanking sequence '%s' of marker %s is invalid" %
                             (line[1], marker))
        if PAT_SEQ_RAW.match(line[2]) is None:
            raise ValueError("Flanking sequence '%s' of marker %s is invalid" %
                             (line[2], marker))
        if PAT_STR_DEF.match(line[3]) is None:
            raise ValueError("STR definition '%s' of marker %s is invalid" %
                             (line[3], marker))
        library["flanks"][marker] = line[1:3]
511 512 513
        library["blocks_middle"][marker] = [
            (block[0], int(block[1]), int(block[2])) for block in
                PAT_STR_DEF_BLOCK.findall(line[3])]
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
514
        # NOTE: Libconvert tool expects "(seq){num,num}" blocks ONLY!
515
        library["regex"][marker] = re.compile(
516 517 518
            "".join(("^", "".join(
                "(%s){%s,%s}" % x for x in PAT_STR_DEF_BLOCK.findall(line[3])),
                "$")))
jhoogenboom's avatar
jhoogenboom committed
519 520 521 522 523 524 525 526 527 528
    return library
#parse_library_tsv


def parse_library_ini(handle):
    library = {
      "flanks": {},
      "prefix": {},
      "suffix": {},
      "regex": {},
529
      "blocks_middle": {},
530 531
      "nostr_reference": {},
      "genome_position": {},
532 533
      "length_adjust": {},
      "block_length": {},
534
      "max_expected_copies": {},
535
      "expected_length": {},
jhoogenboom's avatar
jhoogenboom committed
536 537 538 539 540 541 542 543 544 545
      "aliases": {}
    }
    markers = set()

    ini = RawConfigParser()
    ini.optionxform = str
    ini.readfp(handle)
    for section in ini.sections():
        for marker in ini.options(section):
            value = ini.get(section, marker)
546 547
            section_low = section.lower()
            if section_low == "flanks":
jhoogenboom's avatar
jhoogenboom committed
548 549 550 551 552 553 554 555 556 557 558 559
                values = PAT_SPLIT.split(value)
                if len(values) != 2:
                    raise ValueError(
                        "For marker %s, %i flanking sequences were given,"
                        "need exactly 2" % (marker, len(values)))
                for value in values:
                    if PAT_SEQ_RAW.match(value) is None:
                        raise ValueError(
                            "Flanking sequence '%s' of marker %s is invalid" %
                            (value, marker))
                library["flanks"][marker] = values
                markers.add(marker)
560
            elif section_low == "prefix":
561 562 563
                if marker in library["nostr_reference"]:
                    raise ValueError(
                        "A prefix was defined for non-STR marker %s" % marker)
jhoogenboom's avatar
jhoogenboom committed
564
                values = PAT_SPLIT.split(value)
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
565 566 567
                for i in range(len(values)):
                    if (not i and PAT_SEQ_RAW.match(values[i]) is None or
                            i and PAT_SEQ_IUPAC.match(values[i]) is None):
jhoogenboom's avatar
jhoogenboom committed
568 569
                        raise ValueError(
                            "Prefix sequence '%s' of marker %s is invalid" %
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
570
                            (values[i], marker))
571
                library["prefix"][marker] = values
jhoogenboom's avatar
jhoogenboom committed
572
                markers.add(marker)
573
            elif section_low == "suffix":
574 575 576
                if marker in library["nostr_reference"]:
                    raise ValueError(
                        "A suffix was defined for non-STR marker %s" % marker)
jhoogenboom's avatar
jhoogenboom committed
577
                values = PAT_SPLIT.split(value)
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
578 579 580
                for i in range(len(values)):
                    if (not i and PAT_SEQ_RAW.match(values[i]) is None or
                            i and PAT_SEQ_IUPAC.match(values[i]) is None):
jhoogenboom's avatar
jhoogenboom committed
581 582
                        raise ValueError(
                            "Suffix sequence '%s' of marker %s is invalid" %
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
583
                            (values[i], marker))
584
                library["suffix"][marker] = values
jhoogenboom's avatar
jhoogenboom committed
585
                markers.add(marker)
586 587 588 589 590 591 592
            elif section_low == "genome_position":
                values = PAT_SPLIT.split(value)
                chromosome = PAT_CHROMOSOME.match(values[0])
                if chromosome is None:
                    raise ValueError(
                        "Invalid chromosome '%s' for marker %s." %
                        (values[0], marker))
593 594 595 596 597 598 599 600 601 602 603 604 605
                pos = [chromosome.group(1)]
                for i in range(1, len(values)):
                    try:
                        pos.append(int(values[i]))
                    except:
                        raise ValueError(
                            "Position '%s' of marker %s is not a valid integer"
                            % (values[i], marker))
                    if not i % 2 and pos[-2] >= pos[-1]:
                        raise ValueError(
                            "End position %i of marker %s must be higher than "
                            "corresponding start position %i" %
                            (pos[-1], marker, pos[-2]))
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
606 607
                if len(values) == 1:
                    pos.append(1)
608
                library["genome_position"][marker] = tuple(pos)
609
                markers.add(marker)
610
            elif section_low == "length_adjust":
jhoogenboom's avatar
jhoogenboom committed
611 612 613 614 615 616 617 618
                try:
                    value = int(value)
                except:
                    raise ValueError(
                        "Length adjustment '%s' of marker %s is not a valid "
                        "integer" % (value, marker))
                library["length_adjust"][marker] = value
                markers.add(marker)
619
            elif section_low == "block_length":
jhoogenboom's avatar
jhoogenboom committed
620 621 622 623 624 625 626 627
                try:
                    value = int(value)
                except:
                    raise ValueError(
                        "Block length '%s' of marker %s is not a valid integer"
                        % (value, marker))
                library["block_length"][marker] = value
                markers.add(marker)
628 629 630 631 632 633 634 635 636 637
            elif section_low == "max_expected_copies":
                try:
                    value = int(value)
                except:
                    raise ValueError(
                        "Maximum number of expected copies '%s' of marker %s "
                        "is not a valid integer" % (value, marker))
                library["max_expected_copies"][marker] = value
                markers.add(marker)
            elif section_low == "aliases":
jhoogenboom's avatar
jhoogenboom committed
638 639 640 641 642
                values = PAT_SPLIT.split(value)
                if len(values) != 3:
                    raise ValueError("Alias %s does not have 3 values, but %i"
                                     % (marker, len(values)))
                if PAT_SEQ_RAW.match(values[1]) is None:
643 644 645 646 647 648 649
                    raise ValueError(
                        "Alias sequence '%s' of alias %s is invalid" %
                        (values[1], marker))
                if PAT_ALIAS.match(values[2]) is None:
                    raise ValueError(
                        "Allele name '%s' of alias %s is invalid" %
                        (values[2], marker))
jhoogenboom's avatar
jhoogenboom committed
650 651 652 653 654 655
                library["aliases"][marker] = {
                    "marker": values[0],
                    "sequence": values[1],
                    "name": values[2]
                }
                markers.add(marker)
656
            elif section_low == "repeat":
657 658 659 660
                if marker in library["nostr_reference"]:
                    raise ValueError(
                        "Marker %s was encountered in both [repeat] and "
                        "[no_repeat] sections" % marker)
jhoogenboom's avatar
jhoogenboom committed
661 662 663 664 665 666
                if PAT_STR_DEF.match(value) is None:
                    raise ValueError(
                        "STR definition '%s' of marker %s is invalid" %
                        (value, marker))
                library["regex"][marker] = value
                markers.add(marker)
667 668 669 670 671 672 673 674 675 676 677 678 679 680 681
            elif section_low == "no_repeat":
                if marker in library["regex"]:
                    raise ValueError(
                        "Marker %s was encountered in both [repeat] and "
                        "[no_repeat] sections" % marker)
                if marker in library["prefix"] or marker in library["suffix"]:
                    raise ValueError(
                        "A prefix or suffix was defined for non-STR marker %s"
                        % marker)
                if PAT_SEQ_RAW.match(value) is None:
                    raise ValueError(
                        "Reference sequence '%s' of marker %s is invalid" %
                        (value, marker))
                library["nostr_reference"][marker] = value
                markers.add(marker)
682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700
            elif section_low == "expected_allele_length":
                values = PAT_SPLIT.split(value)
                try:
                    min_length = int(values[0])
                except:
                    raise ValueError(
                        "Minimum expected allele length '%s' of marker %s "
                        "is not a valid integer" % (values[0], marker))
                if len(values) > 1:
                    try:
                        max_length = int(values[1])
                    except:
                        raise ValueError(
                            "Maximum expected allele length '%s' of marker %s "
                            "is not a valid integer" % (values[1], marker))
                else:
                    max_length = sys.maxint
                library["expected_length"][marker] = (min_length, max_length)
                markers.add(marker)
701 702 703 704 705 706 707 708

    # Sanity check: prohibit prefix/suffix for aliases of non-STRs.
    for alias in library["aliases"]:
        if library["aliases"][alias]["marker"] in library["nostr_reference"] \
                and (alias in library["prefix"] or alias in library["suffix"]):
            raise ValueError(
                "A prefix or suffix was defined for alias %s of non-STR "
                "marker %s" % (alias, library["aliases"][alias]["marker"]))
jhoogenboom's avatar
jhoogenboom committed
709

710 711 712 713 714 715 716 717 718 719 720 721 722 723 724
    # Sanity check: end position of marker should reflect ref length.
    for marker in library["genome_position"]:
        if marker not in library["nostr_reference"]:
            continue
        pos = library["genome_position"][marker]
        reflength = len(library["nostr_reference"][marker])
        length = 0
        for i in range(2, len(pos), 2):
            length += pos[i] - pos[i-1] + 1
        if reflength < length or (len(pos) % 2 and reflength != length):
            raise ValueError(
                "Length of reference sequence of marker %s is %i bases, but "
                "genome positions add up to %i bases" %
                (marker, reflength, length))

jhoogenboom's avatar
jhoogenboom committed
725 726 727
    # Compile regular expressions.
    for marker in markers:
        parts = []
728
        blocksm = []
jhoogenboom's avatar
jhoogenboom committed
729
        if marker in library["prefix"]:
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
730 731 732 733 734 735 736
            parts += ("(%s){0,1}" % iupac_pattern_ambiguous(x)
                for x in library["prefix"][marker])
        elif (marker in library["aliases"] and
                library["aliases"][marker]["marker"] in library["prefix"]):
            parts += ("(%s){0,1}" % iupac_pattern_ambiguous(x)
                for x in library["prefix"][
                    library["aliases"][marker]["marker"]])
jhoogenboom's avatar
jhoogenboom committed
737
        if marker in library["aliases"]:
738
            blocksm.append((library["aliases"][marker]["sequence"], 0, 1))
jhoogenboom's avatar
jhoogenboom committed
739
        elif marker in library["regex"]:
740 741
            blocksm += [(block[0], int(block[1]), int(block[2])) for block in
                        PAT_STR_DEF_BLOCK.findall(library["regex"][marker])]
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
742
        parts += ("(%s){%s,%s}" % x for x in blocksm)
jhoogenboom's avatar
jhoogenboom committed
743
        if marker in library["suffix"]:
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
744 745 746 747 748 749 750
            parts += ("(%s){0,1}" % iupac_pattern_ambiguous(x)
                for x in library["suffix"][marker])
        elif (marker in library["aliases"] and
                library["aliases"][marker]["marker"] in library["suffix"]):
            parts += ("(%s){0,1}" % iupac_pattern_ambiguous(x)
                for x in library["suffix"][
                    library["aliases"][marker]["marker"]])
jhoogenboom's avatar
jhoogenboom committed
751 752 753
        if parts:
            library["regex"][marker] = re.compile(
                "".join(["^"] + parts + ["$"]))
754 755
        if blocksm:
            library["blocks_middle"][marker] = blocksm
jhoogenboom's avatar
jhoogenboom committed
756 757
    return library
#parse_library_ini
jhoogenboom's avatar
jhoogenboom committed
758 759


760
def load_profiles(profilefile, library=None):
761
    # TODO: To be replaced with load_profiles_new (less memory).
jhoogenboom's avatar
jhoogenboom committed
762
    column_names = profilefile.readline().rstrip("\r\n").split("\t")
763 764
    if column_names == [""]:
        return {}  # Empty file.
765 766 767
    (colid_marker, colid_allele, colid_sequence, colid_fmean, colid_rmean,
     colid_tool) = get_column_ids(column_names, "marker", "allele", "sequence",
        "fmean", "rmean", "tool")
jhoogenboom's avatar
jhoogenboom committed
768 769 770 771 772 773 774 775 776 777 778 779 780

    profiles = {}
    for line in profilefile:
        line = line.rstrip("\r\n").split("\t")
        if line == [""]:
            continue
        marker = line[colid_marker]
        if marker not in profiles:
            profiles[marker] = {
                "m": set(),  # To be replaced by its length below.
                "n": set(),  # To be replaced by its length below.
                "seqs": [],
                "forward": {},  # To be replaced by a list below.
781 782
                "reverse": {},  # To be replaced by a list below.
                "tool": {}  # To be replaced by a list below.
jhoogenboom's avatar
jhoogenboom committed
783 784 785 786 787 788 789 790 791 792 793 794
                }
        allele = ensure_sequence_format(line[colid_allele], "raw",
            library=library, marker=marker)
        sequence = ensure_sequence_format(line[colid_sequence], "raw",
            library=library, marker=marker)
        if (allele, sequence) in profiles[marker]["forward"]:
            raise ValueError(
                "Invalid background noise profiles file: encountered "
                "multiple values for marker '%s' allele '%s' sequence '%s'" %
                (marker, allele, sequence))
        profiles[marker]["forward"][allele,sequence] = float(line[colid_fmean])
        profiles[marker]["reverse"][allele,sequence] = float(line[colid_rmean])
795
        profiles[marker]["tool"][allele, sequence] = line[colid_tool]
jhoogenboom's avatar
jhoogenboom committed
796 797 798 799 800 801 802 803 804 805
        profiles[marker]["m"].update((allele, sequence))
        profiles[marker]["n"].add(allele)

    # Check completeness and reorder true alleles.
    for marker in profiles:
        profiles[marker]["seqs"] = list(profiles[marker]["n"]) + \
            list(profiles[marker]["m"]-profiles[marker]["n"])
        profiles[marker]["n"] = len(profiles[marker]["n"])
        profiles[marker]["m"] = len(profiles[marker]["m"])
        newprofiles = {"forward": [], "reverse": []}
806
        tools = []
jhoogenboom's avatar
jhoogenboom committed
807 808 809 810
        for i in range(profiles[marker]["n"]):
            allele = profiles[marker]["seqs"][i]
            for direction in newprofiles:
                newprofiles[direction].append([0] * profiles[marker]["m"])
811
            tools.append([""] * profiles[marker]["m"])
jhoogenboom's avatar
jhoogenboom committed
812 813 814 815 816 817
            for j in range(profiles[marker]["m"]):
                sequence = profiles[marker]["seqs"][j]
                if (allele, sequence) in profiles[marker]["forward"]:
                    for direction in newprofiles:
                        newprofiles[direction][i][j] = \
                            profiles[marker][direction][allele, sequence]
818
                    tools[i][j] = profiles[marker]["tool"][allele, sequence]
jhoogenboom's avatar
jhoogenboom committed
819 820
        profiles[marker]["forward"] = newprofiles["forward"]
        profiles[marker]["reverse"] = newprofiles["reverse"]
821
        profiles[marker]["tool"] = tools
jhoogenboom's avatar
jhoogenboom committed
822 823

    return profiles
824
#load_profiles
825 826


827 828 829
def load_profiles_new(profilefile, library=None):
    # TODO, rename this to load_profiles to complete transition.
    column_names = profilefile.readline().rstrip("\r\n").split("\t")
830 831
    if column_names == [""]:
        return {}  # Empty file.
832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863
    (colid_marker, colid_allele, colid_sequence, colid_fmean, colid_rmean,
     colid_tool) = get_column_ids(column_names, "marker", "allele", "sequence",
        "fmean", "rmean", "tool")

    profiles = {}
    for line in profilefile:
        line = line.rstrip("\r\n").split("\t")
        if line == [""]:
            continue
        marker = line[colid_marker]
        if marker not in profiles:
            profiles[marker] = {}
        allele = ensure_sequence_format(line[colid_allele], "raw",
            library=library, marker=marker)
        sequence = ensure_sequence_format(line[colid_sequence], "raw",
            library=library, marker=marker)
        if allele not in profiles[marker]:
            profiles[marker][allele] = {}
        elif sequence in profiles[marker][allele]:
            raise ValueError(
                "Invalid background noise profiles file: encountered "
                "multiple values for marker '%s' allele '%s' sequence '%s'" %
                (marker, allele, sequence))
        profiles[marker][allele][sequence] = {
            "forward": float(line[colid_fmean]),
            "reverse": float(line[colid_rmean]),
            "tool": line[colid_tool]}

    return profiles
#load_profiles_new


864
def pattern_longest_match(pattern, subject):
865
    """Return the longest match of the pattern in the subject string."""
866 867 868 869 870
    # FIXME, this function tries only one match at each position in the
    # sequence, which is not neccessarily the longest match at that
    # position. For now, we'll search the reverse sequence as well.
    # Re-implement this without regular expressions to test all options.
    reverse = False
871 872
    match = None
    pos = 0
873
    pat = re.compile("".join("(%s){%i,%i}" % x for x in pattern))
874
    while pos < len(subject):
875
        m = pat.search(subject, pos)
876 877 878 879 880
        if m is None:
            break
        if match is None or m.end()-m.start() > match.end()-match.start():
            match = m
        pos = m.start() + 1
881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929

    # Try to find a longer match from the other end.
    if match is not None:
        subject = reverse_complement(subject)
        pos = 0
        pat = re.compile("".join(
            "(%s){%i,%i}" % (reverse_complement(x[0]), x[1], x[2])
            for x in reversed(pattern)))
        while pos < len(subject):
            m = pat.search(subject, pos)
            if m is None:
                break
            if m.end()-m.start() > match.end()-match.start():
                match = m
                reverse = True
            pos = m.start() + 1

    # Extract the blocks from the match.
    match = [] if match is None or not match.group() else reduce(
        lambda x, i: (
            x[0] + [match.group(i)]*((match.end(i)-x[1])/len(match.group(i))),
            match.end(i)) if match.group(i) else x,
        range(1, match.lastindex+1), ([], match.start()))[0]

    # Return the match in the same sequence orientation as the input.
    return map(reverse_complement, reversed(match)) if reverse else match
#pattern_longest_match


def pattern_longest_match_veryslow(pattern, subject):
    """Return the longest match of the pattern in the subject string."""
    longest = 0
    the_match = []
    # Generate all possible matching sequences for this pattern.
    #print("Finding match of pattern %r to sequence %s" % (pattern, subject))
    for matching_blocks in itertools.product(*(
            [[block[0]]*i for i in range(block[1], block[2]+1)]
            for block in pattern)):
        matching = itertools.chain.from_iterable(matching_blocks)
        matching_seq = "".join(matching)
        matching_len = len(matching_seq)
        if matching_len <= longest:
            continue
        if matching_seq in subject:
            longest = matching_len
            the_match = matching
    #print("Found match covering %i/%i bases" % (longest, len(subject)))
    return the_match
#pattern_longest_match_veryslow
930 931


jhoogenboom's avatar
jhoogenboom committed
932 933 934
def detect_sequence_format(seq):
    """Return format of seq.  One of 'raw', 'tssv', or 'allelename'."""
    if not seq:
jhoogenboom's avatar
jhoogenboom committed
935
        raise ValueError("Empty sequence")
936
    if seq in SEQ_SPECIAL_VALUES:
937 938
        # Special case.
        return False
jhoogenboom's avatar
jhoogenboom committed
939
    if PAT_SEQ_RAW.match(seq):
940
        return "raw"
jhoogenboom's avatar
jhoogenboom committed
941
    if PAT_SEQ_TSSV.match(seq):
942
        return "tssv"
943 944
    if PAT_SEQ_ALLELENAME_STR.match(seq) or PAT_SEQ_ALLELENAME_MT.match(seq) \
            or PAT_SEQ_ALLELENAME_SNP.match(seq):
945
        return "allelename"
jhoogenboom's avatar
jhoogenboom committed
946
    raise ValueError("Unrecognised sequence format")
jhoogenboom's avatar
jhoogenboom committed
947 948 949
#detect_sequence_format


jhoogenboom's avatar
jhoogenboom committed
950
def convert_sequence_tssv_raw(seq):
jhoogenboom's avatar
jhoogenboom committed
951 952
    return "".join(block[0] * int(block[1])
                   for block in PAT_TSSV_BLOCK.findall(seq))
jhoogenboom's avatar
jhoogenboom committed
953 954 955 956 957 958 959 960
#convert_sequence_tssv_raw


def convert_sequence_raw_tssv(seq, library, marker, return_alias=False):
    # Try to match this marker's pattern, or any of its aliases.
    match = None
    if "aliases" in library:
        for alias in library["aliases"]:
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
961
            if library["aliases"][alias]["marker"] == marker:
jhoogenboom's avatar
jhoogenboom committed
962 963 964 965 966 967
                match = library["regex"][alias].match(seq)
                if match is not None:
                    marker = alias
                    break
    if match is None and marker in library["regex"]:
        match = library["regex"][marker].match(seq)
968
    if match is not None:
jhoogenboom's avatar
jhoogenboom committed
969 970
        parts = ((match.group(i), match.end(i)) for i in range(1, 1 if
            match.lastindex is None else match.lastindex+1) if match.group(i))
jhoogenboom's avatar
jhoogenboom committed
971

972
    # Use heuristics if the sequence does not match the pattern.
jhoogenboom's avatar
jhoogenboom committed
973
    else:
974 975 976 977

        # Find explictily provided prefix and/or suffix if present.
        pre_suf = ["", ""]
        if "prefix" in library and marker in library["prefix"]:
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
978 979
            for prefix in (x for x in library["prefix"][marker]
                    for x in iupac_expand_ambiguous(x)):
980 981 982 983 984
                if seq.startswith(prefix):
                    pre_suf[0] = prefix
                    seq = seq[len(prefix):]
                    break
        if "suffix" in library and marker in library["suffix"]:
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
985 986
            for suffix in (x for x in library["suffix"][marker]
                    for x in iupac_expand_ambiguous(x)):
987 988 989 990 991 992
                if seq.endswith(suffix):
                    pre_suf[1] = suffix
                    seq = seq[:-len(suffix)]
                    break

        # Find longest match of middle pattern.
993
        middle = [(seq, len(pre_suf[0])+len(seq))] if seq else []
994 995 996 997
        if middle and marker in library["blocks_middle"]:
            match = pattern_longest_match(library["blocks_middle"][marker],seq)
            matched = "".join(match)
            if matched:
998 999 1000 1001 1002 1003

                # If this allele does not match the prefix of this
                # marker, but the canonical prefix of the marker ends
                # with the same sequence as the start of our match, we
                # move that portion of the match into the prefix.
                # Then, we do the same thing with the suffix.
1004 1005 1006 1007 1008
                middle = match
                match_start = seq.index(matched)
                match_end = match_start + len(matched)
                start = match_start
                end = match_end
1009 1010 1011 1012
                modified = False
                if (not pre_suf[0] and "prefix" in library
                        and marker in library["prefix"]):
                    ref = library["prefix"][marker][0]
1013 1014 1015 1016 1017 1018 1019 1020 1021
                    if start != len(ref):
                        i = min(len(ref), len(matched))
                        while i > 0:
                            if ref.endswith(matched[:i]):
                                start += i
                                matched = matched[i:]
                                modified = True
                                break
                            i -= 1
1022 1023 1024
                if (not pre_suf[1] and "suffix" in library
                        and marker in library["suffix"]):
                    ref = library["suffix"][marker][0]
1025 1026 1027 1028 1029 1030 1031 1032 1033
                    if len(seq) - end != len(ref):
                        i = min(len(ref), len(matched))
                        while i > 0:
                            if ref.startswith(matched[-i:]):
                                end -= i
                                matched = matched[:-i]
                                modified = True
                                break
                            i -= 1
1034
                if modified:
1035 1036
                    from_start = start - match_start
                    from_end = match_end - end
1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050
                    while from_start:
                        if from_start < len(middle[0]):
                            middle[0] = middle[0][from_start:]
                            break
                        else:
                            from_start -= len(middle[0])
                            middle = middle[1:]
                    while from_end:
                        if from_end < len(middle[-1]):
                            middle[-1] = middle[-1][:-from_end]
                            break
                        else:
                            from_end -= len(middle[-1])
                            middle = middle[:-1]
1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062
                if start:
                    if pre_suf[0]:
                        middle.insert(0, seq[:start])
                        matched = seq[:start] + matched
                    else:
                        pre_suf[0] = seq[:start]
                if end < len(seq):
                    if pre_suf[1]:
                        middle.append(seq[end:])
                        matched = matched + seq[end:]
                    else:
                        pre_suf[1] = seq[end:]
1063 1064 1065 1066 1067
                if middle:
                    middle = reduce(
                        lambda x, y: (x[:-1] if x[-1][0] == y else x) +
                            [(y, x[-1][1]+len(y))], middle[1:],
                            [(middle[0],
1068
                              len(middle[0])+len(pre_suf[0]))])
1069
                seq = matched
1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084

        # Now construct parts.