Commit 1083919c authored by Hoogenboom, Jerry

Implemented support for non-STR markers, improved file handling and more

Additions and improvements to the FDSTools library file format:
* New [genome_position] section in FDSTools-style library files allows
for specifying the chromosome and position of each marker.
* New [no_repeat] section in FDSTools-style library files allows for
including non-STR markers (both new sections are illustrated in the
sketch that follows this list).
* Comma/semicolon/space-separated values in FDSTools-style library files
can now also be separated by tab characters, and multiple consecutive
separators are no longer collapsed (except for runs of whitespace).
* If no prefix and/or suffix has been specified for an alias, the
prefix/suffix of the marker itself is used.
* Implemented support for non-STR markers (e.g. SNP clusters) and mtDNA
markers. Allele names of the latter follow mtDNA nomenclature.
* Improved the logic of generating STR allele names for sequences that
have a prefix or suffix sequence that was not included in the library
file.
* Updated and clarified various explanatory texts in generated FDSTools
library files.
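
For illustration, a minimal sketch of what the two new sections might
look like in an FDSTools-style library file (the marker names,
chromosome, positions and reference sequences below are made-up
examples, not taken from this commit):

    [genome_position]
    ; chromosome name/number and position of the first base of the marker
    MySNPMarker = chr19 12345678
    MyMtMarker = M 16024

    [no_repeat]
    ; reference sequence of each non-STR marker
    MySNPMarker = ACGTAACGTTACGTACCGTA
    MyMtMarker = TTAACCGGTTAACCGGTTAA

Judging by the new allele name patterns in the diff below, alleles of
such markers are named 'REF' or as a space-separated list of variants
(e.g. '13C>T 31G>A'), with mtDNA-style names such as 'A16051G' or
'16042.1C' for markers on the mitochondrial genome.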

Fixed:
* Fixed a bug that caused prefix/suffix variants in aliases to go
missing in allele names.

Improved file handling:
* Library files are now closed immediately after parsing them (see the
calling sketch after this list).
* Sample data input files are opened one at a time now.
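
For illustration, a minimal sketch of how the reworked parse_library
function might now be called (based on the new signature in the diff
below; the import path and file name are assumptions, not part of this
commit):

    from fdstools.lib import parse_library  # assumed import path

    # Pass a file name (or "-" for standard input): parse_library opens
    # the file itself and closes it immediately after parsing.
    library = parse_library("mylibrary.txt")

    # Pass an already-open stream instead: with stream=True the caller
    # keeps ownership of the handle and closes it itself.
    with open("mylibrary.txt") as handle:
        library = parse_library(handle, stream=True)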

Visualisations:
* Updated Vega to version 2.3.1.
* Worked around a bug in Google Chrome that caused the 'Save image' link
to stop working after having been used once.
parent d96335b0
*.pyc
dist/*
*.egg-info/*
+/.project
""" """
Tools for characterisation and filtering of PCR stutter artefacts and other Tools for characterisation and filtering of PCR stutter artefacts and other
systemic noise in Next Generation Sequencing data of forensic STR markers. systemic noise in Next Generation Sequencing data of forensic DNA markers.
""" """
__version_info__ = ('0', '0', '2') __version_info__ = ('0', '0', '2')
......
@@ -62,7 +62,7 @@ def main():
        default=argparse.SUPPRESS, nargs=argparse.REMAINDER,
        help="show version number and exit")
    parser.add_argument('-d', "--debug", action="store_true",
-        help="if specified, debug output is printed to stdout")
+        help="if specified, additional debug output is given")
    subparsers = parser.add_subparsers(title='available tools', dest='tool',
        metavar='TOOL', help="specify which "
        "tool to run")
@@ -82,7 +82,7 @@ def main():
            version=version(parser.prog, name, module.__version__))
        __tools__[name] = subparser
        subparser.add_argument('-d', "--debug", action="store_true",
-            help="if specified, debug output is printed to stdout")
+            help="if specified, additional debug output is given")
        module.add_arguments(subparser)
        subparser.set_defaults(func=module.run)
    try:
...
@@ -9,29 +9,47 @@ from StringIO import StringIO

# Patterns that match entire sequences.
PAT_SEQ_RAW = re.compile("^[ACGT]*$")
PAT_SEQ_TSSV = re.compile("^(?:[ACGT]+\(\d+\))*$")
-PAT_SEQ_ALLELENAME = re.compile(  # First line: n_ACT[m] or alias.
+PAT_SEQ_ALLELENAME_STR = re.compile(  # First line: n_ACT[m] or alias.
    "^(?:(?:(?:CE)?-?\d+(?:\.\d+)?_(?:[ACGT]+\[\d+\])*)|((?!_).+?))"
    "(?:_[-+]\d+(?:\.1)?(?P<a>(?:(?<=\.1)-)|(?<!\.1)[ACGT]+)>"  # _+3A>
    "(?!(?P=a))(?:[ACTG]+|-))*$")  # Portion of variants after '>'.
+PAT_SEQ_ALLELENAME_SNP = re.compile(
+    "^REF$|^(?:(?:(?<=^)|(?<!^) )"  # 'REF' or space-separated variants.
+    "\d+(?:\.1)?(?P<a>(?:(?<=\.1)-)|(?<!\.1)[ACGT]+)>"
+    "(?!(?P=a))(?:[ACTG]+|-))+$")  # Portion of variants after '>'.
+PAT_SEQ_ALLELENAME_MT = re.compile(
+    "^REF$|^(?:(?:(?<=^)|(?<!^) )"  # 'REF' or space-separated variants.
+    "(?:(?P<a>[ACGT])|-?)\d+(?(a)|(?:\.\d+)?)(?:[ACGT-]|del))+$")

# Patterns that match blocks of TSSV-style sequences and allele names.
PAT_TSSV_BLOCK = re.compile("([ACGT]+)\((\d+)\)")
PAT_ALLELENAME_BLOCK = re.compile("([ACGT]+)\[(\d+)\]")
PAT_ALIAS = re.compile("^(?!_).+$")

-# Pattern that matches a single prefix/suffix variant.
-PAT_VARIANT = re.compile(
-    "^([-+]\d+)(?:\.1)?"  # Position number
-    "(?P<a>(?:(?<=\.1)-)|(?<!\.1)[ACGT]+)>"  # From part
-    "(?!(?P=a))([ACTG]+|-)$")  # To part
+# Patterns that match a single variant.
+PAT_VARIANT_STR = re.compile(
+    "^(?P<pos>[-+]\d+)(?:\.(?P<ins>1))?"
+    "(?P<old>(?:(?<=\.1)-)|(?<!\.1)[ACGT]+)>"
+    "(?!(?P=old))(?P<new>[ACTG]+|-)$")
+PAT_VARIANT_SNP = re.compile(
+    "^(?P<pos>\d+)(?:\.(?P<ins>1))?"
+    "(?P<old>(?:(?<=\.1)-)|(?<!\.1)[ACGT]+)>"
+    "(?!(?P=old))(?P<new>[ACTG]+|-)$")
+PAT_VARIANT_MT = re.compile(
+    "^(?P<old>(?P<a>[ACGT])|-?)"
+    "(?P<pos>\d+)(?(a)|(?:\.(?P<ins>\d+))?)"
+    "(?P<new>[ACGT-]|del)$")

# Patterns that match (parts of) an STR definition.
-PAT_STR_DEF = re.compile(
-    "^(?:[ACGT]+\s+\d+\s+\d+(?:\s+[ACGT]+\s+\d+\s+\d+)*)?$")
+PAT_STR_DEF = re.compile("^(?:(?:(?<=^)|(?<!^)\s+)[ACGT]+\s+\d+\s+\d+)*$")
PAT_STR_DEF_BLOCK = re.compile("([ACGT]+)\s+(\d+)\s+(\d+)")

# Pattern to split a comma-, semicolon-, or space-separated list.
-PAT_SPLIT = re.compile("[,; ]+")
+PAT_SPLIT = re.compile("\s*[,; \t]\s*")

+# Pattern that matches a chromosome name/number.
+PAT_CHROMOSOME = re.compile(
+    "^(?:[Cc][Hh][Rr](?:[Oo][Mm])?)?([1-9XYM]|1\d|2[0-2])$")

# Default regular expression to capture sample tags in file names.
# This is the default of the -e command line option.
@@ -53,14 +71,26 @@ COMPL = {"A": "T", "T": "A", "U": "A", "G": "C", "C": "G", "R": "Y", "Y": "R",
         "k": "m", "m": "k", "b": "v", "v": "b", "d": "h", "h": "d"}

-def call_variants(template, sequence, reverse_indices=False, cache=True,
+def call_variants(template, sequence, location="suffix", cache=True,
                   debug=False):
    """
    Perform a global alignment of sequence to template and return a
-    list of variants detected. All variants are given as substitutions
-    in the form posX>Y, where the first base in the template is pos=1.
-    Set reverse_indices to True to count from right to left instead.
-    Insertions and deletions are pos.1->Y and posX>-, respectively.
+    list of variants detected. The format (nomenclature) of the
+    returned variants depends on the location argument.
+
+    If location is "suffix" (the default), all variants are given as
+    substitutions in the form posX>Y, where the first base in the
+    template is pos=1. With location set to "prefix", bases are counted
+    from right to left instead. Insertions and deletions are written as
+    pos.1->Y and posX>-, respectively.
+
+    If location is a tuple ("M", position) with any integer for the
+    position, variants are written following the mtDNA nomenclature
+    guidelines. The given position is that of the first base in the
+    template.
+
+    If location is a tuple ("chromosome name", position), a
+    NotImplementedError is raised.

    By default, the results of this function are cached. Set cache to
    False to suppress caching the result and reduce memory usage.
@@ -70,9 +100,9 @@ def call_variants(template, sequence, reverse_indices=False, cache=True,
    """
    # Saving the results in a cache to avoid repeating alignments.
    try:
-        return call_variants.cache[template, sequence, reverse_indices]
+        return call_variants.cache[template, sequence, location]
    except KeyError:
-        pass
+        cache_key = location

    row_offset = len(template) + 1
    matrix_match = [0] * row_offset * (len(sequence)+1)
@@ -83,6 +113,20 @@ def call_variants(template, sequence, reverse_indices=False, cache=True,
    MISMATCH_SCORE = -1
    GAP_OPEN_SCORE = -10
    GAP_EXTEND_SCORE = -1
+    variant_format = "%i%s>%s"
+    if location == "prefix":
+        location = ("prefix", -len(template))
+    elif location == "suffix":
+        # Include plus signs for position numbers.
+        variant_format = "%+i%s>%s"
+        location = ("suffix", 1)
+    elif type(location) != tuple or len(location) != 2:
+        raise ValueError("Unknown location %r. It should be 'prefix', "
+            "'suffix', or a two-tuple (chromosome, position)" % location)
+    elif location[0] == "M":
+        # No need to avoid gaps in mtDNA notation.
+        GAP_OPEN_SCORE = -1

    for i in range(len(matrix_match)):
        x = i % row_offset
@@ -155,14 +199,25 @@ def call_variants(template, sequence, reverse_indices=False, cache=True,
        if i == 0 or template[x - 1] == sequence[y - 1]:
            # Match. Flush variants.
            if variant_template or variant_sequence:
-                if variant_template == 0:
+                if location[0] == "M":
+                    # MtDNA variants are one-base-at-a-time.
+                    for j in range(
+                            max(variant_template, variant_sequence)-1, -1, -1):
+                        variants.append("%s%i%s%s" % (
+                            template[x+j] if j < variant_template else "",#"-",
+                            x + min(j, variant_template-1) + location[1],
+                            ".%i" % (j-variant_template+1)
+                                if j >= variant_template else "",
+                            sequence[x+j] if j < variant_sequence else "del"))
+                elif variant_template == 0:
                    # Insertions: "-131.1->C" instead of "-130->C".
-                    variants.append("%+i.1->%s" % (
-                        x - int(reverse_indices) * row_offset,
+                    variants.append(variant_format % (
+                        x - 1 + location[1],
+                        ".1-",
                        sequence[y:y+variant_sequence]))
                else:
-                    variants.append("%+i%s>%s" % (
-                        (x + 1) - int(reverse_indices) * row_offset,
+                    variants.append(variant_format % (
+                        x + location[1],
                        template[x:x+variant_template],
                        sequence[y:y+variant_sequence] or "-"))
                variant_template = 0
@@ -173,65 +228,93 @@ def call_variants(template, sequence, reverse_indices=False, cache=True,
            variant_sequence += 1
            i -= 1 + row_offset

-    # If reverse_indices=False, we need to reverse the output instead.
-    if not reverse_indices:
+    # Variants were called from right to left. Reverse their order.
+    if location[0] != "prefix":
        variants.reverse()

    # Store the result in the cache.
    if cache:
-        call_variants.cache[template, sequence, reverse_indices] = variants
+        call_variants.cache[template, sequence, cache_key] = variants
    return variants
#call_variants
call_variants.cache = {}

-def mutate_sequence(sequence, variants):
+def mutate_sequence(seq, variants, location=None):
    """Apply the given variants to the given sequence."""
-    if not sequence and len(variants) > 1:
-        raise ValueError("With an empty sequence, only a single variant is "
-                         "possible: an insertion '+0.1->x' or '-1.1->x'.")
-    sequence = list(sequence)
+    if type(location) != tuple or len(location) != 2:
+        pattern = PAT_VARIANT_STR
+        offset = 1
+    elif location[0] == "M":
+        pattern = PAT_VARIANT_MT
+        offset = location[1]
+    else:
+        pattern = PAT_VARIANT_SNP
+        offset = location[1]
+    seq = [[]] + [[base] for base in seq]
    for variant in variants:
-        vm = PAT_VARIANT.match(variant)
+        vm = pattern.match(variant)
        if vm is None:
            raise ValueError("Unrecognised variant '%s'" % variant)
-        pos = int(vm.group(1))
-        old = vm.group(2)
-        new = vm.group(3)
+        pos = int(vm.group("pos"))
+        ins = int(vm.group("ins") or 0)
+        old = vm.group("old")
+        new = vm.group("new")
        if old == "-":
            old = ""
-        if new == "-":
+        if new == "-" or new == "del":
            new = ""
        if pos < 0:
-            pos += len(sequence) + 1
-        if pos == 0 and len(sequence) == 0:
-            # Insertion into empty sequence.
-            return new
-        if old != "".join(sequence[pos-1:pos+len(old)-1]):
-            raise ValueError("Incorrect original sequence in variant '%s'; "
-                             "should be '%s'!" % (variant,
-                             "".join(sequence[pos-1:pos+len(old)-1]) or "-"))
-        sequence[pos-1:pos+len(old)-1] = [""] * len(old)
-        if pos:
-            sequence[pos-1] += new
+            pos += len(seq) + 1
+        pos -= offset - 1
+        if pos < 0 or (pos == 0 and not ins) or pos > len(seq):
+            raise ValueError(
+                "Position of variant '%s' is outside sequence range (%i-%i)" %
+                (variant, offset, offset+len(seq)-2))
+        if (not ins and old and old != "".join("".join(x[:1])
+                for x in seq[pos:pos+len(old)])):
+            raise ValueError(
+                "Incorrect original sequence in variant '%s'; should be '%s'!"
+                % (variant, "".join("".join(x[:1])
+                    for x in seq[pos:pos+len(old)])))
+        elif not ins and not old:
+            # MtDNA substitution with reference base omitted.
+            old = "".join("".join(x[:1]) for x in seq[pos:pos+len(new)])
+        if not ins:
+            # Remove old bases, retaining those inserted between/after.
+            seq[pos:pos+len(old)] = [
+                [""] + x[1:] for x in seq[pos:pos+len(old)]]
+            # Place new entirely in the position of the first old base.
+            seq[pos][0] = new
        else:
-            # Insertion at the beginning of the sequence
-            sequence[0] = new + sequence[0]
-    return "".join(sequence)
+            # Insert new exactly ins positions after pos.
+            while len(seq[pos]) <= ins:
+                seq[pos].append("")
+            seq[pos][ins] = new
+    return "".join("".join(x) for x in seq)
#mutate_sequence

-def parse_library(handle):
-    if handle == sys.stdin:
+def parse_library(libfile, stream=False):
+    if not stream:
+        libfile = sys.stdin if libfile == "-" else open(libfile, "r")
+    if libfile == sys.stdin:
        # Can't seek on pipes, so read it into a buffer first.
-        handle = StringIO(sys.stdin.read())
+        libfile = StringIO(sys.stdin.read())
    try:
-        return parse_library_ini(handle)
+        library = parse_library_ini(libfile)
+        if not stream:
+            libfile.close()
+        return library
    except MissingSectionHeaderError:
        # Not an ini file.
        pass
-    handle.seek(0)
-    return parse_library_tsv(handle)
+    libfile.seek(0)
+    library = parse_library_tsv(libfile)
+    if not stream and libfile != sys.stdin:
+        libfile.close()
+    return library
#parse_library
@@ -283,6 +366,8 @@ def parse_library_ini(handle):
        "suffix": {},
        "regex": {},
        "regex_middle": {},
+        "nostr_reference": {},
+        "genome_position": {},
        "length_adjust": {},
        "block_length": {},
        "max_expected_copies": {},
@@ -311,6 +396,9 @@ def parse_library_ini(handle):
                library["flanks"][marker] = values
                markers.add(marker)
            elif section_low == "prefix":
+                if marker in library["nostr_reference"]:
+                    raise ValueError(
+                        "A prefix was defined for non-STR marker %s" % marker)
                values = PAT_SPLIT.split(value)
                for value in values:
                    if PAT_SEQ_RAW.match(value) is None:
@@ -320,6 +408,9 @@ def parse_library_ini(handle):
                library["prefix"][marker] = values
                markers.add(marker)
            elif section_low == "suffix":
+                if marker in library["nostr_reference"]:
+                    raise ValueError(
+                        "A suffix was defined for non-STR marker %s" % marker)
                values = PAT_SPLIT.split(value)
                for value in values:
                    if PAT_SEQ_RAW.match(value) is None:
@@ -328,6 +419,27 @@ def parse_library_ini(handle):
                            (value, marker))
                library["suffix"][marker] = values
                markers.add(marker)
+            elif section_low == "genome_position":
+                values = PAT_SPLIT.split(value)
+                if len(values) != 2:
+                    raise ValueError(
+                        "Invalid genome position '%s' for marker %s. Expected "
+                        "a chromosome name and a position number." %
+                        (value, marker))
+                chromosome = PAT_CHROMOSOME.match(values[0])
+                if chromosome is None:
+                    raise ValueError(
+                        "Invalid chromosome '%s' for marker %s." %
+                        (values[0], marker))
+                try:
+                    position = int(values[1])
+                except:
+                    raise ValueError(
+                        "Position '%s' of marker %s is not a valid integer" %
+                        (values[1], marker))
+                library["genome_position"][marker] = (chromosome.group(1),
+                                                      position)
+                markers.add(marker)
            elif section_low == "length_adjust":
                try:
                    value = int(value)
@@ -375,12 +487,39 @@ def parse_library_ini(handle):
                }
                markers.add(marker)
            elif section_low == "repeat":
+                if marker in library["nostr_reference"]:
+                    raise ValueError(
+                        "Marker %s was encountered in both [repeat] and "
+                        "[no_repeat] sections" % marker)
                if PAT_STR_DEF.match(value) is None:
                    raise ValueError(
                        "STR definition '%s' of marker %s is invalid" %
                        (value, marker))
                library["regex"][marker] = value
                markers.add(marker)
+            elif section_low == "no_repeat":
+                if marker in library["regex"]:
+                    raise ValueError(
+                        "Marker %s was encountered in both [repeat] and "
+                        "[no_repeat] sections" % marker)
+                if marker in library["prefix"] or marker in library["suffix"]:
+                    raise ValueError(
+                        "A prefix or suffix was defined for non-STR marker %s"
+                        % marker)
+                if PAT_SEQ_RAW.match(value) is None:
+                    raise ValueError(
+                        "Reference sequence '%s' of marker %s is invalid" %
+                        (value, marker))
+                library["nostr_reference"][marker] = value
+                markers.add(marker)
+
+    # Sanity check: prohibit prefix/suffix for aliases of non-STRs.
+    for alias in library["aliases"]:
+        if library["aliases"][alias]["marker"] in library["nostr_reference"] \
+                and (alias in library["prefix"] or alias in library["suffix"]):
+            raise ValueError(
+                "A prefix or suffix was defined for alias %s of non-STR "
+                "marker %s" % (alias, library["aliases"][alias]["marker"]))

    # Compile regular expressions.
    # NOTE: The libconvert tool expects "(seq){num,num}" blocks ONLY!
@@ -577,7 +716,8 @@ def detect_sequence_format(seq):
        return 'raw'
    if PAT_SEQ_TSSV.match(seq):
        return 'tssv'
-    if PAT_SEQ_ALLELENAME.match(seq):
+    if PAT_SEQ_ALLELENAME_STR.match(seq) or PAT_SEQ_ALLELENAME_MT.match(seq) \
+            or PAT_SEQ_ALLELENAME_SNP.match(seq):
        return 'allelename'
    raise ValueError("Unrecognised sequence format")
#detect_sequence_format
@@ -629,15 +769,75 @@ def convert_sequence_raw_tssv(seq, library, marker, return_alias=False):
    if marker in library["regex_middle"]:
        match = regex_longest_match(library["regex_middle"][marker], seq)
        if match is not None and match.end()-match.start():
-            # TODO: If I did not have a prefix yet, but the
-            # canonical prefix ends with the first few blocks that
-            # we obtained in the match, move these blocks into the
-            # prefix. Also do this with the suffix.
-            middle = [(match.group(i), match.end(i)+len(pre_suf[0]))
-                      for i in range(1, match.lastindex+1) if match.group(i)]
-            pre_suf[0] += seq[:match.start()]
-            pre_suf[1] = seq[match.end():] + pre_suf[1]
-            seq = match.group()
+            # If this allele does not match the prefix of this
+            # marker, but the canonical prefix of the marker ends
+            # with the same sequence as the start of our match, we
+            # move that portion of the match into the prefix.
+            # Then, we do the same thing with the suffix.
+            matched = match.group()
+            start = match.start()
+            end = match.end()
+            modified = False
+            if (not pre_suf[0] and "prefix" in library
+                    and marker in library["prefix"]):
+                ref = library["prefix"][marker][0]
+                i = min(len(ref), len(matched))
+                while i > 0:
+                    if ref.endswith(matched[:i]):
+                        start += i
+                        matched = matched[i:]
+                        modified = True
+                        break
+                    i -= 1
+            if (not pre_suf[1] and "suffix" in library
+                    and marker in library["suffix"]):
+                ref = library["suffix"][marker][0]
+                i = min(len(ref), len(matched))
+                while i > 0:
+                    if ref.startswith(matched[-i:]):
+                        end -= i
+                        matched = matched[:-i]
+                        modified = True
+                        break
+                    i -= 1
+            if modified:
+                from_start = start-match.start()
+                from_end = match.end()-end
+                middle = reduce(
+                    lambda x, i: (
+                        x[0] + [match.group(i)] *
+                        ((match.end(i)-x[1])/len(match.group(i))),
+                        match.end(i)) if match.group(i) else x,
+                    range(1, match.lastindex+1), ([], match.start()))[0]
+                while from_start:
+                    if from_start < len(middle[0]):
+                        middle[0] = middle[0][from_start:]
+                        break
+                    else:
+                        from_start -= len(middle[0])
+                        middle = middle[1:]
+                while from_end:
+                    if from_end < len(middle[-1]):
+                        middle[-1] = middle[-1][:-from_end]
+                        break
+                    else:
+                        from_end -= len(middle[-1])
+                        middle = middle[:-1]
+                if middle:
+                    middle = reduce(
+                        lambda x, y: (x[:-1] if x[-1][0] == y else x) +
+                        [(y, x[-1][1]+len(y))], middle[1:],
+                        [(middle[0], start+len(middle[0]))])
+            else:
+                # No trickery with prefix or suffix was done.
+                middle = [(match.group(i), match.end(i)+len(pre_suf[0]))
+                          for i in range(1, match.lastindex+1) if match.group(i)]
+            pre_suf[0] += seq[:start]
+            pre_suf[1] = seq[end:] + pre_suf[1]
+            seq = matched

    # Now construct parts.
    parts = []

@@ -659,11 +859,13 @@ def convert_sequence_raw_tssv(seq, library, marker, return_alias=False):