Commit e1bd3e41 authored by Hoogenboom, Jerry's avatar Hoogenboom, Jerry
Browse files

Improved sequence aligment (variant calling) quality

FDSTools would sometimes produce suboptimal alignments. Most notably, it
it would produce multiple smaller insertions/deletions when the
difference between two sequences could be described by one larger
insertion/deletion in combination with a base substitution. The latter
description is often more biologically sound and also usually results in
a shorter allele name.

* Fixed a bug that sometimes caused FDSTools to choose an incorrect path
  through the alignment matrix, producing a suboptimal alignment.
* Tweaked the alignment parameters to produce more meaningful results.
parent 406f5bf0
......@@ -143,11 +143,23 @@ def call_variants(template, sequence, location="suffix", cache=True,
matrix_match = [0] * row_offset * (len(sequence)+1)
matrix_gap1 = [-sys.maxint-1] * row_offset * (len(sequence)+1)
matrix_gap2 = [-sys.maxint-1] * row_offset * (len(sequence)+1)
matrix_direction = [0] * row_offset * (len(sequence)+1)
# Matrix and arrow enum constants.
M_MATCH = 0
M_GAP1 = 1
M_GAP2 = 2
A_MATCH = 1
A_HORZ_O = 2
A_HORZ_E = 4
A_VERT_O = 8
A_VERT_E = 16
# Settings.
MATCH_SCORE = 1
MISMATCH_SCORE = -1
GAP_OPEN_SCORE = -10
GAP_EXTEND_SCORE = -1
MISMATCH_SCORE = -3
GAP_OPEN_SCORE = -7
GAP_EXTEND_SCORE = -2
variant_format = "%i%s>%s"
if location == "prefix":
......@@ -171,28 +183,47 @@ def call_variants(template, sequence, location="suffix", cache=True,
# Top row.
matrix_gap1[i] = GAP_OPEN_SCORE + GAP_EXTEND_SCORE * (x - 1)
matrix_match[i] = matrix_gap1[i]
matrix_direction[i] = A_HORZ_E | (A_HORZ_O if x == 1 else 0)
elif y != 0:
# Left column.
matrix_gap2[i] = GAP_OPEN_SCORE + GAP_EXTEND_SCORE * (y - 1)
matrix_match[i] = matrix_gap2[i]
matrix_direction[i] = A_VERT_E | (A_VERT_O if y == 1 else 0)
else:
# Top left corner.
matrix_direction[i] = A_MATCH
continue
matrix_gap1[i] = max(
matrix_match[i-1] + GAP_OPEN_SCORE,
matrix_gap1[i-1] + GAP_EXTEND_SCORE)
matrix_gap2[i] = max(
matrix_match[i-row_offset] + GAP_OPEN_SCORE,
matrix_gap2[i-row_offset] + GAP_EXTEND_SCORE)
if template[x-1] == sequence[y-1]:
match = MATCH_SCORE
else:
match = MISMATCH_SCORE
matrix_match[i] = max(
options_gap1 = (
matrix_match[i-1] + GAP_OPEN_SCORE,
matrix_gap1[i-1] + GAP_EXTEND_SCORE)
matrix_gap1[i] = max(options_gap1)
if options_gap1[0] > options_gap1[1]:
matrix_direction[i] |= A_HORZ_O # Must exit M_GAP1 here.
options_gap2 = (
matrix_match[i-row_offset] + GAP_OPEN_SCORE,
matrix_gap2[i-row_offset] + GAP_EXTEND_SCORE)
matrix_gap2[i] = max(options_gap2)
if options_gap2[0] > options_gap2[1]:
matrix_direction[i] |= A_VERT_O # Must exit M_GAP2 here.
options = (
matrix_match[i-1-row_offset] + match,
matrix_gap1[i],
matrix_gap2[i])
matrix_match[i] = max(options)
if options[0] == matrix_match[i]:
matrix_direction[i] |= A_MATCH # Can stay in M_MATCH here.
if options[1] == matrix_match[i]:
matrix_direction[i] |= A_HORZ_E # Can enter M_GAP1 here.
if options[2] == matrix_match[i]:
matrix_direction[i] |= A_VERT_E # Can enter M_GAP2 here.
if debug:
print("GAP1")
......@@ -204,6 +235,16 @@ def call_variants(template, sequence, location="suffix", cache=True,
print("Match")
for i in range(0, len(matrix_match), row_offset):
print(("%5i" * row_offset) % tuple(matrix_match[i:i+row_offset]))
print("FLAGS")
for i in range(0, len(matrix_direction), row_offset):
print(("%5s|" * row_offset) % tuple("".join([
"h" if x & A_HORZ_O else " ",
"H" if x & A_HORZ_E else " ",
"D" if x & A_MATCH else " ",
"V" if x & A_VERT_E else " ",
"v" if x & A_VERT_O else " "
]) for x in matrix_direction[i:i+row_offset]))
print("Traceback")
# Backtracking.
......@@ -211,25 +252,48 @@ def call_variants(template, sequence, location="suffix", cache=True,
variant_template = 0
variant_sequence = 0
i = len(matrix_match) - 1
in_matrix = M_MATCH # May change before first step.
while i >= 0:
x = i % row_offset
y = i / row_offset
if matrix_gap1[i] == matrix_match[i]:
if debug:
print("(%i, %i)" % (x,y))
if in_matrix == M_MATCH:
# Make gaps as soon as possible (pushed right).
if matrix_direction[i] & A_HORZ_E:
in_matrix = M_GAP1
elif matrix_direction[i] & A_VERT_E:
in_matrix = M_GAP2
elif not (matrix_direction[i] & A_MATCH):
raise ValueError(
"Alignment error: Dead route! (This is a bug.) [%s,%s]" % (template,sequence))
if in_matrix == M_GAP1:
# Go horizontally. Deletion.
variant_template += 1
if matrix_direction[i] & A_HORZ_O:
# End of gap, go diagonally after this.
in_matrix = M_MATCH
i -= 1
continue
if matrix_gap2[i] == matrix_match[i]:
if in_matrix == M_GAP2:
# Go vertically. Insertion.
variant_sequence += 1
if matrix_direction[i] & A_VERT_O:
# End of gap, go diagonally after this.
in_matrix = M_MATCH
i -= row_offset
continue
# Only backtracking diagonally if a gap is out of the question.
# Go diagonally. Either match or mismatch.
if i == 0 or template[x - 1] == sequence[y - 1]:
if i != 0 and template[x - 1] != sequence[y - 1]:
# Start/extend mismatch.
variant_template += 1
variant_sequence += 1
else:
# Match. Flush variants.
if variant_template or variant_sequence:
if location[0] == "M":
......@@ -256,10 +320,6 @@ def call_variants(template, sequence, location="suffix", cache=True,
sequence[y:y+variant_sequence] or "-"))
variant_template = 0
variant_sequence = 0
else:
# Start/extend mismatch.
variant_template += 1
variant_sequence += 1
i -= 1 + row_offset
# Variants were called from right to left. Reverse their order.
......
......@@ -193,6 +193,8 @@ name tssv OK
Find TODOs and FIXMEs in the code:
grep "TODO\|FIXME" *.py */*.py */*/*.py
Number of lines of Python, excluding empty lines and comments:
grep -v "^\s*\(#.*\)\?$" *.py */*.py */*/*.py | wc -l
Number of lines, excluding empty lines and comments:
grep -v "^\s*\(#.*\)\?$" *.py */*.py */*/*.py | wc -l
\ No newline at end of file
Number of lines of JSON, excluding empty lines and those with only a brace:
grep -v "^\s*[{}]\?\[\?\]\?,\?$" */*/*/*.json | wc -l
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment