Commit 7820cad0 authored by Hoogenboom, Jerry

Bug fixes and improvements in allele name gen and auto allele selection

Fixed:
* In Samplevis HTML visualisations, the automatic allele selection was
  checking only the number of reverse reads against the 'minimum number
  of reads per orientation' setting, ignoring the forward reads.
* In Samplevis HTML visualisations, automatic allele selection would
  fail to select alleles that had exactly the given minimum number of
  reads.
* FDSTools would sometimes calculate incorrect and even negative repeat
  counts when producing TSSV-style sequences and allele names for
  sequences that did not exactly fit the STR structure given in the
  library.

Improved:
* The Samplestats tool now offers the same options for marking alleles
  as Samplevis HTML visualisations do.
* In Samplevis HTML visualisations, user-removed alleles now have a line
  through their table row.
* Added a reference to https://docs.python.org/howto/regex in the sample
  tag parsing options section of the help text of many tools.
* FDSTools will now do a better job of finding the longest possible
  match of the STR repeat definition when producing TSSV-style sequences
  and allele names for sequences that do not exactly fit the STR
  structure given in the library.

Added:
* New visualisation type 'allele'. With Allelevis, you can generate a
  graph of the alleles of the reference samples (output from
  Allelefinder). (Known bug: it has a 'funny' amount of padding.)
parent e7517bbd
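
To make the allele-selection fixes concrete, the corrected check boils
down to the sketch below (hypothetical variable and function names; the
real logic lives in the Samplevis templates and in the Samplestats
changes further down):

    def passes_read_thresholds(forward_reads, reverse_reads,
                               min_reads, min_per_strand):
        # Both orientations are now tested against the per-orientation
        # minimum (previously only the reverse reads were), and counts
        # exactly equal to a minimum now pass, because the comparisons
        # are inclusive (>=, not >).
        return (forward_reads + reverse_reads >= min_reads and
                min(forward_reads, reverse_reads) >= min_per_strand)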
@@ -669,9 +669,16 @@ def load_profiles(profilefile, library=None):
def regex_longest_match(pattern, subject):
"""Return the longest match of the pattern in the subject string."""
return reduce(lambda x, y:
y if x is None or x.end()-x.start() < y.end()-y.start() else x,
pattern.finditer(subject), None)
match = None
pos = 0
while pos < len(subject):
m = pattern.search(subject, pos)
if m is None:
break
if match is None or m.end()-m.start() > match.end()-match.start():
match = m
pos = m.start() + 1
return match
#regex_longest_match
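
A toy illustration of the difference (the pattern is made up for
demonstration and is not a real FDSTools library pattern): finditer()
yields only non-overlapping matches and resumes scanning after each
one, so a longer match that starts inside an earlier match is never
considered, whereas the new loop retries from every start position.

    import re

    pattern = re.compile("AB|BABA")
    subject = "ABABA"

    # finditer() yields ['AB', 'AB']; the 4-character match 'BABA'
    # starting at index 1 is skipped because scanning resumes at index 2.
    print([m.group() for m in pattern.finditer(subject)])

    # regex_longest_match() as defined above searches again from every
    # position, so it finds the overlapping, longer match.
    print(regex_longest_match(pattern, subject).group())  # 'BABA'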
@@ -795,7 +802,8 @@ def convert_sequence_raw_tssv(seq, library, marker, return_alias=False):
middle = reduce(
lambda x, y: (x[:-1] if x[-1][0] == y else x) +
[(y, x[-1][1]+len(y))], middle[1:],
[(middle[0], start+len(middle[0]))])
[(middle[0],
start+len(middle[0])+len(pre_suf[0]))])
else:
# No trickery with prefix or suffix was done.
@@ -1372,7 +1380,9 @@ def add_input_output_args(parser, single_in=False, batch_support=False,
help="file to write a report to (default: write to stderr)")
# Sample tag parsing options group.
group = parser.add_argument_group("sample tag parsing options")
group = parser.add_argument_group("sample tag parsing options",
"for details about REGEX syntax and capturing groups, check "
"https://docs.python.org/howto/regex")
group.add_argument('-e', '--tag-expr', metavar="REGEX", type=regex_arg,
default=DEF_TAG_EXPR,
help="regular expression that captures (using one or more capturing "
@@ -21,16 +21,53 @@ X_add_mp: The number of X_add reads of this sequence as a percentage of
the total X_add of the marker.
X_corrected_mp: The number of X_corrected reads of this sequence as a
percentage of the total number of X_corrected reads of the marker.
flags: Sequences are flagged with 'allele' if they match the given
interpretation options. If the flags column, which always appears in
bgcorrect output, was not already present in the input, an additional
'not_corrected' flag is added to all sequences.
"""
import sys
from ..lib import ensure_sequence_format, add_sequence_format_args, \
add_input_output_args, get_input_output_files, get_column_ids
add_input_output_args, get_input_output_files, pos_int_arg, \
get_column_ids
__version__ = "0.1dev"
def compute_stats(infile, outfile, library, seqformat):
# Default values for parameters are specified below.
# Default minimum number of reads to mark as allele.
# This value can be overridden by the -n command line option.
_DEF_MIN_READS = 30
# Default minimum number of reads per strand to mark as allele.
# This value can be overridden by the -b command line option.
_DEF_MIN_PER_STRAND = 1
# Default minimum percentage of reads w.r.t. the highest allele of the
# marker to mark as allele.
# This value can be overridden by the -m command line option.
_DEF_MIN_PCT_OF_MAX = 5.
# Default minimum percentage of reads w.r.t. the marker's total number
# of reads to mark as allele.
# This value can be overridden by the -p command line option.
_DEF_MIN_PCT_OF_SUM = 3.
# Default minimum percentage of correction to mark as allele.
# This value can be overridden by the -c command line option.
_DEF_MIN_CORRECTION = 0
# Default minimum number of recovered reads as a percentage of the
# original number of reads to mark as allele.
# This value can be overridden by the -r command line option.
_DEF_MIN_RECOVERY = 0
def compute_stats(infile, outfile, library, seqformat, min_reads,
min_per_strand, min_pct_of_max, min_pct_of_sum,
min_correction, min_recovery):
# Get column numbers.
column_names = infile.readline().rstrip("\r\n").split("\t")
colid_name, colid_allele, colid_forward, colid_reverse, colid_total = \
@@ -39,10 +76,15 @@ def compute_stats(infile, outfile, library, seqformat):
(colid_forward_noise, colid_reverse_noise, colid_total_noise,
colid_forward_add, colid_reverse_add, colid_total_add,
colid_forward_corrected, colid_reverse_corrected,
colid_total_corrected) = get_column_ids(column_names, "forward_noise",
"reverse_noise", "total_noise", "forward_add", "reverse_add",
"total_add", "forward_corrected", "reverse_corrected",
"total_corrected", optional=True)
colid_total_corrected, colid_flags) = get_column_ids(column_names,
"forward_noise", "reverse_noise", "total_noise", "forward_add",
"reverse_add", "total_add", "forward_corrected", "reverse_corrected",
"total_corrected", "flags", optional=True)
# Make sure we have the flags column.
if colid_flags is None:
colid_flags = len(column_names)
column_names.append("flags")
# Add columns for which we have the required data.
column_names.append("forward_pct")
@@ -53,9 +95,11 @@ def compute_stats(infile, outfile, library, seqformat):
if colid_reverse_corrected is not None:
column_names.append("reverse_correction_pct")
if colid_total_corrected is not None:
colid_total_correction_pct = len(column_names)
column_names.append("total_correction_pct")
column_names.append("forward_mp")
column_names.append("reverse_mp")
colid_total_mp = len(column_names)
column_names.append("total_mp")
if colid_forward_noise is not None:
column_names.append("forward_noise_mp")
@@ -74,6 +118,7 @@ def compute_stats(infile, outfile, library, seqformat):
if colid_reverse_corrected is not None:
column_names.append("reverse_corrected_mp")
if colid_total_corrected is not None:
colid_total_corrected_mp = len(column_names)
column_names.append("total_corrected_mp")
# Read data.
@@ -141,7 +186,13 @@ def compute_stats(infile, outfile, library, seqformat):
if colid_total_corrected is not None:
marker_total_corrected = sum(
row[colid_total_corrected] for row in data[marker])
marker_max = max(row[colid_total] if colid_total_corrected is None
else row[colid_total_corrected] for row in data[marker])
for row in data[marker]:
if len(row) == colid_flags:
row.append(["not_corrected"])
else:
row[colid_flags] = map(str.strip, row[colid_flags].split(","))
row.append("%.3g" % (100.*row[colid_forward]/row[colid_total]))
if colid_forward_corrected is not None:
if colid_total_corrected is not None:
@@ -203,6 +254,31 @@ def compute_stats(infile, outfile, library, seqformat):
(100.*row[colid_total_corrected]/marker_total_corrected
if marker_total_corrected else 0))
# Check if this sequence is an allele.
total_added = row[colid_total] if colid_total_corrected is None \
else row[colid_total_corrected]
pct_of_max = 100.*(1.*total_added/marker_max if marker_max
else total_added > 0)
pct_of_sum = float(row[colid_total_mp] if colid_total_corrected is
None else row[colid_total_corrected_mp])
correction = float(0 if colid_total_corrected is None
else row[colid_total_correction_pct])
recovery = 0 if colid_total_add is None \
else 100.*row[colid_total_add]/row[colid_total]
min_strand = min(
row[colid_forward] if colid_forward_corrected is None
else row[colid_forward_corrected],
row[colid_reverse] if colid_reverse_corrected is None
else row[colid_reverse_corrected])
if (total_added >= min_reads and
pct_of_max >= min_pct_of_max and
pct_of_sum >= min_pct_of_sum and
correction >= min_correction and
recovery >= min_recovery and
min_strand >= min_per_strand):
row[colid_flags].append("allele")
row[colid_flags] = ",".join(row[colid_flags])
# Write results.
outfile.write("\t".join(column_names) + "\n")
for marker in data:
@@ -213,6 +289,32 @@ def compute_stats(infile, outfile, library, seqformat):
def add_arguments(parser):
add_input_output_args(parser, True, True, False)
intergroup = parser.add_argument_group("interpretation options",
"sequences that match all of these settings are marked as 'allele'")
intergroup.add_argument('-n', '--min-reads', metavar="N", type=pos_int_arg,
default=_DEF_MIN_READS,
help="the minimum number of reads (default: %(default)s)")
intergroup.add_argument('-b', '--min-per-strand', metavar="N",
type=pos_int_arg, default=_DEF_MIN_PER_STRAND,
help="the minimum number of reads in both orientations (default: "
"%(default)s)")
intergroup.add_argument('-m', '--min-pct-of-max', metavar="PCT",
type=float, default=_DEF_MIN_PCT_OF_MAX,
help="the minimum percentage of reads w.r.t. the highest allele of "
"the marker (default: %(default)s)")
intergroup.add_argument('-p', '--min-pct-of-sum', metavar="PCT",
type=float, default=_DEF_MIN_PCT_OF_SUM,
help="the minimum percentage of reads w.r.t. the marker's total "
"number of reads (default: %(default)s)")
intergroup.add_argument('-c', '--min-correction', metavar="PCT",
type=float, default=_DEF_MIN_CORRECTION,
help="the minimum change in read count due to correction by e.g., "
"bgcorrect (default: %(default)s)")
intergroup.add_argument('-r', '--min-recovery', metavar="PCT",
type=float, default=_DEF_MIN_RECOVERY,
help="the minimum number of reads that was recovered thanks to "
"noise correction (by e.g., bgcorrect), as a percentage of the "
"original number of reads (default: %(default)s)")
add_sequence_format_args(parser)
#add_arguments
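
For orientation, a hypothetical invocation exercising these thresholds
(assuming the usual 'fdstools' entry point; the exact input/output
argument syntax may differ):

    # Mark as 'allele' only sequences with at least 50 reads in total
    # and at least 2 reads in each orientation:
    # fdstools samplestats -n 50 -b 2 sample.txt sample-out.txt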
@@ -229,7 +331,10 @@ def run(args):
raise ValueError(
"multiple input files for sample '%s' specified " % tag)
infile = sys.stdin if infiles[0] == "-" else open(infiles[0], "r")
compute_stats(infile, outfile, args.library, args.sequence_format)
compute_stats(infile, outfile, args.library, args.sequence_format,
args.min_reads, args.min_per_strand, args.min_pct_of_max,
args.min_pct_of_sum, args.min_correction,
args.min_recovery)
if infile != sys.stdin:
infile.close()
#run
@@ -145,10 +145,13 @@ def create_visualisation(vistype, infile, outfile, vega, online, tidy, min_abs,
spec["width"] = width
if vistype == "stuttermodel":
set_signal_value(spec, "graphheight", height)
elif vistype == "allele":
spec["height"] = height
else:
set_signal_value(spec, "barwidth", bar_width)
set_signal_value(spec, "subgraphoffset", padding)
set_signal_value(spec, "filter_marker", marker)
if vistype != "allele":
set_signal_value(spec, "subgraphoffset", padding)
set_signal_value(spec, "filter_marker", marker)
# Apply type-specific settings.
if vistype == "stuttermodel":
@@ -165,7 +168,7 @@ def create_visualisation(vistype, infile, outfile, vega, online, tidy, min_abs,
set_signal_value(spec, "bias_threshold", bias_threshold)
# Apply axis scale settings.
if vistype != "stuttermodel":
if vistype != "stuttermodel" and vistype != "allele":
if not log_scale:
set_axis_scale(spec, "x", "linear")
elif vistype == "sample":
@@ -227,14 +230,15 @@ def create_visualisation(vistype, infile, outfile, vega, online, tidy, min_abs,
def add_arguments(parser):
parser.add_argument('type', metavar="TYPE",
choices=("sample", "profile", "bgraw", "stuttermodel"),
choices=("sample", "profile", "bgraw", "stuttermodel", "allele"),
help="the type of data to visualise; use 'sample' to visualise "
"sample data files and bgcorrect output; use 'profile' to "
"visualise background noise profiles obtained with bgestimate, "
"bghomstats, and bgpredict; use 'bgraw' to visualise raw "
"background noise data obtained with bghomraw; use "
"'stuttermodel' to visualise models of stutter obtained from "
"stuttermodel")
"stuttermodel; use 'allele' to visualise the allele list "
"obtained from allelefinder")
parser.add_argument('infile', metavar="IN", nargs="?",
help="file containing the data to embed in the visualisation file; if "
"not specified, HTML visualisation files will contain a file "
@@ -305,12 +309,12 @@ def add_arguments(parser):
"%(default)s)")
visgroup.add_argument('-w', '--width', metavar="N", type=pos_int_arg,
default=_DEF_WIDTH,
help="[sample, profile, bgraw, stuttermodel] width of the graph area "
"in pixels (default: %(default)s)")
help="[sample, profile, bgraw, stuttermodel, allele] width of the "
"graph area in pixels (default: %(default)s)")
visgroup.add_argument('-H', '--height', metavar="N", type=pos_int_arg,
default=_DEF_HEIGHT,
help="[stuttermodel] height of the graph area in pixels (default: "
"%(default)s)")
help="[stuttermodel, allele] height of the graph area in pixels "
"(default: %(default)s)")
visgroup.add_argument('-A', '--no-alldata', action="store_true",
help="[stuttermodel] if specified, show only marker-specific fits")
#add_arguments
{
"width": 500,
"width": 600,
"height": 500,
"data": [
{
"name": "raw",
"values": "VALUES HERE",
"format": {
"type": "tsv"
},
"format": {"type": "tsv"},
"transform": [
{
"type": "formula",
@@ -59,55 +57,79 @@
]
},
{
"name": "nodes",
"name": "samplecounts",
"source": "raw",
"transform": [
{
"type": "aggregate",
"groupby": ["marker", "allele"],
"summarize": {"*": "count"}
}
]
},
{
"name": "indices",
"source": "samplecounts",
"transform": [
{
"type": "cross",
"diagonal": false,
"filter": "datum.a._id > datum.b._id"
},
{
"type": "formula",
"field": "markerallele",
"expr": "datum.marker + datum.allele"
"expr": "datum.a.marker + datum.a.allele"
},
{
"type": "aggregate",
"groupby": ["markerallele"],
"summarize": [{"field": "*", "ops": ["count"], "as": ["index"]}]
}
]
},
{
"name": "nodes",
"source": "samplecounts",
"transform": [
{
"type": "lookup",
"on": "homozygotes",
"on": "indices",
"onKey": "markerallele",
"keys": ["markerallele"],
"as": ["homcountobj"],
"default": {"count": 0}
"as": ["indexobj"],
"default": {"index": 0}
},
{
"type": "formula",
"field": "homCount",
"expr": "datum.homcountobj.count"
"field": "markerallele",
"expr": "datum.marker + datum.allele"
},
{
"type": "lookup",
"on": "homozygotes",
"onKey": "markerallele",
"keys": ["markerallele"],
"as": ["homcount"],
"default": {"count": 0}
}
]
},
{
"name": "edges",
"source": "nodes",
"source": "raw",
"transform": [
{
"type": "cross",
"with": "raw",
"output": {"left": "source", "right": "samples"}
},
{
"type": "filter",
"test": "datum.source.markerallele == datum.samples.markerallele"
"with": "nodes",
"output": {"left": "samples", "right": "source"},
"filter": "datum.source.markerallele == datum.samples.markerallele"
},
{
"type": "cross",
"with": "nodes",
"output": {"left": "a", "right": "target"}
},
{
"type": "filter",
"test": "datum.a.source.marker == datum.target.marker && datum.a.source.allele < datum.target.allele"
"output": {"left": "a", "right": "target"},
"filter": "datum.a.source.marker == datum.target.marker && datum.a.source.allele < datum.target.allele"
},
{
"type": "formula",
@@ -129,17 +151,12 @@
{
"type": "formula",
"field": "source",
"expr": "datum.a.source._id"
"expr": "datum.a.source.indexobj.index"
},
{
"type": "formula",
"field": "target",
"expr": "datum.target._id"
},
{
"type": "formula",
"field": "sampletag",
"expr": "datum.targetsample.sample"
"expr": "datum.target.indexobj.index"
},
{
"type": "aggregate",
@@ -147,6 +164,109 @@
"summarize": {"*": "count"}
}
]
},
{
"name": "layout",
"source": "nodes",
"transform": [
{
"type": "force",
"links": "edges",
"linkDistance": 30,
"linkStrength": 0.1,
"charge": -20,
"iterations": 1000
}
]
}
],
"scales": [
{
"name": "c",
"type": "ordinal",
"range": "category20",
"domain": {"data": "raw", "field": "marker", "sort": true}
},
{
"name": "legend_sizes",
"type": "ordinal",
"range": [1, 5, 10, 20, 50],
"domain": [1, 5, 10, 20, 50]
}
],
"legends": [
{
"fill": "c",
"properties": {
"symbols": {
"size": {"value": 100},
"stroke": {"value": "transparent"},
"fillOpacity": {"value": 0.8}
}
}
},
{
"size": "legend_sizes",
"values": [1, 5, 10, 20, 50]
}
],
"marks": [
{
"type": "path",
"from": {
"data": "edges",
"transform": [
{
"type": "lookup",
"on": "layout",
"keys": ["source", "target"],
"as": ["_source", "_target"]
},
{
"type": "sort",
"by": "_source.marker"
},
{"type": "linkpath"}
]
},
"properties": {
"update": {
"path": {"field": "layout_path"},
"stroke": {"scale": "c", "field": "_source.marker"},
"strokeWidth": {"field": "count", "mult": 0.5},
"strokeOpacity": {"value": 0.3}
}
}
},
{
"type": "symbol",
"from": {"data": "layout"},
"properties": {
"enter": {
"size": {"field": "count"}
},
"update": {
"x": {"field": "layout_x"},
"y": {"field": "layout_y"},
"fill": {"scale": "c", "field": "marker"},
"stroke": {"scale": "c", "field": "marker"}
}
}
},
{
"type": "symbol",
"from": {"data": "layout"},
"properties": {
"enter": {
"size": {"field": "homcount.count"}
},
"update": {
"x": {"field": "layout_x"},
"y": {"field": "layout_y"},
"fill": {"value": "black"},
"stroke": {"value": "black"}
}
}
}
]
}
}
\ No newline at end of file
{
"width": 500,
"height": 500,
"data": [
{
"name": "raw",
"values": "VALUES HERE",
"format": {
"type": "tsv"
},
"transform": [
{
"type": "aggregate",
"groupby": ["marker", "allele"],
"summarize": {"*": "count"}
}
]
},
{
"name": "edges",
"values": "EDGES_VALUES_HERE",
"format": {
"type": "tsv"
}
},
{
"name": "nodes",
"values": "NODES_VALUES_HERE",
"format": {
"type": "tsv"
},
"transform": [
{
"type": "formula",
"field": "charge",
"expr":
"-20*pow(0.05*d.data.count, 1/2)"
},
{
"type": "force",
"links": "edges",
"linkDistance": 30,
"linkStrength": 0.1,
"charge": "charge",
"iterations": 1000
}
]
}
],
"scales": [
{
"name": "c",
"type": "ordinal",
"range": "category20"
},
{
"name": "legend_sizes",
"type": "ordinal",
"range": [1, 5, 10, 20, 50]
}
],
"legends": [
{
"fill": "c",
"stroke": "c",
"properties": {
"symbols": {
"size": {"value": 75}
}
}
},
{
"size": "legend_sizes",
"values": [1, 5, 10, 20, 50]
}
],
"marks": [
{
"type": "path",