Commit 29fcc171 authored by Jerry Hoogenboom

Expected allele lengths, and more

Added:
* Added a new section expected_allele_length to the FDSTools library
  format. In this section, the minimum and (optionally) maximum allele
  length of each marker can be specified.
* Added -L/--check-length option to the TSSV tool. If specified, the
  tool will use the expected_allele_length values to filter the results.
* Samplevis can now truncate long allele names to a given number of
  characters (defaulting to 70).
* Added an option to Samplestats to keep negatives when filtering (abs
  filter).

Changed:
* Renamed the --aggregate-below-minimum option of the TSSV tool to
  --aggregate-filtered.

Improved:
* Added an option to read_sample_data_file such that other code can
  request or require that the X_corrected columns are used.
* Samplestats will now round to 4 or 5 significant digits if a value is
  above 1000 or 10000, respectively.
* BGHomRaw will no longer round the forward, reverse, and total columns.
* When generating mtDNA allele names, FDSTools will now try to avoid
  creating gaps in the alignment of the sequences against the reference.
* Grouped the filtering options of the TSSV tool in its help text.
* Cleaned up some leftover code for special sequence value handling
  (more specifically: code that expected ensure_sequence_format to
  return False for special sequence values, which it no longer does).
* Cleaned up some dead legacy code in reduce_read_counts.
parent 4148bb50
......@@ -160,9 +160,6 @@ def call_variants(template, sequence, location="suffix", cache=True,
raise ValueError("Unknown location %r. It should be 'prefix', "
"'suffix', or a tuple (chromosome, position [, endpos])" %
location)
elif location[0] == "M":
# No need to avoid gaps in mtDNA notation.
GAP_OPEN_SCORE = -1
for i in range(len(matrix_match)):
x = i % row_offset
......@@ -414,6 +411,7 @@ def parse_library_ini(handle):
"length_adjust": {},
"block_length": {},
"max_expected_copies": {},
"expected_length": {},
"aliases": {}
}
markers = set()
......@@ -558,6 +556,25 @@ def parse_library_ini(handle):
(value, marker))
library["nostr_reference"][marker] = value
markers.add(marker)
elif section_low == "expected_allele_length":
values = PAT_SPLIT.split(value)
try:
min_length = int(values[0])
except:
raise ValueError(
"Minimum expected allele length '%s' of marker %s "
"is not a valid integer" % (values[0], marker))
if len(values) > 1:
try:
max_length = int(values[1])
except:
raise ValueError(
"Maximum expected allele length '%s' of marker %s "
"is not a valid integer" % (values[1], marker))
else:
max_length = sys.maxint
library["expected_length"][marker] = (min_length, max_length)
markers.add(marker)
# Sanity check: prohibit prefix/suffix for aliases of non-STRs.
for alias in library["aliases"]:
......@@ -1179,12 +1196,21 @@ def get_repeat_pattern(seq):
def read_sample_data_file(infile, data, annotation_column=None, seqformat=None,
library=None, default_marker=None,
drop_special_seq=False):
drop_special_seq=False, after_correction=False):
"""Add data from infile to data dict as [marker, sequence]=reads."""
# Get column numbers.
column_names = infile.readline().rstrip("\r\n").split("\t")
colid_sequence, colid_forward, colid_reverse = \
get_column_ids(column_names, "sequence", "forward", "reverse")
colid_sequence = get_column_ids(column_names, "sequence")
colid_forward = None
colid_reverse = None
if after_correction:
colid_forward, colid_reverse = get_column_ids(column_names,
"forward_corrected", "reverse_corrected",
optional=(after_correction != "require"))
if colid_forward is None:
colid_forward = get_column_ids(column_names, "forward")
if colid_reverse is None:
colid_reverse = get_column_ids(column_names, "reverse")
# Get marker name column if it exists.
colid_marker = get_column_ids(column_names, "marker", optional=True)
......@@ -1224,24 +1250,22 @@ def reduce_read_counts(data, limit_reads):
return
remove = sorted(random.sample(xrange(sum_reads), sum_reads - limit_reads))
i = 0
removed = 0
seen = 0
while i < len(remove) and seen > remove[i]:
# Skip the reads filtered out above.
i += 1
for markerallele in data:
for direction in (0, 1):
seen += data[markerallele][direction]
while i < len(remove) and seen > remove[i]:
while removed < len(remove) and seen > remove[removed]:
data[markerallele][direction] -= 1
i += 1
removed += 1
#reduce_read_counts
def get_sample_data(tags_to_files, callback, allelelist=None,
annotation_column=None, seqformat=None, library=None,
marker=None, homozygotes=False, limit_reads=sys.maxint,
drop_samples=0, drop_special_seq=False):
drop_samples=0, drop_special_seq=False,
after_correction=False):
if drop_samples:
sample_tags = tags_to_files.keys()
for tag in random.sample(xrange(len(sample_tags)),
......@@ -1255,7 +1279,7 @@ def get_sample_data(tags_to_files, callback, allelelist=None,
infile = sys.stdin if infile == "-" else open(infile, "r")
alleles.update(read_sample_data_file(
infile, data, annotation_column, seqformat, library, marker,
drop_special_seq))
drop_special_seq, after_correction))
if infile != sys.stdin:
infile.close()
if limit_reads < sys.maxint:
......
......@@ -340,9 +340,6 @@ def add_sample_data(data, sample_data, sample_alleles, min_pct, min_abs, tag):
if marker not in sample_alleles or not sample_alleles[marker]:
# Sample does not participate in this marker (no alleles).
continue
if allele is False:
# This was a special sequence value, skip it.
continue
p = data[marker]["profiles"]
try:
......
......@@ -50,7 +50,7 @@ def add_sample_data(data, sample_data, sample_alleles, min_pct, min_abs, tag):
# Enter the read counts into data and check the thresholds.
for marker, sequence in sample_data:
if marker not in sample_alleles or sequence is False:
if marker not in sample_alleles:
# Sample does not participate in this marker.
continue
allele = sample_alleles[marker]
......@@ -136,11 +136,11 @@ def compute_ratios(samples_in, outfile, allelefile, annotation_column, min_pct,
outfile.write("\t".join([
data[marker, allele][sequence]["tag"][i], marker, allele,
sequence] + [
"%.3g" % x if abs(x) > 0.0000000001 else "0" for x in (
data[marker, allele][sequence]["forward"][i],
data[marker, allele][sequence]["reverse"][i],
data[marker, allele][sequence]["forward"][i] +
data[marker, allele][sequence]["reverse"][i],
data[marker, allele][sequence]["reverse"][i]] + [
"%.3g" % x if abs(x) > 0.0000000001 else "0" for x in (
data[marker, allele][sequence]["fnoise"][i],
data[marker, allele][sequence]["rnoise"][i],
data[marker, allele][sequence]["tnoise"][i])]) + "\n")
......
......@@ -53,7 +53,7 @@ def add_sample_data(data, sample_data, sample_alleles, min_pct, min_abs):
# Enter the read counts into data and check the thresholds.
for marker, sequence in sample_data:
if marker not in sample_alleles or sequence is False:
if marker not in sample_alleles:
# Sample does not participate in this marker.
continue
allele = sample_alleles[marker]
......
......@@ -310,6 +310,16 @@ def convert_library(infile, outfile, aliases=False):
"mitochondrial DNA or")
ini.set("max_expected_copies",
"; on the Y chromosome).")
ini.add_section("expected_allele_length")
ini.set("expected_allele_length",
"; Specify one or two values for each marker. The first value "
"gives the expected")
ini.set("expected_allele_length",
"; minimum length of the alleles and the second value (if "
"given) specifies the")
ini.set("expected_allele_length",
"; maximum allele length expected for this marker (both "
"inclusive).")
# Enter flanking sequences and STR definitions.
fmt = "%%-%is" % reduce(max, map(len,
......
......@@ -150,8 +150,8 @@ COLUMN_ORDER = [
def compute_stats(infile, outfile, min_reads,
min_per_strand, min_pct_of_max, min_pct_of_sum,
min_correction, min_recovery, filter_action, min_reads_filt,
min_per_strand_filt, min_pct_of_max_filt,
min_correction, min_recovery, filter_action, filter_absolute,
min_reads_filt, min_per_strand_filt, min_pct_of_max_filt,
min_pct_of_sum_filt, min_correction_filt, min_recovery_filt):
# Check presence of required columns.
column_names = infile.readline().rstrip("\r\n").split("\t")
......@@ -419,22 +419,23 @@ def compute_stats(infile, outfile, min_reads,
else row[ci["total_correction_pct"]]
recovery = 0 if "total_added_pct" not in ci \
else row[ci["total_added_pct"]]
min_strand = min(
strands = [
row[ci["forward"]] if "forward_corrected" not in ci
else row[ci["forward_corrected"]],
row[ci["reverse"]] if "reverse_corrected" not in ci
else row[ci["reverse_corrected"]])
else row[ci["reverse_corrected"]]]
fn = abs if filter_absolute else lambda x: x
# Check if this sequence should be filtered out.
# Always filter/combine existing 'Other sequences'.
if filter_action != "off" and (
row[ci["sequence"]] == "Other sequences" or (
total_added < min_reads_filt or
pct_of_max < min_pct_of_max_filt or
pct_of_sum < min_pct_of_sum_filt or
fn(total_added) < min_reads_filt or
fn(pct_of_max) < min_pct_of_max_filt or
fn(pct_of_sum) < min_pct_of_sum_filt or
(correction < min_correction_filt and
recovery < min_recovery_filt) or
min_strand < min_per_strand_filt)):
min(map(fn, strands)) < min_per_strand_filt)):
filtered[marker].append(row)
# Check if this sequence is an allele.
......@@ -444,7 +445,7 @@ def compute_stats(infile, outfile, min_reads,
pct_of_sum >= min_pct_of_sum and
(correction >= min_correction or
recovery >= min_recovery) and
min_strand >= min_per_strand):
min(strands) >= min_per_strand):
row[ci["flags"]].append("allele")
row[ci["flags"]] = ",".join(row[ci["flags"]])
......@@ -616,7 +617,12 @@ def compute_stats(infile, outfile, min_reads,
for i in (ci[column] for column in COLUMN_ORDER if column in ci
and column not in ("total", "forward", "reverse")):
combined[i] = "%#.3g" % combined[i]
if combined[i] >= 10000:
combined[i] = "%#.5g" % combined[i]
elif combined[i] >= 1000:
combined[i] = "%#.4g" % combined[i]
else:
combined[i] = "%#.3g" % combined[i]
outfile.write("\t".join(map(str,
(combined[new_order[i]] for i in range(len(combined))))) +"\n")
#compute_stats
......@@ -660,6 +666,10 @@ def add_arguments(parser):
"filtered sequences by a single line with aggregate values per "
"marker; 'delete', remove filtered sequences without leaving a "
"trace (default: %(default)s)")
filtergroup.add_argument('-A', '--filter-absolute', action="store_true",
help="if specified, apply filters to absolute read counts (i.e., with "
"the sign removed), which may keep over-corrected sequences that "
"would otherwise be filtered out")
filtergroup.add_argument('-N', '--min-reads-filt', metavar="N", type=float,
default=_DEF_MIN_READS_FILT,
help="the minimum number of reads (default: %(default)s)")
......@@ -703,9 +713,10 @@ def run(args):
args.min_reads, args.min_per_strand, args.min_pct_of_max,
args.min_pct_of_sum, args.min_correction,
args.min_recovery, args.filter_action,
args.min_reads_filt, args.min_per_strand_filt,
args.min_pct_of_max_filt, args.min_pct_of_sum_filt,
args.min_correction_filt, args.min_recovery_filt)
args.filter_absolute, args.min_reads_filt,
args.min_per_strand_filt, args.min_pct_of_max_filt,
args.min_pct_of_sum_filt, args.min_correction_filt,
args.min_recovery_filt)
if infile != sys.stdin:
infile.close()
#run
......@@ -153,7 +153,7 @@ def add_sample_data(data, sample_data, sample_alleles, tag, min_pct, min_abs):
# Enter the read counts into data and check the thresholds.
for marker, sequence in sample_data:
if marker not in sample_alleles or sequence is False:
if marker not in sample_alleles:
# Sample does not participate in this marker.
continue
if (tag, marker) not in data["samples"]:
......
......@@ -69,6 +69,10 @@ _DEF_BAR_WIDTH = 15
# This value can be overridden by the -p command line option.
_DEF_SUBGRAPH_PADDING = 70
# Default maximum length of sequences to display.
# This value can be overridden by the -x command line option.
_DEF_MAX_SEQ_LEN = 70
# Default graph width in pixels.
# This value can be overridden by the -w command line option.
_DEF_WIDTH = 600
......@@ -133,7 +137,7 @@ def create_visualisation(vistype, infile, outfile, vega, online, tidy, min_abs,
min_pct_of_max, min_pct_of_sum, min_per_strand,
bias_threshold, bar_width, padding, marker, width,
height, log_scale, repeat_unit, no_alldata,
no_aggregate, no_ce_length_sort, title):
no_aggregate, no_ce_length_sort, max_seq_len, title):
# Get graph spec.
spec = json.load(resource_stream(
"fdstools", "vis/%svis/%svis.json" % (vistype, vistype)))
......@@ -173,6 +177,7 @@ def create_visualisation(vistype, infile, outfile, vega, online, tidy, min_abs,
set_signal_value(spec, "amplitude_markerpct_threshold", min_pct_of_sum)
set_signal_value(spec, "show_other", not no_aggregate)
set_signal_value(spec, "sort_str_by_length", not no_ce_length_sort)
set_signal_value(spec, "max_seq_len", max_seq_len)
# Apply axis scale settings.
if vistype != "stuttermodel" and vistype != "allele":
......@@ -333,6 +338,10 @@ def add_arguments(parser):
default=_DEF_HEIGHT,
help="[stuttermodel, allele] height of the graph area in pixels "
"(default: %(default)s)")
visgroup.add_argument('-x', '--max-seq-len', type=pos_int_arg,
default=_DEF_MAX_SEQ_LEN,
help="[sample] truncate long sequences to this number of characters "
"(default: %(default)s)")
#add_arguments
......@@ -361,5 +370,5 @@ def run(args):
args.bar_width, args.padding, args.marker, args.width,
args.height, args.log_scale, args.repeat_unit,
args.no_alldata, args.no_aggregate,
args.no_ce_length_sort, args.title)
args.no_ce_length_sort, args.max_seq_len, args.title)
#run
......@@ -515,19 +515,23 @@
<div class="optionheader">Display options</div>
<table>
<tr>
<td><label for="graphwidth">Graph width</label></td>
<td colspan="2"><label for="graphwidth">Graph width</label></td>
<td colspan="2"><label><input type="text" value="600" id="graphwidth" size="3" class="num"> px</label></td>
</tr>
<tr>
<td><label for="barwidth">Bar width</label></td>
<td colspan="2"><label for="barwidth">Bar width</label></td>
<td colspan="2"><label><input type="text" value="15" id="barwidth" size="3" class="num"> px</label></td>
</tr>
<tr>
<td><label for="subgraphoffset">Marker spacing</label></td>
<td colspan="2"><label for="subgraphoffset">Marker spacing</label></td>
<td colspan="2"><label><input type="text" value="70" id="subgraphoffset" size="3" class="num"> px</label></td>
</tr>
<tr>
<td colspan="3">
<td colspan="2"><label for="subgraphoffset">Truncate sequences to</label></td>
<td colspan="2"><label><input type="text" value="70" id="maxseqlen" size="3" class="num"> characters</label></td>
</tr>
<tr>
<td colspan="4">
<label>
<input type="checkbox" id="splitmarkers" checked>
Split markers
......@@ -535,7 +539,7 @@
</td>
</tr>
<tr>
<td colspan="3">
<td colspan="4">
<label>
<input type="checkbox" id="commonrange" disabled>
Common axis range
......@@ -544,7 +548,7 @@
</td>
</tr>
<tr>
<td colspan="3">
<td colspan="4">
<label>
<input type="checkbox" id="sortstrbylength">
Sort STR alleles by length
......@@ -553,12 +557,12 @@
</tr>
<tr>
<td>Axis scale</td>
<td><label><input type="radio" name="scale" value="linear" id="scaleLinear" checked> Linear</label></td>
<td colspan="2"><label><input type="radio" name="scale" value="linear" id="scaleLinear" checked> Linear</label></td>
<td><label><input type="radio" name="scale" value="sqrt" id="scaleLog"> Square root</label></td>
</tr>
<tr class="noprint">
<td>Renderer</td>
<td><label><input type="radio" name="renderer" value="svg" id="renderSVG" checked> SVG</label></td>
<td colspan="2"><label><input type="radio" name="renderer" value="svg" id="renderSVG" checked> SVG</label></td>
<td><label><input type="radio" name="renderer" value="canvas" id="renderCanvas"> Canvas</label></td>
</tr>
</table>
......@@ -1141,6 +1145,12 @@ function onLoadSpec(has_data){
return;
setSignalValue("subgraphoffset", value);
}, false);
document.getElementById("maxseqlen").addEventListener('change', function(){
var value = parseFloat(this.value);
if(isNaN(value))
return;
setSignalValue("max_seq_len", value);
}, false);
document.getElementById("splitmarkers").addEventListener('change', function(){
if(graphs !== false)
parse(true);
......@@ -1235,6 +1245,7 @@ function onLoadSpec(has_data){
document.getElementById("graphwidth").value = getSignalValue("width");
document.getElementById("barwidth").value = getSignalValue("barwidth");
document.getElementById("subgraphoffset").value = getSignalValue("subgraphoffset");
document.getElementById("maxseqlen").value = getSignalValue("max_seq_len");
document.getElementById("minN").value = getSignalValue("amplitude_threshold");
document.getElementById("minP").value = getSignalValue("amplitude_pct_threshold");
document.getElementById("minT").value = getSignalValue("amplitude_markerpct_threshold");
......
......@@ -51,6 +51,10 @@
"name": "subgraphoffset",
"init": 70
},
{
"name": "max_seq_len",
"init": 70
},
{
"name": "clickedAllele",
"verbose": true,
......@@ -325,6 +329,11 @@
"type": "formula",
"field": "rank",
"expr": "datum.sequence == 'Other sequences'? 1/0 : datum.rank"
},
{
"type": "formula",
"field": "sequence_disp",
"expr": "length(datum.sequence) > max_seq_len+1? substring(datum.sequence, 0, max_seq_len) + '\u2026' : datum.sequence"
}
]
},
......@@ -750,7 +759,7 @@
"y": {"scale": "balance", "value": 50},
"baseline": {"value": "middle"},
"align": {"value": "right"},
"text": {"field": "sequence"},
"text": {"field": "sequence_disp"},
"fill": [
{
"test": "datum.thedatum && indata('selectedAlleles', datum.thedatum._id, 'thedatum._id')",
......
To-do:
* Samplevis:
* Add option to enable/disable truncating long allele names.
* Breaking issue: vega/vega#538. Causes bars to be merged together in case
of sequence name collisions after shortening the sequence name.
* Option to choose complete table download (all columns, not all rows).
* Option to freely adjust the sorting (currently CE length toggle only).
* When we have them, add default values to table filtering (for reference).
......@@ -21,10 +18,9 @@ To-do:
(maybe also additional value for confidence interval).
* Visualisation to display highest remaining background (positive and
negative) in known samples after BGCorrect analysis.
* Add option to Samplestats to keep negatives when filtering (abs filter).
* Add options to Libconvert to generate a template for STR or non-STR markers.
* Add options to Samplevis, Samplestats (and possibly other relevant tools) to
filter alleles by sequence length.
filter alleles by sequence length. The TSSV tool already supports this.
* Add plotting of raw data points to StuttermodelVis.
* Add a print stylesheet for the other visualisations (only Samplevis has one).
* Add visualisation with all markers in one graph ("samplesummaryvis"?).
......@@ -55,8 +51,6 @@ To-do:
(TODO: Write this list)
Open Vega issues:
* Transforms after aggregate are not processed in order.
https://github.com/vega/vega/issues/538
* Bug in aggregate transform w.r.t. signals.
https://github.com/vega/vega/issues/530
* Lookup transform only takes simple field names for the onKey parameter.
......@@ -118,7 +112,7 @@ output file options allelefinder,bgcorrect,bgestimate,bghomraw,bghomstat
sample tag parsing options allelefinder,bgcorrect,bgestimate,bghomraw,bghomstats,blame,samplestats,seqconvert,stuttermark,stuttermodel
allele detection options bgestimate,bghomraw,bghomstats,blame,stuttermodel
interpretation options samplestats
filtering options allelefinder,bgcorrect,bgestimate,bghomraw,bghomstats,bgpredict,blame,samplestats,stuttermark,stuttermodel
filtering options allelefinder,bgcorrect,bgestimate,bghomraw,bghomstats,bgpredict,blame,samplestats,stuttermark,stuttermodel,tssv
sequence format options allelefinder,bgcorrect,bgestimate,bghomraw,bghomstats,bgmerge,bgpredict,blame,stuttermark,stuttermodel,tssv
random subsampling options bgestimate,bghomstats,stuttermodel
visualisation options vis
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment