Commit 4148bb50 authored by Hoogenboom, Jerry's avatar Hoogenboom, Jerry
Browse files

Introducing findnewalleles

New tool findnewalleles:
* Given a list of known sequences, this tool can go through sample data
  files to mark all sequences that are not on the list.

Fixed:
* BGHomRaw, BGEstimate, BGHomStats, Stuttermodel, and Blame did not
  ignore the 'Other sequences' and 'No data' values that may occur in
  the place of a sequence as they were supposed to.

Improved:
* BGHomRaw will now include the sample tag in the "Missing allele X of
  marker Y" error message.
  
Changed:
* The -F/--sequence-format argument from BGHomRaw now defaults to "raw".

Visualisations:
* Updated Vega to version 2.5.0.
* The new version of Vega allowed the sorting to be fixed in Samplevis,
  Profilevis, BGRawvis, and Stuttermodelvis.
* Samplevis:
  * The 'Other sequences' bars are now drawn with an outline only.
  * STR alleles are now sorted by allele length by default (this can be
    toggled with a checkbox in HTML visualisations, and with an option
    in the Vis tool).
  * Fixed the clipping of the start ...
parent 13e0d781
......@@ -1253,8 +1253,9 @@ def get_sample_data(tags_to_files, callback, allelelist=None,
alleles = set()
for infile in tags_to_files[tag]:
infile = sys.stdin if infile == "-" else open(infile, "r")
alleles.update(read_sample_data_file(infile, data,
annotation_column, seqformat, library, marker, False))
alleles.update(read_sample_data_file(
infile, data, annotation_column, seqformat, library, marker,
drop_special_seq))
if infile != sys.stdin:
infile.close()
if limit_reads < sys.maxint:
......
......@@ -42,7 +42,8 @@ def add_sample_data(data, sample_data, sample_alleles, min_pct, min_abs, tag):
allele = sample_alleles[marker]
if (marker, allele) not in sample_data:
raise ValueError(
"Missing allele %s of marker %s!" % (allele, marker))
"Missing allele %s of marker %s in sample %s!" %
(allele, marker, tag))
elif 0 in sample_data[marker, allele]:
raise ValueError(
"Allele %s of marker %s has 0 reads!" % (allele, marker))
......@@ -171,7 +172,7 @@ def add_arguments(parser):
"particular true allele (default: %(default)s)")
filtergroup.add_argument('-M', '--marker', metavar="MARKER",
help="work only on MARKER")
add_sequence_format_args(parser)
add_sequence_format_args(parser, "raw")
#add_arguments
......
#!/usr/bin/env python
"""
Mark all sequences that are not in another list of sequences.
Adds a new column 'new_allele' to the input data. An asterisk (*) will
be placed in this column for any sequence that does not occur in the
provided list of known sequences.
"""
import argparse, sys
from ..lib import ensure_sequence_format, add_sequence_format_args, \
get_column_ids, add_input_output_args, SEQ_SPECIAL_VALUES, \
get_input_output_files
__version__ = "0.1dev"
def read_known(infile, library, default_marker=None):
"""Read sequence list from infile, return {marker: [sequences]}"""
data = {}
# Get column numbers. The marker name column is optional.
column_names = infile.readline().rstrip("\r\n").split("\t")
colid_sequence = get_column_ids(column_names, "sequence")
colid_marker = get_column_ids(column_names, "marker", optional=True)
for line in infile:
line = line.rstrip("\r\n").split("\t")
if line[colid_sequence] in SEQ_SPECIAL_VALUES:
continue
marker = line[colid_marker] if colid_marker is not None \
else default_marker
if default_marker is not None and marker != default_marker:
continue
sequence = ensure_sequence_format(line[colid_sequence], "raw",
library=library, marker=marker)
if marker in data:
data[marker].append(sequence)
else:
data[marker] = [sequence]
return data
#read_known
def find_new(infile, outfile, known, library):
column_names = infile.readline().rstrip("\r\n").split("\t")
colid_marker, colid_sequence = get_column_ids(column_names, "marker",
"sequence")
column_names.insert(colid_sequence + 1, "new_allele")
outfile.write("\t".join(column_names) + "\n")
for line in infile:
cols = line.rstrip("\r\n").split("\t")
marker = cols[colid_marker]
cols.insert(colid_sequence + 1, "" if marker in known and
ensure_sequence_format(cols[colid_sequence], "raw",
library=library, marker=marker) in known[marker] else "*")
outfile.write("\t".join(cols) + "\n")
#find_new
def add_arguments(parser):
parser.add_argument('known', metavar="KNOWN",
type=argparse.FileType('r'),
help="file containing a list of known allelic sequences")
add_input_output_args(parser, True, True, False)
filtergroup = parser.add_argument_group("filtering options")
filtergroup.add_argument('-M', '--marker', metavar="MARKER",
help="work only on MARKER")
add_sequence_format_args(parser, "raw", True)
#add_arguments
def run(args):
gen = get_input_output_files(args, True, True)
if not gen:
raise ValueError("please specify an input file, or pipe in the output "
"of another program")
# Read list of known sequences once.
known = read_known(args.known, args.library, args.marker)
for tag, infiles, outfile in gen:
# TODO: Aggregate data from all infiles of each sample.
if len(infiles) > 1:
raise ValueError(
"multiple input files for sample '%s' specified " % tag)
infile = sys.stdin if infiles[0] == "-" else open(infiles[0], "r")
find_new(infile, outfile, known, args.library)
if infile != sys.stdin:
infile.close()
#run
......@@ -133,7 +133,7 @@ def create_visualisation(vistype, infile, outfile, vega, online, tidy, min_abs,
min_pct_of_max, min_pct_of_sum, min_per_strand,
bias_threshold, bar_width, padding, marker, width,
height, log_scale, repeat_unit, no_alldata,
no_aggregate, title):
no_aggregate, no_ce_length_sort, title):
# Get graph spec.
spec = json.load(resource_stream(
"fdstools", "vis/%svis/%svis.json" % (vistype, vistype)))
......@@ -171,7 +171,8 @@ def create_visualisation(vistype, infile, outfile, vega, online, tidy, min_abs,
set_signal_value(spec, "orientation_threshold", min_per_strand)
set_signal_value(spec, "bias_threshold", bias_threshold)
set_signal_value(spec, "amplitude_markerpct_threshold", min_pct_of_sum)
set_signal_value(spec, "show_other", False if no_aggregate else True)
set_signal_value(spec, "show_other", not no_aggregate)
set_signal_value(spec, "sort_str_by_length", not no_ce_length_sort)
# Apply axis scale settings.
if vistype != "stuttermodel" and vistype != "allele":
......@@ -293,6 +294,8 @@ def add_arguments(parser):
default=_DEF_THRESHOLD_BIAS,
help="[sample] mark sequences that have less than this percentage of "
"reads on one strand (default: %(default)s)")
visgroup.add_argument('-c', '--no-ce-length-sort', action="store_true",
help="[sample] if specified, do not sort STR alleles by length")
visgroup.add_argument('-M', '--marker', metavar="MARKER",
default=_DEF_MARKER_NAME,
help="[sample, profile, bgraw, stuttermodel] only show graphs for the "
......@@ -305,6 +308,11 @@ def add_arguments(parser):
"contain the given value; separate multiple values with spaces; "
"prepend any value with '=' for an exact match (default: show "
"all repeat units)")
visgroup.add_argument('-A', '--no-alldata', action="store_true",
help="[stuttermodel] if specified, show only marker-specific fits")
visgroup.add_argument('-a', '--no-aggregate', action="store_true",
help="[sample] if specified, do not replace filtered sequences with a"
"per-marker aggregate 'Other sequences' entry")
visgroup.add_argument('-L', '--log-scale', action="store_true",
help="[sample, profile, bgraw] use logarithmic scale (for sample: "
"square root scale) instead of linear scale")
......@@ -325,11 +333,6 @@ def add_arguments(parser):
default=_DEF_HEIGHT,
help="[stuttermodel, allele] height of the graph area in pixels "
"(default: %(default)s)")
visgroup.add_argument('-A', '--no-alldata', action="store_true",
help="[stuttermodel] if specified, show only marker-specific fits")
visgroup.add_argument('-a', '--no-aggregate', action="store_true",
help="[sample] if specified, do not replace filtered sequences with a"
"per-marker aggregate 'Other sequences' entry")
#add_arguments
......@@ -357,5 +360,6 @@ def run(args):
args.min_per_strand, args.bias_threshold,
args.bar_width, args.padding, args.marker, args.width,
args.height, args.log_scale, args.repeat_unit,
args.no_alldata, args.no_aggregate, args.title)
args.no_alldata, args.no_aggregate,
args.no_ce_length_sort, args.title)
#run
......@@ -66,6 +66,9 @@
"type": "sort",
"by": ["name", "-tnoise", "-maxnoise"]
},
{
"type": "rank"
},
{
"type":
"formula",
......@@ -232,7 +235,7 @@
"points": true,
"padding": 1,
"range": "height",
"domain": {"field": "sequence"}
"domain": {"field": "sequence", "sort": {"field": "rank", "op": "min"}}
}
],
"axes": [
......
......@@ -72,6 +72,9 @@
{
"type": "sort",
"by": ["name", "sequence"]
},
{
"type": "rank"
}
]
},
......@@ -227,7 +230,7 @@
"name": "y",
"type": "ordinal",
"range": "height",
"domain": {"field": "sequence"}
"domain": {"field": "sequence", "sort": {"field": "rank", "op": "min"}}
}
],
"axes": [
......
......@@ -351,9 +351,16 @@
page-break-after: avoid;
page-break-inside: avoid;
}
.vega, .marks {
.vega, canvas {
max-width: 100%;
height: auto;
overflow: visible;
}
svg {
/* Using slightly smaller width to prevent clipping in Chrome */
max-width: 98%;
height: auto;
overflow: visible;
}
canvas {
object-fit: scale-down;
......@@ -532,6 +539,15 @@
<label>
<input type="checkbox" id="commonrange" disabled>
Common axis range
<span class="help" title="This option is only available if 'Split markers' has been turned off.">?</span>
</label>
</td>
</tr>
<tr>
<td colspan="3">
<label>
<input type="checkbox" id="sortstrbylength">
Sort STR alleles by length
</label>
</td>
</tr>
......@@ -631,19 +647,11 @@ function parse(maintainSelection){
visdiv.appendChild(newtab);
var graph = chart({el: newvis, renderer: rendererName});
graph.on("mouseover", function(event, item){
if(item && item.mark && item.mark.name && item.mark.name == "alleleSelector" && item.datum && item.datum.sequence && item.datum.sequence != "Other sequences")
document.body.style.cursor = "pointer";
});
graph.on("mouseout", function(event, item){
if(item && item.mark && item.mark.name && item.mark.name == "alleleSelector" && item.datum && item.datum.sequence && item.datum.sequence != "Other sequences")
document.body.style.cursor = "auto";
});
graph.onSignal("clickedAllele", function(name, value){
if(isAutoselecting)
return;
document.getElementById("clearSelectionLink").setAttribute("class", "");
if(graph.data("selectedAlleles").values().some(function(item){return item.sequence==value})){
if(graph.data("selectedAlleles").values().some(function(item){return item.thedatum==value})){
//User added this allele.
var pos = userDeselected.indexOf(value);
if(pos >= 0)
......@@ -728,7 +736,7 @@ function autoSelectAlleles(graph){
if(isNaN(minNa + minPa + minTa + minCa + minAa + minOa))
return;
(graph === undefined? graphs : [graph]).forEach(function(graph){
var selectedAllelesInGraph = graph.data("selectedAlleles").values().map(function(datum){return datum.sequence});
var selectedAllelesInGraph = graph.data("selectedAlleles").values().map(function(datum){return datum.thedatum});
graph.data("annotated").values().forEach(function(datum){
if(datum.sequence != "Other sequences" &&
datum.total_added >= minNa &&
......@@ -938,15 +946,7 @@ function saveTable(){
autoSelected.concat(userSelected).filter(function(datum, index, self){
return self.indexOf(datum) === index; //Deduplication.
}).sort(function(a, b){
if(a.marker != b.marker)
return (a.marker > b.marker) - (a.marker < b.marker);
if(a.total_added != b.total_added)
return b.total_added - a.total_added;
if(a.total_corr != b.total_corr)
return b.total_corr - a.total_corr;
if(a.total != b.total)
return b.total - a.total;
return (a.sequence > b.sequence) - (a.sequence < b.sequence);
return a.rank - b.rank;
}).map(function(datum){
var notes = [];
if(autoSelected.indexOf(datum) >= 0 && userDeselected.indexOf(datum) >= 0)
......@@ -984,6 +984,11 @@ function saveTable(){
}
//Data table update function.
function updateAllTables(){
if(graphs === false)
return;
graphs.forEach(function(graph, i){updateTable(i)});
}
function updateTable(i){
if(graphs === false)
return;
......@@ -1006,15 +1011,7 @@ function updateTable(i){
autoSelected.concat(userSelected).filter(function(datum, index, self){
return graphAlleles.indexOf(datum) >= 0 && self.indexOf(datum) === index;
}).sort(function(a, b){
if(a.marker != b.marker)
return (a.marker > b.marker) - (a.marker < b.marker);
if(a.total_added != b.total_added)
return b.total_added - a.total_added;
if(a.total_corr != b.total_corr)
return b.total_corr - a.total_corr;
if(a.total != b.total)
return b.total - a.total;
return (a.sequence > b.sequence) - (a.sequence < b.sequence);
return a.rank - b.rank;
}).forEach(function(datum){
row = table.insertRow();
row.insertCell().appendChild(document.createTextNode(datum.marker));
......@@ -1196,6 +1193,10 @@ function onLoadSpec(has_data){
document.getElementById("showother").addEventListener('change', function(){
setSignalValue("show_other", this.checked);
}, false);
document.getElementById("sortstrbylength").addEventListener('change', function(){
setSignalValue("sort_str_by_length", this.checked);
updateAllTables();
}, false);
document.getElementById("minNa").addEventListener('change', function(){
if(!isNaN(parseFloat(this.value)))
autoSelectAlleles();
......@@ -1241,6 +1242,7 @@ function onLoadSpec(has_data){
document.getElementById("minBias").value = getSignalValue("bias_threshold");
document.getElementById("shownegative").checked = getSignalValue("show_negative");
document.getElementById("showother").checked = getSignalValue("show_other");
document.getElementById("sortstrbylength").checked = getSignalValue("sort_str_by_length");
setMessage("Ready", false);
if(has_data)
......
......@@ -26,6 +26,10 @@
"name": "show_other",
"init": true
},
{
"name": "sort_str_by_length",
"init": true
},
{
"name": "bias_threshold",
"init": 25
......@@ -51,7 +55,10 @@
"name": "clickedAllele",
"verbose": true,
"streams": [
{"type": "@alleleSelector:click[datum.thedatum]", "expr": "datum.thedatum"}
{
"type": "@alleleSelector:click[datum.thedatum]",
"expr": "datum.thedatum"
}
]
}
],
......@@ -143,6 +150,18 @@
"field": "pct_of_sum",
"expr": "datum.aggr.sum_total_added? (datum.total_added / datum.aggr.sum_total_added * 100) : 100"
},
{
"type": "formula",
"field": "seq_order",
"expr": "(sort_str_by_length && test(/^CE\\d+\\.?\\d*_/, datum.sequence))? parseFloat(substring(datum.sequence, 2, indexof(datum.sequence, '_'))) : 1/0"
},
{
"type": "sort",
"by": ["marker", "seq_order", "-total_added", "-total_corr", "-total", "sequence"]
},
{
"type": "rank"
},
{
"type": "formula",
"field": "markersequence",
......@@ -211,7 +230,7 @@
]
},
{
"name": "unranked",
"name": "table",
"source": "preannotated",
"transform": [
{
......@@ -241,7 +260,8 @@
{"field": "total_corr", "ops": ["sum"], "as": ["total_corr"]},
{"field": "forward_added", "ops": ["sum"], "as": ["forward_added"]},
{"field": "reverse_added", "ops": ["sum"], "as": ["reverse_added"]},
{"field": "total_added", "ops": ["sum"], "as": ["total_added"]}
{"field": "total_added", "ops": ["sum"], "as": ["total_added"]},
{"field": "rank", "ops": ["min"], "as": ["rank"]}
]
},
{
......@@ -300,56 +320,11 @@
"onKey": "markersequence",
"keys": ["markersequence"],
"as": ["thedatum"]
}
]
},
{
"name": "ranks",
"source": "unranked",
"transform": [
{
"type": "cross",
"diagonal": false,
"filter": "datum.b.marker > datum.a.marker || (datum.b.marker == datum.a.marker && (datum.b.total_added < datum.a.total_added || (datum.b.total_added == datum.a.total_added && (datum.b.total_corr < datum.a.total_corr || (datum.b.total_corr == datum.a.total_corr && (datum.b.total < datum.a.total || (datum.b.total == datum.a.total && datum.b.sequence > datum.a.sequence)))))))"
},
{
"type": "formula",
"field": "markersequence",
"expr": "datum.a.markersequence"
},
{
"type": "formula",
"field": "sequence",
"expr": "datum.a.sequence"
},
{
"type": "aggregate",
"groupby": ["markersequence", "sequence"],
"summarize": [{"field": "*", "ops": ["count"], "as": ["rank"]}]
},
{
"type": "formula",
"field": "rank",
"expr": "datum.sequence == 'Other sequences'? -1 : datum.rank"
}
]
},
{
"name": "table",
"source": "unranked",
"transform": [
{
"type": "lookup",
"on": "ranks",
"onKey": "markersequence",
"keys": ["markersequence"],
"as": ["rankobj"],
"default": {"rank": 0}
},
{
"type": "formula",
"field": "rank",
"expr": "datum.rankobj.rank"
"expr": "datum.sequence == 'Other sequences'? 1/0 : datum.rank"
}
]
},
......@@ -445,26 +420,11 @@
{
"type": "toggle",
"signal": "clickedAllele",
"field": "sequence"
}
],
"transform": [
{
"type": "filter",
"test": "datum.sequence"
"field": "thedatum"
}
]
}
],
"predicates": [
{
"name": "isSelected",
"type": "in",
"item": {"arg": "sequence"},
"data": "selectedAlleles",
"field": "sequence"
}
],
"scales": [
{
"name": "b",
......@@ -542,8 +502,7 @@
"name": "y",
"type": "ordinal",
"range": "height",
"domain": {"field": "sequence", "sort": {"field": "rank", "op": "min"}},
"reverse": true
"domain": {"field": "sequence", "sort": {"field": "rank", "op": "min"}}
}
],
"axes": [
......@@ -600,12 +559,34 @@
"type": "rect",
"properties": {
"update": {
"x": {"scale": "x", "value": 0},
"x2": {"scale": "x", "field": "shared"},
"y": {"scale": "y", "field": "sequence", "offset": 1},
"height": {"scale": "y", "band": true, "offset": -2},
"fill": {"scale": "c", "value": "Genuine reads"},
"fillOpacity": {"value": 0.8}
"x": [
{"test": "datum.thedatum", "scale": "x", "value": 0},
{"scale": "x", "value": 0, "offset": 0.5}
],
"x2": [
{"test": "datum.thedatum", "scale": "x", "field": "shared"},
{"scale": "x", "field": "shared", "offset": -0.5}
],
"y": [
{"test": "datum.thedatum", "scale": "y", "field": "sequence", "offset": 1},
{"scale": "y", "field": "sequence", "offset": 1.5}
],
"height": [
{"test": "datum.thedatum", "scale": "y", "band": true, "offset": -2},
{"scale": "y", "band": true, "offset": -3}
],
"fill": [
{"test": "datum.thedatum", "scale": "c", "value": "Genuine reads"}
],
"stroke": [
{"test": "!datum.thedatum", "scale": "c", "value": "Genuine reads"}
],
"strokeWidth": [
{"test": "datum.thedatum || datum.shared == 0", "value": 0},
{"value": 1}
],
"fillOpacity": {"value": 0.8},
"strokeOpacity": {"value": 0.8}
}
}
},
......@@ -613,12 +594,34 @@
"type": "rect",
"properties": {
"update": {
"x": {"scale": "x", "field": "corr_shared"},
"x2": {"scale": "x", "field": "corr_decreased"},
"y": {"scale": "y", "field": "sequence", "offset": 1},
"height": {"scale": "y", "band": true, "offset": -2},
"fill": {"scale": "c", "value": "Noise reads"},
"fillOpacity": {"value": 0.8}
"x": [
{"test": "datum.thedatum", "scale": "x", "field": "corr_shared"},
{"scale": "x", "field": "corr_shared", "offset": 0.5}
],
"x2": [
{"test": "datum.thedatum", "scale": "x", "field": "corr_decreased"},
{"scale": "x", "field": "corr_decreased", "offset": -0.5}
],
"y": [
{"test": "datum.thedatum", "scale": "y", "field": "sequence", "offset": 1},
{"scale": "y", "field": "sequence", "offset": 1.5}
],
"height": [
{"test": "datum.thedatum", "scale": "y", "band": true, "offset": -2},
{"scale": "y", "band": true, "offset": -3}
],
"fill": [
{"test": "datum.thedatum", "scale": "c", "value": "Noise reads"}
],
"stroke": [
{"test": "!datum.thedatum", "scale": "c", "value": "Noise reads"}
],
"strokeWidth": [
{"test": "datum.thedatum || datum.corr_shared == datum.corr_decreased", "value": 0},
{"value": 1}
],
"fillOpacity": {"value": 0.8},
"strokeOpacity": {"value": 0.8}
}
}
},
......@@ -626,12 +629,34 @@
"type": "rect",
"properties": {
"update": {
"x": {"scale": "x", "field": "total_corr"},