Commit 4148bb50 authored by Hoogenboom, Jerry's avatar Hoogenboom, Jerry
Browse files

Introducing findnewalleles

New tool findnewalleles:
* Given a list of known sequences, this tool can go through sample data
  files to mark all sequences that are not on the list.

Fixed:
* BGHomRaw, BGEstimate, BGHomStats, Stuttermodel, and Blame did not
  ignore the 'Other sequences' and 'No data' values that may occur in
  the place of a sequence as they were supposed to.

Improved:
* BGHomRaw will now include the sample tag in the "Missing allele X of
  marker Y" error message.
  
Changed:
* The -F/--sequence-format argument from BGHomRaw now defaults to "raw".

Visualisations:
* Updated Vega to version 2.5.0.
* The new version of Vega allowed the sorting to be fixed in Samplevis,
  Profilevis, BGRawvis, and Stuttermodelvis.
* Samplevis:
  * The 'Other sequences' bars are now drawn with an outline only.
  * STR alleles are now sorted by allele length by default (this can be
    toggled with a checkbox in HTML visualisations, and with an option
    in the Vis tool).
  * Fixed the clipping of the start of long allele names when printing
    SVGs from Google Chrome.
  * Added a note (as '?' help tooltip) to the Common axis range option
    in the HTML visualisation, to inform the user of the fact that the
    Split markers option needs to be off for it to work.
parent 13e0d781
......@@ -1253,8 +1253,9 @@ def get_sample_data(tags_to_files, callback, allelelist=None,
alleles = set()
for infile in tags_to_files[tag]:
infile = sys.stdin if infile == "-" else open(infile, "r")
alleles.update(read_sample_data_file(infile, data,
annotation_column, seqformat, library, marker, False))
alleles.update(read_sample_data_file(
infile, data, annotation_column, seqformat, library, marker,
drop_special_seq))
if infile != sys.stdin:
infile.close()
if limit_reads < sys.maxint:
......
......@@ -42,7 +42,8 @@ def add_sample_data(data, sample_data, sample_alleles, min_pct, min_abs, tag):
allele = sample_alleles[marker]
if (marker, allele) not in sample_data:
raise ValueError(
"Missing allele %s of marker %s!" % (allele, marker))
"Missing allele %s of marker %s in sample %s!" %
(allele, marker, tag))
elif 0 in sample_data[marker, allele]:
raise ValueError(
"Allele %s of marker %s has 0 reads!" % (allele, marker))
......@@ -171,7 +172,7 @@ def add_arguments(parser):
"particular true allele (default: %(default)s)")
filtergroup.add_argument('-M', '--marker', metavar="MARKER",
help="work only on MARKER")
add_sequence_format_args(parser)
add_sequence_format_args(parser, "raw")
#add_arguments
......
#!/usr/bin/env python
"""
Mark all sequences that are not in another list of sequences.
Adds a new column 'new_allele' to the input data. An asterisk (*) will
be placed in this column for any sequence that does not occur in the
provided list of known sequences.
"""
import argparse, sys
from ..lib import ensure_sequence_format, add_sequence_format_args, \
get_column_ids, add_input_output_args, SEQ_SPECIAL_VALUES, \
get_input_output_files
__version__ = "0.1dev"
def read_known(infile, library, default_marker=None):
"""Read sequence list from infile, return {marker: [sequences]}"""
data = {}
# Get column numbers. The marker name column is optional.
column_names = infile.readline().rstrip("\r\n").split("\t")
colid_sequence = get_column_ids(column_names, "sequence")
colid_marker = get_column_ids(column_names, "marker", optional=True)
for line in infile:
line = line.rstrip("\r\n").split("\t")
if line[colid_sequence] in SEQ_SPECIAL_VALUES:
continue
marker = line[colid_marker] if colid_marker is not None \
else default_marker
if default_marker is not None and marker != default_marker:
continue
sequence = ensure_sequence_format(line[colid_sequence], "raw",
library=library, marker=marker)
if marker in data:
data[marker].append(sequence)
else:
data[marker] = [sequence]
return data
#read_known
def find_new(infile, outfile, known, library):
column_names = infile.readline().rstrip("\r\n").split("\t")
colid_marker, colid_sequence = get_column_ids(column_names, "marker",
"sequence")
column_names.insert(colid_sequence + 1, "new_allele")
outfile.write("\t".join(column_names) + "\n")
for line in infile:
cols = line.rstrip("\r\n").split("\t")
marker = cols[colid_marker]
cols.insert(colid_sequence + 1, "" if marker in known and
ensure_sequence_format(cols[colid_sequence], "raw",
library=library, marker=marker) in known[marker] else "*")
outfile.write("\t".join(cols) + "\n")
#find_new
def add_arguments(parser):
parser.add_argument('known', metavar="KNOWN",
type=argparse.FileType('r'),
help="file containing a list of known allelic sequences")
add_input_output_args(parser, True, True, False)
filtergroup = parser.add_argument_group("filtering options")
filtergroup.add_argument('-M', '--marker', metavar="MARKER",
help="work only on MARKER")
add_sequence_format_args(parser, "raw", True)
#add_arguments
def run(args):
gen = get_input_output_files(args, True, True)
if not gen:
raise ValueError("please specify an input file, or pipe in the output "
"of another program")
# Read list of known sequences once.
known = read_known(args.known, args.library, args.marker)
for tag, infiles, outfile in gen:
# TODO: Aggregate data from all infiles of each sample.
if len(infiles) > 1:
raise ValueError(
"multiple input files for sample '%s' specified " % tag)
infile = sys.stdin if infiles[0] == "-" else open(infiles[0], "r")
find_new(infile, outfile, known, args.library)
if infile != sys.stdin:
infile.close()
#run
......@@ -133,7 +133,7 @@ def create_visualisation(vistype, infile, outfile, vega, online, tidy, min_abs,
min_pct_of_max, min_pct_of_sum, min_per_strand,
bias_threshold, bar_width, padding, marker, width,
height, log_scale, repeat_unit, no_alldata,
no_aggregate, title):
no_aggregate, no_ce_length_sort, title):
# Get graph spec.
spec = json.load(resource_stream(
"fdstools", "vis/%svis/%svis.json" % (vistype, vistype)))
......@@ -171,7 +171,8 @@ def create_visualisation(vistype, infile, outfile, vega, online, tidy, min_abs,
set_signal_value(spec, "orientation_threshold", min_per_strand)
set_signal_value(spec, "bias_threshold", bias_threshold)
set_signal_value(spec, "amplitude_markerpct_threshold", min_pct_of_sum)
set_signal_value(spec, "show_other", False if no_aggregate else True)
set_signal_value(spec, "show_other", not no_aggregate)
set_signal_value(spec, "sort_str_by_length", not no_ce_length_sort)
# Apply axis scale settings.
if vistype != "stuttermodel" and vistype != "allele":
......@@ -293,6 +294,8 @@ def add_arguments(parser):
default=_DEF_THRESHOLD_BIAS,
help="[sample] mark sequences that have less than this percentage of "
"reads on one strand (default: %(default)s)")
visgroup.add_argument('-c', '--no-ce-length-sort', action="store_true",
help="[sample] if specified, do not sort STR alleles by length")
visgroup.add_argument('-M', '--marker', metavar="MARKER",
default=_DEF_MARKER_NAME,
help="[sample, profile, bgraw, stuttermodel] only show graphs for the "
......@@ -305,6 +308,11 @@ def add_arguments(parser):
"contain the given value; separate multiple values with spaces; "
"prepend any value with '=' for an exact match (default: show "
"all repeat units)")
visgroup.add_argument('-A', '--no-alldata', action="store_true",
help="[stuttermodel] if specified, show only marker-specific fits")
visgroup.add_argument('-a', '--no-aggregate', action="store_true",
help="[sample] if specified, do not replace filtered sequences with a"
"per-marker aggregate 'Other sequences' entry")
visgroup.add_argument('-L', '--log-scale', action="store_true",
help="[sample, profile, bgraw] use logarithmic scale (for sample: "
"square root scale) instead of linear scale")
......@@ -325,11 +333,6 @@ def add_arguments(parser):
default=_DEF_HEIGHT,
help="[stuttermodel, allele] height of the graph area in pixels "
"(default: %(default)s)")
visgroup.add_argument('-A', '--no-alldata', action="store_true",
help="[stuttermodel] if specified, show only marker-specific fits")
visgroup.add_argument('-a', '--no-aggregate', action="store_true",
help="[sample] if specified, do not replace filtered sequences with a"
"per-marker aggregate 'Other sequences' entry")
#add_arguments
......@@ -357,5 +360,6 @@ def run(args):
args.min_per_strand, args.bias_threshold,
args.bar_width, args.padding, args.marker, args.width,
args.height, args.log_scale, args.repeat_unit,
args.no_alldata, args.no_aggregate, args.title)
args.no_alldata, args.no_aggregate,
args.no_ce_length_sort, args.title)
#run
......@@ -66,6 +66,9 @@
"type": "sort",
"by": ["name", "-tnoise", "-maxnoise"]
},
{
"type": "rank"
},
{
"type":
"formula",
......@@ -232,7 +235,7 @@
"points": true,
"padding": 1,
"range": "height",
"domain": {"field": "sequence"}
"domain": {"field": "sequence", "sort": {"field": "rank", "op": "min"}}
}
],
"axes": [
......
......@@ -72,6 +72,9 @@
{
"type": "sort",
"by": ["name", "sequence"]
},
{
"type": "rank"
}
]
},
......@@ -227,7 +230,7 @@
"name": "y",
"type": "ordinal",
"range": "height",
"domain": {"field": "sequence"}
"domain": {"field": "sequence", "sort": {"field": "rank", "op": "min"}}
}
],
"axes": [
......
......@@ -351,9 +351,16 @@
page-break-after: avoid;
page-break-inside: avoid;
}
.vega, .marks {
.vega, canvas {
max-width: 100%;
height: auto;
overflow: visible;
}
svg {
/* Using slightly smaller width to prevent clipping in Chrome */
max-width: 98%;
height: auto;
overflow: visible;
}
canvas {
object-fit: scale-down;
......@@ -532,6 +539,15 @@
<label>
<input type="checkbox" id="commonrange" disabled>
Common axis range
<span class="help" title="This option is only available if 'Split markers' has been turned off.">?</span>
</label>
</td>
</tr>
<tr>
<td colspan="3">
<label>
<input type="checkbox" id="sortstrbylength">
Sort STR alleles by length
</label>
</td>
</tr>
......@@ -631,19 +647,11 @@ function parse(maintainSelection){
visdiv.appendChild(newtab);
var graph = chart({el: newvis, renderer: rendererName});
graph.on("mouseover", function(event, item){
if(item && item.mark && item.mark.name && item.mark.name == "alleleSelector" && item.datum && item.datum.sequence && item.datum.sequence != "Other sequences")
document.body.style.cursor = "pointer";
});
graph.on("mouseout", function(event, item){
if(item && item.mark && item.mark.name && item.mark.name == "alleleSelector" && item.datum && item.datum.sequence && item.datum.sequence != "Other sequences")
document.body.style.cursor = "auto";
});
graph.onSignal("clickedAllele", function(name, value){
if(isAutoselecting)
return;
document.getElementById("clearSelectionLink").setAttribute("class", "");
if(graph.data("selectedAlleles").values().some(function(item){return item.sequence==value})){
if(graph.data("selectedAlleles").values().some(function(item){return item.thedatum==value})){
//User added this allele.
var pos = userDeselected.indexOf(value);
if(pos >= 0)
......@@ -728,7 +736,7 @@ function autoSelectAlleles(graph){
if(isNaN(minNa + minPa + minTa + minCa + minAa + minOa))
return;
(graph === undefined? graphs : [graph]).forEach(function(graph){
var selectedAllelesInGraph = graph.data("selectedAlleles").values().map(function(datum){return datum.sequence});
var selectedAllelesInGraph = graph.data("selectedAlleles").values().map(function(datum){return datum.thedatum});
graph.data("annotated").values().forEach(function(datum){
if(datum.sequence != "Other sequences" &&
datum.total_added >= minNa &&
......@@ -938,15 +946,7 @@ function saveTable(){
autoSelected.concat(userSelected).filter(function(datum, index, self){
return self.indexOf(datum) === index; //Deduplication.
}).sort(function(a, b){
if(a.marker != b.marker)
return (a.marker > b.marker) - (a.marker < b.marker);
if(a.total_added != b.total_added)
return b.total_added - a.total_added;
if(a.total_corr != b.total_corr)
return b.total_corr - a.total_corr;
if(a.total != b.total)
return b.total - a.total;
return (a.sequence > b.sequence) - (a.sequence < b.sequence);
return a.rank - b.rank;
}).map(function(datum){
var notes = [];
if(autoSelected.indexOf(datum) >= 0 && userDeselected.indexOf(datum) >= 0)
......@@ -984,6 +984,11 @@ function saveTable(){
}
//Data table update function.
function updateAllTables(){
if(graphs === false)
return;
graphs.forEach(function(graph, i){updateTable(i)});
}
function updateTable(i){
if(graphs === false)
return;
......@@ -1006,15 +1011,7 @@ function updateTable(i){
autoSelected.concat(userSelected).filter(function(datum, index, self){
return graphAlleles.indexOf(datum) >= 0 && self.indexOf(datum) === index;
}).sort(function(a, b){
if(a.marker != b.marker)
return (a.marker > b.marker) - (a.marker < b.marker);
if(a.total_added != b.total_added)
return b.total_added - a.total_added;
if(a.total_corr != b.total_corr)
return b.total_corr - a.total_corr;
if(a.total != b.total)
return b.total - a.total;
return (a.sequence > b.sequence) - (a.sequence < b.sequence);
return a.rank - b.rank;
}).forEach(function(datum){
row = table.insertRow();
row.insertCell().appendChild(document.createTextNode(datum.marker));
......@@ -1196,6 +1193,10 @@ function onLoadSpec(has_data){
document.getElementById("showother").addEventListener('change', function(){
setSignalValue("show_other", this.checked);
}, false);
document.getElementById("sortstrbylength").addEventListener('change', function(){
setSignalValue("sort_str_by_length", this.checked);
updateAllTables();
}, false);
document.getElementById("minNa").addEventListener('change', function(){
if(!isNaN(parseFloat(this.value)))
autoSelectAlleles();
......@@ -1241,6 +1242,7 @@ function onLoadSpec(has_data){
document.getElementById("minBias").value = getSignalValue("bias_threshold");
document.getElementById("shownegative").checked = getSignalValue("show_negative");
document.getElementById("showother").checked = getSignalValue("show_other");
document.getElementById("sortstrbylength").checked = getSignalValue("sort_str_by_length");
setMessage("Ready", false);
if(has_data)
......
......@@ -26,6 +26,10 @@
"name": "show_other",
"init": true
},
{
"name": "sort_str_by_length",
"init": true
},
{
"name": "bias_threshold",
"init": 25
......@@ -51,7 +55,10 @@
"name": "clickedAllele",
"verbose": true,
"streams": [
{"type": "@alleleSelector:click[datum.thedatum]", "expr": "datum.thedatum"}
{
"type": "@alleleSelector:click[datum.thedatum]",
"expr": "datum.thedatum"
}
]
}
],
......@@ -143,6 +150,18 @@
"field": "pct_of_sum",
"expr": "datum.aggr.sum_total_added? (datum.total_added / datum.aggr.sum_total_added * 100) : 100"
},
{
"type": "formula",
"field": "seq_order",
"expr": "(sort_str_by_length && test(/^CE\\d+\\.?\\d*_/, datum.sequence))? parseFloat(substring(datum.sequence, 2, indexof(datum.sequence, '_'))) : 1/0"
},
{
"type": "sort",
"by": ["marker", "seq_order", "-total_added", "-total_corr", "-total", "sequence"]
},
{
"type": "rank"
},
{
"type": "formula",
"field": "markersequence",
......@@ -211,7 +230,7 @@
]
},
{
"name": "unranked",
"name": "table",
"source": "preannotated",
"transform": [
{
......@@ -241,7 +260,8 @@
{"field": "total_corr", "ops": ["sum"], "as": ["total_corr"]},
{"field": "forward_added", "ops": ["sum"], "as": ["forward_added"]},
{"field": "reverse_added", "ops": ["sum"], "as": ["reverse_added"]},
{"field": "total_added", "ops": ["sum"], "as": ["total_added"]}
{"field": "total_added", "ops": ["sum"], "as": ["total_added"]},
{"field": "rank", "ops": ["min"], "as": ["rank"]}
]
},
{
......@@ -300,56 +320,11 @@
"onKey": "markersequence",
"keys": ["markersequence"],
"as": ["thedatum"]
}
]
},
{
"name": "ranks",
"source": "unranked",
"transform": [
{
"type": "cross",
"diagonal": false,
"filter": "datum.b.marker > datum.a.marker || (datum.b.marker == datum.a.marker && (datum.b.total_added < datum.a.total_added || (datum.b.total_added == datum.a.total_added && (datum.b.total_corr < datum.a.total_corr || (datum.b.total_corr == datum.a.total_corr && (datum.b.total < datum.a.total || (datum.b.total == datum.a.total && datum.b.sequence > datum.a.sequence)))))))"
},
{
"type": "formula",
"field": "markersequence",
"expr": "datum.a.markersequence"
},
{
"type": "formula",
"field": "sequence",
"expr": "datum.a.sequence"
},
{
"type": "aggregate",
"groupby": ["markersequence", "sequence"],
"summarize": [{"field": "*", "ops": ["count"], "as": ["rank"]}]
},
{
"type": "formula",
"field": "rank",
"expr": "datum.sequence == 'Other sequences'? -1 : datum.rank"
}
]
},
{
"name": "table",
"source": "unranked",
"transform": [
{
"type": "lookup",
"on": "ranks",
"onKey": "markersequence",
"keys": ["markersequence"],
"as": ["rankobj"],
"default": {"rank": 0}
},
{
"type": "formula",
"field": "rank",
"expr": "datum.rankobj.rank"
"expr": "datum.sequence == 'Other sequences'? 1/0 : datum.rank"
}
]
},
......@@ -445,26 +420,11 @@
{
"type": "toggle",
"signal": "clickedAllele",
"field": "sequence"
}
],
"transform": [
{
"type": "filter",
"test": "datum.sequence"
"field": "thedatum"
}
]
}
],
"predicates": [
{
"name": "isSelected",
"type": "in",
"item": {"arg": "sequence"},
"data": "selectedAlleles",
"field": "sequence"
}
],
"scales": [
{
"name": "b",
......@@ -542,8 +502,7 @@
"name": "y",
"type": "ordinal",
"range": "height",
"domain": {"field": "sequence", "sort": {"field": "rank", "op": "min"}},
"reverse": true
"domain": {"field": "sequence", "sort": {"field": "rank", "op": "min"}}
}
],
"axes": [
......@@ -600,12 +559,34 @@
"type": "rect",
"properties": {
"update": {
"x": {"scale": "x", "value": 0},
"x2": {"scale": "x", "field": "shared"},
"y": {"scale": "y", "field": "sequence", "offset": 1},
"height": {"scale": "y", "band": true, "offset": -2},
"fill": {"scale": "c", "value": "Genuine reads"},
"fillOpacity": {"value": 0.8}
"x": [
{"test": "datum.thedatum", "scale": "x", "value": 0},
{"scale": "x", "value": 0, "offset": 0.5}
],
"x2": [
{"test": "datum.thedatum", "scale": "x", "field": "shared"},
{"scale": "x", "field": "shared", "offset": -0.5}
],
"y": [
{"test": "datum.thedatum", "scale": "y", "field": "sequence", "offset": 1},
{"scale": "y", "field": "sequence", "offset": 1.5}
],
"height": [
{"test": "datum.thedatum", "scale": "y", "band": true, "offset": -2},
{"scale": "y", "band": true, "offset": -3}
],
"fill": [
{"test": "datum.thedatum", "scale": "c", "value": "Genuine reads"}
],
"stroke": [
{"test": "!datum.thedatum", "scale": "c", "value": "Genuine reads"}
],
"strokeWidth": [
{"test": "datum.thedatum || datum.shared == 0", "value": 0},
{"value": 1}
],
"fillOpacity": {"value": 0.8},
"strokeOpacity": {"value": 0.8}
}
}
},
......@@ -613,12 +594,34 @@
"type": "rect",
"properties": {
"update": {
"x": {"scale": "x", "field": "corr_shared"},
"x2": {"scale": "x", "field": "corr_decreased"},
"y": {"scale": "y", "field": "sequence", "offset": 1},
"height": {"scale": "y", "band": true, "offset": -2},
"fill": {"scale": "c", "value": "Noise reads"},
"fillOpacity": {"value": 0.8}
"x": [
{"test": "datum.thedatum", "scale": "x", "field": "corr_shared"},
{"scale": "x", "field": "corr_shared", "offset": 0.5}
],
"x2": [
{"test": "datum.thedatum", "scale": "x", "field": "corr_decreased"},
{"scale": "x", "field": "corr_decreased", "offset": -0.5}
],
"y": [
{"test": "datum.thedatum", "scale": "y", "field": "sequence", "offset": 1},
{"scale": "y", "field": "sequence", "offset": 1.5}
],
"height": [
{"test": "datum.thedatum", "scale": "y", "band": true, "offset": -2},
{"scale": "y", "band": true, "offset": -3}
],
"fill": [
{"test": "datum.thedatum", "scale": "c", "value": "Noise reads"}
],
"stroke": [
{"test": "!datum.thedatum", "scale": "c", "value": "Noise reads"}
],
"strokeWidth": [
{"test": "datum.thedatum || datum.corr_shared == datum.corr_decreased", "value": 0},
{"value": 1}
],
"fillOpacity": {"value": 0.8},
"strokeOpacity": {"value": 0.8}
}