Commit 4148bb50 authored by Hoogenboom, Jerry's avatar Hoogenboom, Jerry

Introducing findnewalleles

New tool findnewalleles:
* Given a list of known sequences, this tool can go through sample data
  files to mark all sequences that are not on the list.

Fixed:
* BGHomRaw, BGEstimate, BGHomStats, Stuttermodel, and Blame did not
  ignore the 'Other sequences' and 'No data' values that may occur in
  the place of a sequence as they were supposed to.

Improved:
* BGHomRaw will now include the sample tag in the "Missing allele X of
  marker Y" error message.
  
Changed:
* The -F/--sequence-format argument from BGHomRaw now defaults to "raw".

Visualisations:
* Updated Vega to version 2.5.0.
* The new version of Vega allowed the sorting to be fixed in Samplevis,
  Profilevis, BGRawvis, and Stuttermodelvis.
* Samplevis:
  * The 'Other sequences' bars are now drawn with an outline only.
  * STR alleles are now sorted by allele length by default (this can be
    toggled with a checkbox in HTML visualisations, and with an option
    in the Vis tool).
  * Fixed the clipping of the start of long allele names when printing
    SVGs from Google Chrome.
  * Added a note (as '?' help tooltip) to the Common axis range option
    in the HTML visualisation, to inform the user of the fact that the
    Split markers option needs to be off for it to work.
parent 13e0d781
......@@ -1253,8 +1253,9 @@ def get_sample_data(tags_to_files, callback, allelelist=None,
alleles = set()
for infile in tags_to_files[tag]:
infile = sys.stdin if infile == "-" else open(infile, "r")
alleles.update(read_sample_data_file(infile, data,
annotation_column, seqformat, library, marker, False))
alleles.update(read_sample_data_file(
infile, data, annotation_column, seqformat, library, marker,
drop_special_seq))
if infile != sys.stdin:
infile.close()
if limit_reads < sys.maxint:
......
......@@ -42,7 +42,8 @@ def add_sample_data(data, sample_data, sample_alleles, min_pct, min_abs, tag):
allele = sample_alleles[marker]
if (marker, allele) not in sample_data:
raise ValueError(
"Missing allele %s of marker %s!" % (allele, marker))
"Missing allele %s of marker %s in sample %s!" %
(allele, marker, tag))
elif 0 in sample_data[marker, allele]:
raise ValueError(
"Allele %s of marker %s has 0 reads!" % (allele, marker))
......@@ -171,7 +172,7 @@ def add_arguments(parser):
"particular true allele (default: %(default)s)")
filtergroup.add_argument('-M', '--marker', metavar="MARKER",
help="work only on MARKER")
add_sequence_format_args(parser)
add_sequence_format_args(parser, "raw")
#add_arguments
......
#!/usr/bin/env python
"""
Mark all sequences that are not in another list of sequences.
Adds a new column 'new_allele' to the input data. An asterisk (*) will
be placed in this column for any sequence that does not occur in the
provided list of known sequences.
"""
import argparse, sys
from ..lib import ensure_sequence_format, add_sequence_format_args, \
get_column_ids, add_input_output_args, SEQ_SPECIAL_VALUES, \
get_input_output_files
__version__ = "0.1dev"
def read_known(infile, library, default_marker=None):
"""Read sequence list from infile, return {marker: [sequences]}"""
data = {}
# Get column numbers. The marker name column is optional.
column_names = infile.readline().rstrip("\r\n").split("\t")
colid_sequence = get_column_ids(column_names, "sequence")
colid_marker = get_column_ids(column_names, "marker", optional=True)
for line in infile:
line = line.rstrip("\r\n").split("\t")
if line[colid_sequence] in SEQ_SPECIAL_VALUES:
continue
marker = line[colid_marker] if colid_marker is not None \
else default_marker
if default_marker is not None and marker != default_marker:
continue
sequence = ensure_sequence_format(line[colid_sequence], "raw",
library=library, marker=marker)
if marker in data:
data[marker].append(sequence)
else:
data[marker] = [sequence]
return data
#read_known
def find_new(infile, outfile, known, library):
column_names = infile.readline().rstrip("\r\n").split("\t")
colid_marker, colid_sequence = get_column_ids(column_names, "marker",
"sequence")
column_names.insert(colid_sequence + 1, "new_allele")
outfile.write("\t".join(column_names) + "\n")
for line in infile:
cols = line.rstrip("\r\n").split("\t")
marker = cols[colid_marker]
cols.insert(colid_sequence + 1, "" if marker in known and
ensure_sequence_format(cols[colid_sequence], "raw",
library=library, marker=marker) in known[marker] else "*")
outfile.write("\t".join(cols) + "\n")
#find_new
def add_arguments(parser):
parser.add_argument('known', metavar="KNOWN",
type=argparse.FileType('r'),
help="file containing a list of known allelic sequences")
add_input_output_args(parser, True, True, False)
filtergroup = parser.add_argument_group("filtering options")
filtergroup.add_argument('-M', '--marker', metavar="MARKER",
help="work only on MARKER")
add_sequence_format_args(parser, "raw", True)
#add_arguments
def run(args):
gen = get_input_output_files(args, True, True)
if not gen:
raise ValueError("please specify an input file, or pipe in the output "
"of another program")
# Read list of known sequences once.
known = read_known(args.known, args.library, args.marker)
for tag, infiles, outfile in gen:
# TODO: Aggregate data from all infiles of each sample.
if len(infiles) > 1:
raise ValueError(
"multiple input files for sample '%s' specified " % tag)
infile = sys.stdin if infiles[0] == "-" else open(infiles[0], "r")
find_new(infile, outfile, known, args.library)
if infile != sys.stdin:
infile.close()
#run
......@@ -133,7 +133,7 @@ def create_visualisation(vistype, infile, outfile, vega, online, tidy, min_abs,
min_pct_of_max, min_pct_of_sum, min_per_strand,
bias_threshold, bar_width, padding, marker, width,
height, log_scale, repeat_unit, no_alldata,
no_aggregate, title):
no_aggregate, no_ce_length_sort, title):
# Get graph spec.
spec = json.load(resource_stream(
"fdstools", "vis/%svis/%svis.json" % (vistype, vistype)))
......@@ -171,7 +171,8 @@ def create_visualisation(vistype, infile, outfile, vega, online, tidy, min_abs,
set_signal_value(spec, "orientation_threshold", min_per_strand)
set_signal_value(spec, "bias_threshold", bias_threshold)
set_signal_value(spec, "amplitude_markerpct_threshold", min_pct_of_sum)
set_signal_value(spec, "show_other", False if no_aggregate else True)
set_signal_value(spec, "show_other", not no_aggregate)
set_signal_value(spec, "sort_str_by_length", not no_ce_length_sort)
# Apply axis scale settings.
if vistype != "stuttermodel" and vistype != "allele":
......@@ -293,6 +294,8 @@ def add_arguments(parser):
default=_DEF_THRESHOLD_BIAS,
help="[sample] mark sequences that have less than this percentage of "
"reads on one strand (default: %(default)s)")
visgroup.add_argument('-c', '--no-ce-length-sort', action="store_true",
help="[sample] if specified, do not sort STR alleles by length")
visgroup.add_argument('-M', '--marker', metavar="MARKER",
default=_DEF_MARKER_NAME,
help="[sample, profile, bgraw, stuttermodel] only show graphs for the "
......@@ -305,6 +308,11 @@ def add_arguments(parser):
"contain the given value; separate multiple values with spaces; "
"prepend any value with '=' for an exact match (default: show "
"all repeat units)")
visgroup.add_argument('-A', '--no-alldata', action="store_true",
help="[stuttermodel] if specified, show only marker-specific fits")
visgroup.add_argument('-a', '--no-aggregate', action="store_true",
help="[sample] if specified, do not replace filtered sequences with a"
"per-marker aggregate 'Other sequences' entry")
visgroup.add_argument('-L', '--log-scale', action="store_true",
help="[sample, profile, bgraw] use logarithmic scale (for sample: "
"square root scale) instead of linear scale")
......@@ -325,11 +333,6 @@ def add_arguments(parser):
default=_DEF_HEIGHT,
help="[stuttermodel, allele] height of the graph area in pixels "
"(default: %(default)s)")
visgroup.add_argument('-A', '--no-alldata', action="store_true",
help="[stuttermodel] if specified, show only marker-specific fits")
visgroup.add_argument('-a', '--no-aggregate', action="store_true",
help="[sample] if specified, do not replace filtered sequences with a"
"per-marker aggregate 'Other sequences' entry")
#add_arguments
......@@ -357,5 +360,6 @@ def run(args):
args.min_per_strand, args.bias_threshold,
args.bar_width, args.padding, args.marker, args.width,
args.height, args.log_scale, args.repeat_unit,
args.no_alldata, args.no_aggregate, args.title)
args.no_alldata, args.no_aggregate,
args.no_ce_length_sort, args.title)
#run
......@@ -66,6 +66,9 @@
"type": "sort",
"by": ["name", "-tnoise", "-maxnoise"]
},
{
"type": "rank"
},
{
"type":
"formula",
......@@ -232,7 +235,7 @@
"points": true,
"padding": 1,
"range": "height",
"domain": {"field": "sequence"}
"domain": {"field": "sequence", "sort": {"field": "rank", "op": "min"}}
}
],
"axes": [
......
......@@ -72,6 +72,9 @@
{
"type": "sort",
"by": ["name", "sequence"]
},
{
"type": "rank"
}
]
},
......@@ -227,7 +230,7 @@
"name": "y",
"type": "ordinal",
"range": "height",
"domain": {"field": "sequence"}
"domain": {"field": "sequence", "sort": {"field": "rank", "op": "min"}}
}
],
"axes": [
......
......@@ -351,9 +351,16 @@
page-break-after: avoid;
page-break-inside: avoid;
}
.vega, .marks {
.vega, canvas {
max-width: 100%;
height: auto;
overflow: visible;
}
svg {
/* Using slightly smaller width to prevent clipping in Chrome */
max-width: 98%;
height: auto;
overflow: visible;
}
canvas {
object-fit: scale-down;
......@@ -532,6 +539,15 @@
<label>
<input type="checkbox" id="commonrange" disabled>
Common axis range
<span class="help" title="This option is only available if 'Split markers' has been turned off.">?</span>
</label>
</td>
</tr>
<tr>
<td colspan="3">
<label>
<input type="checkbox" id="sortstrbylength">
Sort STR alleles by length
</label>
</td>
</tr>
......@@ -631,19 +647,11 @@ function parse(maintainSelection){
visdiv.appendChild(newtab);
var graph = chart({el: newvis, renderer: rendererName});
graph.on("mouseover", function(event, item){
if(item && item.mark && item.mark.name && item.mark.name == "alleleSelector" && item.datum && item.datum.sequence && item.datum.sequence != "Other sequences")
document.body.style.cursor = "pointer";
});
graph.on("mouseout", function(event, item){
if(item && item.mark && item.mark.name && item.mark.name == "alleleSelector" && item.datum && item.datum.sequence && item.datum.sequence != "Other sequences")
document.body.style.cursor = "auto";
});
graph.onSignal("clickedAllele", function(name, value){
if(isAutoselecting)
return;
document.getElementById("clearSelectionLink").setAttribute("class", "");
if(graph.data("selectedAlleles").values().some(function(item){return item.sequence==value})){
if(graph.data("selectedAlleles").values().some(function(item){return item.thedatum==value})){
//User added this allele.
var pos = userDeselected.indexOf(value);
if(pos >= 0)
......@@ -728,7 +736,7 @@ function autoSelectAlleles(graph){
if(isNaN(minNa + minPa + minTa + minCa + minAa + minOa))
return;
(graph === undefined? graphs : [graph]).forEach(function(graph){
var selectedAllelesInGraph = graph.data("selectedAlleles").values().map(function(datum){return datum.sequence});
var selectedAllelesInGraph = graph.data("selectedAlleles").values().map(function(datum){return datum.thedatum});
graph.data("annotated").values().forEach(function(datum){
if(datum.sequence != "Other sequences" &&
datum.total_added >= minNa &&
......@@ -938,15 +946,7 @@ function saveTable(){
autoSelected.concat(userSelected).filter(function(datum, index, self){
return self.indexOf(datum) === index; //Deduplication.
}).sort(function(a, b){
if(a.marker != b.marker)
return (a.marker > b.marker) - (a.marker < b.marker);
if(a.total_added != b.total_added)
return b.total_added - a.total_added;
if(a.total_corr != b.total_corr)
return b.total_corr - a.total_corr;
if(a.total != b.total)
return b.total - a.total;
return (a.sequence > b.sequence) - (a.sequence < b.sequence);
return a.rank - b.rank;
}).map(function(datum){
var notes = [];
if(autoSelected.indexOf(datum) >= 0 && userDeselected.indexOf(datum) >= 0)
......@@ -984,6 +984,11 @@ function saveTable(){
}
//Data table update function.
function updateAllTables(){
if(graphs === false)
return;
graphs.forEach(function(graph, i){updateTable(i)});
}
function updateTable(i){
if(graphs === false)
return;
......@@ -1006,15 +1011,7 @@ function updateTable(i){
autoSelected.concat(userSelected).filter(function(datum, index, self){
return graphAlleles.indexOf(datum) >= 0 && self.indexOf(datum) === index;
}).sort(function(a, b){
if(a.marker != b.marker)
return (a.marker > b.marker) - (a.marker < b.marker);
if(a.total_added != b.total_added)
return b.total_added - a.total_added;
if(a.total_corr != b.total_corr)
return b.total_corr - a.total_corr;
if(a.total != b.total)
return b.total - a.total;
return (a.sequence > b.sequence) - (a.sequence < b.sequence);
return a.rank - b.rank;
}).forEach(function(datum){
row = table.insertRow();
row.insertCell().appendChild(document.createTextNode(datum.marker));
......@@ -1196,6 +1193,10 @@ function onLoadSpec(has_data){
document.getElementById("showother").addEventListener('change', function(){
setSignalValue("show_other", this.checked);
}, false);
document.getElementById("sortstrbylength").addEventListener('change', function(){
setSignalValue("sort_str_by_length", this.checked);
updateAllTables();
}, false);
document.getElementById("minNa").addEventListener('change', function(){
if(!isNaN(parseFloat(this.value)))
autoSelectAlleles();
......@@ -1241,6 +1242,7 @@ function onLoadSpec(has_data){
document.getElementById("minBias").value = getSignalValue("bias_threshold");
document.getElementById("shownegative").checked = getSignalValue("show_negative");
document.getElementById("showother").checked = getSignalValue("show_other");
document.getElementById("sortstrbylength").checked = getSignalValue("sort_str_by_length");
setMessage("Ready", false);
if(has_data)
......
This diff is collapsed.
......@@ -187,6 +187,9 @@
{
"type": "sort",
"by": ["markerdirection", "length"]
},
{
"type": "rank"
}
]
}
......@@ -255,7 +258,7 @@
"name": "c",
"type": "ordinal",
"range": "category20",
"domain": {"field": "markerdirection"}
"domain": {"field": "markerdirection", "sort": {"field": "rank", "op": "min"}}
}
],
"axes": [
......
This diff is collapsed.
To-do:
* Samplevis:
* Give 'Other sequences' bars a distinct look.
* Add option to truncate long allele names.
* Sort STR alleles by length by default.
* Option to adjust the sorting.
* Add option to enable/disable truncating long allele names.
* Breaking issue: vega/vega#538. Causes bars to be merged together in case
of sequence name collisions after shortening the sequence name.
* Option to choose complete table download (all columns, not all rows).
* Option to freely adjust the sorting (currently CE length toggle only).
* When we have them, add default values to table filtering (for reference).
* Some of the media query breakpoints overlap, fix this.
* Perhaps it is desirable to be able to request a list of 'Other sequences'.
* Make Samplevis more responsive when rendering/updating graphs. To do this,
the work should be broken up into chunks and each chunk should set off the
next chunk through "window.setTimeout(nextChunkFunction);". The page will
be repainted between each chunk. One major issue with this is that user
input events may get scheduled between the chunks.
* Additions needed for publication:
* [If not too difficult to implement] BGEstimate should start with homozygous
samples and add heterozygous samples later to optimise correction.
......@@ -16,8 +21,7 @@ To-do:
(maybe also additional value for confidence interval).
* Visualisation to display highest remaining background (positive and
negative) in known samples after BGCorrect analysis.
* Add a way to identify sequences that do not fit the library, or an option not
to generate those.
* Add option to Samplestats to keep negatives when filtering (abs filter).
* Add options to Libconvert to generate a template for STR or non-STR markers.
* Add options to Samplevis, Samplestats (and possibly other relevant tools) to
filter alleles by sequence length.
......@@ -51,7 +55,9 @@ To-do:
(TODO: Write this list)
Open Vega issues:
* Bug in aggregate transform.
* Transforms after aggregate are not processed in order.
https://github.com/vega/vega/issues/538
* Bug in aggregate transform w.r.t. signals.
https://github.com/vega/vega/issues/530
* Lookup transform only takes simple field names for the onKey parameter.
https://github.com/vega/vega/issues/526
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment