Commit e7517bbd authored by Jerry Hoogenboom

Grand update to all visualisations, especially Samplevis

Fixed:
* The Vis tool no longer crashes if you specify '-' as the input file
  without piping data in from another program. It now produces a
  visualisation file without embedded data instead (see the sketch
  after this list).
* FDSTools would crash when generating an allele name for a sequence
  of an STR marker that contained the marker's prefix and suffix but
  not the actual STR (yes, this happened).
* Stuttermodelvis drew every 'All data' fit in the graphs of all
  repeat unit sequences, instead of only the 'All data' fit that was
  fitted to the data of that particular repeat sequence.
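
The new stdin handling behind the first fix is small enough to sketch
here (this mirrors the run() change in the diff below):

    import sys

    # '-' now only means "read from stdin" when data is actually piped
    # in; on an interactive terminal it means "no input file".
    if args.infile == "-":
        args.infile = None if sys.stdin.isatty() else sys.stdin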

Improved:
* BGHomStats, BGHomRaw, and Samplestats now round their output to
  three significant digits (see the sketch after this list).
* BGCorrect now rounds its output to three decimal positions.
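
A quick sketch of the difference between the two rounding styles (both
calls appear in the diffs below):

    import numpy as np

    # Three significant digits, as in BGHomStats/BGHomRaw/Samplestats:
    "%.3g" % 0.0123456   # '0.0123'
    "%.3g" % 1234.56     # '1.23e+03'

    # Three decimal positions, as in BGCorrect; passing the array as
    # the second argument makes NumPy round it in place:
    noise = np.array([0.12345, 67.89012])
    noise.round(3, noise)  # noise is now [ 0.123, 67.89 ]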

Various enhancements to Samplevis HTML visualisations:
* Added a new set of options that are used to automatically select the
  true alleles in a sample.
* Added an option to split the graphs and the table per marker.
* The selected alleles are no longer lost when the graphs are
  re-rendered due to changed options.
* Added more columns to the table of selected alleles and made the
  table prettier.
* Added a dedicated stylesheet for printing, which transforms the web
  page into a nicely formatted report when printed.
* Option groups can now be hidden separately.
* Filtering options are now based on the read numbers after correction.
* The mouse cursor now changes to a 'pointer' cursor (usually a hand
  with an extended index finger) when hovering over the clickable
  portion of the graph.

Visualisations:
* Updated Vega to version 2.4.0 and d3 to version 3.5.10.
* All visualisations now use signals to set the options. This allows
  most option changes to be applied without re-parsing the entire
  graph spec, which is much faster (see the sketch after this list).
* Bgrawvis, profilevis, samplevis, and stuttermodelvis now use Vega's
  new cross-and-filter capabilities. This greatly reduces Vega's
  memory usage and speeds up rendering.
* The name of the currently loaded data file is prepended to the page
  title in HTML visualisations.
* If a file is loaded into an HTML visualisation by drag-and-drop, the
  name of the loaded file is displayed on the file input element.
* A new -T/--title option for the Vis tool allows specifying text to
  be prepended to the page title of HTML visualisations. This is
  particularly useful when data is piped in, because no file name is
  available in that case.
* Asynchronous rendering of visualisations is now cancelled if a new
  asynchronous rendering task has already been scheduled (HTML
  visualisations only).
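
To illustrate the signal mechanism: the Vis tool now writes option
values directly into the "init" field of the named signals in the
graph spec (this mirrors set_signal_value() in the diff below):

    import json

    def set_signal_value(spec, signalname, value):
        if "signals" not in spec:
            return False
        for signal in spec["signals"]:
            if signal["name"] == signalname:
                signal["init"] = value
                return True
        return False

    spec = {"signals": [{"name": "barwidth", "init": 15}]}
    set_signal_value(spec, "barwidth", 20)
    json.dumps(spec)  # '{"signals": [{"name": "barwidth", "init": 20}]}'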
parent 559ee083
@@ -732,8 +732,8 @@ def convert_sequence_raw_tssv(seq, library, marker, return_alias=False):
             break
     # Find longest match of middle pattern.
-    middle = [(seq, len(pre_suf[0])+len(seq))]
-    if marker in library["regex_middle"]:
+    middle = [(seq, len(pre_suf[0])+len(seq))] if seq else []
+    if middle and marker in library["regex_middle"]:
         match = regex_longest_match(library["regex_middle"][marker], seq)
         if match is not None and match.end()-match.start():
......
@@ -117,6 +117,12 @@ def match_profile(column_names, data, profile, convert_to_raw, library,
     forward_add = np.multiply(A, P1.sum(1))
     reverse_add = np.multiply(A, P2.sum(1))
+    # Round values to 3 decimal positions.
+    forward_noise.round(3, forward_noise)
+    reverse_noise.round(3, reverse_noise)
+    forward_add.round(3, forward_add)
+    reverse_add.round(3, reverse_add)
     j = 0
     for line in data:
         j += 1
......
@@ -134,7 +134,7 @@ def compute_ratios(samples_in, outfile, allelefile, annotation_column, min_pct,
             outfile.write("\t".join([
                 data[marker, allele][sequence]["tag"][i], marker, allele,
                 sequence] + [
-                str(x) if abs(x) > 0.0000000001 else "0" for x in (
+                "%.3g" % x if abs(x) > 0.0000000001 else "0" for x in (
                     data[marker, allele][sequence]["forward"][i],
                     data[marker, allele][sequence]["reverse"][i],
                     data[marker, allele][sequence]["forward"][i] +
......
@@ -126,7 +126,7 @@ def compute_stats(samples_in, outfile, allelefile, annotation_column, min_pct,
     for marker, allele in data:
         for sequence in data[marker, allele]:
             outfile.write("\t".join([marker, allele, sequence] + [
-                str(x) if abs(x) > 0.0000000001 else "0" for x in (
+                "%.3g" % x if abs(x) > 0.0000000001 else "0" for x in (
                     data[marker, allele][sequence][0]["n"],
                     data[marker, allele][sequence][0]["min"],
                     data[marker, allele][sequence][0]["max"],
......
@@ -142,61 +142,66 @@ def compute_stats(infile, outfile, library, seqformat):
         marker_total_corrected = sum(
             row[colid_total_corrected] for row in data[marker])
         for row in data[marker]:
-            row.append(100.*row[colid_forward]/row[colid_total])
+            row.append("%.3g" % (100.*row[colid_forward]/row[colid_total]))
             if colid_forward_corrected is not None:
                 if colid_total_corrected is not None:
-                    row.append(100.*(
+                    row.append("%.3g" % (100.*(
                         row[colid_forward_corrected]/row[colid_total_corrected]
                         if row[colid_total_corrected]
-                        else row[colid_forward_corrected] > 0))
-                row.append(
-                    100.*row[colid_forward_corrected]/row[colid_forward]-100
+                        else row[colid_forward_corrected] > 0)))
+                row.append("%.3g" %
+                    (100.*row[colid_forward_corrected]/row[colid_forward]-100
                     if row[colid_forward]
-                    else (row[colid_forward_corrected]>0)*200-100)
+                    else (row[colid_forward_corrected]>0)*200-100))
             if colid_reverse_corrected is not None:
-                row.append(
-                    100.*row[colid_reverse_corrected]/row[colid_reverse]-100
+                row.append("%.3g" %
+                    (100.*row[colid_reverse_corrected]/row[colid_reverse]-100
                     if row[colid_reverse]
-                    else (row[colid_reverse_corrected]>0)*200-100)
+                    else (row[colid_reverse_corrected]>0)*200-100))
             if colid_total_corrected is not None:
-                row.append(
-                    100.*row[colid_total_corrected]/row[colid_total]-100)
-            row.append(100.*row[colid_forward]/marker_forward
-                       if marker_forward else 0)
-            row.append(100.*row[colid_reverse]/marker_reverse
-                       if marker_reverse else 0)
-            row.append(100.*row[colid_total]/marker_total
-                       if marker_total else 0)
+                row.append("%.3g" %
+                    (100.*row[colid_total_corrected]/row[colid_total]-100))
+            row.append("%.3g" % (100.*row[colid_forward]/marker_forward
+                                 if marker_forward else 0))
+            row.append("%.3g" % (100.*row[colid_reverse]/marker_reverse
+                                 if marker_reverse else 0))
+            row.append("%.3g" % (100.*row[colid_total]/marker_total
+                                 if marker_total else 0))
             if colid_forward_noise is not None:
-                row.append(100.*row[colid_forward_noise]/marker_forward_noise
-                           if marker_forward_noise else 0)
+                row.append("%.3g" %
+                    (100.*row[colid_forward_noise]/marker_forward_noise
+                     if marker_forward_noise else 0))
             if colid_reverse_noise is not None:
-                row.append(100.*row[colid_reverse_noise]/marker_reverse_noise
-                           if marker_reverse_noise else 0)
+                row.append("%.3g" %
+                    (100.*row[colid_reverse_noise]/marker_reverse_noise
+                     if marker_reverse_noise else 0))
            if colid_total_noise is not None:
-                row.append(100.*row[colid_total_noise]/marker_total_noise
-                           if marker_total_noise else 0)
+                row.append(
+                    "%.3g" % (100.*row[colid_total_noise]/marker_total_noise
+                              if marker_total_noise else 0))
            if colid_forward_add is not None:
-                row.append(100.*row[colid_forward_add]/marker_forward_add
-                           if marker_forward_add else 0)
+                row.append(
+                    "%.3g" % (100.*row[colid_forward_add]/marker_forward_add
+                              if marker_forward_add else 0))
            if colid_reverse_add is not None:
-                row.append(100.*row[colid_reverse_add]/marker_reverse_add
-                           if marker_reverse_add else 0)
+                row.append(
+                    "%.3g" % (100.*row[colid_reverse_add]/marker_reverse_add
+                              if marker_reverse_add else 0))
            if colid_total_add is not None:
-                row.append(100.*row[colid_total_add]/marker_total_add
-                           if marker_total_add else 0)
+                row.append("%.3g" % (100.*row[colid_total_add]/marker_total_add
+                                     if marker_total_add else 0))
            if colid_forward_corrected is not None:
-                row.append(
-                    100.*row[colid_forward_corrected]/marker_forward_corrected
-                    if marker_forward_corrected else 0)
+                row.append("%.3g" %
+                    (100.*row[colid_forward_corrected]/marker_forward_corrected
+                     if marker_forward_corrected else 0))
            if colid_reverse_corrected is not None:
-                row.append(
-                    100.*row[colid_reverse_corrected]/marker_reverse_corrected
-                    if marker_reverse_corrected else 0)
+                row.append("%.3g" %
+                    (100.*row[colid_reverse_corrected]/marker_reverse_corrected
+                     if marker_reverse_corrected else 0))
            if colid_total_corrected is not None:
-                row.append(
-                    100.*row[colid_total_corrected]/marker_total_corrected
-                    if marker_total_corrected else 0)
+                row.append("%.3g" %
+                    (100.*row[colid_total_corrected]/marker_total_corrected
+                     if marker_total_corrected else 0))
     # Write results.
     outfile.write("\t".join(column_names) + "\n")
......
@@ -13,11 +13,11 @@ D3) are embedded in the generated HTML file. With the -O/--online
 option specified, the HTML file will instead link to the latest version
 of these libraries on the Internet.
-Vega also supports generating visualisations on the command line, which
-is useful if you wish to generate graphs automatically in your analysis
-pipeline. Specify the -V/--vega option to obtain a bare Vega graph
-specification (a JSON file) instead of the full-featured HTML file. You
-can pass this file through Vega to generate a PNG or SVG image file.
+Vega supports generating visualisations on the command line. By
+default, FDSTools produces a full-featured HTML file. Specify the
+-V/--vega option if you wish to obtain a bare Vega graph specification
+(a JSON file) instead. You can pass this file through Vega to generate
+a PNG or SVG image file.
 If an input file is specified, the visualisation will be set up
 specifically to visualise the contents of this file. To this end, the
@@ -52,6 +52,9 @@ _DEF_THRESHOLD_PCT = 0.5
 # This value can be overridden by the -s command line option.
 _DEF_THRESHOLD_ORIENTATION = 0
+# Default percentage of reads on one strand to mark as bias.
+# This value can be overridden by the -B command line option.
+_DEF_THRESHOLD_BIAS = 25.0
 # Default width of bars in bar graphs, in pixels.
 # This value can be overridden by the -b command line option.
@@ -84,10 +87,12 @@ _DEF_DATA_FILENAME = "data.csv"
-_PAT_LOAD_SCRIPT = re.compile("<!--\s*BEGIN_LOAD_SCRIPT\s*-->\s*(.*?)\s*"
-                              "<!--\s*END_LOAD_SCRIPT\s*-->", flags=re.DOTALL)
+_PAT_TITLE = re.compile("<title>\s*(.*?)\s*"
+                        "</title>", flags=re.DOTALL|re.IGNORECASE)
 _PAT_LIBRARIES = re.compile("<!--\s*BEGIN_LIBRARIES\s*-->\s*(.*?)\s*"
                             "<!--\s*END_LIBRARIES\s*-->", flags=re.DOTALL)
+_PAT_LOAD_SCRIPT = re.compile("<!--\s*BEGIN_LOAD_SCRIPT\s*-->\s*(.*?)\s*"
+                              "<!--\s*END_LOAD_SCRIPT\s*-->", flags=re.DOTALL)
 _SCRIPT_BEGIN = '<script type="text/javascript">'
 _SCRIPT_END = '</script>'
@@ -96,19 +101,15 @@ _EXTERNAL_LIBRARIES = ("vis/d3.min.js", "vis/vega.min.js")
-def set_data_formula_transform_value(spec, dataname, fieldname, value):
-    for data in spec["data"]:
-        if data["name"] != dataname:
-            continue
-        if "transform" not in data:
-            return False
-        for transform in data["transform"]:
-            if (transform["type"] == "formula" and
-                    transform["field"] == fieldname):
-                transform["expr"] = str(value)
-                return True
+def set_signal_value(spec, signalname, value):
+    if "signals" not in spec:
+        return False
+    for signal in spec["signals"]:
+        if signal["name"] == signalname:
+            signal["init"] = value
+            return True
     return False
-#set_data_formula_transform_value
+#set_signal_value
 def set_axis_scale(spec, scalename, value):
@@ -126,8 +127,9 @@ def set_axis_scale(spec, scalename, value):
 def create_visualisation(vistype, infile, outfile, vega, online, tidy, min_abs,
-                         min_pct, min_per_strand, bar_width, padding, marker,
-                         width, height, log_scale, repeat_unit, no_alldata):
+                         min_pct, min_per_strand, bias_threshold, bar_width,
+                         padding, marker, width, height, log_scale,
+                         repeat_unit, no_alldata, title):
     # Get graph spec.
     spec = json.load(resource_stream(
         "fdstools", "vis/%svis/%svis.json" % (vistype, vistype)))
@@ -142,35 +144,25 @@ def create_visualisation(vistype, infile, outfile, vega, online, tidy, min_abs,
     # Apply width, height, and padding settings.
     spec["width"] = width
     if vistype == "stuttermodel":
-        set_data_formula_transform_value(spec, "yscale", "graphheight", height)
+        set_signal_value(spec, "graphheight", height)
     else:
-        set_data_formula_transform_value(spec, "yscale", "barwidth", bar_width)
-        set_data_formula_transform_value(spec, "yscale", "subgraphoffset", padding)
-    set_data_formula_transform_value(
-        spec,
-        "fitfunctions" if vistype == "stuttermodel" else "table",
-        "filter_marker", "'" + marker + "'")
+        set_signal_value(spec, "barwidth", bar_width)
+        set_signal_value(spec, "subgraphoffset", padding)
+    set_signal_value(spec, "filter_marker", marker)
     # Apply type-specific settings.
     if vistype == "stuttermodel":
-        set_data_formula_transform_value(
-            spec, "fitfunctions", "filter_unit", "'" + repeat_unit + "'")
-        set_data_formula_transform_value(
-            spec, "fitfunctions", "show_all_data",
-            "false" if no_alldata else "true")
+        set_signal_value(spec, "filter_unit", repeat_unit)
+        set_signal_value(spec, "show_all_data", False if no_alldata else True)
     elif vistype == "sample" or vistype == "bgraw":
-        set_data_formula_transform_value(
-            spec, "table", "amplitude_threshold", min_abs)
-        set_data_formula_transform_value(
-            spec, "table", "amplitude_pct_threshold", min_pct)
+        set_signal_value(spec, "amplitude_threshold", min_abs)
+        set_signal_value(spec, "amplitude_pct_threshold", min_pct)
     elif vistype == "profile":
-        set_data_formula_transform_value(
-            spec, "table", "filter_threshold", min_pct)
-        set_data_formula_transform_value(
-            spec, "table", "low", "0.001" if log_scale else "0")
+        set_signal_value(spec, "filter_threshold", min_pct)
+        set_signal_value(spec, "low", 0.001 if log_scale else 0)
     if vistype == "sample":
-        set_data_formula_transform_value(
-            spec, "table", "orientation_threshold", min_per_strand)
+        set_signal_value(spec, "orientation_threshold", min_per_strand)
+        set_signal_value(spec, "bias_threshold", bias_threshold)
     # Apply axis scale settings.
     if vistype != "stuttermodel":
@@ -218,6 +210,17 @@ def create_visualisation(vistype, infile, outfile, vega, online, tidy, min_abs,
             parts.append(html[match.end(1):])
         html = "".join(parts)
+    if title is None and infile is not None and infile != sys.stdin:
+        try:
+            title = os.path.splitext(os.path.basename(infile.name))[0]
+        except AttributeError:
+            pass
+    if title:
+        match = _PAT_TITLE.search(html)
+        if match:
+            html = "".join([
+                html[:match.start(1)], title, " - ", html[match.start(1):]])
     outfile.write(html)
 #create_visualisation
@@ -242,9 +245,9 @@ def add_arguments(parser):
         default=sys.stdout,
         help="file to write output to (default: write to stdout)")
     parser.add_argument('-V', '--vega', action="store_true",
-        help="an HTML file containing an interactive visualisation is created "
-             "by default; if this option is specified, a Vega graph "
-             "specification (JSON file) is produced instead")
+        help="by default, a full-featured HTML file offering an interactive "
+             "visualisation is created; if this option is specified, only a "
+             "bare Vega graph specification (JSON file) is produced instead")
     parser.add_argument('-O', '--online', action="store_true",
         help="when generating an HTML visualisation file, required JavaScript "
             "libraries (D3 and Vega) are embedded in the file; if this "
@@ -253,6 +256,9 @@
              "versions of D3 and Vega")
     parser.add_argument('-t', '--tidy', action="store_true",
         help="tidily indent the generated JSON")
+    parser.add_argument('-T', '--title',
+        help="prepend the given value to the title of HTML visualisations "
+             "(default: prepend name of data file if given)")
     visgroup = parser.add_argument_group("visualisation options",
         description="words in [brackets] indicate applicable visualisation "
@@ -271,6 +277,10 @@
         type=pos_int_arg, default=_DEF_THRESHOLD_ORIENTATION,
         help="[sample] only show sequences with this minimum number of reads "
              "for both orientations (forward/reverse) (default: %(default)s)")
+    visgroup.add_argument('-B', '--bias-threshold', metavar="N", type=float,
+        default=_DEF_THRESHOLD_BIAS,
+        help="[sample] mark sequences that have less than this percentage of "
+             "reads on one strand (default: %(default)s)")
     visgroup.add_argument('-M', '--marker', metavar="REGEX",
         default=_DEF_MARKER_REGEX,
         help="[sample, profile, bgraw, stuttermodel] only show graphs for the "
@@ -307,9 +317,8 @@
 def run(args):
-    if args.infile == "-" and not sys.stdin.isatty():
-        # User appears to want to pipe data in.
-        args.infile = sys.stdin
+    if args.infile == "-":
+        args.infile = None if sys.stdin.isatty() else sys.stdin
     if (args.infile is not None and args.outfile == sys.stdout
             and not os.path.exists(args.infile)):
         # One filename given, and it does not exist. Assume outfile.
@@ -327,7 +336,8 @@
     create_visualisation(args.type, args.infile, args.outfile, args.vega,
                          args.online, args.tidy, args.min_abs, args.min_pct,
-                         args.min_per_strand, args.bar_width, args.padding,
-                         args.marker, args.width, args.height, args.log_scale,
-                         args.repeat_unit, args.no_alldata)
+                         args.min_per_strand, args.bias_threshold,
+                         args.bar_width, args.padding, args.marker, args.width,
+                         args.height, args.log_scale, args.repeat_unit,
+                         args.no_alldata, args.title)
 #run
 {
   "width": 600,
   "height": 10,
+  "signals": [
+    {
+      "name": "amplitude_threshold",
+      "init": 5
+    },
+    {
+      "name": "amplitude_pct_threshold",
+      "init": 0.5
+    },
+    {
+      "name": "filter_marker",
+      "init": ".*"
+    },
+    {
+      "name": "barwidth",
+      "init": 15
+    },
+    {
+      "name": "subgraphoffset",
+      "init": 70
+    }
+  ],
   "data": [
     {
       "name": "table",
@@ -21,16 +43,6 @@
         "type": "filter",
         "test": "datum.allele != datum.sequence"
       },
-      {
-        "type": "formula",
-        "field": "amplitude_threshold",
-        "expr": "5"
-      },
-      {
-        "type": "formula",
-        "field": "amplitude_pct_threshold",
-        "expr": "0.5"
-      },
       {
         "type": "formula",
         "field": "maxnoise",
@@ -38,16 +50,7 @@
       },
       {
         "type": "filter",
-        "test": "datum.total >= 1 && datum.total >= datum.amplitude_threshold && datum.maxnoise >= datum.amplitude_pct_threshold"
-      },
-      {
-        "type": "formula",
-        "field": "filter_marker",
-        "expr": "'.*'"
-      },
-      {
-        "type": "filter",
-        "test": "test('^' + datum.filter_marker + '$', datum.marker)"
+        "test": "datum.total >= 1 && datum.total >= amplitude_threshold && datum.maxnoise >= amplitude_pct_threshold && test('^' + filter_marker + '$', datum.marker)"
       },
       {
         "type": "formula",
@@ -83,11 +86,8 @@
       "transform": [
         {
           "type": "cross",
-          "diagonal": false
-        },
-        {
-          "type": "filter",
-          "test": "datum.b.name < datum.a.name"
+          "diagonal": false,
+          "filter": "datum.b.name < datum.a.name"
         },
         {
           "type": "formula",
@@ -107,11 +107,8 @@
       "transform": [
         {
          "type": "cross",
-          "with": "table"
-        },
-        {
-          "type": "filter",
-          "test": "datum.b.name < datum.a.name"
+          "with": "table",
+          "filter": "datum.b.name < datum.a.name"
         },
         {
           "type": "formula",
@@ -150,25 +147,15 @@
             "as": ["offsetobj"],
             "default": {"cumulcount": 0}
           },
-          {
-            "type": "formula",
-            "field": "barwidth",
-            "expr": "15"
-          },
-          {
-            "type": "formula",
-            "field": "subgraphoffset",
-            "expr": "70"
-          },
           {
             "type": "formula",
             "field": "offset",
-            "expr": "(10+datum.barwidth)*datum.offsetobj.cumulcount + datum.subgraphoffset*datum.paddingobj.cumulpadding"
+            "expr": "(10+barwidth)*datum.offsetobj.cumulcount + subgraphoffset*datum.paddingobj.cumulpadding"
           },
           {
            "type": "formula",
            "field": "end",
-            "expr": "datum.offset + (10+datum.barwidth)*datum.count"
+            "expr": "datum.offset + (10+barwidth)*datum.count"
           }
         ]
       }
@@ -188,7 +175,7 @@
         "data": "yscale"
       },
       "properties": {
-        "enter": {
+        "update": {
           "x": {"field": {"group": "width"}, "mult": 0.5},
           "y": {"field": "offset"},
           "fontWeight": {"value": "bold"},
@@ -218,7 +205,7 @@
         ]
       },
       "properties": {
-        "enter": {
+        "update": {
          "x": {"value": 0},
          "width": {"field": {"group": "width"}},
          "y": {"field": "subgraphscale.offset"},
@@ -283,22 +270,20 @@
       "transform": [
         {
           "type": "filter",
-          "test": "datum.forward >= 1 && datum.forward >= datum.amplitude_threshold"
+          "test": "datum.forward >= 1 && datum.forward >= amplitude_threshold"
         },
         {
           "type": "filter",
-          "test": "datum.fnoise >= datum.amplitude_pct_threshold"
+          "test": "datum.fnoise >= amplitude_pct_threshold"
         }
       ]
     },
     "properties": {
-      "enter": {
+      "update": {
         "x": {"scale": "x", "field": "fnoise"},
         "y": {"scale": "y", "field": "sequence", "offset": -5},
         "fill": {"scale": "c", "value": "Forward reads"},
-        "fillOpacity": {"value": 0.8}
-      },
-      "update": {
+        "fillOpacity": {"value": 0.8},
        "size": {"value": 100},
        "stroke": {"value": "transparent"}
      },
@@ -314,22 +299,20 @@
       "transform": [
         {
           "type": "filter",
-          "test": "datum.reverse >= 1 && datum.reverse >= datum.amplitude_threshold"
+          "test": "datum.reverse >= 1 && datum.reverse >= amplitude_threshold"
         },
         {
           "type": "filter",
-          "test": "datum.rnoise >= datum.amplitude_pct_threshold"
+          "test": "datum.rnoise >= amplitude_pct_threshold"
         }
       ]
     },
     "properties": {
-      "enter": {
+      "update": {
         "x": {"scale": "x", "field": "rnoise"},
         "y": {"scale": "y", "field": "sequence", "offset": 5},
         "fill": {"scale": "c", "value": "Reverse reads"},
-        "fillOpacity": {"value": 0.8}
-      },
-      "update": {
+        "fillOpacity": {"value": 0.8},
        "size": {"value": 100},
        "stroke": {"value": "transparent"}
      },
@@ -345,22 +328,20 @@
       "transform": [
         {
           "type": "filter",
-          "test": "datum.total >= 1 && datum.total >= datum.amplitude_threshold"
+          "test": "datum.total >= 1 && datum.total >= amplitude_threshold"
         },
         {
           "type": "filter",
-          "test": "datum.tnoise >= datum.amplitude_pct_threshold"
+          "test": "datum.tnoise >= amplitude_pct_threshold"
         }
       ]
     },
     "properties": {
-      "enter": {
+      "update": {
         "x": {"scale": "x", "field": "tnoise"},
         "y": {"scale": "y", "field": "sequence"},
         "fill": {"scale": "c", "value": "Total reads"},
-        "fillOpacity": {"value": 0.8}
-      },
-      "update": {
+        "fillOpacity": {"value": 0.8},
        "size": {"value": 100},
        "stroke": {"value": "transparent"}
      },
......
@@ -121,17 +121,20 @@
     <script type="text/javascript">
       var graph = false;
       var fileName = "graph";
+      var stamp = 0;
       function parse(){
+        var this_stamp = ++stamp;
         vg.parse.spec(graph_spec, function(chart){
-          var rendererName = "canvas";
-          if(document.getElementById("renderSVG").checked)
-            rendererName="svg";
-          graph = chart({el: "#vis", renderer: rendererName});
+          // Cancel rendering if a new parse() call was made.
+          if(this_stamp != stamp)
+            return;
+          var visdiv = document.getElementById("vis");
+          graph = chart({el: visdiv, renderer: document.querySelector("input[name=renderer]:checked").value});
           graph.update();
           document.getElementById("saveLink").style.display = "inline";
           //Scroll to the right; the graph is more interesting than the long labels.
-          var visdiv = document.getElementById("vis");