Commit b7d64a4f authored by jhoogenboom's avatar jhoogenboom
Browse files

Introducing Profilevis, and various bug fixes

* New visualisation Profilevis added to the package, but not yet to
  the Vis tool.
* The Vis tool now prints a helpful error message if no output file
  was specified, instead of printing half a megabyte of HTML and
  minified JavaScript to the terminal.
* Fixed crash that occurred when attempting to convert the sequence
  of an alias to its allele name.
* Fixed various bugs in the functions that convert sequences to
  TSSV-style and allele names. Only the conversion of non-matching
  sequences was affected.
* Added "max_expected_copies" section to the FDSTools library
  format. The default value is 2. Allelefinder will now use these
  as the maximum number of alleles per marker if the
  -a/--max-alleles option is not specified.
* The section headers in the FDSTools library format are now case
  insensitive.
parent 2c96971b
......@@ -285,6 +285,7 @@ def parse_library_ini(handle):
"regex_middle": {},
"length_adjust": {},
"block_length": {},
"max_expected_copies": {},
"aliases": {}
}
markers = set()
......@@ -295,7 +296,8 @@ def parse_library_ini(handle):
for section in ini.sections():
for marker in ini.options(section):
value = ini.get(section, marker)
if section == "flanks":
section_low = section.lower()
if section_low == "flanks":
values = PAT_SPLIT.split(value)
if len(values) != 2:
raise ValueError(
......@@ -308,7 +310,7 @@ def parse_library_ini(handle):
(value, marker))
library["flanks"][marker] = values
markers.add(marker)
elif section == "prefix":
elif section_low == "prefix":
values = PAT_SPLIT.split(value)
for value in values:
if PAT_SEQ_RAW.match(value) is None:
......@@ -317,7 +319,7 @@ def parse_library_ini(handle):
(value, marker))
library["prefix"][marker] = values
markers.add(marker)
elif section == "suffix":
elif section_low == "suffix":
values = PAT_SPLIT.split(value)
for value in values:
if PAT_SEQ_RAW.match(value) is None:
......@@ -326,7 +328,7 @@ def parse_library_ini(handle):
(value, marker))
library["suffix"][marker] = values
markers.add(marker)
elif section == "length_adjust":
elif section_low == "length_adjust":
try:
value = int(value)
except:
......@@ -335,7 +337,7 @@ def parse_library_ini(handle):
"integer" % (value, marker))
library["length_adjust"][marker] = value
markers.add(marker)
elif section == "block_length":
elif section_low == "block_length":
try:
value = int(value)
except:
......@@ -344,7 +346,16 @@ def parse_library_ini(handle):
% (value, marker))
library["block_length"][marker] = value
markers.add(marker)
elif section == "aliases":
elif section_low == "max_expected_copies":
try:
value = int(value)
except:
raise ValueError(
"Maximum number of expected copies '%s' of marker %s "
"is not a valid integer" % (value, marker))
library["max_expected_copies"][marker] = value
markers.add(marker)
elif section_low == "aliases":
values = PAT_SPLIT.split(value)
if len(values) != 3:
raise ValueError("Alias %s does not have 3 values, but %i"
......@@ -363,7 +374,7 @@ def parse_library_ini(handle):
"name": values[2]
}
markers.add(marker)
elif section == "repeat":
elif section_low == "repeat":
if PAT_STR_DEF.match(value) is None:
raise ValueError(
"STR definition '%s' of marker %s is invalid" %
......@@ -384,8 +395,8 @@ def parse_library_ini(handle):
parts.append("(%s){0,1}" % library["aliases"][marker]["sequence"])
partsm.append("(%s){0,1}" % library["aliases"][marker]["sequence"])
elif marker in library["regex"]:
partsm = ("(%s){%s,%s}" % x for x in
PAT_STR_DEF_BLOCK.findall(library["regex"][marker]))
partsm = ["(%s){%s,%s}" % x for x in
PAT_STR_DEF_BLOCK.findall(library["regex"][marker])]
parts += partsm
if marker in library["suffix"]:
parts += ("(%s){0,1}" % x for x in library["suffix"][marker])
......@@ -622,8 +633,8 @@ def convert_sequence_raw_tssv(seq, library, marker, return_alias=False):
# canonical prefix ends with the first few blocks that
# we obtained in the match, move these blocks into the
# prefix. Also do this with the suffix.
middle = ((match.group(i), match.end(i)+len(pre_suf[0]))
for i in range(1, match.lastindex+1) if match.group(i))
middle = [(match.group(i), match.end(i)+len(pre_suf[0]))
for i in range(1, match.lastindex+1) if match.group(i)]
pre_suf[0] += seq[:match.start()]
pre_suf[1] = seq[match.end():] + pre_suf[1]
seq = match.group()
......@@ -716,10 +727,12 @@ def convert_sequence_raw_allelename(seq, library, marker):
remaining_blocks = len(blocks)
if "prefix" in library and marker in library["prefix"]:
prefix = library["prefix"][marker][0]
remaining_blocks -= 1
if prefix and blocks[0][1] == "1":
remaining_blocks -= 1
if "suffix" in library and marker in library["suffix"]:
suffix = library["suffix"][marker][0]
remaining_blocks -= 1
if suffix and blocks[-1][1] == "1":
remaining_blocks -= 1
if remaining_blocks > 0 and prefix and blocks[0][1] == "1":
this_prefix = blocks[0][0]
blocks = blocks[1:]
......@@ -739,11 +752,12 @@ def convert_sequence_raw_allelename(seq, library, marker):
# We are ready to return the allele name of aliases.
if alias != marker:
return "_".join([library["aliases"][marker]["name"]] + variants)
return "_".join([library["aliases"][alias]["name"]] + variants)
# Compute CE allele number for the other alleles.
# TODO: perhaps produce a more intelligent name if there is exactly
# one alias with the same length
# 1 alias with the same length, or only 1 alias sequence is
# contained somewhere within the allele.
blocknames = []
blocksize = library.get("block_length", {}).get(marker, 4)
length -= library.get("length_adjust", {}).get(marker, 0)
......
......@@ -13,7 +13,7 @@ sample, no alleles are called for this sample at all.
import argparse
from ..lib import pos_int_arg, add_input_output_args, get_input_output_files, \
ensure_sequence_format, get_sample_data, \
ensure_sequence_format, get_sample_data, parse_library, \
add_sequence_format_args
__version__ = "0.1dev"
......@@ -37,10 +37,6 @@ _DEF_MIN_ALLELE_PCT = 30.0
# This value can be overridden by the -M command line option.
_DEF_MAX_NOISE_PCT = 10.0
# Default maximum number of alleles to expect for each marker.
# This value can be overridden by the -a command line option.
_DEF_MAX_ALLELES = 2
# Default maximum number of noisy markers allowed per sample.
# This value can be overridden by the -x command line option.
_DEF_MAX_NOISY = 2
......@@ -49,8 +45,7 @@ _DEF_MAX_NOISY = 2
def find_alleles(samples_in, outfile, reportfile, min_reads, min_allele_pct,
max_noise_pct, max_alleles, max_noisy, stuttermark_column,
seqformat, library):
if seqformat is not None and library is not None:
library = parse_library(library)
library = parse_library(library) if library is not None else {}
outfile.write("\t".join(["sample", "marker", "total", "allele"]) + "\n")
allelelist = {}
......@@ -108,13 +103,14 @@ def find_alleles_sample(data, outfile, reportfile, tag, min_reads,
(tag, marker, top_allele[marker]))
alleles[marker] = {}
continue
if len(alleles[marker]) > max_alleles:
expect = get_max_expected_alleles(max_alleles, marker, library)
if len(alleles[marker]) > expect:
allele_order = sorted(alleles[marker],
key=lambda x: -alleles[marker][x])
top_noise[marker] = [allele_order[max_alleles],
alleles[marker][allele_order[max_alleles]]]
top_noise[marker] = [allele_order[expect],
alleles[marker][allele_order[expect]]]
alleles[marker] = {x: alleles[marker][x]
for x in allele_order[:max_alleles]}
for x in allele_order[:expect]}
if top_noise[marker][1] > top_allele[marker]*(max_noise_pct/100.):
reportfile.write(
"Sample %s is not suitable for marker %s:\n"
......@@ -150,6 +146,15 @@ def find_alleles_sample(data, outfile, reportfile, tag, min_reads,
#find_alleles_sample
def get_max_expected_alleles(max_alleles, marker, library):
    """
    Return the maximum number of alleles to expect for this marker.

    An explicitly given -a/--max-alleles value (max_alleles) takes
    precedence.  Otherwise, the per-marker value from the library's
    "max_expected_copies" section is used, falling back to 2 for
    markers (or libraries) without such a setting.
    """
    if max_alleles is not None:
        return max_alleles
    # Chained .get() covers both a missing "max_expected_copies"
    # section and a marker absent from it; 2 is the documented default.
    return library.get("max_expected_copies", {}).get(marker, 2)
#get_max_expected_alleles
def add_arguments(parser):
add_input_output_args(parser, False, False, True)
filtergroup = parser.add_argument_group("filtering options")
......@@ -168,9 +173,10 @@ def add_arguments(parser):
help="require at least this number of reads for the highest allele "
"of each marker (default: %(default)s)")
filtergroup.add_argument('-a', '--max-alleles', metavar="N",
type=pos_int_arg, default=_DEF_MAX_ALLELES,
help="allow no more than this number of alleles per marker (default: "
"%(default)s)")
type=pos_int_arg,
help="allow no more than this number of alleles per marker; if "
"unspecified, the amounts given in the library file are used, "
"which have a default value of 2")
filtergroup.add_argument('-x', '--max-noisy', metavar="N",
type=pos_int_arg, default=_DEF_MAX_NOISY,
help="entirely reject a sample if more than this number of markers "
......
......@@ -214,6 +214,12 @@ def convert_library(infile, outfile, aliases=False):
ini.add_section("block_length")
ini.set("block_length", "; Specify the core repeat unit length of "
"each marker. The default length is 4.")
ini.add_section("max_expected_copies")
ini.set("max_expected_copies", "; Specify the maximum expected number "
"copies (i.e., alleles) for each "
"marker.")
ini.set("max_expected_copies", "; The default is 2. Specify 1 "
"here for markers on the Y chromosome.")
# Enter flanking sequences and STR definitions.
fmt = "%%-%is" % reduce(max, map(len,
......@@ -238,6 +244,9 @@ def convert_library(infile, outfile, aliases=False):
if block_length != 0 and block_length < 10:
ini.set("block_length", fmt%marker, block_length)
# Write max_expected_copies=2 for all markers explicitly.
ini.set("max_expected_copies", fmt%marker, 2)
# TODO: I could also do some fiddling for prefix/suffix...
# Write INI file.
......
......@@ -108,7 +108,7 @@ def create_visualisation(vistype, infile, outfile, vega, online, tidy,
set_data_formula_transform_value(
spec, "yscale", "barwidth", bar_width)
set_data_formula_transform_value(
spec, "yscale", "markeroffset", padding)
spec, "yscale", "subgraphoffset", padding)
set_data_formula_transform_value(
spec, "table", "amplitude_threshold", min_abs)
set_data_formula_transform_value(
......@@ -211,6 +211,11 @@ def run(args):
args.outfile = open(args.infile, 'w')
args.infile = None
if args.outfile.isatty():
raise ValueError("Please specify a file name to write the %s to." %
("Vega graph specification (JSON format)" if args.vega
else "HTML document"))
if args.infile is not None and args.infile != sys.stdin:
# Open the specified input file.
args.infile = open(args.infile, 'r')
......
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Background Noise Profile Visualisation - FDSTools</title>
<!-- BEGIN_LIBRARIES -->
<script src="http://vega.github.io/vega-editor/vendor/d3.min.js"></script>
<script src="http://vega.github.io/vega/vega.min.js"></script>
<!-- END_LIBRARIES -->
<style>
* {
font-family: Helvetica Neue, Helvetica, Arial, sans-serif;
}
body {
margin: 0px
}
div.options {
position: absolute;
margin: 5px;
background-color: rgba(128, 128, 255, 0.5);
z-index: 10;
}
#optionsheader{
cursor: pointer;
}
#optionsheader:hover{
border-bottom: 1px dashed black;
}
div#vis {
position: absolute;
overflow: auto;
bottom: 0px;
top: 0px;
right: 0px;
left: 0px;
text-align: right;
}
</style>
</head>
<body>
<div class="options">
<strong id="optionsheader">Options</strong><br>
<div id="options">
<span id="fileselectspan" style="display: none">
Open sample data file (or drag a file onto this page): <input id="fileselect" type="file"><br>
</span>
<!--
Select marker: <select id="markers"></select><br>
Select allele: <select id="alleles"></select><br> -->
Display options: bar width <input type="text" value="15" id="barwidth" size="2">px; subgraph spacing <input type="text" value="70" id="subgraphoffset" size="3">px<br>
Filtering: minimum noise ratio <input type="text" value="0.5" id="minP" size="3">%<br>
Axis scale: <input type="radio" name="scale" value="linear" id="scaleLinear" checked> Linear
<input type="radio" name="scale" value="log" id="scaleLog"> Logarithmic<br>
Render as: <input type="radio" name="renderer" value="canvas" id="renderCanvas"> Canvas
<input type="radio" name="renderer" value="svg" id="renderSVG" checked> SVG
<br>
<a id="saveLink" href="javascript:void(saveImage())" style="display: none">Save image</a>
</div>
</div>
<div id="vis"></div>
<script type="text/javascript">
var graph = false;
//Parse graph_spec with Vega and render the resulting chart into #vis,
//using whichever renderer is currently selected in the options panel.
function parse(){
  vg.parse.spec(graph_spec, function(chart){
    var rendererName = document.getElementById("renderSVG").checked? "svg" : "canvas";
    graph = chart({el: "#vis", renderer: rendererName});
    graph.update();
    document.getElementById("saveLink").style.display = "inline";
    //Scroll to the right; the graph is more interesting than the long labels.
    var visdiv = document.getElementById("vis");
    visdiv.scrollLeft = visdiv.scrollWidth;
  });
}
//Switch the x axis of every subgraph between "linear" and "log" scale.
//Log scale cannot include zero, so the lower bound of the bars is
//raised to 0.001 in that case; the graph is then re-rendered.
function setScale(value){
  if(!graph_spec)
    return;
  var marks = graph_spec["marks"];
  for(var i in marks){
    var scales = marks[i]["scales"];
    if(!scales)
      continue;
    for(var j in scales){
      if(scales[j]["name"] == "x")
        scales[j]["type"] = value;
    }
  }
  setDataFormulaTransformValue("table", "low", value == "log"? "0.001" : "0");
  parse();
}
//Set the "expr" of the formula transform that computes the given field
//in the named data set of graph_spec.  Returns true on success, false
//if the spec is not loaded or no matching transform was found.
function setDataFormulaTransformValue(dataname, fieldname, value){
  if(!graph_spec)
    return false;
  var datasets = graph_spec["data"];
  for(var i in datasets){
    if(datasets[i]["name"] != dataname)
      continue;
    var transforms = datasets[i]["transform"];
    for(var j in transforms){
      var transform = transforms[j];
      if(transform["type"] == "formula" && transform["field"] == fieldname){
        transform["expr"] = "" + value;
        return true;
      }
    }
  }
  return false;
}
//Look up the "expr" of the formula transform that computes the given
//field in the named data set of graph_spec.  Returns the expression
//string, or false if the spec is not loaded or no match was found.
function getDataFormulaTransformValue(dataname, fieldname){
  if(!graph_spec)
    return false;
  var datasets = graph_spec["data"];
  for(var i in datasets){
    if(datasets[i]["name"] != dataname)
      continue;
    var transforms = datasets[i]["transform"];
    for(var j in transforms){
      var transform = transforms[j];
      if(transform["type"] == "formula" && transform["field"] == fieldname)
        return transform["expr"];
    }
  }
  return false;
}
//Switch the live Vega view between the "canvas" and "svg" renderers.
//Does nothing until the graph has been rendered at least once.
function setRenderer(value){
  if(!graph)
    return;
  graph.renderer(value);
}
//Load the data (input is a fileList object; only the first file is loaded).
//The raw file text replaces the first data set in graph_spec (which
//the spec declares as TSV), after which the graph is re-rendered.
function loadDataset(fileList){
  if(!graph_spec)
    return;
  var reader = new FileReader();
  reader.onload = function(theFile){
    graph_spec["data"][0]["values"] = reader.result;
    parse();
  };
  reader.readAsText(fileList[0]);
}
//Save image function.  Exports the rendered graph through the
//"Save image" link: as SVG (serialised to a data: URI) when the SVG
//renderer is active, otherwise as PNG via the canvas' toDataURL().
function saveImage(){
  var link = document.getElementById("saveLink");
  if(document.getElementById("renderSVG").checked){
    //Serialise the SVG element that Vega rendered into the page.
    var svg = document.querySelector("svg.marks");
    svg.setAttribute("xmlns", "http://www.w3.org/2000/svg"); //Vega does not set this for us
    link.setAttribute("href", "data:image/svg+xml," + encodeURIComponent(svg.outerHTML));
    link.setAttribute("download", "graph.svg");
  }
  else{
    link.setAttribute("href", document.getElementsByClassName('marks')[0].toDataURL());
    link.setAttribute("download", "graph.png");
  }
  link.click();
  //Restore the href so that clicking the link again calls saveImage()
  //instead of navigating to the stale data: URI.
  link.setAttribute("href", "javascript:void(saveImage())");
  return false;
}
//Called once the Vega graph specification (global graph_spec) has been
//loaded.  Wires up all UI event handlers and synchronises the option
//controls with the values found in the spec.  has_data is true when
//the spec already contains embedded sample data; otherwise file
//drag-and-drop and the file input are enabled so the user can supply
//a data file.
function onLoadSpec(has_data){
  //Allow files to be dragged onto the page.
  if(!has_data){
    document.addEventListener('dragover', function(evt){
      evt.stopPropagation();
      evt.preventDefault();
    }, false);
    document.addEventListener('drop', function(evt){
      evt.stopPropagation();
      evt.preventDefault();
      loadDataset(evt.dataTransfer.files);
    }, false);
    //Handle files from the file input.
    document.getElementById("fileselect").addEventListener('change', function(){
      loadDataset(document.getElementById("fileselect").files);
    }, false);
    document.getElementById("fileselectspan").style.display = "inline";
  }
  //Update graph when rendering mode or axis scale is changed.
  document.getElementById("renderCanvas").addEventListener('change', function(){
    setRenderer(this.value);
  }, false);
  document.getElementById("renderSVG").addEventListener('change', function(){
    setRenderer(this.value);
  }, false);
  document.getElementById("scaleLinear").addEventListener('change', function(){
    setScale(this.value);
  }, false);
  document.getElementById("scaleLog").addEventListener('change', function(){
    setScale(this.value);
  }, false);
  //Re-render on keyup only when the input parses as a number and the
  //corresponding value in the spec was actually updated.
  document.getElementById("barwidth").addEventListener('keyup', function(){
    var value = parseFloat(this.value);
    if(isNaN(value))
      return;
    if(setDataFormulaTransformValue("yscale", "barwidth", value) && graph)
      parse();
  }, false);
  document.getElementById("subgraphoffset").addEventListener('keyup', function(){
    var value = parseFloat(this.value);
    if(isNaN(value))
      return;
    if(setDataFormulaTransformValue("yscale", "subgraphoffset", value) && graph)
      parse();
  }, false);
  document.getElementById("minP").addEventListener('keyup', function(){
    var value = parseFloat(this.value);
    if(isNaN(value))
      return;
    if(setDataFormulaTransformValue("table", "filter_threshold", value) && graph)
      parse();
  }, false);
  //Toggle options visibility.
  document.getElementById("optionsheader").addEventListener('click', function(){
    var opts = document.getElementById("options");
    if(opts.style.display == "none")
      document.getElementById("options").style.display = "block";
    else
      document.getElementById("options").style.display = "none";
  }, false);
  //Sync graph_spec and display.  setScale() also triggers the first
  //parse()/render of the graph.
  if(document.getElementById("scaleLinear").checked)
    setScale(document.getElementById("scaleLinear").value);
  else
    setScale(document.getElementById("scaleLog").value);
  document.getElementById("barwidth").value = getDataFormulaTransformValue("yscale", "barwidth");
  document.getElementById("subgraphoffset").value = getDataFormulaTransformValue("yscale", "subgraphoffset");
  document.getElementById("minP").value = getDataFormulaTransformValue("table", "filter_threshold");
  if(has_data){
    document.getElementById("options").style.display = "none";
    parse();
  }
}
</script>
<!-- BEGIN_LOAD_SCRIPT -->
<script type="text/javascript">
var graph_spec = false;
vg.util.load({url: "profilevis.json"}, function(err, result){
graph_spec = JSON.parse(result);
onLoadSpec(false);
});
</script>
<!-- END_LOAD_SCRIPT -->
</body>
</html>
{
"width": 600,
"height": 10,
"data": [
{
"name": "raw",
"values": "VALUES HERE",
"format": {
"type": "tsv",
"parse": {
"fmean": "number",
"rmean": "number"
}
}
},
{
"name": "table",
"source": "raw",
"transform": [
{
"type": "formula",
"field": "filter_threshold",
"expr": "0.5"
},
{
"type": "filter",
"test": "datum.fmean+datum.rmean < 200 && max(datum.fmean, datum.rmean) > datum.filter_threshold"
},
{
"type": "formula",
"field": "low",
"expr": "0"
},
{
"type": "formula",
"field": "fmean",
"expr": "max(datum.fmean, datum.low)"
},
{
"type": "formula",
"field": "rmean",
"expr": "max(datum.rmean, datum.low)"
},
{
"type": "formula",
"field": "flabel",
"expr": "indexof(''+round(datum.fmean*10)/10, '.') == -1? ((round(datum.fmean*10)/10) + '.0') : slice((round(datum.fmean*10)/10)+'0', 0, indexof((round(datum.fmean*10)/10) + '.', '.')+2)"
},
{
"type": "formula",
"field": "rlabel",
"expr": "indexof(''+round(datum.rmean*10)/10, '.') == -1? ((round(datum.rmean*10)/10) + '.0') : slice((round(datum.rmean*10)/10)+'0', 0, indexof((round(datum.rmean*10)/10) + '.', '.')+2)"
},
{
"type": "formula",
"field": "name",
"expr": "datum.marker + ' ' + datum.allele"
},
{
"type": "sort",
"by": ["name", "sequence"]
}
]
},
{
"name": "barcounts",
"source": "table",
"transform": [
{
"type": "aggregate",
"groupby": ["name"],
"summarize": {"*": "count"}
}
]
},
{
"name": "subgraphpadding",
"source": "barcounts",
"transform": [
{
"type": "cross",
"with": "barcounts"
},
{
"type": "filter",
"test": "datum.b.name < datum.a.name"
},
{
"type": "aggregate",
"groupby": ["a.name"],
"summarize": [{"field": "*", "ops": ["count"], "as": ["cumulpadding"]}]
},
{
"type": "formula",
"field": "name",
"expr": "datum['a.name']"
}
]
},
{
"name": "subgraphoffsets",
"source": "barcounts",
"transform": [