Commit 5a2addb7 authored by Hoogenboom, Jerry

Developing towards v0.0.6 (v0.0.6.dev1)

* General changes in v0.0.6.dev1:
  * Tools that take a list of files as their argument (through the -i option
    or as positional arguments) now explicitly support glob patterns.  This
    means they interpret the characters '*' and '?' as wildcards for 'zero or
    more characters' and 'any one character', respectively.  On Unix-like
    systems this expansion is generally performed by the shell, but on Windows
    every file name previously had to be specified in full.
* BGEstimate v1.1.1:
  * Added option -p/--profiles which can be used to provide a previously
    created background noise profiles file.  BGEstimate will read starting
    values from this file instead of assuming zero noise.
* BGMerge v1.0.2:
  * Small code changes to facilitate explicit glob pattern matching support.
* Pipeline v1.0.1:
  * The Pipeline tool no longer checks whether the files specified for the
    -S/--in-samples option exist; this check is now left to the downstream
    tools, consistent with how the other input file options are handled.
* Allelevis v2.0.1:
  * Added tooltip support to HTML visualisations.  Moving the mouse pointer
    over a node or edge in the graph now displays a tooltip giving allele names
    and sample counts.
* Stuttermodelvis v2.0.1:
  * Changed the unit in the horizontal axis title from 'bp' to 'nt'.
* Library v1.0.1:
  * Updated some of the comments describing the sections.
parent abba1c04
......@@ -20,7 +20,7 @@
# along with FDSTools. If not, see <http://www.gnu.org/licenses/>.
#
import re, sys, argparse, random, itertools, textwrap
import re, sys, argparse, random, itertools, textwrap, glob
#import numpy as np # Imported only when calling nnls()
from ConfigParser import RawConfigParser, MissingSectionHeaderError
......@@ -298,7 +298,8 @@ def call_variants(template, sequence, location="suffix", cache=True,
in_matrix = M_GAP2
elif not (matrix_direction[i] & A_MATCH):
raise ValueError(
"Alignment error: Dead route! (This is a bug.) [%s,%s]" % (template,sequence))
"Alignment error: Dead route! (This is a bug.) [%s,%s]" %
(template,sequence))
if in_matrix == M_GAP1:
# Go horizontally. Deletion.
......@@ -1654,6 +1655,17 @@ def get_tag(filename, tag_expr, tag_format):
#get_tag
def glob_path(pathname):
"""Yield filenames matching pathname, or pathname if none match."""
success = False
for file in glob.iglob(pathname):
success = True
yield file
if not success:
yield pathname
#glob_path
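# Usage sketch for glob_path (the file names below are hypothetical and for
# illustration only):
#   list(glob_path("sample-??.csv"))  might yield ["sample-01.csv", "sample-02.csv"]
#   list(glob_path("missing.csv"))    yields ["missing.csv"], since the literal
#                                     pathname is passed through when nothing matches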
def get_input_output_files(args, single=False, batch_support=False):
if single and not batch_support:
# One infile, one outfile. Return 2-tuple (infile, outfile).
......@@ -1667,8 +1679,9 @@ def get_input_output_files(args, single=False, batch_support=False):
if args.infiles == ["-"] and sys.stdin.isatty():
return False # No input specified.
# Glob args.infiles in case the shell didn't (e.g., on Windows).
tags_to_files = {}
for infile in args.infiles:
for infile in (x for x in args.infiles for x in glob_path(x)):
tag = get_tag(infile, args.tag_expr, args.tag_format)
try:
tags_to_files[tag].append(infile)
......@@ -1680,8 +1693,10 @@ def get_input_output_files(args, single=False, batch_support=False):
if single and batch_support:
# N infiles, N outfiles. Return generator of (tag, [ins], out).
# Each yielded tuple should cause a separate run of the tool.
infiles = args.infiles if "infiles" in args \
and args.infiles is not None else [args.infile]
# Glob args.infiles in case the shell didn't (e.g., on Windows).
infiles = [x for x in infiles for x in glob_path(x)] if "infiles" in \
args and args.infiles is not None else [args.infile]
if infiles == ["-"] and sys.stdin.isatty():
return False # No input specified.
......
......@@ -31,14 +31,15 @@ by bgcorrect to filter background noise from samples.
import sys
import time
import math
import argparse
#import numpy as np # Only imported when actually running this tool.
from ..lib import pos_int_arg, add_input_output_args, get_input_output_files,\
add_allele_detection_args, nnls, add_sequence_format_args,\
parse_allelelist, get_sample_data, \
parse_allelelist, get_sample_data, load_profiles_new,\
add_random_subsampling_args
__version__ = "1.1.0"
__version__ = "1.1.1"
# Default values for parameters are specified below.
......@@ -67,24 +68,26 @@ _DEF_MIN_SAMPLE_PCT = 80.
_DEF_MIN_GENOTYPES = 3
def solve_profile_mixture(forward, reverse, genotypes, n, variance=False,
reportfile=None):
def solve_profile_mixture(forward, reverse, genotypes, n, starting_values={},
variance=False, reportfile=None):
if reportfile:
reportfile.write("Solving forward read profiles\n")
f = solve_profile_mixture_single(forward, genotypes, n,
variance=variance, reportfile=reportfile)
starting_values={x: starting_values[x][0] for x in starting_values},
variance=variance, reportfile=reportfile)
if reportfile:
reportfile.write("Solving reverse read profiles\n")
r = solve_profile_mixture_single(reverse, genotypes, n,
variance=variance, reportfile=reportfile)
starting_values={x: starting_values[x][1] for x in starting_values},
variance=variance, reportfile=reportfile)
if variance:
return f[0], r[0], f[1], r[1]
return f, r
#solve_profile_mixture
def solve_profile_mixture_single(samples, genotypes, n, variance=False,
reportfile=None):
def solve_profile_mixture_single(samples, genotypes, n, starting_values={},
variance=False, reportfile=None):
"""
Solve the single-allele profiles of n true alleles:
Profile of A: [100, 5, 50, 25, ...]
......@@ -115,6 +118,10 @@ def solve_profile_mixture_single(samples, genotypes, n, variance=False,
# Assume the true alleles do not have cross contributions at first.
np.fill_diagonal(P, 100.)
# Enter starting values into P.
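# (Assumed shape: starting_values maps (true allele index, sequence index)
# tuples to an initial value on the same percentage scale as P; positions not
# listed keep the defaults set above.)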
for x, y in starting_values:
P[x, y] = starting_values[x, y]
# Enter the samples into C.
for i in range(num_samples):
for j in genotypes[i]:
......@@ -442,7 +449,8 @@ def preprocess_data(data, min_sample_pct):
def generate_profiles(samples_in, outfile, reportfile, allelefile,
annotation_column, min_pct, min_abs, min_samples,
min_sample_pct, min_genotypes, seqformat, library,
marker, homozygotes, limit_reads, drop_samples):
profiles_in, marker, homozygotes, limit_reads,
drop_samples):
if reportfile:
t0 = time.time()
......@@ -471,6 +479,10 @@ def generate_profiles(samples_in, outfile, reportfile, allelefile,
# Filter insignificant background products.
preprocess_data(data, min_sample_pct)
# Load starting profiles.
start_profiles = {} if profiles_in is None \
else load_profiles_new(profiles_in, library)
if reportfile:
t1 = time.time()
reportfile.write("Data loading and filtering took %f seconds\n" %
......@@ -482,6 +494,15 @@ def generate_profiles(samples_in, outfile, reportfile, allelefile,
p = data[marker]["profiles"]
profile_size = len(p["alleles"])
# Read relevant starting values from the starting profiles.
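# (init is assumed to map (allele index, sequence index) pairs within this
# marker's profile to (forward, reverse) starting values taken from the
# -p/--profiles file.)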
init = {
(p["alleles"].index(allele), p["alleles"].index(seq)): (
start_profiles[marker][allele][seq]["forward"],
start_profiles[marker][allele][seq]["reverse"])
for allele in start_profiles[marker] if allele in p["alleles"]
for seq in start_profiles[marker][allele] if seq in p["alleles"]
} if marker in start_profiles else {}
# Solve for the profiles of the true alleles.
if reportfile:
reportfile.write("Solving marker %s with n=%i, m=%i, k=%i\n" %
......@@ -493,6 +514,7 @@ def generate_profiles(samples_in, outfile, reportfile, allelefile,
p["profiles_reverse"],
data[marker]["genotypes"],
p["true alleles"],
starting_values=init,
reportfile=reportfile)
if reportfile:
t1 = time.time()
......@@ -549,6 +571,9 @@ def add_arguments(parser):
help="require this minimum number of unique heterozygous genotypes "
"for each allele for which no homozygous samples are available "
"(default: %(default)s)")
filtergroup.add_argument('-p', '--profiles', metavar="FILE",
type=argparse.FileType('r'),
help="use the given noise profiles file as a starting point")
filtergroup.add_argument('-M', '--marker', metavar="MARKER",
help="work only on MARKER")
filtergroup.add_argument('-H', '--homozygotes', action="store_true",
......@@ -571,6 +596,6 @@ def run(args):
args.annotation_column, args.min_pct, args.min_abs,
args.min_samples, args.min_sample_pct,
args.min_genotypes, args.sequence_format, args.library,
args.marker, args.homozygotes, args.limit_reads,
args.drop_samples)
args.profiles, args.marker, args.homozygotes,
args.limit_reads, args.drop_samples)
#run
......@@ -40,10 +40,10 @@ Example: fdstools bgpredict ... | fdstools bgmerge old.txt > out.txt
import argparse
import sys
from ..lib import load_profiles_new, ensure_sequence_format,\
from ..lib import load_profiles_new, ensure_sequence_format, glob_path,\
add_sequence_format_args
__version__ = "1.0.1"
__version__ = "1.0.2"
def merge_profiles(infiles, outfile, seqformat, library):
......@@ -91,11 +91,11 @@ def add_arguments(parser):
def run(args):
if len(args.infiles) < 2:
if sys.stdin.isatty() or "-" in args.infiles:
infiles = [x for x in args.infiles for x in glob_path(x)]
if len(infiles) < 2:
if sys.stdin.isatty() or "-" in infiles:
raise ValueError("please specify at least two input files")
args.infiles.append("-")
infiles.append("-")
merge_profiles(args.infiles, args.outfile, args.sequence_format,
args.library)
merge_profiles(infiles, args.outfile, args.sequence_format, args.library)
#run
......@@ -57,7 +57,7 @@ from ..lib import INI_COMMENT
from ConfigParser import RawConfigParser
from types import MethodType
__version__ = "1.0.0"
__version__ = "1.0.1"
def ini_add_comment(ini, section, comment):
......@@ -70,7 +70,7 @@ def make_empty_library_ini(type, aliases=False):
ini = RawConfigParser(allow_no_value=True)
ini.optionxform = str
ini.add_comment = MethodType(ini_add_comment, ini)
# TODO: Add good examples for aliases and non-STR markers.
# TODO: Add good examples for aliases and non-STR markers
# Create sections and add comments to explain how to use them.
if aliases:
......@@ -100,19 +100,19 @@ def make_empty_library_ini(type, aliases=False):
ini.add_section("prefix")
ini.add_comment("prefix",
"Specify all known prefix sequences of each STR marker, "
"separated by commas. The first sequence listed is used as "
"the reference sequence for that marker when generating "
"allele names. The prefix is the sequence between the left "
"flank and the repeat and is omitted from allele names. "
"Deviations from the reference are expressed as variants.")
"separated by commas. The prefix is the sequence between the left "
"flank and the repeat and is omitted from allele names. The first "
"sequence listed is used as the reference sequence for that "
"marker when generating allele names. Deviations from the "
"reference are expressed as variants.")
ini.set("prefix", ";CSF1P0 = CTAAGTACTTC")
ini.add_section("suffix")
ini.add_comment("suffix",
"Specify all known suffix sequences of each STR marker, "
"separated by commas. The first sequence listed is used as "
"separated by commas. The suffix is the sequence between the "
"repeat and the right flank. The first sequence listed is used as "
"the reference sequence for that marker when generating "
"allele names. The suffix is the sequence between the repeat "
"and the right flank.")
"allele names.")
ini.set("suffix",
";CSF1P0 = CTAATCTATCTATCTTCTATCTATGAAGGCAGTTACTGTTAATATCTTCATTTTA"
"CAGGTAGGAAAACTGAGACACAGGGTGGTTAGCAACCTGCTAGTCCTTGGCAGACTCAG, CTAA"
......@@ -141,14 +141,14 @@ def make_empty_library_ini(type, aliases=False):
"position of the last base of the fragment as well.%s" % (
" Specify 'M' as the chromosome name for markers on mitochondrial "
"DNA. Allele names generated for these markers will follow mtDNA "
"nomenclature guidelines. If one of your mtDNA fragments starts "
"near the end of the reference sequence and continues at the "
"beginning, you can obtain correct base numbering by specifying "
"the fragment's genome position as \"M, (starting position), "
"16569, 1, (ending position)\". This tells FDSTools that the "
"marker is a concatenation of two fragments, where the first "
"fragment ends at position 16569 and the second fragment starts "
"at position 1." if type != "str" else ""))
"nomenclature guidelines (Parson et al., 2014). If one of your "
"mtDNA fragments starts near the end of the reference sequence "
"and continues at the beginning, you can obtain correct base "
"numbering by specifying the fragment's genome position as \"M, "
"(starting position), 16569, 1, (ending position)\". This tells "
"FDSTools that the marker is a concatenation of two fragments, "
"where the first fragment ends at position 16569 and the second "
"fragment starts at position 1." if type != "str" else ""))
ini.add_comment("genome_position",
"Using human genome build GRCh38%s." % (
" and rCRS for human mtDNA" if type != "str" else ""))
......@@ -161,8 +161,9 @@ def make_empty_library_ini(type, aliases=False):
if type == "str" or type == "full":
ini.add_section("length_adjust")
ini.add_comment("length_adjust",
"When generating allele names for STR alleles, the CE allele "
"number is based on the length of the sequence "
"To prevent discrepancies between traditional CE allele numbers "
"and the CE number in FDSTools allele names, the CE allele number "
"as calculated by FDSTools is based on the length of the sequence "
"(prefix+repeat+suffix) minus the adjustment specified here.")
ini.set("length_adjust", ";CSF1P0 = 0")
ini.add_section("block_length")
......@@ -186,9 +187,11 @@ def make_empty_library_ini(type, aliases=False):
ini.add_section("expected_allele_length")
ini.add_comment("expected_allele_length",
"Specify one or two values for each marker. The first value gives the "
"expected minimum length (in nucleotides) of the alleles and the "
"second value (if given) specifies the maximum allele length expected "
"for that marker (both inclusive).")
"expected minimum length (in nucleotides, %sexcluding flanks) of the "
"alleles and the second value (if given) specifies the maximum allele "
"length expected for that marker (both inclusive). TSSV will filter "
"sequences that have a length outside this range." %
("including prefix and suffix, " if type != "non-str" else ""))
if type != "non-str":
ini.set("expected_allele_length", ";CSF1P0 = 100")
return ini
......
......@@ -47,17 +47,17 @@ tools in FDSTools. Please refer to the tool-specific help for a full
description of each tool. Type 'fdstools -h TOOL' to get help with the
given TOOL.
"""
import pkgutil, sys, os, tempfile, re, argparse, glob
import pkgutil, sys, os, tempfile, re, argparse
#import threading, subprocess # Only imported when running this tool.
import fdstools.tools
from ..lib import split_quoted_string, DEF_TAG_EXPR, DEF_TAG_FORMAT, get_tag, \
regex_arg, INI_COMMENT
regex_arg, INI_COMMENT, glob_path
from ConfigParser import RawConfigParser, NoSectionError, NoOptionError
__version__ = "1.0.0"
__version__ = "1.0.1"
# Pattern that matches a long argparse argument name.
......@@ -406,12 +406,12 @@ def run_ref_database_analysis(arg_defs, config):
in_samples = [
"'" + x.replace("'", "\'") + "'"
for x in split_quoted_string(ini_require_option(
config, NAME, "in-samples")) for x in glob.iglob(x)]
config, NAME, "in-samples")) for x in glob_path(x)]
if "'-'" in in_samples:
raise ValueError("The sample input files cannot be named '-'")
in_samples = " ".join(x for x in in_samples if x != "''")
if not in_samples:
raise ValueError("No sample input files found")
raise ValueError("No sample input files given")
tag_expr = ini_try_get_option(config, NAME, "tag-expr", DEF_TAG_EXPR)
tag_format = ini_try_get_option(config, NAME, "tag-format", DEF_TAG_FORMAT)
prefix = ini_try_get_option(config, NAME, "prefix", "")
......
{
"fdstools_visversion": "2.0.0",
"fdstools_visversion": "2.0.1",
"width": 600,
"height": 400,
"signals": [
{
"name": "hovered",
"init": false,
"streams": [
{"type": "symbol:mouseover", "expr": "datum"},
{"type": "symbol:mouseout", "expr": "false"},
{"type": "path:mouseover", "expr": "datum"},
{"type": "path:mouseout", "expr": "false"}
]
}
],
"data": [
{
"name": "raw",
......@@ -204,7 +216,10 @@
"path": {"field": "layout_path"},
"stroke": {"scale": "c", "field": "_source.marker"},
"strokeWidth": {"field": "count", "mult": 0.5},
"strokeOpacity": {"value": 0.3}
"strokeOpacity": [
{"test": "hovered && (hovered == datum || hovered == datum._source || hovered == datum._target)", "value": 1},
{"value": 0.3}
]
}
}
},
......@@ -234,7 +249,8 @@
"x": {"field": "layout_x"},
"y": {"field": "layout_y"},
"fill": {"value": "black"},
"stroke": {"value": "black"}
"fillOpacity": {"value": 0.7},
"stroke": {"value": "transparent"}
}
}
}
......
......@@ -24,7 +24,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta charset="UTF-8">
<title>Allele Visualisation - FDSTools</title>
<!-- VERSION 2.0.0 -->
<!-- VERSION 2.0.1 -->
<!-- BEGIN_LIBRARIES -->
<script src="https://vega.github.io/vega-editor/vendor/d3.min.js"></script>
<script src="https://vega.github.io/vega/vega.min.js"></script>
......@@ -162,6 +162,15 @@
#vis div:last-child {
padding-bottom: .8rem;
}
#vis-tooltip {
display: none;
position: fixed;
z-index: 999999;
background-color: hsla(220, 20%, 77%, 0.75);
border: 1px solid hsl(220, 20%, 3%);
border-radius: .5em;
padding: 0em .25em;
}
td.num {
text-align: right;
}
......@@ -331,6 +340,9 @@
#vis > div > * {
direction: ltr;
}
#vis-tooltip {
display: none;
}
.vega, canvas {
max-width: 100%;
height: auto;
......@@ -422,6 +434,7 @@
</div>
</div>
</div>
<div id="vis-tooltip"></div>
<script type="text/javascript">
var graph = false;
var fileName = "alleles";
......@@ -444,6 +457,58 @@ function parse(){
visdiv.appendChild(newvis);
graph = chart({el: newvis, renderer: rendererName});
graph.onSignal("hovered", function(s, datum){
var tt = document.getElementById("vis-tooltip");
if(!datum){
tt.style.display = "none";
return;
}
tt.style.display = "block";
removeChildren(tt);
if(datum.source || datum.target){
//Edge.
var re = /^CE\d+\.?\d*_/;
var a = [datum._source.allele, datum._target.allele];
if(re.test(a[0]) && re.test(a[1])){
if(parseFloat(a[0].substring(2, a[0].indexOf('_'))) > parseFloat(a[1].substring(2, a[1].indexOf('_'))))
a = [a[1], a[0]];
}
else if(a[0] > a[1])
a = [a[1], a[0]];
var b = document.createElement("b");
b.appendChild(document.createTextNode(datum._source.marker));
tt.appendChild(b);
var table = document.createElement("table");
var row = table.insertRow();
row.insertCell().appendChild(document.createTextNode("Allele 1"));
row.insertCell().appendChild(document.createTextNode(a[0]));
var row = table.insertRow();
row.insertCell().appendChild(document.createTextNode("Allele 2"));
row.insertCell().appendChild(document.createTextNode(a[1]));
row = table.insertRow();
row.insertCell().appendChild(document.createTextNode("Samples"));
row.insertCell().appendChild(document.createTextNode(datum.count));
tt.appendChild(table);
return;
}
//Node.
var b = document.createElement("b");
b.appendChild(document.createTextNode(datum.marker + " " + datum.allele));
tt.appendChild(b);
var table = document.createElement("table");
var row = table.insertRow();
row.insertCell().appendChild(document.createTextNode("Samples"));
var cell = row.insertCell();
cell.setAttribute("class", "num");
cell.appendChild(document.createTextNode(datum.count));
row = table.insertRow();
row.insertCell().appendChild(document.createTextNode("Homozygotes"));
cell = row.insertCell();
cell.setAttribute("class", "num");
cell.appendChild(document.createTextNode(datum.homcount.count));
tt.appendChild(table);
});
graph.update();
if(rendererName == "svg")
updateViewBox(graph._el.childNodes[0]);
......@@ -624,6 +689,19 @@ function onLoadSpec(has_data){
setSignalValue("height", value);
}, false);
//Automatically move the tooltip with the mouse pointer.
var tt = document.getElementById("vis-tooltip");
document.addEventListener('mousemove', function(e){
if(e.clientX < document.documentElement.clientWidth / 2)
tt.style.left = Math.max(0, Math.min(document.documentElement.clientWidth - tt.offsetWidth, e.clientX + 10)) + "px";
else
tt.style.left = Math.max(0, e.clientX - tt.offsetWidth - 10) + "px";
if(e.clientY + 10 + tt.offsetHeight > document.documentElement.clientHeight)
tt.style.top = (document.documentElement.clientHeight - tt.offsetHeight) + "px";
else
tt.style.top = (e.clientY + 10) + "px";
}, false);
//Sync graph_spec and display.
document.getElementById("graphwidth").value = getSignalValue("width");
document.getElementById("graphheight").value = getSignalValue("height");
......
{
"fdstools_visversion": "1.0.0beta2",
"fdstools_visversion": "2.0.1",
"width": 600,
"height": 10,
"signals": [
......@@ -334,7 +334,7 @@
"type": "x",
"scale": "x",
"tickSizeEnd": 0,
"title": "Length of repeat (bp)"
"title": "Length of repeat (nt)"
},
{
"type": "y",
......
......@@ -2,9 +2,6 @@ To-do:
* Additions needed for publication:
* Group tools by function in the command line help and put the new pipeline
tool on top.
* [If not too difficult to implement] Give BGEstimate a background profiles
file to start with (such as computed from homozygous samples only).
* Add built-in glob pattern matching for Windows.
* Samplevis:
* Detect whether correction was performed; hide related columns if not.
* Option to choose complete table download (all columns, not all rows).
......@@ -22,21 +19,20 @@ To-do:
be repainted between each chunk. One major issue with this is that user
input events may get scheduled between the chunks.
* Allow table filtering options to be specified for each marker separately.
* Add options for exporting data in CODIS format (and possibly others?).
* Add grouping, show/hide options, and target coverage for BGAnalyseVis to the
Vis tool.
* Add per-marker allele calling settings to Samplestats.
* Exceptions to general mtDNA nomenclature: http://empop.online/methods
* Reduce noise profile memory usage:
* Use sparse matrices in BGEstimate, and BGCorrect. May save over 90% of
* Use sparse matrices in BGEstimate and BGCorrect. May save over 90% of
memory for the profile matrix obtained by merging BGEstimate and BGPredict
output with BGMerge.
* Add 'BGDiff' tool to compare noise profiles.
* Add section to the library file where genomic positions of known pathogenic
variants are specified. The TSSV tool should always output the reference base
at these positions to comply with ethical regulations.
* Add options to Libconvert to generate a template for STR or non-STR markers.
* Add options to Samplevis, Samplestats (and possibly other relevant tools) to
filter alleles by sequence length. The TSSV tool already supports this.
* Add a print stylesheet for the other visualisations (only Samplevis has one).
* Add visualisation with all markers in one graph ("samplesummaryvis"?).
* Add tool to analyse within-marker and between-marker coverage variation.
* Allow loading multiple files into HTML visualisations and provide prev/next
......@@ -49,9 +45,6 @@ To-do:
* When printing, IE11 respects the pagebreak hints. Chrome and FF are bugged!
* [Known bug]: pattern_longest_match does not give the longest match if a
shorter match is possible and found earlier at the same position.
* Add "allow_N" flag to [no_repeat] markers. If the flag is specified, the
reference sequence may contain Ns. People might need this for the rCRS mtDNA
reference sequence.
* Adjust BGEstimate so that it computes forward and reverse in one go. To do
this, double the number of columns in P and C and put the forward profile in
the left half and the reverse profile in the right half. The benefit of this
......