Commit 13e0d781 authored by Hoogenboom, Jerry

Fixed a crash in Samplestats, and minor improvements

Fixed:
* Fixed crash in Samplestats. It would crash if BGCorrect columns were
  present.
* Fixed glitch in Samplevis that allowed clicking the 'Other sequences'
  bars if the input data already contained the 'Other sequences' entry.

Improved:
* The TSSV tool will now drop any sequences that contain anything other
  than A, C, T, and G. If the -A option is given, these sequences will
  still be added to the marker aggregates. Many other tools will fail
  when confronted with such invalid sequences, especially when allele
  names need to be generated.
* In Samplevis, the sequences are now consistently sorted (except for
  some inconsistency caused by a bug in Vega). The sorting is based on
  read counts and is the same as used for the allele tables in Samplevis
  HTML visualisations.
* Added a comment line that mentions genome build GRCh38 and rCRS to the
  genome_position block in the libconvert output. This is mainly for
  documentation purposes; users are free to change this line if they use
  a different reference.
* Minor styling changes to Samplevis HTML visualisations.
parent 8d67efec
......@@ -421,7 +421,7 @@ def generate_profiles(samples_in, outfile, reportfile, allelefile,
allelelist = {tag: allelelist[tag] for tag in sample_data}
ensure_min_samples(allelelist, min_samples)
# Combine data from all samples.
# Combine data from all samples. This takes most time.
data = {}
for tag in sample_data.keys():
add_sample_data(data, sample_data[tag], allelelist[tag], min_pct,
......
......@@ -277,6 +277,8 @@ def convert_library(infile, outfile, aliases=False):
ini.set("genome_position",
"; ends at position 16569 and the second fragment starts at "
"position 1.")
ini.set("genome_position",
"; Using human genome build GRCh38 and rCRS for human mtDNA.")
ini.set("genome_position",
";MyMarker = 9, 36834400")
ini.set("genome_position",
......
......@@ -597,7 +597,7 @@ def compute_stats(infile, outfile, min_reads,
100.*combined[ci["reverse_noise"]]/combined[ci["reverse"]]
if combined[ci["reverse"]] else 0)
if "total_add" in ci:
combined[ci["total_add_pct"]] = (
combined[ci["total_added_pct"]] = (
100.*combined[ci["total_add"]]/combined[ci["total"]]
if combined[ci["total"]] else 0)
if "forward_add" in ci:
......
......@@ -16,7 +16,8 @@ import math
# make_statistics_table, prepare_output_dir
from ..lib import pos_int_arg, add_input_output_args, get_input_output_files,\
add_sequence_format_args, reverse_complement, get_column_ids
add_sequence_format_args, reverse_complement, PAT_SEQ_RAW,\
get_column_ids
__version__ = "0.1dev"
......@@ -58,12 +59,13 @@ def run_tssv_lite(infile, outfile, reportfile, is_fastq, library, seqformat,
total_reads, unrecognised, counters, sequences = process_file(
infile, file_format, tssv_library, outfiles)
# Filter out sequences with low read counts now.
# Filter out sequences with low read counts and invalid bases now.
if aggregate_below_minimum:
aggregates = {}
for marker in sequences:
for sequence in sequences[marker]:
if sum(sequences[marker][sequence]) < minimum:
if (sum(sequences[marker][sequence]) < minimum or
PAT_SEQ_RAW.match(sequence) is None):
if marker not in aggregates:
aggregates[marker] = [0, 0]
aggregates[marker][0] += sequences[marker][sequence][0]
......@@ -71,7 +73,8 @@ def run_tssv_lite(infile, outfile, reportfile, is_fastq, library, seqformat,
sequences = {marker:
{sequence: sequences[marker][sequence]
for sequence in sequences[marker]
if sum(sequences[marker][sequence]) >= minimum}
if sum(sequences[marker][sequence]) >= minimum
and PAT_SEQ_RAW.match(sequence) is not None}
for marker in sequences}
# Check presence of all markers.
......
......@@ -172,7 +172,10 @@
.markertable td:nth-child(2) {
width: auto;
}
.markertable tr:nth-child(1n) {
.markertable tr:first-child {
background-color: hsl(220, 20%, 77%);
}
.markertable tr:nth-child(1n+2) {
background-color: hsl(220, 20%, 97%);
}
.markertable tr:nth-child(2n) {
......@@ -288,7 +291,7 @@
width: 100%;
border-top: 1pt solid black;
border-right: 1pt solid black;
font-size: 8pt;
font-size: 7pt;
}
.optiongroup {
flex-grow: 1;
......@@ -365,7 +368,7 @@
.markertable td {
border: 1pt solid #888888;
padding: 2pt;
font-size: 8pt;
font-size: 7pt;
white-space: nowrap;
width: 50pt;
}
......@@ -375,6 +378,9 @@
.markertable td:nth-child(2) {
width: auto;
}
.markertable tr:first-child {
background-color: #cccccc;
}
.markertable tr:nth-child(2n) {
background-color: #eeeeee;
}
......
......@@ -154,6 +154,10 @@
"name": "annotated",
"source": "preannotated",
"transform": [
{
"type": "filter",
"test": "datum.sequence != 'Other sequences'"
},
{
"type": "formula",
"field": "shared",
......@@ -207,7 +211,7 @@
]
},
{
"name": "table",
"name": "unranked",
"source": "preannotated",
"transform": [
{
......@@ -296,10 +300,56 @@
"onKey": "markersequence",
"keys": ["markersequence"],
"as": ["thedatum"]
}
]
},
{
"name": "ranks",
"source": "unranked",
"transform": [
{
"type": "cross",
"diagonal": false,
"filter": "datum.b.marker > datum.a.marker || (datum.b.marker == datum.a.marker && (datum.b.total_added < datum.a.total_added || (datum.b.total_added == datum.a.total_added && (datum.b.total_corr < datum.a.total_corr || (datum.b.total_corr == datum.a.total_corr && (datum.b.total < datum.a.total || (datum.b.total == datum.a.total && datum.b.sequence > datum.a.sequence)))))))"
},
{
"type": "sort",
"by": ["marker", "-total_added", "-total_corr", "-total", "sequence"]
"type": "formula",
"field": "markersequence",
"expr": "datum.a.markersequence"
},
{
"type": "formula",
"field": "sequence",
"expr": "datum.a.sequence"
},
{
"type": "aggregate",
"groupby": ["markersequence", "sequence"],
"summarize": [{"field": "*", "ops": ["count"], "as": ["rank"]}]
},
{
"type": "formula",
"field": "rank",
"expr": "datum.sequence == 'Other sequences'? -1 : datum.rank"
}
]
},
{
"name": "table",
"source": "unranked",
"transform": [
{
"type": "lookup",
"on": "ranks",
"onKey": "markersequence",
"keys": ["markersequence"],
"as": ["rankobj"],
"default": {"rank": 0}
},
{
"type": "formula",
"field": "rank",
"expr": "datum.rankobj.rank"
}
]
},
......@@ -492,7 +542,8 @@
"name": "y",
"type": "ordinal",
"range": "height",
"domain": {"field": "sequence"}
"domain": {"field": "sequence", "sort": {"field": "rank", "op": "min"}},
"reverse": true
}
],
"axes": [
......
To-do:
* TRY: BGCorrect edits to get rid of overcorrection of singletons:
* Round final output to nearest integer; halves away from zero.
* Clip deviations of less than 1 read to zero while iterating.
* Both.
* Samplevis:
* Give 'Other sequences' bars a distinct look.
* Add option to truncate long allele names.
* Sort STR alleles by length by default.
* Option to adjust the sorting.
* Option to choose complete table download.
* Option to choose complete table download (all columns, not all rows).
* When we have them, add default values to table filtering (for reference).
* Some of the media query breakpoints overlap, fix this.
* Perhaps it is desirable to be able to request a list of 'Other sequences'.
......@@ -19,7 +16,11 @@ To-do:
(maybe also additional value for confidence interval).
* Visualisation to display highest remaining background (positive and
negative) in known samples after BGCorrect analysis.
* Add a way to identify sequences that do not fit the library, or an option not
to generate those.
* Add options to Libconvert to generate a template for STR or non-STR markers.
* Add options to Samplevis, Samplestats (and possibly other relevant tools) to
filter alleles by sequence length.
* Add plotting of raw data points to StuttermodelVis.
* Add a print stylesheet for the other visualisations (only Samplevis has one).
* Add visualisation with all markers in one graph ("samplesummaryvis"?).
......@@ -50,9 +51,11 @@ To-do:
(TODO: Write this list)
Open Vega issues:
* Bug in aggregate transform.
https://github.com/vega/vega/issues/530
* Lookup transform only takes simple field names for the onKey parameter.
https://github.com/vega/vega/issues/526
* Sorting is broken.
* Sorting needs the Rank transform, but that is not released yet.
https://github.com/vega/vega/issues/509
* Feature request: Id-based refs for Force transform's source and target.
https://github.com/vega/vega/issues/471
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment