Commit 9ed2f3d1 authored by Hoogenboom, Jerry

Updated handling of 'No data' and 'Other sequences'

Improved:
* Added -A/--aggregate-below-minimum option to the TSSV tool. This will
  add a line with 'Other sequences' to the output, summing all sequences
  that were not reported because they had fewer reads than the number
  specified with the -a/--minimum option.
* Clarified the help text for the -D/--dir option of the TSSV tool.

Fixed:
* Updated all tools to consistently handle cases where 'No data' or
  'Other sequences' occurs in place of a sequence.
parent ebf700a7
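
For orientation, a minimal sketch of the aggregation behaviour added by the new -A/--aggregate-below-minimum option (illustrative Python only; the function name and data layout are assumptions, not the tool's actual code):

    def aggregate_below_minimum(sequences, minimum):
        # sequences: {marker: {sequence: [forward_reads, reverse_reads]}}
        # Rows below the -a/--minimum threshold are summed into a single
        # 'Other sequences' row per marker instead of being dropped silently.
        result = {}
        for marker in sequences:
            kept = {}
            other = [0, 0]
            for sequence, counts in sequences[marker].items():
                if sum(counts) < minimum:
                    other[0] += counts[0]
                    other[1] += counts[1]
                else:
                    kept[sequence] = counts
            if other != [0, 0]:
                kept["Other sequences"] = other
            result[marker] = kept
        return result
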
@@ -1045,13 +1045,11 @@ def convert_sequence_raw_allelename(seq, library, marker):
def ensure_sequence_format(seq, to_format, from_format=None, library=None,
marker=None, allow_special=False):
marker=None):
"""Convert seq to 'raw', 'tssv', or 'allelename' format."""
if seq in SEQ_SPECIAL_VALUES:
# Special case.
if allow_special:
return False
raise ValueError("Unable to handle special value '%s'" % seq)
return seq
known_formats = ("raw", "tssv", "allelename")
if to_format not in known_formats:
raise ValueError("Unknown format '%s', choose from %s" %
@@ -1181,7 +1179,7 @@ def get_repeat_pattern(seq):
def read_sample_data_file(infile, data, annotation_column=None, seqformat=None,
library=None, default_marker=None,
allow_special=False):
drop_special_seq=False):
"""Add data from infile to data dict as [marker, sequence]=reads."""
# Get column numbers.
column_names = infile.readline().rstrip("\r\n").split("\t")
@@ -1201,12 +1199,13 @@ def read_sample_data_file(infile, data, annotation_column=None, seqformat=None,
found_alleles = []
for line in infile:
line = line.rstrip("\r\n").split("\t")
if drop_special_seq and line[colid_sequence] in SEQ_SPECIAL_VALUES:
continue
marker = line[colid_marker] if colid_marker is not None \
else default_marker
sequence = line[colid_sequence] if seqformat is None \
else ensure_sequence_format(line[colid_sequence], seqformat,
library=library, marker=marker,
allow_special=allow_special)
library=library, marker=marker)
if (annotation_column is not None and
line[colid_annotation].startswith("ALLELE")):
found_alleles.append((marker, sequence))
@@ -1242,7 +1241,7 @@ def reduce_read_counts(data, limit_reads):
def get_sample_data(tags_to_files, callback, allelelist=None,
annotation_column=None, seqformat=None, library=None,
marker=None, homozygotes=False, limit_reads=sys.maxint,
drop_samples=0, allow_special=False):
drop_samples=0, drop_special_seq=False):
if drop_samples:
sample_tags = tags_to_files.keys()
for tag in random.sample(xrange(len(sample_tags)),
@@ -1255,7 +1254,7 @@ def get_sample_data(tags_to_files, callback, allelelist=None,
for infile in tags_to_files[tag]:
infile = sys.stdin if infile == "-" else open(infile, "r")
alleles.update(read_sample_data_file(infile, data,
annotation_column, seqformat, library, marker, allow_special))
annotation_column, seqformat, library, marker, drop_special_seq))
if infile != sys.stdin:
infile.close()
if limit_reads < sys.maxint:
......
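
The lib.py changes above replace the old allow_special flag with two simpler behaviours: ensure_sequence_format() now passes special values through unchanged, and read_sample_data_file()/get_sample_data() gained a drop_special_seq switch for tools that cannot use such rows. A hedged, self-contained sketch of that contract (the contents of SEQ_SPECIAL_VALUES and the helper names are assumptions):

    SEQ_SPECIAL_VALUES = ("No data", "Other sequences")  # assumed contents

    def ensure_format_sketch(seq, to_format):
        # New behaviour: special values are returned untouched instead of
        # raising, or returning False when the old allow_special flag was set.
        if seq in SEQ_SPECIAL_VALUES:
            return seq
        return seq.upper()  # stand-in for the real raw/tssv/allelename conversion

    def read_rows_sketch(rows, drop_special_seq=False):
        # Mirrors drop_special_seq: tools that ignore special values
        # (e.g. bghomstats, blame) simply skip those rows on input.
        for marker, seq, reads in rows:
            if drop_special_seq and seq in SEQ_SPECIAL_VALUES:
                continue
            yield marker, ensure_format_sketch(seq, "tssv"), reads
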
@@ -73,31 +73,34 @@ def find_alleles_sample(data, outfile, reportfile, tag, min_reads,
top_noise = {}
top_allele = {}
alleles = {}
for marker, allele in data:
reads = sum(data[marker, allele])
for marker, sequence in data:
reads = sum(data[marker, sequence])
if marker not in alleles:
alleles[marker] = {allele: reads}
top_allele[marker] = reads
alleles[marker] = {}
top_allele[marker] = 0
top_noise[marker] = ["-", 0]
else:
if reads > top_allele[marker]:
# New highest allele!
top_allele[marker] = reads
for allelex in alleles[marker].keys():
if (alleles[marker][allelex] <
top_allele[marker] * (min_allele_pct/100.)):
if alleles[marker][allelex] > top_noise[marker][1]:
top_noise[marker] = [
allelex, alleles[marker][allelex]]
del alleles[marker][allelex]
alleles[marker][allele] = reads
elif reads >= top_allele[marker]*(min_allele_pct/100.):
# New secondary allele!
alleles[marker][allele] = reads
elif reads >= top_noise[marker][1]:
# New highest noise!
top_noise[marker] = [allele, reads]
if sequence == "Other Sequences" and reads >= top_noise[marker][1]:
# Aggregated sequences are new highest noise!
top_noise[marker] = [sequence, reads]
elif reads > top_allele[marker]:
# New highest allele!
top_allele[marker] = reads
for allele in alleles[marker].keys():
if (alleles[marker][allele] <
top_allele[marker] * (min_allele_pct/100.)):
if alleles[marker][allele] > top_noise[marker][1]:
top_noise[marker] = [
allele, alleles[marker][allele]]
del alleles[marker][allele]
alleles[marker][sequence] = reads
elif reads >= top_allele[marker]*(min_allele_pct/100.):
# New secondary allele!
alleles[marker][sequence] = reads
elif reads >= top_noise[marker][1]:
# New highest noise!
top_noise[marker] = [sequence, reads]
# Find and eliminate noisy markers in this sample first.
noisy_markers = 0
......
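
The reordered allelefinder branches above implement one rule: an aggregated 'Other sequences' row may only compete for the highest-noise slot and is never promoted to an allele. A simplified sketch of that decision (names and return values are illustrative; the real code also prunes alleles when a new highest allele is found):

    def classify_sketch(sequence, reads, top_allele, top_noise, min_allele_pct):
        # 'Other sequences' can only become the highest noise, no matter
        # how many reads it aggregates.
        if sequence == "Other sequences":
            return "noise" if reads >= top_noise else "ignored"
        if reads > top_allele or reads >= top_allele * (min_allele_pct / 100.):
            return "allele"  # new highest or secondary allele
        return "noise" if reads >= top_noise else "ignored"
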
@@ -47,13 +47,13 @@ def get_sample_data(infile, convert_to_raw=False, library=None):
column_names.append("reverse_corrected")
column_names.append("total_corrected")
column_names.append("correction_flags")
colid_marker, colid_sequence, colid_forward, colid_reverse = get_column_ids(
colid_marker, colid_sequence, colid_forward, colid_reverse =get_column_ids(
column_names, "marker", "sequence", "forward", "reverse")
data = {}
for line in infile:
cols = line.rstrip("\r\n").split("\t")
marker = cols[colid_marker]
if convert_to_raw and cols[colid_sequence] not in SEQ_SPECIAL_VALUES:
if convert_to_raw:
cols[colid_sequence] = ensure_sequence_format(
cols[colid_sequence], "raw", library=library, marker=marker)
cols[colid_forward] = int(cols[colid_forward])
@@ -221,8 +221,7 @@ def match_profiles(infile, outfile, profiles, library, seqformat):
match_profile(column_names, data[marker], profiles[marker],
seqformat!="raw", library, marker)
for line in data[marker]:
if (seqformat is not None and seqformat != "raw" and
line[colid_sequence] not in SEQ_SPECIAL_VALUES):
if seqformat is not None and seqformat != "raw":
line[colid_sequence] = ensure_sequence_format(
line[colid_sequence], seqformat, library=library,
marker=marker)
......
@@ -121,7 +121,7 @@ def compute_ratios(samples_in, outfile, allelefile, annotation_column, min_pct,
{m: allelelist[tag][m].pop() for m in allelelist[tag]},
min_pct, min_abs, tag),
allelelist, annotation_column, seqformat, library, marker, True,
allow_special=True)
drop_special_seq=True)
# Ensure minimum number of samples per allele and filter
# insignificant background products.
......
@@ -28,7 +28,7 @@ import sys
from operator import mul
from ..lib import get_column_ids, reverse_complement, get_repeat_pattern,\
mutate_sequence,\
mutate_sequence, SEQ_SPECIAL_VALUES,\
PAT_SEQ_RAW, ensure_sequence_format, add_sequence_format_args
__version__ = "0.1dev"
@@ -217,11 +217,10 @@ def predict_profiles(stuttermodel, seqsfile, outfile, default_marker,
model[marker] = model["All data"]
else:
continue
sequence = ensure_sequence_format(line[colid_sequence], "raw",
library=library, marker=marker,
allow_special=True)
if sequence is False:
if line[colid_sequence] in SEQ_SPECIAL_VALUES:
continue
sequence = ensure_sequence_format(line[colid_sequence], "raw",
library=library, marker=marker)
try:
seqlist[marker].append(sequence)
except KeyError:
......
@@ -78,7 +78,7 @@ def blame(samples_in, outfile, allelefile, annotation_column, mode,
lambda tag, sample_data: add_sample_data(
data, sample_data, tag,
{m: allelelist[tag][m] for m in data if m in allelelist[tag]}),
allelelist, annotation_column, "raw", library)
allelelist, annotation_column, "raw", library, drop_special_seq=True)
outfile.write("\t".join(["marker",
"allele" if mode == "common" else "sample",
......
@@ -403,6 +403,11 @@ def compute_stats(infile, outfile, min_reads,
row.append(100.*row[ci["reverse_add"]]/row[ci["reverse"]]
if row[ci["reverse"]] else 0)
# The 'No data' lines are fine like this.
if row[ci["sequence"]] == "No data":
row[ci["flags"]] = ",".join(row[ci["flags"]])
continue
# Get the values we will filter on.
total_added = row[ci["total"]] if "total_corrected" not in ci \
else row[ci["total_corrected"]]
@@ -421,13 +426,15 @@
else row[ci["reverse_corrected"]])
# Check if this sequence should be filtered out.
# Always filter/combine existing 'Other sequences'.
if filter_action != "off" and (
row[ci["sequence"]] == "Other sequences" or (
total_added < min_reads_filt or
pct_of_max < min_pct_of_max_filt or
pct_of_sum < min_pct_of_sum_filt or
(correction < min_correction_filt and
recovery < min_recovery_filt) or
min_strand < min_per_strand_filt):
min_strand < min_per_strand_filt)):
filtered[marker].append(row)
# Check if this sequence is an allele.
......
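
The samplestats change above adds two special-case rules to compute_stats(): 'No data' rows skip filtering entirely (only their flags column is joined before writing), and pre-existing 'Other sequences' rows are always routed to the filtered/combined set whenever filtering is enabled. A condensed sketch (names illustrative):

    def route_row_sketch(sequence, passes_thresholds, filter_action):
        # passes_thresholds stands in for the min_reads/percentage checks.
        if sequence == "No data":
            return "write unchanged"
        if filter_action != "off" and (sequence == "Other sequences"
                                       or not passes_thresholds):
            return "filter/combine"
        return "keep"
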
@@ -39,7 +39,7 @@ import argparse, sys
from ..lib import get_column_ids, ensure_sequence_format, parse_library,\
reverse_complement, add_input_output_args,\
get_input_output_files
get_input_output_files, SEQ_SPECIAL_VALUES
__version__ = "0.1dev"
@@ -84,21 +84,17 @@ def convert_sequences(infile, outfile, to_format, library=None,
marker = line[colid_marker] if fixed_marker is None else fixed_marker
seq = line[colid_sequence]
if library2 != library:
if library2 != library and seq not in SEQ_SPECIAL_VALUES:
seq = ensure_sequence_format(
seq, "raw", marker=marker, library=library, allow_special=True)
if seq == False:
seq = line[colid_sequence]
elif marker in revcomp_markers:
seq, "raw", marker=marker, library=library)
if marker in revcomp_markers:
seq = reverse_complement(seq)
# TODO: The current implementation assumes identical
# flanking sequences. Introduce means to shift flanking
# sequence in/out of prefix and/or suffix.
seq = ensure_sequence_format(seq, to_format, marker=marker,
library=library2, allow_special=True)
if seq == False:
seq = line[colid_sequence]
library=library2)
line[colid_sequence_out] = seq
outfile.write("\t".join(line) + "\n")
#convert_sequences
......
@@ -32,7 +32,8 @@ import argparse, sys
from ..lib import pos_int_arg, print_db, PAT_TSSV_BLOCK, get_column_ids, \
add_input_output_args, get_input_output_files, \
ensure_sequence_format, add_sequence_format_args
ensure_sequence_format, add_sequence_format_args, \
SEQ_SPECIAL_VALUES
__version__ = "1.5"
@@ -115,31 +116,30 @@ def load_data(infile, colname_annotation=_DEF_COLNAME, library=None):
print("WARNING: skipped line: %s" % line)
continue
# Convert to TSSV-style sequences.
# String to integer conversion...
columns[colid_total] = int(columns[colid_total])
if columns[colid_sequence]:
# Convert to TSSV-style sequences.
marker = None if colid_marker is None else columns[colid_marker]
columns[colid_sequence] = ensure_sequence_format(
columns[colid_sequence], 'tssv', marker=marker,library=library)
# Split the sequence column into a list of tuples:
# [('ACTG','4'),('CCTC','12'),...]
columns[colid_sequence] =PAT_TSSV_BLOCK.findall(columns[colid_sequence])
# String to integer conversion...
columns[colid_sequence] = [[x[0], int(x[1])]
for x in columns[colid_sequence]]
columns[colid_total] = int(columns[colid_total])
# Repeat unit deduplication (data may contain stuff like
# "AGAT(7)AGAT(5)" instead of "AGAT(12)").
if columns[colid_sequence]:
dedup = [columns[colid_sequence][0]]
for i in range(1, len(columns[colid_sequence])):
if columns[colid_sequence][i][0] == dedup[-1][0]:
dedup[-1][1] += columns[colid_sequence][i][1]
else:
dedup.append(columns[colid_sequence][i])
columns[colid_sequence] = dedup
if columns[colid_sequence] not in SEQ_SPECIAL_VALUES:
# Split the sequence column into a list of tuples:
# [('ACTG', 4), ('CCTC', 12), ...]
columns[colid_sequence] = [[x[0], int(x[1])] for x in
PAT_TSSV_BLOCK.findall(columns[colid_sequence])]
# Repeat unit deduplication (data may contain stuff like
# "AGAT(7)AGAT(5)" instead of "AGAT(12)").
dedup = [columns[colid_sequence][0]]
for i in range(1, len(columns[colid_sequence])):
if columns[colid_sequence][i][0] == dedup[-1][0]:
dedup[-1][1] += columns[colid_sequence][i][1]
else:
dedup.append(columns[colid_sequence][i])
columns[colid_sequence] = dedup
# Add the sequence to the list, including our new column.
columns.append("UNKNOWN")
@@ -235,6 +235,10 @@ def annotate_alleles(infile, outfile, stutter, min_reads=_DEF_MIN_READS,
if allelelist[iCurrent][colid_total] < min_reads:
continue
# Skip special sequence values.
if allelelist[iCurrent][colid_sequence] in SEQ_SPECIAL_VALUES:
continue
isStutterPeakOf = {}
maximumOccurrenceExpected = 0
@@ -247,6 +251,10 @@ def annotate_alleles(infile, outfile, stutter, min_reads=_DEF_MIN_READS,
!= allelelist[iOther][colid_marker]):
continue
# Skip special sequence values.
if allelelist[iOther][colid_sequence] in SEQ_SPECIAL_VALUES:
continue
print_db('%i vs %i' % (iCurrent+1, iOther+1), debug)
# Must be same number of blocks.
@@ -360,8 +368,10 @@ def annotate_alleles(infile, outfile, stutter, min_reads=_DEF_MIN_READS,
# Reconstruct the sequence and write out the findings.
for i in range(len(allelelist)):
allelelist[i][colid_sequence] = "".join(
"%s(%i)" % tuple(block) for block in allelelist[i][colid_sequence])
if allelelist[i][colid_sequence] not in SEQ_SPECIAL_VALUES:
allelelist[i][colid_sequence] = "".join(
"%s(%i)" % tuple(block)
for block in allelelist[i][colid_sequence])
outfile.write("\t".join(column_names))
outfile.write("\n")
outfile.write("\n".join(
......
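
The load_data() rework above keeps the repeat-unit deduplication but now applies it only to non-special sequences. Its core is small enough to show as a standalone sketch: adjacent blocks with the same unit are merged, e.g. AGAT(7)AGAT(5) becomes AGAT(12) (the function name is illustrative):

    def dedup_blocks(blocks):
        # blocks: [['AGAT', 7], ['AGAT', 5], ['TCTA', 3]] -> merge neighbours
        if not blocks:
            return blocks
        dedup = [list(blocks[0])]
        for unit, count in blocks[1:]:
            if unit == dedup[-1][0]:
                dedup[-1][1] += count
            else:
                dedup.append([unit, count])
        return dedup
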
@@ -44,7 +44,8 @@ def convert_library(library, threshold):
def run_tssv_lite(infile, outfile, reportfile, is_fastq, library, seqformat,
threshold, minimum, missing_marker_action, dirname):
threshold, minimum, aggregate_below_minimum,
missing_marker_action, dirname):
file_format = "fastq" if is_fastq else "fasta"
tssv_library = convert_library(library, threshold)
@@ -58,6 +59,15 @@ def run_tssv_lite(infile, outfile, reportfile, is_fastq, library, seqformat,
infile, file_format, tssv_library, outfiles)
# Filter out sequences with low read counts now.
if aggregate_below_minimum:
aggregates = {}
for marker in sequences:
for sequence in sequences[marker]:
if sum(sequences[marker][sequence]) < minimum:
if marker not in aggregates:
aggregates[marker] = [0, 0]
aggregates[marker][0] += sequences[marker][sequence][0]
aggregates[marker][1] += sequences[marker][sequence][1]
sequences = {marker:
{sequence: sequences[marker][sequence]
for sequence in sequences[marker]
@@ -73,6 +83,11 @@ def run_tssv_lite(infile, outfile, reportfile, is_fastq, library, seqformat,
else:
raise ValueError("Marker %s was not detected!" % marker)
# Add aggregate rows if the user requested so.
if aggregate_below_minimum:
for marker in aggregates:
sequences[marker]["Other sequences"] = aggregates[marker]
column_names, tables = make_sequence_tables(sequences, 0)
# Convert sequences to the desired format.
@@ -127,13 +142,22 @@ def add_arguments(parser):
default=_DEF_MINIMUM,
help="report only sequences with this minimum number of reads "
"(default: %(default)s)")
parser.add_argument("-A", "--aggregate-below-minimum", action="store_true",
help="if specified, sequences that have less than the number of reads "
"specified with the -a/--minimum option will be aggregated per "
"marker and reported as 'Other sequences'")
parser.add_argument("-M", "--missing-marker-action", metavar="ACTION",
choices=("include", "exclude", "halt"),
default="include",
help="action to take when no sequences are linked to a marker: one of "
"%(choices)s (default: %(default)s)")
parser.add_argument("-D", "--dir",
help="output directory for verbose output")
help="output directory for verbose output; when given, a subdirectory "
"will be created for each marker, each with a separate "
"sequences.csv file and a number of FASTA/FASTQ files containing "
"unrecognised reads (unknown.fa), recognised reads "
"(Marker/paired.fa), and reads that lack one of the flanks of a "
"marker (Marker/noend.fa and Marker/nostart.fa")
#add_arguments
@@ -157,7 +181,8 @@ def run(args):
infile = sys.stdin if files[0] == "-" else open(files[0], "r")
run_tssv_lite(infile, files[1], args.report, args.is_fastq, args.library,
args.sequence_format, args.mismatches, args.minimum,
args.missing_marker_action, args.dir)
args.aggregate_below_minimum, args.missing_marker_action,
args.dir)
if infile != sys.stdin:
infile.close()
#run
\ No newline at end of file
To-do:
* Samplevis:
* Re-layout:
http://matthewjamestaylor.com/blog/perfect-multi-column-liquid-layouts.
* Allow for heavily corrected sequences to remain visible in the graphs.
* Re-layout, inspiration:
http://matthewjamestaylor.com/blog/perfect-multi-column-liquid-layouts
* Replace the alert for 'No data' markers with a message on the page.
* Allow for heavily corrected sequences to remain visible in the graphs.
* Make sure markers for which no sequences meet the filtering criteria are
still visible.
* Sort STR alleles by length by default.
* Option to adjust the sorting.
* Option to choose complete table download.
* When we have them, add default values to table filtering (for reference).
* Add 'Other sequences' line (optional) to TSSV output if -a > 1.
* Additions needed for publication:
* [If not too difficult to implement] BGEstimate should start with homozygous
samples and add heterozygous samples later to optimise correction.
@@ -20,7 +19,6 @@ To-do:
* Visualisation to display highest remaining background (positive and
negative) in known samples after BGCorrect analysis.
* Add options to Libconvert to generate a template for STR or non-STR markers.
* Make sure blame doesn't crash on 'No data' lines.
* Add plotting of raw data points to StuttermodelVis.
* Add a print stylesheet for the other visualisations (only Samplevis has one).
* Add visualisation with all markers in one graph ("samplesummaryvis"?).
@@ -156,6 +154,24 @@ Reserved option letters:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Action for special values in sequence columns:
TOOL           'No data'                 'Other sequences'
allelefinder   Not suitable for marker   Treated as single noise sequence
bgcorrect      Transparent               Transparent
bgestimate     Ignored                   Ignored
bghomraw       Ignored                   Ignored
bghomstats     Ignored                   Ignored
bgmerge        Transparent               Transparent
bgpredict      Ignored                   Ignored
blame          Ignored                   Ignored
samplestats    Transparent               Transparent; may remove or add more
seqconvert     Transparent               Transparent
stuttermark    Marked as 'UNKNOWN'       Marked as 'UNKNOWN'
stuttermodel   Ignored                   Ignored
tssv           May create them           May create them
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Sequence format conversions:
From    To
raw     tssv    OK - Transparent to non-STR markers (have no regex_middle).
......
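
A hedged sketch of how the three policies in the 'Action for special values' table above could look in code (the contents of SEQ_SPECIAL_VALUES and the function are illustrative, not taken from any tool):

    SEQ_SPECIAL_VALUES = ("No data", "Other sequences")  # assumed contents

    def apply_policy_sketch(rows, policy):
        # policy: 'transparent' passes special rows through untouched,
        #         'ignored' drops them (the drop_special_seq behaviour),
        #         'unknown' keeps them but annotates 'UNKNOWN' (as stuttermark does).
        for marker, seq, reads in rows:
            if seq in SEQ_SPECIAL_VALUES:
                if policy == "ignored":
                    continue
                if policy == "unknown":
                    yield marker, seq, reads, "UNKNOWN"
                    continue
            yield marker, seq, reads, None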