Commit be8dbe46 authored by Hoogenboom, Jerry's avatar Hoogenboom, Jerry

FDSTools v1.1.0.dev3: Fixes and pipelining enhancements

* General changes in v1.1.0.dev3:
  * Allele name heuristics: don't produce insertions at the end of the
    prefix or at the beginning of the suffix; just include extra STR
    blocks.
  * FDSTools will no longer crash with a 'column not found' error when
    an input file is empty. This situation is now treated as if the
    expected columns existed, but no lines of actual data were present.
    This greatly helps in tracking down issues in pipelines involving
    multiple tools, as tools will now shut down gracefully if an upstream
    tool fails to write output.
* Allelefinder v1.0.1:
  * Fixed crash that occurred when converting sequences to allele name
    format while no library file was provided.
  * Don't crash when output pipe is closed.
* BGAnalyse v1.0.1:
  * Don't crash when output pipe is closed.
* BGCorrect v1.0.2:
  * Don't crash on empty input files.
  * Don't crash when output pipe is closed.
* BGEstimate v1.1.2:
  * Don't crash when output pipe is closed.
* BGHomRaw v1.0.1:
  * Clarified the 'Allele x of marker y has 0 reads' error message with
    the sample tag.
  * Don't crash when output pipe is closed.
* BGHomStats v1.0.1:
  * Error messages about the input data now contain the sample tag of
    the sample that triggered the error.
  * Don't crash when output pipe is closed.
* BGMerge v1.0.3:
  * Don't crash when output pipe is closed.
* BGPredict v1.0.2:
  * Don't crash on empty input files.
  * Don't crash when output pipe is closed.
* FindNewAlleles v1.0.1:
  * Don't crash on empty input files.
  * Don't crash when output pipe is closed.
* Libconvert v1.1.2:
  * Don't crash when output pipe is closed.
* Library v1.0.3:
  * Don't crash when output pipe is closed.
* Seqconvert v1.0.2:
  * Don't crash when output pipe is closed.
* Samplestats v1.1.1:
  * Don't crash on empty input files.
  * Don't crash when output pipe is closed.
* Stuttermark v1.5.1:
  * Don't crash on empty input files.
  * Don't crash when output pipe is closed.
* Stuttermodel v1.1.2:
  * Don't crash when output pipe is closed.
* TSSV v1.1.0 (additionally):
  * When running analysis in parallel, make tasks of 1 million alignments.
    Previously, this was 10k reads, with the number of alignments per task
    depending on the size of the library file. This caused memory issues for
    huge libraries like whole mt interval libraries.
  * Don't crash when output pipe is closed.
* Vis v1.0.4:
  * Don't crash when output pipe is closed.
parent 9b261e56
#
# Copyright (C) 2016 Jerry Hoogenboom
# Copyright (C) 2017 Jerry Hoogenboom
#
# This file is part of FDSTools, data analysis tools for Next
# Generation Sequencing of forensic DNA markers.
......@@ -24,7 +24,7 @@ including tools for characterisation and filtering of PCR stutter artefacts and
other systemic noise, and for automatic detection of the alleles in a sample.
"""
__version_info__ = ('1', '1', '0', 'dev2')
__version_info__ = ('1', '1', '0', 'dev3')
__version__ = '.'.join(__version_info__)
usage = __doc__.split("\n\n\n")
......
#!/usr/bin/env python
#
# Copyright (C) 2016 Jerry Hoogenboom
# Copyright (C) 2017 Jerry Hoogenboom
#
# This file is part of FDSTools, data analysis tools for Next
# Generation Sequencing of forensic DNA markers.
......@@ -25,6 +25,7 @@ import re, sys, argparse, random, itertools, textwrap, glob
from ConfigParser import RawConfigParser, MissingSectionHeaderError
from StringIO import StringIO
from errno import EPIPE
# Patterns that match entire sequences.
PAT_SEQ_RAW = re.compile("^[ACGT]*$")
......@@ -759,6 +760,8 @@ def parse_library_ini(handle):
def load_profiles(profilefile, library=None):
# TODO: To be replaced with load_profiles_new (less memory).
column_names = profilefile.readline().rstrip("\r\n").split("\t")
if column_names == [""]:
return {} # Empty file.
(colid_marker, colid_allele, colid_sequence, colid_fmean, colid_rmean,
colid_tool) = get_column_ids(column_names, "marker", "allele", "sequence",
"fmean", "rmean", "tool")
......@@ -824,6 +827,8 @@ def load_profiles(profilefile, library=None):
def load_profiles_new(profilefile, library=None):
# TODO, rename this to load_profiles to complete transition.
column_names = profilefile.readline().rstrip("\r\n").split("\t")
if column_names == [""]:
return {} # Empty file.
(colid_marker, colid_allele, colid_sequence, colid_fmean, colid_rmean,
colid_tool) = get_column_ids(column_names, "marker", "allele", "sequence",
"fmean", "rmean", "tool")
......@@ -1005,25 +1010,27 @@ def convert_sequence_raw_tssv(seq, library, marker, return_alias=False):
if (not pre_suf[0] and "prefix" in library
and marker in library["prefix"]):
ref = library["prefix"][marker][0]
i = min(len(ref), len(matched))
while i > 0:
if ref.endswith(matched[:i]):
start += i
matched = matched[i:]
modified = True
break
i -= 1
if start != len(ref):
i = min(len(ref), len(matched))
while i > 0:
if ref.endswith(matched[:i]):
start += i
matched = matched[i:]
modified = True
break
i -= 1
if (not pre_suf[1] and "suffix" in library
and marker in library["suffix"]):
ref = library["suffix"][marker][0]
i = min(len(ref), len(matched))
while i > 0:
if ref.startswith(matched[-i:]):
end -= i
matched = matched[:-i]
modified = True
break
i -= 1
if len(seq) - end != len(ref):
i = min(len(ref), len(matched))
while i > 0:
if ref.startswith(matched[-i:]):
end -= i
matched = matched[:-i]
modified = True
break
i -= 1
if modified:
from_start = start - match_start
from_end = match_end - end
......@@ -1041,15 +1048,24 @@ def convert_sequence_raw_tssv(seq, library, marker, return_alias=False):
else:
from_end -= len(middle[-1])
middle = middle[:-1]
if start:
if pre_suf[0]:
middle.insert(0, seq[:start])
matched = seq[:start] + matched
else:
pre_suf[0] = seq[:start]
if end < len(seq):
if pre_suf[1]:
middle.append(seq[end:])
matched = matched + seq[end:]
else:
pre_suf[1] = seq[end:]
if middle:
middle = reduce(
lambda x, y: (x[:-1] if x[-1][0] == y else x) +
[(y, x[-1][1]+len(y))], middle[1:],
[(middle[0],
start+len(middle[0])+len(pre_suf[0]))])
pre_suf[0] += seq[:start]
pre_suf[1] = seq[end:] + pre_suf[1]
len(middle[0])+len(pre_suf[0]))])
seq = matched
# Now construct parts.
......@@ -1371,6 +1387,8 @@ def read_sample_data_file(infile, data, annotation_column=None, seqformat=None,
"""Add data from infile to data dict as [marker, sequence]=reads."""
# Get column numbers.
column_names = infile.readline().rstrip("\r\n").split("\t")
if column_names == [""]:
return [] # Empty file.
colid_sequence = get_column_ids(column_names, "sequence")
colid_forward = None
colid_reverse = None
......@@ -1508,6 +1526,8 @@ def get_column_ids(column_names, *names, **optional):
def parse_allelelist(allelelist, convert=None, library=None):
"""Read allele list from open file handle."""
column_names = allelelist.readline().rstrip("\r\n").split("\t")
if column_names == [""]:
return {} # Empty file.
colid_sample, colid_marker, colid_allele = get_column_ids(column_names,
"sample", "marker", "allele")
alleles = {}
......@@ -1791,6 +1811,17 @@ def split_quoted_string(text):
#split_quoted_string
def write_pipe(stream, *args):
    """Forward *args to stream.write(), silently ignoring EPIPE.

    Any other IOError is re-raised unchanged, so callers still see
    genuine I/O failures; only a closed downstream pipe is tolerated.
    """
    try:
        stream.write(*args)
    except IOError as e:
        # A broken pipe just means the reader went away; stop quietly.
        if e.errno != EPIPE:
            raise
#write_pipe
def print_db(text, debug):
"""Print text if debug is True."""
if debug:
......
#!/usr/bin/env python
#
# Copyright (C) 2016 Jerry Hoogenboom
# Copyright (C) 2017 Jerry Hoogenboom
#
# This file is part of FDSTools, data analysis tools for Next
# Generation Sequencing of forensic DNA markers.
......@@ -38,11 +38,12 @@ this file to do their job. One may use the allelefinder report
(-R/--report output argument) and the blame tool to get a quick overview
of what might be wrong.
"""
from errno import EPIPE
from ..lib import pos_int_arg, add_input_output_args, get_input_output_files, \
ensure_sequence_format, get_sample_data, SEQ_SPECIAL_VALUES,\
add_sequence_format_args
add_sequence_format_args, write_pipe
__version__ = "1.0.0"
__version__ = "1.0.1"
# Default values for parameters are specified below.
......@@ -71,8 +72,6 @@ _DEF_MAX_NOISY = 2
def find_alleles(samples_in, outfile, reportfile, min_reads, min_allele_pct,
max_noise_pct, max_alleles, max_noisy, stuttermark_column,
seqformat, library):
library = library if library is not None else {}
outfile.write("\t".join(["sample", "marker", "total", "allele"]) + "\n")
allelelist = {}
get_sample_data(
......@@ -127,7 +126,7 @@ def find_alleles_sample(data, outfile, reportfile, tag, min_reads,
noisy_markers = 0
for marker in alleles:
if top_allele[marker] < min_reads:
reportfile.write(
write_pipe(reportfile,
"Sample %s is not suitable for marker %s:\n"
"highest allele has only %i reads\n\n" %
(tag, marker, top_allele[marker]))
......@@ -142,7 +141,7 @@ def find_alleles_sample(data, outfile, reportfile, tag, min_reads,
alleles[marker] = {x: alleles[marker][x]
for x in allele_order[:expect]}
if top_noise[marker][1] > top_allele[marker]*(max_noise_pct/100.):
reportfile.write(
write_pipe(reportfile,
"Sample %s is not suitable for marker %s:\n"
"highest non-allele is %.1f%% of the highest allele\n" %
(tag, marker, 100.*top_noise[marker][1]/top_allele[marker]))
......@@ -151,18 +150,20 @@ def find_alleles_sample(data, outfile, reportfile, tag, min_reads,
seq = allele if seqformat is None \
else ensure_sequence_format(allele, seqformat,
library=library, marker=marker)
reportfile.write("%i\tALLELE\t%s\n" %
write_pipe(reportfile, "%i\tALLELE\t%s\n" %
(alleles[marker][allele], seq))
seq = top_noise[marker][0] if seqformat is None \
else ensure_sequence_format(top_noise[marker][0],
seqformat, library=library, marker=marker)
reportfile.write("%i\tNOISE\t%s\n\n" % (top_noise[marker][1], seq))
write_pipe(reportfile,
"%i\tNOISE\t%s\n\n" % (top_noise[marker][1], seq))
noisy_markers += 1
alleles[marker] = {}
# Drop this sample completely if it has too many noisy markers.
if noisy_markers > max_noisy:
reportfile.write("Sample %s appears to be contaminated!\n\n" % tag)
write_pipe(reportfile,
"Sample %s appears to be contaminated!\n\n" % tag)
return
# The sample is OK, write out its alleles.
......@@ -179,7 +180,7 @@ def find_alleles_sample(data, outfile, reportfile, tag, min_reads,
def get_max_expected_alleles(max_alleles, marker, library):
    """Return the maximum number of alleles expected for this marker.

    An explicit max_alleles value always wins.  Otherwise, look up the
    marker in the library's max_expected_copies section, defaulting to 2
    (diploid) when the library is absent or has no entry for the marker.
    """
    if max_alleles is not None:
        return max_alleles
    # No library loaded, or no per-marker copy numbers configured.
    if library is None or "max_expected_copies" not in library:
        return 2
    return library["max_expected_copies"].get(marker, 2)
#get_max_expected_alleles
......@@ -225,8 +226,19 @@ def run(args):
raise ValueError("please specify an input file, or pipe in the output "
"of another program")
find_alleles(files[0], files[1], args.report, args.min_reads,
args.min_allele_pct, args.max_noise_pct, args.max_alleles,
args.max_noisy, args.stuttermark_column, args.sequence_format,
args.library)
try:
find_alleles(files[0], files[1], args.report, args.min_reads,
args.min_allele_pct, args.max_noise_pct, args.max_alleles,
args.max_noisy, args.stuttermark_column,
args.sequence_format, args.library)
except IOError as e:
if e.errno == EPIPE:
if args.report:
try:
write_pipe(args.report,
"Stopped early because the output was closed.\n")
except:
pass
return
raise
#run
#!/usr/bin/env python
#
# Copyright (C) 2016 Jerry Hoogenboom
# Copyright (C) 2017 Jerry Hoogenboom
#
# This file is part of FDSTools, data analysis tools for Next
# Generation Sequencing of forensic DNA markers.
......@@ -43,11 +43,12 @@ cleanest x% of samples, for different values of x.
"""
import math
from errno import EPIPE
from ..lib import pos_int_arg, add_input_output_args, get_input_output_files,\
add_allele_detection_args, parse_allelelist,\
get_sample_data, add_sequence_format_args
__version__ = "1.0.0"
__version__ = "1.0.1"
# Default values for parameters are specified below.
......@@ -237,8 +238,13 @@ def run(args):
if not files:
raise ValueError("please specify an input file, or pipe in the output "
"of another program")
analyse_background(files[0], files[1], args.allelelist,
args.annotation_column, args.sequence_format,
args.library, args.mode,
map(float, args.percentiles.split(",")))
try:
analyse_background(files[0], files[1], args.allelelist,
args.annotation_column, args.sequence_format,
args.library, args.mode,
map(float, args.percentiles.split(",")))
except IOError as e:
if e.errno == EPIPE:
return
raise
#run
#!/usr/bin/env python
#
# Copyright (C) 2016 Jerry Hoogenboom
# Copyright (C) 2017 Jerry Hoogenboom
#
# This file is part of FDSTools, data analysis tools for Next
# Generation Sequencing of forensic DNA markers.
......@@ -45,11 +45,12 @@ profile was based on direct observations.
import argparse, sys
#import numpy as np # Only imported when actually running this tool.
from errno import EPIPE
from ..lib import load_profiles, ensure_sequence_format, get_column_ids, \
nnls, add_sequence_format_args, SEQ_SPECIAL_VALUES, \
add_input_output_args, get_input_output_files
__version__ = "1.0.1"
__version__ = "1.0.2"
def get_sample_data(infile, convert_to_raw=False, library=None):
......@@ -58,6 +59,8 @@ def get_sample_data(infile, convert_to_raw=False, library=None):
sample, and fill a dict with all sequences in the sample.
"""
column_names = infile.readline().rstrip("\r\n").split("\t")
if column_names == [""]:
return None, None # Empty file.
column_names.append("forward_noise")
column_names.append("reverse_noise")
column_names.append("total_noise")
......@@ -241,6 +244,8 @@ def match_profile(column_names, data, profile, convert_to_raw, library,
def match_profiles(infile, outfile, profiles, library, seqformat):
column_names, data = get_sample_data(
infile, convert_to_raw=seqformat=="raw", library=library)
if column_names is None:
return # Empty file.
colid_sequence = get_column_ids(column_names, "sequence")
outfile.write("\t".join(column_names) + "\n")
......@@ -290,9 +295,14 @@ def run(args):
if len(infiles) > 1:
raise ValueError(
"multiple input files for sample '%s' specified " % tag)
infile = sys.stdin if infiles[0] == "-" else open(infiles[0], "r")
match_profiles(infile, outfile, profiles, args.library,
args.sequence_format)
if infile != sys.stdin:
infile.close()
try:
infile = sys.stdin if infiles[0] == "-" else open(infiles[0], "r")
match_profiles(infile, outfile, profiles, args.library,
args.sequence_format)
if infile != sys.stdin:
infile.close()
except IOError as e:
if e.errno == EPIPE:
continue
raise
#run
#!/usr/bin/env python
#
# Copyright (C) 2016 Jerry Hoogenboom
# Copyright (C) 2017 Jerry Hoogenboom
#
# This file is part of FDSTools, data analysis tools for Next
# Generation Sequencing of forensic DNA markers.
......@@ -34,12 +34,13 @@ import math
import argparse
#import numpy as np # Only imported when actually running this tool.
from errno import EPIPE
from ..lib import pos_int_arg, add_input_output_args, get_input_output_files,\
add_allele_detection_args, nnls, add_sequence_format_args,\
parse_allelelist, get_sample_data, load_profiles_new,\
add_random_subsampling_args
add_random_subsampling_args, write_pipe
__version__ = "1.1.1"
__version__ = "1.1.2"
# Default values for parameters are specified below.
......@@ -71,12 +72,12 @@ _DEF_MIN_GENOTYPES = 3
def solve_profile_mixture(forward, reverse, genotypes, n, starting_values={},
variance=False, reportfile=None):
if reportfile:
reportfile.write("Solving forward read profiles\n")
write_pipe(reportfile, "Solving forward read profiles\n")
f = solve_profile_mixture_single(forward, genotypes, n,
starting_values={x: starting_values[x][0] for x in starting_values},
variance=variance, reportfile=reportfile)
if reportfile:
reportfile.write("Solving reverse read profiles\n")
write_pipe(reportfile, "Solving reverse read profiles\n")
r = solve_profile_mixture_single(reverse, genotypes, n,
starting_values={x: starting_values[x][1] for x in starting_values},
variance=variance, reportfile=reportfile)
......@@ -133,7 +134,7 @@ def solve_profile_mixture_single(samples, genotypes, n, starting_values={},
scale_factor = (100.0/samples[i][j])/len(genotypes[i])
except ZeroDivisionError:
if reportfile:
reportfile.write(
write_pipe(reportfile,
"Sample %i does not have allele %i\n" % (i, j))
continue
C[j, :] += [x * scale_factor for x in samples[i]]
......@@ -173,7 +174,7 @@ def solve_profile_mixture_single(samples, genotypes, n, starting_values={},
if not E[p, p]:
# This would be an utter failure, but let's be safe.
if reportfile:
reportfile.write(
write_pipe(reportfile,
"%4i - No samples appear to have allele %i\n" %
(v, p))
P[p, nn] = 0
......@@ -193,10 +194,10 @@ def solve_profile_mixture_single(samples, genotypes, n, starting_values={},
cur_score = np.square(C - A * P).sum()
score_change = (prev_score-cur_score)/prev_score
if v and reportfile:
reportfile.write("%4i %15.6f %15.6f %6.2f\n" %
write_pipe(reportfile, "%4i %15.6f %15.6f %6.2f\n" %
(v, cur_score, prev_score-cur_score, 100*score_change))
elif reportfile:
reportfile.write("%4i %15.6f\n" % (v, cur_score))
write_pipe(reportfile, "%4i %15.6f\n" % (v, cur_score))
if not cur_score or score_change < 0.0001:
break
......@@ -210,13 +211,13 @@ def solve_profile_mixture_single(samples, genotypes, n, starting_values={},
# more appropriate to compute the sample variance, but how?
if reportfile:
if sum(map(len, genotypes))-len(genotypes):
reportfile.write(
write_pipe(reportfile,
"Computing variances...\n"
"EXPERIMENAL feature! The values produced may give a "
"sense of the amount of variation,\nbut should not be "
"used in further computations that expect true variances!")
else:
reportfile.write(
write_pipe(reportfile,
"Computing variances...\n"
"EXPERIMENAL feature! The values produced are population "
"variances.\nThis may (or may not) change to sample "
......@@ -485,7 +486,7 @@ def generate_profiles(samples_in, outfile, reportfile, allelefile,
if reportfile:
t1 = time.time()
reportfile.write("Data loading and filtering took %f seconds\n" %
write_pipe(reportfile, "Data loading and filtering took %f seconds\n" %
(t1-t0))
outfile.write("\t".join(
......@@ -505,7 +506,7 @@ def generate_profiles(samples_in, outfile, reportfile, allelefile,
# Solve for the profiles of the true alleles.
if reportfile:
reportfile.write("Solving marker %s with n=%i, m=%i, k=%i\n" %
write_pipe(reportfile, "Solving marker %s with n=%i, m=%i, k=%i\n"%
(marker, p["true alleles"], profile_size,
len(p["profiles_forward"])))
t0 = time.time()
......@@ -518,7 +519,7 @@ def generate_profiles(samples_in, outfile, reportfile, allelefile,
reportfile=reportfile)
if reportfile:
t1 = time.time()
reportfile.write("Solved marker %s in %f seconds\n" %
write_pipe(reportfile, "Solved marker %s in %f seconds\n" %
(marker, t1-t0))
# Round to 3 digits to get rid of funny rounding effects.
......@@ -592,10 +593,21 @@ def run(args):
if not files:
raise ValueError("please specify an input file, or pipe in the output "
"of another program")
generate_profiles(files[0], files[1], args.report, args.allelelist,
args.annotation_column, args.min_pct, args.min_abs,
args.min_samples, args.min_sample_pct,
args.min_genotypes, args.sequence_format, args.library,
args.profiles, args.marker, args.homozygotes,
args.limit_reads, args.drop_samples)
try:
generate_profiles(files[0], files[1], args.report, args.allelelist,
args.annotation_column, args.min_pct, args.min_abs,
args.min_samples, args.min_sample_pct,
args.min_genotypes, args.sequence_format,
args.library, args.profiles, args.marker,
args.homozygotes, args.limit_reads,args.drop_samples)
except IOError as e:
if e.errno == EPIPE:
if args.report:
try:
write_pipe(args.report,
"Stopped early because the output was closed.\n")
except:
pass
return
raise
#run
#!/usr/bin/env python
#
# Copyright (C) 2016 Jerry Hoogenboom
# Copyright (C) 2017 Jerry Hoogenboom
#
# This file is part of FDSTools, data analysis tools for Next
# Generation Sequencing of forensic DNA markers.
......@@ -28,11 +28,12 @@ With this tool, separate data points are produced for each sample, which
can be visualised using "fdstools vis bgraw". Use bghomstats or
bgestimate to compute aggregate statistics on noise instead.
"""
from errno import EPIPE
from ..lib import pos_int_arg, add_input_output_args, get_input_output_files,\
add_allele_detection_args, parse_allelelist,\
get_sample_data, add_sequence_format_args
__version__ = "1.0.0"
__version__ = "1.0.1"
# Default values for parameters are specified below.
......@@ -67,7 +68,8 @@ def add_sample_data(data, sample_data, sample_alleles, min_pct, min_abs, tag):
(allele, marker, tag))
elif 0 in sample_data[marker, allele]:
raise ValueError(
"Allele %s of marker %s has 0 reads!" % (allele, marker))
"Allele %s of marker %s has 0 reads on one strand in "
"sample %s!" % (allele, marker, tag))
# Enter the read counts into data and check the thresholds.
for marker, sequence in sample_data:
......@@ -202,8 +204,13 @@ def run(args):
if not files:
raise ValueError("please specify an input file, or pipe in the output "
"of another program")
compute_ratios(files[0], files[1], args.allelelist, args.annotation_column,
args.min_pct, args.min_abs, args.min_samples,
args.min_sample_pct, args.sequence_format, args.library,
args.marker)
try:
compute_ratios(files[0], files[1], args.allelelist,
args.annotation_column, args.min_pct, args.min_abs,
args.min_samples, args.min_sample_pct,
args.sequence_format, args.library, args.marker)
except IOError as e:
if e.errno == EPIPE:
return
raise
#run
\ No newline at end of file
#!/usr/bin/env python
#
# Copyright (C) 2016 Jerry Hoogenboom
# Copyright (C) 2017 Jerry Hoogenboom
#
# This file is part of FDSTools, data analysis tools for Next
# Generation Sequencing of forensic DNA markers.
......@@ -31,12 +31,13 @@ samples are heterozygous (as is usually the case with forensic STR
markers), it is preferable to use bgestimate instead, since it can
handle heterozygous samples as well.
"""
from errno import EPIPE
from ..lib import pos_int_arg, add_input_output_args, get_input_output_files,\
add_allele_detection_args, parse_allelelist,\
get_sample_data, add_sequence_format_args, adjust_stats,\
add_random_subsampling_args
__version__ = "1.0.0"
__version__ = "1.0.1"
# Default values for parameters are specified below.
......@@ -61,16 +62,18 @@ _DEF_MIN_SAMPLES = 2
_DEF_MIN_SAMPLE_PCT = 80.
def add_sample_data(data, sample_data, sample_alleles, min_pct, min_abs):
def add_sample_data(data, sample_data, sample_alleles, min_pct, min_abs, tag):
# Check presence of all alleles.
for marker in sample_alleles:
allele = sample_alleles[marker]
if (marker, allele) not in sample_data:
raise ValueError(
"Missing allele %s of marker %s!" % (allele, marker))
"Missing allele %s of marker %s in sample %s!" %
(allele, marker, tag))
elif 0 in sample_data[marker, allele]:
raise ValueError(
"Allele %s of marker %s has 0 reads!" % (allele, marker))
"Allele %s of marker %s has 0 reads on one strand in "
"sample %s!" % (allele, marker, tag))
# Enter the read counts into data and check the thresholds.
for marker, sequence in sample_data:
......@@ -133,7 +136,7 @@ def compute_stats(samples_in, outfile, allelefile, annotation_column, min_pct,
lambda tag, sample_data: add_sample_data(
data, sample_data,
{m: allelelist[tag][m].pop() for m in allelelist[tag]},
min_pct, min_abs),
min_pct, min_abs, tag),
allelelist, annotation_column, seqformat, library, marker, True,
limit_reads, drop_samples, True)
......@@ -196,8 +199,14 @@ def run(args):
if not files:
raise ValueError("please specify an input file, or pipe in the output "
"of another program")
compute_stats(files[0], files[1], args.allelelist, args.annotation_column,
args.min_pct, args.min_abs, args.min_samples,
args.min_sample_pct, args.sequence_format, args.library,
args.marker, args.limit_reads, args.drop_samples)
try:
compute_stats(files[0], files[1], args.allelelist,
args.annotation_column, args.min_pct, args.min_abs,
args.min_samples, args.min_sample_pct,
args.sequence_format, args.library, args.marker,
args.limit_reads, args.drop_samples)
except IOError as e:
if e.errno == EPIPE:
return
raise
#run
#!/usr/bin/env python
#
# Copyright (C) 2016 Jerry Hoogenboom
# Copyright (C) 2017 Jerry Hoogenboom
#
# This file is part of FDSTools, data analysis tools for Next
# Generation Sequencing of forensic DNA markers.
......@@ -40,10 +40,11 @@ Example: fdstools bgpredict ... | fdstools bgmerge old.txt > out.txt
import argparse
import sys
from errno import EPIPE
from ..lib import load_profiles_new, ensure_sequence_format, glob_path,\
add_sequence_format_args
__version__ = "1.0.2"
__version__ = "1.0.3"
def merge_profiles(infiles, outfile, seqformat, library):
......@@ -97,5 +98,10 @@ def run(args):
raise ValueError("please specify at least two input files")
infiles.append("-")
merge_profiles(infiles, args.outfile, args.sequence_format, args.library)
try:
merge_profiles(infiles, args.outfile,args.sequence_format,args.library)
except IOError as e:
if e.errno == EPIPE:
return
raise
#run
#!/usr/bin/env python
#
# Copyright (C) 2016 Jerry Hoogenboom
# Copyright (C) 2017 Jerry Hoogenboom
#
# This file is part of FDSTools, data analysis tools for Next
# Generation Sequencing of forensic DNA markers.
......@@ -46,13 +46,14 @@ import argparse
import sys
#import numpy as np # Only imported when actually running this tool.
from errno import EPIPE
from operator import mul
from ..lib import get_column_ids, reverse_complement, get_repeat_pattern,\
mutate_sequence, SEQ_SPECIAL_VALUES,\
PAT_SEQ_RAW, ensure_sequence_format, add_sequence_format_args
__version__ = "1.0.1"
__version__ = "1.0.2"
# Default values for parameters are specified below.
......@@ -69,6 +70,8 @@ _DEF_MIN_R2 = 0.8
def parse_stuttermodel(stuttermodel, min_r2=0, use_all_data=False):
column_names = stuttermodel.readline().rstrip("\r\n").split("\t")
if column_names == [""]:
return {} # Empty file.
(colid_unit, colid_marker, colid_stutter, colid_lbound, colid_direction,
colid_r2) = get_column_ids(column_names, "unit", "marker", "stutter",
"lbound", "direction", "r2")
......@@ -228,6 +231,8 @@ def predict_profiles(stuttermodel, seqsfile, outfile, default_marker,
# Read list of sequences and compute stutter profiles for each.
column_names = seqsfile.readline().rstrip("\r\n").split("\t")
if column_names == [""]:
return # Empty file.
colid_sequence = get_column_ids(column_names, "sequence")
colid_marker = get_column_ids(column_names, "marker", optional=True)
for line in seqsfile:
......@@ -329,7 +334,12 @@ def run(args):
global np
import numpy as np
predict_profiles(args.stuttermodel, args.seqs, args.outfile, args.marker,
args.use_all_data, args.min_pct, args.min_r2,
args.sequence_format, args.library)
try:
predict_profiles(args.stuttermodel, args.seqs, args.outfile,
args.marker, args.use_all_data, args.min_pct,
args.min_r2, args.sequence_format, args.library)
except IOError as e:
if e.errno == EPIPE:
return
raise
#run
#!/usr/bin/env python
#
# Copyright (C) 2016 Jerry Hoogenboom
# Copyright (C) 2017 Jerry Hoogenboom
#
# This file is part of FDSTools, data analysis tools for Next
# Generation Sequencing of forensic DNA markers.
......@@ -29,22 +29,24 @@ provided list of known sequences.
"""
import argparse, sys