Commit 276a0439 authored by jhoogenboom's avatar jhoogenboom
Browse files

Introducing BGPredict

* New tool BGPredict predicts background noise profiles (containing
  only stutter products) for user-supplied alleles/sequences using
  a trained stutter model obtained from Stuttermodel. Currently
  only the amounts of the forward strand are predicted.
* New option -L/--min-lengths for Stuttermodel allows setting a
  minimum required number of unique repeat lengths on which to base
  the fits (default: 5).
* Updated formatting of output of Stuttermodel: added '+' sign to
  positive stutter, limited r2 scores to 3 decimal places, and now
  all coefficients are written in scientific notation with 3
  decimal places.
* The --output-column option of SeqConvert now defaults to using
  the value of --allele-column.
parent 818ddd2b
......@@ -46,6 +46,12 @@ DEF_TAG_FORMAT = "\\1"
# This is the default for the -o command line option with batch support.
DEF_OUTFILE_FORMAT = "\\1-\\2.out"
# IUPAC Table of complementary bases.
COMPL = {"A": "T", "T": "A", "U": "A", "G": "C", "C": "G", "R": "Y", "Y": "R",
         "K": "M", "M": "K", "B": "V", "V": "B", "D": "H", "H": "D",
         # Lowercase codes map to lowercase complements.
         "a": "t", "t": "a", "u": "a", "g": "c", "c": "g", "r": "y", "y": "r",
         "k": "m", "m": "k", "b": "v", "v": "b", "d": "h", "h": "d"}
def call_variants(template, sequence, reverse_indices=False, cache=True,
debug=False):
......@@ -794,11 +800,7 @@ def ensure_sequence_format(seq, to_format, from_format=None, library=None,
def reverse_complement(sequence):
    """Return the reverse complement of the given DNA sequence."""
    # Look each base up in the module-level IUPAC complement table COMPL;
    # characters without a complement are passed through unchanged.
    return "".join(COMPL.get(x, x) for x in reversed(sequence))
#reverse_complement
......
......@@ -56,7 +56,6 @@ def get_sample_data(infile, convert_to_raw=False, library=None):
def match_profile(column_names, data, profile, convert_to_raw, library,
marker):
import numpy as np
(colid_name, colid_allele, colid_forward, colid_reverse, colid_total,
colid_forward_noise, colid_reverse_noise, colid_total_noise,
colid_forward_add, colid_reverse_add, colid_total_add) = get_column_ids(
......@@ -172,6 +171,10 @@ def add_arguments(parser):
def run(args):
# Import numpy now.
import numpy as np
global np
gen = get_input_output_files(args, True, True)
if not gen:
raise ValueError("please specify an input file, or pipe in the output "
......
......@@ -81,7 +81,6 @@ def solve_profile_mixture_single(samples, genotypes, n, variance=False,
If reportfile is a writable handle, diagnostic/progress information
is written to it.
"""
import numpy as np
num_samples = len(samples)
profile_size = len(samples[0])
......@@ -526,6 +525,10 @@ def add_arguments(parser):
def run(args):
# Import numpy now.
import numpy as np
global np
files = get_input_output_files(args)
if not files:
raise ValueError("please specify an input file, or pipe in the output "
......
#!/usr/bin/env python
"""
Predict background profiles of new alleles based on a model of stutter
occurrence obtained from stuttermodel.
"""
import argparse
import sys
#import numpy as np  # Only imported when actually running this tool.

from operator import mul

from ..lib import get_column_ids, parse_library, reverse_complement,\
                  get_repeat_pattern, mutate_sequence,\
                  PAT_SEQ_RAW, ensure_sequence_format, add_sequence_format_args
__version__ = "0.1dev"


# Default values for parameters are specified below.

# Default name of the column that contains the marker name.
# This value can be overridden by the -m command line option.
_DEF_COLNAME_MARKER = "name"

# Default name of the column that contains the allele.
# This value can be overridden by the -a command line option.
_DEF_COLNAME_ALLELE = "allele"

# Default name of the column to write the output to.
# NOTE(review): this tool defines no -o/--output-column option and this
# constant is not referenced below; presumably copied from SeqConvert.
_DEF_COLNAME_ALLELE_OUT = "allele"

# Default minimum amount of background to consider, as a percentage of
# the highest allele.
# This value can be overridden by the -n command line option.
_DEF_THRESHOLD_PCT = 0.5

# Default minimum R2 score.
# This value can be overridden by the -t command line option.
_DEF_MIN_R2 = 0.8
def parse_stuttermodel(stuttermodel, min_r2=0):
    """
    Parse an open stutter model file into a nested dict.

    Returns {marker: {(unit_seq, stutter_fold): fit}}, where fit is a
    dict with keys "lbound", "r2", "pat" (compiled repeat pattern) and
    "func" (maps a repeat length to the predicted stutter amount).
    Rows whose r-squared score is below min_r2 are skipped.  Raises
    ValueError if the file is malformed.
    """
    column_names = stuttermodel.readline().rstrip("\r\n").split("\t")
    (colid_unit, colid_marker, colid_stutter, colid_lbound, colid_direction,
     colid_r2) = get_column_ids(column_names, "unit", "marker", "stutter",
        "lbound", "direction", "r2")

    # The polynomial degree follows from the number of coefficient
    # columns, which are named "a", "b", "c", ...
    degree = ord('a')
    colids_coefs = []
    while True:
        try:
            colids_coefs.append(get_column_ids(column_names, chr(degree)))
            degree += 1
        except ValueError:
            break
    degree -= ord('b')  # Number of coefficient columns minus one.
    if degree < 0:
        raise ValueError("Invalid stutter model file: Unable to determine "
                         "polynomial degree!")

    repeat_patterns = {}
    model = {}
    for line in stuttermodel:
        line = line.rstrip("\r\n").split("\t")
        marker = line[colid_marker]
        seq = line[colid_unit]
        stutter_fold = int(line[colid_stutter])
        direction = line[colid_direction]
        lbound = int(line[colid_lbound])
        r2 = float(line[colid_r2])
        if r2 < min_r2:
            continue  # Fit quality below threshold.
        coefs = [float(line[colid_coef]) for colid_coef in colids_coefs]
        if marker not in model:
            model[marker] = {}
        if not seq or not PAT_SEQ_RAW.match(seq):
            raise ValueError(
                "Invalid stutter model file: Encountered invalid repeat "
                "sequence '%s'!" % seq)
        if direction == "reverse":
            # Store reverse-strand fits against the forward strand unit.
            seq = reverse_complement(seq)
        elif direction != "forward":
            raise ValueError(
                "Invalid stutter model file: Unknown sequence strand '%s'!" %
                direction)
        if (seq, stutter_fold) in model[marker]:
            raise ValueError(
                "Invalid stutter model file: Encountered two models for %+i "
                "stutter of %s repeats in marker %s!" %
                (stutter_fold, seq, marker))
        if seq not in repeat_patterns:
            repeat_patterns[seq] = get_repeat_pattern(seq)
        model[marker][seq, stutter_fold] = {
            "lbound": lbound,
            "r2": r2,
            "pat": repeat_patterns[seq],
            # BUGFIX: bind lbound and coefs as default arguments.  A plain
            # closure binds them late, so every stored "func" would
            # evaluate the polynomial of the LAST row parsed.  (degree is
            # loop-invariant, so closing over it is safe.)
            "func": lambda x, lbound=lbound, coefs=coefs:
                0. if x < lbound else max(0.,
                    sum(coefs[i] * x**(degree-i) for i in range(len(coefs))))}
    return model
#parse_stuttermodel
def get_all_stutters(allele, flanks, model, min_pct):
    """
    Return a sorted list of all possible stutters.

    allele is the raw allele sequence; flanks is a [left, right] pair of
    flanking sequences; model is one marker's {(unit_seq, fold): fit}
    dict from parse_stuttermodel; stutters whose predicted amount is
    below min_pct (percent) are dropped.  Each returned dict has keys
    "seq", "stutlen", "start", "end" (positions relative to the allele,
    clamped to its boundaries), "fold" and "amount" (as a fraction).
    """
    # Include flanks in case the allele starts in a repeat.
    full_allele = flanks[0] + allele + flanks[1]
    stutters = []
    for seq, stutter_fold in model:
        stutlen = len(seq) * abs(stutter_fold)
        # Generate all stutter variants for this sequence.
        for m in model[seq, stutter_fold]["pat"].finditer(full_allele):
            start = m.start()
            end = m.end()
            length = end - start
            if length <= stutlen:
                continue  # Not repeated.
            if stutter_fold > 0:
                if (end < len(flanks[0]) or
                        start > len(full_allele)-len(flanks[1])):
                    continue  # Repeat is embedded in flank.
            elif (len(flanks[0]) > end-stutlen or
                    start+stutlen > len(full_allele)-len(flanks[1])):
                continue  # Shortening repeat disrupts flank.
            amount = model[seq, stutter_fold]["func"](length)
            if amount < min_pct:
                continue  # Predicted amount below threshold.
            stutters.append({
                "seq": seq,
                "stutlen": stutlen,
                # BUGFIX: was min(0, ...), which pinned every start at <= 0
                # and made get_all_combinations reject all multi-stutter
                # combinations (its overlap test is start < pos).  Clamp to
                # the allele boundary like "end" does below.
                "start": max(0, start-len(flanks[0])),
                "end": min(len(full_allele)-len(flanks[1]), end)
                       - len(flanks[0]),
                "fold": stutter_fold,
                "amount": amount/100.})
    return sorted(stutters, key=lambda x: (x["start"], x["end"]))
#get_all_stutters
def get_all_combinations(stutters, combinations=None, appendto=None, pos=0,
                         start=0):
    """Return a list of all non-overlapping combinations of stutters."""
    # Depth-first enumeration: each stutter that begins at or after pos
    # extends the partial combination appendto; every extension is both
    # recorded and recursively extended with later stutters.
    if combinations is None:
        combinations = []
    if appendto is None:
        appendto = []
    for index in range(start, len(stutters)):
        candidate = stutters[index]
        if candidate["start"] < pos:
            continue  # Overlaps the previously chosen stutter.
        extended = appendto + [candidate]
        combinations.append(extended)
        get_all_combinations(stutters, combinations, extended,
                             candidate["end"], index + 1)
    return combinations
#get_all_combinations
def get_relative_frequencies(stutters, combinations):
    # Iteratively estimate the relative frequency of each stutter
    # combination so that, for every stutter, the summed frequencies of
    # all combinations containing it match that stutter's expected
    # amount.  NOTE: Python 2 code (xrange, builtin reduce); np is
    # installed as a module global by run().
    # Compute expected amount of each combination.
    A = np.array([reduce(mul, (s["amount"] for s in combo))
                  for combo in combinations])
    # C[i][j] is True iff stutter i is part of combination j.
    C = np.array([[s in c for c in combinations] for s in stutters])
    for iterations in xrange(1000):  #TODO: get good stopcond
        for i in range(len(stutters)):
            # Rescale the combinations containing stutter i so that their
            # total equals this stutter's expected amount.
            A[C[i]] *= stutters[i]["amount"] / A[C[i]].sum()
    return A.tolist()
#get_relative_frequencies
def predict_profiles(stuttermodel, seqsfile, outfile,
                     marker_column, allele_column, default_marker,
                     crosstab, min_pct, min_r2, library):
    """
    Predict background noise profiles for the sequences in seqsfile.

    stuttermodel is an open stutter model file from Stuttermodel;
    seqsfile is an open tab-separated file containing the input alleles;
    output is written to outfile, cross-tabulated if crosstab is true.
    marker_column/allele_column name the input columns, default_marker
    is used when the marker column is absent.  Stutters below min_pct
    percent and model fits below min_r2 are ignored.
    """
    # Parse library and stutter model file.
    library = parse_library(library) if library is not None else None
    model = parse_stuttermodel(stuttermodel, min_r2)

    # Read list of sequences, grouped per marker.
    seqlist = {}
    column_names = seqsfile.readline().rstrip("\r\n").split("\t")
    # BUGFIX: honour -a/--allele-column and -m/--marker-column; these
    # were previously hard-coded as "allele" and "name".
    colid_allele = get_column_ids(column_names, allele_column)
    try:
        colid_name = get_column_ids(column_names, marker_column)
    except ValueError:
        # No marker column present; fall back to -M/--marker.
        colid_name = None
    for line in seqsfile:
        line = line.rstrip("\r\n").split("\t")
        marker = line[colid_name] if colid_name is not None else default_marker
        if marker not in model:
            continue  # No stutter model available for this marker.
        sequence = ensure_sequence_format(line[colid_allele], "raw", library)
        try:
            seqlist[marker].append(sequence)
        except KeyError:
            seqlist[marker] = [sequence]

    if not crosstab:
        outfile.write("\t".join(
            ["marker", "allele", "sequence", "fmean", "rmean"]) + "\n")
    for marker in seqlist:
        # Start from an identity profile: each true allele contributes
        # 100% of itself and, initially, nothing else.
        p = {
            "true alleles": len(seqlist[marker]),
            "alleles": seqlist[marker],
            "profiles_forward":
                [[100 if i == j else 0
                  for i in range(len(seqlist[marker]))]
                 for j in range(len(seqlist[marker]))],
            "profiles_reverse":
                [[100 if i == j else 0
                  for i in range(len(seqlist[marker]))]
                 for j in range(len(seqlist[marker]))]}
        if library and "flanks" in library and marker in library["flanks"]:
            flanks = library["flanks"][marker]
        else:
            flanks = ["", ""]
        for ai in range(len(seqlist[marker])):
            # TODO: repeat this part for reverse
            allele = seqlist[marker][ai]
            stutters = get_all_stutters(allele, flanks, model[marker],
                                        min_pct)
            combinations = get_all_combinations(stutters)
            frequencies = get_relative_frequencies(stutters, combinations)
            for i in range(len(combinations)):
                freq = frequencies[i] * 100.
                if freq < min_pct:
                    continue  # Expected amount below threshold.
                # Apply every stutter in this combination to the allele:
                # insertions for forward stutter, deletions for reverse.
                sequence = mutate_sequence(allele, [
                    "%+i.1->%s" %
                        (s["end"], allele[s["end"]-s["stutlen"]:s["end"]])
                    if s["fold"] > 0 else
                    "%+i%s>-" % (s["end"]-s["stutlen"]+1,
                                 allele[s["end"]-s["stutlen"]:s["end"]])
                    for s in combinations[i]])
                try:
                    si = p["alleles"].index(sequence)
                except ValueError:
                    # New stutter product: add a column for it.
                    p["alleles"].append(sequence)
                    for profile in p["profiles_forward"]:
                        profile.append(0)
                    for profile in p["profiles_reverse"]:
                        profile.append(0)
                    si = -1
                p["profiles_forward"][ai][si] = freq
        if crosstab:
            # Cross-tabular output (profiles in rows).
            outfile.write("\t".join([marker, "0"] + p["alleles"]) + "\n")
            for i in range(p["true alleles"]):
                outfile.write("\t".join(
                    [marker, str(i+1)] +
                    map(str, p["profiles_forward"][i])) + "\n")
                outfile.write("\t".join(
                    [marker, str(-i-1)] +
                    map(str, p["profiles_reverse"][i])) + "\n")
        else:
            # Tab-separated columns format.
            for i in range(p["true alleles"]):
                for j in range(len(p["alleles"])):
                    if not (p["profiles_forward"][i][j] +
                            p["profiles_reverse"][i][j]):
                        continue  # Zero on both strands: skip.
                    outfile.write("\t".join(
                        [marker, p["alleles"][i], p["alleles"][j]] +
                        map(str, [p["profiles_forward"][i][j],
                                  p["profiles_reverse"][i][j]])) + "\n")
#predict_profiles
def add_arguments(parser):
    """Register this tool's command line arguments on the given parser."""
    # TODO: Add something to enable using the "All data" fits if needed.
    parser.add_argument('stuttermodel', metavar="STUT",
        type=argparse.FileType("r"),
        help="file containing a trained stutter model")
    parser.add_argument('seqs', metavar="SEQS",
        type=argparse.FileType("r"),
        help="file containing the sequences for which a profile should be "
             "predicted")
    parser.add_argument('outfile', metavar="OUT", nargs="?",
        default=sys.stdout, type=argparse.FileType("w"),
        help="the file to write the output to (default: write to stdout)")
    parser.add_argument('-m', '--marker-column', metavar="COLNAME",
        default=_DEF_COLNAME_MARKER,
        help="name of the column that contains the marker name "
             "(default: '%(default)s')")
    parser.add_argument('-a', '--allele-column', metavar="COLNAME",
        default=_DEF_COLNAME_ALLELE,
        help="name of the column that contains the allele "
             "(default: '%(default)s')")
    parser.add_argument('-M', '--marker', metavar="MARKER",
        help="assume the specified marker for all sequences")
    parser.add_argument('-C', '--cross-tabular', action="store_true",
        help="if specified, a space-efficient cross-tabular output format is "
             "used instead of the default tab-separated columns format")
    group = parser.add_argument_group("filtering options")
    group.add_argument('-n', '--min-pct', metavar="PCT", type=float,
        default=_DEF_THRESHOLD_PCT,
        help="minimum amount of background to consider, as a percentage "
             "of the highest allele (default: %4.2f)" % _DEF_THRESHOLD_PCT)
    group.add_argument('-t', '--min-r2', type=float,
        default=_DEF_MIN_R2, metavar="N",
        help="minimum required r-squared score (default: %(default)s)")
    add_sequence_format_args(parser, "raw", True)  # Force raw seqs.
#add_arguments
def run(args):
    """Run the tool with the parsed command line arguments."""
    # Import numpy only when the tool actually runs, publishing it as a
    # module-level global for the other functions in this module.
    # BUGFIX: the global statement must precede the import that binds np;
    # the original order is a SyntaxError on Python 3 (and a
    # SyntaxWarning on Python 2).
    global np
    import numpy as np
    predict_profiles(args.stuttermodel, args.seqs, args.outfile,
                     args.marker_column, args.allele_column, args.marker,
                     args.cross_tabular, args.min_pct, args.min_r2,
                     args.library)
#run
def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(description=__doc__)
    try:
        add_arguments(parser)
        run(parser.parse_args())
    except OSError as error:
        # Report I/O problems (e.g. unreadable input files) via argparse.
        parser.error(error)
#main
# Allow running this tool directly as a script.
if __name__ == "__main__":
    main()
......@@ -61,7 +61,6 @@ def add_sample_data(data, sample_data, sample_tag, alleles):
def blame(samples_in, outfile, allelefile, annotation_column, mode,
profilefile, num, seqformat, libfile, marker):
import numpy as np
library = parse_library(libfile) if libfile else None
allelelist = {} if allelefile is None \
else parse_allelelist(allelefile, "raw", library)
......@@ -149,6 +148,10 @@ def add_arguments(parser):
def run(args):
# Import numpy now.
import numpy as np
global np
files = get_input_output_files(args)
if not files:
raise ValueError("please specify an input file, or pipe in the output "
......
......@@ -46,16 +46,14 @@ _DEF_COLNAME_MARKER = "name"
# This value can be overridden by the -a command line option.
_DEF_COLNAME_ALLELE = "allele"
# Default name of the column to write the output to.
# This value can be overridden by the -o command line option.
_DEF_COLNAME_ALLELE_OUT = "allele"
def convert_sequences(infile, outfile, to_format, library=None,
fixed_marker=None, colname_marker=_DEF_COLNAME_MARKER,
colname_allele=_DEF_COLNAME_ALLELE,
colname_allele_out=_DEF_COLNAME_ALLELE_OUT,
colname_allele_out=None,
library2=None, revcomp_markers=[]):
if colname_allele_out is None:
colname_allele_out = colname_allele
column_names = infile.readline().rstrip("\r\n").split("\t")
colid_allele = get_column_ids(column_names, colname_allele)
if library is None:
......@@ -107,9 +105,8 @@ def add_arguments(parser):
help="name of the column that contains the allele "
"(default: '%(default)s')")
parser.add_argument('-c', '--output-column', metavar="COLNAME",
default=_DEF_COLNAME_ALLELE_OUT,
help="name of the column to write the output to "
"(default: '%(default)s')")
"(default: same as --allele-column')")
parser.add_argument('-M', '--marker', metavar="MARKER",
help="assume the specified marker for all sequences")
parser.add_argument('-l', '--library', metavar="LIBRARY",
......
......@@ -30,7 +30,10 @@ _DEF_THRESHOLD_ABS = 1
# Default minimum number of samples for each true allele.
# This value can be overridden by the -s command line option.
_DEF_MIN_SAMPLES = 1
# TODO: add option to require minimum number of alleles (or lengths)
# Default minimum number of different repeat lengths per fit.
# This value can be overridden by the -L command line option.
_DEF_MIN_LENGTHS = 5
# Default degree of polynomials to fit.
# This value can be overridden by the -D command line option.
......@@ -84,7 +87,6 @@ def get_unique_repeats(maxlen, out=None, prefix=""):
def compute_fit(lengths, observed_amounts, degree):
import numpy as np
fit = np.polyfit(lengths, observed_amounts, degree, None, True)
if fit[2] == degree + 1:
return fit[0]
......@@ -93,7 +95,6 @@ def compute_fit(lengths, observed_amounts, degree):
def test_fit(fit, lengths, observed_amounts):
import numpy as np
p = np.poly1d(fit)
pp = p.deriv()
max_x = lengths.max()
......@@ -119,11 +120,11 @@ def test_fit(fit, lengths, observed_amounts):
def print_fit(outfile, fit, lengths, seq, marker, stutter_fold, direction,
lower_bound, r2):
# TODO: scientific, 3 digits
if lower_bound < max(lengths):
outfile.write("\t".join(map(str,
[seq, marker, stutter_fold, lower_bound, min(lengths),
max(lengths), direction, r2] + fit.tolist())) + "\n")
outfile.write("%s\t%s\t%+i\t%i\t%i\t%i\t%s\t%0.3f\t" %
(seq, marker, stutter_fold, lower_bound, min(lengths),
max(lengths), direction, r2))
outfile.write("\t".join("%.3e" % x for x in fit.tolist()) + "\n")
#print_fit
......@@ -179,8 +180,8 @@ def filter_data(data, min_samples):
def fit_stutter(samples_in, outfile, allelefile, annotation_column, min_pct,
min_abs, min_samples, library, min_r2, degree, same_shape,
ignore_zeros, max_unit_length, raw_outfile, marker,
min_abs, min_lengths, min_samples, library, min_r2, degree,
same_shape, ignore_zeros, max_unit_length, raw_outfile, marker,
limit_reads, drop_samples):
# Parse library and allele list.
......@@ -221,8 +222,8 @@ def fit_stutter(samples_in, outfile, allelefile, annotation_column, min_pct,
stutter_fold = -1
while True:
if fit_stutter_model(outfile, raw_outfile, data, library, seq,
patterns[seq], min_r2, degree, same_shape, ignore_zeros,
stutter_fold):
patterns[seq], min_r2, min_lengths, degree, same_shape,
ignore_zeros, stutter_fold):
stutter_fold += 1 if stutter_fold > 0 else -1
elif stutter_fold < 0:
stutter_fold = 1
......@@ -232,8 +233,8 @@ def fit_stutter(samples_in, outfile, allelefile, annotation_column, min_pct,
def fit_stutter_model(outfile, raw_outfile, data, library, seq, patterns,
min_r2, degree, same_shape, ignore_zeros, stutter_fold):
import numpy as np
min_r2, min_lengths, degree, same_shape, ignore_zeros,
stutter_fold):
success = False
stutlen = len(seq) * abs(stutter_fold)
......@@ -286,12 +287,12 @@ def fit_stutter_model(outfile, raw_outfile, data, library, seq, patterns,
# should be included in the analysis but it is not.
if stutter_fold > 0:
variant = "%+i.1->%s" % (
position + stutlen - len(flanks[0]),
full_allele[position:position+stutlen])
end - len(flanks[0]),
full_allele[position:end])
else:
variant = "%+i%s>-" % (
position + 1 - len(flanks[0]),
full_allele[position:position+stutlen])
full_allele[position:end])
for sample in data["alleles"][marker][allele]:
amount = [0., 0.] # Reads per 100 reads of allele.
for sequence in data["samples"][sample, marker]:
......@@ -325,6 +326,8 @@ def fit_stutter_model(outfile, raw_outfile, data, library, seq, patterns,
# Compute per-marker fit for this marker.
if raw_outfile == outfile or same_shape:
continue
if len(set(lengths)) < min_lengths:
continue
lengths = np.array(lengths)
for i in (0, 1):
if not observed_amounts[:, i].any():
......@@ -334,6 +337,8 @@ def fit_stutter_model(outfile, raw_outfile, data, library, seq, patterns,
this_lengths = lengths[observed_amounts[:, i] > 0]
this_amounts = observed_amounts[
observed_amounts[:, i] > 0, i]
if len(set(this_lengths)) < min_lengths:
continue
else:
this_lengths = lengths
this_amounts = observed_amounts[:, i]
......@@ -370,6 +375,8 @@ def fit_stutter_model(outfile, raw_outfile, data, library, seq, patterns,
all_observed_amounts = np.array(all_observed_amounts)
if raw_outfile == outfile or not all_observed_amounts.any():
return success
if len(set(all_lengths)) < min_lengths:
return success
all_lengths = np.array(all_lengths)
if same_shape:
markers = np.array(markers)
......@@ -382,6 +389,8 @@ def fit_stutter_model(outfile, raw_outfile, data, library, seq, patterns,
this_lengths = all_lengths[all_observed_amounts[:, i] > 0]
this_amounts = all_observed_amounts[
all_observed_amounts[:, i] > 0, i]
if len(set(this_lengths)) < min_lengths:
continue
if same_shape:
this_markers = []
this_from_markers = []
......@@ -448,6 +457,10 @@ def add_arguments(parser):
default=_DEF_THRESHOLD_ABS,
help="minimum amount of background to consider, as an absolute "
"number of reads (default: %(default)s)")
filtergroup.add_argument('-L', '--min-lengths', metavar="N",
type=pos_int_arg, default=_DEF_MIN_LENGTHS,
help="require this minimum number of unique repeat lengths "
"(default: %(default)s)")
filtergroup.add_argument('-s', '--min-samples', metavar="N",
type=pos_int_arg, default=_DEF_MIN_SAMPLES,
help="require this minimum number of samples for each true allele "
......@@ -481,15 +494,19 @@ def add_arguments(parser):
def run(args):
# Import numpy now.
import numpy as np
global np
files = get_input_output_files(args)
if not files:
raise ValueError("please specify an input file, or pipe in the output "
"of another program")
fit_stutter(files[0], files[1], args.allelelist, args.annotation_column,
args.min_pct, args.min_abs, args.min_samples, args.library,
args.min_r2, args.degree, args.same_shape, args.ignore_zeros,
args.max_unit_length, args.raw_outfile, args.marker,
args.limit_reads, args.drop_samples)
args.min_pct, args.min_abs, args.min_lengths, args.min_samples,