Commit bfce7a04 authored by jhoogenboom's avatar jhoogenboom
Browse files

Introducing bgcorrect, and changed bgestimate output format

* Added BGCorrect tool for filtering noise in case samples.
* BGEstimate now writes its output in tab-separated format, instead
  of JSON.
* Small changes to help output formatting.
parent be745e64
......@@ -39,8 +39,7 @@ def main():
"""
Main entry point.
"""
parser = argparse.ArgumentParser(add_help=False, description=usage[0],
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser = argparse.ArgumentParser(add_help=False, description=usage[0])
parser.version = version(parser.prog)
parser.add_argument('-h', '--help', action=_HelpAction,
default=argparse.SUPPRESS, nargs=argparse.REMAINDER,
......
......@@ -374,6 +374,81 @@ def parse_library_ini(handle):
#parse_library_ini
def load_profiles(profilefile, library=None):
    """
    Parse a tab-separated background noise profiles file into a dict.

    Each line has at least 3 columns: marker name, an allele number, and
    one value per profiled sequence.  Allele number 0 marks the header
    line holding the sequences themselves; positive numbers are
    forward-strand profiles and negative numbers reverse-strand profiles
    of allele abs(number).

    profilefile -- iterable of lines (e.g. an open file handle).
    library     -- optional library passed to ensure_sequence_format.

    Returns {marker: {"m": <#sequences>, "n": <#true alleles>,
                      "seqs": [...], "forward": [...], "reverse": [...]}}.
    Raises ValueError on malformed or incomplete input.
    """
    profiles = {}

    # Read the profile file without assuming it is sorted.
    for line in profilefile:
        line = line.rstrip("\r\n").split("\t")
        if line == [""]:
            continue
        if len(line) < 3:
            raise ValueError(
                "Invalid background noise profiles file: encountered line "
                "with %i columns, need at least 3" % len(line))
        marker = line[0]
        try:
            num = int(line[1])
        except ValueError:
            raise ValueError(
                "Invalid background noise profiles file: encountered "
                "non-number '%s' in the second column" % line[1])
        values = line[2:]
        if marker not in profiles:
            profiles[marker] = {
                "m": len(values),  # number of profiled sequences
                "n": abs(num),     # highest allele number seen so far
                "seqs": [],
                "forward": {},     # allele number -> profile values
                "reverse": {}
            }
        elif len(values) != profiles[marker]["m"]:
            # Bug fix: a stray '%' inside this message previously made
            # the %-formatting itself raise ValueError, hiding the
            # intended error text.
            raise ValueError(
                "Invalid background noise profiles file: profiles of "
                "marker '%s' have inconsistent lengths" % marker)
        profiles[marker]["n"] = max(abs(num), profiles[marker]["n"])
        if num == 0:
            # Header line: the sequences that the profile columns refer to.
            if profiles[marker]["seqs"]:
                raise ValueError(
                    "Invalid background noise profiles file: encountered "
                    "multiple header lines for marker '%s'" % marker)
            profiles[marker]["seqs"] = [
                ensure_sequence_format(
                    seq, "raw", library=library, marker=marker)
                for seq in values]
        else:
            direction = "forward" if num > 0 else "reverse"
            if abs(num) in profiles[marker][direction]:
                raise ValueError(
                    "Invalid background noise profiles file: encountered "
                    "multiple %s profiles for marker '%s' allele %i" %
                    (direction, marker, abs(num)))
            profiles[marker][direction][abs(num)] = [
                float(value) for value in values]

    # Check completeness and reorder true alleles.
    for marker in profiles:
        newprofiles = {"forward": [], "reverse": []}
        if not profiles[marker]["seqs"]:
            raise ValueError(
                "Invalid background noise profiles file: missing header "
                "line for marker '%s'" % marker)
        for i in range(1, profiles[marker]["n"] + 1):
            for direction in newprofiles:
                if i not in profiles[marker][direction]:
                    raise ValueError(
                        "Invalid background noise profiles file: missing "
                        "%s profile for marker '%s' allele %i" %
                        (direction, marker, i))
                newprofiles[direction].append(profiles[marker][direction][i])
        profiles[marker]["forward"] = newprofiles["forward"]
        profiles[marker]["reverse"] = newprofiles["reverse"]
    return profiles
#load_profiles
def detect_sequence_format(seq):
"""Return format of seq. One of 'raw', 'tssv', or 'allelename'."""
if not seq:
......
......@@ -181,11 +181,11 @@ def add_arguments(parser):
default=_DEF_MIN_READS,
help="require at least this number of reads for the highest allele "
"(default: %(default)s)")
parser.add_argument('-m', '--min-allele-pct', metavar="N", type=float,
parser.add_argument('-m', '--min-allele-pct', metavar="PCT", type=float,
default=_DEF_MIN_ALLELE_PCT,
help="call heterozygous if the second allele is at least this "
"percentage of the highest allele (default: %(default)s)")
parser.add_argument('-M', '--max-noise-pct', metavar="N", type=float,
parser.add_argument('-M', '--max-noise-pct', metavar="PCT", type=float,
default=_DEF_MAX_NOISE_PCT,
help="a sample is considered contaminated/unsuitable for a marker if "
"the highest non-allelic sequence is at least this percentage of "
......
#!/usr/bin/env python
"""
Match background noise profiles to samples.
"""
import argparse
import sys
#import numpy as np # Only imported when actually running this tool.
from ..lib import parse_library, load_profiles, ensure_sequence_format, nnls, \
get_column_ids
__version__ = "0.1dev"
def get_sample_data(infile, convert_to_raw=False, library=None):
    """
    Read data from the given file handle, corresponding to a single
    sample, and fill a dict with all sequences in the sample.

    Returns (column_names, data) where column_names is the input header
    extended with the six noise/add columns this tool fills in, and data
    maps each marker name to its list of rows.
    """
    # Columns that match_profile will fill in later; appended to the
    # header before computing column indices.
    extra_columns = ("forward_noise", "reverse_noise", "total_noise",
                     "forward_add", "reverse_add", "total_add")
    column_names = infile.readline().rstrip("\r\n").split("\t")
    column_names.extend(extra_columns)
    colid_name, colid_allele, colid_forward, colid_reverse = get_column_ids(
        column_names, "name", "allele", "forward", "reverse")

    # Group the remaining rows by marker name.
    data = {}
    for row in infile:
        fields = row.rstrip("\r\n").split("\t")
        marker = fields[colid_name]
        if convert_to_raw:
            fields[colid_allele] = ensure_sequence_format(
                fields[colid_allele], "raw", library=library, marker=marker)
        fields[colid_forward] = int(fields[colid_forward])
        fields[colid_reverse] = int(fields[colid_reverse])
        # Zero placeholders for the six extra columns added above.
        fields.extend([0] * len(extra_columns))
        data.setdefault(marker, []).append(fields)
    return column_names, data
#get_sample_data
def match_profile(column_names, data, profile, convert_to_raw, library,
        marker):
    """
    Estimate, for one marker, how many reads of each sequence in the
    sample are background noise, filling the *_noise and *_add columns
    of each row of data in place.

    column_names   -- header names matching the layout of rows in data.
    data           -- list of rows for this marker; modified in place, and
                      rows are appended for sequences that are in the
                      profile but absent from the sample.
    profile        -- one marker's dict as produced by load_profiles.
    convert_to_raw -- if true, alleles are converted to raw format before
                      being looked up in the profile sequences.
    library/marker -- passed through to ensure_sequence_format.
    """
    import numpy as np  # Deferred: numpy is only needed when this tool runs.
    (colid_name, colid_allele, colid_forward, colid_reverse, colid_total,
        colid_forward_noise, colid_reverse_noise, colid_total_noise,
        colid_forward_add, colid_reverse_add, colid_total_add) = get_column_ids(
        column_names, "name", "allele", "forward", "reverse", "total",
        "forward_noise", "reverse_noise", "total_noise", "forward_add",
        "reverse_add", "total_add")

    # Enter profiles into P.
    # Rows of P1/P2 correspond to true alleles, columns to
    # profile["seqs"] (forward resp. reverse strand).
    P1 = np.matrix(profile["forward"])
    P2 = np.matrix(profile["reverse"])

    # Enter sample into C.
    # C1/C2 hold the sample's forward/reverse read counts in the same
    # column order as profile["seqs"]; seqs remembers the (possibly
    # converted) allele of every row, in row order.
    seqs = []
    C1 = np.matrix(np.zeros([1, profile["m"]]))
    C2 = np.matrix(np.zeros([1, profile["m"]]))
    for line in data:
        if convert_to_raw:
            allele = ensure_sequence_format(line[colid_allele], "raw",
                library=library, marker=marker)
        else:
            allele = line[colid_allele]
        seqs.append(allele)
        try:
            i = profile["seqs"].index(allele)
        except ValueError:
            # Note: Not adding any new sequences to the profile, since
            # they will just be zeroes and have no effect on the result.
            continue
        C1[0, i] = line[colid_forward]
        C2[0, i] = line[colid_reverse]

    # Compute corrected read counts.
    # A estimates per-allele abundances such that A.T*[P1 P2] fits
    # [C1 C2]; nnls comes from ..lib (presumably non-negative least
    # squares -- confirm there).
    A = nnls(np.hstack([P1, P2]).T, np.hstack([C1, C2]).T)
    # Zero the diagonals so a true allele's own reads are not counted
    # as noise of itself in the products below.
    np.fill_diagonal(P1, 0)
    np.fill_diagonal(P2, 0)
    forward_noise = A.T * P1
    reverse_noise = A.T * P2
    # *_add: reads each true allele is expected to have lost to its own
    # noise products (its abundance times its profile's row sum).
    forward_add = np.multiply(A, P1.sum(1)).T
    reverse_add = np.multiply(A, P2.sum(1)).T

    # Write the estimates back into the existing sample rows; seqs[j-1]
    # is the allele computed for this row in the first loop.
    j = 0
    for line in data:
        j += 1
        try:
            i = profile["seqs"].index(seqs[j-1])
        except ValueError:
            continue
        line[colid_forward_noise] = forward_noise[0, i]
        line[colid_reverse_noise] = reverse_noise[0, i]
        line[colid_total_noise] = forward_noise[0, i] + reverse_noise[0, i]
        if i < profile["n"]:
            # Only the first n sequences are true alleles with profiles.
            line[colid_forward_add] = forward_add[0, i]
            line[colid_reverse_add] = reverse_add[0, i]
            line[colid_total_add] = forward_add[0, i] + reverse_add[0, i]

    # Add sequences that are in the profile but not in the sample.
    for i in range(profile["m"]):
        if profile["seqs"][i] in seqs:
            continue
        amount = forward_noise[0, i] + reverse_noise[0, i]
        if i < profile["n"]:
            amount += forward_add[0, i] + reverse_add[0, i]
        if amount > 0:
            # Synthesise a row with zero observed reads but nonzero
            # expected noise so it appears in the output.
            line = [""] * len(column_names)
            line[colid_name] = marker
            line[colid_allele] = profile["seqs"][i]
            line[colid_forward] = 0
            line[colid_reverse] = 0
            line[colid_total] = 0
            line[colid_forward_noise] = forward_noise[0, i]
            line[colid_reverse_noise] = reverse_noise[0, i]
            line[colid_total_noise] = forward_noise[0, i] + reverse_noise[0, i]
            if i < profile["n"]:
                line[colid_forward_add] = forward_add[0, i]
                line[colid_reverse_add] = reverse_add[0, i]
                line[colid_total_add] = forward_add[0, i] + reverse_add[0, i]
            else:
                line[colid_forward_add] = 0
                line[colid_reverse_add] = 0
                line[colid_total_add] = 0
            data.append(line)
#match_profile
def match_profiles(profilefile, infile, outfile, libfile, seqformat, marker):
    """
    Read one sample from infile, correct it against the background noise
    profiles in profilefile, and write the result to outfile.
    """
    library = parse_library(libfile) if libfile else None
    profiles = load_profiles(profilefile, library)
    if marker:
        # Restrict to the single requested marker (possibly none at all).
        profiles = {marker: profiles[marker]} if marker in profiles else {}

    column_names, data = get_sample_data(
        infile, convert_to_raw=(seqformat == "raw"), library=library)
    colid_allele = get_column_ids(column_names, "allele")

    outfile.write("\t".join(column_names) + "\n")
    reformat = seqformat is not None and seqformat != "raw"
    for marker in data:
        profile = profiles.get(marker)
        if profile is not None:
            match_profile(column_names, data[marker], profile,
                          seqformat != "raw", library, marker)
        for line in data[marker]:
            if reformat:
                # Convert alleles to the requested output format.
                line[colid_allele] = ensure_sequence_format(
                    line[colid_allele], seqformat, library=library,
                    marker=marker)
            outfile.write("\t".join(map(str, line)) + "\n")
#match_profiles
def add_arguments(parser):
    """Register this tool's command line arguments on the given parser."""
    add = parser.add_argument  # shorthand; registration order is preserved
    add('profiles', metavar="PROFILES", type=argparse.FileType('r'),
        help="file containing background noise profiles to match")
    add('infile', nargs='?', metavar="IN", default=sys.stdin,
        type=argparse.FileType('r'),
        help="the tab-separated data file to process (default: read from "
             "stdin)")
    add('outfile', nargs='?', metavar="OUT", default=sys.stdout,
        type=argparse.FileType('w'),
        help="the file to write the output to (default: write to stdout)")
    add('-F', '--sequence-format', metavar="FORMAT",
        choices=["raw", "tssv", "allelename"],
        help="convert sequences to the specified format: one of %(choices)s "
             "(default: no conversion)")
    add('-l', '--library', metavar="LIBRARY", type=argparse.FileType('r'),
        help="library file for sequence format conversion")
    add('-M', '--marker', metavar="MARKER",
        help="work only on MARKER")
#add_arguments
def run(args):
    """Validate the I/O streams and hand off to match_profiles."""
    infile, outfile = args.infile, args.outfile
    # Refuse to run fully interactively: with both streams attached to a
    # terminal there is nothing sensible to process.
    if infile.isatty() and outfile.isatty():
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
    match_profiles(args.profiles, infile, outfile, args.library,
                   args.sequence_format, args.marker)
#run
def main():
    """
    Main entry point.
    """
    argparser = argparse.ArgumentParser(description=__doc__)
    try:
        add_arguments(argparser)
        parsed = argparser.parse_args()
        run(parsed)
    except OSError as error:
        # Surface I/O problems (e.g. unreadable files) as a parser error.
        argparser.error(error)
#main
# Allow running this module directly as a standalone script.
if __name__ == "__main__":
    main()
......@@ -6,7 +6,6 @@ import argparse
import sys
import random
import time
import json
import math
#import numpy as np # Only imported when actually running this tool.
......@@ -175,7 +174,7 @@ def solve_profile_mixture_single(samples, genotypes, n, variance=False,
# The variances thus computed are population variances. It is
# more appropriate to compute the sample variance, but how?
if reportfile:
if reduce(lambda x,y:x+len(y)-1, genotypes, 0):
if sum(map(len, genotypes))-len(genotypes):
reportfile.write(
"Computing variances...\n"
"EXPERIMENAL feature! The values produced may give a "
......@@ -447,7 +446,7 @@ def preprocess_data(data, min_sample_pct):
def generate_profiles(filelist, tag_expr, tag_format, allelefile,
annotation_column, reportfile, min_pct, min_abs,
min_samples, min_sample_pct, seqformat, library, marker,
tidy, homozygotes, limit_reads, drop_samples):
homozygotes, limit_reads, drop_samples):
if reportfile:
t0 = time.time()
......@@ -508,7 +507,6 @@ def generate_profiles(filelist, tag_expr, tag_format, allelefile,
reportfile.write("Data loading and filtering took %f seconds\n" %
(t1-t0))
first_marker = True
for marker in data.keys():
p = data[marker]["profiles"]
profile_size = len(p["alleles"])
......@@ -539,15 +537,14 @@ def generate_profiles(filelist, tag_expr, tag_format, allelefile,
for i in range(profile_size):
profile[i] = round(profile[i], 3)
print('%s"%s":' % ("{" if first_marker else ",\n", marker))
if tidy:
json.dump(p, sys.stdout, indent=2,
separators=(',', ': '))
else:
json.dump(p, sys.stdout, separators=(',', ':'))
first_marker = False
# TSV output (profiles in rows)
print("\t".join([marker, "0"] + map(str, p["alleles"])))
for i in range(p["true alleles"]):
print("\t".join(
[marker, str(i+1)] + map(str, p["profiles_forward"][i])))
print("\t".join(
[marker, str(-i-1)] + map(str, p["profiles_reverse"][i])))
del data[marker]
print("\n}")
#generate_profiles
......@@ -557,19 +554,19 @@ def add_arguments(parser):
parser.add_argument('-r', '--report', metavar="OUTFILE",
type=argparse.FileType("w"),
help="write a report to the given file")
parser.add_argument('-m', '--min-pct', type=float,
parser.add_argument('-m', '--min-pct', metavar="PCT", type=float,
default=_DEF_THRESHOLD_PCT,
help="minimum amount of background to consider, as a percentage "
"of the highest allele (default: %4.2f)" % _DEF_THRESHOLD_PCT)
parser.add_argument('-n', '--min-abs', type=pos_int_arg,
parser.add_argument('-n', '--min-abs', metavar="N", type=pos_int_arg,
default=_DEF_THRESHOLD_ABS,
help="minimum amount of background to consider, as an absolute "
"number of reads (default: %(default)s)")
parser.add_argument('-s', '--min-samples', type=pos_int_arg,
parser.add_argument('-s', '--min-samples', metavar="N", type=pos_int_arg,
default=_DEF_MIN_SAMPLES,
help="require this minimum number of samples for each true allele "
"(default: %(default)s)")
parser.add_argument('-S', '--min-sample-pct', type=float,
parser.add_argument('-S', '--min-sample-pct', metavar="PCT", type=float,
default=_DEF_MIN_SAMPLE_PCT,
help="require this minimum number of samples for each background "
"product, as a percentage of the number of samples with a "
......@@ -584,15 +581,13 @@ def add_arguments(parser):
help="library file for sequence format conversion")
parser.add_argument('-M', '--marker', metavar="MARKER",
help="work only on MARKER")
parser.add_argument('-t', '--tidy', action="store_true",
help="if specified, tidily indent the generated JSON")
parser.add_argument('-H', '--homozygotes', action="store_true",
help="if specified, only homozygous samples will be considered")
parser.add_argument('-R', '--limit-reads', type=pos_int_arg,
parser.add_argument('-R', '--limit-reads', metavar="N", type=pos_int_arg,
default=sys.maxint,
help="simulate lower sequencing depth by randomly dropping reads down "
"to this maximum total number of reads for each sample")
parser.add_argument('-x', '--drop-samples', type=float,
parser.add_argument('-x', '--drop-samples', metavar="N", type=float,
default=0, help="randomly drop this fraction of input samples")
#add_arguments
......@@ -601,13 +596,12 @@ def run(args):
if args.filelist == [sys.stdin] and sys.stdin.isatty():
raise ValueError("please specify an input file, or pipe in the output "
"of another program")
generate_profiles(args.filelist, args.tag_expr, args.tag_format,
args.allelelist, args.annotation_column, args.report,
args.min_pct, args.min_abs, args.min_samples,
args.min_sample_pct, args.sequence_format, args.library,
args.marker, args.tidy, args.homozygotes,
args.limit_reads, args.drop_samples)
args.marker, args.homozygotes, args.limit_reads,
args.drop_samples)
#run
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment