Commit a09131d9 authored by jhoogenboom

Introducing BGHomStats

* New tool BGHomStats computes statistics (minimum, maximum, mean,
  and sample variance) of noise ratios in homozygous samples.
* The default BGEstimate output format has been changed to be
  compatible with that of BGHomStats. The cross-tabular output
  format is still available as an option because it typically uses
  around 90% less disk space (see the example below). BGCorrect
  (and other future tools that use noise profiles) will work with
  both formats.
* Fixed a bug in the --min-samples option of BGEstimate that could
  cause alleles with fewer than the specified number of samples to
  be included when --drop-samples was used at the same time.
* The user now receives an error message if unknown arguments are
  given. The message includes the usage string of the requested
  tool. (Argparse's default was to print the general FDSTools usage
  string, which is not helpful.)
parent 7b12cccb
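To illustrate the two output formats mentioned above, here is one hypothetical noise product of a made-up marker in both layouts (columns are tab-separated in the real files; all names and numbers here are invented):

# Tab-separated columns format (new default, shared with BGHomStats):
marker     allele    sequence  fmean  rmean
MyMarker   AGATAGAT  AGATAGGT  1.234  1.567

# Cross-tabular format (-C/--cross-tabular; profiles in rows, positive
# numbers forward, negative numbers reverse):
MyMarker    0  AGATAGAT  AGATAGGT
MyMarker    1  100.0     1.234
MyMarker   -1  100.0     1.567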
@@ -74,11 +74,15 @@ def main():
     except Exception as error:
         parser.error(error)
     try:
+        if unknowns:
+            # Politely inform the user about unknown arguments.
+            __tools__[args.tool].error(
+                "The following arguments are not known. Please check spelling "
+                "and argument order: '%s'." % "', '".join(unknowns))
         args.func(args)
     except Exception as error:
         if args.debug:
             raise
-        # TODO: Politely inform the user about unknown arguments.
         __tools__[args.tool].error(error)
 #main
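With this change, a hypothetical misspelled option is now reported against the requested tool's own usage text, roughly like the following (the usage line is invented here; its exact form depends on the tool's arguments):

$ fdstools bgestimate --mni-pct 0.5 sample1.csv
usage: fdstools bgestimate [-h] [options] filelist [filelist ...]
fdstools bgestimate: error: The following arguments are not known. Please check spelling and argument order: '--mni-pct'.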
@@ -389,6 +389,85 @@ def parse_library_ini(handle):
def load_profiles(profilefile, library=None):
    if profilefile == sys.stdin:
        # Can't seek on pipes, so read it into a buffer first.
        profilefile = StringIO(sys.stdin.read())
    headline = profilefile.readline().rstrip("\r\n").split("\t")
    profilefile.seek(0)
    try:
        get_column_ids(headline, "marker", "allele", "sequence", "fmean",
                       "rmean")
    except ValueError:
        try:
            int(headline[1])
        except (ValueError, IndexError):
            raise ValueError(
                "Invalid background noise profiles file: unable to determine "
                "file format!")
        return load_profiles_crosstab(profilefile, library)
    return load_profiles_columnar(profilefile, library)
#load_profiles
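For reference, the sniffing logic above distinguishes these two kinds of header line (fields tab-separated; sequences and values hypothetical):

marker  allele  sequence  fmean  rmean    <- columnar: the five named columns are present
MyMarker  0  AGATAGAT  AGATAGGT           <- cross-tabular: the second field parses as an int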

def load_profiles_columnar(profilefile, library=None):
    column_names = profilefile.readline().rstrip("\r\n").split("\t")
    colid_marker, colid_allele, colid_sequence, colid_fmean, colid_rmean = \
        get_column_ids(column_names, "marker", "allele", "sequence", "fmean",
                       "rmean")
    profiles = {}
    for line in profilefile:
        line = line.rstrip("\r\n").split("\t")
        if line == [""]:
            continue
        marker = line[colid_marker]
        if marker not in profiles:
            profiles[marker] = {
                "m": set(),  # To be replaced by its length below.
                "n": set(),  # To be replaced by its length below.
                "seqs": [],
                "forward": {},  # To be replaced by a list below.
                "reverse": {}  # To be replaced by a list below.
            }
        allele = ensure_sequence_format(line[colid_allele], "raw",
                                        library=library, marker=marker)
        sequence = ensure_sequence_format(line[colid_sequence], "raw",
                                          library=library, marker=marker)
        if (allele, sequence) in profiles[marker]["forward"]:
            raise ValueError(
                "Invalid background noise profiles file: encountered "
                "multiple values for marker '%s' allele '%s' sequence '%s'" %
                (marker, allele, sequence))
        profiles[marker]["forward"][allele, sequence] = float(line[colid_fmean])
        profiles[marker]["reverse"][allele, sequence] = float(line[colid_rmean])
        profiles[marker]["m"].update((allele, sequence))
        profiles[marker]["n"].add(allele)

    # Check completeness and reorder true alleles.
    for marker in profiles:
        profiles[marker]["seqs"] = list(profiles[marker]["n"]) + \
            list(profiles[marker]["m"] - profiles[marker]["n"])
        profiles[marker]["n"] = len(profiles[marker]["n"])
        profiles[marker]["m"] = len(profiles[marker]["m"])
        newprofiles = {"forward": [], "reverse": []}
        for i in range(profiles[marker]["n"]):
            allele = profiles[marker]["seqs"][i]
            for direction in newprofiles:
                newprofiles[direction].append([0] * profiles[marker]["m"])
            for j in range(profiles[marker]["m"]):
                sequence = profiles[marker]["seqs"][j]
                if (allele, sequence) in profiles[marker]["forward"]:
                    for direction in newprofiles:
                        newprofiles[direction][i][j] = \
                            profiles[marker][direction][allele, sequence]
        profiles[marker]["forward"] = newprofiles["forward"]
        profiles[marker]["reverse"] = newprofiles["reverse"]
    return profiles
#load_profiles_columnar
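For a hypothetical marker with one true allele and one noise sequence, the structure returned above looks like this (numbers invented; the allele's own position is shown as 100 purely for illustration):

{"MyMarker": {
    "n": 1,                       # number of true alleles
    "m": 2,                       # total number of sequences, alleles first
    "seqs": ["AGATAGAT", "AGATAGGT"],
    "forward": [[100.0, 1.234]],  # n rows of m mean noise ratios
    "reverse": [[100.0, 1.567]]}}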

def load_profiles_crosstab(profilefile, library=None):
    profiles = {}

    # Read the profile file without assuming it is sorted.
@@ -413,8 +492,8 @@ def load_profiles(profilefile, library=None):
                 "m": len(values),
                 "n": abs(num),
                 "seqs": [],
-                "forward": {},
-                "reverse": {}
+                "forward": {},  # To be replaced by a list below.
+                "reverse": {}  # To be replaced by a list below.
             }
         elif len(values) != profiles[marker]["m"]:
             raise ValueError(
@@ -424,7 +503,7 @@ def load_profiles(profilefile, library=None):
         if num == 0:
             if profiles[marker]["seqs"]:
                 raise ValueError(
-                    "Invalid background profiles noise file: encountered "
+                    "Invalid background noise profiles file: encountered "
                     "multiple header lines for marker '%s'" % marker)
             values = map(
                 lambda seq: ensure_sequence_format(
@@ -435,7 +514,7 @@ def load_profiles(profilefile, library=None):
         direction = "forward" if num > 0 else "reverse"
         if abs(num) in profiles[marker][direction]:
             raise ValueError(
-                "Invalid background profiles noise file: encountered "
+                "Invalid background noise profiles file: encountered "
                 "multiple %s profiles for marker '%s' allele %i" %
                 (direction, marker, abs(num)))
         values = map(float, values)
@@ -446,13 +525,13 @@ def load_profiles(profilefile, library=None):
         newprofiles = {"forward": [], "reverse": []}
         if not profiles[marker]["seqs"]:
             raise ValueError(
-                "Invalid background profiles noise file: missing header line "
+                "Invalid background noise profiles file: missing header line "
                 "for marker '%s'" % marker)
         for i in range(1, profiles[marker]["n"] + 1):
             for direction in newprofiles:
                 if i not in profiles[marker][direction]:
                     raise ValueError(
-                        "Invalid background profiles noise file: missing %s "
+                        "Invalid background noise profiles file: missing %s "
                         "profile for marker '%s' allele %i" %
                         (direction, marker, i))
                 newprofiles[direction].append(profiles[marker][direction][i])
@@ -460,7 +539,7 @@ def load_profiles(profilefile, library=None):
         profiles[marker]["reverse"] = newprofiles["reverse"]
     return profiles
-#load_profiles
+#load_profiles_crosstab
def regex_longest_match(pattern, subject):
@@ -761,6 +840,34 @@ def nnls(A, C, B=None, max_iter=200, min_change=0.0001, debug=False):
#nnls

def adjust_stats(value, stats=None):
    """
    Adjust the given stats in place with the given observed value and
    return the adjusted stats as well. If no stats dict is given,
    create a new stats dict with the following initial values:
    {"n": 1, "min": value, "max": value, "mean": value, "m2": 0.0,
     "variance": 0.0}
    """
    value += 0.0  # Force float arithmetic.
    if not stats:
        return {"n": 1, "min": value, "max": value, "mean": value, "m2": 0.0,
                "variance": 0.0}
    stats["n"] += 1
    delta = value - stats["mean"]
    stats["mean"] += delta / stats["n"]
    stats["m2"] += delta * (value - stats["mean"])
    try:
        stats["variance"] = stats["m2"] / (stats["n"] - 1)
        stats["min"] = min(stats["min"], value)
        stats["max"] = max(stats["max"], value)
    except ZeroDivisionError:
        stats["variance"] = 0
        stats["min"] = value
        stats["max"] = value
    return stats
#adjust_stats
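A small usage sketch (not part of the commit) confirming that the incremental update above, Welford's online algorithm, matches a direct computation of the sample mean and variance:

values = [1.0, 4.0, 2.5, 3.5]
stats = None
for v in values:
    stats = adjust_stats(v, stats)
mean = sum(values) / len(values)
variance = sum((v - mean) ** 2 for v in values) / (len(values) - 1)
assert abs(stats["mean"] - mean) < 1e-12
assert abs(stats["variance"] - variance) < 1e-12
assert (stats["n"], stats["min"], stats["max"]) == (4, 1.0, 4.0)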

def get_column_ids(column_names, *names):
    """Find all names in column_names and return their indices."""
    result = []
@@ -768,7 +875,7 @@ def get_column_ids(column_names, *names):
         try:
             result.append(column_names.index(name))
         except ValueError:
-            raise Exception("Column not found in input file: %s" % name)
+            raise ValueError("Column not found in input file: %s" % name)
     if len(result) == 1:
         return result[0]
     return tuple(result)
 #!/usr/bin/env python
 """
-Estimate allele-centric background noise profiles.
+Estimate allele-centric background noise profiles (means).
 """
 import argparse
 import sys
@@ -445,8 +445,9 @@ def preprocess_data(data, min_sample_pct):
 def generate_profiles(filelist, tag_expr, tag_format, allelefile,
                       annotation_column, reportfile, min_pct, min_abs,
-                      min_samples, min_sample_pct, seqformat, library, marker,
-                      homozygotes, limit_reads, drop_samples):
+                      min_samples, min_sample_pct, seqformat, library,
+                      crosstab, marker, homozygotes, limit_reads,
+                      drop_samples):
     if reportfile:
         t0 = time.time()
@@ -489,7 +490,7 @@ def generate_profiles(filelist, tag_expr, tag_format, allelefile,
             del allelelist[tag][marker]
     # Ensure minimum number of samples per allele.
-    allelelist = {tag: allelelist[tag] for tag in sample_tags}
+    allelelist = {tag: allelelist[tag] for tag in tags_to_files}
     ensure_min_samples(allelelist, min_samples)
     # Combine data from all samples.
@@ -507,6 +508,8 @@ def generate_profiles(filelist, tag_expr, tag_format, allelefile,
         reportfile.write("Data loading and filtering took %f seconds\n" %
                          (t1-t0))
+    if not crosstab:
+        print("\t".join(["marker", "allele", "sequence", "fmean", "rmean"]))
     for marker in data.keys():
         p = data[marker]["profiles"]
         profile_size = len(p["alleles"])
@@ -537,13 +540,24 @@ def generate_profiles(filelist, tag_expr, tag_format, allelefile,
         for i in range(profile_size):
             profile[i] = round(profile[i], 3)
-        # TSV output (profiles in rows)
-        print("\t".join([marker, "0"] + map(str, p["alleles"])))
-        for i in range(p["true alleles"]):
-            print("\t".join(
-                [marker, str(i+1)] + map(str, p["profiles_forward"][i])))
-            print("\t".join(
-                [marker, str(-i-1)] + map(str, p["profiles_reverse"][i])))
+        if crosstab:
+            # Cross-tabular output (profiles in rows).
+            print("\t".join([marker, "0"] + p["alleles"]))
+            for i in range(p["true alleles"]):
+                print("\t".join(
+                    [marker, str(i+1)] + map(str, p["profiles_forward"][i])))
+                print("\t".join(
+                    [marker, str(-i-1)] + map(str, p["profiles_reverse"][i])))
+        else:
+            # Tab-separated columns format.
+            for i in range(p["true alleles"]):
+                for j in range(len(p["alleles"])):
+                    if not (p["profiles_forward"][i][j] +
+                            p["profiles_reverse"][i][j]):
+                        continue
+                    print("\t".join([marker, p["alleles"][i], p["alleles"][j]] +
+                        map(str, [p["profiles_forward"][i][j],
+                                  p["profiles_reverse"][i][j]])))
         del data[marker]
 #generate_profiles
@@ -579,6 +593,9 @@ def add_arguments(parser):
     parser.add_argument('-l', '--library', metavar="LIBRARY",
         type=argparse.FileType('r'),
         help="library file for sequence format conversion")
+    parser.add_argument('-C', '--cross-tabular', action="store_true",
+        help="if specified, a space-efficient cross-tabular output format is "
+             "used instead of the default tab-separated columns format")
     parser.add_argument('-M', '--marker', metavar="MARKER",
         help="work only on MARKER")
     parser.add_argument('-H', '--homozygotes', action="store_true",
@@ -600,8 +617,8 @@ def run(args):
         args.allelelist, args.annotation_column, args.report,
         args.min_pct, args.min_abs, args.min_samples,
         args.min_sample_pct, args.sequence_format, args.library,
-        args.marker, args.homozygotes, args.limit_reads,
-        args.drop_samples)
+        args.cross_tabular, args.marker, args.homozygotes,
+        args.limit_reads, args.drop_samples)
 #run
#!/usr/bin/env python
"""
Compute allele-centric statistics for background noise in homozygous samples
(min, max, mean, sample variance).
"""
import argparse
import sys
import random
import time
import math

from ..lib import get_column_ids, pos_int_arg, get_tag, add_sample_files_args,\
                  add_allele_detection_args, map_tags_to_files, adjust_stats,\
                  ensure_sequence_format, parse_allelelist, parse_library

__version__ = "0.1dev"


# Default values for parameters are specified below.

# Default minimum amount of background to consider, as a percentage of
# the highest allele.
# This value can be overridden by the -m command line option.
_DEF_THRESHOLD_PCT = 0.5

# Default minimum number of reads to consider.
# This value can be overridden by the -n command line option.
_DEF_THRESHOLD_ABS = 5

# Default minimum number of samples for each true allele.
# This value can be overridden by the -s command line option.
_DEF_MIN_SAMPLES = 2

# Default minimum number of samples required for each background product
# to be included in the analysis, as a percentage of the number of
# samples with a certain true allele.
# This value can be overridden by the -S command line option.
_DEF_MIN_SAMPLE_PCT = 80.

def get_sample_data(infile, data, annotation_column, seqformat, library):
    """Add data from infile to data dict as [marker, allele]=reads."""
    # Get column numbers.
    column_names = infile.readline().rstrip("\r\n").split("\t")
    colid_name, colid_allele, colid_forward, colid_reverse = \
        get_column_ids(column_names, "name", "allele", "forward", "reverse")

    # Also try to get the annotation column if we have one.
    if annotation_column is not None:
        try:
            colid_annotation = get_column_ids(column_names, annotation_column)
        except ValueError:
            annotation_column = None

    found_alleles = []
    for line in infile:
        line = line.rstrip("\r\n").split("\t")
        marker = line[colid_name]
        allele = line[colid_allele] if seqformat is None \
            else ensure_sequence_format(line[colid_allele], seqformat,
                                        library=library)
        if (annotation_column is not None and
                line[colid_annotation].startswith("ALLELE")):
            found_alleles.append((marker, allele))
        data[marker, allele] = map(int,
            [line[colid_forward], line[colid_reverse]])
    return found_alleles
#get_sample_data

def reduce_read_counts(data, limit_reads):
    sum_reads = 0
    for markerallele in data:
        sum_reads += sum(data[markerallele])
    if sum_reads <= limit_reads:
        return

    remove = sorted(random.sample(xrange(sum_reads), sum_reads - limit_reads))
    i = 0
    seen = 0
    while i < len(remove) and seen > remove[i]:
        # Skip the reads filtered out above.
        i += 1
    for markerallele in data:
        for direction in (0, 1):
            seen += data[markerallele][direction]
            while i < len(remove) and seen > remove[i]:
                data[markerallele][direction] -= 1
                i += 1
#reduce_read_counts
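A toy invocation of the down-sampling above (marker and counts hypothetical). random.sample picks which individual reads to discard, so the forward/reverse split varies per run, but the total always ends up at the limit:

counts = {("MyMarker", "AGATAGAT"): [90, 60]}  # [forward, reverse] reads
reduce_read_counts(counts, 100)                # randomly drop 50 of 150 reads
assert sum(counts[("MyMarker", "AGATAGAT")]) == 100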

def add_sample_data(data, sample_data, sample_alleles, min_pct, min_abs):
    # Enter the read counts into data and check the thresholds.
    for marker, sequence in sample_data:
        if marker not in sample_alleles:
            # Sample does not participate in this marker.
            continue
        allele = sample_alleles[marker]
        factors = map(lambda x: 100./x, sample_data[marker, allele])
        if (marker, allele) not in data:
            data[marker, allele] = {}
        if sequence not in data[marker, allele]:
            data[marker, allele][sequence] = [None, None, 0]
        for direction in (0, 1):
            data[marker, allele][sequence][direction] = adjust_stats(
                sample_data[marker, sequence][direction] * factors[direction],
                data[marker, allele][sequence][direction])
        if sum([count >= min_abs and count*factor >= min_pct
                for count, factor in
                zip(sample_data[marker, sequence], factors)]):
            data[marker, allele][sequence][2] += 1
#add_sample_data
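To make the scaling above concrete with hypothetical numbers: factors holds 100.0 divided by the allele's own read count in each direction, so every sequence's reads enter the statistics as a percentage of the allele:

allele_reads = [1000, 800]  # forward and reverse reads of the true allele
noise_reads = [25, 16]      # reads of one background product
factors = map(lambda x: 100./x, allele_reads)
print([n * f for n, f in zip(noise_reads, factors)])  # prints [2.5, 2.0]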

def filter_data(data, min_samples, min_sample_pct):
    """
    Remove all alleles from data that have fewer than min_samples
    samples and remove all stats of sequences that do not pass the
    detection thresholds in at least min_sample_pct per cent of the
    samples with a particular allele. Also add explicit zeros to the
    stats of the sequences that were not seen in all samples with a
    given allele.
    """
    for marker, allele in data.keys():
        if data[marker, allele][allele][2] < min_samples:
            del data[marker, allele]
            continue
        factor = 100./data[marker, allele][allele][2]
        for sequence in data[marker, allele].keys():
            if data[marker, allele][sequence][2] * factor < min_sample_pct:
                del data[marker, allele][sequence]
                continue
            for i in range(data[marker, allele][sequence][0]["n"],
                           data[marker, allele][allele][2]):
                for direction in (0, 1):
                    adjust_stats(0, data[marker, allele][sequence][direction])
#filter_data
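A sketch of the zero-padding step above, with hypothetical noise ratios: a background product that passed the thresholds in only 7 of the 10 samples carrying an allele receives three explicit zero observations, so its statistics describe all 10 samples:

stats = None
for ratio in (2.5, 3.0, 2.8, 2.6, 3.1, 2.9, 2.7):  # seen in 7 samples
    stats = adjust_stats(ratio, stats)
for _ in range(stats["n"], 10):                    # pad to 10, as above
    adjust_stats(0, stats)
assert stats["n"] == 10 and stats["min"] == 0.0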

def compute_stats(filelist, tag_expr, tag_format, allelefile,
                  annotation_column, min_pct, min_abs, min_samples,
                  min_sample_pct, seqformat, library, marker, limit_reads,
                  drop_samples):
    # Parse library and allele list.
    library = parse_library(library) if library is not None else None
    allelelist = {} if allelefile is None \
        else parse_allelelist(allelefile, seqformat, library)
    tags_to_files = map_tags_to_files(filelist, tag_expr, tag_format)

    # Randomly drop some samples.
    sample_tags = tags_to_files.keys()
    for tag in random.sample(xrange(len(sample_tags)),
                             int(len(sample_tags) * drop_samples)):
        del tags_to_files[sample_tags[tag]]

    # Read sample data.
    data = {}
    for tag in tags_to_files:
        sample_data = {}
        alleles = set()
        for infile in tags_to_files[tag]:
            alleles.update(get_sample_data(infile, sample_data,
                                           annotation_column, seqformat,
                                           library))
        if tag not in allelelist:
            allelelist[tag] = {}
        for markerx, allele in alleles:
            if markerx not in allelelist[tag]:
                allelelist[tag][markerx] = set()
            allelelist[tag][markerx].add(allele)
        reduce_read_counts(sample_data, limit_reads)
        if marker:
            if marker in allelelist[tag]:
                allelelist[tag] = {marker: allelelist[tag][marker]}
            else:
                allelelist[tag] = {}
        allelelist[tag] = {marker: allelelist[tag][marker].pop()
                           for marker in allelelist[tag]
                           if len(allelelist[tag][marker]) == 1}
        add_sample_data(data, sample_data, allelelist[tag], min_pct, min_abs)

    # Ensure minimum number of samples per allele and filter
    # insignificant background products.
    filter_data(data, min_samples, min_sample_pct)

    print("\t".join(["marker", "allele", "sequence", "n", "fmin", "fmax",
                     "fmean", "fvariance", "rmin", "rmax", "rmean",
                     "rvariance"]))
    for marker, allele in data:
        for sequence in data[marker, allele]:
            print("\t".join([marker, allele, sequence] + map(
                lambda x: str(x) if abs(x) > 0.0000000001 else "0", [
                    data[marker, allele][sequence][0]["n"],
                    data[marker, allele][sequence][0]["min"],
                    data[marker, allele][sequence][0]["max"],
                    data[marker, allele][sequence][0]["mean"],
                    data[marker, allele][sequence][0]["variance"],
                    data[marker, allele][sequence][1]["min"],
                    data[marker, allele][sequence][1]["max"],
                    data[marker, allele][sequence][1]["mean"],
                    data[marker, allele][sequence][1]["variance"]])))
#compute_stats

def add_arguments(parser):
    add_sample_files_args(parser)
    add_allele_detection_args(parser)
    parser.add_argument('-m', '--min-pct', metavar="PCT", type=float,
        default=_DEF_THRESHOLD_PCT,
        help="minimum amount of background to consider, as a percentage "
             "of the highest allele (default: %4.2f)" % _DEF_THRESHOLD_PCT)
    parser.add_argument('-n', '--min-abs', metavar="N", type=pos_int_arg,
        default=_DEF_THRESHOLD_ABS,
        help="minimum amount of background to consider, as an absolute "
             "number of reads (default: %(default)s)")
    parser.add_argument('-s', '--min-samples', metavar="N", type=pos_int_arg,
        default=_DEF_MIN_SAMPLES,
        help="require this minimum number of samples for each true allele "
             "(default: %(default)s)")
    parser.add_argument('-S', '--min-sample-pct', metavar="PCT", type=float,
        default=_DEF_MIN_SAMPLE_PCT,
        help="require this minimum number of samples for each background "
             "product, as a percentage of the number of samples with a "
             "particular true allele (default: %(default)s)")
    parser.add_argument('-F', '--sequence-format', metavar="FORMAT",
        choices=["raw", "tssv", "allelename"],
        help="convert sequences to the specified format: one of %(choices)s "
             "(default: no conversion)")
    parser.add_argument('-l', '--library', metavar="LIBRARY",
        type=argparse.FileType('r'),
        help="library file for sequence format conversion")
    parser.add_argument('-M', '--marker', metavar="MARKER",
        help="work only on MARKER")
    parser.add_argument('-R', '--limit-reads', metavar="N", type=pos_int_arg,
        default=sys.maxint,
        help="simulate lower sequencing depth by randomly dropping reads down "
             "to this maximum total number of reads for each sample")
    parser.add_argument('-x', '--drop-samples', metavar="N", type=float,
        default=0, help="randomly drop this fraction of input samples")
#add_arguments

def run(args):
    if args.filelist == [sys.stdin] and sys.stdin.isatty():
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
    compute_stats(args.filelist, args.tag_expr, args.tag_format,
                  args.allelelist, args.annotation_column, args.min_pct,
                  args.min_abs, args.min_samples, args.min_sample_pct,
                  args.sequence_format, args.library, args.marker,
                  args.limit_reads, args.drop_samples)
#run


def main():
    """
    Main entry point.
    """
    parser = argparse.ArgumentParser(
        description=__doc__)
    try:
        add_arguments(parser)
        run(parser.parse_args())
    except OSError as error:
        parser.error(error)
#main


if __name__ == "__main__":
    main()