Commit 8685a304 authored by jhoogenboom's avatar jhoogenboom
Browse files

Introducing Blame

* New tool Blame can be used to find particularly dirty samples and
  to construct a DNA profile of the contaminator.
* Fixed a bug in BGCorrect that resulted in incorrect values in the
  *_add columns.
* BGEstimate and BGHomStats no longer crash if a library file is
  provided.
* SeqConvert can now use a different library file for the output,
  thereby offering some possibilities to update allele names when a
  library file gets updated.
* Replaced various uses of map() with generator expressions and
  list comprehensions for increased readability and speed (though only
  slightly).
parent a09131d9
......@@ -240,7 +240,7 @@ def parse_library_tsv(handle):
"regex_middle": {}
}
for line in handle:
line = map(lambda x: x.strip(), line.rstrip("\r\n").split("\t"))
line = [x.strip() for x in line.rstrip("\r\n").split("\t")]
if line == [""]:
continue
if len(line) < 4:
......@@ -258,8 +258,8 @@ def parse_library_tsv(handle):
raise ValueError("STR definition '%s' of marker %s is invalid" %
(line[3], marker))
library["flanks"][marker] = line[1:3]
library["regex_middle"][marker] = re.compile("".join(map(lambda x:
"(%s){%s,%s}" % x, PAT_STR_DEF_BLOCK.findall(line[3]))))
library["regex_middle"][marker] = re.compile("".join(
"(%s){%s,%s}" % x for x in PAT_STR_DEF_BLOCK.findall(line[3])))
library["regex"][marker] = re.compile(
"".join(["^", library["regex_middle"][marker].pattern, "$"]))
return library
......@@ -369,16 +369,16 @@ def parse_library_ini(handle):
parts = []
partsm = []
if marker in library["prefix"]:
parts += map(lambda x: "(%s){0,1}" % x, library["prefix"][marker])
parts += ("(%s){0,1}" % x for x in library["prefix"][marker])
if marker in library["aliases"]:
parts.append("(%s){0,1}" % library["aliases"][marker]["sequence"])
partsm.append("(%s){0,1}" % library["aliases"][marker]["sequence"])
elif marker in library["regex"]:
partsm = map(lambda x: "(%s){%s,%s}" % x,
PAT_STR_DEF_BLOCK.findall(library["regex"][marker]))
partsm = ("(%s){%s,%s}" % x for x in
PAT_STR_DEF_BLOCK.findall(library["regex"][marker]))
parts += partsm
if marker in library["suffix"]:
parts += map(lambda x: "(%s){0,1}" % x, library["suffix"][marker])
parts += ("(%s){0,1}" % x for x in library["suffix"][marker])
if parts:
library["regex"][marker] = re.compile(
"".join(["^"] + parts + ["$"]))
......@@ -505,10 +505,8 @@ def load_profiles_crosstab(profilefile, library=None):
raise ValueError(
"Invalid background noise profiles file: encountered "
"multiple header lines for marker '%s'" % marker)
values = map(
lambda seq: ensure_sequence_format(
seq, "raw", library=library, marker=marker),
values)
values = [ensure_sequence_format(seq, "raw", library=library,
marker=marker) for seq in values]
profiles[marker]["seqs"] = values
else:
direction = "forward" if num > 0 else "reverse"
......@@ -565,9 +563,8 @@ def detect_sequence_format(seq):
def convert_sequence_tssv_raw(seq):
return "".join(map(
lambda block: block[0] * int(block[1]),
PAT_TSSV_BLOCK.findall(seq)))
return "".join(block[0] * int(block[1])
for block in PAT_TSSV_BLOCK.findall(seq))
#convert_sequence_tssv_raw
......@@ -585,8 +582,8 @@ def convert_sequence_raw_tssv(seq, library, marker, return_alias=False):
if match is None and marker in library["regex"]:
match = library["regex"][marker].match(seq)
if match is not None:
parts = [(match.group(i), match.end(i))
for i in range(1, match.lastindex+1) if match.group(i)]
parts = ((match.group(i), match.end(i))
for i in range(1, match.lastindex+1) if match.group(i))
# Use heuristics if the sequence does not match the pattern.
else:
......@@ -615,8 +612,8 @@ def convert_sequence_raw_tssv(seq, library, marker, return_alias=False):
# canonical prefix ends with the first few blocks that
# we obtained in the match, move these blocks into the
# prefix. Also do this with the suffix.
middle = [(match.group(i), match.end(i)+len(pre_suf[0]))
for i in range(1, match.lastindex+1) if match.group(i)]
middle = ((match.group(i), match.end(i)+len(pre_suf[0]))
for i in range(1, match.lastindex+1) if match.group(i))
pre_suf[0] += seq[:match.start()]
pre_suf[1] = seq[match.end():] + pre_suf[1]
seq = match.group()
......@@ -695,7 +692,7 @@ def convert_sequence_allelename_tssv(seq, library, marker):
blocks.append((block[0], int(block[1])))
if suffix:
blocks.append((suffix, 1))
return "".join(map(lambda block: "%s(%i)" % block, blocks))
return "".join("%s(%i)" % block for block in blocks)
#convert_sequence_allelename_tssv
......@@ -758,7 +755,7 @@ def convert_sequence_raw_allelename(seq, library, marker):
def ensure_sequence_format(seq, to_format, from_format=None, library=None,
marker=None):
"""Convert seq to 'raw', 'tssv', or 'allelename' format."""
known_formats = ["raw", "tssv", "allelename"]
known_formats = ("raw", "tssv", "allelename")
if to_format not in known_formats:
raise ValueError("Unknown format '%s', choose from %s" %
(to_format, known_formats))
......@@ -795,6 +792,16 @@ def ensure_sequence_format(seq, to_format, from_format=None, library=None,
#ensure_sequence_format
def reverse_complement(sequence):
    """Return the reverse complement of the given DNA sequence.

    IUPAC ambiguity codes are complemented as well, case is preserved,
    and any character without a defined complement (e.g., 'N') is kept
    unchanged.
    """
    complement = {
        "A": "T", "T": "A", "U": "A", "G": "C", "C": "G", "R": "Y", "Y": "R",
        "K": "M", "M": "K", "B": "V", "V": "B", "D": "H", "H": "D",
        "a": "t", "t": "a", "u": "a", "g": "c", "c": "g", "r": "y", "y": "r",
        "k": "m", "m": "k", "b": "v", "v": "b", "d": "h", "h": "d"}
    return "".join(complement.get(base, base) for base in reversed(sequence))
#reverse_complement
def nnls(A, C, B=None, max_iter=200, min_change=0.0001, debug=False):
"""
Solve for B in A * B = C in the least squares sense, s.t. B >= 0.
......
......@@ -203,7 +203,7 @@ def add_arguments(parser):
"for which the value in this column does not start with ALLELE "
"are ignored")
parser.add_argument('-F', '--sequence-format', metavar="FORMAT",
choices=["raw", "tssv", "allelename"],
choices=("raw", "tssv", "allelename"),
help="convert sequences to the specified format: one of %(choices)s "
"(default: no conversion)")
parser.add_argument('-l', '--library', metavar="LIBRARY",
......
......@@ -83,13 +83,13 @@ def match_profile(column_names, data, profile, convert_to_raw, library,
C2[0, i] = line[colid_reverse]
# Compute corrected read counts.
A = nnls(np.hstack([P1, P2]).T, np.hstack([C1, C2]).T)
A = nnls(np.hstack([P1, P2]).T, np.hstack([C1, C2]).T).T
np.fill_diagonal(P1, 0)
np.fill_diagonal(P2, 0)
forward_noise = A.T * P1
reverse_noise = A.T * P2
forward_add = np.multiply(A, P1.sum(1)).T
reverse_add = np.multiply(A, P2.sum(1)).T
forward_noise = A * P1
reverse_noise = A * P2
forward_add = np.multiply(A, P1.sum(1))
reverse_add = np.multiply(A, P2.sum(1))
j = 0
for line in data:
......@@ -170,7 +170,7 @@ def add_arguments(parser):
default=sys.stdout, type=argparse.FileType('w'),
help="the file to write the output to (default: write to stdout)")
parser.add_argument('-F', '--sequence-format', metavar="FORMAT",
choices=["raw", "tssv", "allelename"],
choices=("raw", "tssv", "allelename"),
help="convert sequences to the specified format: one of %(choices)s "
"(default: no conversion)")
parser.add_argument('-l', '--library', metavar="LIBRARY",
......
......@@ -9,9 +9,9 @@ import time
import math
#import numpy as np # Only imported when actually running this tool.
from ..lib import get_column_ids, pos_int_arg, get_tag, add_sample_files_args,\
from ..lib import get_column_ids, pos_int_arg, add_sample_files_args,\
add_allele_detection_args, map_tags_to_files, nnls,\
ensure_sequence_format, parse_allelelist
ensure_sequence_format, parse_allelelist, parse_library
__version__ = "0.1dev"
......@@ -378,9 +378,8 @@ def add_sample_data(data, sample_data, sample_alleles, min_pct, min_abs):
data[marker]["allele_counts"][i] = [0] * len(p["alleles"])
p["true alleles"] += 1
data[marker]["genotypes"][-1].append(i)
thresholds[marker][i] = map(
lambda x: math.ceil(x*min_pct/100.),
sample_data[marker, allele])
thresholds[marker][i] = [math.ceil(x*min_pct/100.) for x in
sample_data[marker, allele]]
# Now enter the read counts into data and check the thresholds.
for marker, allele in sample_data:
......@@ -404,9 +403,9 @@ def add_sample_data(data, sample_data, sample_alleles, min_pct, min_abs):
p["profiles_reverse"][-1][i] = sample_data[marker, allele][1]
for gi in thresholds[marker]:
if sum([count >= max(min_abs, threshold)
for count, threshold in
zip(sample_data[marker, allele], thresholds[marker][gi])]):
if sum(count >= max(min_abs, threshold)
for count, threshold in
zip(sample_data[marker, allele], thresholds[marker][gi])):
data[marker]["allele_counts"][gi][i] += 1
#add_sample_data
......@@ -586,7 +585,7 @@ def add_arguments(parser):
"product, as a percentage of the number of samples with a "
"particular true allele (default: %(default)s)")
#parser.add_argument('-F', '--sequence-format', metavar="FORMAT",
# choices=["raw", "tssv", "allelename"],
# choices=("raw", "tssv", "allelename"),
# help="convert sequences to the specified format: one of %(choices)s "
# "(default: no conversion)")
parser.set_defaults(sequence_format="raw") # Force raw sequences.
......
......@@ -6,12 +6,10 @@ Compute allele-centric statistics for background noise in homozygous samples
import argparse
import sys
import random
import time
import math
from ..lib import get_column_ids, pos_int_arg, get_tag, add_sample_files_args,\
from ..lib import get_column_ids, pos_int_arg, add_sample_files_args,\
add_allele_detection_args, map_tags_to_files, adjust_stats,\
ensure_sequence_format, parse_allelelist
ensure_sequence_format, parse_allelelist, parse_library
__version__ = "0.1dev"
......@@ -97,7 +95,7 @@ def add_sample_data(data, sample_data, sample_alleles, min_pct, min_abs):
# Sample does not participate in this marker.
continue
allele = sample_alleles[marker]
factors = map(lambda x: 100./x, sample_data[marker, allele])
factors = [100./x for x in sample_data[marker, allele]]
if (marker, allele) not in data:
data[marker, allele] = {}
if sequence not in data[marker, allele]:
......@@ -188,8 +186,8 @@ def compute_stats(filelist, tag_expr, tag_format, allelefile,
"rvariance"]))
for marker, allele in data:
for sequence in data[marker, allele]:
print("\t".join([marker, allele, sequence] + map(
lambda x: str(x) if abs(x) > 0.0000000001 else "0", [
print("\t".join([marker, allele, sequence] + [
str(x) if abs(x) > 0.0000000001 else "0" for x in (
data[marker, allele][sequence][0]["n"],
data[marker, allele][sequence][0]["min"],
data[marker, allele][sequence][0]["max"],
......@@ -198,7 +196,7 @@ def compute_stats(filelist, tag_expr, tag_format, allelefile,
data[marker, allele][sequence][1]["min"],
data[marker, allele][sequence][1]["max"],
data[marker, allele][sequence][1]["mean"],
data[marker, allele][sequence][1]["variance"]])))
data[marker, allele][sequence][1]["variance"])]))
#compute_stats
......@@ -223,7 +221,7 @@ def add_arguments(parser):
"product, as a percentage of the number of samples with a "
"particular true allele (default: %(default)s)")
parser.add_argument('-F', '--sequence-format', metavar="FORMAT",
choices=["raw", "tssv", "allelename"],
choices=("raw", "tssv", "allelename"),
help="convert sequences to the specified format: one of %(choices)s "
"(default: no conversion)")
parser.add_argument('-l', '--library', metavar="LIBRARY",
......
#!/usr/bin/env python
"""
Find dirty samples or recurring contaminating alleles.
"""
import argparse
import sys
#import numpy as np # Only imported when actually running this tool.
from ..lib import get_column_ids, pos_int_arg, add_sample_files_args,\
add_allele_detection_args, map_tags_to_files, nnls,\
ensure_sequence_format, parse_allelelist, load_profiles,\
parse_library
__version__ = "0.1dev"
# Default values for parameters are specified below.
# Default number of results per marker.
# This value can be overridden by the -n command line option.
_DEF_NUM = 5
def get_sample_data(infile, data, annotation_column, library):
    """Add data from infile to data dict as [marker, allele]=reads.

    Reads a tab-separated sample file with (at least) the columns
    'name', 'allele', 'forward', and 'reverse'.  Each row's read counts
    are stored in data as data[marker, allele] = [forward, reverse],
    with the allele converted to raw sequence format.

    Returns a list of (marker, allele) tuples for the rows whose value
    in annotation_column starts with "ALLELE" (i.e., the sample's true
    alleles), or an empty list if annotation_column is None or absent.
    """
    # Get column numbers.
    column_names = infile.readline().rstrip("\r\n").split("\t")
    colid_name, colid_allele, colid_forward, colid_reverse = \
        get_column_ids(column_names, "name", "allele", "forward", "reverse")

    # Also try to get allele column if we have one.
    if annotation_column is not None:
        try:
            colid_annotation = get_column_ids(column_names, annotation_column)
        except Exception:
            # The annotation column is missing from this file; silently
            # fall back to treating the file as unannotated.
            annotation_column = None

    found_alleles = []
    for line in infile:
        line = line.rstrip("\r\n").split("\t")
        marker = line[colid_name]
        allele = ensure_sequence_format(
            line[colid_allele], "raw", library=library, marker=marker)
        if (annotation_column is not None and
                line[colid_annotation].startswith("ALLELE")):
            # Bug fix: list.append takes a single argument; the original
            # two-argument call append(marker, allele) raised TypeError
            # whenever an annotated allele was encountered.
            found_alleles.append((marker, allele))
        data[marker, allele] = [int(line[colid_forward]),
                                int(line[colid_reverse])]
    return found_alleles
#get_sample_data
def add_sample_data(data, sample_data, sample_tag, alleles):
    """Register one sample's reads with each usable marker.

    A marker is usable if every known allele of the sample maps to a
    profile sequence with index below data[marker]["n"]; otherwise the
    marker is skipped for this sample.  For each usable marker, a new
    read-count row (length data[marker]["m"]) is appended to the
    samples_forward/samples_reverse lists, along with the sample's
    genotype (allele indices) and its tag.
    """
    usable_markers = set()
    for marker in alleles:
        profile_seqs = data[marker]["seqs"]
        genotype = []
        for allele in alleles[marker]:
            if allele in profile_seqs:
                idx = profile_seqs.index(allele)
                genotype.append(idx if idx < data[marker]["n"] else -1)
            else:
                genotype.append(-1)
        if -1 in genotype:
            # Don't have a profile for all of this sample's alleles.
            continue
        row_len = data[marker]["m"]
        data[marker]["samples_forward"].append([0] * row_len)
        data[marker]["samples_reverse"].append([0] * row_len)
        data[marker]["genotypes"].append(genotype)
        data[marker]["sample_tags"].append(sample_tag)
        usable_markers.add(marker)

    # Fill the freshly appended rows with this sample's read counts.
    for marker, allele in sample_data:
        if marker not in usable_markers:
            continue
        try:
            idx = data[marker]["seqs"].index(allele)
        except ValueError:
            # Sequence not present in the profile; ignore it.
            continue
        forward_reads, reverse_reads = sample_data[marker, allele]
        data[marker]["samples_forward"][-1][idx] = forward_reads
        data[marker]["samples_reverse"][-1][idx] = reverse_reads
#add_sample_data
def blame(filelist, tag_expr, tag_format, allelefile, annotation_column, mode,
          profilefile, num, seqformat, libfile, marker):
    """Match noise profiles against samples and report contamination.

    Depending on mode, prints per marker either the top num contaminant
    alleles ("common"), the samples with the highest single contaminant
    ("highest"), or the samples with the highest total contamination
    ("dirtiest").
    """
    # NumPy is only needed when this tool actually runs, so import lazily.
    import numpy as np
    library = parse_library(libfile) if libfile else None
    allelelist = {} if allelefile is None \
        else parse_allelelist(allelefile, "raw", library)
    data = load_profiles(profilefile, library)
    if marker:
        # Restrict the analysis to the single requested marker.
        data = {marker: data[marker]} if marker in data else {}
    for marker in data:
        data[marker]["samples_forward"] = []
        data[marker]["samples_reverse"] = []
        data[marker]["genotypes"] = []
        data[marker]["sample_tags"] = []

    tags_to_files = map_tags_to_files(filelist, tag_expr, tag_format)

    # Read sample data.
    for tag in tags_to_files:
        sample_data = {}
        alleles = set()
        for infile in tags_to_files[tag]:
            alleles.update(get_sample_data(
                infile, sample_data, annotation_column, library))
        if tag not in allelelist:
            allelelist[tag] = {}
        # Merge the alleles detected via the annotation column into the
        # (possibly preloaded) allele list for this sample tag.
        for marker, allele in alleles:
            if marker not in allelelist[tag]:
                allelelist[tag][marker] = set()
            allelelist[tag][marker].add(allele)
        # Keep only the markers for which we have noise profiles.
        allelelist[tag] = {marker: allelelist[tag][marker] for marker in data
                           if marker in allelelist[tag]}
        add_sample_data(data, sample_data, tag, allelelist[tag])

    print("\t".join(["marker",
                     "allele" if mode == "common" else "sample",
                     "amount"]))
    for marker in data:
        if not data[marker]["sample_tags"]:
            # No sample could be matched against this marker's profiles.
            continue

        # Estimate which alleles are present in the samples: solve a
        # non-negative least squares fit of the forward+reverse noise
        # profiles against the observed forward+reverse read counts.
        P1 = np.matrix(data[marker]["forward"])
        P2 = np.matrix(data[marker]["reverse"])
        C1 = np.matrix(data[marker]["samples_forward"])
        C2 = np.matrix(data[marker]["samples_reverse"])
        A = nnls(np.hstack([P1, P2]).T, np.hstack([C1, C2]).T).T

        # Zero out the true alleles in each sample and scale the others.
        for i in range(len(data[marker]["genotypes"])):
            indices = data[marker]["genotypes"][i]
            scale = A[i, indices].sum()
            # NOTE(review): if the fit assigns zero weight to all true
            # alleles, scale is 0 and the division below produces
            # inf/nan — confirm this cannot happen with real profiles.
            A[i, indices] = 0
            A[i, :] /= scale

        if mode == "common":
            # The columns with the highest means correspond to the most
            # common contaminants.
            A = A.mean(0).flat
            for i in np.argsort(A)[:-num-1:-1]:  # Print the top N.
                if A[i] == 0:
                    break
                print("\t".join([marker, ensure_sequence_format(
                    data[marker]["seqs"][i], seqformat, library=library,
                    marker=marker), str(A[i])]))
        else:
            # The rows with the highest maxima/sums correspond to the
            # samples with the highest amounts of contaminant/s.
            A = A.max(1).flat if mode == "highest" else A.sum(1).flat
            for i in np.argsort(A)[:-num-1:-1]:  # Print the top N.
                if A[i] == 0:
                    break
                print("\t".join(
                    [marker, data[marker]["sample_tags"][i], str(A[i])]))
#blame
def add_arguments(parser):
    """Register Blame's command line arguments on the given parser."""
    add = parser.add_argument
    add('profiles', metavar="PROFILES",
        type=argparse.FileType('r'),
        help="file containing background noise profiles to match")
    add_sample_files_args(parser)
    add("-m", "--mode", metavar="MODE", default="common",
        choices=("common", "highest", "dirtiest"),
        help="controls what kind of information is printed; 'common' (the "
             "default) prints the top N contaminant alleles per marker, "
             "'highest' prints the top N samples with the highest single "
             "contaminant per marker, and 'dirtiest' prints the top N samples "
             "with the highest total amount of contaminants per marker")
    add_allele_detection_args(parser)
    add('-n', '--num', metavar="N", type=pos_int_arg,
        default=_DEF_NUM,
        help="print the top N results per marker (default: %(default)s)")
    add('-F', '--sequence-format', metavar="FORMAT",
        choices=("raw", "tssv", "allelename"), default="raw",
        help="convert sequences to the specified format: one of %(choices)s "
             "(default: %(default)s)")
    add('-l', '--library', metavar="LIBRARY",
        type=argparse.FileType('r'),
        help="library file for sequence format conversion")
    add('-M', '--marker', metavar="MARKER",
        help="work only on MARKER")
#add_arguments
def run(args):
    """Validate the input source and launch the blame analysis."""
    interactive_stdin = args.filelist == [sys.stdin] and sys.stdin.isatty()
    if interactive_stdin:
        # Refuse to hang waiting for data typed on a terminal.
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
    blame(args.filelist, args.tag_expr, args.tag_format, args.allelelist,
          args.annotation_column, args.mode, args.profiles, args.num,
          args.sequence_format, args.library, args.marker)
#run
def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(description=__doc__)
    try:
        add_arguments(parser)
        args = parser.parse_args()
        run(args)
    except OSError as error:
        # Report file-related problems through argparse's error channel.
        parser.error(error)
#main


if __name__ == "__main__":
    main()
......@@ -112,8 +112,8 @@ def convert_library(infile, outfile, aliases=False):
if len(suffixes):
middle = middle[:-len(suffixes)]
if unmatched:
middle = map(lambda x: (x[0], "0", x[2]), middle) + \
map(lambda x: (x, "0", "1"), unmatched)
middle = [(x[0], "0", x[2]) for x in middle] + \
[(x, "0", "1") for x in unmatched]
# Add prefixes and suffixes of aliases.
if marker in marker_aliases:
......@@ -137,7 +137,7 @@ def convert_library(infile, outfile, aliases=False):
outfile.write("%s\t%s\t%s\t%s\n" % (
marker, flanks[0], flanks[1],
" ".join(map(lambda x: "%s %s %s" % x, pattern))))
" ".join("%s %s %s" % x for x in pattern)))
else:
# TSSV -> FDSTools
......@@ -194,8 +194,8 @@ def convert_library(infile, outfile, aliases=False):
ini.set("flanks", fmt%marker, ", ".join(library["flanks"][marker]))
for marker in sorted(library["regex"]):
blocks = pattern_reverse.findall(library["regex"][marker].pattern)
ini.set("repeat", fmt%marker, " ".join(map(
lambda x: "%s %s %s" % x, blocks)))
ini.set("repeat", fmt % marker,
" ".join("%s %s %s" % x for x in blocks))
# Try to infer block length from the regular expression.
length_counts = {0: 0}
......
......@@ -5,7 +5,8 @@ Convert between raw sequences, TSSV-style sequences, and allele names.
import argparse
import sys
from ..lib import get_column_ids, ensure_sequence_format, parse_library
from ..lib import get_column_ids, ensure_sequence_format, parse_library,\
reverse_complement
__version__ = "0.1dev"
......@@ -28,8 +29,11 @@ _DEF_COLNAME_ALLELE_OUT = "allele"
def convert_sequences(infile, outfile, to_format, libfile=None,
fixed_marker=None, colname_marker=_DEF_COLNAME_MARKER,
colname_allele=_DEF_COLNAME_ALLELE,
colname_allele_out=_DEF_COLNAME_ALLELE_OUT):
colname_allele_out=_DEF_COLNAME_ALLELE_OUT,
libfile2=None, revcomp_markers=[]):
libfile = libfile if libfile is not None else libfile2
library = parse_library(libfile) if libfile is not None else None
library2 = parse_library(libfile2) if libfile2 is not None else library
column_names = infile.readline().rstrip("\r\n").split("\t")
colid_allele = get_column_ids(column_names, colname_allele)
if library is None:
......@@ -50,15 +54,26 @@ def convert_sequences(infile, outfile, to_format, libfile=None,
if colid_allele_out == -1:
line.append("")
marker = line[colid_marker] if fixed_marker is None else fixed_marker
seq = line[colid_allele]
if library2 != library:
seq = ensure_sequence_format(
seq, "raw", marker=marker, library=library)
if marker in revcomp_markers:
seq = reverse_complement(seq)
# TODO: The current implementation assumes identical
# flanking sequences. Introduce means to shift flanking
# sequence in/out of prefix and/or suffix.
line[colid_allele_out] = ensure_sequence_format(
line[colid_allele], to_format, marker=marker, library=library)
seq, to_format, marker=marker, library=library2)
outfile.write("\t".join(line) + "\n")
#convert_sequences
def add_arguments(parser):
parser.add_argument('format', metavar="FORMAT",
choices=["raw", "tssv", "allelename"],
choices=("raw", "tssv", "allelename"),
help="the format to convert to: one of %(choices)s")
parser.add_argument('infile', nargs='?', metavar="IN", default=sys.stdin,
type=argparse.FileType('r'),
......@@ -84,6 +99,15 @@ def add_arguments(parser):
parser.add_argument('-l', '--library', metavar="LIBRARY",
type=argparse.FileType('r'),
help="library file for sequence format conversion")
parser.add_argument('-L', '--library2', metavar="LIBRARY",
type=argparse.FileType('r'),
help="second library file to use for output; if specified, allele "
"names can be conveniently updated to fit this new library file")
parser.add_argument('-R', '--reverse-complement', metavar="MARKER",
nargs="+", default=[],
help="to be used togethwer with -L/--library2; specify the markers "
"for which the sequences are reverse-complemented in the new "
"library")
#add_arguments
......@@ -93,7 +117,8 @@ def run(args):
"of another program")
convert_sequences(args.infile, args.outfile, args.format, args.library,
args.marker, args.marker_column, args.allele_column,
args.output_column)
args.output_column, args.library2,
args.reverse_complement)
#run
......
......@@ -105,8 +105,8 @@ def load_data(infile, colname_annotation=_DEF_COLNAME, library=None):
columns[colid_allele] = PAT_TSSV_BLOCK.findall(columns[colid_allele])
# String to integer conversion...
columns[colid_allele] = map(
lambda x: [x[0], int(x[1])], columns[colid_allele])
columns[colid_allele] = [[x[0], int(x[1])]
for x in columns[colid_allele]]
columns[colid_total] = int(columns[colid_total])
# Repeat unit deduplication (data may contain stuff like
......@@ -324,16 +324,16 @@ def annotate_alleles(infile, outfile, stutter, min_reads=_DEF_MIN_READS,