blame.py 6.36 KB
Newer Older
jhoogenboom's avatar
jhoogenboom committed
1
2
#!/usr/bin/env python
"""
3
Find dirty reference samples or recurring contaminating alleles.
jhoogenboom's avatar
jhoogenboom committed
4
5
6
7
8
9
10
11
12

Match background noise profiles (as obtained from bgestimate) to the
database of reference samples from which they were obtained to detect
samples with unexpectedly high additional alleles.  The 'amount' column
in the output lists the number of noise/contaminant reads per true
allelic read.

(The tool is called 'blame' because with default settings it might
produce a DNA profile of an untidy laboratory worker.)
jhoogenboom's avatar
jhoogenboom committed
13
14
15
16
"""
import argparse
#import numpy as np  # Only imported when actually running this tool.

17
18
19
from ..lib import pos_int_arg, add_input_output_args, get_input_output_files,\
                  add_allele_detection_args, nnls, ensure_sequence_format,\
                  parse_allelelist, load_profiles, add_sequence_format_args,\
20
                  get_sample_data
jhoogenboom's avatar
jhoogenboom committed
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61

__version__ = "0.1dev"


# Default values for parameters are specified below.

# Default number of results per marker.
# This value can be overridden by the -n command line option.
_DEF_NUM = 5


def add_sample_data(data, sample_data, sample_tag, alleles):
    # Add this sample to the data.
    use_markers = set()
    for marker in alleles:
        genotype = [data[marker]["seqs"].index(x)
                    if x in data[marker]["seqs"] and
                        data[marker]["seqs"].index(x) < data[marker]["n"]
                    else -1
                    for x in alleles[marker]]
        if -1 in genotype:
            # Don't have a profile for all of this sample's alleles.
            continue
        data[marker]["samples_forward"].append([0] * data[marker]["m"])
        data[marker]["samples_reverse"].append([0] * data[marker]["m"])
        data[marker]["genotypes"].append(genotype)
        data[marker]["sample_tags"].append(sample_tag)
        use_markers.add(marker)

    for marker, allele in sample_data:
        if marker not in use_markers:
            continue
        try:
            i = data[marker]["seqs"].index(allele)
        except ValueError:
            continue
        data[marker]["samples_forward"][-1][i] = sample_data[marker, allele][0]
        data[marker]["samples_reverse"][-1][i] = sample_data[marker, allele][1]
#add_sample_data


62
def blame(samples_in, outfile, allelefile, annotation_column, mode,
63
          profilefile, num, seqformat, library, marker):
jhoogenboom's avatar
jhoogenboom committed
64
65
66
67
68
69
70
71
72
73
74
75
    allelelist = {} if allelefile is None \
                    else parse_allelelist(allelefile, "raw", library)
    data = load_profiles(profilefile, library)
    if marker:
        data = {marker: data[marker]} if marker in data else {}
    for marker in data:
        data[marker]["samples_forward"] = []
        data[marker]["samples_reverse"] = []
        data[marker]["genotypes"] = []
        data[marker]["sample_tags"] = []

    # Read sample data.
76
    get_sample_data(
77
        samples_in,
78
79
80
81
        lambda tag, sample_data: add_sample_data(
            data, sample_data, tag,
            {m: allelelist[tag][m] for m in data if m in allelelist[tag]}),
        allelelist, annotation_column, "raw", library)
jhoogenboom's avatar
jhoogenboom committed
82

83
    outfile.write("\t".join(["marker",
jhoogenboom's avatar
jhoogenboom committed
84
                     "allele" if mode == "common" else "sample",
85
                     "amount"]) + "\n")
jhoogenboom's avatar
jhoogenboom committed
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
    for marker in data:
        if not data[marker]["sample_tags"]:
            continue

        # Estimate which alleles are present in the samples.
        P1 = np.matrix(data[marker]["forward"])
        P2 = np.matrix(data[marker]["reverse"])
        C1 = np.matrix(data[marker]["samples_forward"])
        C2 = np.matrix(data[marker]["samples_reverse"])
        A = nnls(np.hstack([P1, P2]).T, np.hstack([C1, C2]).T).T

        # Zero out the true alleles in each sample and scale the others.
        for i in range(len(data[marker]["genotypes"])):
            indices = data[marker]["genotypes"][i]
            scale = A[i, indices].sum()
            A[i, indices] = 0
            A[i, :] /= scale

        if mode == "common":
            # The columns with the highest means correspond to the most
            # common contaminants.
            A = A.mean(0).flat
            for i in np.argsort(A)[:-num-1:-1]:  # Print the top N.
                if A[i] == 0:
                    break
111
                outfile.write("\t".join([marker, ensure_sequence_format(
jhoogenboom's avatar
jhoogenboom committed
112
                    data[marker]["seqs"][i], seqformat, library=library,
113
                    marker=marker), str(A[i])]) + "\n")
jhoogenboom's avatar
jhoogenboom committed
114
115
116
117
118
119
120
        else:
            # The rows with the highest maxima/sums correspond to the
            # samples with the highest amounts of contaminant/s.
            A = A.max(1).flat if mode == "highest" else A.sum(1).flat
            for i in np.argsort(A)[:-num-1:-1]:  # Print the top N.
                if A[i] == 0:
                    break
121
122
                outfile.write("\t".join(
                    [marker, data[marker]["sample_tags"][i], str(A[i])])+"\n")
jhoogenboom's avatar
jhoogenboom committed
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#blame


def add_arguments(parser):
    parser.add_argument('profiles', metavar="PROFILES",
        type=argparse.FileType('r'),
        help="file containing background noise profiles to match")
    parser.add_argument("-m", "--mode", metavar="MODE", default="common",
        choices=("common", "highest", "dirtiest"),
        help="controls what kind of information is printed; 'common' (the "
             "default) prints the top N contaminant alleles per marker, "
             "'highest' prints the top N samples with the highest single "
             "contaminant per marker, and 'dirtiest' prints the top N samples "
             "with the highest total amount of contaminants per marker")
137
    add_input_output_args(parser)
jhoogenboom's avatar
jhoogenboom committed
138
    add_allele_detection_args(parser)
jhoogenboom's avatar
jhoogenboom committed
139
140
    filtergroup = parser.add_argument_group("filtering options")
    filtergroup.add_argument('-n', '--num', metavar="N", type=pos_int_arg,
jhoogenboom's avatar
jhoogenboom committed
141
142
        default=_DEF_NUM,
        help="print the top N results per marker (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
143
    filtergroup.add_argument('-M', '--marker', metavar="MARKER",
jhoogenboom's avatar
jhoogenboom committed
144
        help="work only on MARKER")
jhoogenboom's avatar
jhoogenboom committed
145
    add_sequence_format_args(parser, "raw")
jhoogenboom's avatar
jhoogenboom committed
146
147
148
149
#add_arguments


def run(args):
jhoogenboom's avatar
jhoogenboom committed
150
151
152
153
    # Import numpy now.
    import numpy as np
    global np

154
155
    files = get_input_output_files(args)
    if not files:
jhoogenboom's avatar
jhoogenboom committed
156
157
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
158
159
160
    blame(files[0], files[1], args.allelelist, args.annotation_column,
          args.mode, args.profiles, args.num, args.sequence_format,
          args.library, args.marker)
jhoogenboom's avatar
jhoogenboom committed
161
#run