bgcorrect.py 12.7 KB
Newer Older
1
#!/usr/bin/env python
2
3

#
4
# Copyright (C) 2017 Jerry Hoogenboom
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#
# This file is part of FDSTools, data analysis tools for Next
# Generation Sequencing of forensic DNA markers.
#
# FDSTools is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# FDSTools is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with FDSTools.  If not, see <http://www.gnu.org/licenses/>.
#

23
"""
jhoogenboom's avatar
jhoogenboom committed
24
25
26
Match background noise profiles (obtained from e.g., bgestimate) to
samples.

27
28
Eleven new columns are added to the output giving, for each sequence,
the number of reads attributable to noise from other sequences (_noise
jhoogenboom's avatar
jhoogenboom committed
29
columns) and the number of noise reads caused by the prescense of this
jhoogenboom's avatar
jhoogenboom committed
30
31
sequence (_add columns), as well as the resulting number of reads after
correction (_corrected columns: original minus _noise plus _add).
32

33
34
35
36
37
38
39
40
41
42
43
The correction_flags column contains one of the following values:
'not_corrected', no background noise profile was available for this
marker; 'not_in_ref_db', the sequence was not present in the noise
profiles given; 'corrected_as_background_only', the sequence was present
in the noise profiles given, but only as noise and not as genuine
allele; 'corrected_bgpredict', the sequence was present in the noise
profiles as a genuine allele, but its noise profile consists entirely of
predictions as opposed to direct observations;
'corrected_bgestimate'/'corrected_bghomstats', the sequence was present
in the noise profiles as a genuine allele and at least part of its noise
profile was based on direct observations.
44
45
46

Finally, the weight column gives the number of times that the noise
profile of that allele fitted in the sample.
47
"""
48
import argparse, sys
49
50
#import numpy as np  # Only imported when actually running this tool.

51
from errno import EPIPE
52
53
from ..lib import load_profiles, ensure_sequence_format, get_column_ids, \
                  nnls, add_sequence_format_args, SEQ_SPECIAL_VALUES, \
54
                  add_input_output_args, get_input_output_files
55

56
__version__ = "1.0.2"
57
58
59
60
61
62
63
64


def get_sample_data(infile, convert_to_raw=False, library=None):
    """
    Read data from the given file handle, corresponding to a single
    sample, and fill a dict with all sequences in the sample.
    """
    column_names = infile.readline().rstrip("\r\n").split("\t")
65
66
    if column_names == [""]:
        return None, None  # Empty file.
67
68
69
70
71
72
    column_names.append("forward_noise")
    column_names.append("reverse_noise")
    column_names.append("total_noise")
    column_names.append("forward_add")
    column_names.append("reverse_add")
    column_names.append("total_add")
jhoogenboom's avatar
jhoogenboom committed
73
74
75
    column_names.append("forward_corrected")
    column_names.append("reverse_corrected")
    column_names.append("total_corrected")
76
    column_names.append("correction_flags")
77
    column_names.append("weight")
78
    colid_marker, colid_sequence, colid_forward, colid_reverse =get_column_ids(
79
        column_names, "marker", "sequence", "forward", "reverse")
80
81
82
    data = {}
    for line in infile:
        cols = line.rstrip("\r\n").split("\t")
83
        marker = cols[colid_marker]
84
        if convert_to_raw:
85
86
            cols[colid_sequence] = ensure_sequence_format(
                cols[colid_sequence], "raw", library=library, marker=marker)
87
88
89
90
91
92
93
94
        cols[colid_forward] = int(cols[colid_forward])
        cols[colid_reverse] = int(cols[colid_reverse])
        cols.append(0)
        cols.append(0)
        cols.append(0)
        cols.append(0)
        cols.append(0)
        cols.append(0)
95
96
97
        cols.append(cols[colid_forward])
        cols.append(cols[colid_reverse])
        cols.append(cols[colid_forward] + cols[colid_reverse])
98
        cols.append("not_corrected")
99
        cols.append((cols[colid_forward] + cols[colid_reverse]) / 100.)
100
101
102
103
104
105
106
107
108
        if marker not in data:
            data[marker] = []
        data[marker].append(cols)
    return column_names, data
#get_sample_data


def match_profile(column_names, data, profile, convert_to_raw, library,
                  marker):
109
    (colid_marker, colid_sequence, colid_forward, colid_reverse, colid_total,
110
     colid_forward_noise, colid_reverse_noise, colid_total_noise,
jhoogenboom's avatar
jhoogenboom committed
111
112
     colid_forward_add, colid_reverse_add, colid_total_add,
     colid_forward_corrected, colid_reverse_corrected,
113
114
115
116
117
     colid_total_corrected, colid_correction_flags, colid_weight) = \
        get_column_ids(column_names, "marker", "sequence", "forward",
        "reverse", "total", "forward_noise", "reverse_noise", "total_noise",
        "forward_add", "reverse_add", "total_add", "forward_corrected",
        "reverse_corrected", "total_corrected", "correction_flags", "weight")
118
119
120
121
122
123
124
125
126
127

    # Enter profiles into P.
    P1 = np.matrix(profile["forward"])
    P2 = np.matrix(profile["reverse"])

    # Enter sample into C.
    seqs = []
    C1 = np.matrix(np.zeros([1, profile["m"]]))
    C2 = np.matrix(np.zeros([1, profile["m"]]))
    for line in data:
128
129
        if line[colid_sequence] in SEQ_SPECIAL_VALUES:
            continue
130
        if convert_to_raw:
131
            sequence = ensure_sequence_format(line[colid_sequence], "raw",
132
133
                                            library=library, marker=marker)
        else:
134
135
            sequence = line[colid_sequence]
        seqs.append(sequence)
136
        try:
137
            i = profile["seqs"].index(sequence)
138
139
140
141
142
143
144
        except ValueError:
            # Note: Not adding any new sequences to the profile, since
            # they will just be zeroes and have no effect on the result.
            continue
        C1[0, i] = line[colid_forward]
        C2[0, i] = line[colid_reverse]

145
146
147
148
    # Stop if this sample has no explicit data for this marker.
    if not seqs:
        return

149
    # Compute corrected read counts.
jhoogenboom's avatar
jhoogenboom committed
150
    A = nnls(np.hstack([P1, P2]).T, np.hstack([C1, C2]).T).T
151
152
    np.fill_diagonal(P1, 0)
    np.fill_diagonal(P2, 0)
jhoogenboom's avatar
jhoogenboom committed
153
154
    forward_noise = A * P1
    reverse_noise = A * P2
155
156
    forward_add = np.multiply(A, P1.sum(1).T)
    reverse_add = np.multiply(A, P2.sum(1).T)
157

158
    # Round values to 3 decimal positions.
159
160
161
162
163
    A.round(3, A)
    forward_noise.round(3, forward_noise)
    reverse_noise.round(3, reverse_noise)
    forward_add.round(3, forward_add)
    reverse_add.round(3, reverse_add)
164

165
166
    j = 0
    for line in data:
167
168
        if line[colid_sequence] in SEQ_SPECIAL_VALUES:
            continue
169
170
171
172
        j += 1
        try:
            i = profile["seqs"].index(seqs[j-1])
        except ValueError:
173
            line[colid_correction_flags] = "not_in_ref_db"
174
175
176
177
            continue
        line[colid_forward_noise] = forward_noise[0, i]
        line[colid_reverse_noise] = reverse_noise[0, i]
        line[colid_total_noise] = forward_noise[0, i] + reverse_noise[0, i]
jhoogenboom's avatar
jhoogenboom committed
178
179
180
        line[colid_forward_corrected] -= line[colid_forward_noise]
        line[colid_reverse_corrected] -= line[colid_reverse_noise]
        line[colid_total_corrected] -= line[colid_total_noise]
181
182
183
184
        if i < profile["n"]:
            line[colid_forward_add] = forward_add[0, i]
            line[colid_reverse_add] = reverse_add[0, i]
            line[colid_total_add] = forward_add[0, i] + reverse_add[0, i]
jhoogenboom's avatar
jhoogenboom committed
185
186
187
            line[colid_forward_corrected] += line[colid_forward_add]
            line[colid_reverse_corrected] += line[colid_reverse_add]
            line[colid_total_corrected] += line[colid_total_add]
188
189
            if "bgestimate" in profile["tool"][i]:
                line[colid_correction_flags] = "corrected_bgestimate"
190
            elif "bghomstats" in profile["tool"][i]:
191
192
193
                line[colid_correction_flags] = "corrected_bghomstats"
            elif "bgpredict" in profile["tool"][i]:
                line[colid_correction_flags] = "corrected_bgpredict"
194
            else:
195
                line[colid_correction_flags] = "corrected"
196
            line[colid_weight] = A[0, i]
197
        else:
198
            line[colid_correction_flags] = "corrected_as_background_only"
199
            line[colid_weight] = line[colid_total_corrected] / 100.
200
201
202
203
204
205
206
207
208
209

    # Add sequences that are in the profile but not in the sample.
    for i in range(profile["m"]):
        if profile["seqs"][i] in seqs:
            continue
        amount = forward_noise[0, i] + reverse_noise[0, i]
        if i < profile["n"]:
            amount += forward_add[0, i] + reverse_add[0, i]
        if amount > 0:
            line = [""] * len(column_names)
210
211
            line[colid_marker] = marker
            line[colid_sequence] = profile["seqs"][i]
212
213
214
215
216
217
            line[colid_forward] = 0
            line[colid_reverse] = 0
            line[colid_total] = 0
            line[colid_forward_noise] = forward_noise[0, i]
            line[colid_reverse_noise] = reverse_noise[0, i]
            line[colid_total_noise] = forward_noise[0, i] + reverse_noise[0, i]
jhoogenboom's avatar
jhoogenboom committed
218
219
220
            line[colid_forward_corrected] = -line[colid_forward_noise]
            line[colid_reverse_corrected] = -line[colid_reverse_noise]
            line[colid_total_corrected] = -line[colid_total_noise]
221
222
223
224
            if i < profile["n"]:
                line[colid_forward_add] = forward_add[0, i]
                line[colid_reverse_add] = reverse_add[0, i]
                line[colid_total_add] = forward_add[0, i] + reverse_add[0, i]
jhoogenboom's avatar
jhoogenboom committed
225
226
227
                line[colid_forward_corrected] += line[colid_forward_add]
                line[colid_reverse_corrected] += line[colid_reverse_add]
                line[colid_total_corrected] += line[colid_total_add]
228
229
230
231
232
233
234
235
                if "bgestimate" in profile["tool"][i]:
                    line[colid_correction_flags] = "corrected_bgestimate"
                elif "bghomstats" in profile["tool"][i]:
                    line[colid_correction_flags] = "corrected_bghomstats"
                elif "bgpredict" in profile["tool"][i]:
                    line[colid_correction_flags] = "corrected_bgpredict"
                else:
                    line[colid_correction_flags] = "corrected"
236
                line[colid_weight] = A[0, i]
237
238
239
240
            else:
                line[colid_forward_add] = 0
                line[colid_reverse_add] = 0
                line[colid_total_add] = 0
241
                line[colid_correction_flags] = "corrected_as_background_only"
242
                line[colid_weight] = line[colid_total_corrected] / 100.
243
244
245
246
            data.append(line)
#match_profile


247
def match_profiles(infile, outfile, profiles, library, seqformat):
248
249
    column_names, data = get_sample_data(
        infile, convert_to_raw=seqformat=="raw", library=library)
250
251
    if column_names is None:
        return  # Empty file.
252
    colid_sequence = get_column_ids(column_names, "sequence")
253
254
255
256
257
258
259

    outfile.write("\t".join(column_names) + "\n")
    for marker in data:
        if marker in profiles:
            match_profile(column_names, data[marker], profiles[marker],
                          seqformat!="raw", library, marker)
        for line in data[marker]:
260
            if seqformat is not None and seqformat != "raw":
261
262
263
                line[colid_sequence] = ensure_sequence_format(
                    line[colid_sequence], seqformat, library=library,
                    marker=marker)
264
265
266
267
268
269
270
271
            outfile.write("\t".join(map(str, line)) + "\n")
#match_profiles


def add_arguments(parser):
    parser.add_argument('profiles', metavar="PROFILES",
        type=argparse.FileType('r'),
        help="file containing background noise profiles to match")
272
    add_input_output_args(parser, True, True, False)
jhoogenboom's avatar
jhoogenboom committed
273
274
    filtergroup = parser.add_argument_group("filtering options")
    filtergroup.add_argument('-M', '--marker', metavar="MARKER",
275
        help="work only on MARKER")
jhoogenboom's avatar
jhoogenboom committed
276
    add_sequence_format_args(parser)
277
278
279
280
#add_arguments


def run(args):
jhoogenboom's avatar
jhoogenboom committed
281
282
    # Import numpy now.
    global np
283
    import numpy as np
jhoogenboom's avatar
jhoogenboom committed
284

285
286
    gen = get_input_output_files(args, True, True)
    if not gen:
287
288
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
289

290
    # Read profiles once.
291
    profiles = load_profiles(args.profiles, args.library)
292
293
294
295
296
297
    if args.marker:
        profiles = {args.marker: profiles[args.marker]} \
                   if args.marker in profiles else {}

    for tag, infiles, outfile in gen:
        # TODO: Aggregate data from all infiles of each sample.
jhoogenboom's avatar
jhoogenboom committed
298
299
300
        if len(infiles) > 1:
            raise ValueError(
                "multiple input files for sample '%s' specified " % tag)
301
302
303
304
305
306
307
308
309
310
        try:
            infile = sys.stdin if infiles[0] == "-" else open(infiles[0], "r")
            match_profiles(infile, outfile, profiles, args.library,
                           args.sequence_format)
            if infile != sys.stdin:
                infile.close()
        except IOError as e:
            if e.errno == EPIPE:
                continue
            raise
311
#run