bgcorrect.py 12.3 KB
Newer Older
1
#!/usr/bin/env python
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22

#
# Copyright (C) 2016 Jerry Hoogenboom
#
# This file is part of FDSTools, data analysis tools for Next
# Generation Sequencing of forensic DNA markers.
#
# FDSTools is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# FDSTools is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with FDSTools.  If not, see <http://www.gnu.org/licenses/>.
#

23
"""
jhoogenboom's avatar
jhoogenboom committed
24 25 26
Match background noise profiles (obtained from e.g., bgestimate) to
samples.

27
Ten new columns are added to the output giving, for each sequence, the
jhoogenboom's avatar
jhoogenboom committed
28 29
number of reads attributable to noise from other sequences (_noise
columns) and the number of noise reads caused by the prescense of this
jhoogenboom's avatar
jhoogenboom committed
30 31
sequence (_add columns), as well as the resulting number of reads after
correction (_corrected columns: original minus _noise plus _add).
32

33 34 35 36 37 38 39 40 41 42 43
The correction_flags column contains one of the following values:
'not_corrected', no background noise profile was available for this
marker; 'not_in_ref_db', the sequence was not present in the noise
profiles given; 'corrected_as_background_only', the sequence was present
in the noise profiles given, but only as noise and not as genuine
allele; 'corrected_bgpredict', the sequence was present in the noise
profiles as a genuine allele, but its noise profile consists entirely of
predictions as opposed to direct observations;
'corrected_bgestimate'/'corrected_bghomstats', the sequence was present
in the noise profiles as a genuine allele and at least part of its noise
profile was based on direct observations.
44
"""
45
import argparse, sys
46 47
#import numpy as np  # Only imported when actually running this tool.

48 49
from ..lib import load_profiles, ensure_sequence_format, get_column_ids, \
                  nnls, add_sequence_format_args, SEQ_SPECIAL_VALUES, \
50
                  add_input_output_args, get_input_output_files
51

52
__version__ = "1.0.1"
53 54 55 56 57 58 59 60 61 62 63 64 65 66


def get_sample_data(infile, convert_to_raw=False, library=None):
    """
    Read data from the given file handle, corresponding to a single
    sample, and fill a dict with all sequences in the sample.
    """
    column_names = infile.readline().rstrip("\r\n").split("\t")
    column_names.append("forward_noise")
    column_names.append("reverse_noise")
    column_names.append("total_noise")
    column_names.append("forward_add")
    column_names.append("reverse_add")
    column_names.append("total_add")
jhoogenboom's avatar
jhoogenboom committed
67 68 69
    column_names.append("forward_corrected")
    column_names.append("reverse_corrected")
    column_names.append("total_corrected")
70
    column_names.append("correction_flags")
71
    column_names.append("weight")
72
    colid_marker, colid_sequence, colid_forward, colid_reverse =get_column_ids(
73
        column_names, "marker", "sequence", "forward", "reverse")
74 75 76
    data = {}
    for line in infile:
        cols = line.rstrip("\r\n").split("\t")
77
        marker = cols[colid_marker]
78
        if convert_to_raw:
79 80
            cols[colid_sequence] = ensure_sequence_format(
                cols[colid_sequence], "raw", library=library, marker=marker)
81 82 83 84 85 86 87 88
        cols[colid_forward] = int(cols[colid_forward])
        cols[colid_reverse] = int(cols[colid_reverse])
        cols.append(0)
        cols.append(0)
        cols.append(0)
        cols.append(0)
        cols.append(0)
        cols.append(0)
89 90 91
        cols.append(cols[colid_forward])
        cols.append(cols[colid_reverse])
        cols.append(cols[colid_forward] + cols[colid_reverse])
92
        cols.append("not_corrected")
93
        cols.append((cols[colid_forward] + cols[colid_reverse]) / 100.)
94 95 96 97 98 99 100 101 102
        if marker not in data:
            data[marker] = []
        data[marker].append(cols)
    return column_names, data
#get_sample_data


def match_profile(column_names, data, profile, convert_to_raw, library,
                  marker):
103
    (colid_marker, colid_sequence, colid_forward, colid_reverse, colid_total,
104
     colid_forward_noise, colid_reverse_noise, colid_total_noise,
jhoogenboom's avatar
jhoogenboom committed
105 106
     colid_forward_add, colid_reverse_add, colid_total_add,
     colid_forward_corrected, colid_reverse_corrected,
107 108 109 110 111
     colid_total_corrected, colid_correction_flags, colid_weight) = \
        get_column_ids(column_names, "marker", "sequence", "forward",
        "reverse", "total", "forward_noise", "reverse_noise", "total_noise",
        "forward_add", "reverse_add", "total_add", "forward_corrected",
        "reverse_corrected", "total_corrected", "correction_flags", "weight")
112 113 114 115 116 117 118 119 120 121

    # Enter profiles into P.
    P1 = np.matrix(profile["forward"])
    P2 = np.matrix(profile["reverse"])

    # Enter sample into C.
    seqs = []
    C1 = np.matrix(np.zeros([1, profile["m"]]))
    C2 = np.matrix(np.zeros([1, profile["m"]]))
    for line in data:
122 123
        if line[colid_sequence] in SEQ_SPECIAL_VALUES:
            continue
124
        if convert_to_raw:
125
            sequence = ensure_sequence_format(line[colid_sequence], "raw",
126 127
                                            library=library, marker=marker)
        else:
128 129
            sequence = line[colid_sequence]
        seqs.append(sequence)
130
        try:
131
            i = profile["seqs"].index(sequence)
132 133 134 135 136 137 138
        except ValueError:
            # Note: Not adding any new sequences to the profile, since
            # they will just be zeroes and have no effect on the result.
            continue
        C1[0, i] = line[colid_forward]
        C2[0, i] = line[colid_reverse]

139 140 141 142
    # Stop if this sample has no explicit data for this marker.
    if not seqs:
        return

143
    # Compute corrected read counts.
jhoogenboom's avatar
jhoogenboom committed
144
    A = nnls(np.hstack([P1, P2]).T, np.hstack([C1, C2]).T).T
145 146
    np.fill_diagonal(P1, 0)
    np.fill_diagonal(P2, 0)
jhoogenboom's avatar
jhoogenboom committed
147 148
    forward_noise = A * P1
    reverse_noise = A * P2
149 150
    forward_add = np.multiply(A, P1.sum(1).T)
    reverse_add = np.multiply(A, P2.sum(1).T)
151

152
    # Round values to 3 decimal positions.
153 154 155 156 157
    A.round(3, A)
    forward_noise.round(3, forward_noise)
    reverse_noise.round(3, reverse_noise)
    forward_add.round(3, forward_add)
    reverse_add.round(3, reverse_add)
158

159 160
    j = 0
    for line in data:
161 162
        if line[colid_sequence] in SEQ_SPECIAL_VALUES:
            continue
163 164 165 166
        j += 1
        try:
            i = profile["seqs"].index(seqs[j-1])
        except ValueError:
167
            line[colid_correction_flags] = "not_in_ref_db"
168 169 170 171
            continue
        line[colid_forward_noise] = forward_noise[0, i]
        line[colid_reverse_noise] = reverse_noise[0, i]
        line[colid_total_noise] = forward_noise[0, i] + reverse_noise[0, i]
jhoogenboom's avatar
jhoogenboom committed
172 173 174
        line[colid_forward_corrected] -= line[colid_forward_noise]
        line[colid_reverse_corrected] -= line[colid_reverse_noise]
        line[colid_total_corrected] -= line[colid_total_noise]
175 176 177 178
        if i < profile["n"]:
            line[colid_forward_add] = forward_add[0, i]
            line[colid_reverse_add] = reverse_add[0, i]
            line[colid_total_add] = forward_add[0, i] + reverse_add[0, i]
jhoogenboom's avatar
jhoogenboom committed
179 180 181
            line[colid_forward_corrected] += line[colid_forward_add]
            line[colid_reverse_corrected] += line[colid_reverse_add]
            line[colid_total_corrected] += line[colid_total_add]
182 183
            if "bgestimate" in profile["tool"][i]:
                line[colid_correction_flags] = "corrected_bgestimate"
184
            elif "bghomstats" in profile["tool"][i]:
185 186 187
                line[colid_correction_flags] = "corrected_bghomstats"
            elif "bgpredict" in profile["tool"][i]:
                line[colid_correction_flags] = "corrected_bgpredict"
188
            else:
189
                line[colid_correction_flags] = "corrected"
190
            line[colid_weight] = A[0, i]
191
        else:
192
            line[colid_correction_flags] = "corrected_as_background_only"
193
            line[colid_weight] = line[colid_total_corrected] / 100.
194 195 196 197 198 199 200 201 202 203

    # Add sequences that are in the profile but not in the sample.
    for i in range(profile["m"]):
        if profile["seqs"][i] in seqs:
            continue
        amount = forward_noise[0, i] + reverse_noise[0, i]
        if i < profile["n"]:
            amount += forward_add[0, i] + reverse_add[0, i]
        if amount > 0:
            line = [""] * len(column_names)
204 205
            line[colid_marker] = marker
            line[colid_sequence] = profile["seqs"][i]
206 207 208 209 210 211
            line[colid_forward] = 0
            line[colid_reverse] = 0
            line[colid_total] = 0
            line[colid_forward_noise] = forward_noise[0, i]
            line[colid_reverse_noise] = reverse_noise[0, i]
            line[colid_total_noise] = forward_noise[0, i] + reverse_noise[0, i]
jhoogenboom's avatar
jhoogenboom committed
212 213 214
            line[colid_forward_corrected] = -line[colid_forward_noise]
            line[colid_reverse_corrected] = -line[colid_reverse_noise]
            line[colid_total_corrected] = -line[colid_total_noise]
215 216 217 218
            if i < profile["n"]:
                line[colid_forward_add] = forward_add[0, i]
                line[colid_reverse_add] = reverse_add[0, i]
                line[colid_total_add] = forward_add[0, i] + reverse_add[0, i]
jhoogenboom's avatar
jhoogenboom committed
219 220 221
                line[colid_forward_corrected] += line[colid_forward_add]
                line[colid_reverse_corrected] += line[colid_reverse_add]
                line[colid_total_corrected] += line[colid_total_add]
222 223 224 225 226 227 228 229
                if "bgestimate" in profile["tool"][i]:
                    line[colid_correction_flags] = "corrected_bgestimate"
                elif "bghomstats" in profile["tool"][i]:
                    line[colid_correction_flags] = "corrected_bghomstats"
                elif "bgpredict" in profile["tool"][i]:
                    line[colid_correction_flags] = "corrected_bgpredict"
                else:
                    line[colid_correction_flags] = "corrected"
230
                line[colid_weight] = A[0, i]
231 232 233 234
            else:
                line[colid_forward_add] = 0
                line[colid_reverse_add] = 0
                line[colid_total_add] = 0
235
                line[colid_correction_flags] = "corrected_as_background_only"
236
                line[colid_weight] = line[colid_total_corrected] / 100.
237 238 239 240
            data.append(line)
#match_profile


241
def match_profiles(infile, outfile, profiles, library, seqformat):
242 243
    column_names, data = get_sample_data(
        infile, convert_to_raw=seqformat=="raw", library=library)
244
    colid_sequence = get_column_ids(column_names, "sequence")
245 246 247 248 249 250 251

    outfile.write("\t".join(column_names) + "\n")
    for marker in data:
        if marker in profiles:
            match_profile(column_names, data[marker], profiles[marker],
                          seqformat!="raw", library, marker)
        for line in data[marker]:
252
            if seqformat is not None and seqformat != "raw":
253 254 255
                line[colid_sequence] = ensure_sequence_format(
                    line[colid_sequence], seqformat, library=library,
                    marker=marker)
256 257 258 259 260 261 262 263
            outfile.write("\t".join(map(str, line)) + "\n")
#match_profiles


def add_arguments(parser):
    parser.add_argument('profiles', metavar="PROFILES",
        type=argparse.FileType('r'),
        help="file containing background noise profiles to match")
264
    add_input_output_args(parser, True, True, False)
jhoogenboom's avatar
jhoogenboom committed
265 266
    filtergroup = parser.add_argument_group("filtering options")
    filtergroup.add_argument('-M', '--marker', metavar="MARKER",
267
        help="work only on MARKER")
jhoogenboom's avatar
jhoogenboom committed
268
    add_sequence_format_args(parser)
269 270 271 272
#add_arguments


def run(args):
jhoogenboom's avatar
jhoogenboom committed
273 274
    # Import numpy now.
    global np
275
    import numpy as np
jhoogenboom's avatar
jhoogenboom committed
276

277 278
    gen = get_input_output_files(args, True, True)
    if not gen:
279 280
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
281

282
    # Read profiles once.
283
    profiles = load_profiles(args.profiles, args.library)
284 285 286 287 288 289
    if args.marker:
        profiles = {args.marker: profiles[args.marker]} \
                   if args.marker in profiles else {}

    for tag, infiles, outfile in gen:
        # TODO: Aggregate data from all infiles of each sample.
jhoogenboom's avatar
jhoogenboom committed
290 291 292
        if len(infiles) > 1:
            raise ValueError(
                "multiple input files for sample '%s' specified " % tag)
293 294
        infile = sys.stdin if infiles[0] == "-" else open(infiles[0], "r")
        match_profiles(infile, outfile, profiles, args.library,
jhoogenboom's avatar
jhoogenboom committed
295
                       args.sequence_format)
296 297
        if infile != sys.stdin:
            infile.close()
298
#run