bgcorrect.py 8.79 KB
Newer Older
1
2
#!/usr/bin/env python
"""
jhoogenboom's avatar
jhoogenboom committed
3
4
5
Match background noise profiles (obtained from e.g., bgestimate) to
samples.

jhoogenboom's avatar
jhoogenboom committed
6
Nine new columns are added to the output giving, for each sequence, the
jhoogenboom's avatar
jhoogenboom committed
7
8
number of reads attributable to noise from other sequences (_noise
columns) and the number of noise reads caused by the prescense of this
jhoogenboom's avatar
jhoogenboom committed
9
10
sequence (_add columns), as well as the resulting number of reads after
correction (_corrected columns: original minus _noise plus _add).
11
"""
12
import argparse, sys
13
14
#import numpy as np  # Only imported when actually running this tool.

15
from ..lib import load_profiles, ensure_sequence_format, nnls, \
16
17
                  get_column_ids, add_sequence_format_args, \
                  add_input_output_args, get_input_output_files
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

__version__ = "0.1dev"


def get_sample_data(infile, convert_to_raw=False, library=None):
    """
    Read data from the given file handle, corresponding to a single
    sample, and fill a dict with all sequences in the sample.
    """
    column_names = infile.readline().rstrip("\r\n").split("\t")
    column_names.append("forward_noise")
    column_names.append("reverse_noise")
    column_names.append("total_noise")
    column_names.append("forward_add")
    column_names.append("reverse_add")
    column_names.append("total_add")
jhoogenboom's avatar
jhoogenboom committed
34
35
36
    column_names.append("forward_corrected")
    column_names.append("reverse_corrected")
    column_names.append("total_corrected")
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
    colid_name, colid_allele, colid_forward, colid_reverse = get_column_ids(
        column_names, "name", "allele", "forward", "reverse")
    data = {}
    for line in infile:
        cols = line.rstrip("\r\n").split("\t")
        marker = cols[colid_name]
        if convert_to_raw:
            cols[colid_allele] = ensure_sequence_format(
                cols[colid_allele], "raw", library=library, marker=marker)
        cols[colid_forward] = int(cols[colid_forward])
        cols[colid_reverse] = int(cols[colid_reverse])
        cols.append(0)
        cols.append(0)
        cols.append(0)
        cols.append(0)
        cols.append(0)
        cols.append(0)
jhoogenboom's avatar
jhoogenboom committed
54
55
56
        cols.append(int(cols[colid_forward]))
        cols.append(int(cols[colid_reverse]))
        cols.append(int(cols[colid_forward]) + int(cols[colid_reverse]))
57
58
59
60
61
62
63
64
65
66
67
        if marker not in data:
            data[marker] = []
        data[marker].append(cols)
    return column_names, data
#get_sample_data


def match_profile(column_names, data, profile, convert_to_raw, library,
                  marker):
    (colid_name, colid_allele, colid_forward, colid_reverse, colid_total,
     colid_forward_noise, colid_reverse_noise, colid_total_noise,
jhoogenboom's avatar
jhoogenboom committed
68
69
70
     colid_forward_add, colid_reverse_add, colid_total_add,
     colid_forward_corrected, colid_reverse_corrected,
     colid_total_corrected) = get_column_ids(
71
72
        column_names, "name", "allele", "forward", "reverse", "total",
        "forward_noise", "reverse_noise", "total_noise", "forward_add",
jhoogenboom's avatar
jhoogenboom committed
73
74
        "reverse_add", "total_add", "forward_corrected", "reverse_corrected",
        "total_corrected")
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100

    # Enter profiles into P.
    P1 = np.matrix(profile["forward"])
    P2 = np.matrix(profile["reverse"])

    # Enter sample into C.
    seqs = []
    C1 = np.matrix(np.zeros([1, profile["m"]]))
    C2 = np.matrix(np.zeros([1, profile["m"]]))
    for line in data:
        if convert_to_raw:
            allele = ensure_sequence_format(line[colid_allele], "raw",
                                            library=library, marker=marker)
        else:
            allele = line[colid_allele]
        seqs.append(allele)
        try:
            i = profile["seqs"].index(allele)
        except ValueError:
            # Note: Not adding any new sequences to the profile, since
            # they will just be zeroes and have no effect on the result.
            continue
        C1[0, i] = line[colid_forward]
        C2[0, i] = line[colid_reverse]

    # Compute corrected read counts.
jhoogenboom's avatar
jhoogenboom committed
101
    A = nnls(np.hstack([P1, P2]).T, np.hstack([C1, C2]).T).T
102
103
    np.fill_diagonal(P1, 0)
    np.fill_diagonal(P2, 0)
jhoogenboom's avatar
jhoogenboom committed
104
105
106
107
    forward_noise = A * P1
    reverse_noise = A * P2
    forward_add = np.multiply(A, P1.sum(1))
    reverse_add = np.multiply(A, P2.sum(1))
108
109
110
111
112
113
114
115
116
117
118

    j = 0
    for line in data:
        j += 1
        try:
            i = profile["seqs"].index(seqs[j-1])
        except ValueError:
            continue
        line[colid_forward_noise] = forward_noise[0, i]
        line[colid_reverse_noise] = reverse_noise[0, i]
        line[colid_total_noise] = forward_noise[0, i] + reverse_noise[0, i]
jhoogenboom's avatar
jhoogenboom committed
119
120
121
        line[colid_forward_corrected] -= line[colid_forward_noise]
        line[colid_reverse_corrected] -= line[colid_reverse_noise]
        line[colid_total_corrected] -= line[colid_total_noise]
122
123
124
125
        if i < profile["n"]:
            line[colid_forward_add] = forward_add[0, i]
            line[colid_reverse_add] = reverse_add[0, i]
            line[colid_total_add] = forward_add[0, i] + reverse_add[0, i]
jhoogenboom's avatar
jhoogenboom committed
126
127
128
            line[colid_forward_corrected] += line[colid_forward_add]
            line[colid_reverse_corrected] += line[colid_reverse_add]
            line[colid_total_corrected] += line[colid_total_add]
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146

    # Add sequences that are in the profile but not in the sample.
    for i in range(profile["m"]):
        if profile["seqs"][i] in seqs:
            continue
        amount = forward_noise[0, i] + reverse_noise[0, i]
        if i < profile["n"]:
            amount += forward_add[0, i] + reverse_add[0, i]
        if amount > 0:
            line = [""] * len(column_names)
            line[colid_name] = marker
            line[colid_allele] = profile["seqs"][i]
            line[colid_forward] = 0
            line[colid_reverse] = 0
            line[colid_total] = 0
            line[colid_forward_noise] = forward_noise[0, i]
            line[colid_reverse_noise] = reverse_noise[0, i]
            line[colid_total_noise] = forward_noise[0, i] + reverse_noise[0, i]
jhoogenboom's avatar
jhoogenboom committed
147
148
149
            line[colid_forward_corrected] = -line[colid_forward_noise]
            line[colid_reverse_corrected] = -line[colid_reverse_noise]
            line[colid_total_corrected] = -line[colid_total_noise]
150
151
152
153
            if i < profile["n"]:
                line[colid_forward_add] = forward_add[0, i]
                line[colid_reverse_add] = reverse_add[0, i]
                line[colid_total_add] = forward_add[0, i] + reverse_add[0, i]
jhoogenboom's avatar
jhoogenboom committed
154
155
156
                line[colid_forward_corrected] += line[colid_forward_add]
                line[colid_reverse_corrected] += line[colid_reverse_add]
                line[colid_total_corrected] += line[colid_total_add]
157
158
159
160
161
162
163
164
            else:
                line[colid_forward_add] = 0
                line[colid_reverse_add] = 0
                line[colid_total_add] = 0
            data.append(line)
#match_profile


165
def match_profiles(infile, outfile, profiles, library, seqformat):
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
    column_names, data = get_sample_data(
        infile, convert_to_raw=seqformat=="raw", library=library)
    colid_allele = get_column_ids(column_names, "allele")

    outfile.write("\t".join(column_names) + "\n")
    for marker in data:
        if marker in profiles:
            match_profile(column_names, data[marker], profiles[marker],
                          seqformat!="raw", library, marker)
        for line in data[marker]:
            if seqformat is not None and seqformat != "raw":
                line[colid_allele] = ensure_sequence_format(line[colid_allele],
                    seqformat, library=library, marker=marker)
            outfile.write("\t".join(map(str, line)) + "\n")
#match_profiles


def add_arguments(parser):
    parser.add_argument('profiles', metavar="PROFILES",
        type=argparse.FileType('r'),
        help="file containing background noise profiles to match")
187
    add_input_output_args(parser, True, True, False)
jhoogenboom's avatar
jhoogenboom committed
188
189
    filtergroup = parser.add_argument_group("filtering options")
    filtergroup.add_argument('-M', '--marker', metavar="MARKER",
190
        help="work only on MARKER")
jhoogenboom's avatar
jhoogenboom committed
191
    add_sequence_format_args(parser)
192
193
194
195
#add_arguments


def run(args):
jhoogenboom's avatar
jhoogenboom committed
196
197
198
199
    # Import numpy now.
    import numpy as np
    global np

200
201
    gen = get_input_output_files(args, True, True)
    if not gen:
202
203
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
204
205

    # Read library and profiles once.
206
    profiles = load_profiles(args.profiles, args.library)
207
208
209
210
211
212
    if args.marker:
        profiles = {args.marker: profiles[args.marker]} \
                   if args.marker in profiles else {}

    for tag, infiles, outfile in gen:
        # TODO: Aggregate data from all infiles of each sample.
jhoogenboom's avatar
jhoogenboom committed
213
214
215
        if len(infiles) > 1:
            raise ValueError(
                "multiple input files for sample '%s' specified " % tag)
216
217
        infile = sys.stdin if infiles[0] == "-" else open(infiles[0], "r")
        match_profiles(infile, outfile, profiles, args.library,
jhoogenboom's avatar
jhoogenboom committed
218
                       args.sequence_format)
219
220
        if infile != sys.stdin:
            infile.close()
221
#run