bgcorrect.py 7.59 KB
Newer Older
1
2
#!/usr/bin/env python
"""
jhoogenboom's avatar
jhoogenboom committed
3
4
5
6
7
8
9
Match background noise profiles (obtained from e.g., bgestimate) to
samples.

Six new columns are added to the output giving, for each sequence, the
number of reads attributable to noise from other sequences (_noise
columns) and the number of noise reads caused by the prescense of this
sequence (_add columns).
10
11
12
13
14
"""
import argparse
#import numpy as np  # Only imported when actually running this tool.

from ..lib import parse_library, load_profiles, ensure_sequence_format, nnls, \
15
16
                  get_column_ids, add_sequence_format_args, \
                  add_input_output_args, get_input_output_files
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91

__version__ = "0.1dev"


def get_sample_data(infile, convert_to_raw=False, library=None):
    """
    Read data from the given file handle, corresponding to a single
    sample, and fill a dict with all sequences in the sample.
    """
    column_names = infile.readline().rstrip("\r\n").split("\t")
    column_names.append("forward_noise")
    column_names.append("reverse_noise")
    column_names.append("total_noise")
    column_names.append("forward_add")
    column_names.append("reverse_add")
    column_names.append("total_add")
    colid_name, colid_allele, colid_forward, colid_reverse = get_column_ids(
        column_names, "name", "allele", "forward", "reverse")
    data = {}
    for line in infile:
        cols = line.rstrip("\r\n").split("\t")
        marker = cols[colid_name]
        if convert_to_raw:
            cols[colid_allele] = ensure_sequence_format(
                cols[colid_allele], "raw", library=library, marker=marker)
        cols[colid_forward] = int(cols[colid_forward])
        cols[colid_reverse] = int(cols[colid_reverse])
        cols.append(0)
        cols.append(0)
        cols.append(0)
        cols.append(0)
        cols.append(0)
        cols.append(0)
        if marker not in data:
            data[marker] = []
        data[marker].append(cols)
    return column_names, data
#get_sample_data


def match_profile(column_names, data, profile, convert_to_raw, library,
                  marker):
    import numpy as np
    (colid_name, colid_allele, colid_forward, colid_reverse, colid_total,
     colid_forward_noise, colid_reverse_noise, colid_total_noise,
     colid_forward_add, colid_reverse_add, colid_total_add) = get_column_ids(
        column_names, "name", "allele", "forward", "reverse", "total",
        "forward_noise", "reverse_noise", "total_noise", "forward_add",
        "reverse_add", "total_add")

    # Enter profiles into P.
    P1 = np.matrix(profile["forward"])
    P2 = np.matrix(profile["reverse"])

    # Enter sample into C.
    seqs = []
    C1 = np.matrix(np.zeros([1, profile["m"]]))
    C2 = np.matrix(np.zeros([1, profile["m"]]))
    for line in data:
        if convert_to_raw:
            allele = ensure_sequence_format(line[colid_allele], "raw",
                                            library=library, marker=marker)
        else:
            allele = line[colid_allele]
        seqs.append(allele)
        try:
            i = profile["seqs"].index(allele)
        except ValueError:
            # Note: Not adding any new sequences to the profile, since
            # they will just be zeroes and have no effect on the result.
            continue
        C1[0, i] = line[colid_forward]
        C2[0, i] = line[colid_reverse]

    # Compute corrected read counts.
jhoogenboom's avatar
jhoogenboom committed
92
    A = nnls(np.hstack([P1, P2]).T, np.hstack([C1, C2]).T).T
93
94
    np.fill_diagonal(P1, 0)
    np.fill_diagonal(P2, 0)
jhoogenboom's avatar
jhoogenboom committed
95
96
97
98
    forward_noise = A * P1
    reverse_noise = A * P2
    forward_add = np.multiply(A, P1.sum(1))
    reverse_add = np.multiply(A, P2.sum(1))
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143

    j = 0
    for line in data:
        j += 1
        try:
            i = profile["seqs"].index(seqs[j-1])
        except ValueError:
            continue
        line[colid_forward_noise] = forward_noise[0, i]
        line[colid_reverse_noise] = reverse_noise[0, i]
        line[colid_total_noise] = forward_noise[0, i] + reverse_noise[0, i]
        if i < profile["n"]:
            line[colid_forward_add] = forward_add[0, i]
            line[colid_reverse_add] = reverse_add[0, i]
            line[colid_total_add] = forward_add[0, i] + reverse_add[0, i]

    # Add sequences that are in the profile but not in the sample.
    for i in range(profile["m"]):
        if profile["seqs"][i] in seqs:
            continue
        amount = forward_noise[0, i] + reverse_noise[0, i]
        if i < profile["n"]:
            amount += forward_add[0, i] + reverse_add[0, i]
        if amount > 0:
            line = [""] * len(column_names)
            line[colid_name] = marker
            line[colid_allele] = profile["seqs"][i]
            line[colid_forward] = 0
            line[colid_reverse] = 0
            line[colid_total] = 0
            line[colid_forward_noise] = forward_noise[0, i]
            line[colid_reverse_noise] = reverse_noise[0, i]
            line[colid_total_noise] = forward_noise[0, i] + reverse_noise[0, i]
            if i < profile["n"]:
                line[colid_forward_add] = forward_add[0, i]
                line[colid_reverse_add] = reverse_add[0, i]
                line[colid_total_add] = forward_add[0, i] + reverse_add[0, i]
            else:
                line[colid_forward_add] = 0
                line[colid_reverse_add] = 0
                line[colid_total_add] = 0
            data.append(line)
#match_profile


144
def match_profiles(infile, outfile, profiles, library, seqformat):
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
    column_names, data = get_sample_data(
        infile, convert_to_raw=seqformat=="raw", library=library)
    colid_allele = get_column_ids(column_names, "allele")

    outfile.write("\t".join(column_names) + "\n")
    for marker in data:
        if marker in profiles:
            match_profile(column_names, data[marker], profiles[marker],
                          seqformat!="raw", library, marker)
        for line in data[marker]:
            if seqformat is not None and seqformat != "raw":
                line[colid_allele] = ensure_sequence_format(line[colid_allele],
                    seqformat, library=library, marker=marker)
            outfile.write("\t".join(map(str, line)) + "\n")
#match_profiles


def add_arguments(parser):
    parser.add_argument('profiles', metavar="PROFILES",
        type=argparse.FileType('r'),
        help="file containing background noise profiles to match")
166
    add_input_output_args(parser, True, True, False)
jhoogenboom's avatar
jhoogenboom committed
167
168
    filtergroup = parser.add_argument_group("filtering options")
    filtergroup.add_argument('-M', '--marker', metavar="MARKER",
169
        help="work only on MARKER")
jhoogenboom's avatar
jhoogenboom committed
170
    add_sequence_format_args(parser)
171
172
173
174
#add_arguments


def run(args):
175
176
    gen = get_input_output_files(args, True, True)
    if not gen:
177
178
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
179
180
181
182
183
184
185
186
187
188
189
190
191
192

    # Read library and profiles once.
    library = parse_library(args.library) if args.library else None
    profiles = load_profiles(args.profiles, library)
    if args.marker:
        profiles = {args.marker: profiles[args.marker]} \
                   if args.marker in profiles else {}

    for tag, infiles, outfile in gen:
        # TODO: Aggregate data from all infiles of each sample.
        # This tool now only works properly with one infile per sample!
        for infile in infiles:
            match_profiles(infile, outfile, profiles, library,
                           args.sequence_format)
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
#run


def main():
    """
    Main entry point.
    """
    parser = argparse.ArgumentParser(
        description=__doc__)
    try:
        add_arguments(parser)
        run(parser.parse_args())
    except OSError as error:
        parser.error(error)
#main


if __name__ == "__main__":
    main()