samplestats.py 15.8 KB
Newer Older
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
#!/usr/bin/env python
"""
Compute various statistics for each sequence in the given sample data
file.

Adds the following columns to the input data (where 'X' is a placeholder
for 'forward', 'reverse', and 'total').  Some columns may be omitted
from the output if the input does not contain the required columns.

forward_pct: The percentage of reads of this sequence on the forward
strand.
forward_corrected_pct: The percentage of reads of this sequence on the
forward strand after noise correction.
X_correction_pct: The difference between the values of X_corrected and X
as a percentage of X.
X_mp: The number of X reads of this sequence as a percentage of the
total number of X reads of the corresponding marker.
X_noise_mp: The number of X_noise reads of this sequence as a percentage
of the total X_noise of the marker.
X_add_mp: The number of X_add reads of this sequence as a percentage of
the total X_add of the marker.
X_corrected_mp: The number of X_corrected reads of this sequence as a
percentage of the total number of X_corrected reads of the marker.
24
25
26
27
flags: Sequences are flagged with 'allele' if they match the given
interpretation options. If the flags column, which always appears in
bgcorrect output, was not already present in the input, an additional
'not_corrected' flag is added to all sequences.
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
28
29
30
31
"""
import sys

from ..lib import ensure_sequence_format, add_sequence_format_args, \
32
33
                  add_input_output_args, get_input_output_files, pos_int_arg, \
                  get_column_ids
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
34
35
36
37

__version__ = "0.1dev"


38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# Default values for parameters are specified below.

# Default minimum number of reads to mark as allele.
# This value can be overridden by the -n command line option.
_DEF_MIN_READS = 30

# Default minimum number of reads per strand to mark as allele.
# This value can be overridden by the -b command line option.
_DEF_MIN_PER_STRAND = 1

# Default minimum percentage of reads w.r.t. the highest allele of the
# marker to mark as allele.
# This value can be overridden by the -m command line option.
_DEF_MIN_PCT_OF_MAX = 5.

# Default minimum percentage of reads w.r.t. the marker's total number
# of reads to mark as allele.
# This value can be overridden by the -p command line option.
_DEF_MIN_PCT_OF_SUM = 3.

# Default minimum percentage of correction to mark as allele.
# This value can be overridden by the -c command line option.
_DEF_MIN_CORRECTION = 0

# Default minimum number of recovered reads as a percentage of the
# original number of reads to mark as allele.
# This value can be overridden by the -r command line option.
_DEF_MIN_RECOVERY = 0


def compute_stats(infile, outfile, library, seqformat, min_reads,
                  min_per_strand, min_pct_of_max, min_pct_of_sum,
                  min_correction, min_recovery):
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
71
72
73
74
75
76
77
78
    # Get column numbers.
    column_names = infile.readline().rstrip("\r\n").split("\t")
    colid_name, colid_allele, colid_forward, colid_reverse, colid_total = \
        get_column_ids(column_names, "name", "allele", "forward", "reverse",
                       "total")
    (colid_forward_noise, colid_reverse_noise, colid_total_noise,
     colid_forward_add, colid_reverse_add, colid_total_add,
     colid_forward_corrected, colid_reverse_corrected,
79
80
81
82
83
84
85
86
87
     colid_total_corrected, colid_flags) = get_column_ids(column_names,
        "forward_noise", "reverse_noise", "total_noise", "forward_add",
        "reverse_add", "total_add", "forward_corrected", "reverse_corrected",
        "total_corrected", "flags", optional=True)

    # Make sure we have the flags column.
    if colid_flags is None:
        colid_flags = len(column_names)
        column_names.append("flags")
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
88
89
90
91
92
93
94
95
96
97

    # Add columns for which we have the required data.
    column_names.append("forward_pct")
    if colid_forward_corrected is not None:
        if colid_total_corrected is not None:
            column_names.append("forward_corrected_pct")
        column_names.append("forward_correction_pct")
    if colid_reverse_corrected is not None:
        column_names.append("reverse_correction_pct")
    if colid_total_corrected is not None:
98
        colid_total_correction_pct = len(column_names)
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
99
100
101
        column_names.append("total_correction_pct")
    column_names.append("forward_mp")
    column_names.append("reverse_mp")
102
    colid_total_mp = len(column_names)
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
    column_names.append("total_mp")
    if colid_forward_noise is not None:
        column_names.append("forward_noise_mp")
    if colid_reverse_noise is not None:
        column_names.append("reverse_noise_mp")
    if colid_total_noise is not None:
        column_names.append("total_noise_mp")
    if colid_forward_add is not None:
        column_names.append("forward_add_mp")
    if colid_reverse_add is not None:
        column_names.append("reverse_add_mp")
    if colid_total_add is not None:
        column_names.append("total_add_mp")
    if colid_forward_corrected is not None:
        column_names.append("forward_corrected_mp")
    if colid_reverse_corrected is not None:
        column_names.append("reverse_corrected_mp")
    if colid_total_corrected is not None:
121
        colid_total_corrected_mp = len(column_names)
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
        column_names.append("total_corrected_mp")

    # Read data.
    data = {}
    for line in infile:
        cols = line.rstrip("\r\n").split("\t")
        marker = cols[colid_name]
        if seqformat is not None:
            cols[colid_allele] = ensure_sequence_format(
                cols[colid_allele], seqformat, library=library, marker=marker)
        cols[colid_forward] = int(cols[colid_forward])
        cols[colid_reverse] = int(cols[colid_reverse])
        cols[colid_total] = int(cols[colid_total])
        if colid_forward_corrected is not None:
            cols[colid_forward_corrected]=float(cols[colid_forward_corrected])
        if colid_reverse_corrected is not None:
            cols[colid_reverse_corrected]=float(cols[colid_reverse_corrected])
        if colid_total_corrected is not None:
            cols[colid_total_corrected] = float(cols[colid_total_corrected])
        if colid_forward_noise is not None:
            cols[colid_forward_noise] = float(cols[colid_forward_noise])
        if colid_reverse_noise is not None:
            cols[colid_reverse_noise] = float(cols[colid_reverse_noise])
        if colid_total_noise is not None:
            cols[colid_total_noise] = float(cols[colid_total_noise])
        if colid_forward_add is not None:
            cols[colid_forward_add] = float(cols[colid_forward_add])
        if colid_reverse_add is not None:
            cols[colid_reverse_add] = float(cols[colid_reverse_add])
        if colid_total_add is not None:
            cols[colid_total_add] = float(cols[colid_total_add])
        if marker not in data:
            data[marker] = []
        data[marker].append(cols)

    # Compute statistics.
    for marker in data:
        marker_forward = sum(row[colid_forward] for row in data[marker])
        marker_reverse = sum(row[colid_reverse] for row in data[marker])
        marker_total = sum(row[colid_total] for row in data[marker])
        if colid_forward_noise is not None:
            marker_forward_noise = sum(
                row[colid_forward_noise] for row in data[marker])
        if colid_reverse_noise is not None:
            marker_reverse_noise = sum(
                row[colid_reverse_noise] for row in data[marker])
        if colid_total_noise is not None:
            marker_total_noise = sum(
                row[colid_total_noise] for row in data[marker])
        if colid_forward_add is not None:
            marker_forward_add = sum(
                row[colid_forward_add] for row in data[marker])
        if colid_reverse_add is not None:
            marker_reverse_add = sum(
                row[colid_reverse_add] for row in data[marker])
        if colid_total_add is not None:
            marker_total_add = sum(
                row[colid_total_add] for row in data[marker])
        if colid_forward_corrected is not None:
            marker_forward_corrected = sum(
                row[colid_forward_corrected] for row in data[marker])
        if colid_reverse_corrected is not None:
            marker_reverse_corrected = sum(
                row[colid_reverse_corrected] for row in data[marker])
        if colid_total_corrected is not None:
            marker_total_corrected = sum(
                row[colid_total_corrected] for row in data[marker])
189
190
        marker_max = max(row[colid_total] if colid_total_corrected is None
            else row[colid_total_corrected] for row in data[marker])
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
191
        for row in data[marker]:
192
193
194
195
            if len(row) == colid_flags:
                row.append(["not_corrected"])
            else:
                row[colid_flags] = map(str.strip, colid_flags.split(","))
196
            row.append("%.3g" % (100.*row[colid_forward]/row[colid_total]))
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
197
198
            if colid_forward_corrected is not None:
                if colid_total_corrected is not None:
199
                    row.append("%.3g" % (100.*(
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
200
201
                        row[colid_forward_corrected]/row[colid_total_corrected]
                        if row[colid_total_corrected]
202
203
204
                        else row[colid_forward_corrected] > 0)))
                row.append("%.3g" %
                    (100.*row[colid_forward_corrected]/row[colid_forward]-100
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
205
                    if row[colid_forward]
206
                    else (row[colid_forward_corrected]>0)*200-100))
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
207
            if colid_reverse_corrected is not None:
208
209
                row.append("%.3g" %
                    (100.*row[colid_reverse_corrected]/row[colid_reverse]-100
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
210
                    if row[colid_reverse]
211
                    else (row[colid_reverse_corrected]>0)*200-100))
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
212
            if colid_total_corrected is not None:
213
214
215
216
217
218
219
220
                row.append("%.3g" %
                    (100.*row[colid_total_corrected]/row[colid_total]-100))
            row.append("%.3g" % (100.*row[colid_forward]/marker_forward
                if marker_forward else 0))
            row.append("%.3g" % (100.*row[colid_reverse]/marker_reverse
                if marker_reverse else 0))
            row.append("%.3g" % (100.*row[colid_total]/marker_total
                if marker_total else 0))
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
221
            if colid_forward_noise is not None:
222
223
224
                row.append("%.3g" %
                    (100.*row[colid_forward_noise]/marker_forward_noise
                    if marker_forward_noise else 0))
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
225
            if colid_reverse_noise is not None:
226
227
228
                row.append("%.3g" %
                    (100.*row[colid_reverse_noise]/marker_reverse_noise
                    if marker_reverse_noise else 0))
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
229
            if colid_total_noise is not None:
230
231
232
                row.append(
                    "%.3g" % (100.*row[colid_total_noise]/marker_total_noise
                    if marker_total_noise else 0))
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
233
            if colid_forward_add is not None:
234
235
236
                row.append(
                    "%.3g" % (100.*row[colid_forward_add]/marker_forward_add
                    if marker_forward_add else 0))
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
237
            if colid_reverse_add is not None:
238
239
240
                row.append(
                    "%.3g" % (100.*row[colid_reverse_add]/marker_reverse_add
                    if marker_reverse_add else 0))
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
241
            if colid_total_add is not None:
242
243
                row.append("%.3g" % (100.*row[colid_total_add]/marker_total_add
                    if marker_total_add else 0))
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
244
            if colid_forward_corrected is not None:
245
246
247
                row.append("%.3g" %
                    (100.*row[colid_forward_corrected]/marker_forward_corrected
                    if marker_forward_corrected else 0))
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
248
            if colid_reverse_corrected is not None:
249
250
251
                row.append("%.3g" %
                    (100.*row[colid_reverse_corrected]/marker_reverse_corrected
                    if marker_reverse_corrected else 0))
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
252
            if colid_total_corrected is not None:
253
254
255
                row.append("%.3g" %
                    (100.*row[colid_total_corrected]/marker_total_corrected
                    if marker_total_corrected else 0))
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
256

257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
            # Check if this sequence is an allele.
            total_added = row[colid_total] if colid_total_corrected is None \
                else row[colid_total_corrected]
            pct_of_max = 100.*(1.*total_added/marker_max if marker_max
                else total_added > 0)
            pct_of_sum = float(row[colid_total_mp] if colid_total_corrected is
                None else row[colid_total_corrected_mp])
            correction = float(0 if colid_total_corrected is None
                else row[colid_total_correction_pct])
            recovery = 0 if colid_total_add is None \
                else 100.*row[colid_total_add]/row[colid_total]
            min_strand = min(
                row[colid_forward] if colid_forward_corrected is None
                    else row[colid_forward_corrected],
                row[colid_reverse] if colid_reverse_corrected is None
                    else row[colid_reverse_corrected])
            if (total_added >= min_reads and
                    pct_of_max >= min_pct_of_max and
                    pct_of_sum >= min_pct_of_sum and
                    correction >= min_correction and
                    recovery >= min_recovery and
                    min_strand >= min_per_strand):
                row[colid_flags].append("allele")
            row[colid_flags] = ",".join(row[colid_flags])

Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
282
283
284
285
286
287
288
289
290
291
    # Write results.
    outfile.write("\t".join(column_names) + "\n")
    for marker in data:
        for line in data[marker]:
            outfile.write("\t".join(map(str, line)) + "\n")
#compute_stats


def add_arguments(parser):
    add_input_output_args(parser, True, True, False)
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
    intergroup = parser.add_argument_group("interpretation options",
        "sequences that match all of these settings are marked as 'allele'")
    intergroup.add_argument('-n', '--min-reads', metavar="N", type=pos_int_arg,
        default=_DEF_MIN_READS,
        help="the minimum number of reads (default: %(default)s)")
    intergroup.add_argument('-b', '--min-per-strand', metavar="N",
        type=pos_int_arg, default=_DEF_MIN_PER_STRAND,
        help="the minimum number of reads in both orientations (default: "
             "%(default)s)")
    intergroup.add_argument('-m', '--min-pct-of-max', metavar="PCT",
        type=float, default=_DEF_MIN_PCT_OF_MAX,
        help="the minimum percentage of reads w.r.t. the highest allele of "
             "the marker (default: %(default)s)")
    intergroup.add_argument('-p', '--min-pct-of-sum', metavar="PCT",
        type=float, default=_DEF_MIN_PCT_OF_SUM,
        help="the minimum percentage of reads w.r.t. the marker's total "
             "number of reads (default: %(default)s)")
    intergroup.add_argument('-c', '--min-correction', metavar="PCT",
        type=float, default=_DEF_MIN_CORRECTION,
        help="the minimum change in read count due to correction by e.g., "
        "bgcorrect (default: %(default)s)")
    intergroup.add_argument('-r', '--min-recovery', metavar="PCT",
        type=float, default=_DEF_MIN_RECOVERY,
        help="the minimum number of reads that was recovered thanks to "
        "noise correction (by e.g., bgcorrect), as a percentage of the "
        "original number of reads (default: %(default)s)")
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
    add_sequence_format_args(parser)
#add_arguments


def run(args):
    gen = get_input_output_files(args, True, True)
    if not gen:
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")

    for tag, infiles, outfile in gen:
        # TODO: Aggregate data from all infiles of each sample.
        if len(infiles) > 1:
            raise ValueError(
                "multiple input files for sample '%s' specified " % tag)
        infile = sys.stdin if infiles[0] == "-" else open(infiles[0], "r")
334
335
336
337
        compute_stats(infile, outfile, args.library, args.sequence_format,
                      args.min_reads, args.min_per_strand, args.min_pct_of_max,
                      args.min_pct_of_sum, args.min_correction,
                      args.min_recovery)
Hoogenboom, Jerry's avatar
Hoogenboom, Jerry committed
338
339
340
        if infile != sys.stdin:
            infile.close()
#run