stuttermark.py 19.5 KB
Newer Older
jhoogenboom's avatar
jhoogenboom committed
1
2
3
4
#!/usr/bin/env python
"""
Mark potential stutter products by assuming a fixed maximum percentage
of stutter product vs the parent allele.
jhoogenboom's avatar
jhoogenboom committed
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27

Stuttermark adds a new column (named 'annotation' by default) to the
output.  The new column contains 'STUTTER' for possible stutter
products, or 'ALLELE' otherwise.  Lines that were not evaluated are
annotated as 'UNKNOWN'.  A sequence is considered a possible stutter
product if its total read count is less than or equal to the maximum
number of expected stutter reads.  The maximum number of stutter reads
is computed by assuming a fixed percentage of stutter product compared
to the originating allele.

Stuttermark requires TSSV-style sequences as input (automatically
converting sequences to this format if necessary) and detects possible
stutter products by comparing sequences that have the same repeat blocks
but different numbers of repeats for one or more of their blocks.

The STUTTER annotation contains additional information.  For example:
'STUTTER:146.6x1(2-1):10.4x2(2-1x9-1)'.  This is a stutter product for
which at most 146.6 reads have come from the first sequence in the
output file ('146.6x1') and at most 10.4 reads have come from the second
sequence in the output file ('10.4x2').  This sequence differs from the
first sequence in the output file by a loss of one repeat of the second
repeat block ('2-1') and it differs from the second sequence by the loss
of one repeat in the second block and one repeat in the ninth block
28
29
('2-1x9-1').  (If this allele would have more than 157 reads, it would
be annotated as 'ALLELE' instead.)
jhoogenboom's avatar
jhoogenboom committed
30
"""
31
import argparse, sys
jhoogenboom's avatar
jhoogenboom committed
32

jhoogenboom's avatar
jhoogenboom committed
33
from ..lib import pos_int_arg, print_db, PAT_TSSV_BLOCK, get_column_ids, \
34
                  add_input_output_args, get_input_output_files, \
35
                  ensure_sequence_format, add_sequence_format_args
jhoogenboom's avatar
jhoogenboom committed
36

jhoogenboom's avatar
jhoogenboom committed
37
__version__ = "1.4"
jhoogenboom's avatar
jhoogenboom committed
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70


# Default values for parameters are specified below.

# Link a difference in repeat count to a minimum occurrance percentage
# for stutter peaks.  Any repeat count differences other than listed
# here are taken to have zero probability.
# This value can be overridden by the -s command line option.
_DEF_STUTTERDEF = "-1:15,+1:4"

# Any peaks with less than this number of reads are not evaluated.
# This value can be overridden by the -m command line option.
_DEF_MIN_READS = 2

# Stutters are not expected to appear for blocks with less than
# _DEF_MIN_REPEATS repeats.  If two alleles have a different repeat count
# for any block with less than _DEF_MIN_REPEATS repeats, it is not possible
# that one of them is a stutter of the other.  Therefore, no comparison
# is made between any two such sequences.
# This value can be overridden by the -n command line option.
_DEF_MIN_REPEATS = 3

# Peaks are only annotated as a stutter peak of some other peak if the
# expected number of stutter occurances of this other peak is above
# this value.
# This value can be overridden by the -r command line option.
_DEF_MIN_REPORT = 0.1

# Default name of the column we are adding.
# This value can be overridden by the -c command line option.
_DEF_COLNAME = "annotation"


jhoogenboom's avatar
jhoogenboom committed
71
def load_data(infile, colname_annotation=_DEF_COLNAME, library=None):
jhoogenboom's avatar
jhoogenboom committed
72
73
74
75
76
77
78
79
80
81
82
83
    """
    Read data from the file handle infile and return a tuple
    (colum_names, data_rows).

    The input file is expected to have tab-delimited columns with
    column names on the first line and one allele per data row.
    There are three required columns:
        name    The name of the marker this allele belongs to.
        allele  The allele sequence in TSSV output format.
                E.g., AGAT(7)TGAT(3).
        total   The total number of reads with this sequence.

jhoogenboom's avatar
jhoogenboom committed
84
    An additional column is appended.  All data rows
jhoogenboom's avatar
jhoogenboom committed
85
86
87
88
89
    are given a value of "UNKNOWN" for this column.

    :arg infile: Open readable handle to data file.
    :type infile: stream

jhoogenboom's avatar
jhoogenboom committed
90
91
92
93
94
95
    :arg colname_annotation: Name of the newly added column
    :type colname_annotation: str

    :arg library: Library for sequence format conversion
    :type library: dict

jhoogenboom's avatar
jhoogenboom committed
96
97
98
99
100
    :returns: A 2-tuple (column_names, data_rows).
    :rtype: tuple(list, list)
    """
    # Get column numbers.  We are adding a column as well.
    column_names = infile.readline().rstrip("\r\n").split("\t")
jhoogenboom's avatar
jhoogenboom committed
101
102
    colid_total, colid_allele, = get_column_ids(column_names,
        "total", "allele")
jhoogenboom's avatar
jhoogenboom committed
103
104
    column_names.append(colname_annotation)

jhoogenboom's avatar
jhoogenboom committed
105
106
107
108
109
    try:
        colid_name = get_column_ids(column_names, "name")
    except:
        colid_name = None

jhoogenboom's avatar
jhoogenboom committed
110
111
112
113
114
115
116
117
118
119
120
121
    # Step through the file line by line to build the allele list.
    allelelist = []
    for line in infile:
        columns = line.rstrip("\r\n").split("\t")

        # Skip empty/malformed lines (NOTE: allowing additional columns
        # beyond the expected 4 columns).
        if len(columns) != len(column_names)-1:
            if len(line.strip()):
                print("WARNING: skipped line: %s" % line)
            continue

jhoogenboom's avatar
jhoogenboom committed
122
123
124
125
126
127
        # Convert to TSSV-style sequences.
        if columns[colid_allele]:
            marker = columns[colid_name] if colid_name is not None else None
            columns[colid_allele] = ensure_sequence_format(
                columns[colid_allele], 'tssv', marker=marker, library=library)

jhoogenboom's avatar
jhoogenboom committed
128
129
130
131
132
        # Split the allele column into a list of tuples:
        # [('ACTG','4'),('CCTC','12'),...]
        columns[colid_allele] = PAT_TSSV_BLOCK.findall(columns[colid_allele])

        # String to integer conversion...
jhoogenboom's avatar
jhoogenboom committed
133
134
        columns[colid_allele] = [[x[0], int(x[1])]
                                 for x in columns[colid_allele]]
jhoogenboom's avatar
jhoogenboom committed
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
        columns[colid_total] = int(columns[colid_total])

        # Repeat unit deduplication (data may contain stuff like
        # "AGAT(7)AGAT(5)" instead of "AGAT(12)").
        if columns[colid_allele]:
            dedup = [columns[colid_allele][0]]
            for i in range(1, len(columns[colid_allele])):
                if columns[colid_allele][i][0] == dedup[-1][0]:
                    dedup[-1][1] += columns[colid_allele][i][1]
                else:
                    dedup.append(columns[colid_allele][i])
            columns[colid_allele] = dedup

        # Add the allele to the list, including our new column.
        columns.append("UNKNOWN")
        allelelist.append(columns)
    return column_names, allelelist
#load_data


def annotate_alleles(infile, outfile, stutter, min_reads=_DEF_MIN_READS,
                     min_repeats=_DEF_MIN_REPEATS, min_report=_DEF_MIN_REPORT,
157
                     colname_annotation=_DEF_COLNAME, library=None,
jhoogenboom's avatar
jhoogenboom committed
158
                     debug=False):
jhoogenboom's avatar
jhoogenboom committed
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
    """
    Read data from the file handle infile and write annotated data to
    file handle outfile.

    The input file is expected to have tab-delimited columns with
    column names on the first line and one allele per data row.
    There are three required columns:
        name    The name of the marker this allele belongs to.
        allele  The allele sequence in TSSV output format.
                E.g., AGAT(7)TGAT(3).
        total   The total number of reads with this sequence.

    An additional annotation column is appended.  All data rows
    are given a value of "ALLELE", "STUTTER:...", or "UNKNOWN".
    Alleles that are annotated as "STUTTER" are given a colon-separated
    description of their expected origins.  For each originating allele
    a description of the form AxB(C) is given, with A the maximum
    expected number of stutter reads from this origin, B the allele
    number of the originating allele (where the first allele in the
    output file is allele 1), and C a 'x'-separated list of repeat
    blocks that had stuttered.

    Example stutter description:
        STUTTER:123.8x1(3-1):20.2x2(4+1)
    This stutter allele has two originating alleles: a maximum of 123.8
    reads from allele 1 plus a maximum of 20.2 reads from 2.  Compared
    to allele 1, this is a -1 stutter in the third repeat block.
    Compared to allele 2, this is a +1 stutter of the fourth repeat
    block.  (If this allele had more than 144 total reads, it would have
    been annotated as "ALLELE".)

    :arg infile: Open readable handle to data file.
    :type infile: stream

    :arg outfile: Open writable handle for output data.
    :type outfile: stream

    :arg stutter: A dict mapping stutter size (keys) to a maximum
                  expected percentage of stutter of this size.
    :type stutter: dict{int: float}

    :arg min_reads: Any alleles with less than this number of reads are
                    not evaluated (annotation=UNKNOWN).
    :type min_reads: int

    :arg min_repeats: Stutters are not expected to appear for blocks
                      with less than this number of repeats.
    :type min_repeats: int

    :arg min_report: Stutters with an expected number of reads below
                     this value are not reported.
    :type min_report: float

    :arg colname_annotation: Name of the newly added column.
    :type colname_annotation: str

jhoogenboom's avatar
jhoogenboom committed
215
216
217
218
    :arg libfile: Open readable handle to library file for sequence format
                  conversion
    :type libfile: stream

jhoogenboom's avatar
jhoogenboom committed
219
220
221
    :arg debug: If True, print debug output to stdout.
    :type debug: bool
    """
jhoogenboom's avatar
jhoogenboom committed
222
223
224
225
226
227
228
    column_names, allelelist = load_data(infile, colname_annotation, library)
    colid_total, colid_allele, = get_column_ids(column_names,
        "total", "allele")
    try:
        colid_name = get_column_ids(column_names, "name")
    except:
        colid_name = None
jhoogenboom's avatar
jhoogenboom committed
229
230

    # Sort (descending total reads).
jhoogenboom's avatar
jhoogenboom committed
231
232
233
234
235
    if colid_name is not None:
        allelelist.sort(key=lambda allele:
            [allele[colid_name], -allele[colid_total]])
    else:
        allelelist.sort(key=lambda allele: -allele[colid_total])
jhoogenboom's avatar
jhoogenboom committed
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250

    for iCurrent in range(len(allelelist)):

        # See if this allele appeared a reasonable number of times.
        if allelelist[iCurrent][colid_total] < min_reads:
            continue

        isStutterPeakOf = {}
        maximumOccurrenceExpected = 0

        # Find all ways to reach allele iCurrent by mutating the other
        # alleles that appeared more often.
        for iOther in range(iCurrent):

            # Must be same marker.
jhoogenboom's avatar
jhoogenboom committed
251
252
            if colid_name is not None and (allelelist[iCurrent][colid_name] !=
                                           allelelist[iOther][colid_name]):
jhoogenboom's avatar
jhoogenboom committed
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
                continue

            print_db('%i vs %i' % (iCurrent+1, iOther+1), debug)

            # Must be same number of blocks.
            if (len(allelelist[iCurrent][colid_allele]) !=
                    len(allelelist[iOther][colid_allele])):
                print_db('Different number of blocks', debug)
                continue

            maximumOccurrenceExpectedForThisOther = 1.0
            blocksThatDiffer = {}

            # What is needed to get from sequence iOther to iCurrent?
            # We will look at each block in turn.
            for block in range(len(allelelist[iCurrent][colid_allele])):

                # If mutations are needed to get from iOther to
                # iCurrent, iCurrent can't be a stutter of iOther.
                if (allelelist[iCurrent][colid_allele][block][0] !=
                        allelelist[iOther][colid_allele][block][0]):
                    print_db('Block %i has different sequence' % (block+1),
                             debug)
                    break

                # See how many times the current block appears more or
                # less often in iCurrent as compared to iOther.
                deltaRepeats = (allelelist[iCurrent][colid_allele][block][1] -
                                allelelist[iOther][colid_allele][block][1])
                if deltaRepeats == 0:
                    continue

                # iCurrent is not a stutter of iOther if any one of its
                # blocks differs in count while either count is below
                # min_repeats.  It is not a stutter of iOther because
                # those are expected not to occur for such low repeat
                # counts.  At the same time any further analysis
                # between the two is invalid because this block was not
                # the same.  Note that if this truly was a stutter of
                # iOther, maximumOccurrenceExpected is going to be a
                # bit too low now.
                if min(allelelist[iCurrent][colid_allele][block][1],
                     allelelist[iOther][colid_allele][block][1]) < min_repeats:
                    print_db('Block %i has low-count difference: %i and %i' %
                        (block+1, allelelist[iCurrent][colid_allele][block][1],
                         allelelist[iOther][colid_allele][block][1]), debug)
                    break

                # Depending on the repeat count difference, iCurrent
                # may appear at most a certain amount of times.
                # This is expressed as a percentage of the occurrence
                # of iOther.

                # If the repeat count difference is highly improbable,
                # this can't be a stutter.
                elif deltaRepeats not in stutter:
                    print_db('Improbable difference in number of repeats of '
                             'block %i: %i' % (block+1, deltaRepeats), debug)
                    break

                # Based on this block only, sequence iCurrent could be
                # a stutter of sequence iOther.  It may appear only a
                # limited number of times, however.
                else:
                    blocksThatDiffer[block] = deltaRepeats
                    maximumOccurrenceExpectedForThisOther *= \
                        stutter[deltaRepeats] / 100.0
                    print_db('Occurence percentage now %d' %
                             maximumOccurrenceExpectedForThisOther, debug)

            else:
                # If we end up here, iCurrent could very well be a
                # stutter peak of iOther, but it might be below the
                # reporting threshold.
                thisStutterOccurrenceExpected = \
                    (maximumOccurrenceExpectedForThisOther *
                     allelelist[iOther][colid_total])
                if thisStutterOccurrenceExpected < min_report:
                    print_db('Expected occurrence of stutter from '
                             '%i to %i is only %d' %
                             (iOther+1, iCurrent+1,
                              thisStutterOccurrenceExpected), debug)
                else:
                    isStutterPeakOf[iOther] = (blocksThatDiffer,
                        thisStutterOccurrenceExpected)
                    maximumOccurrenceExpected += thisStutterOccurrenceExpected
                    print_db('Expected occurence now %d' %
                             maximumOccurrenceExpected, debug)

        # Now we'll just need to check whether it does not appear
        # unrealistically often.
        if allelelist[iCurrent][colid_total] > maximumOccurrenceExpected:
            print_db('Occurs too often (%i > %d)' %
                     (allelelist[iCurrent][colid_total],
                      maximumOccurrenceExpected), debug)
            allelelist[iCurrent][-1] = "ALLELE"
        else:
            # Stutter peak detected.
jhoogenboom's avatar
jhoogenboom committed
351
352
            allelelist[iCurrent][-1] = "STUTTER:%s" % ":".join(
                "%.1fx%i(%s)" % (
jhoogenboom's avatar
jhoogenboom committed
353
354
                    isStutterPeakOf[iOther][1],
                    iOther+1,
jhoogenboom's avatar
jhoogenboom committed
355
356
                    "x".join(
                        "%i%+i" % (
jhoogenboom's avatar
jhoogenboom committed
357
                            block+1,
jhoogenboom's avatar
jhoogenboom committed
358
359
360
                            isStutterPeakOf[iOther][0][block])
                        for block in isStutterPeakOf[iOther][0]))
                for iOther in isStutterPeakOf)
jhoogenboom's avatar
jhoogenboom committed
361
362
363
364
365
366
367
            print_db("%2i is a stutter peak of %s; occur=%i, maxOccur=%s" %
                     (iCurrent+1, isStutterPeakOf,
                      allelelist[iCurrent][colid_total],
                      maximumOccurrenceExpected), debug)

    # Reconstruct the allele sequence and write out the findings.
    for i in range(len(allelelist)):
jhoogenboom's avatar
jhoogenboom committed
368
369
        allelelist[i][colid_allele] = "".join(
            "%s(%i)" % tuple(block) for block in allelelist[i][colid_allele])
jhoogenboom's avatar
jhoogenboom committed
370
371
    outfile.write("\t".join(column_names))
    outfile.write("\n")
jhoogenboom's avatar
jhoogenboom committed
372
373
    outfile.write("\n".join(
        "\t".join(map(str, allele)) for allele in allelelist))
jhoogenboom's avatar
jhoogenboom committed
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
    outfile.write("\n")
#annotate_alleles


def stutter_def_arg(value):
    """
    Convert a stutter definition string to dict, raise ArgumentTypeError
    on failure.

    The stutter definition string is a comma-separated list of int:float
    pairs, e.g., "-1:15,+1:3.5".  Whitespace around interpunction is
    permitted, e.g., "-1: 15, +1: 3.5".

    :arg min_reads: A valid stutter definition string.
    :type min_reads: str

    :returns: A dict mapping stutter size (keys) to a maximum expected
              percentage of stutter of this size.
    :rtype: dict{int: float}
    """
    if ":" not in value:
        return {}
    try:
jhoogenboom's avatar
jhoogenboom committed
397
398
        return {int(y[0]): float(y[1])
                for y in (x.split(":") for x in value.split(","))}
jhoogenboom's avatar
jhoogenboom committed
399
400
401
402
403
404
405
    except ValueError as e:
        raise argparse.ArgumentTypeError(
            "invalid stutter definition value: '%s' (%s)" % (value, str(e)))
#stutter_def_arg


def add_arguments(parser):
406
    add_input_output_args(parser, True, True, False)
jhoogenboom's avatar
jhoogenboom committed
407
408
409
410
411
412
    parser.add_argument('-s', '--stutter', metavar="DEF", type=stutter_def_arg,
        default=_DEF_STUTTERDEF,
        help="Define maximum expected stutter percentages.  The default value "
             "of '%(default)s' sets -1 stutter (loss of one repeat) to 15%%, "
             "+1 stutter (gain of one repeat) to 4%%.  Any unspecified "
             "stutter amount is assumed not to occur directly but e.g., a -2 "
jhoogenboom's avatar
jhoogenboom committed
413
             "stutter may still be recognised as two -1 stutters stacked "
jhoogenboom's avatar
jhoogenboom committed
414
             "together.  NOTE: It may be necessary to specify this option as "
jhoogenboom's avatar
jhoogenboom committed
415
             "'-s=%(default)s' (note the equals sign instead of a space).")
jhoogenboom's avatar
jhoogenboom committed
416
417
418
419
420
421
    parser.add_argument('-c', '--column-name', metavar="COLNAME",
        default=_DEF_COLNAME,
        help="name of the newly added column (default: '%(default)s')")
    filtergroup = parser.add_argument_group("filtering options")
    filtergroup.add_argument('-m', '--min-reads', metavar="N",
        type=pos_int_arg, default=_DEF_MIN_READS,
jhoogenboom's avatar
jhoogenboom committed
422
        help="set minimum number of reads to evaluate (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
423
424
    filtergroup.add_argument('-n', '--min-repeats', metavar="N",
        type=pos_int_arg, default=_DEF_MIN_REPEATS,
jhoogenboom's avatar
jhoogenboom committed
425
426
        help="set minimum number of repeats of a block that can possibly "
             "stutter (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
427
    filtergroup.add_argument('-r', '--min-report', metavar="N", type=float,
jhoogenboom's avatar
jhoogenboom committed
428
429
430
431
        default=_DEF_MIN_REPORT,
        help="alleles are only annotated as a stutter of some other allele if "
             "the expected number of stutter occurances of this other allele "
             "is above this value (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
432
    add_sequence_format_args(parser, "tssv", True)  # Force tssv seqs.
jhoogenboom's avatar
jhoogenboom committed
433
434
435
436
#add_arguments


def run(args):
437
438
    gen = get_input_output_files(args, True, True)
    if not gen:
jhoogenboom's avatar
jhoogenboom committed
439
440
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
441
442
443
444

    for tag, infiles, outfile in gen:
        # TODO: Aggregate data from all infiles of each sample.
        # This tool now only works properly with one infile per sample!
jhoogenboom's avatar
jhoogenboom committed
445
446
        if len(infiles) > 1:
            raise ValueError(
447
448
449
                "multiple input files for sample '%s' specified" % tag)
        infile = sys.stdin if infiles[0] == "-" else open(infiles[0], "r")
        annotate_alleles(infile, outfile, args.stutter, args.min_reads,
jhoogenboom's avatar
jhoogenboom committed
450
                         args.min_repeats, args.min_report, args.column_name,
451
452
453
                         args.library, args.debug)
        if infile != sys.stdin:
            infile.close()
jhoogenboom's avatar
jhoogenboom committed
454
#run