allelefinder.py 9.63 KB
Newer Older
jhoogenboom's avatar
jhoogenboom committed
1
2
#!/usr/bin/env python
"""
3
4
Find true alleles in reference samples and detect possible
contaminations.
jhoogenboom's avatar
jhoogenboom committed
5
6
7
"""
import argparse
import sys
8
import re
jhoogenboom's avatar
jhoogenboom committed
9

10
from ..lib import get_column_ids, pos_int_arg, get_tag
jhoogenboom's avatar
jhoogenboom committed
11
12
13
14

__version__ = "0.1dev"


15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# Default values for parameters are specified below.

# Default regular expression to capture sample tags in file names.
# This value can be overridden by the -e command line option.
_DEF_TAG_EXPR = "^(.+?)(?:\.[^.]+)?$"

# Default formatting template to write sample tags.
# This value can be overridden by the -f command line option.
_DEF_TAG_FORMAT = "\\1"

# Default minimum number of reads required for the highest allele.
# This value can be overridden by the -n command line option.
_DEF_MIN_READS = 50

# Default minimum number of reads required for an allele to be called,
# as a percentage of the number of reads of the highest allele.
# This value can be overridden by the -m command line option.
jhoogenboom's avatar
jhoogenboom committed
32
_DEF_MIN_ALLELE_PCT = 30.0
33
34
35
36
37
38

# Default maximum amount of noise to allow, as a percentage of the
# number of reads of the highest allele of each marker.  If any noise
# (i.e., non-allelic sequences) above this threshold are detected, the
# sample is considered 'noisy' for this marker.
# This value can be overridden by the -M command line option.
jhoogenboom's avatar
jhoogenboom committed
39
_DEF_MAX_NOISE_PCT = 10.0
40
41
42

# Default maximum number of alleles to expect for each marker.
# This value can be overridden by the -a command line option.
jhoogenboom's avatar
jhoogenboom committed
43
44
_DEF_MAX_ALLELES = 2

45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# Default maximum number of noisy markers allowed per sample.
# This value can be overridden by the -x command line option.
_DEF_MAX_NOISY = 2


def find_alleles(filelist, reportfile, tag_expr, tag_format, min_reads,
                 min_allele_pct, max_noise_pct, max_alleles, max_noisy,
                 stuttermark_column):

    print("\t".join(["sample", "marker", "total", "allele"]))
    for infile in filelist:
        tag = get_tag(infile.name, tag_expr, tag_format)
        find_alleles_sample(infile, reportfile, tag, min_reads, min_allele_pct,
            max_noise_pct, max_alleles, max_noisy, stuttermark_column)
#find_alleles


def find_alleles_sample(infile, reportfile, tag, min_reads, min_allele_pct,
                        max_noise_pct, max_alleles, max_noisy,
                        stuttermark_column):
jhoogenboom's avatar
jhoogenboom committed
65
66
67
68
69
70
71
72
73
74

    # Get column numbers.
    column_names = infile.readline().rstrip("\r\n").split("\t")
    colid_total, colid_allele, colid_name = get_column_ids(column_names,
        "total", "allele", "name")

    # Also get stuttermark column if we have one.
    if stuttermark_column is not None:
        colid_stuttermark = get_column_ids(column_names, stuttermark_column)

75
76
    top_noise = {}
    top_allele = {}
jhoogenboom's avatar
jhoogenboom committed
77
78
79
80
81
82
83
84
85
86
87
    alleles = {}
    for line in infile:
        line = line.rstrip("\r\n").split("\t")
        if (stuttermark_column is not None and
                not line[colid_stuttermark].startswith("ALLELE")):
            continue

        marker = line[colid_name]
        allele = line[colid_allele]
        reads = int(line[colid_total])

88
89
90
91
92
93
        if marker not in alleles:
            alleles[marker] = {allele: reads}
            top_allele[marker] = reads
            top_noise[marker] = ["-", 0]
        else:
            if reads > top_allele[marker]:
jhoogenboom's avatar
jhoogenboom committed
94
                # New highest allele!
95
96
97
98
99
100
101
102
103
104
                top_allele[marker] = reads
                for allelex in alleles[marker].keys():
                    if (alleles[marker][allelex] <
                            top_allele[marker] * (min_allele_pct/100.)):
                        if alleles[marker][allelex] > top_noise[marker][1]:
                            top_noise[marker] = [
                                allelex, alleles[marker][allelex]]
                        del alleles[marker][allelex]
                alleles[marker][allele] = reads
            elif reads >= top_allele[marker]*(min_allele_pct/100.):
jhoogenboom's avatar
jhoogenboom committed
105
106
                # New secundary allele!
                alleles[marker][allele] = reads
107
            elif reads >= top_noise[marker][1]:
jhoogenboom's avatar
jhoogenboom committed
108
                # New highest noise!
109
                top_noise[marker] = [allele, reads]
jhoogenboom's avatar
jhoogenboom committed
110

111
112
    # Find and eliminate noisy markers in this sample first.
    noisy_markers = 0
jhoogenboom's avatar
jhoogenboom committed
113
    for marker in alleles:
114
115
116
117
118
119
120
121
        if top_allele[marker] < min_reads:
            if reportfile:
                reportfile.write(
                    "Sample %s is not suitable for marker %s:\n"
                    "highest allele has only %i reads\n\n" %
                        (tag, marker, top_allele[marker]))
            alleles[marker] = {}
            continue
jhoogenboom's avatar
jhoogenboom committed
122
123
124
        if len(alleles[marker]) > max_alleles:
            allele_order = sorted(alleles[marker],
                                  key=lambda x: -alleles[marker][x])
125
126
            top_noise[marker] = [allele_order[max_alleles],
                alleles[marker][allele_order[max_alleles]]]
jhoogenboom's avatar
jhoogenboom committed
127
128
            alleles[marker] = {x: alleles[marker][x]
                               for x in allele_order[:max_alleles]}
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
        if top_noise[marker][1] > top_allele[marker]*(max_noise_pct/100.):
            if reportfile:
                reportfile.write(
                    "Sample %s is not suitable for marker %s:\n"
                    "highest non-allele is %.1f%% of the highest allele\n" %
                        (tag, marker,
                        100.*top_noise[marker][1]/top_allele[marker]))
                for allele in sorted(alleles[marker],
                                     key=lambda x: -alleles[marker][x]):
                    reportfile.write("%i\tALLELE\t%s\n" %
                        (alleles[marker][allele], allele))
                reportfile.write("%i\tNOISE\t%s\n\n" %
                    (top_noise[marker][1], top_noise[marker][0]))
            noisy_markers += 1
            alleles[marker] = {}

    # Drop this sample completely if it has too many noisy markers.
    if noisy_markers > max_noisy:
        if reportfile:
            reportfile.write("Sample %s appears to be contaminated!\n\n" % tag)
        return

    # The sample is OK, write out its alleles.
    for marker in alleles:
        for allele in sorted(alleles[marker],
                             key=lambda x: -alleles[marker][x]):
            print("\t".join(
                [tag, marker, str(alleles[marker][allele]), allele]))
#find_alleles_sample
jhoogenboom's avatar
jhoogenboom committed
158
159
160


def add_arguments(parser):
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
    parser.add_argument('filelist', nargs='*', metavar="FILE",
        default=[sys.stdin], type=argparse.FileType('r'),
        help="the data file(s) to process (default: read from stdin)")
    parser.add_argument('-r', '--report', metavar="OUTFILE",
        type=argparse.FileType("w"),
        help="write a report to the given file, detailing possibly "
             "contaminated or otherwise unsuitable samples")
    parser.add_argument('-e', '--tag-expr', metavar="REGEX", type=re.compile,
        default=_DEF_TAG_EXPR,
        help="regular expression that captures (using one or more capturing "
             "groups) the sample tags from the file names; by default, the "
             "entire file name except for its extension (if any) is captured")
    parser.add_argument('-f', '--tag-format', metavar="EXPR",
        default=_DEF_TAG_FORMAT,
        help="format of the sample tags produced; a capturing group reference "
             "like '\\n' refers to the n-th capturing group in the regular "
             "expression specified with -e/--tag-expr (the default of '\\1' "
             "simply uses the first capturing group); with a single sample, "
             "you can enter the samle tag here explicitly")
    parser.add_argument('-n', '--min-reads', metavar="N", type=pos_int_arg,
        default=_DEF_MIN_READS,
        help="require at least this number of reads for the highest allele "
             "(default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
184
185
186
187
188
    parser.add_argument('-m', '--min-allele-pct', metavar="N", type=float,
        default=_DEF_MIN_ALLELE_PCT,
        help="call heterozygous if the second allele is at least this "
             "percentage of the highest allele (default: %(default)s)")
    parser.add_argument('-M', '--max-noise-pct', metavar="N", type=float,
189
190
191
192
        default=_DEF_MAX_NOISE_PCT,
        help="a sample is considered contaminated/unsuitable for a marker if "
             "the highest non-allelic sequence is at least this percentage of "
             "the highest allele (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
193
    parser.add_argument('-a', '--max-alleles', metavar="N", type=pos_int_arg,
194
195
196
197
198
199
200
        default=_DEF_MAX_ALLELES,
        help="allow no more than this number of alleles per marker (default: "
             "%(default)s)")
    parser.add_argument('-x', '--max-noisy', metavar="N", type=pos_int_arg,
        default=_DEF_MAX_NOISY,
        help="entirely reject a sample if more than this number of markers "
             "have a high non-allelic sequence (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
201
202
203
204
205
206
207
208
209
    parser.add_argument('-c', '--stuttermark-column', metavar="COLNAME",
        default=None,
        help="name of column with Stuttermark output; if specified, sequences "
             "for which the value in this column does not start with ALLELE "
             "are ignored")
#add_arguments


def run(args):
210
    if args.filelist == [sys.stdin] and sys.stdin.isatty():
jhoogenboom's avatar
jhoogenboom committed
211
212
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
213
214
215
216

    find_alleles(args.filelist, args.report, args.tag_expr, args.tag_format,
                 args.min_reads, args.min_allele_pct, args.max_noise_pct,
                 args.max_alleles, args.max_noisy, args.stuttermark_column)
jhoogenboom's avatar
jhoogenboom committed
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
#run


def main():
    """
    Main entry point.
    """
    parser = argparse.ArgumentParser(
        description=__doc__)
    try:
        add_arguments(parser)
        run(parser.parse_args())
    except OSError as error:
        parser.error(error)
#main


if __name__ == "__main__":
    main()