allelefinder.py 8.96 KB
Newer Older
jhoogenboom's avatar
jhoogenboom committed
1
2
#!/usr/bin/env python
"""
3
4
Find true alleles in reference samples and detect possible
contaminations.
jhoogenboom's avatar
jhoogenboom committed
5
6
7
8
9
10
11

In each sample, the sequences with the highest read counts of each
marker are called alleles, with a user-defined maximum number of alleles
par marker.  The allele balance is kept within given bounds.  If the
highest non-allelic sequence exceeds a given limit, no alleles are
called for this marker.  If this happens for multiple markers in one
sample, no alleles are called for this sample at all.
jhoogenboom's avatar
jhoogenboom committed
12
13
14
15
"""
import argparse
import sys

jhoogenboom's avatar
jhoogenboom committed
16
from ..lib import get_column_ids, pos_int_arg, map_tags_to_files, \
17
                  add_sample_files_args, ensure_sequence_format, \
jhoogenboom's avatar
jhoogenboom committed
18
                  get_sample_data, add_sequence_format_args, add_output_args
jhoogenboom's avatar
jhoogenboom committed
19
20
21
22

__version__ = "0.1dev"


23
24
25
26
27
28
29
30
31
# Default values for parameters are specified below.

# Default minimum number of reads required for the highest allele.
# This value can be overridden by the -n command line option.
_DEF_MIN_READS = 50

# Default minimum number of reads required for an allele to be called,
# as a percentage of the number of reads of the highest allele.
# This value can be overridden by the -m command line option.
jhoogenboom's avatar
jhoogenboom committed
32
_DEF_MIN_ALLELE_PCT = 30.0
33
34
35
36
37
38

# Default maximum amount of noise to allow, as a percentage of the
# number of reads of the highest allele of each marker.  If any noise
# (i.e., non-allelic sequences) above this threshold are detected, the
# sample is considered 'noisy' for this marker.
# This value can be overridden by the -M command line option.
jhoogenboom's avatar
jhoogenboom committed
39
_DEF_MAX_NOISE_PCT = 10.0
40
41
42

# Default maximum number of alleles to expect for each marker.
# This value can be overridden by the -a command line option.
jhoogenboom's avatar
jhoogenboom committed
43
44
_DEF_MAX_ALLELES = 2

45
46
47
48
49
# Default maximum number of noisy markers allowed per sample.
# This value can be overridden by the -x command line option.
_DEF_MAX_NOISY = 2


jhoogenboom's avatar
jhoogenboom committed
50
51
52
def find_alleles(filelist, outfile, reportfile, tag_expr, tag_format,
                 min_reads, min_allele_pct, max_noise_pct, max_alleles,
                 max_noisy, stuttermark_column, seqformat, library):
jhoogenboom's avatar
jhoogenboom committed
53
54
    if seqformat is not None and library is not None:
        library = parse_library(library)
55

jhoogenboom's avatar
jhoogenboom committed
56
    outfile.write("\t".join(["sample", "marker", "total", "allele"]) + "\n")
57
58
59
60
61
62
    allelelist = {}
    get_sample_data(
        map_tags_to_files(filelist, tag_expr, tag_format),
        lambda tag, data: find_alleles_sample(
            data if stuttermark_column is None
                 else {key: data[key] for key in allelelist[tag]},
jhoogenboom's avatar
jhoogenboom committed
63
            outfile, reportfile, tag, min_reads, min_allele_pct, max_noise_pct,
64
65
66
            max_alleles, max_noisy, seqformat, library),
        allelelist,
        stuttermark_column)
67
68
69
#find_alleles


jhoogenboom's avatar
jhoogenboom committed
70
71
72
def find_alleles_sample(data, outfile, reportfile, tag, min_reads,
                        min_allele_pct, max_noise_pct, max_alleles, max_noisy,
                        seqformat, library):
jhoogenboom's avatar
jhoogenboom committed
73
74
75
76
    top_noise = {}
    top_allele = {}
    alleles = {}
    for marker, allele in data:
77
        reads = sum(data[marker, allele])
jhoogenboom's avatar
jhoogenboom committed
78

79
80
81
82
83
84
        if marker not in alleles:
            alleles[marker] = {allele: reads}
            top_allele[marker] = reads
            top_noise[marker] = ["-", 0]
        else:
            if reads > top_allele[marker]:
jhoogenboom's avatar
jhoogenboom committed
85
                # New highest allele!
86
87
88
89
90
91
92
93
94
95
                top_allele[marker] = reads
                for allelex in alleles[marker].keys():
                    if (alleles[marker][allelex] <
                            top_allele[marker] * (min_allele_pct/100.)):
                        if alleles[marker][allelex] > top_noise[marker][1]:
                            top_noise[marker] = [
                                allelex, alleles[marker][allelex]]
                        del alleles[marker][allelex]
                alleles[marker][allele] = reads
            elif reads >= top_allele[marker]*(min_allele_pct/100.):
jhoogenboom's avatar
jhoogenboom committed
96
97
                # New secundary allele!
                alleles[marker][allele] = reads
98
            elif reads >= top_noise[marker][1]:
jhoogenboom's avatar
jhoogenboom committed
99
                # New highest noise!
100
                top_noise[marker] = [allele, reads]
jhoogenboom's avatar
jhoogenboom committed
101

102
103
    # Find and eliminate noisy markers in this sample first.
    noisy_markers = 0
jhoogenboom's avatar
jhoogenboom committed
104
    for marker in alleles:
105
        if top_allele[marker] < min_reads:
jhoogenboom's avatar
jhoogenboom committed
106
107
108
109
            reportfile.write(
                "Sample %s is not suitable for marker %s:\n"
                "highest allele has only %i reads\n\n" %
                    (tag, marker, top_allele[marker]))
110
111
            alleles[marker] = {}
            continue
jhoogenboom's avatar
jhoogenboom committed
112
113
114
        if len(alleles[marker]) > max_alleles:
            allele_order = sorted(alleles[marker],
                                  key=lambda x: -alleles[marker][x])
115
116
            top_noise[marker] = [allele_order[max_alleles],
                alleles[marker][allele_order[max_alleles]]]
jhoogenboom's avatar
jhoogenboom committed
117
118
            alleles[marker] = {x: alleles[marker][x]
                               for x in allele_order[:max_alleles]}
119
        if top_noise[marker][1] > top_allele[marker]*(max_noise_pct/100.):
jhoogenboom's avatar
jhoogenboom committed
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
            reportfile.write(
                "Sample %s is not suitable for marker %s:\n"
                "highest non-allele is %.1f%% of the highest allele\n" %
                (tag, marker, 100.*top_noise[marker][1]/top_allele[marker]))
            for allele in sorted(alleles[marker],
                                 key=lambda x: -alleles[marker][x]):
                seq = allele if seqformat is None \
                    else ensure_sequence_format(allele, seqformat,
                        library=library, marker=marker)
                reportfile.write("%i\tALLELE\t%s\n" %
                    (alleles[marker][allele], seq))
            seq = top_noise[marker][0] if seqformat is None \
                else ensure_sequence_format(top_noise[marker][0],
                    seqformat, library=library, marker=marker)
            reportfile.write("%i\tNOISE\t%s\n\n" % (top_noise[marker][1], seq))
135
136
137
138
139
            noisy_markers += 1
            alleles[marker] = {}

    # Drop this sample completely if it has too many noisy markers.
    if noisy_markers > max_noisy:
jhoogenboom's avatar
jhoogenboom committed
140
        reportfile.write("Sample %s appears to be contaminated!\n\n" % tag)
141
142
143
144
145
146
        return

    # The sample is OK, write out its alleles.
    for marker in alleles:
        for allele in sorted(alleles[marker],
                             key=lambda x: -alleles[marker][x]):
jhoogenboom's avatar
jhoogenboom committed
147
148
            seq = allele if seqformat is None else ensure_sequence_format(
                allele, seqformat, library=library, marker=marker)
jhoogenboom's avatar
jhoogenboom committed
149
150
            outfile.write("\t".join(
                [tag, marker, str(alleles[marker][allele]), seq]) + "\n")
151
#find_alleles_sample
jhoogenboom's avatar
jhoogenboom committed
152
153
154


def add_arguments(parser):
jhoogenboom's avatar
jhoogenboom committed
155
156
157
158
    add_output_args(parser)
    filtergroup = parser.add_argument_group("filtering options")
    filtergroup.add_argument('-m', '--min-allele-pct', metavar="PCT",
        type=float, default=_DEF_MIN_ALLELE_PCT,
jhoogenboom's avatar
jhoogenboom committed
159
160
        help="call heterozygous if the second allele is at least this "
             "percentage of the highest allele (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
161
162
    filtergroup.add_argument('-M', '--max-noise-pct', metavar="PCT",
        type=float, default=_DEF_MAX_NOISE_PCT,
163
164
165
        help="a sample is considered contaminated/unsuitable for a marker if "
             "the highest non-allelic sequence is at least this percentage of "
             "the highest allele (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
166
167
168
169
170
171
    filtergroup.add_argument('-n', '--min-reads', metavar="N",
        type=pos_int_arg, default=_DEF_MIN_READS,
        help="require at least this number of reads for the highest allele "
             "(default: %(default)s)")
    filtergroup.add_argument('-a', '--max-alleles', metavar="N",
        type=pos_int_arg, default=_DEF_MAX_ALLELES,
172
173
        help="allow no more than this number of alleles per marker (default: "
             "%(default)s)")
jhoogenboom's avatar
jhoogenboom committed
174
175
    filtergroup.add_argument('-x', '--max-noisy', metavar="N",
        type=pos_int_arg, default=_DEF_MAX_NOISY,
176
177
        help="entirely reject a sample if more than this number of markers "
             "have a high non-allelic sequence (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
178
    filtergroup.add_argument('-c', '--stuttermark-column', metavar="COLNAME",
jhoogenboom's avatar
jhoogenboom committed
179
180
181
        help="name of column with Stuttermark output; if specified, sequences "
             "for which the value in this column does not start with ALLELE "
             "are ignored")
jhoogenboom's avatar
jhoogenboom committed
182
183
    add_sequence_format_args(parser)
    add_sample_files_args(parser)
jhoogenboom's avatar
jhoogenboom committed
184
185
186
187
#add_arguments


def run(args):
188
    if args.filelist == [sys.stdin] and sys.stdin.isatty():
jhoogenboom's avatar
jhoogenboom committed
189
190
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
191

jhoogenboom's avatar
jhoogenboom committed
192
193
194
195
    find_alleles(args.filelist, args.output, args.report, args.tag_expr,
                 args.tag_format, args.min_reads, args.min_allele_pct,
                 args.max_noise_pct, args.max_alleles, args.max_noisy,
                 args.stuttermark_column, args.sequence_format, args.library)
jhoogenboom's avatar
jhoogenboom committed
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
#run


def main():
    """
    Main entry point.
    """
    parser = argparse.ArgumentParser(
        description=__doc__)
    try:
        add_arguments(parser)
        run(parser.parse_args())
    except OSError as error:
        parser.error(error)
#main


if __name__ == "__main__":
    main()