allelefinder.py 9.25 KB
Newer Older
jhoogenboom's avatar
jhoogenboom committed
1
2
#!/usr/bin/env python
"""
3
4
Find true alleles in reference samples and detect possible
contaminations.
jhoogenboom's avatar
jhoogenboom committed
5
6
7

In each sample, the sequences with the highest read counts of each
marker are called alleles, with a user-defined maximum number of alleles
jhoogenboom's avatar
jhoogenboom committed
8
per marker.  The allele balance is kept within given bounds.  If the
jhoogenboom's avatar
jhoogenboom committed
9
10
11
highest non-allelic sequence exceeds a given limit, no alleles are
called for this marker.  If this happens for multiple markers in one
sample, no alleles are called for this sample at all.
jhoogenboom's avatar
jhoogenboom committed
12

jhoogenboom's avatar
jhoogenboom committed
13
14
15
16
17
18
19
The allele list obtained from allelefinder should always be checked
carefully before using it as the input of various other tools operating
on reference samples.  These tools rely heavily on the correctness of
this file to do their job.  One may use the allelefinder report
(-R/--report output argument) and the blame tool to get a quick overview
of what might be wrong.
"""
20
from ..lib import pos_int_arg, add_input_output_args, get_input_output_files, \
21
                  ensure_sequence_format, get_sample_data, \
22
                  add_sequence_format_args
jhoogenboom's avatar
jhoogenboom committed
23
24
25
26

__version__ = "0.1dev"


27
28
29
30
31
32
33
34
35
# Default values for parameters are specified below.

# Default minimum number of reads required for the highest allele.
# This value can be overridden by the -n command line option.
_DEF_MIN_READS = 50

# Default minimum number of reads required for an allele to be called,
# as a percentage of the number of reads of the highest allele.
# This value can be overridden by the -m command line option.
jhoogenboom's avatar
jhoogenboom committed
36
_DEF_MIN_ALLELE_PCT = 30.0
37
38
39
40
41
42

# Default maximum amount of noise to allow, as a percentage of the
# number of reads of the highest allele of each marker.  If any noise
# (i.e., non-allelic sequences) above this threshold are detected, the
# sample is considered 'noisy' for this marker.
# This value can be overridden by the -M command line option.
jhoogenboom's avatar
jhoogenboom committed
43
_DEF_MAX_NOISE_PCT = 10.0
44
45
46
47
48
49

# Default maximum number of noisy markers allowed per sample.
# This value can be overridden by the -x command line option.
_DEF_MAX_NOISY = 2


50
51
52
def find_alleles(samples_in, outfile, reportfile, min_reads, min_allele_pct,
                 max_noise_pct, max_alleles, max_noisy, stuttermark_column,
                 seqformat, library):
53
    library = library if library is not None else {}
54

jhoogenboom's avatar
jhoogenboom committed
55
    outfile.write("\t".join(["sample", "marker", "total", "allele"]) + "\n")
56
57
    allelelist = {}
    get_sample_data(
58
        samples_in,
59
60
        lambda tag, data: find_alleles_sample(
            data if stuttermark_column is None
jhoogenboom's avatar
jhoogenboom committed
61
62
                 else {key: data[key] for key in data if key[0] in
                       allelelist[tag]},
jhoogenboom's avatar
jhoogenboom committed
63
            outfile, reportfile, tag, min_reads, min_allele_pct, max_noise_pct,
64
65
66
            max_alleles, max_noisy, seqformat, library),
        allelelist,
        stuttermark_column)
67
68
69
#find_alleles


jhoogenboom's avatar
jhoogenboom committed
70
71
72
def find_alleles_sample(data, outfile, reportfile, tag, min_reads,
                        min_allele_pct, max_noise_pct, max_alleles, max_noisy,
                        seqformat, library):
jhoogenboom's avatar
jhoogenboom committed
73
74
75
    top_noise = {}
    top_allele = {}
    alleles = {}
76
77
    for marker, sequence in data:
        reads = sum(data[marker, sequence])
jhoogenboom's avatar
jhoogenboom committed
78

79
        if marker not in alleles:
80
81
            alleles[marker] = {}
            top_allele[marker] = 0
82
            top_noise[marker] = ["-", 0]
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103

        if sequence == "Other Sequences" and reads >= top_noise[marker][1]:
            # Aggregated sequences are new highest noise!
            top_noise[marker] = [sequence, reads]
        elif reads > top_allele[marker]:
            # New highest allele!
            top_allele[marker] = reads
            for allele in alleles[marker].keys():
                if (alleles[marker][allele] <
                        top_allele[marker] * (min_allele_pct/100.)):
                    if alleles[marker][allele] > top_noise[marker][1]:
                        top_noise[marker] = [
                            allele, alleles[marker][allele]]
                    del alleles[marker][allele]
            alleles[marker][sequence] = reads
        elif reads >= top_allele[marker]*(min_allele_pct/100.):
            # New secundary allele!
            alleles[marker][sequence] = reads
        elif reads >= top_noise[marker][1]:
            # New highest noise!
            top_noise[marker] = [sequence, reads]
jhoogenboom's avatar
jhoogenboom committed
104

105
106
    # Find and eliminate noisy markers in this sample first.
    noisy_markers = 0
jhoogenboom's avatar
jhoogenboom committed
107
    for marker in alleles:
108
        if top_allele[marker] < min_reads:
jhoogenboom's avatar
jhoogenboom committed
109
110
111
112
            reportfile.write(
                "Sample %s is not suitable for marker %s:\n"
                "highest allele has only %i reads\n\n" %
                    (tag, marker, top_allele[marker]))
113
114
            alleles[marker] = {}
            continue
115
116
        expect = get_max_expected_alleles(max_alleles, marker, library)
        if len(alleles[marker]) > expect:
jhoogenboom's avatar
jhoogenboom committed
117
118
            allele_order = sorted(alleles[marker],
                                  key=lambda x: -alleles[marker][x])
119
120
            top_noise[marker] = [allele_order[expect],
                alleles[marker][allele_order[expect]]]
jhoogenboom's avatar
jhoogenboom committed
121
            alleles[marker] = {x: alleles[marker][x]
122
                               for x in allele_order[:expect]}
123
        if top_noise[marker][1] > top_allele[marker]*(max_noise_pct/100.):
jhoogenboom's avatar
jhoogenboom committed
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
            reportfile.write(
                "Sample %s is not suitable for marker %s:\n"
                "highest non-allele is %.1f%% of the highest allele\n" %
                (tag, marker, 100.*top_noise[marker][1]/top_allele[marker]))
            for allele in sorted(alleles[marker],
                                 key=lambda x: -alleles[marker][x]):
                seq = allele if seqformat is None \
                    else ensure_sequence_format(allele, seqformat,
                        library=library, marker=marker)
                reportfile.write("%i\tALLELE\t%s\n" %
                    (alleles[marker][allele], seq))
            seq = top_noise[marker][0] if seqformat is None \
                else ensure_sequence_format(top_noise[marker][0],
                    seqformat, library=library, marker=marker)
            reportfile.write("%i\tNOISE\t%s\n\n" % (top_noise[marker][1], seq))
139
140
141
142
143
            noisy_markers += 1
            alleles[marker] = {}

    # Drop this sample completely if it has too many noisy markers.
    if noisy_markers > max_noisy:
jhoogenboom's avatar
jhoogenboom committed
144
        reportfile.write("Sample %s appears to be contaminated!\n\n" % tag)
145
146
147
148
149
150
        return

    # The sample is OK, write out its alleles.
    for marker in alleles:
        for allele in sorted(alleles[marker],
                             key=lambda x: -alleles[marker][x]):
jhoogenboom's avatar
jhoogenboom committed
151
152
            seq = allele if seqformat is None else ensure_sequence_format(
                allele, seqformat, library=library, marker=marker)
jhoogenboom's avatar
jhoogenboom committed
153
154
            outfile.write("\t".join(
                [tag, marker, str(alleles[marker][allele]), seq]) + "\n")
155
#find_alleles_sample
jhoogenboom's avatar
jhoogenboom committed
156
157


158
159
160
161
162
163
164
165
166
def get_max_expected_alleles(max_alleles, marker, library):
    if max_alleles is not None:
        return max_alleles
    if "max_expected_copies" in library:
        return library["max_expected_copies"].get(marker, 2)
    return 2
#get_max_expected_alleles


jhoogenboom's avatar
jhoogenboom committed
167
def add_arguments(parser):
168
    add_input_output_args(parser, False, False, True)
jhoogenboom's avatar
jhoogenboom committed
169
170
171
    filtergroup = parser.add_argument_group("filtering options")
    filtergroup.add_argument('-m', '--min-allele-pct', metavar="PCT",
        type=float, default=_DEF_MIN_ALLELE_PCT,
jhoogenboom's avatar
jhoogenboom committed
172
        help="call heterozygous if the second allele is at least this "
jhoogenboom's avatar
jhoogenboom committed
173
174
             "percentage of the highest allele of a marker "
             "(default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
175
176
    filtergroup.add_argument('-M', '--max-noise-pct', metavar="PCT",
        type=float, default=_DEF_MAX_NOISE_PCT,
177
178
        help="a sample is considered contaminated/unsuitable for a marker if "
             "the highest non-allelic sequence is at least this percentage of "
jhoogenboom's avatar
jhoogenboom committed
179
             "the highest allele of that marker (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
180
181
182
    filtergroup.add_argument('-n', '--min-reads', metavar="N",
        type=pos_int_arg, default=_DEF_MIN_READS,
        help="require at least this number of reads for the highest allele "
jhoogenboom's avatar
jhoogenboom committed
183
             "of each marker (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
184
    filtergroup.add_argument('-a', '--max-alleles', metavar="N",
185
186
187
188
        type=pos_int_arg,
        help="allow no more than this number of alleles per marker; if "
             "unspecified, the amounts given in the library file are used, "
             "which have a default value of 2")
jhoogenboom's avatar
jhoogenboom committed
189
190
    filtergroup.add_argument('-x', '--max-noisy', metavar="N",
        type=pos_int_arg, default=_DEF_MAX_NOISY,
191
192
        help="entirely reject a sample if more than this number of markers "
             "have a high non-allelic sequence (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
193
    filtergroup.add_argument('-c', '--stuttermark-column', metavar="COLNAME",
jhoogenboom's avatar
jhoogenboom committed
194
195
196
        help="name of column with Stuttermark output; if specified, sequences "
             "for which the value in this column does not start with ALLELE "
             "are ignored")
jhoogenboom's avatar
jhoogenboom committed
197
    add_sequence_format_args(parser)
jhoogenboom's avatar
jhoogenboom committed
198
199
200
201
#add_arguments


def run(args):
202
203
    files = get_input_output_files(args)
    if not files:
jhoogenboom's avatar
jhoogenboom committed
204
205
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
206

207
208
209
210
    find_alleles(files[0], files[1], args.report, args.min_reads,
                 args.min_allele_pct, args.max_noise_pct, args.max_alleles,
                 args.max_noisy, args.stuttermark_column, args.sequence_format,
                 args.library)
jhoogenboom's avatar
jhoogenboom committed
211
#run