allelefinder.py 10.3 KB
Newer Older
jhoogenboom's avatar
jhoogenboom committed
1
#!/usr/bin/env python
2
3

#
4
# Copyright (C) 2017 Jerry Hoogenboom
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#
# This file is part of FDSTools, data analysis tools for Next
# Generation Sequencing of forensic DNA markers.
#
# FDSTools is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# FDSTools is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with FDSTools.  If not, see <http://www.gnu.org/licenses/>.
#

jhoogenboom's avatar
jhoogenboom committed
23
"""
24
25
Find true alleles in reference samples and detect possible
contaminations.
jhoogenboom's avatar
jhoogenboom committed
26
27
28

In each sample, the sequences with the highest read counts of each
marker are called alleles, with a user-defined maximum number of alleles
jhoogenboom's avatar
jhoogenboom committed
29
per marker.  The allele balance is kept within given bounds.  If the
jhoogenboom's avatar
jhoogenboom committed
30
31
32
highest non-allelic sequence exceeds a given limit, no alleles are
called for this marker.  If this happens for multiple markers in one
sample, no alleles are called for this sample at all.
jhoogenboom's avatar
jhoogenboom committed
33

jhoogenboom's avatar
jhoogenboom committed
34
35
36
37
The allele list obtained from allelefinder should always be checked
carefully before using it as the input of various other tools operating
on reference samples.  These tools rely heavily on the correctness of
this file to do their job.  One may use the allelefinder report
38
39
(-R/--report output argument) and the bganalyse tool to get a quick
overview of what might be wrong.
jhoogenboom's avatar
jhoogenboom committed
40
"""
41
from errno import EPIPE
42
from ..lib import pos_int_arg, add_input_output_args, get_input_output_files, \
43
                  ensure_sequence_format, get_sample_data, SEQ_SPECIAL_VALUES,\
44
                  add_sequence_format_args, write_pipe
jhoogenboom's avatar
jhoogenboom committed
45

46
__version__ = "1.0.1"
jhoogenboom's avatar
jhoogenboom committed
47
48


49
50
51
52
53
54
55
56
57
# Default values for parameters are specified below.

# Default minimum number of reads required for the highest allele.
# This value can be overridden by the -n command line option.
_DEF_MIN_READS = 50

# Default minimum number of reads required for an allele to be called,
# as a percentage of the number of reads of the highest allele.
# This value can be overridden by the -m command line option.
jhoogenboom's avatar
jhoogenboom committed
58
_DEF_MIN_ALLELE_PCT = 30.0
59
60
61
62
63
64

# Default maximum amount of noise to allow, as a percentage of the
# number of reads of the highest allele of each marker.  If any noise
# (i.e., non-allelic sequences) above this threshold are detected, the
# sample is considered 'noisy' for this marker.
# This value can be overridden by the -M command line option.
jhoogenboom's avatar
jhoogenboom committed
65
_DEF_MAX_NOISE_PCT = 10.0
66
67
68
69
70
71

# Default maximum number of noisy markers allowed per sample.
# This value can be overridden by the -x command line option.
_DEF_MAX_NOISY = 2


72
73
74
def find_alleles(samples_in, outfile, reportfile, min_reads, min_allele_pct,
                 max_noise_pct, max_alleles, max_noisy, stuttermark_column,
                 seqformat, library):
jhoogenboom's avatar
jhoogenboom committed
75
    outfile.write("\t".join(["sample", "marker", "total", "allele"]) + "\n")
76
77
    allelelist = {}
    get_sample_data(
78
        samples_in,
79
80
        lambda tag, data: find_alleles_sample(
            data if stuttermark_column is None
jhoogenboom's avatar
jhoogenboom committed
81
                 else {key: data[key] for key in data if key[0] in
82
                       allelelist[tag] and key[1] in allelelist[tag][key[0]]},
jhoogenboom's avatar
jhoogenboom committed
83
            outfile, reportfile, tag, min_reads, min_allele_pct, max_noise_pct,
84
85
86
            max_alleles, max_noisy, seqformat, library),
        allelelist,
        stuttermark_column)
87
88
89
#find_alleles


jhoogenboom's avatar
jhoogenboom committed
90
91
92
def find_alleles_sample(data, outfile, reportfile, tag, min_reads,
                        min_allele_pct, max_noise_pct, max_alleles, max_noisy,
                        seqformat, library):
jhoogenboom's avatar
jhoogenboom committed
93
94
95
    top_noise = {}
    top_allele = {}
    alleles = {}
96
97
    for marker, sequence in data:
        reads = sum(data[marker, sequence])
jhoogenboom's avatar
jhoogenboom committed
98

99
        if marker not in alleles:
100
101
            alleles[marker] = {}
            top_allele[marker] = 0
102
            top_noise[marker] = ["-", 0]
103

104
105
106
107
        if sequence in SEQ_SPECIAL_VALUES:
            continue

        if reads > top_allele[marker]:
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
            # New highest allele!
            top_allele[marker] = reads
            for allele in alleles[marker].keys():
                if (alleles[marker][allele] <
                        top_allele[marker] * (min_allele_pct/100.)):
                    if alleles[marker][allele] > top_noise[marker][1]:
                        top_noise[marker] = [
                            allele, alleles[marker][allele]]
                    del alleles[marker][allele]
            alleles[marker][sequence] = reads
        elif reads >= top_allele[marker]*(min_allele_pct/100.):
            # New secundary allele!
            alleles[marker][sequence] = reads
        elif reads >= top_noise[marker][1]:
            # New highest noise!
            top_noise[marker] = [sequence, reads]
jhoogenboom's avatar
jhoogenboom committed
124

125
126
    # Find and eliminate noisy markers in this sample first.
    noisy_markers = 0
jhoogenboom's avatar
jhoogenboom committed
127
    for marker in alleles:
128
        if top_allele[marker] < min_reads:
129
            write_pipe(reportfile,
jhoogenboom's avatar
jhoogenboom committed
130
131
132
                "Sample %s is not suitable for marker %s:\n"
                "highest allele has only %i reads\n\n" %
                    (tag, marker, top_allele[marker]))
133
134
            alleles[marker] = {}
            continue
135
136
        expect = get_max_expected_alleles(max_alleles, marker, library)
        if len(alleles[marker]) > expect:
jhoogenboom's avatar
jhoogenboom committed
137
138
            allele_order = sorted(alleles[marker],
                                  key=lambda x: -alleles[marker][x])
139
140
            top_noise[marker] = [allele_order[expect],
                alleles[marker][allele_order[expect]]]
jhoogenboom's avatar
jhoogenboom committed
141
            alleles[marker] = {x: alleles[marker][x]
142
                               for x in allele_order[:expect]}
143
        if top_noise[marker][1] > top_allele[marker]*(max_noise_pct/100.):
144
            write_pipe(reportfile,
jhoogenboom's avatar
jhoogenboom committed
145
146
147
148
149
150
151
152
                "Sample %s is not suitable for marker %s:\n"
                "highest non-allele is %.1f%% of the highest allele\n" %
                (tag, marker, 100.*top_noise[marker][1]/top_allele[marker]))
            for allele in sorted(alleles[marker],
                                 key=lambda x: -alleles[marker][x]):
                seq = allele if seqformat is None \
                    else ensure_sequence_format(allele, seqformat,
                        library=library, marker=marker)
153
                write_pipe(reportfile, "%i\tALLELE\t%s\n" %
jhoogenboom's avatar
jhoogenboom committed
154
155
156
157
                    (alleles[marker][allele], seq))
            seq = top_noise[marker][0] if seqformat is None \
                else ensure_sequence_format(top_noise[marker][0],
                    seqformat, library=library, marker=marker)
158
159
            write_pipe(reportfile,
                "%i\tNOISE\t%s\n\n" % (top_noise[marker][1], seq))
160
161
162
163
164
            noisy_markers += 1
            alleles[marker] = {}

    # Drop this sample completely if it has too many noisy markers.
    if noisy_markers > max_noisy:
165
166
        write_pipe(reportfile,
            "Sample %s appears to be contaminated!\n\n" % tag)
167
168
169
170
171
172
        return

    # The sample is OK, write out its alleles.
    for marker in alleles:
        for allele in sorted(alleles[marker],
                             key=lambda x: -alleles[marker][x]):
jhoogenboom's avatar
jhoogenboom committed
173
174
            seq = allele if seqformat is None else ensure_sequence_format(
                allele, seqformat, library=library, marker=marker)
jhoogenboom's avatar
jhoogenboom committed
175
176
            outfile.write("\t".join(
                [tag, marker, str(alleles[marker][allele]), seq]) + "\n")
177
#find_alleles_sample
jhoogenboom's avatar
jhoogenboom committed
178
179


180
181
182
def get_max_expected_alleles(max_alleles, marker, library):
    if max_alleles is not None:
        return max_alleles
183
    if library is not None and "max_expected_copies" in library:
184
185
186
187
188
        return library["max_expected_copies"].get(marker, 2)
    return 2
#get_max_expected_alleles


jhoogenboom's avatar
jhoogenboom committed
189
def add_arguments(parser):
190
    add_input_output_args(parser, False, False, True)
jhoogenboom's avatar
jhoogenboom committed
191
192
193
    filtergroup = parser.add_argument_group("filtering options")
    filtergroup.add_argument('-m', '--min-allele-pct', metavar="PCT",
        type=float, default=_DEF_MIN_ALLELE_PCT,
jhoogenboom's avatar
jhoogenboom committed
194
        help="call heterozygous if the second allele is at least this "
jhoogenboom's avatar
jhoogenboom committed
195
196
             "percentage of the highest allele of a marker "
             "(default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
197
198
    filtergroup.add_argument('-M', '--max-noise-pct', metavar="PCT",
        type=float, default=_DEF_MAX_NOISE_PCT,
199
200
        help="a sample is considered contaminated/unsuitable for a marker if "
             "the highest non-allelic sequence is at least this percentage of "
jhoogenboom's avatar
jhoogenboom committed
201
             "the highest allele of that marker (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
202
203
204
    filtergroup.add_argument('-n', '--min-reads', metavar="N",
        type=pos_int_arg, default=_DEF_MIN_READS,
        help="require at least this number of reads for the highest allele "
jhoogenboom's avatar
jhoogenboom committed
205
             "of each marker (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
206
    filtergroup.add_argument('-a', '--max-alleles', metavar="N",
207
208
209
210
        type=pos_int_arg,
        help="allow no more than this number of alleles per marker; if "
             "unspecified, the amounts given in the library file are used, "
             "which have a default value of 2")
jhoogenboom's avatar
jhoogenboom committed
211
212
    filtergroup.add_argument('-x', '--max-noisy', metavar="N",
        type=pos_int_arg, default=_DEF_MAX_NOISY,
213
214
        help="entirely reject a sample if more than this number of markers "
             "have a high non-allelic sequence (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
215
    filtergroup.add_argument('-c', '--stuttermark-column', metavar="COLNAME",
jhoogenboom's avatar
jhoogenboom committed
216
217
218
        help="name of column with Stuttermark output; if specified, sequences "
             "for which the value in this column does not start with ALLELE "
             "are ignored")
jhoogenboom's avatar
jhoogenboom committed
219
    add_sequence_format_args(parser)
jhoogenboom's avatar
jhoogenboom committed
220
221
222
223
#add_arguments


def run(args):
224
225
    files = get_input_output_files(args)
    if not files:
jhoogenboom's avatar
jhoogenboom committed
226
227
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
228

229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
    try:
        find_alleles(files[0], files[1], args.report, args.min_reads,
                     args.min_allele_pct, args.max_noise_pct, args.max_alleles,
                     args.max_noisy, args.stuttermark_column,
                     args.sequence_format, args.library)
    except IOError as e:
        if e.errno == EPIPE:
            if args.report:
                try:
                    write_pipe(args.report,
                        "Stopped early because the output was closed.\n")
                except:
                    pass
            return
        raise
jhoogenboom's avatar
jhoogenboom committed
244
#run