allelefinder.py 9.95 KB
Newer Older
jhoogenboom's avatar
jhoogenboom committed
1
#!/usr/bin/env python
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22

#
# Copyright (C) 2016 Jerry Hoogenboom
#
# This file is part of FDSTools, data analysis tools for Next
# Generation Sequencing of forensic DNA markers.
#
# FDSTools is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# FDSTools is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with FDSTools.  If not, see <http://www.gnu.org/licenses/>.
#

jhoogenboom's avatar
jhoogenboom committed
23
"""
24 25
Find true alleles in reference samples and detect possible
contaminations.
jhoogenboom's avatar
jhoogenboom committed
26 27 28

In each sample, the sequences with the highest read counts of each
marker are called alleles, with a user-defined maximum number of alleles
jhoogenboom's avatar
jhoogenboom committed
29
per marker.  The allele balance is kept within given bounds.  If the
jhoogenboom's avatar
jhoogenboom committed
30 31 32
highest non-allelic sequence exceeds a given limit, no alleles are
called for this marker.  If this happens for multiple markers in one
sample, no alleles are called for this sample at all.
jhoogenboom's avatar
jhoogenboom committed
33

jhoogenboom's avatar
jhoogenboom committed
34 35 36 37 38 39 40
The allele list obtained from allelefinder should always be checked
carefully before using it as the input of various other tools operating
on reference samples.  These tools rely heavily on the correctness of
this file to do their job.  One may use the allelefinder report
(-R/--report output argument) and the blame tool to get a quick overview
of what might be wrong.
"""
41
from ..lib import pos_int_arg, add_input_output_args, get_input_output_files, \
42
                  ensure_sequence_format, get_sample_data, SEQ_SPECIAL_VALUES,\
43
                  add_sequence_format_args
jhoogenboom's avatar
jhoogenboom committed
44

45
__version__ = "1.0.0"
jhoogenboom's avatar
jhoogenboom committed
46 47


48 49 50 51 52 53 54 55 56
# Default values for parameters are specified below.

# Default minimum number of reads required for the highest allele.
# This value can be overridden by the -n command line option.
_DEF_MIN_READS = 50

# Default minimum number of reads required for an allele to be called,
# as a percentage of the number of reads of the highest allele.
# This value can be overridden by the -m command line option.
jhoogenboom's avatar
jhoogenboom committed
57
_DEF_MIN_ALLELE_PCT = 30.0
58 59 60 61 62 63

# Default maximum amount of noise to allow, as a percentage of the
# number of reads of the highest allele of each marker.  If any noise
# (i.e., non-allelic sequences) above this threshold are detected, the
# sample is considered 'noisy' for this marker.
# This value can be overridden by the -M command line option.
jhoogenboom's avatar
jhoogenboom committed
64
_DEF_MAX_NOISE_PCT = 10.0
65 66 67 68 69 70

# Default maximum number of noisy markers allowed per sample.
# This value can be overridden by the -x command line option.
_DEF_MAX_NOISY = 2


71 72 73
def find_alleles(samples_in, outfile, reportfile, min_reads, min_allele_pct,
                 max_noise_pct, max_alleles, max_noisy, stuttermark_column,
                 seqformat, library):
74
    library = library if library is not None else {}
75

jhoogenboom's avatar
jhoogenboom committed
76
    outfile.write("\t".join(["sample", "marker", "total", "allele"]) + "\n")
77 78
    allelelist = {}
    get_sample_data(
79
        samples_in,
80 81
        lambda tag, data: find_alleles_sample(
            data if stuttermark_column is None
jhoogenboom's avatar
jhoogenboom committed
82
                 else {key: data[key] for key in data if key[0] in
83
                       allelelist[tag] and key[1] in allelelist[tag][key[0]]},
jhoogenboom's avatar
jhoogenboom committed
84
            outfile, reportfile, tag, min_reads, min_allele_pct, max_noise_pct,
85 86 87
            max_alleles, max_noisy, seqformat, library),
        allelelist,
        stuttermark_column)
88 89 90
#find_alleles


jhoogenboom's avatar
jhoogenboom committed
91 92 93
def find_alleles_sample(data, outfile, reportfile, tag, min_reads,
                        min_allele_pct, max_noise_pct, max_alleles, max_noisy,
                        seqformat, library):
jhoogenboom's avatar
jhoogenboom committed
94 95 96
    top_noise = {}
    top_allele = {}
    alleles = {}
97 98
    for marker, sequence in data:
        reads = sum(data[marker, sequence])
jhoogenboom's avatar
jhoogenboom committed
99

100
        if marker not in alleles:
101 102
            alleles[marker] = {}
            top_allele[marker] = 0
103
            top_noise[marker] = ["-", 0]
104

105 106 107 108
        if sequence in SEQ_SPECIAL_VALUES:
            continue

        if reads > top_allele[marker]:
109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
            # New highest allele!
            top_allele[marker] = reads
            for allele in alleles[marker].keys():
                if (alleles[marker][allele] <
                        top_allele[marker] * (min_allele_pct/100.)):
                    if alleles[marker][allele] > top_noise[marker][1]:
                        top_noise[marker] = [
                            allele, alleles[marker][allele]]
                    del alleles[marker][allele]
            alleles[marker][sequence] = reads
        elif reads >= top_allele[marker]*(min_allele_pct/100.):
            # New secundary allele!
            alleles[marker][sequence] = reads
        elif reads >= top_noise[marker][1]:
            # New highest noise!
            top_noise[marker] = [sequence, reads]
jhoogenboom's avatar
jhoogenboom committed
125

126 127
    # Find and eliminate noisy markers in this sample first.
    noisy_markers = 0
jhoogenboom's avatar
jhoogenboom committed
128
    for marker in alleles:
129
        if top_allele[marker] < min_reads:
jhoogenboom's avatar
jhoogenboom committed
130 131 132 133
            reportfile.write(
                "Sample %s is not suitable for marker %s:\n"
                "highest allele has only %i reads\n\n" %
                    (tag, marker, top_allele[marker]))
134 135
            alleles[marker] = {}
            continue
136 137
        expect = get_max_expected_alleles(max_alleles, marker, library)
        if len(alleles[marker]) > expect:
jhoogenboom's avatar
jhoogenboom committed
138 139
            allele_order = sorted(alleles[marker],
                                  key=lambda x: -alleles[marker][x])
140 141
            top_noise[marker] = [allele_order[expect],
                alleles[marker][allele_order[expect]]]
jhoogenboom's avatar
jhoogenboom committed
142
            alleles[marker] = {x: alleles[marker][x]
143
                               for x in allele_order[:expect]}
144
        if top_noise[marker][1] > top_allele[marker]*(max_noise_pct/100.):
jhoogenboom's avatar
jhoogenboom committed
145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
            reportfile.write(
                "Sample %s is not suitable for marker %s:\n"
                "highest non-allele is %.1f%% of the highest allele\n" %
                (tag, marker, 100.*top_noise[marker][1]/top_allele[marker]))
            for allele in sorted(alleles[marker],
                                 key=lambda x: -alleles[marker][x]):
                seq = allele if seqformat is None \
                    else ensure_sequence_format(allele, seqformat,
                        library=library, marker=marker)
                reportfile.write("%i\tALLELE\t%s\n" %
                    (alleles[marker][allele], seq))
            seq = top_noise[marker][0] if seqformat is None \
                else ensure_sequence_format(top_noise[marker][0],
                    seqformat, library=library, marker=marker)
            reportfile.write("%i\tNOISE\t%s\n\n" % (top_noise[marker][1], seq))
160 161 162 163 164
            noisy_markers += 1
            alleles[marker] = {}

    # Drop this sample completely if it has too many noisy markers.
    if noisy_markers > max_noisy:
jhoogenboom's avatar
jhoogenboom committed
165
        reportfile.write("Sample %s appears to be contaminated!\n\n" % tag)
166 167 168 169 170 171
        return

    # The sample is OK, write out its alleles.
    for marker in alleles:
        for allele in sorted(alleles[marker],
                             key=lambda x: -alleles[marker][x]):
jhoogenboom's avatar
jhoogenboom committed
172 173
            seq = allele if seqformat is None else ensure_sequence_format(
                allele, seqformat, library=library, marker=marker)
jhoogenboom's avatar
jhoogenboom committed
174 175
            outfile.write("\t".join(
                [tag, marker, str(alleles[marker][allele]), seq]) + "\n")
176
#find_alleles_sample
jhoogenboom's avatar
jhoogenboom committed
177 178


179 180 181 182 183 184 185 186 187
def get_max_expected_alleles(max_alleles, marker, library):
    if max_alleles is not None:
        return max_alleles
    if "max_expected_copies" in library:
        return library["max_expected_copies"].get(marker, 2)
    return 2
#get_max_expected_alleles


jhoogenboom's avatar
jhoogenboom committed
188
def add_arguments(parser):
189
    add_input_output_args(parser, False, False, True)
jhoogenboom's avatar
jhoogenboom committed
190 191 192
    filtergroup = parser.add_argument_group("filtering options")
    filtergroup.add_argument('-m', '--min-allele-pct', metavar="PCT",
        type=float, default=_DEF_MIN_ALLELE_PCT,
jhoogenboom's avatar
jhoogenboom committed
193
        help="call heterozygous if the second allele is at least this "
jhoogenboom's avatar
jhoogenboom committed
194 195
             "percentage of the highest allele of a marker "
             "(default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
196 197
    filtergroup.add_argument('-M', '--max-noise-pct', metavar="PCT",
        type=float, default=_DEF_MAX_NOISE_PCT,
198 199
        help="a sample is considered contaminated/unsuitable for a marker if "
             "the highest non-allelic sequence is at least this percentage of "
jhoogenboom's avatar
jhoogenboom committed
200
             "the highest allele of that marker (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
201 202 203
    filtergroup.add_argument('-n', '--min-reads', metavar="N",
        type=pos_int_arg, default=_DEF_MIN_READS,
        help="require at least this number of reads for the highest allele "
jhoogenboom's avatar
jhoogenboom committed
204
             "of each marker (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
205
    filtergroup.add_argument('-a', '--max-alleles', metavar="N",
206 207 208 209
        type=pos_int_arg,
        help="allow no more than this number of alleles per marker; if "
             "unspecified, the amounts given in the library file are used, "
             "which have a default value of 2")
jhoogenboom's avatar
jhoogenboom committed
210 211
    filtergroup.add_argument('-x', '--max-noisy', metavar="N",
        type=pos_int_arg, default=_DEF_MAX_NOISY,
212 213
        help="entirely reject a sample if more than this number of markers "
             "have a high non-allelic sequence (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
214
    filtergroup.add_argument('-c', '--stuttermark-column', metavar="COLNAME",
jhoogenboom's avatar
jhoogenboom committed
215 216 217
        help="name of column with Stuttermark output; if specified, sequences "
             "for which the value in this column does not start with ALLELE "
             "are ignored")
jhoogenboom's avatar
jhoogenboom committed
218
    add_sequence_format_args(parser)
jhoogenboom's avatar
jhoogenboom committed
219 220 221 222
#add_arguments


def run(args):
223 224
    files = get_input_output_files(args)
    if not files:
jhoogenboom's avatar
jhoogenboom committed
225 226
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
227

228 229 230 231
    find_alleles(files[0], files[1], args.report, args.min_reads,
                 args.min_allele_pct, args.max_noise_pct, args.max_alleles,
                 args.max_noisy, args.stuttermark_column, args.sequence_format,
                 args.library)
jhoogenboom's avatar
jhoogenboom committed
232
#run