allelefinder.py 9.11 KB
Newer Older
jhoogenboom's avatar
jhoogenboom committed
1
2
#!/usr/bin/env python
"""
3
4
Find true alleles in reference samples and detect possible
contaminations.
jhoogenboom's avatar
jhoogenboom committed
5
6
7
8
"""
import argparse
import sys

jhoogenboom's avatar
jhoogenboom committed
9
from ..lib import get_column_ids, pos_int_arg, map_tags_to_files, \
10
11
                  add_sample_files_args, ensure_sequence_format, \
                  get_sample_data
jhoogenboom's avatar
jhoogenboom committed
12
13
14
15

__version__ = "0.1dev"


16
17
18
19
20
21
22
23
24
# Default values for parameters are specified below.

# Default minimum number of reads required for the highest allele.
# This value can be overridden by the -n command line option.
_DEF_MIN_READS = 50

# Default minimum number of reads required for an allele to be called,
# as a percentage of the number of reads of the highest allele.
# This value can be overridden by the -m command line option.
jhoogenboom's avatar
jhoogenboom committed
25
_DEF_MIN_ALLELE_PCT = 30.0
26
27
28
29
30
31

# Default maximum amount of noise to allow, as a percentage of the
# number of reads of the highest allele of each marker.  If any noise
# (i.e., non-allelic sequences) above this threshold are detected, the
# sample is considered 'noisy' for this marker.
# This value can be overridden by the -M command line option.
jhoogenboom's avatar
jhoogenboom committed
32
_DEF_MAX_NOISE_PCT = 10.0
33
34
35

# Default maximum number of alleles to expect for each marker.
# This value can be overridden by the -a command line option.
jhoogenboom's avatar
jhoogenboom committed
36
37
_DEF_MAX_ALLELES = 2

38
39
40
41
42
43
44
# Default maximum number of noisy markers allowed per sample.
# This value can be overridden by the -x command line option.
_DEF_MAX_NOISY = 2


def find_alleles(filelist, reportfile, tag_expr, tag_format, min_reads,
                 min_allele_pct, max_noise_pct, max_alleles, max_noisy,
jhoogenboom's avatar
jhoogenboom committed
45
46
47
                 stuttermark_column, seqformat, library):
    if seqformat is not None and library is not None:
        library = parse_library(library)
48
49

    print("\t".join(["sample", "marker", "total", "allele"]))
50
51
52
53
54
55
56
57
58
59
    allelelist = {}
    get_sample_data(
        map_tags_to_files(filelist, tag_expr, tag_format),
        lambda tag, data: find_alleles_sample(
            data if stuttermark_column is None
                 else {key: data[key] for key in allelelist[tag]},
            reportfile, tag, min_reads, min_allele_pct, max_noise_pct,
            max_alleles, max_noisy, seqformat, library),
        allelelist,
        stuttermark_column)
60
61
62
#find_alleles


jhoogenboom's avatar
jhoogenboom committed
63
64
65
66
67
68
69
def find_alleles_sample(data, reportfile, tag, min_reads, min_allele_pct,
                        max_noise_pct, max_alleles, max_noisy, seqformat,
                        library):
    top_noise = {}
    top_allele = {}
    alleles = {}
    for marker, allele in data:
70
        reads = sum(data[marker, allele])
jhoogenboom's avatar
jhoogenboom committed
71

72
73
74
75
76
77
        if marker not in alleles:
            alleles[marker] = {allele: reads}
            top_allele[marker] = reads
            top_noise[marker] = ["-", 0]
        else:
            if reads > top_allele[marker]:
jhoogenboom's avatar
jhoogenboom committed
78
                # New highest allele!
79
80
81
82
83
84
85
86
87
88
                top_allele[marker] = reads
                for allelex in alleles[marker].keys():
                    if (alleles[marker][allelex] <
                            top_allele[marker] * (min_allele_pct/100.)):
                        if alleles[marker][allelex] > top_noise[marker][1]:
                            top_noise[marker] = [
                                allelex, alleles[marker][allelex]]
                        del alleles[marker][allelex]
                alleles[marker][allele] = reads
            elif reads >= top_allele[marker]*(min_allele_pct/100.):
jhoogenboom's avatar
jhoogenboom committed
89
90
                # New secundary allele!
                alleles[marker][allele] = reads
91
            elif reads >= top_noise[marker][1]:
jhoogenboom's avatar
jhoogenboom committed
92
                # New highest noise!
93
                top_noise[marker] = [allele, reads]
jhoogenboom's avatar
jhoogenboom committed
94

95
96
    # Find and eliminate noisy markers in this sample first.
    noisy_markers = 0
jhoogenboom's avatar
jhoogenboom committed
97
    for marker in alleles:
98
99
100
101
102
103
104
105
        if top_allele[marker] < min_reads:
            if reportfile:
                reportfile.write(
                    "Sample %s is not suitable for marker %s:\n"
                    "highest allele has only %i reads\n\n" %
                        (tag, marker, top_allele[marker]))
            alleles[marker] = {}
            continue
jhoogenboom's avatar
jhoogenboom committed
106
107
108
        if len(alleles[marker]) > max_alleles:
            allele_order = sorted(alleles[marker],
                                  key=lambda x: -alleles[marker][x])
109
110
            top_noise[marker] = [allele_order[max_alleles],
                alleles[marker][allele_order[max_alleles]]]
jhoogenboom's avatar
jhoogenboom committed
111
112
            alleles[marker] = {x: alleles[marker][x]
                               for x in allele_order[:max_alleles]}
113
114
115
116
117
118
119
120
121
        if top_noise[marker][1] > top_allele[marker]*(max_noise_pct/100.):
            if reportfile:
                reportfile.write(
                    "Sample %s is not suitable for marker %s:\n"
                    "highest non-allele is %.1f%% of the highest allele\n" %
                        (tag, marker,
                        100.*top_noise[marker][1]/top_allele[marker]))
                for allele in sorted(alleles[marker],
                                     key=lambda x: -alleles[marker][x]):
jhoogenboom's avatar
jhoogenboom committed
122
123
124
                    seq = allele if seqformat is None \
                        else ensure_sequence_format(allele, seqformat,
                            library=library, marker=marker)
125
                    reportfile.write("%i\tALLELE\t%s\n" %
jhoogenboom's avatar
jhoogenboom committed
126
127
128
129
                        (alleles[marker][allele], seq))
                seq = top_noise[marker][0] if seqformat is None \
                    else ensure_sequence_format(top_noise[marker][0],
                        seqformat, library=library, marker=marker)
130
                reportfile.write("%i\tNOISE\t%s\n\n" %
jhoogenboom's avatar
jhoogenboom committed
131
                    (top_noise[marker][1], seq))
132
133
134
135
136
137
138
139
140
141
142
143
144
            noisy_markers += 1
            alleles[marker] = {}

    # Drop this sample completely if it has too many noisy markers.
    if noisy_markers > max_noisy:
        if reportfile:
            reportfile.write("Sample %s appears to be contaminated!\n\n" % tag)
        return

    # The sample is OK, write out its alleles.
    for marker in alleles:
        for allele in sorted(alleles[marker],
                             key=lambda x: -alleles[marker][x]):
jhoogenboom's avatar
jhoogenboom committed
145
146
            seq = allele if seqformat is None else ensure_sequence_format(
                allele, seqformat, library=library, marker=marker)
147
            print("\t".join(
jhoogenboom's avatar
jhoogenboom committed
148
                [tag, marker, str(alleles[marker][allele]), seq]))
149
#find_alleles_sample
jhoogenboom's avatar
jhoogenboom committed
150
151
152


def add_arguments(parser):
jhoogenboom's avatar
jhoogenboom committed
153
    add_sample_files_args(parser)
154
155
156
157
158
159
160
161
    parser.add_argument('-r', '--report', metavar="OUTFILE",
        type=argparse.FileType("w"),
        help="write a report to the given file, detailing possibly "
             "contaminated or otherwise unsuitable samples")
    parser.add_argument('-n', '--min-reads', metavar="N", type=pos_int_arg,
        default=_DEF_MIN_READS,
        help="require at least this number of reads for the highest allele "
             "(default: %(default)s)")
162
    parser.add_argument('-m', '--min-allele-pct', metavar="PCT", type=float,
jhoogenboom's avatar
jhoogenboom committed
163
164
165
        default=_DEF_MIN_ALLELE_PCT,
        help="call heterozygous if the second allele is at least this "
             "percentage of the highest allele (default: %(default)s)")
166
    parser.add_argument('-M', '--max-noise-pct', metavar="PCT", type=float,
167
168
169
170
        default=_DEF_MAX_NOISE_PCT,
        help="a sample is considered contaminated/unsuitable for a marker if "
             "the highest non-allelic sequence is at least this percentage of "
             "the highest allele (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
171
    parser.add_argument('-a', '--max-alleles', metavar="N", type=pos_int_arg,
172
173
174
175
176
177
178
        default=_DEF_MAX_ALLELES,
        help="allow no more than this number of alleles per marker (default: "
             "%(default)s)")
    parser.add_argument('-x', '--max-noisy', metavar="N", type=pos_int_arg,
        default=_DEF_MAX_NOISY,
        help="entirely reject a sample if more than this number of markers "
             "have a high non-allelic sequence (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
179
180
181
182
    parser.add_argument('-c', '--stuttermark-column', metavar="COLNAME",
        help="name of column with Stuttermark output; if specified, sequences "
             "for which the value in this column does not start with ALLELE "
             "are ignored")
jhoogenboom's avatar
jhoogenboom committed
183
    parser.add_argument('-F', '--sequence-format', metavar="FORMAT",
jhoogenboom's avatar
jhoogenboom committed
184
        choices=("raw", "tssv", "allelename"),
jhoogenboom's avatar
jhoogenboom committed
185
186
187
188
189
        help="convert sequences to the specified format: one of %(choices)s "
             "(default: no conversion)")
    parser.add_argument('-l', '--library', metavar="LIBRARY",
        type=argparse.FileType('r'),
        help="library file for sequence format conversion")
jhoogenboom's avatar
jhoogenboom committed
190
191
192
193
#add_arguments


def run(args):
194
    if args.filelist == [sys.stdin] and sys.stdin.isatty():
jhoogenboom's avatar
jhoogenboom committed
195
196
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
197
198
199

    find_alleles(args.filelist, args.report, args.tag_expr, args.tag_format,
                 args.min_reads, args.min_allele_pct, args.max_noise_pct,
jhoogenboom's avatar
jhoogenboom committed
200
201
                 args.max_alleles, args.max_noisy, args.stuttermark_column,
                 args.sequence_format, args.library)
jhoogenboom's avatar
jhoogenboom committed
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#run


def main():
    """
    Main entry point.
    """
    parser = argparse.ArgumentParser(
        description=__doc__)
    try:
        add_arguments(parser)
        run(parser.parse_args())
    except OSError as error:
        parser.error(error)
#main


if __name__ == "__main__":
    main()