allelefinder.py 9.88 KB
Newer Older
jhoogenboom's avatar
jhoogenboom committed
1
2
#!/usr/bin/env python
"""
3
4
Find true alleles in reference samples and detect possible
contaminations.
jhoogenboom's avatar
jhoogenboom committed
5
6
7
"""
import argparse
import sys
8
import re
jhoogenboom's avatar
jhoogenboom committed
9

jhoogenboom's avatar
jhoogenboom committed
10
11
from ..lib import get_column_ids, pos_int_arg, map_tags_to_files, \
                  add_sample_files_args, ensure_sequence_format
jhoogenboom's avatar
jhoogenboom committed
12
13
14
15

__version__ = "0.1dev"


16
17
18
19
20
21
22
23
24
# Default values for parameters are specified below.

# Default minimum number of reads required for the highest allele.
# This value can be overridden by the -n command line option.
_DEF_MIN_READS = 50

# Default minimum number of reads required for an allele to be called,
# as a percentage of the number of reads of the highest allele.
# This value can be overridden by the -m command line option.
jhoogenboom's avatar
jhoogenboom committed
25
_DEF_MIN_ALLELE_PCT = 30.0
26
27
28
29
30
31

# Default maximum amount of noise to allow, as a percentage of the
# number of reads of the highest allele of each marker.  If any noise
# (i.e., non-allelic sequences) above this threshold are detected, the
# sample is considered 'noisy' for this marker.
# This value can be overridden by the -M command line option.
jhoogenboom's avatar
jhoogenboom committed
32
_DEF_MAX_NOISE_PCT = 10.0
33
34
35

# Default maximum number of alleles to expect for each marker.
# This value can be overridden by the -a command line option.
jhoogenboom's avatar
jhoogenboom committed
36
37
_DEF_MAX_ALLELES = 2

38
39
40
41
42
43
44
# Default maximum number of noisy markers allowed per sample.
# This value can be overridden by the -x command line option.
_DEF_MAX_NOISY = 2


def find_alleles(filelist, reportfile, tag_expr, tag_format, min_reads,
                 min_allele_pct, max_noise_pct, max_alleles, max_noisy,
jhoogenboom's avatar
jhoogenboom committed
45
46
47
48
                 stuttermark_column, seqformat, library):

    if seqformat is not None and library is not None:
        library = parse_library(library)
49
50

    print("\t".join(["sample", "marker", "total", "allele"]))
jhoogenboom's avatar
jhoogenboom committed
51
52
53
54
55
56
57
    tags_to_files = map_tags_to_files(filelist, tag_expr, tag_format)
    for tag in tags_to_files:
        data = {}
        for infile in tags_to_files[tag]:
            get_sample_data(infile, data, stuttermark_column)
        find_alleles_sample(data, reportfile, tag, min_reads, min_allele_pct,
            max_noise_pct, max_alleles, max_noisy, seqformat, library)
58
59
60
#find_alleles


jhoogenboom's avatar
jhoogenboom committed
61
62
def get_sample_data(infile, data, stuttermark_column):
    """Add data from infile to data dict as [marker, allele]=reads."""
jhoogenboom's avatar
jhoogenboom committed
63
64
65
66
67
    # Get column numbers.
    column_names = infile.readline().rstrip("\r\n").split("\t")
    colid_total, colid_allele, colid_name = get_column_ids(column_names,
        "total", "allele", "name")

jhoogenboom's avatar
jhoogenboom committed
68
    # Also try to get stuttermark column if we have one.
jhoogenboom's avatar
jhoogenboom committed
69
    if stuttermark_column is not None:
jhoogenboom's avatar
jhoogenboom committed
70
71
72
73
74
        try:
            colid_stuttermark = get_column_ids(column_names,
                stuttermark_column)
        except:
            stuttermark_column = None
jhoogenboom's avatar
jhoogenboom committed
75
76
77
78
79
80

    for line in infile:
        line = line.rstrip("\r\n").split("\t")
        if (stuttermark_column is not None and
                not line[colid_stuttermark].startswith("ALLELE")):
            continue
jhoogenboom's avatar
jhoogenboom committed
81
82
83
        data[line[colid_name], line[colid_allele]] = int(line[colid_total])
#get_sample_data

jhoogenboom's avatar
jhoogenboom committed
84

jhoogenboom's avatar
jhoogenboom committed
85
86
87
88
89
90
91
92
def find_alleles_sample(data, reportfile, tag, min_reads, min_allele_pct,
                        max_noise_pct, max_alleles, max_noisy, seqformat,
                        library):
    top_noise = {}
    top_allele = {}
    alleles = {}
    for marker, allele in data:
        reads = data[marker, allele]
jhoogenboom's avatar
jhoogenboom committed
93

94
95
96
97
98
99
        if marker not in alleles:
            alleles[marker] = {allele: reads}
            top_allele[marker] = reads
            top_noise[marker] = ["-", 0]
        else:
            if reads > top_allele[marker]:
jhoogenboom's avatar
jhoogenboom committed
100
                # New highest allele!
101
102
103
104
105
106
107
108
109
110
                top_allele[marker] = reads
                for allelex in alleles[marker].keys():
                    if (alleles[marker][allelex] <
                            top_allele[marker] * (min_allele_pct/100.)):
                        if alleles[marker][allelex] > top_noise[marker][1]:
                            top_noise[marker] = [
                                allelex, alleles[marker][allelex]]
                        del alleles[marker][allelex]
                alleles[marker][allele] = reads
            elif reads >= top_allele[marker]*(min_allele_pct/100.):
jhoogenboom's avatar
jhoogenboom committed
111
112
                # New secundary allele!
                alleles[marker][allele] = reads
113
            elif reads >= top_noise[marker][1]:
jhoogenboom's avatar
jhoogenboom committed
114
                # New highest noise!
115
                top_noise[marker] = [allele, reads]
jhoogenboom's avatar
jhoogenboom committed
116

117
118
    # Find and eliminate noisy markers in this sample first.
    noisy_markers = 0
jhoogenboom's avatar
jhoogenboom committed
119
    for marker in alleles:
120
121
122
123
124
125
126
127
        if top_allele[marker] < min_reads:
            if reportfile:
                reportfile.write(
                    "Sample %s is not suitable for marker %s:\n"
                    "highest allele has only %i reads\n\n" %
                        (tag, marker, top_allele[marker]))
            alleles[marker] = {}
            continue
jhoogenboom's avatar
jhoogenboom committed
128
129
130
        if len(alleles[marker]) > max_alleles:
            allele_order = sorted(alleles[marker],
                                  key=lambda x: -alleles[marker][x])
131
132
            top_noise[marker] = [allele_order[max_alleles],
                alleles[marker][allele_order[max_alleles]]]
jhoogenboom's avatar
jhoogenboom committed
133
134
            alleles[marker] = {x: alleles[marker][x]
                               for x in allele_order[:max_alleles]}
135
136
137
138
139
140
141
142
143
        if top_noise[marker][1] > top_allele[marker]*(max_noise_pct/100.):
            if reportfile:
                reportfile.write(
                    "Sample %s is not suitable for marker %s:\n"
                    "highest non-allele is %.1f%% of the highest allele\n" %
                        (tag, marker,
                        100.*top_noise[marker][1]/top_allele[marker]))
                for allele in sorted(alleles[marker],
                                     key=lambda x: -alleles[marker][x]):
jhoogenboom's avatar
jhoogenboom committed
144
145
146
                    seq = allele if seqformat is None \
                        else ensure_sequence_format(allele, seqformat,
                            library=library, marker=marker)
147
                    reportfile.write("%i\tALLELE\t%s\n" %
jhoogenboom's avatar
jhoogenboom committed
148
149
150
151
                        (alleles[marker][allele], seq))
                seq = top_noise[marker][0] if seqformat is None \
                    else ensure_sequence_format(top_noise[marker][0],
                        seqformat, library=library, marker=marker)
152
                reportfile.write("%i\tNOISE\t%s\n\n" %
jhoogenboom's avatar
jhoogenboom committed
153
                    (top_noise[marker][1], seq))
154
155
156
157
158
159
160
161
162
163
164
165
166
            noisy_markers += 1
            alleles[marker] = {}

    # Drop this sample completely if it has too many noisy markers.
    if noisy_markers > max_noisy:
        if reportfile:
            reportfile.write("Sample %s appears to be contaminated!\n\n" % tag)
        return

    # The sample is OK, write out its alleles.
    for marker in alleles:
        for allele in sorted(alleles[marker],
                             key=lambda x: -alleles[marker][x]):
jhoogenboom's avatar
jhoogenboom committed
167
168
            seq = allele if seqformat is None else ensure_sequence_format(
                allele, seqformat, library=library, marker=marker)
169
            print("\t".join(
jhoogenboom's avatar
jhoogenboom committed
170
                [tag, marker, str(alleles[marker][allele]), seq]))
171
#find_alleles_sample
jhoogenboom's avatar
jhoogenboom committed
172
173
174


def add_arguments(parser):
jhoogenboom's avatar
jhoogenboom committed
175
    add_sample_files_args(parser)
176
177
178
179
180
181
182
183
    parser.add_argument('-r', '--report', metavar="OUTFILE",
        type=argparse.FileType("w"),
        help="write a report to the given file, detailing possibly "
             "contaminated or otherwise unsuitable samples")
    parser.add_argument('-n', '--min-reads', metavar="N", type=pos_int_arg,
        default=_DEF_MIN_READS,
        help="require at least this number of reads for the highest allele "
             "(default: %(default)s)")
184
    parser.add_argument('-m', '--min-allele-pct', metavar="PCT", type=float,
jhoogenboom's avatar
jhoogenboom committed
185
186
187
        default=_DEF_MIN_ALLELE_PCT,
        help="call heterozygous if the second allele is at least this "
             "percentage of the highest allele (default: %(default)s)")
188
    parser.add_argument('-M', '--max-noise-pct', metavar="PCT", type=float,
189
190
191
192
        default=_DEF_MAX_NOISE_PCT,
        help="a sample is considered contaminated/unsuitable for a marker if "
             "the highest non-allelic sequence is at least this percentage of "
             "the highest allele (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
193
    parser.add_argument('-a', '--max-alleles', metavar="N", type=pos_int_arg,
194
195
196
197
198
199
200
        default=_DEF_MAX_ALLELES,
        help="allow no more than this number of alleles per marker (default: "
             "%(default)s)")
    parser.add_argument('-x', '--max-noisy', metavar="N", type=pos_int_arg,
        default=_DEF_MAX_NOISY,
        help="entirely reject a sample if more than this number of markers "
             "have a high non-allelic sequence (default: %(default)s)")
jhoogenboom's avatar
jhoogenboom committed
201
202
203
204
    parser.add_argument('-c', '--stuttermark-column', metavar="COLNAME",
        help="name of column with Stuttermark output; if specified, sequences "
             "for which the value in this column does not start with ALLELE "
             "are ignored")
jhoogenboom's avatar
jhoogenboom committed
205
    parser.add_argument('-F', '--sequence-format', metavar="FORMAT",
jhoogenboom's avatar
jhoogenboom committed
206
        choices=("raw", "tssv", "allelename"),
jhoogenboom's avatar
jhoogenboom committed
207
208
209
210
211
        help="convert sequences to the specified format: one of %(choices)s "
             "(default: no conversion)")
    parser.add_argument('-l', '--library', metavar="LIBRARY",
        type=argparse.FileType('r'),
        help="library file for sequence format conversion")
jhoogenboom's avatar
jhoogenboom committed
212
213
214
215
#add_arguments


def run(args):
216
    if args.filelist == [sys.stdin] and sys.stdin.isatty():
jhoogenboom's avatar
jhoogenboom committed
217
218
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
219
220
221

    find_alleles(args.filelist, args.report, args.tag_expr, args.tag_format,
                 args.min_reads, args.min_allele_pct, args.max_noise_pct,
jhoogenboom's avatar
jhoogenboom committed
222
223
                 args.max_alleles, args.max_noisy, args.stuttermark_column,
                 args.sequence_format, args.library)
jhoogenboom's avatar
jhoogenboom committed
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
#run


def main():
    """
    Main entry point.
    """
    parser = argparse.ArgumentParser(
        description=__doc__)
    try:
        add_arguments(parser)
        run(parser.parse_args())
    except OSError as error:
        parser.error(error)
#main


if __name__ == "__main__":
    main()