bghomraw.py 7.83 KB
Newer Older
jhoogenboom's avatar
jhoogenboom committed
1
2
3
4
5
#!/usr/bin/env python
"""
Compute noise ratios for all noise detected in homozygous reference
samples.

jhoogenboom's avatar
jhoogenboom committed
6
7
8
9
With this tool, separate data points are produced for each sample, which
can be visualised using "fdstools vis bgraw".  Use bghomstats or
bgestimate to compute aggregate statistics on noise instead.
"""
jhoogenboom's avatar
jhoogenboom committed
10
from ..lib import pos_int_arg, add_input_output_args, get_input_output_files,\
11
                  add_allele_detection_args, parse_allelelist,\
jhoogenboom's avatar
jhoogenboom committed
12
                  get_sample_data, add_sequence_format_args
jhoogenboom's avatar
jhoogenboom committed
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51

__version__ = "0.1dev"


# Default values for parameters are specified below.

# Default minimum amount of background to consider, as a percentage of
# the highest allele.
# This value can be overridden by the -m command line option.
_DEF_THRESHOLD_PCT = 0.5

# Default minimum number of reads to consider.
# This value can be overridden by the -n command line option.
_DEF_THRESHOLD_ABS = 5

# Default minimum number of samples for each true allele.
# This value can be overridden by the -s command line option.
_DEF_MIN_SAMPLES = 2

# Default minimum number of samples required for each background product
# to be included in the analysis, as a percentage of the number of
# samples with a certain true allele.
# This value can be overridden by the -S command line option.
_DEF_MIN_SAMPLE_PCT = 80.


def add_sample_data(data, sample_data, sample_alleles, min_pct, min_abs, tag):
    # Check presence of all alleles.
    for marker in sample_alleles:
        allele = sample_alleles[marker]
        if (marker, allele) not in sample_data:
            raise ValueError(
                "Missing allele %s of marker %s!" % (allele, marker))
        elif 0 in sample_data[marker, allele]:
            raise ValueError(
                "Allele %s of marker %s has 0 reads!" % (allele, marker))

    # Enter the read counts into data and check the thresholds.
    for marker, sequence in sample_data:
52
        if marker not in sample_alleles or sequence is False:
jhoogenboom's avatar
jhoogenboom committed
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
            # Sample does not participate in this marker.
            continue
        allele = sample_alleles[marker]
        factors = [100./x for x in sample_data[marker, allele]]
        factors.append(100./sum(sample_data[marker, allele]))
        if (marker, allele) not in data:
            data[marker, allele] = {}
        if sequence not in data[marker, allele]:
            data[marker, allele][sequence] = {
                "tag": [],
                "forward": [],
                "reverse": [],
                "fnoise": [],
                "rnoise": [],
                "tnoise": [],
                "passed_filter": 0}
        data[marker, allele][sequence]["tag"].append(tag)
        data[marker, allele][sequence]["forward"].append(
            sample_data[marker, sequence][0])
        data[marker, allele][sequence]["reverse"].append(
            sample_data[marker, sequence][1])
        data[marker, allele][sequence]["fnoise"].append(
            sample_data[marker, sequence][0] * factors[0])
        data[marker, allele][sequence]["rnoise"].append(
            sample_data[marker, sequence][1] * factors[1])
        data[marker, allele][sequence]["tnoise"].append(
            sum(sample_data[marker, sequence]) * factors[2])
        if sum(count >= min_abs and count*factor >= min_pct
               for count, factor in
               zip(sample_data[marker, sequence], factors[:2])):
            data[marker, allele][sequence]["passed_filter"] += 1
#add_sample_data


def filter_data(data, min_samples, min_sample_pct):
    """
    Remove all alleles from data that have less than min_samples samples
    and remove all data of sequences that don't pass the detection
    thresholds in at least min_sample_pct per cent of the samples with a
    particular allele.
    """
    for marker, allele in data.keys():
        if data[marker, allele][allele]["passed_filter"] < min_samples:
            del data[marker, allele]
            continue
        factor = 100./data[marker, allele][allele]["passed_filter"]
        for sequence in data[marker, allele].keys():
            if (data[marker, allele][sequence]["passed_filter"] * factor <
                    min_sample_pct):
                del data[marker, allele][sequence]
                continue
#filter_data


def compute_ratios(samples_in, outfile, allelefile, annotation_column, min_pct,
                   min_abs, min_samples, min_sample_pct, seqformat, library,
                   marker):

111
    # Parse allele list.
jhoogenboom's avatar
jhoogenboom committed
112
113
114
115
116
117
118
119
120
121
122
    allelelist = {} if allelefile is None \
                    else parse_allelelist(allelefile, seqformat, library)

    # Read sample data.
    data = {}
    get_sample_data(
        samples_in,
        lambda tag, sample_data: add_sample_data(
            data, sample_data,
            {m: allelelist[tag][m].pop() for m in allelelist[tag]},
            min_pct, min_abs, tag),
123
        allelelist, annotation_column, seqformat, library, marker, True,
124
        drop_special_seq=True)
jhoogenboom's avatar
jhoogenboom committed
125
126
127
128
129
130
131
132
133
134
135

    # Ensure minimum number of samples per allele and filter
    # insignificant background products.
    filter_data(data, min_samples, min_sample_pct)

    outfile.write("\t".join(["sample", "marker", "allele", "sequence",
        "forward", "reverse", "total", "fnoise", "rnoise", "tnoise"]) + "\n")
    for marker, allele in data:
        for sequence in data[marker, allele]:
            for i in range(len(data[marker, allele][sequence]["tag"])):
                outfile.write("\t".join([
jhoogenboom's avatar
jhoogenboom committed
136
                    data[marker, allele][sequence]["tag"][i], marker, allele,
jhoogenboom's avatar
jhoogenboom committed
137
                    sequence] + [
138
                    "%.3g" % x if abs(x) > 0.0000000001 else "0" for x in (
jhoogenboom's avatar
jhoogenboom committed
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
                        data[marker, allele][sequence]["forward"][i],
                        data[marker, allele][sequence]["reverse"][i],
                        data[marker, allele][sequence]["forward"][i] +
                        data[marker, allele][sequence]["reverse"][i],
                        data[marker, allele][sequence]["fnoise"][i],
                        data[marker, allele][sequence]["rnoise"][i],
                        data[marker, allele][sequence]["tnoise"][i])]) + "\n")
#compute_ratios


def add_arguments(parser):
    add_input_output_args(parser)
    add_allele_detection_args(parser)
    filtergroup = parser.add_argument_group("filtering options")
    filtergroup.add_argument('-m', '--min-pct', metavar="PCT", type=float,
        default=_DEF_THRESHOLD_PCT,
        help="minimum amount of background to consider, as a percentage "
             "of the highest allele (default: %4.2f)" % _DEF_THRESHOLD_PCT)
    filtergroup.add_argument('-n', '--min-abs', metavar="N", type=pos_int_arg,
        default=_DEF_THRESHOLD_ABS,
        help="minimum amount of background to consider, as an absolute "
             "number of reads (default: %(default)s)")
    filtergroup.add_argument('-s', '--min-samples', metavar="N",
        type=pos_int_arg,
        default=_DEF_MIN_SAMPLES,
        help="require this minimum number of samples for each true allele "
             "(default: %(default)s)")
    filtergroup.add_argument('-S', '--min-sample-pct', metavar="PCT",
        type=float,
        default=_DEF_MIN_SAMPLE_PCT,
        help="require this minimum number of samples for each background "
             "product, as a percentage of the number of samples with a "
             "particular true allele (default: %(default)s)")
    filtergroup.add_argument('-M', '--marker', metavar="MARKER",
        help="work only on MARKER")
    add_sequence_format_args(parser)
#add_arguments


def run(args):
    files = get_input_output_files(args)
    if not files:
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
    compute_ratios(files[0], files[1], args.allelelist, args.annotation_column,
                   args.min_pct, args.min_abs, args.min_samples,
                   args.min_sample_pct, args.sequence_format, args.library,
                   args.marker)
#run