bgcorrect.py 8.99 KB
Newer Older
1 2
#!/usr/bin/env python
"""
jhoogenboom's avatar
jhoogenboom committed
3 4 5
Match background noise profiles (obtained from e.g., bgestimate) to
samples.

jhoogenboom's avatar
jhoogenboom committed
6
Nine new columns are added to the output giving, for each sequence, the
jhoogenboom's avatar
jhoogenboom committed
7 8
number of reads attributable to noise from other sequences (_noise
columns) and the number of noise reads caused by the prescense of this
jhoogenboom's avatar
jhoogenboom committed
9 10
sequence (_add columns), as well as the resulting number of reads after
correction (_corrected columns: original minus _noise plus _add).
11 12 13 14 15
"""
import argparse
#import numpy as np  # Only imported when actually running this tool.

from ..lib import parse_library, load_profiles, ensure_sequence_format, nnls, \
16 17
                  get_column_ids, add_sequence_format_args, \
                  add_input_output_args, get_input_output_files
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33

__version__ = "0.1dev"


def get_sample_data(infile, convert_to_raw=False, library=None):
    """
    Read data from the given file handle, corresponding to a single
    sample, and fill a dict with all sequences in the sample.
    """
    column_names = infile.readline().rstrip("\r\n").split("\t")
    column_names.append("forward_noise")
    column_names.append("reverse_noise")
    column_names.append("total_noise")
    column_names.append("forward_add")
    column_names.append("reverse_add")
    column_names.append("total_add")
jhoogenboom's avatar
jhoogenboom committed
34 35 36
    column_names.append("forward_corrected")
    column_names.append("reverse_corrected")
    column_names.append("total_corrected")
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
    colid_name, colid_allele, colid_forward, colid_reverse = get_column_ids(
        column_names, "name", "allele", "forward", "reverse")
    data = {}
    for line in infile:
        cols = line.rstrip("\r\n").split("\t")
        marker = cols[colid_name]
        if convert_to_raw:
            cols[colid_allele] = ensure_sequence_format(
                cols[colid_allele], "raw", library=library, marker=marker)
        cols[colid_forward] = int(cols[colid_forward])
        cols[colid_reverse] = int(cols[colid_reverse])
        cols.append(0)
        cols.append(0)
        cols.append(0)
        cols.append(0)
        cols.append(0)
        cols.append(0)
jhoogenboom's avatar
jhoogenboom committed
54 55 56
        cols.append(int(cols[colid_forward]))
        cols.append(int(cols[colid_reverse]))
        cols.append(int(cols[colid_forward]) + int(cols[colid_reverse]))
57 58 59 60 61 62 63 64 65 66 67
        if marker not in data:
            data[marker] = []
        data[marker].append(cols)
    return column_names, data
#get_sample_data


def match_profile(column_names, data, profile, convert_to_raw, library,
                  marker):
    (colid_name, colid_allele, colid_forward, colid_reverse, colid_total,
     colid_forward_noise, colid_reverse_noise, colid_total_noise,
jhoogenboom's avatar
jhoogenboom committed
68 69 70
     colid_forward_add, colid_reverse_add, colid_total_add,
     colid_forward_corrected, colid_reverse_corrected,
     colid_total_corrected) = get_column_ids(
71 72
        column_names, "name", "allele", "forward", "reverse", "total",
        "forward_noise", "reverse_noise", "total_noise", "forward_add",
jhoogenboom's avatar
jhoogenboom committed
73 74
        "reverse_add", "total_add", "forward_corrected", "reverse_corrected",
        "total_corrected")
75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100

    # Enter profiles into P.
    P1 = np.matrix(profile["forward"])
    P2 = np.matrix(profile["reverse"])

    # Enter sample into C.
    seqs = []
    C1 = np.matrix(np.zeros([1, profile["m"]]))
    C2 = np.matrix(np.zeros([1, profile["m"]]))
    for line in data:
        if convert_to_raw:
            allele = ensure_sequence_format(line[colid_allele], "raw",
                                            library=library, marker=marker)
        else:
            allele = line[colid_allele]
        seqs.append(allele)
        try:
            i = profile["seqs"].index(allele)
        except ValueError:
            # Note: Not adding any new sequences to the profile, since
            # they will just be zeroes and have no effect on the result.
            continue
        C1[0, i] = line[colid_forward]
        C2[0, i] = line[colid_reverse]

    # Compute corrected read counts.
jhoogenboom's avatar
jhoogenboom committed
101
    A = nnls(np.hstack([P1, P2]).T, np.hstack([C1, C2]).T).T
102 103
    np.fill_diagonal(P1, 0)
    np.fill_diagonal(P2, 0)
jhoogenboom's avatar
jhoogenboom committed
104 105 106 107
    forward_noise = A * P1
    reverse_noise = A * P2
    forward_add = np.multiply(A, P1.sum(1))
    reverse_add = np.multiply(A, P2.sum(1))
108 109 110 111 112 113 114 115 116 117 118

    j = 0
    for line in data:
        j += 1
        try:
            i = profile["seqs"].index(seqs[j-1])
        except ValueError:
            continue
        line[colid_forward_noise] = forward_noise[0, i]
        line[colid_reverse_noise] = reverse_noise[0, i]
        line[colid_total_noise] = forward_noise[0, i] + reverse_noise[0, i]
jhoogenboom's avatar
jhoogenboom committed
119 120 121
        line[colid_forward_corrected] -= line[colid_forward_noise]
        line[colid_reverse_corrected] -= line[colid_reverse_noise]
        line[colid_total_corrected] -= line[colid_total_noise]
122 123 124 125
        if i < profile["n"]:
            line[colid_forward_add] = forward_add[0, i]
            line[colid_reverse_add] = reverse_add[0, i]
            line[colid_total_add] = forward_add[0, i] + reverse_add[0, i]
jhoogenboom's avatar
jhoogenboom committed
126 127 128
            line[colid_forward_corrected] += line[colid_forward_add]
            line[colid_reverse_corrected] += line[colid_reverse_add]
            line[colid_total_corrected] += line[colid_total_add]
129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146

    # Add sequences that are in the profile but not in the sample.
    for i in range(profile["m"]):
        if profile["seqs"][i] in seqs:
            continue
        amount = forward_noise[0, i] + reverse_noise[0, i]
        if i < profile["n"]:
            amount += forward_add[0, i] + reverse_add[0, i]
        if amount > 0:
            line = [""] * len(column_names)
            line[colid_name] = marker
            line[colid_allele] = profile["seqs"][i]
            line[colid_forward] = 0
            line[colid_reverse] = 0
            line[colid_total] = 0
            line[colid_forward_noise] = forward_noise[0, i]
            line[colid_reverse_noise] = reverse_noise[0, i]
            line[colid_total_noise] = forward_noise[0, i] + reverse_noise[0, i]
jhoogenboom's avatar
jhoogenboom committed
147 148 149
            line[colid_forward_corrected] = -line[colid_forward_noise]
            line[colid_reverse_corrected] = -line[colid_reverse_noise]
            line[colid_total_corrected] = -line[colid_total_noise]
150 151 152 153
            if i < profile["n"]:
                line[colid_forward_add] = forward_add[0, i]
                line[colid_reverse_add] = reverse_add[0, i]
                line[colid_total_add] = forward_add[0, i] + reverse_add[0, i]
jhoogenboom's avatar
jhoogenboom committed
154 155 156
                line[colid_forward_corrected] += line[colid_forward_add]
                line[colid_reverse_corrected] += line[colid_reverse_add]
                line[colid_total_corrected] += line[colid_total_add]
157 158 159 160 161 162 163 164
            else:
                line[colid_forward_add] = 0
                line[colid_reverse_add] = 0
                line[colid_total_add] = 0
            data.append(line)
#match_profile


165
def match_profiles(infile, outfile, profiles, library, seqformat):
166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186
    column_names, data = get_sample_data(
        infile, convert_to_raw=seqformat=="raw", library=library)
    colid_allele = get_column_ids(column_names, "allele")

    outfile.write("\t".join(column_names) + "\n")
    for marker in data:
        if marker in profiles:
            match_profile(column_names, data[marker], profiles[marker],
                          seqformat!="raw", library, marker)
        for line in data[marker]:
            if seqformat is not None and seqformat != "raw":
                line[colid_allele] = ensure_sequence_format(line[colid_allele],
                    seqformat, library=library, marker=marker)
            outfile.write("\t".join(map(str, line)) + "\n")
#match_profiles


def add_arguments(parser):
    parser.add_argument('profiles', metavar="PROFILES",
        type=argparse.FileType('r'),
        help="file containing background noise profiles to match")
187
    add_input_output_args(parser, True, True, False)
jhoogenboom's avatar
jhoogenboom committed
188 189
    filtergroup = parser.add_argument_group("filtering options")
    filtergroup.add_argument('-M', '--marker', metavar="MARKER",
190
        help="work only on MARKER")
jhoogenboom's avatar
jhoogenboom committed
191
    add_sequence_format_args(parser)
192 193 194 195
#add_arguments


def run(args):
jhoogenboom's avatar
jhoogenboom committed
196 197 198 199
    # Import numpy now.
    import numpy as np
    global np

200 201
    gen = get_input_output_files(args, True, True)
    if not gen:
202 203
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
204 205 206 207 208 209 210 211 212 213 214 215 216 217

    # Read library and profiles once.
    library = parse_library(args.library) if args.library else None
    profiles = load_profiles(args.profiles, library)
    if args.marker:
        profiles = {args.marker: profiles[args.marker]} \
                   if args.marker in profiles else {}

    for tag, infiles, outfile in gen:
        # TODO: Aggregate data from all infiles of each sample.
        # This tool now only works properly with one infile per sample!
        for infile in infiles:
            match_profiles(infile, outfile, profiles, library,
                           args.sequence_format)
218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236
#run


def main():
    """
    Main entry point.
    """
    parser = argparse.ArgumentParser(
        description=__doc__)
    try:
        add_arguments(parser)
        run(parser.parse_args())
    except OSError as error:
        parser.error(error)
#main


if __name__ == "__main__":
    main()