bgmerge.py 4.73 KB
Newer Older
jhoogenboom's avatar
jhoogenboom committed
1
2
3
4
5
6
7
8
9
10
11
12
#!/usr/bin/env python
"""
Merge multiple files containing background noise profiles.

Background noise profiles are merged in the order in which they are
specified.  If multple files specify a different value for the same
allele and sequence, the value of the first file is used.

It is convenient to pipe the output of bgpredict and/or bgestimate
into bgmerge to merge that with an existing file containing background
profiles.  Specify '-' as one of the input files to read from stdin
(i.e., read input from a pipe).  If only one input file is specified,
jhoogenboom's avatar
jhoogenboom committed
13
14
15
'-' is implicitly used as the second input file.  Note that as a result,
in case of conflicting values, the value in the specified input file
will take precedence over the value in the data that was piped in.
jhoogenboom's avatar
jhoogenboom committed
16
17
18
19
20
21

Example: fdstools bgpredict ... | fdstools bgmerge old.txt > out.txt
"""
import argparse
import sys

22
from ..lib import load_profiles, ensure_sequence_format,\
jhoogenboom's avatar
jhoogenboom committed
23
24
25
26
27
28
29
30
                  add_sequence_format_args

__version__ = "0.1dev"


def merge_profiles(infiles, outfile, crosstab, seqformat, library):
    amounts = {}
    for infile in infiles:
31
32
33
34
35
        if infile == "-":
            profiles = load_profiles(sys.stdin, library)
        else:
            with open(infile, "r") as handle:
                profiles = load_profiles(handle, library)
jhoogenboom's avatar
jhoogenboom committed
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
        for marker in profiles:
            if marker not in amounts:
                amounts[marker] = {}
            for i in range(profiles[marker]["n"]):
                for j in range(profiles[marker]["m"]):
                    key = (profiles[marker]["seqs"][i],
                           profiles[marker]["seqs"][j])
                    if key not in amounts[marker]:
                        this_amounts = (profiles[marker]["forward"][i][j],
                                        profiles[marker]["reverse"][i][j])
                        if sum(this_amounts):
                            amounts[marker][key] = this_amounts

    if not crosstab:
        # Tab-separated columns format.
        outfile.write("\t".join(
            ["marker", "allele", "sequence", "fmean", "rmean"]) + "\n")
        for marker in amounts:
            for allele, sequence in amounts[marker]:
                outfile.write("\t".join([marker] +
                    [ensure_sequence_format(seq, seqformat, library=library,
                        marker=marker) for seq in (allele, sequence)] +
                    map(str, amounts[marker][allele, sequence])) + "\n")
        return

    # Cross-tabular format (profiles in rows).
    for marker in amounts:
        alleles = set(allele for allele, sequence in amounts[marker])
        seqs = list(alleles) + list(set(
            sequence for allele, sequence in amounts[marker]
            if sequence not in alleles))
        outfile.write("\t".join([marker, "0"] +
            [ensure_sequence_format(seq, seqformat, library=library,
                marker=marker) for seq in seqs]) + "\n")
        forward = [[0 if (allele, sequence) not in amounts[marker] else
                    amounts[marker][allele, sequence][0] for sequence in seqs]
                   for allele in seqs[:len(alleles)]]
        reverse = [[0 if (allele, sequence) not in amounts[marker] else
                    amounts[marker][allele, sequence][1] for sequence in seqs]
                   for allele in seqs[:len(alleles)]]
        for i in range(len(alleles)):
            outfile.write("\t".join(
                [marker, str(i+1)] + map(str, forward[i])) + "\n")
            outfile.write("\t".join(
                [marker, str(-i-1)] + map(str, reverse[i])) + "\n")
#merge_profiles


def add_arguments(parser):
    parser.add_argument('infiles', nargs='+', metavar="FILE",
        help="files containing the background noise profiles to combine; "
             "if a single file is given, it is merged with input from stdin; "
             "use '-' to use stdin as an explicit input source")
    outgroup = parser.add_argument_group("output file options")
    outgroup.add_argument('-o', '--output', dest="outfile", metavar="FILE",
        type=argparse.FileType('w'),
        default=sys.stdout,
        help="file to write output to (default: write to stdout)")
    parser.add_argument('-C', '--cross-tabular', action="store_true",
        help="if specified, a space-efficient cross-tabular output format is "
             "used instead of the default tab-separated columns format")
    add_sequence_format_args(parser, "raw", True)  # Force raw seqs.
#add_arguments


def run(args):
    if len(args.infiles) < 2:
103
        if sys.stdin.isatty() or "-" in args.infiles:
jhoogenboom's avatar
jhoogenboom committed
104
            raise ValueError("please specify at least two input files")
105
        args.infiles.append("-")
jhoogenboom's avatar
jhoogenboom committed
106
107
108
109

    merge_profiles(args.infiles, args.outfile, args.cross_tabular,
                   args.sequence_format, args.library)
#run