libconvert.py 17.1 KB
Newer Older
jhoogenboom's avatar
jhoogenboom committed
1
2
#!/usr/bin/env python
"""
3
4
Convert between TSSV (tab-separated) and FDSTools (ini-style) library
formats.  When no input is given, an empty FDSTools library is produced.
jhoogenboom's avatar
jhoogenboom committed
5
6
7
8
9
10
11
12
13
14
15
16
17

FDSTools uses library files to convert sequences to allele names and
vice versa.  Because FDSTools was made to work with the output of TSSV,
it has been made compatible with TSSV's library files as well.  The TSSV
library format is much less suitable for the generation of allele names,
which makes FDSTools libraries the recommended choice.  The libconvert
tool can be used to create a compatible TSSV library.

In FDSTools, sequences of STR alleles are split up into three parts: a
prefix, the STR, and a suffix.  The prefix and suffix are optional and
are meant to fill the gap between the STR and the primer binding sites.
The primer binding sites are called 'flanks' in the library file.

18
19
20
21
22
23
For non-STR markers, FDSTools library files simply contain the reference
sequence of the region between the flanks.  All markers in TSSV library
files are assumed to be STR markers, but the libconvert tool will
include the non-STR markers on a best-effort basis when converting to
the TSSV format.

jhoogenboom's avatar
jhoogenboom committed
24
25
26
27
28
29
30
31
32
33
34
35
36
Allele names typically consist of an allele number compatible with those
obtained from Capillary Electrophoresis (CE), followed by the STR
sequence in a shortened form and any substitutions or other variants
that occur in the prefix and suffix.  The first prefix/suffix in the
library file is used as the reference sequence for calling variants.

Special alleles, such as the 'X' and 'Y' allele from the Amelogenin
gender test, may be given an explicit allele name by specifying an Alias
in the FDSTools library file.

Run libconvert without any arguments to obtain a default FDSTools
library to start with.  The default library contains commentary lines
that explain the use of each section in more detail.
jhoogenboom's avatar
jhoogenboom committed
37
38
39
40
"""
import argparse
import sys
import re
41
import os.path
jhoogenboom's avatar
jhoogenboom committed
42
43
44

from ..lib import parse_library
from ConfigParser import RawConfigParser
45
from StringIO import StringIO
jhoogenboom's avatar
jhoogenboom committed
46

47
__version__ = "1.0.0"
jhoogenboom's avatar
jhoogenboom committed
48
49


50
51
52
# If no input is given, convert the following to FDSTools format.
_DEFAULT_LIBRARY = "\t".join([
    "MyMarker",
53
54
55
    "CTGTTTCTGAGTTTCAAGTATGTCTGAG",
    "TTACATGCTCGTGCACCTTATGGAGGGG",
    "GT 0 4 AGGGGA 1 1 GTGA 0 5 GT 8 25"])
56

jhoogenboom's avatar
jhoogenboom committed
57
58
def convert_library(infile, outfile, aliases=False):
    pattern_reverse = re.compile("\(([ACGT]+)\)\{(\d+),(\d+)\}")
59
    library = parse_library(infile, stream=True)
jhoogenboom's avatar
jhoogenboom committed
60
61
62
63
64
65
66
67
68
69
70
    if "aliases" in library:
        # FDSTools -> TSSV
        markers = set()
        for marker in library["flanks"]:
            markers.add(marker)
        for marker in library["prefix"]:
            markers.add(marker)
        for marker in library["suffix"]:
            markers.add(marker)
        for marker in library["regex"]:
            markers.add(marker)
71
72
        for marker in library["nostr_reference"]:
            markers.add(marker)
jhoogenboom's avatar
jhoogenboom committed
73
74
75
76
77

        marker_aliases = {}
        for alias in library["aliases"]:
            marker = library["aliases"][alias]["marker"]
            markers.add(marker)
jhoogenboom's avatar
jhoogenboom committed
78
            try:
jhoogenboom's avatar
jhoogenboom committed
79
                marker_aliases[marker].append(alias)
jhoogenboom's avatar
jhoogenboom committed
80
81
            except KeyError:
                marker_aliases[marker] = [alias]
jhoogenboom's avatar
jhoogenboom committed
82
83

        for marker in sorted(markers):
jhoogenboom's avatar
jhoogenboom committed
84
            pattern = []
jhoogenboom's avatar
jhoogenboom committed
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
            if marker in library["aliases"] and not aliases:
                # Ignore this alias, it will be merged into its marker.
                continue
            if marker in library["aliases"] and aliases:
                # Output this alias as a separate marker.
                if marker in library["flanks"]:
                    flanks = library["flanks"][marker]
                elif library["aliases"][marker]["marker"] in library["flanks"]:
                    flanks = library["flanks"][
                        library["aliases"][marker]["marker"]]
                else:
                    continue  # Worthless, no flanks.
                if marker in library["regex"]:
                    pattern = pattern_reverse.findall(
                        library["regex"][marker].pattern)
            elif aliases or marker not in marker_aliases:
101
                # Normal marker, or separately from its aliases.
jhoogenboom's avatar
jhoogenboom committed
102
103
104
105
106
107
                if marker not in library["flanks"]:
                    continue  # Worthless, no flanks.
                flanks = library["flanks"][marker]
                if marker in library["regex"]:
                    pattern = pattern_reverse.findall(
                        library["regex"][marker].pattern)
108
109
                elif marker in library["nostr_reference"]:
                    pattern = [(library["nostr_reference"][marker], "1", "1")]
jhoogenboom's avatar
jhoogenboom committed
110
111
112
113
114
115
116
117
118
119
120
121
122
            else:
                # Merge marker with its aliases.
                flanks = False
                if marker in library["flanks"]:
                    flanks = library["flanks"][marker]
                else:
                    for alias in marker_aliases[marker]:
                        if alias in library["flanks"]:
                            flanks = library["flanks"][alias]
                            break
                if not flanks:
                    continue  # Worthless, no flanks.

123
124
                prefixes = []
                suffixes = []
jhoogenboom's avatar
jhoogenboom committed
125
                if marker in library["prefix"]:
126
                    prefixes += library["prefix"][marker]
jhoogenboom's avatar
jhoogenboom committed
127
                if marker in library["suffix"]:
128
                    suffixes += library["suffix"][marker]
jhoogenboom's avatar
jhoogenboom committed
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
                middle = []

                if marker in library["regex"]:
                    # This marker has a regex next to its aliases.
                    # Check if the aliases fit the regex without change.
                    unmatched = []
                    for alias in marker_aliases[marker]:
                        allele = []
                        if marker in library["prefix"]:
                            allele.append(library["prefix"][marker][0])
                        allele.append(library["aliases"][alias]["sequence"])
                        if marker in library["suffix"]:
                            allele.append(library["suffix"][marker][0])
                        allele = "".join(allele)
                        if library["regex"][marker].match(allele) is None:
                            unmatched.append(
                                library["aliases"][alias]["sequence"])

                    middle = pattern_reverse.findall(
                        library["regex"][marker].pattern)[len(prefixes):]
                    if len(suffixes):
                        middle = middle[:-len(suffixes)]
                    if unmatched:
152
153
154
                        middle = [(x, "0", "1") for x in unmatched] + \
                                 [(x[0], "0", x[2]) for x in middle]

155
156
157
                elif marker in library["nostr_reference"]:
                    middle = [(library["nostr_reference"][marker],
                        "0" if marker in marker_aliases else "1", "1")]
jhoogenboom's avatar
jhoogenboom committed
158
159
160
161
162

                # Add prefixes and suffixes of aliases.
                if marker in marker_aliases:
                    for alias in marker_aliases[marker]:
                        if alias in library["prefix"]:
163
                            prefixes += library["prefix"][alias]
jhoogenboom's avatar
jhoogenboom committed
164
                        if alias in library["suffix"]:
165
                            suffixes += library["suffix"][alias]
166
167
168
169
                        if marker not in library["regex"] and (
                                marker not in library["nostr_reference"] or
                                library["nostr_reference"][marker] !=
                                    library["aliases"][alias]["sequence"]):
jhoogenboom's avatar
jhoogenboom committed
170
171
172
173
174
175
                            middle.append((
                                library["aliases"][alias]["sequence"],
                                "0", "1"))

                # Final regex is prefixes + middle + suffixes.
                pattern = []
176
177
178
                for i in range(len(prefixes)):
                    if i == prefixes.index(prefixes[i]):
                        pattern.append((prefixes[i], "0", "1"))
jhoogenboom's avatar
jhoogenboom committed
179
                pattern += middle
180
181
182
                for i in range(len(suffixes)):
                    if i == suffixes.index(suffixes[i]):
                        pattern.append((suffixes[i], "0", "1"))
jhoogenboom's avatar
jhoogenboom committed
183

184
            outfile.write("%s\t%s\t%s\t%s\n" % (
jhoogenboom's avatar
jhoogenboom committed
185
                marker, flanks[0], flanks[1],
jhoogenboom's avatar
jhoogenboom committed
186
                " ".join("%s %s %s" % x for x in pattern)))
jhoogenboom's avatar
jhoogenboom committed
187
188
189

    else:
        # TSSV -> FDSTools
190
191
        outfile.write("; Lines beginning with a semicolon (;) are ignored by "
            "FDSTools.\n\n")
jhoogenboom's avatar
jhoogenboom committed
192
193
194
195
196
197
        ini = RawConfigParser(allow_no_value=True)
        ini.optionxform = str

        # Create sections.  Most of them will be empty but we will put
        # comments in them to explain how to use them.
        ini.add_section("aliases")
198
199
200
201
202
203
204
205
206
207
208
        ini.set("aliases",
                "; Specify three comma-separated values: marker name, "
                "sequence, and allele name.")
        ini.set("aliases",
                "; You may use the alias name to specify flanks, prefix, and "
                "suffix for this")
        ini.set("aliases",
                "; allele specifically. You cannot specify a repeat structure "
                "for an alias.")
        ini.set("aliases",
                ";MyAlias = MyMarker, AGCTAGC, MySpecialAlleleName")
jhoogenboom's avatar
jhoogenboom committed
209
        ini.add_section("flanks")
210
211
212
        ini.set("flanks",
                "; Specify two comma-separated values: left flank and right "
                "flank.")
jhoogenboom's avatar
jhoogenboom committed
213
        ini.add_section("prefix")
214
215
216
217
218
219
220
221
222
223
224
225
        ini.set("prefix",
                "; Specify all known prefix sequences separated by commas. "
                "The first sequence")
        ini.set("prefix",
                "; listed is used as the reference sequence when generating "
                "allele names. The")
        ini.set("prefix",
                "; prefix is the sequence between the left flank and the "
                "repeat and is omitted")
        ini.set("prefix",
                "; from allele names. Deviations from the reference are "
                "expressed as variants.")
jhoogenboom's avatar
jhoogenboom committed
226
        ini.add_section("suffix")
227
228
229
230
231
232
233
234
235
        ini.set("suffix",
                "; Specify all known suffix sequences separated by commas. "
                "The first sequence")
        ini.set("suffix",
                "; listed is used as the reference sequence when generating "
                "allele names. The")
        ini.set("suffix",
                "; suffix is the sequence between the repeat and the right "
                "flank.")
jhoogenboom's avatar
jhoogenboom committed
236
        ini.add_section("repeat")
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
        ini.set("repeat",
                "; Specify the STR repeat structure in space-separated "
                "triples of sequence,")
        ini.set("repeat",
                "; minimum number of repeats, and maximum number of repeats.")
        ini.add_section("no_repeat")
        ini.set("no_repeat",
                "; Specify the reference sequence for non-STR markers.")
        ini.set("no_repeat",
                ";MySNPMarker = TTTTAACACAAAAAATTTAAAATAAGAAGAATAAATAGTGCTTGCTTT")
        ini.set("no_repeat",
                ";MyMtMarker  = AACCCCCCCT")
        ini.add_section("genome_position")
        ini.set("genome_position",
                "; Specify the chromosome number and position of the first "
                "base after the first")
        ini.set("genome_position",
254
255
                "; flank of each marker. Optionally, you may specify the "
                "position of the last")
256
        ini.set("genome_position",
257
258
                "; base of the fragment as well. Specify 'M' as the "
                "chromosome name for markers")
259
        ini.set("genome_position",
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
                "; on mitochondrial DNA. Allele names generated for these "
                "markers will follow")
        ini.set("genome_position",
                "; mtDNA nomenclature guidelines. If one of your mtDNA "
                "fragments starts near the")
        ini.set("genome_position",
                "; end of the reference sequence and continues at the "
                "beginning, you can obtain")
        ini.set("genome_position",
                "; correct base numbering by specifying the fragment's genome "
                "position as")
        ini.set("genome_position",
                "; \"M, (starting position), 16569, 1, (ending position)\". "
                "This tells FDSTools")
        ini.set("genome_position",
                "; that the marker is a concatenation of two fragments, where "
                "the first fragment")
        ini.set("genome_position",
                "; ends at position 16569 and the second fragment starts at "
                "position 1.")
280
281
        ini.set("genome_position",
                "; Using human genome build GRCh38 and rCRS for human mtDNA.")
282
283
284
285
286
287
        ini.set("genome_position",
                ";MyMarker    = 9, 36834400")
        ini.set("genome_position",
                ";MySNPMarker = X, 21214600")
        ini.set("genome_position",
                ";MyMtMarker  = M, 301")
jhoogenboom's avatar
jhoogenboom committed
288
        ini.add_section("length_adjust")
289
290
291
292
293
294
295
296
        ini.set("length_adjust",
                "; When generating allele names for STR alleles, the CE "
                "allele number is based")
        ini.set("length_adjust",
                "; on the length of the sequence (prefix+repeat+suffix) minus "
                "the adjustment")
        ini.set("length_adjust",
                "; specified here.")
jhoogenboom's avatar
jhoogenboom committed
297
        ini.add_section("block_length")
298
        ini.set("block_length",
299
                "; Specify the repeat unit length of each STR marker. The "
300
                "default length is 4.")
301
        ini.add_section("max_expected_copies")
302
303
304
305
306
307
308
309
310
311
312
        ini.set("max_expected_copies",
                "; Specify the maximum expected number of copies (i.e., "
                "alleles) for each")
        ini.set("max_expected_copies",
                "; marker in a single reference sample (only used for "
                "allelefinder). The default")
        ini.set("max_expected_copies",
                "; is 2. Specify 1 here for haploid markers (i.e., those on "
                "mitochondrial DNA or")
        ini.set("max_expected_copies",
                "; on the Y chromosome).")
313
314
315
316
317
318
319
320
321
322
        ini.add_section("expected_allele_length")
        ini.set("expected_allele_length",
                "; Specify one or two values for each marker. The first value "
                "gives the expected")
        ini.set("expected_allele_length",
                "; minimum length of the alleles and the second value (if "
                "given) specifies the")
        ini.set("expected_allele_length",
                "; maximum allele length expected for this marker (both "
                "inclusive).")
jhoogenboom's avatar
jhoogenboom committed
323
324
325
326
327
328
329
330

        # Enter flanking sequences and STR definitions.
        fmt = "%%-%is" % reduce(max, map(len,
            set(library["flanks"].keys() + library["regex"].keys())), 0)
        for marker in sorted(library["flanks"]):
            ini.set("flanks", fmt%marker, ", ".join(library["flanks"][marker]))
        for marker in sorted(library["regex"]):
            blocks = pattern_reverse.findall(library["regex"][marker].pattern)
jhoogenboom's avatar
jhoogenboom committed
331
332
            ini.set("repeat", fmt % marker,
                    " ".join("%s %s %s" % x for x in blocks))
jhoogenboom's avatar
jhoogenboom committed
333
334
335
336
337
338
339
340
341
342
343
344
345
346

            # Try to infer block length from the regular expression.
            length_counts = {0: 0}
            for block in blocks:
                amount = (int(block[1])+int(block[2]))/2.
                if len(block[0]) not in length_counts:
                    length_counts[len(block[0])] = amount
                else:
                    length_counts[len(block[0])] += amount
            block_length = sorted(
                length_counts, key=lambda x: -length_counts[x])[0]
            if block_length != 0 and block_length < 10:
                ini.set("block_length", fmt%marker, block_length)

347
348
349
            # Write max_expected_copies=2 for all markers explicitly.
            ini.set("max_expected_copies", fmt%marker, 2)

jhoogenboom's avatar
jhoogenboom committed
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
            # TODO: I could also do some fiddling for prefix/suffix...

        # Write INI file.
        ini.write(outfile)
#convert_library


def add_arguments(parser):
    parser.add_argument('infile', nargs='?', metavar="IN", default=sys.stdin,
        help="input library file, the format is automatically detected "
             "(default: read from stdin)")
    parser.add_argument('outfile', nargs='?', metavar="OUT",
        default=sys.stdout, type=argparse.FileType('w'),
        help="the file to write the output to (default: write to stdout)")
    parser.add_argument('-a', '--aliases', action="store_true",
        help="if specified, aliases in FDSTools libraries are converted to "
             "separate markers in the output library; otherwise, they are "
             "merged into their respective markers")
#add_arguments


def run(args):
372
373
374
375
376
377
378
379
380
381
    if (args.infile != sys.stdin and args.outfile == sys.stdout
            and not os.path.exists(args.infile)):
        # One filename given, and it does not exist.  Assume outfile.
        args.outfile = open(args.infile, 'w')
        args.infile = sys.stdin

    if args.infile != sys.stdin:
        # Open the specified input file.
        args.infile = open(args.infile, 'r')
    elif args.infile.isatty():
382
383
        # No input given.  Produce a default FDSTools library.
        args.infile = StringIO(_DEFAULT_LIBRARY)
384

jhoogenboom's avatar
jhoogenboom committed
385
386
    convert_library(args.infile, args.outfile, args.aliases)
#run