libconvert.py 17.1 KB
Newer Older
jhoogenboom's avatar
jhoogenboom committed
1 2
#!/usr/bin/env python
"""
3 4
Convert between TSSV (tab-separated) and FDSTools (ini-style) library
formats.  When no input is given, an empty FDSTools library is produced.
jhoogenboom's avatar
jhoogenboom committed
5 6 7 8 9 10 11 12 13 14 15 16 17

FDSTools uses library files to convert sequences to allele names and
vice versa.  Because FDSTools was made to work with the output of TSSV,
it has been made compatible with TSSV's library files as well.  The TSSV
library format is much less suitable for the generation of allele names,
which makes FDSTools libraries the recommended choice.  The libconvert
tool can be used to create a compatible TSSV library.

In FDSTools, sequences of STR alleles are split up into three parts: a
prefix, the STR, and a suffix.  The prefix and suffix are optional and
are meant to fill the gap between the STR and the primer binding sites.
The primer binding sites are called 'flanks' in the library file.

18 19 20 21 22 23
For non-STR markers, FDSTools library files simply contain the reference
sequence of the region between the flanks.  All markers in TSSV library
files are assumed to be STR markers, but the libconvert tool will
include the non-STR markers on a best-effort basis when converting to
the TSSV format.

jhoogenboom's avatar
jhoogenboom committed
24 25 26 27 28 29 30 31 32 33 34 35 36
Allele names typically consist of an allele number compatible with those
obtained from Capillary Electrophoresis (CE), followed by the STR
sequence in a shortened form and any substitutions or other variants
that occur in the prefix and suffix.  The first prefix/suffix in the
library file is used as the reference sequence for calling variants.

Special alleles, such as the 'X' and 'Y' allele from the Amelogenin
gender test, may be given an explicit allele name by specifying an Alias
in the FDSTools library file.

Run libconvert without any arguments to obtain a default FDSTools
library to start with.  The default library contains commentary lines
that explain the use of each section in more detail.
jhoogenboom's avatar
jhoogenboom committed
37 38 39 40
"""
import argparse
import sys
import re
41
import os.path
jhoogenboom's avatar
jhoogenboom committed
42 43 44

from ..lib import parse_library
from ConfigParser import RawConfigParser
45
from StringIO import StringIO
jhoogenboom's avatar
jhoogenboom committed
46 47 48 49

__version__ = "0.1dev"


50 51 52
# If no input is given, convert the following to FDSTools format.
_DEFAULT_LIBRARY = "\t".join([
    "MyMarker",
53 54 55
    "CTGTTTCTGAGTTTCAAGTATGTCTGAG",
    "TTACATGCTCGTGCACCTTATGGAGGGG",
    "GT 0 4 AGGGGA 1 1 GTGA 0 5 GT 8 25"])
56

jhoogenboom's avatar
jhoogenboom committed
57 58
def convert_library(infile, outfile, aliases=False):
    pattern_reverse = re.compile("\(([ACGT]+)\)\{(\d+),(\d+)\}")
59
    library = parse_library(infile, stream=True)
jhoogenboom's avatar
jhoogenboom committed
60 61 62 63 64 65 66 67 68 69 70
    if "aliases" in library:
        # FDSTools -> TSSV
        markers = set()
        for marker in library["flanks"]:
            markers.add(marker)
        for marker in library["prefix"]:
            markers.add(marker)
        for marker in library["suffix"]:
            markers.add(marker)
        for marker in library["regex"]:
            markers.add(marker)
71 72
        for marker in library["nostr_reference"]:
            markers.add(marker)
jhoogenboom's avatar
jhoogenboom committed
73 74 75 76 77

        marker_aliases = {}
        for alias in library["aliases"]:
            marker = library["aliases"][alias]["marker"]
            markers.add(marker)
jhoogenboom's avatar
jhoogenboom committed
78
            try:
jhoogenboom's avatar
jhoogenboom committed
79
                marker_aliases[marker].append(alias)
jhoogenboom's avatar
jhoogenboom committed
80 81
            except KeyError:
                marker_aliases[marker] = [alias]
jhoogenboom's avatar
jhoogenboom committed
82 83

        for marker in sorted(markers):
jhoogenboom's avatar
jhoogenboom committed
84
            pattern = []
jhoogenboom's avatar
jhoogenboom committed
85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
            if marker in library["aliases"] and not aliases:
                # Ignore this alias, it will be merged into its marker.
                continue
            if marker in library["aliases"] and aliases:
                # Output this alias as a separate marker.
                if marker in library["flanks"]:
                    flanks = library["flanks"][marker]
                elif library["aliases"][marker]["marker"] in library["flanks"]:
                    flanks = library["flanks"][
                        library["aliases"][marker]["marker"]]
                else:
                    continue  # Worthless, no flanks.
                if marker in library["regex"]:
                    pattern = pattern_reverse.findall(
                        library["regex"][marker].pattern)
            elif aliases or marker not in marker_aliases:
101
                # Normal marker, or separately from its aliases.
jhoogenboom's avatar
jhoogenboom committed
102 103 104 105 106 107
                if marker not in library["flanks"]:
                    continue  # Worthless, no flanks.
                flanks = library["flanks"][marker]
                if marker in library["regex"]:
                    pattern = pattern_reverse.findall(
                        library["regex"][marker].pattern)
108 109
                elif marker in library["nostr_reference"]:
                    pattern = [(library["nostr_reference"][marker], "1", "1")]
jhoogenboom's avatar
jhoogenboom committed
110 111 112 113 114 115 116 117 118 119 120 121 122
            else:
                # Merge marker with its aliases.
                flanks = False
                if marker in library["flanks"]:
                    flanks = library["flanks"][marker]
                else:
                    for alias in marker_aliases[marker]:
                        if alias in library["flanks"]:
                            flanks = library["flanks"][alias]
                            break
                if not flanks:
                    continue  # Worthless, no flanks.

123 124
                prefixes = []
                suffixes = []
jhoogenboom's avatar
jhoogenboom committed
125
                if marker in library["prefix"]:
126
                    prefixes += library["prefix"][marker]
jhoogenboom's avatar
jhoogenboom committed
127
                if marker in library["suffix"]:
128
                    suffixes += library["suffix"][marker]
jhoogenboom's avatar
jhoogenboom committed
129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151
                middle = []

                if marker in library["regex"]:
                    # This marker has a regex next to its aliases.
                    # Check if the aliases fit the regex without change.
                    unmatched = []
                    for alias in marker_aliases[marker]:
                        allele = []
                        if marker in library["prefix"]:
                            allele.append(library["prefix"][marker][0])
                        allele.append(library["aliases"][alias]["sequence"])
                        if marker in library["suffix"]:
                            allele.append(library["suffix"][marker][0])
                        allele = "".join(allele)
                        if library["regex"][marker].match(allele) is None:
                            unmatched.append(
                                library["aliases"][alias]["sequence"])

                    middle = pattern_reverse.findall(
                        library["regex"][marker].pattern)[len(prefixes):]
                    if len(suffixes):
                        middle = middle[:-len(suffixes)]
                    if unmatched:
152 153 154
                        middle = [(x, "0", "1") for x in unmatched] + \
                                 [(x[0], "0", x[2]) for x in middle]

155 156 157
                elif marker in library["nostr_reference"]:
                    middle = [(library["nostr_reference"][marker],
                        "0" if marker in marker_aliases else "1", "1")]
jhoogenboom's avatar
jhoogenboom committed
158 159 160 161 162

                # Add prefixes and suffixes of aliases.
                if marker in marker_aliases:
                    for alias in marker_aliases[marker]:
                        if alias in library["prefix"]:
163
                            prefixes += library["prefix"][alias]
jhoogenboom's avatar
jhoogenboom committed
164
                        if alias in library["suffix"]:
165
                            suffixes += library["suffix"][alias]
166 167 168 169
                        if marker not in library["regex"] and (
                                marker not in library["nostr_reference"] or
                                library["nostr_reference"][marker] !=
                                    library["aliases"][alias]["sequence"]):
jhoogenboom's avatar
jhoogenboom committed
170 171 172 173 174 175
                            middle.append((
                                library["aliases"][alias]["sequence"],
                                "0", "1"))

                # Final regex is prefixes + middle + suffixes.
                pattern = []
176 177 178
                for i in range(len(prefixes)):
                    if i == prefixes.index(prefixes[i]):
                        pattern.append((prefixes[i], "0", "1"))
jhoogenboom's avatar
jhoogenboom committed
179
                pattern += middle
180 181 182
                for i in range(len(suffixes)):
                    if i == suffixes.index(suffixes[i]):
                        pattern.append((suffixes[i], "0", "1"))
jhoogenboom's avatar
jhoogenboom committed
183

184
            outfile.write("%s\t%s\t%s\t%s\n" % (
jhoogenboom's avatar
jhoogenboom committed
185
                marker, flanks[0], flanks[1],
jhoogenboom's avatar
jhoogenboom committed
186
                " ".join("%s %s %s" % x for x in pattern)))
jhoogenboom's avatar
jhoogenboom committed
187 188 189

    else:
        # TSSV -> FDSTools
190 191
        outfile.write("; Lines beginning with a semicolon (;) are ignored by "
            "FDSTools.\n\n")
jhoogenboom's avatar
jhoogenboom committed
192 193 194 195 196 197
        ini = RawConfigParser(allow_no_value=True)
        ini.optionxform = str

        # Create sections.  Most of them will be empty but we will put
        # comments in them to explain how to use them.
        ini.add_section("aliases")
198 199 200 201 202 203 204 205 206 207 208
        ini.set("aliases",
                "; Specify three comma-separated values: marker name, "
                "sequence, and allele name.")
        ini.set("aliases",
                "; You may use the alias name to specify flanks, prefix, and "
                "suffix for this")
        ini.set("aliases",
                "; allele specifically. You cannot specify a repeat structure "
                "for an alias.")
        ini.set("aliases",
                ";MyAlias = MyMarker, AGCTAGC, MySpecialAlleleName")
jhoogenboom's avatar
jhoogenboom committed
209
        ini.add_section("flanks")
210 211 212
        ini.set("flanks",
                "; Specify two comma-separated values: left flank and right "
                "flank.")
jhoogenboom's avatar
jhoogenboom committed
213
        ini.add_section("prefix")
214 215 216 217 218 219 220 221 222 223 224 225
        ini.set("prefix",
                "; Specify all known prefix sequences separated by commas. "
                "The first sequence")
        ini.set("prefix",
                "; listed is used as the reference sequence when generating "
                "allele names. The")
        ini.set("prefix",
                "; prefix is the sequence between the left flank and the "
                "repeat and is omitted")
        ini.set("prefix",
                "; from allele names. Deviations from the reference are "
                "expressed as variants.")
jhoogenboom's avatar
jhoogenboom committed
226
        ini.add_section("suffix")
227 228 229 230 231 232 233 234 235
        ini.set("suffix",
                "; Specify all known suffix sequences separated by commas. "
                "The first sequence")
        ini.set("suffix",
                "; listed is used as the reference sequence when generating "
                "allele names. The")
        ini.set("suffix",
                "; suffix is the sequence between the repeat and the right "
                "flank.")
jhoogenboom's avatar
jhoogenboom committed
236
        ini.add_section("repeat")
237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253
        ini.set("repeat",
                "; Specify the STR repeat structure in space-separated "
                "triples of sequence,")
        ini.set("repeat",
                "; minimum number of repeats, and maximum number of repeats.")
        ini.add_section("no_repeat")
        ini.set("no_repeat",
                "; Specify the reference sequence for non-STR markers.")
        ini.set("no_repeat",
                ";MySNPMarker = TTTTAACACAAAAAATTTAAAATAAGAAGAATAAATAGTGCTTGCTTT")
        ini.set("no_repeat",
                ";MyMtMarker  = AACCCCCCCT")
        ini.add_section("genome_position")
        ini.set("genome_position",
                "; Specify the chromosome number and position of the first "
                "base after the first")
        ini.set("genome_position",
254 255
                "; flank of each marker. Optionally, you may specify the "
                "position of the last")
256
        ini.set("genome_position",
257 258
                "; base of the fragment as well. Specify 'M' as the "
                "chromosome name for markers")
259
        ini.set("genome_position",
260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279
                "; on mitochondrial DNA. Allele names generated for these "
                "markers will follow")
        ini.set("genome_position",
                "; mtDNA nomenclature guidelines. If one of your mtDNA "
                "fragments starts near the")
        ini.set("genome_position",
                "; end of the reference sequence and continues at the "
                "beginning, you can obtain")
        ini.set("genome_position",
                "; correct base numbering by specifying the fragment's genome "
                "position as")
        ini.set("genome_position",
                "; \"M, (starting position), 16569, 1, (ending position)\". "
                "This tells FDSTools")
        ini.set("genome_position",
                "; that the marker is a concatenation of two fragments, where "
                "the first fragment")
        ini.set("genome_position",
                "; ends at position 16569 and the second fragment starts at "
                "position 1.")
280 281
        ini.set("genome_position",
                "; Using human genome build GRCh38 and rCRS for human mtDNA.")
282 283 284 285 286 287
        ini.set("genome_position",
                ";MyMarker    = 9, 36834400")
        ini.set("genome_position",
                ";MySNPMarker = X, 21214600")
        ini.set("genome_position",
                ";MyMtMarker  = M, 301")
jhoogenboom's avatar
jhoogenboom committed
288
        ini.add_section("length_adjust")
289 290 291 292 293 294 295 296
        ini.set("length_adjust",
                "; When generating allele names for STR alleles, the CE "
                "allele number is based")
        ini.set("length_adjust",
                "; on the length of the sequence (prefix+repeat+suffix) minus "
                "the adjustment")
        ini.set("length_adjust",
                "; specified here.")
jhoogenboom's avatar
jhoogenboom committed
297
        ini.add_section("block_length")
298
        ini.set("block_length",
299
                "; Specify the repeat unit length of each STR marker. The "
300
                "default length is 4.")
301
        ini.add_section("max_expected_copies")
302 303 304 305 306 307 308 309 310 311 312
        ini.set("max_expected_copies",
                "; Specify the maximum expected number of copies (i.e., "
                "alleles) for each")
        ini.set("max_expected_copies",
                "; marker in a single reference sample (only used for "
                "allelefinder). The default")
        ini.set("max_expected_copies",
                "; is 2. Specify 1 here for haploid markers (i.e., those on "
                "mitochondrial DNA or")
        ini.set("max_expected_copies",
                "; on the Y chromosome).")
313 314 315 316 317 318 319 320 321 322
        ini.add_section("expected_allele_length")
        ini.set("expected_allele_length",
                "; Specify one or two values for each marker. The first value "
                "gives the expected")
        ini.set("expected_allele_length",
                "; minimum length of the alleles and the second value (if "
                "given) specifies the")
        ini.set("expected_allele_length",
                "; maximum allele length expected for this marker (both "
                "inclusive).")
jhoogenboom's avatar
jhoogenboom committed
323 324 325 326 327 328 329 330

        # Enter flanking sequences and STR definitions.
        fmt = "%%-%is" % reduce(max, map(len,
            set(library["flanks"].keys() + library["regex"].keys())), 0)
        for marker in sorted(library["flanks"]):
            ini.set("flanks", fmt%marker, ", ".join(library["flanks"][marker]))
        for marker in sorted(library["regex"]):
            blocks = pattern_reverse.findall(library["regex"][marker].pattern)
jhoogenboom's avatar
jhoogenboom committed
331 332
            ini.set("repeat", fmt % marker,
                    " ".join("%s %s %s" % x for x in blocks))
jhoogenboom's avatar
jhoogenboom committed
333 334 335 336 337 338 339 340 341 342 343 344 345 346

            # Try to infer block length from the regular expression.
            length_counts = {0: 0}
            for block in blocks:
                amount = (int(block[1])+int(block[2]))/2.
                if len(block[0]) not in length_counts:
                    length_counts[len(block[0])] = amount
                else:
                    length_counts[len(block[0])] += amount
            block_length = sorted(
                length_counts, key=lambda x: -length_counts[x])[0]
            if block_length != 0 and block_length < 10:
                ini.set("block_length", fmt%marker, block_length)

347 348 349
            # Write max_expected_copies=2 for all markers explicitly.
            ini.set("max_expected_copies", fmt%marker, 2)

jhoogenboom's avatar
jhoogenboom committed
350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371
            # TODO: I could also do some fiddling for prefix/suffix...

        # Write INI file.
        ini.write(outfile)
#convert_library


def add_arguments(parser):
    parser.add_argument('infile', nargs='?', metavar="IN", default=sys.stdin,
        help="input library file, the format is automatically detected "
             "(default: read from stdin)")
    parser.add_argument('outfile', nargs='?', metavar="OUT",
        default=sys.stdout, type=argparse.FileType('w'),
        help="the file to write the output to (default: write to stdout)")
    parser.add_argument('-a', '--aliases', action="store_true",
        help="if specified, aliases in FDSTools libraries are converted to "
             "separate markers in the output library; otherwise, they are "
             "merged into their respective markers")
#add_arguments


def run(args):
372 373 374 375 376 377 378 379 380 381
    if (args.infile != sys.stdin and args.outfile == sys.stdout
            and not os.path.exists(args.infile)):
        # One filename given, and it does not exist.  Assume outfile.
        args.outfile = open(args.infile, 'w')
        args.infile = sys.stdin

    if args.infile != sys.stdin:
        # Open the specified input file.
        args.infile = open(args.infile, 'r')
    elif args.infile.isatty():
382 383
        # No input given.  Produce a default FDSTools library.
        args.infile = StringIO(_DEFAULT_LIBRARY)
384

jhoogenboom's avatar
jhoogenboom committed
385 386
    convert_library(args.infile, args.outfile, args.aliases)
#run