Commit 160594c5 authored by jhoogenboom's avatar jhoogenboom
Browse files

Laying foundations

* Introducing a new, extended library file format to support
  allele name generation.  The new libconvert tool can convert
  TSSV libraries to the new format and vice versa.
* Added functions for converting between raw sequences, TSSV-style
  sequences, and allele names.
* Added global -d/--debug option.

Stuttermark updates:
* Stuttermark now automatically converts input sequences to
  TSSV-style if a library is provided.
* Stuttermark will no longer crash if there is no 'name' column.
  Instead, all sequences are taken to belong to the same marker.

New tools:
* libconvert converts between FDSTools and TSSV library formats.
* seqconvert converts between raw sequences, TSSV-style sequences,
  and allele names.
* allelefinder detects the true alleles in reference samples.
parent 830aaf82
...@@ -29,6 +29,10 @@ Alternatively, FDSTools can be installed by running: ...@@ -29,6 +29,10 @@ Alternatively, FDSTools can be installed by running:
FDSTools Changelog FDSTools Changelog
------------------ ------------------
v0.0.2
- Added global -d/--debug switch
- Includes Stuttermark v1.4
v0.0.1 v0.0.1
- Initial version - Initial version
- Includes Stuttermark v1.3 - Includes Stuttermark v1.3
...@@ -41,13 +45,13 @@ Mark potential stutter products by assuming a fixed maximum percentage of ...@@ -41,13 +45,13 @@ Mark potential stutter products by assuming a fixed maximum percentage of
stutter product vs the parent allele. stutter product vs the parent allele.
Input Input
Tab-separated file with at least these three columns: Tab-separated file with the following columns:
- 'name': the name of the marker - 'allele': the allele name, as a TSSV_-style sequence, e.g.,
- 'allele': the allele name, as a TSSV-style sequence, e.g., "``AGAT(12)TGAT(4)``" (required)
"``AGAT(12)TGAT(4)``" - 'total': the total number of reads (required)
- 'total': the total number of reads - 'name': the name of the marker (optional)
This format is compatible with 'knownalleles.csv' files created by TSSV. This format is compatible with 'knownalleles.csv' files created by TSSV_.
Output Output
The same file, with an additional column (named 'annotation' by default). The same file, with an additional column (named 'annotation' by default).
...@@ -70,12 +74,18 @@ Output ...@@ -70,12 +74,18 @@ Output
Changelog Changelog
~~~~~~~~~ ~~~~~~~~~
v1.4
- Stuttermark now accepts raw sequences and allele names as input, which
are automatically rewritten as TSSV-style sequences using a specified
library file
- The 'name' column is now optional
v1.3 v1.3
- First version of Stuttermark to be included in ``fdstools`` - First version of Stuttermark to be included in ``fdstools``
- Fixed crash that occurred when an empty allele (e.g., a primer dimer) - Fixed crash that occurred when an empty allele (e.g., a primer dimer)
was encountered was encountered
- Stuttermark now prints a warning if an allele is encountered that is - Stuttermark now prints a warning if an allele is encountered that is
not a TSSV-style sequence not a TSSV_-style sequence
v1.2 v1.2
- All settings are now available from the command line - All settings are now available from the command line
...@@ -89,3 +99,22 @@ v1.0 ...@@ -89,3 +99,22 @@ v1.0
- Initial version - Initial version
Libconvert
----------
Convert between TSSV (tab-separated) and FDSTools (ini-style) library formats.
Seqconvert
----------
Convert between raw sequences, TSSV-style sequences, and allele names.
Allelefinder
------------
Find true alleles in a single-person reference sample.
.. _TSSV: https://pypi.python.org/pypi/tssv/
...@@ -3,7 +3,7 @@ Tools for characterisation and filtering of PCR stutter artefacts and other ...@@ -3,7 +3,7 @@ Tools for characterisation and filtering of PCR stutter artefacts and other
systemic noise in Next Generation Sequencing data of forensic STR markers. systemic noise in Next Generation Sequencing data of forensic STR markers.
""" """
__version_info__ = ('0', '0', '1') __version_info__ = ('0', '0', '2')
__version__ = '.'.join(__version_info__) __version__ = '.'.join(__version_info__)
usage = __doc__.split("\n\n\n") usage = __doc__.split("\n\n\n")
......
...@@ -42,6 +42,8 @@ def main(): ...@@ -42,6 +42,8 @@ def main():
parser = argparse.ArgumentParser(add_help=False, description=usage[0], parser = argparse.ArgumentParser(add_help=False, description=usage[0],
formatter_class=argparse.ArgumentDefaultsHelpFormatter) formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.version = version(parser.prog) parser.version = version(parser.prog)
parser.add_argument('-d', "--debug", action="store_true",
help="if specified, debug output is printed to stdout")
parser.add_argument('-v', "--version", action=_VersionAction, parser.add_argument('-v', "--version", action=_VersionAction,
default=argparse.SUPPRESS, nargs=argparse.REMAINDER, default=argparse.SUPPRESS, nargs=argparse.REMAINDER,
help="show version number and exit") help="show version number and exit")
...@@ -66,11 +68,18 @@ def main(): ...@@ -66,11 +68,18 @@ def main():
__tools__[name] = subparser __tools__[name] = subparser
module.add_arguments(subparser) module.add_arguments(subparser)
subparser.set_defaults(func=module.run) subparser.set_defaults(func=module.run)
subparser.add_argument('-d', "--debug", action="store_true",
help="if specified, debug output is printed to stdout")
try: try:
args = parser.parse_args() args = parser.parse_args()
args.func(args) except Exception as error:
except OSError as error:
parser.error(error) parser.error(error)
try:
args.func(args)
except Exception as error:
if args.debug:
raise
__tools__[args.tool].error(error)
#main #main
......
This diff is collapsed.
#!/usr/bin/env python
"""
Find true alleles in a single-person reference sample.
"""
import argparse
import sys
from ..lib import get_column_ids, pos_int_arg
__version__ = "0.1dev"


# Default minimum percentage that a secondary sequence must have w.r.t. the
# highest sequence of its marker to be called as an allele
# (overridable with -m/--min-allele-pct).
_DEF_MIN_ALLELE_PCT = 30.0

# Default maximum percentage that the highest non-allelic sequence may have
# w.r.t. the highest allele before the marker is flagged "NOISY"
# (overridable with -M/--max-noise-pct).
_DEF_MAX_NOISE_PCT = 10.0

# Default maximum number of alleles reported per marker
# (overridable with -a/--max-alleles).
_DEF_MAX_ALLELES = 2
def find_alleles(infile, outfile, min_allele_pct, max_noise_pct,
                 stuttermark_column, max_alleles):
    """
    Read a tab-separated sample file and write a two-column
    "marker<TAB>allele" table of the apparent true alleles per marker.

    A sequence counts as an allele if its read count is at least
    min_allele_pct percent of the marker's highest read count.  If more
    than max_alleles sequences qualify, only the top max_alleles are
    kept.  If the highest non-allelic read count exceeds max_noise_pct
    percent of the highest allele, an extra "NOISY" row is written for
    that marker.

    :arg infile: open input file; first line is a tab-separated header
        containing at least 'total', 'allele', and 'name' columns.
    :arg outfile: open file to write the result table to.
    :arg min_allele_pct: float, allele-calling threshold (percent).
    :arg max_noise_pct: float, noise-warning threshold (percent).
    :arg stuttermark_column: name of a column with Stuttermark output,
        or None; if given, rows whose value does not start with
        "ALLELE" are skipped.
    :arg max_alleles: int, maximum number of alleles per marker.
    """
    # Get column numbers.
    column_names = infile.readline().rstrip("\r\n").split("\t")
    colid_total, colid_allele, colid_name = get_column_ids(column_names,
        "total", "allele", "name")

    # Also get the Stuttermark column if we have one.
    if stuttermark_column is not None:
        colid_stuttermark = get_column_ids(column_names, stuttermark_column)

    highest_noise = {}    # marker -> highest read count of non-allelic seqs
    highest_allele = {}   # marker -> read count of the highest allele
    alleles = {}          # marker -> {allele sequence: read count}
    for line in infile:
        line = line.rstrip("\r\n").split("\t")
        if (stuttermark_column is not None and
                not line[colid_stuttermark].startswith("ALLELE")):
            continue  # Skip sequences Stuttermark did not call ALLELE.
        marker = line[colid_name]
        allele = line[colid_allele]
        reads = int(line[colid_total])

        if marker not in alleles:
            # First sequence seen for this marker.
            alleles[marker] = {allele: reads}
            highest_allele[marker] = reads
            highest_noise[marker] = 0
        elif reads > highest_allele[marker]:
            # New highest allele!  Re-check the current alleles against
            # the raised threshold and demote those that fall below it.
            highest_allele[marker] = reads
            threshold = reads * (min_allele_pct / 100.)
            # BUGFIX: the original referenced an undefined 'marker_max'
            # here and deleted entries from the dict while iterating it;
            # iterate over a snapshot instead.
            for seq, count in list(alleles[marker].items()):
                if count < threshold:
                    if count > highest_noise[marker]:
                        highest_noise[marker] = count
                    del alleles[marker][seq]
            # BUGFIX: record the new highest sequence itself; the
            # original clobbered 'allele' in the demotion loop and never
            # stored the new top allele.
            alleles[marker][allele] = reads
        elif reads >= highest_allele[marker] * (min_allele_pct / 100.):
            # New secondary allele!
            alleles[marker][allele] = reads
        elif reads >= highest_noise[marker]:
            # New highest noise!
            highest_noise[marker] = reads

    outfile.write("\t".join(["marker", "allele"]) + "\n")
    for marker in alleles:
        if len(alleles[marker]) > max_alleles:
            # Too many alleles: keep the top max_alleles; the best of
            # the rest becomes this marker's highest noise.
            allele_order = sorted(alleles[marker],
                                  key=lambda x: -alleles[marker][x])
            highest_noise[marker] = alleles[marker][allele_order[max_alleles]]
            alleles[marker] = {x: alleles[marker][x]
                               for x in allele_order[:max_alleles]}
        for allele in alleles[marker]:
            outfile.write("\t".join([marker, allele]) + "\n")
        if highest_noise[marker] > highest_allele[marker]*(max_noise_pct/100.):
            outfile.write("\t".join([marker, "NOISY"]) + "\n")
#find_alleles
def add_arguments(parser):
    """Register allelefinder's command line arguments on *parser*."""
    # Positional input and output files default to the standard streams
    # so the tool can be used in a pipeline.
    parser.add_argument('infile', nargs='?', metavar="IN",
        type=argparse.FileType('r'), default=sys.stdin,
        help="the CSV data file to process (default: read from stdin)")
    parser.add_argument('outfile', nargs='?', metavar="OUT",
        type=argparse.FileType('w'), default=sys.stdout,
        help="the file to write the output to (default: write to stdout)")
    # Allele-calling thresholds.
    parser.add_argument('-m', '--min-allele-pct', metavar="N", type=float,
        default=_DEF_MIN_ALLELE_PCT,
        help="call heterozygous if the second allele is at least this "
             "percentage of the highest allele (default: %(default)s)")
    parser.add_argument('-M', '--max-noise-pct', metavar="N", type=float,
        default=_DEF_MAX_NOISE_PCT,
        help="output additional \"NOISY\" allele "
             "if the highest non-allelic sequence is at least this "
             "percentage of the highest allele (default: %(default)s)")
    parser.add_argument('-a', '--max-alleles', metavar="N", type=pos_int_arg,
        default=_DEF_MAX_ALLELES,
        help="allow no more than this number of "
             "alleles per marker (default: %(default)s)")
    # Optional Stuttermark integration.
    parser.add_argument('-c', '--stuttermark-column', metavar="COLNAME",
        default=None,
        help="name of column with Stuttermark output; if specified, sequences "
             "for which the value in this column does not start with ALLELE "
             "are ignored")
#add_arguments
def run(args):
    """Validate the parsed arguments and delegate to find_alleles()."""
    # Refuse to run with a terminal on both ends: the user almost
    # certainly forgot to supply or pipe in an input file.
    interactive_in = args.infile.isatty()
    interactive_out = args.outfile.isatty()
    if interactive_in and interactive_out:
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
    find_alleles(args.infile, args.outfile, args.min_allele_pct,
                 args.max_noise_pct, args.stuttermark_column,
                 args.max_alleles)
#run
def main():
    """Parse command line arguments and run the tool stand-alone."""
    parser = argparse.ArgumentParser(description=__doc__)
    try:
        add_arguments(parser)
        arguments = parser.parse_args()
        run(arguments)
    except OSError as error:
        # Report I/O problems through argparse's standard error path.
        parser.error(error)
#main


if __name__ == "__main__":
    main()
#!/usr/bin/env python
"""
Convert between TSSV (tab-separated) and FDSTools (ini-style) library formats.
"""
import argparse
import sys
import re
from ..lib import parse_library
from ConfigParser import RawConfigParser
__version__ = "0.1dev"
def convert_library(infile, outfile, aliases=False):
    """
    Convert a library file between the two supported formats.

    If the library parsed from infile contains an "aliases" section it
    is taken to be an FDSTools (ini-style) library and is written out
    in the TSSV tab-separated format; otherwise the conversion goes the
    other way (TSSV -> FDSTools ini).

    :arg infile: open library file to read; the format is detected by
        parse_library.
    :arg outfile: open file to write the converted library to.
    :arg aliases: if True, aliases in FDSTools libraries are emitted as
        separate markers; otherwise they are merged into their markers.
    """
    # Matches one "(SEQ){min,max}" unit of a TSSV regex, capturing the
    # sequence and the minimum/maximum repeat counts.
    pattern_reverse = re.compile("\(([ACGT]+)\)\{(\d+),(\d+)\}")
    library = parse_library(infile)
    if "aliases" in library:
        # FDSTools -> TSSV
        # Collect the names of all markers mentioned anywhere in the
        # library.
        markers = set()
        for marker in library["flanks"]:
            markers.add(marker)
        for marker in library["prefix"]:
            markers.add(marker)
        for marker in library["suffix"]:
            markers.add(marker)
        for marker in library["regex"]:
            markers.add(marker)

        # Map each marker name to the list of aliases pointing to it.
        marker_aliases = {}
        for alias in library["aliases"]:
            marker = library["aliases"][alias]["marker"]
            markers.add(marker)
            if marker not in marker_aliases:
                marker_aliases[marker] = [alias]
            else:
                marker_aliases[marker].append(alias)

        newline = ""
        for marker in sorted(markers):
            if marker in library["aliases"] and not aliases:
                # Ignore this alias, it will be merged into its marker.
                continue
            if marker in library["aliases"] and aliases:
                # Output this alias as a separate marker.  Use its own
                # flanks if present, else those of its target marker.
                if marker in library["flanks"]:
                    flanks = library["flanks"][marker]
                elif library["aliases"][marker]["marker"] in library["flanks"]:
                    flanks = library["flanks"][
                        library["aliases"][marker]["marker"]]
                else:
                    continue  # Worthless, no flanks.
                # NOTE(review): if this alias has no regex, 'pattern'
                # retains its value from a previous loop iteration (or
                # is unbound on the first) when written below -- confirm
                # this is intended.
                if marker in library["regex"]:
                    pattern = pattern_reverse.findall(
                        library["regex"][marker].pattern)
            elif aliases or marker not in marker_aliases:
                # Normal marker, or output separately from its aliases.
                if marker not in library["flanks"]:
                    continue  # Worthless, no flanks.
                flanks = library["flanks"][marker]
                if marker in library["regex"]:
                    pattern = pattern_reverse.findall(
                        library["regex"][marker].pattern)
            else:
                # Merge marker with its aliases.
                # Use the marker's own flanks, or the first alias's.
                flanks = False
                if marker in library["flanks"]:
                    flanks = library["flanks"][marker]
                else:
                    for alias in marker_aliases[marker]:
                        if alias in library["flanks"]:
                            flanks = library["flanks"][alias]
                            break
                if not flanks:
                    continue  # Worthless, no flanks.
                prefixes = set()
                suffixes = set()
                if marker in library["prefix"]:
                    prefixes.update(library["prefix"][marker])
                if marker in library["suffix"]:
                    suffixes.update(library["suffix"][marker])
                middle = []
                if marker in library["regex"]:
                    # This marker has a regex next to its aliases.
                    # Check if the aliases fit the regex without change.
                    unmatched = []
                    for alias in marker_aliases[marker]:
                        allele = []
                        if marker in library["prefix"]:
                            allele.append(library["prefix"][marker][0])
                        allele.append(library["aliases"][alias]["sequence"])
                        if marker in library["suffix"]:
                            allele.append(library["suffix"][marker][0])
                        allele = "".join(allele)
                        if library["regex"][marker].match(allele) is None:
                            unmatched.append(
                                library["aliases"][alias]["sequence"])
                    # Strip prefix/suffix units off the regex to keep
                    # only the repeat structure in the middle.
                    middle = pattern_reverse.findall(
                        library["regex"][marker].pattern)[len(prefixes):]
                    if len(suffixes):
                        middle = middle[:-len(suffixes)]
                    if unmatched:
                        # Make all repeats optional and append the
                        # non-matching alias sequences as alternatives.
                        # NOTE(review): list + list concatenation of
                        # map() results is Python 2 only.
                        middle = map(lambda x: (x[0], "0", x[2]), middle) + \
                                 map(lambda x: (x, "0", "1"), unmatched)
                # Add prefixes and suffixes of aliases.
                if marker in marker_aliases:
                    for alias in marker_aliases[marker]:
                        if alias in library["prefix"]:
                            prefixes.update(library["prefix"][alias])
                        if alias in library["suffix"]:
                            suffixes.update(library["suffix"][alias])
                        if marker not in library["regex"]:
                            middle.append((
                                library["aliases"][alias]["sequence"],
                                "0", "1"))
                # Final regex is prefixes + middle + suffixes.
                pattern = []
                for prefix in prefixes:
                    pattern.append((prefix, "0", "1"))
                pattern += middle
                for suffix in suffixes:
                    pattern.append((suffix, "0", "1"))
            # Emit one tab-separated line: marker name, left flank,
            # right flank, space-separated repeat triples.
            outfile.write(newline + "%s\t%s\t%s\t%s" % (
                marker, flanks[0], flanks[1],
                " ".join(map(lambda x: "%s %s %s" % x, pattern))))
            newline = "\n"
    else:
        # TSSV -> FDSTools
        ini = RawConfigParser(allow_no_value=True)
        ini.optionxform = str  # Preserve the case of marker names.
        # Create sections. Most of them will be empty but we will put
        # comments in them to explain how to use them.
        ini.add_section("aliases")
        ini.set("aliases", "; Specify three comma-separated values: marker "
                "name, sequence, and allele name.")
        ini.add_section("flanks")
        ini.set("flanks", "; Specify two comma-separated values: left flank "
                "and right flank.")
        ini.add_section("prefix")
        ini.set("prefix", "; Specify all possible prefix sequences separated "
                "by commas. The first sequence")
        ini.set("prefix", "; listed is used as the reference sequence when "
                "generating allele names.")
        ini.add_section("suffix")
        ini.set("suffix", "; Specify all possible suffix sequences separated "
                "by commas. The first sequence")
        ini.set("suffix", "; listed is used as the reference sequence when "
                "generating allele names.")
        ini.add_section("repeat")
        ini.set("repeat", "; Specify the STR repeat structure in "
                "space-separated triples of sequence,")
        ini.set("repeat", "; minimum number of repeats, and maximum number of "
                "repeats.")
        ini.add_section("length_adjust")
        ini.set("length_adjust", "; When generating allele names, the CE "
                "allele number is based on the length")
        ini.set("length_adjust", "; of the sequence (prefix+repeat+suffix) "
                "minus the adjustment specified here.")
        ini.add_section("block_length")
        ini.set("block_length", "; Specify the core repeat unit lengths. The "
                "default length is 4.")
        # Enter flanking sequences and STR definitions.
        # 'fmt' left-justifies marker names to the longest name's width.
        # NOTE(review): bare 'reduce' and list-concatenating dict.keys()
        # are Python 2 only.
        fmt = "%%-%is" % reduce(max, map(len,
            set(library["flanks"].keys() + library["regex"].keys())), 0)
        for marker in sorted(library["flanks"]):
            ini.set("flanks", fmt%marker, ", ".join(library["flanks"][marker]))
        for marker in sorted(library["regex"]):
            blocks = pattern_reverse.findall(library["regex"][marker].pattern)
            ini.set("repeat", fmt%marker, " ".join(map(
                lambda x: "%s %s %s" % x, blocks)))
            # Try to infer block length from the regular expression:
            # pick the unit length with the largest expected number of
            # repeats (midpoint of min and max counts).
            length_counts = {0: 0}
            for block in blocks:
                amount = (int(block[1])+int(block[2]))/2.
                if len(block[0]) not in length_counts:
                    length_counts[len(block[0])] = amount
                else:
                    length_counts[len(block[0])] += amount
            block_length = sorted(
                length_counts, key=lambda x: -length_counts[x])[0]
            if block_length != 0 and block_length < 10:
                ini.set("block_length", fmt%marker, block_length)
        # TODO: I could also do some fiddling for prefix/suffix...
        # Write INI file.
        ini.write(outfile)
#convert_library
def add_arguments(parser):
    """Register libconvert's command line arguments on *parser*."""
    # Positional input and output files default to the standard streams
    # so the tool can be used in a pipeline.
    parser.add_argument('infile', nargs='?', metavar="IN",
        type=argparse.FileType('r'), default=sys.stdin,
        help="input library file, the format is automatically detected "
             "(default: read from stdin)")
    parser.add_argument('outfile', nargs='?', metavar="OUT",
        type=argparse.FileType('w'), default=sys.stdout,
        help="the file to write the output to (default: write to stdout)")
    parser.add_argument('-a', '--aliases', action="store_true",
        help="if specified, aliases in FDSTools libraries are converted to "
             "separate markers in the output library; otherwise, they are "
             "merged into their respective markers")
#add_arguments
def run(args):
    """Validate the parsed arguments and delegate to convert_library()."""
    # Refuse to run with a terminal on both ends: the user almost
    # certainly forgot to supply or pipe in an input file.
    interactive_in = args.infile.isatty()
    interactive_out = args.outfile.isatty()
    if interactive_in and interactive_out:
        raise ValueError("please specify an input file, or pipe in the output "
                         "of another program")
    convert_library(args.infile, args.outfile, args.aliases)
#run
def main():
    """Parse command line arguments and run the tool stand-alone."""
    parser = argparse.ArgumentParser(description=__doc__)
    try:
        add_arguments(parser)
        arguments = parser.parse_args()
        run(arguments)
    except OSError as error:
        # Report I/O problems through argparse's standard error path.
        parser.error(error)
#main


if __name__ == "__main__":
    main()
#!/usr/bin/env python
"""
Convert between raw sequences, TSSV-style sequences, and allele names.
"""
import argparse
import sys
from ..lib import get_column_ids, ensure_sequence_format, parse_library
__version__ = "0.1dev"


# Default values for parameters are specified below.

# Default name of the column that contains the marker name.
# This value can be overridden by the -m command line option.
_DEF_COLNAME_MARKER = "name"

# Default name of the column that contains the allele.
# This value can be overridden by the -a command line option.
_DEF_COLNAME_ALLELE = "allele"

# Default name of the column to write the output to.
# This value can be overridden by the -o command line option.
_DEF_COLNAME_ALLELE_OUT = "allele"
def convert_sequences(infile, outfile, to_format, libfile=None,
                      fixed_marker=None, colname_marker=_DEF_COLNAME_MARKER,
                      colname_allele=_DEF_COLNAME_ALLELE,
                      colname_allele_out=_DEF_COLNAME_ALLELE_OUT):
    """
    Rewrite the allele column of a tab-separated file in another
    sequence format and write the result to outfile.

    :arg infile: open input file; the first line is a tab-separated
        header that must contain colname_allele.
    :arg outfile: open file to write the converted table to.
    :arg to_format: target format passed to ensure_sequence_format
        ('raw', 'tssv', or 'allelename').
    :arg libfile: open library file used for the conversion, or None.
    :arg fixed_marker: if given, this marker name is assumed for every
        row; otherwise the marker is read from colname_marker.
    :arg colname_marker: name of the marker-name column.
    :arg colname_allele: name of the allele column to convert.
    :arg colname_allele_out: name of the column to write the converted
        sequence to; appended to the table if not already present.
    """
    library = parse_library(libfile) if libfile is not None else None
    column_names = infile.readline().rstrip("\r\n").split("\t")
    colid_allele = get_column_ids(column_names, colname_allele)
    if library is None:
        fixed_marker = ""  # Don't need marker names without library.
    if fixed_marker is None:
        colid_marker = get_column_ids(column_names, colname_marker)
    try:
        colid_allele_out = get_column_ids(column_names, colname_allele_out)
    except Exception:
        # BUGFIX: was a bare 'except:', which would also swallow
        # SystemExit/KeyboardInterrupt.  The output column does not
        # exist yet; append it to the header and address it as -1.
        column_names.append(colname_allele_out)
        colid_allele_out = -1
    outfile.write("\t".join(column_names) + "\n")
    for line in infile:
        line = line.rstrip("\r\n").split("\t")
        if colid_allele_out == -1:
            # Grow each row to match the extended header.
            line.append("")
        marker = line[colid_marker] if fixed_marker is None else fixed_marker
        line[colid_allele_out] = ensure_sequence_format(
            line[colid_allele], to_format, marker=marker, library=library)
        outfile.write("\t".join(line) + "\n")
#convert_sequences
def add_arguments(parser):
    """Register seqconvert's command line arguments on *parser*."""
    parser.add_argument('format', metavar="FORMAT",
        help="the format to convert to: one of 'raw', 'tssv', or 'allelename'")
    # Positional input and output files default to the standard streams
    # so the tool can be used in a pipeline.
    parser.add_argument('infile', nargs='?', metavar="IN",
        type=argparse.FileType('r'), default=sys.stdin,
        help="the tab-separated data file to process (default: read from "
             "stdin)")
    parser.add_argument('outfile', nargs='?', metavar="OUT",
        type=argparse.FileType('w'), default=sys.stdout,
        help="the file to write the output to (default: write to stdout)")
    # Column-name overrides.
    parser.add_argument('-m', '--marker-column', metavar="COLNAME",
        default=_DEF_COLNAME_MARKER,
        help="name of the column that contains the marker name "
             "(default: '%(default)s')")
    parser.add_argument('-a', '--allele-column', metavar="COLNAME",
        default=_DEF_COLNAME_ALLELE,
        help="name of the column that contains the allele "
             "(default: '%(default)s')")
    parser.add_argument('-o', '--output-column', metavar="COLNAME",
        default=_DEF_COLNAME_ALLELE_OUT,
        help="name of the column to write the output to "
             "(default: '%(default)s')")
    parser.add_argument('-M', '--marker', metavar="MARKER",
        help="assume the specified marker for all sequences in the file")
    parser.add_argument('-l', '--library', metavar="LIBRARY",
        type=argparse.FileType('r'),
        help="library file for sequence format conversion")
#add_arguments
def run(args):
if args.infile.isatty() and args.outfile.isatty():
raise ValueError("please specify an input file, or pipe in the output "
"of another program")
convert_sequences(args.infile, args.outfile, args.format, args.library,
args.marker, args.marker_column, args.allele_column,