Commit 160594c5 authored by jhoogenboom

Laying foundations

* Introducing a new, extended library file format to support
  allele name generation.  The new libconvert tool can convert
  TSSV libraries to the new format and vice versa.
* Added functions for converting between raw sequences, TSSV-style
  sequences, and allele names.
* Added global -d/--debug option.

Stuttermark updates:
* Stuttermark now automatically converts input sequences to
  TSSV-style if a library is provided.
* Stuttermark will no longer crash if there is no 'name' column.
  Instead, all sequences are taken to belong to the same marker.

New tools:
* libconvert converts between FDSTools and TSSV library formats.
* seqconvert converts between raw sequences, TSSV-style sequences,
  and allele names.
* allelefinder detects the true alleles in reference samples.
parent 830aaf82
......@@ -29,6 +29,10 @@ Alternatively, FDSTools can be installed by running:
FDSTools Changelog
------------------
v0.0.2
- Added global -d/--debug switch
- Includes Stuttermark v1.4
v0.0.1
- Initial version
- Includes Stuttermark v1.3
......@@ -41,13 +45,13 @@ Mark potential stutter products by assuming a fixed maximum percentage of
stutter product vs the parent allele.
Input
Tab-separated file with at least these three columns:
- 'name': the name of the marker
- 'allele': the allele name, as a TSSV-style sequence, e.g.,
"``AGAT(12)TGAT(4)``"
- 'total': the total number of reads
Tab-separated file with the following columns:
- 'allele': the allele name, as a TSSV_-style sequence, e.g.,
"``AGAT(12)TGAT(4)``" (required)
- 'total': the total number of reads (required)
- 'name': the name of the marker (optional)
This format is compatible with 'knownalleles.csv' files created by TSSV.
This format is compatible with 'knownalleles.csv' files created by TSSV_.
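In a TSSV_-style sequence, each repeat block is written as its repeat unit followed by the number of repeats in parentheses. As a minimal sketch (the helper below is illustrative only and not part of FDSTools), such a sequence can be expanded to a raw sequence like this::

    import re

    def expand_tssv(sequence):
        # e.g., "AGAT(2)TGAT(1)" -> "AGATAGATTGAT"
        return "".join(unit * int(count) for unit, count
                       in re.findall(r"([ACGT]+)\((\d+)\)", sequence))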
Output
The same file, with an additional column (named 'annotation' by default).
......@@ -70,12 +74,18 @@ Output
Changelog
~~~~~~~~~
v1.4
- Stuttermark now accepts raw sequences and allele names as input, which
are automatically rewritten as TSSV-style sequences using a specified
library file
- The 'name' column is now optional
v1.3
- First version of Stuttermark to be included in ``fdstools``
- Fixed crash that occurred when an empty allele (e.g., a primer dimer)
was encountered
- Stuttermark now prints a warning if an allele is encountered that is
not a TSSV-style sequence
not a TSSV_-style sequence
v1.2
- All settings are now available from the command line
......@@ -89,3 +99,22 @@ v1.0
- Initial version
Libconvert
----------
Convert between TSSV (tab-separated) and FDSTools (ini-style) library formats.
Seqconvert
----------
Convert between raw sequences, TSSV-style sequences, and allele names.
Allelefinder
------------
Find true alleles in a single-person reference sample.
.. _TSSV: https://pypi.python.org/pypi/tssv/
......@@ -3,7 +3,7 @@ Tools for characterisation and filtering of PCR stutter artefacts and other
systemic noise in Next Generation Sequencing data of forensic STR markers.
"""
__version_info__ = ('0', '0', '1')
__version_info__ = ('0', '0', '2')
__version__ = '.'.join(__version_info__)
usage = __doc__.split("\n\n\n")
......
......@@ -42,6 +42,8 @@ def main():
parser = argparse.ArgumentParser(add_help=False, description=usage[0],
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.version = version(parser.prog)
parser.add_argument('-d', "--debug", action="store_true",
help="if specified, debug output is printed to stdout")
parser.add_argument('-v', "--version", action=_VersionAction,
default=argparse.SUPPRESS, nargs=argparse.REMAINDER,
help="show version number and exit")
......@@ -66,11 +68,18 @@ def main():
__tools__[name] = subparser
module.add_arguments(subparser)
subparser.set_defaults(func=module.run)
subparser.add_argument('-d', "--debug", action="store_true",
help="if specified, debug output is printed to stdout")
try:
args = parser.parse_args()
args.func(args)
except OSError as error:
except Exception as error:
parser.error(error)
try:
args.func(args)
except Exception as error:
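# With the -d/--debug switch, re-raise so the full traceback is printed;
# otherwise report the error through the subcommand's parser.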
if args.debug:
raise
__tools__[args.tool].error(error)
#main
......
#!/usr/bin/env python
"""
Find true alleles in a single-person reference sample.
"""
import argparse
import sys
from ..lib import get_column_ids, pos_int_arg
__version__ = "0.1dev"
_DEF_MIN_ALLELE_PCT = 30.0
_DEF_MAX_NOISE_PCT = 10.0
_DEF_MAX_ALLELES = 2
def find_alleles(infile, outfile, min_allele_pct, max_noise_pct,
stuttermark_column, max_alleles):
# Get column numbers.
column_names = infile.readline().rstrip("\r\n").split("\t")
colid_total, colid_allele, colid_name = get_column_ids(column_names,
"total", "allele", "name")
# Also get stuttermark column if we have one.
if stuttermark_column is not None:
colid_stuttermark = get_column_ids(column_names, stuttermark_column)
highest_noise = {}
highest_allele = {}
alleles = {}
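# The three dicts above hold per-marker state: highest_allele maps each
# marker to the highest read count seen so far, alleles maps each marker to
# {sequence: reads} for its candidate alleles, and highest_noise tracks the
# highest read count rejected as noise.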
for line in infile:
line = line.rstrip("\r\n").split("\t")
if (stuttermark_column is not None and
not line[colid_stuttermark].startswith("ALLELE")):
continue
marker = line[colid_name]
allele = line[colid_allele]
reads = int(line[colid_total])
if marker in alleles:
if reads > highest_allele[marker]:
# New highest allele!
highest_allele[marker] = reads
# Re-check the existing candidates against the new highest allele;
# iterate over a copy of the keys so entries can be deleted safely.
for other in alleles[marker].keys():
if (alleles[marker][other] <
highest_allele[marker] * (min_allele_pct/100.)):
if alleles[marker][other] > highest_noise[marker]:
highest_noise[marker] = alleles[marker][other]
del alleles[marker][other]
alleles[marker][allele] = reads
elif reads >= highest_allele[marker]*(min_allele_pct/100.):
# New secondary allele!
alleles[marker][allele] = reads
elif reads >= highest_noise[marker]:
# New highest noise!
highest_noise[marker] = reads
else:
alleles[marker] = {allele: reads}
highest_allele[marker] = reads
highest_noise[marker] = 0
outfile.write("\t".join(["marker", "allele"]) + "\n")
for marker in alleles:
if len(alleles[marker]) > max_alleles:
allele_order = sorted(alleles[marker],
key=lambda x: -alleles[marker][x])
highest_noise[marker] = alleles[marker][allele_order[max_alleles]]
alleles[marker] = {x: alleles[marker][x]
for x in allele_order[:max_alleles]}
for allele in alleles[marker]:
outfile.write("\t".join([marker, allele]) + "\n")
if highest_noise[marker] > highest_allele[marker]*(max_noise_pct/100.):
outfile.write("\t".join([marker, "NOISY"]) + "\n")
#find_alleles
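# A minimal usage sketch (Python 2, matching this package; the marker name and
# read counts below are purely illustrative):
#
#   from StringIO import StringIO
#   table = ("name\tallele\ttotal\n"
#            "M1\tAGAT(12)TGAT(4)\t1000\n"   # highest allele
#            "M1\tAGAT(11)TGAT(4)\t450\n"    # >= 30% of the highest: also called
#            "M1\tAGAT(10)TGAT(4)\t50\n")    # below 30%: counted as noise
#   out = StringIO()
#   find_alleles(StringIO(table), out, min_allele_pct=30.0, max_noise_pct=10.0,
#                stuttermark_column=None, max_alleles=2)
#   # out.getvalue() now holds a two-column marker/allele table listing the
#   # called alleles for M1.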
def add_arguments(parser):
parser.add_argument('infile', nargs='?', metavar="IN", default=sys.stdin,
type=argparse.FileType('r'),
help="the CSV data file to process (default: read from stdin)")
parser.add_argument('outfile', nargs='?', metavar="OUT",
default=sys.stdout, type=argparse.FileType('w'),
help="the file to write the output to (default: write to stdout)")
parser.add_argument('-m', '--min-allele-pct', metavar="N", type=float,
default=_DEF_MIN_ALLELE_PCT,
help="call heterozygous if the second allele is at least this "
"percentage of the highest allele (default: %(default)s)")
parser.add_argument('-M', '--max-noise-pct', metavar="N", type=float,
default=_DEF_MAX_NOISE_PCT, help="output additional \"NOISY\" allele "
"if the highest non-allelic sequence is at least this "
"percentage of the highest allele (default: %(default)s)")
parser.add_argument('-a', '--max-alleles', metavar="N", type=pos_int_arg,
default=_DEF_MAX_ALLELES, help="allow no more than this number of "
"alleles per marker (default: %(default)s)")
parser.add_argument('-c', '--stuttermark-column', metavar="COLNAME",
default=None,
help="name of column with Stuttermark output; if specified, sequences "
"for which the value in this column does not start with ALLELE "
"are ignored")
#add_arguments
def run(args):
if args.infile.isatty() and args.outfile.isatty():
raise ValueError("please specify an input file, or pipe in the output "
"of another program")
find_alleles(args.infile, args.outfile, args.min_allele_pct,
args.max_noise_pct, args.stuttermark_column, args.max_alleles)
#run
def main():
"""
Main entry point.
"""
parser = argparse.ArgumentParser(
description=__doc__)
try:
add_arguments(parser)
run(parser.parse_args())
except OSError as error:
parser.error(error)
#main
if __name__ == "__main__":
main()
#!/usr/bin/env python
"""
Convert between TSSV (tab-separated) and FDSTools (ini-style) library formats.
"""
import argparse
import sys
import re
from ..lib import parse_library
from ConfigParser import RawConfigParser
__version__ = "0.1dev"
def convert_library(infile, outfile, aliases=False):
pattern_reverse = re.compile(r"\(([ACGT]+)\)\{(\d+),(\d+)\}")
library = parse_library(infile)
if "aliases" in library:
# FDSTools -> TSSV
markers = set()
for marker in library["flanks"]:
markers.add(marker)
for marker in library["prefix"]:
markers.add(marker)
for marker in library["suffix"]:
markers.add(marker)
for marker in library["regex"]:
markers.add(marker)
marker_aliases = {}
for alias in library["aliases"]:
marker = library["aliases"][alias]["marker"]
markers.add(marker)
if marker not in marker_aliases:
marker_aliases[marker] = [alias]
else:
marker_aliases[marker].append(alias)
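# marker_aliases now maps each marker name to the list of aliases
# defined for it.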
newline = ""
for marker in sorted(markers):
if marker in library["aliases"] and not aliases:
# Ignore this alias, it will be merged into its marker.
continue
if marker in library["aliases"] and aliases:
# Output this alias as a separate marker.
if marker in library["flanks"]:
flanks = library["flanks"][marker]
elif library["aliases"][marker]["marker"] in library["flanks"]:
flanks = library["flanks"][
library["aliases"][marker]["marker"]]
else:
continue # Worthless, no flanks.
if marker in library["regex"]:
pattern = pattern_reverse.findall(
library["regex"][marker].pattern)
elif aliases or marker not in marker_aliases:
# Normal marker, or a marker output separately from its aliases.
if marker not in library["flanks"]:
continue # Worthless, no flanks.
flanks = library["flanks"][marker]
if marker in library["regex"]:
pattern = pattern_reverse.findall(
library["regex"][marker].pattern)
else:
# Merge marker with its aliases.
flanks = False
if marker in library["flanks"]:
flanks = library["flanks"][marker]
else:
for alias in marker_aliases[marker]:
if alias in library["flanks"]:
flanks = library["flanks"][alias]
break
if not flanks:
continue # Worthless, no flanks.
prefixes = set()
suffixes = set()
if marker in library["prefix"]:
prefixes.update(library["prefix"][marker])
if marker in library["suffix"]:
suffixes.update(library["suffix"][marker])
middle = []
if marker in library["regex"]:
# This marker has a regex next to its aliases.
# Check if the aliases fit the regex without change.
unmatched = []
for alias in marker_aliases[marker]:
allele = []
if marker in library["prefix"]:
allele.append(library["prefix"][marker][0])
allele.append(library["aliases"][alias]["sequence"])
if marker in library["suffix"]:
allele.append(library["suffix"][marker][0])
allele = "".join(allele)
if library["regex"][marker].match(allele) is None:
unmatched.append(
library["aliases"][alias]["sequence"])
middle = pattern_reverse.findall(
library["regex"][marker].pattern)[len(prefixes):]
if len(suffixes):
middle = middle[:-len(suffixes)]
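# middle now holds the repeat triples of the marker's regex, with the
# leading prefix blocks and trailing suffix blocks stripped off.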
if unmatched:
middle = map(lambda x: (x[0], "0", x[2]), middle) + \
map(lambda x: (x, "0", "1"), unmatched)
# Add prefixes and suffixes of aliases.
if marker in marker_aliases:
for alias in marker_aliases[marker]:
if alias in library["prefix"]:
prefixes.update(library["prefix"][alias])
if alias in library["suffix"]:
suffixes.update(library["suffix"][alias])
if marker not in library["regex"]:
middle.append((
library["aliases"][alias]["sequence"],
"0", "1"))
# Final regex is prefixes + middle + suffixes.
pattern = []
for prefix in prefixes:
pattern.append((prefix, "0", "1"))
pattern += middle
for suffix in suffixes:
pattern.append((suffix, "0", "1"))
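# Write one TSSV library line: marker name, left flank, right flank, and
# the pattern as space-separated (sequence, min, max) triples.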
outfile.write(newline + "%s\t%s\t%s\t%s" % (
marker, flanks[0], flanks[1],
" ".join(map(lambda x: "%s %s %s" % x, pattern))))
newline = "\n"
else:
# TSSV -> FDSTools
ini = RawConfigParser(allow_no_value=True)
ini.optionxform = str
# Create sections. Most of them will be empty but we will put
# comments in them to explain how to use them.
ini.add_section("aliases")
ini.set("aliases", "; Specify three comma-separated values: marker "
"name, sequence, and allele name.")
ini.add_section("flanks")
ini.set("flanks", "; Specify two comma-separated values: left flank "
"and right flank.")
ini.add_section("prefix")
ini.set("prefix", "; Specify all possible prefix sequences separated "
"by commas. The first sequence")
ini.set("prefix", "; listed is used as the reference sequence when "
"generating allele names.")
ini.add_section("suffix")
ini.set("suffix", "; Specify all possible suffix sequences separated "
"by commas. The first sequence")
ini.set("suffix", "; listed is used as the reference sequence when "
"generating allele names.")
ini.add_section("repeat")
ini.set("repeat", "; Specify the STR repeat structure in "
"space-separated triples of sequence,")
ini.set("repeat", "; minimum number of repeats, and maximum number of "
"repeats.")
ini.add_section("length_adjust")
ini.set("length_adjust", "; When generating allele names, the CE "
"allele number is based on the length")
ini.set("length_adjust", "; of the sequence (prefix+repeat+suffix) "
"minus the adjustment specified here.")
ini.add_section("block_length")
ini.set("block_length", "; Specify the core repeat unit lengths. The "
"default length is 4.")
# Enter flanking sequences and STR definitions.
fmt = "%%-%is" % reduce(max, map(len,
set(library["flanks"].keys() + library["regex"].keys())), 0)
for marker in sorted(library["flanks"]):
ini.set("flanks", fmt%marker, ", ".join(library["flanks"][marker]))
for marker in sorted(library["regex"]):
blocks = pattern_reverse.findall(library["regex"][marker].pattern)
ini.set("repeat", fmt%marker, " ".join(map(
lambda x: "%s %s %s" % x, blocks)))
# Try to infer block length from the regular expression.
length_counts = {0: 0}
for block in blocks:
amount = (int(block[1])+int(block[2]))/2.
if len(block[0]) not in length_counts:
length_counts[len(block[0])] = amount
else:
length_counts[len(block[0])] += amount
block_length = sorted(
length_counts, key=lambda x: -length_counts[x])[0]
if block_length != 0 and block_length < 10:
ini.set("block_length", fmt%marker, block_length)
# TODO: I could also do some fiddling for prefix/suffix...
# Write INI file.
ini.write(outfile)
#convert_library
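# For reference, the TSSV -> FDSTools branch above produces an ini-style file
# roughly like the following (marker name and sequences are hypothetical):
#
#   [flanks]
#   ; Specify two comma-separated values: left flank and right flank.
#   MyMarker = AAGCTT, GGATCC
#
#   [repeat]
#   ; Specify the STR repeat structure in space-separated triples of sequence,
#   ; minimum number of repeats, and maximum number of repeats.
#   MyMarker = AGAT 0 12 TGAT 0 4
#
#   [block_length]
#   ; Specify the core repeat unit lengths. The default length is 4.
#   MyMarker = 4
#
# (plus the [aliases], [prefix], [suffix] and [length_adjust] sections, which
# contain only the explanatory comments set above.)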
def add_arguments(parser):
parser.add_argument('infile', nargs='?', metavar="IN", default=sys.stdin,
type=argparse.FileType('r'),
help="input library file, the format is automatically detected "
"(default: read from stdin)")
parser.add_argument('outfile', nargs='?', metavar="OUT",
default=sys.stdout, type=argparse.FileType('w'),
help="the file to write the output to (default: write to stdout)")
parser.add_argument('-a', '--aliases', action="store_true",
help="if specified, aliases in FDSTools libraries are converted to "
"separate markers in the output library; otherwise, they are "
"merged into their respective markers")
#add_arguments
def run(args):
if args.infile.isatty() and args.outfile.isatty():
raise ValueError("please specify an input file, or pipe in the output "
"of another program")
convert_library(args.infile, args.outfile, args.aliases)
#run
def main():
"""
Main entry point.
"""
parser = argparse.ArgumentParser(
description=__doc__)
try:
add_arguments(parser)
run(parser.parse_args())
except OSError as error:
parser.error(error)
#main
if __name__ == "__main__":
main()
#!/usr/bin/env python
"""
Convert between raw sequences, TSSV-style sequences, and allele names.
"""
import argparse
import sys
from ..lib import get_column_ids, ensure_sequence_format, parse_library
__version__ = "0.1dev"
# Default values for parameters are specified below.
# Default name of the column that contains the marker name.
# This value can be overridden by the -m command line option.
_DEF_COLNAME_MARKER = "name"
# Default name of the column that contains the allele.
# This value can be overridden by the -a command line option.
_DEF_COLNAME_ALLELE = "allele"
# Default name of the column to write the output to.
# This value can be overridden by the -o command line option.
_DEF_COLNAME_ALLELE_OUT = "allele"
def convert_sequences(infile, outfile, to_format, libfile=None,
fixed_marker=None, colname_marker=_DEF_COLNAME_MARKER,
colname_allele=_DEF_COLNAME_ALLELE,
colname_allele_out=_DEF_COLNAME_ALLELE_OUT):
library = parse_library(libfile) if libfile is not None else None
column_names = infile.readline().rstrip("\r\n").split("\t")
colid_allele = get_column_ids(column_names, colname_allele)
if library is None:
fixed_marker = "" # Don't need marker names without library.
if fixed_marker is None:
colid_marker = get_column_ids(column_names, colname_marker)
try:
colid_allele_out = get_column_ids(column_names, colname_allele_out)
except:
column_names.append(colname_allele_out)
colid_allele_out = -1
outfile.write("\t".join(column_names) + "\n")
for line in infile:
line = line.rstrip("\r\n").split("\t")
if colid_allele_out == -1:
line.append("")
marker = line[colid_marker] if fixed_marker is None else fixed_marker
line[colid_allele_out] = ensure_sequence_format(
line[colid_allele], to_format, marker=marker, library=library)
outfile.write("\t".join(line) + "\n")
#convert_sequences
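# A minimal usage sketch (Python 2, matching this package; the input value is
# purely illustrative and assumes a tssv -> raw conversion needs no library
# file):
#
#   from StringIO import StringIO
#   out = StringIO()
#   convert_sequences(StringIO("allele\nAGAT(2)TGAT(1)\n"), out, "raw")
#   # Conversions to or from allele names additionally require a library file
#   # (the -l/--library option defined below).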
def add_arguments(parser):
parser.add_argument('format', metavar="FORMAT",
help="the format to convert to: one of 'raw', 'tssv', or 'allelename'")
parser.add_argument('infile', nargs='?', metavar="IN", default=sys.stdin,
type=argparse.FileType('r'),
help="the tab-separated data file to process (default: read from "
"stdin)")
parser.add_argument('outfile', nargs='?', metavar="OUT",
default=sys.stdout, type=argparse.FileType('w'),
help="the file to write the output to (default: write to stdout)")
parser.add_argument('-m', '--marker-column', metavar="COLNAME",
default=_DEF_COLNAME_MARKER,
help="name of the column that contains the marker name "
"(default: '%(default)s')")
parser.add_argument('-a', '--allele-column', metavar="COLNAME",
default=_DEF_COLNAME_ALLELE,
help="name of the column that contains the allele "
"(default: '%(default)s')")
parser.add_argument('-o', '--output-column', metavar="COLNAME",
default=_DEF_COLNAME_ALLELE_OUT,
help="name of the column to write the output to "
"(default: '%(default)s')")
parser.add_argument('-M', '--marker', metavar="MARKER",
help="assume the specified marker for all sequences in the file")
parser.add_argument('-l', '--library', metavar="LIBRARY",
type=argparse.FileType('r'),
help="library file for sequence format conversion")
#add_arguments
def run(args):
if args.infile.isatty() and args.outfile.isatty():
raise ValueError("please specify an input file, or pipe in the output "
"of another program")
convert_sequences(args.infile, args.outfile, args.format, args.library,
args.marker, args.marker_column, args.allele_column,
args.output_column)
#run
def main():
"""
Main entry point.
"""
parser = argparse.ArgumentParser(
description=__doc__)
try:
add_arguments(parser)
run(parser.parse_args())
except OSError as error:
parser.error(error)
#main
if __name__ == "__main__":
main()
......@@ -5,11 +5,11 @@ of stutter product vs the parent allele.
"""
import argparse
import sys
import re
from ..lib import pos_int_arg, print_db, PAT_TSSV_BLOCK, get_column_ids
from ..lib import pos_int_arg, print_db, PAT_TSSV_BLOCK, get_column_ids, \
ensure_sequence_format, parse_library