Commit 668970ed authored by jhoogenboom

Initial commit

FDSTools v0.0.1 with Stuttermark v1.3.
Other tools will come later.
*.pyc
dist/*
*.egg-info/*
Forensic DNA Sequencing Tools
=============================
Tools for filtering and interpretation of Next Generation Sequencing data of
forensic DNA samples. To obtain a list of included tools with a brief
description of each tool, run:
``fdstools -h``
For a complete description of the command line arguments of a specific tool,
run:
``fdstools -h TOOLNAME``
Installation
------------
The recommended way to install FDSTools is by using the ``pip`` package
installer. If you have ``pip`` installed, you can easily install FDSTools by
typing:
``pip install fdstools``
Alternatively, FDSTools can be installed by running:
``python setup.py install``
FDSTools Changelog
------------------
v0.0.1
- Initial version
- Includes Stuttermark v1.3
Stuttermark
-----------
Mark potential stutter products by assuming a fixed maximum percentage of
stutter product vs the parent allele.
Input
Tab-separated file with at least these three columns:
- 'name': the name of the marker
- 'allele': the allele name, as a TSSV-style sequence, e.g.,
"``AGAT(12)TGAT(4)``"
- 'total': the total number of reads
This format is compatible with 'knownalleles.csv' files created by TSSV.
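As an illustration, a TSSV-style allele string can be split into its repeat
blocks with a small regular expression; the sketch below is a stand-alone
example (Stuttermark itself uses an equivalent pattern internally)::

    import re

    # Hypothetical helper, shown for illustration only.
    TSSV_BLOCK = re.compile(r"([ACTG]+)\((\d+)\)")
    blocks = [(unit, int(count))
              for unit, count in TSSV_BLOCK.findall("AGAT(12)TGAT(4)")]
    # blocks == [('AGAT', 12), ('TGAT', 4)]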
Output
The same file, with an additional column (named 'annotation' by default).
The new column contains '``STUTTER``' for possible stutter products, or
'``ALLELE``' otherwise. Lines that were not evaluated are annotated as
'``UNKNOWN``'. The ``STUTTER`` annotation contains additional information.
For example,
``STUTTER:146.6x1(2-1):10.4x2(2-1x9-1)``
This is a stutter product for which at most 146.6 reads have come from the
first sequence in the output file ("``146.6x1``") and at most 10.4 reads
have come from the second sequence in the output file ("``10.4x2``"). This
sequence differs from the first sequence in the output file by the loss of
one repeat in the second repeat block ("``2-1``"), and it differs from the
second sequence by the loss of one repeat in the second block *and* one
repeat in the ninth block ("``2-1x9-1``").
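The annotation string is straightforward to parse downstream; a minimal
sketch (hypothetical helper, not part of FDSTools)::

    import re

    def parse_stutter(annotation):
        """Split e.g. 'STUTTER:146.6x1(2-1):10.4x2(2-1x9-1)' into tuples of
        (max_reads, origin_sequence_number, [(block, repeat_change), ...])."""
        origins = []
        for part in annotation.split(":")[1:]:
            reads, origin, blocks = re.match(
                r"([\d.]+)x(\d+)\((.*)\)$", part).groups()
            origins.append((float(reads), int(origin),
                            [(int(m.group(1)), int(m.group(2)))
                             for m in re.finditer(r"(\d+)([+-]\d+)", blocks)]))
        return origins

    parse_stutter("STUTTER:146.6x1(2-1):10.4x2(2-1x9-1)")
    # [(146.6, 1, [(2, -1)]), (10.4, 2, [(2, -1), (9, -1)])]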
Changelog
~~~~~~~~~
v1.3
- First version of Stuttermark to be included in ``fdstools``
- Fixed crash that occurred when an empty allele (e.g., a primer dimer)
was encountered
- Stuttermark now prints a warning if an allele is encountered that is
not a TSSV-style sequence
v1.2
- All settings are now available from the command line
- Use 1-based indexing in ``STUTTER`` annotations
v1.1
- Stuttermark now accepts file names and the minimum number of reads to
evaluate as command line arguments
v1.0
- Initial version
"""
Tools for characterisation and filtering of PCR stutter artefacts and other
systemic noise in Next Generation Sequencing data of forensic STR markers.
"""
__version_info__ = ('0', '0', '1')
__version__ = '.'.join(__version_info__)
usage = __doc__.split("\n\n\n")
def version(name, toolname=None, toolversion=None):
"""Return a version string for the package or a given tool."""
verformat = "%s %s"
toolverformat = "%s (part of %s)"
if toolname is None:
return verformat % (name, __version__)
if toolversion is None:
return toolverformat % (toolname, verformat % (name, __version__))
return toolverformat % (verformat % (toolname, toolversion),
verformat % (name, __version__))
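# Example output (illustrative): version("fdstools") returns "fdstools 0.0.1";
# version("fdstools", "stuttermark", "1.3") returns
# "stuttermark 1.3 (part of fdstools 0.0.1)".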
#!/usr/bin/env python
import argparse, pkgutil, os
import tools
from . import usage, version
# Map tool names to argument parsers.
__tools__ = {}
class _HelpAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
# If valid 'tool' argument is given, call that tool's help.
if values and values[0] in __tools__:
__tools__[values[0]].print_help()
__tools__[values[0]].exit()
else:
parser.print_help()
parser.exit()
#__call__
#_HelpAction
class _VersionAction(argparse.Action):
def __call__(self, parser, namespace, values, option_string=None):
# If valid 'tool' argument is given, print that tool's version.
if values and values[0] in __tools__:
parser = __tools__[values[0]]
formatter = parser._get_formatter()
formatter.add_text(parser.version)
parser.exit(message=formatter.format_help())
#__call__
#_VersionAction
def main():
"""
Main entry point.
"""
parser = argparse.ArgumentParser(add_help=False, description=usage[0],
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.version = version(parser.prog)
parser.add_argument('-v', "--version", action=_VersionAction,
default=argparse.SUPPRESS, nargs=argparse.REMAINDER,
help="show version number and exit")
parser.add_argument('-h', '--help', action=_HelpAction,
default=argparse.SUPPRESS, nargs=argparse.REMAINDER,
help="show this help message, or help for the "
"specified TOOL, and exit")
subparsers = parser.add_subparsers(title='available tools', dest='tool',
metavar='TOOL', help="specify which "
"tool to run")
subparsers.required = True
prefix = tools.__name__ + "."
for importer, name, ispkg in pkgutil.iter_modules(
path=[os.path.join(
os.path.dirname(os.path.abspath(__file__)),
"tools")]):
module = importer.find_module(prefix + name).load_module(prefix + name)
subparser = subparsers.add_parser(
name, help=module.__doc__, description=module.__doc__,
version=version(parser.prog, name, module.__version__))
__tools__[name] = subparser
module.add_arguments(subparser)
subparser.set_defaults(func=module.run)
try:
args = parser.parse_args()
args.func(args)
except OSError as error:
parser.error(error)
#main
if __name__ == "__main__":
main()
#!/usr/bin/env python
import argparse
import re
# Patterns that match entire sequences.
PAT_SEQ_RAW = re.compile(r"^[ACGT]*$")
PAT_SEQ_TSSV = re.compile(r"^(?:[ACGT]+\(\d+\))*$")
PAT_SEQ_ALLELENAME = re.compile(
    r"^(?:(?:\d+(?:\.\d+)?_(?:[ACGT]+\[\d+\])+)|[XY])"  # nn_ACGT[nn]...
    r"(?:_[-+]\d+(?:\.1)?(?P<a>(?:(?<=\.1)-)|(?<!\.1)[ACGT]+)>"  # _+3A>
    r"(?!(?P=a))(?:[ACTG]+|-))*$")  # Portion of variants after '>'.
# Pattern that matches blocks of TSSV-style sequences.
PAT_TSSV_BLOCK = re.compile(r"([ACTG]+)\((\d+)\)")
def detect_sequence_format(seq):
"""Return format of seq. One of 'raw', 'tssv', or 'allelename'."""
if not seq:
raise ValueError("Empty sequence.")
if PAT_SEQ_RAW.match(seq):
return 'raw'
if PAT_SEQ_TSSV.match(seq):
return 'tssv'
if PAT_SEQ_ALLELENAME.match(seq):
return 'allelename'
raise ValueError("Unrecognised sequence format.")
#detect_sequence_format
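# Illustrative examples: detect_sequence_format("ACGTACGT") returns 'raw',
# while detect_sequence_format("AGAT(12)TGAT(4)") returns 'tssv'.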
def get_column_ids(column_names, *names):
"""Find all names in column_names and return their indices."""
result = []
for name in names:
try:
result.append(column_names.index(name))
except ValueError:
raise Exception("Column not found in input file: %s" % name)
if len(result) == 1:
return result[0]
return tuple(result)
#get_column_ids
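# Illustrative example: get_column_ids(["name", "allele", "total"], "total",
# "allele", "name") returns (2, 1, 0); with a single name, the bare index is
# returned instead of a 1-tuple.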
def pos_int_arg(value):
"""Convert str to int, raise ArgumentTypeError if not positive."""
if not value.isdigit() or not int(value):
raise argparse.ArgumentTypeError(
"invalid positive int value: '%s'" % value)
return int(value)
#pos_int_arg
def print_db(text, debug):
"""Print text if debug is True."""
if debug:
print(text)
#print_db
#!/usr/bin/env python
"""
Mark potential stutter products by assuming a fixed maximum percentage
of stutter product vs the parent allele.
"""
import argparse
import sys
import re
from ..lib import pos_int_arg, print_db, PAT_TSSV_BLOCK, get_column_ids
__version__ = "1.3"
# Default values for parameters are specified below.
# Link a difference in repeat count to a maximum occurrence percentage
# for stutter peaks. Any repeat count differences other than those
# listed here are taken to have zero probability.
# This value can be overridden by the -s command line option.
_DEF_STUTTERDEF = "-1:15,+1:4"
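# For example, the default "-1:15,+1:4" means that a stutter product one
# repeat shorter than its parent may have at most 15% of the parent's reads,
# and one repeat longer at most 4%.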
# Any peaks with less than this number of reads are not evaluated.
# This value can be overridden by the -m command line option.
_DEF_MIN_READS = 2
# Stutters are not expected to appear for blocks with less than
# _DEF_MIN_REPEATS repeats. If two alleles have a different repeat count
# for any block with less than _DEF_MIN_REPEATS repeats, it is not possible
# that one of them is a stutter of the other. Therefore, no comparison
# is made between any two such sequences.
# This value can be overridden by the -n command line option.
_DEF_MIN_REPEATS = 3
# A peak is only annotated as a stutter product of some other peak if
# the expected number of stutter reads originating from that other peak
# is above this value.
# This value can be overridden by the -r command line option.
_DEF_MIN_REPORT = 0.1
# Default name of the column we are adding.
# This value can be overridden by the -c command line option.
_DEF_COLNAME = "annotation"
def load_data(infile, colname_annotation=_DEF_COLNAME):
"""
Read data from the file handle infile and return a tuple
(column_names, data_rows).
The input file is expected to have tab-delimited columns with
column names on the first line and one allele per data row.
There are three required columns:
name The name of the marker this allele belongs to.
allele The allele sequence in TSSV output format.
E.g., AGAT(7)TGAT(3).
total The total number of reads with this sequence.
An additional column "annotation" is appended. All data rows
are given a value of "UNKNOWN" for this column.
:arg infile: Open readable handle to data file.
:type infile: stream
:returns: A 2-tuple (column_names, data_rows).
:rtype: tuple(list, list)
"""
# Get column numbers. We are adding a column as well.
column_names = infile.readline().rstrip("\r\n").split("\t")
colid_total, colid_allele, colid_name = get_column_ids(column_names,
"total", "allele", "name")
column_names.append(colname_annotation)
# Step through the file line by line to build the allele list.
allelelist = []
for line in infile:
columns = line.rstrip("\r\n").split("\t")
# Skip empty or malformed lines. Each data line must have as many
# columns as the header (which may contain more columns than the
# three required ones).
if len(columns) != len(column_names)-1:
if len(line.strip()):
print("WARNING: skipped line: %s" % line)
continue
# Split the allele column into a list of tuples:
# [('ACTG','4'),('CCTC','12'),...]
columns[colid_allele] = PAT_TSSV_BLOCK.findall(columns[colid_allele])
if columns[colid_allele] is None:
print("WARNING: skipped line: %s" % line)
continue
# String to integer conversion...
columns[colid_allele] = map(
lambda x: [x[0], int(x[1])], columns[colid_allele])
columns[colid_total] = int(columns[colid_total])
# Repeat unit deduplication (data may contain stuff like
# "AGAT(7)AGAT(5)" instead of "AGAT(12)").
if columns[colid_allele]:
dedup = [columns[colid_allele][0]]
for i in range(1, len(columns[colid_allele])):
if columns[colid_allele][i][0] == dedup[-1][0]:
dedup[-1][1] += columns[colid_allele][i][1]
else:
dedup.append(columns[colid_allele][i])
columns[colid_allele] = dedup
# Add the allele to the list, including our new column.
columns.append("UNKNOWN")
allelelist.append(columns)
return column_names, allelelist
#load_data
def annotate_alleles(infile, outfile, stutter, min_reads=_DEF_MIN_READS,
min_repeats=_DEF_MIN_REPEATS, min_report=_DEF_MIN_REPORT,
colname_annotation=_DEF_COLNAME, debug=False):
"""
Read data from the file handle infile and write annotated data to
file handle outfile.
The input file is expected to have tab-delimited columns with
column names on the first line and one allele per data row.
There are three required columns:
name The name of the marker this allele belongs to.
allele The allele sequence in TSSV output format.
E.g., AGAT(7)TGAT(3).
total The total number of reads with this sequence.
An additional annotation column is appended. All data rows
are given a value of "ALLELE", "STUTTER:...", or "UNKNOWN".
Alleles that are annotated as "STUTTER" are given a colon-separated
description of their expected origins. For each originating allele
a description of the form AxB(C) is given, with A the maximum
expected number of stutter reads from this origin, B the allele
number of the originating allele (where the first allele in the
output file is allele 1), and C an 'x'-separated list of the repeat
blocks that stuttered.
Example stutter description:
STUTTER:123.8x1(3-1):20.2x2(4+1)
This stutter allele has two originating alleles: a maximum of 123.8
reads from allele 1 plus a maximum of 20.2 reads from allele 2. Compared
to allele 1, this is a -1 stutter in the third repeat block.
Compared to allele 2, this is a +1 stutter of the fourth repeat
block. (If this allele had more than 144 total reads, it would have
been annotated as "ALLELE".)
:arg infile: Open readable handle to data file.
:type infile: stream
:arg outfile: Open writable handle for output data.
:type outfile: stream
:arg stutter: A dict mapping stutter size (keys) to a maximum
expected percentage of stutter of this size.
:type stutter: dict{int: float}
:arg min_reads: Any alleles with less than this number of reads are
not evaluated (annotation=UNKNOWN).
:type min_reads: int
:arg min_repeats: Stutters are not expected to appear for blocks
with less than this number of repeats.
:type min_repeats: int
:arg min_report: Stutters with an expected number of reads below
this value are not reported.
:type min_report: float
:arg colname_annotation: Name of the newly added column.
:type colname_annotation: str
:arg debug: If True, print debug output to stdout.
:type debug: bool
"""
column_names, allelelist = load_data(infile, colname_annotation)
colid_total, colid_allele, colid_name = get_column_ids(column_names,
"total", "allele", "name")
# Sort by marker name, then by total read count (descending).
allelelist.sort(key=lambda allele:
[allele[colid_name], -allele[colid_total]])
for iCurrent in range(len(allelelist)):
# See if this allele appeared a reasonable number of times.
if allelelist[iCurrent][colid_total] < min_reads:
continue
isStutterPeakOf = {}
maximumOccurrenceExpected = 0
# Find all ways to reach allele iCurrent by mutating the other
# alleles that appeared more often.
for iOther in range(iCurrent):
# Must be same marker.
if (allelelist[iCurrent][colid_name] !=
allelelist[iOther][colid_name]):
continue
print_db('%i vs %i' % (iCurrent+1, iOther+1), debug)
# Must be same number of blocks.
if (len(allelelist[iCurrent][colid_allele]) !=
len(allelelist[iOther][colid_allele])):
print_db('Different number of blocks', debug)
continue
maximumOccurrenceExpectedForThisOther = 1.0
blocksThatDiffer = {}
# What is needed to get from sequence iOther to iCurrent?
# We will look at each block in turn.
for block in range(len(allelelist[iCurrent][colid_allele])):
# If mutations are needed to get from iOther to
# iCurrent, iCurrent can't be a stutter of iOther.
if (allelelist[iCurrent][colid_allele][block][0] !=
allelelist[iOther][colid_allele][block][0]):
print_db('Block %i has different sequence' % (block+1),
debug)
break
# See how many times the current block appears more or
# less often in iCurrent as compared to iOther.
deltaRepeats = (allelelist[iCurrent][colid_allele][block][1] -
allelelist[iOther][colid_allele][block][1])
if deltaRepeats == 0:
continue
# iCurrent is not a stutter of iOther if any one of its
# blocks differs in count while either count is below
# min_repeats. It is not a stutter of iOther because
# those are expected not to occur for such low repeat
# counts. At the same time any further analysis
# between the two is invalid because this block was not
# the same. Note that if this truly was a stutter of
# iOther, maximumOccurrenceExpected is going to be a
# bit too low now.
if min(allelelist[iCurrent][colid_allele][block][1],
allelelist[iOther][colid_allele][block][1]) < min_repeats:
print_db('Block %i has low-count difference: %i and %i' %
(block+1, allelelist[iCurrent][colid_allele][block][1],
allelelist[iOther][colid_allele][block][1]), debug)
break
# Depending on the repeat count difference, iCurrent
# may appear at most a certain amount of times.
# This is expressed as a percentage of the occurrence
# of iOther.
# If the repeat count difference is highly improbable,
# this can't be a stutter.
elif deltaRepeats not in stutter:
print_db('Improbable difference in number of repeats of '
'block %i: %i' % (block+1, deltaRepeats), debug)
break
# Based on this block only, sequence iCurrent could be
# a stutter of sequence iOther. It may appear only a
# limited number of times, however.
else:
blocksThatDiffer[block] = deltaRepeats
maximumOccurrenceExpectedForThisOther *= \
stutter[deltaRepeats] / 100.0
print_db('Occurrence percentage now %s' %
maximumOccurrenceExpectedForThisOther, debug)
else:
# If we end up here, iCurrent could very well be a
# stutter peak of iOther, but it might be below the
# reporting threshold.
thisStutterOccurrenceExpected = \
(maximumOccurrenceExpectedForThisOther *
allelelist[iOther][colid_total])
if thisStutterOccurrenceExpected < min_report:
print_db('Expected occurrence of stutter from '
'%i to %i is only %d' %
(iOther+1, iCurrent+1,
thisStutterOccurrenceExpected), debug)
else:
isStutterPeakOf[iOther] = (blocksThatDiffer,
thisStutterOccurrenceExpected)
maximumOccurrenceExpected += thisStutterOccurrenceExpected
print_db('Expected occurrence now %s' %
maximumOccurrenceExpected, debug)
# Finally, check whether this allele appears more often than
# stutter alone could explain.
if allelelist[iCurrent][colid_total] > maximumOccurrenceExpected:
print_db('Occurs too often (%i > %d)' %
(allelelist[iCurrent][colid_total],
maximumOccurrenceExpected), debug)
allelelist[iCurrent][-1] = "ALLELE"
else:
# Stutter peak detected.
allelelist[iCurrent][-1] = "STUTTER:%s" % ":".join(map(
lambda iOther: "%.1fx%i(%s)" % (
isStutterPeakOf[iOther][1],
iOther+1,
"x".join(map(
lambda block: "%i%+i" % (
block+1,
isStutterPeakOf[iOther][0][block]),
isStutterPeakOf[iOther][0]))),
isStutterPeakOf))
print_db("%2i is a stutter peak of %s; occur=%i, maxOccur=%s" %
(iCurrent+1, isStutterPeakOf,
allelelist[iCurrent][colid_total],
maximumOccurrenceExpected), debug)
# Reconstruct the allele sequence and write out the findings.
for i in range(len(allelelist)):
allelelist[i][colid_allele] = "".join(map(
lambda block: "%s(%i)" % tuple(block),
allelelist[i][colid_allele]))
outfile.write("\t".join(column_names))
outfile.write("\n")
outfile.write("\n".join(map(
lambda allele: "\t".join(map(str, allele)), allelelist)))
outfile.write("\n")
#annotate_alleles
def stutter_def_arg(value):
"""
Convert a stutter definition string to dict, raise ArgumentTypeError
on failure.
The stutter definition string is a comma-separated list of int:float
pairs, e.g., "-1:15,+1:3.5". Whitespace around the separators is
permitted, e.g., "-1: 15, +1: 3.5".
:arg value: A valid stutter definition string.
:type value: str
:returns: A dict mapping stutter size (keys) to a maximum expected
percentage of stutter of this size.
:rtype: dict{int: float}
"""
if ":" not in value:
return {}
try:
return {int(x[0]): float(x[1])
for x in map(lambda x: x.split(":"), value.split(","))}
except ValueError as e:
raise argparse.ArgumentTypeError(
"invalid stutter definition value: '%s' (%s)" % (value, str(e)))
#stutter_def_arg
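# Illustrative example: stutter_def_arg("-1:15,+1:4") returns
# {-1: 15.0, 1: 4.0}.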
def add_arguments(parser):
parser.add_argument('infile', nargs='?', metavar="IN", default=sys.stdin,
type=argparse.FileType('r'),
help="the CSV data file to process (default: read from stdin)")
parser.add_argument('outfile', nargs='?', metavar="OUT",
default=sys.