Commit d14b28d7 authored by jhoogenboom's avatar jhoogenboom
Browse files

Various progress, mostly developing allelefinder

* Fixed crash when attempting to read a TSSV library from sys.stdin.
* Various large updates to allelefinder.
* libconvert now gives a useful default FDSTools library when given
  no input.
parent 160594c5
......@@ -52,6 +52,9 @@ Input
- 'name': the name of the marker (optional)
This format is compatible with 'knownalleles.csv' files created by TSSV_.
If raw sequences or allele names are provided, Stuttermark can convert
those to TSSV-style sequences automatically if a library file is given as
well.
Output
The same file, with an additional column (named 'annotation' by default).
......
......@@ -42,15 +42,15 @@ def main():
parser = argparse.ArgumentParser(add_help=False, description=usage[0],
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.version = version(parser.prog)
parser.add_argument('-d', "--debug", action="store_true",
help="if specified, debug output is printed to stdout")
parser.add_argument('-v', "--version", action=_VersionAction,
default=argparse.SUPPRESS, nargs=argparse.REMAINDER,
help="show version number and exit")
parser.add_argument('-h', '--help', action=_HelpAction,
default=argparse.SUPPRESS, nargs=argparse.REMAINDER,
help="show this help message, or help for the "
"specified TOOL, and exit")
parser.add_argument('-v', "--version", action=_VersionAction,
default=argparse.SUPPRESS, nargs=argparse.REMAINDER,
help="show version number and exit")
parser.add_argument('-d', "--debug", action="store_true",
help="if specified, debug output is printed to stdout")
subparsers = parser.add_subparsers(title='available tools', dest='tool',
metavar='TOOL', help="specify which "
"tool to run")
......@@ -66,10 +66,10 @@ def main():
name, help=module.__doc__, description=module.__doc__,
version=version(parser.prog, name, module.__version__))
__tools__[name] = subparser
module.add_arguments(subparser)
subparser.set_defaults(func=module.run)
subparser.add_argument('-d', "--debug", action="store_true",
help="if specified, debug output is printed to stdout")
module.add_arguments(subparser)
subparser.set_defaults(func=module.run)
try:
args = parser.parse_args()
except Exception as error:
......
......@@ -3,6 +3,7 @@
import re, sys
from ConfigParser import RawConfigParser, MissingSectionHeaderError
from StringIO import StringIO
from collections import defaultdict
# Patterns that match entire sequences.
......@@ -13,7 +14,7 @@ PAT_SEQ_ALLELENAME = re.compile(
"(?:_[-+]\d+(?:\.1)?(?P<a>(?:(?<=\.1)-)|(?<!\.1)[ACGT]+)>" # _+3A>
"(?!(?P=a))(?:[ACTG]+|-))*$") # Portion of variants after '>'.
# Pattern that matches blocks of TSSV-style sequences and allele names.
# Patterns that match blocks of TSSV-style sequences and allele names.
PAT_TSSV_BLOCK = re.compile("([ACGT]+)\((\d+)\)")
PAT_ALLELENAME_BLOCK = re.compile("([ACGT]+)\[(\d+)\]")
......@@ -201,6 +202,9 @@ def mutate_sequence(sequence, variants):
def parse_library(handle):
if handle == sys.stdin:
# Can't seek on pipes, so read it into a buffer first.
handle = StringIO(sys.stdin.read())
try:
return parse_library_ini(handle)
except MissingSectionHeaderError:
......@@ -570,6 +574,15 @@ def ensure_sequence_format(seq, to_format, from_format=None, library=None,
#ensure_sequence_format
def get_tag(filename, tag_expr, tag_format):
    """Return formatted sample tag from filename using regex.

    tag_expr is a compiled regular expression; its capturing groups are
    substituted into the template tag_format (e.g. r"\1").  If the
    expression does not match filename, or the template references a
    group that does not exist, the filename is returned unchanged as a
    best-effort fallback.
    """
    try:
        # search() returns None on no match (-> AttributeError on
        # .expand()); expand() raises re.error for bad group references.
        # A bare "except:" here would also swallow KeyboardInterrupt and
        # SystemExit, so catch only what this expression can raise.
        return tag_expr.search(filename).expand(tag_format)
    except (AttributeError, re.error):
        return filename
#get_tag
def get_column_ids(column_names, *names):
"""Find all names in column_names and return their indices."""
result = []
......
#!/usr/bin/env python
"""
Find true alleles in a single-person reference sample.
Find true alleles in reference samples and detect possible
contaminations.
"""
import argparse
import sys
import re
from ..lib import get_column_ids, pos_int_arg
from ..lib import get_column_ids, pos_int_arg, get_tag
__version__ = "0.1dev"
# Default values for parameters are specified below.
# Default regular expression to capture sample tags in file names.
# This value can be overridden by the -e command line option.
_DEF_TAG_EXPR = "^(.+?)(?:\.[^.]+)?$"
# Default formatting template to write sample tags.
# This value can be overridden by the -f command line option.
_DEF_TAG_FORMAT = "\\1"
# Default minimum number of reads required for the highest allele.
# This value can be overridden by the -n command line option.
_DEF_MIN_READS = 50
# Default minimum number of reads required for an allele to be called,
# as a percentage of the number of reads of the highest allele.
# This value can be overridden by the -m command line option.
_DEF_MIN_ALLELE_PCT = 30.0
# Default maximum amount of noise to allow, as a percentage of the
# number of reads of the highest allele of each marker. If any noise
# (i.e., non-allelic sequences) above this threshold is detected, the
# sample is considered 'noisy' for this marker.
# This value can be overridden by the -M command line option.
_DEF_MAX_NOISE_PCT = 10.0
# Default maximum number of alleles to expect for each marker.
# This value can be overridden by the -a command line option.
_DEF_MAX_ALLELES = 2
# Default maximum number of noisy markers allowed per sample.
# This value can be overridden by the -x command line option.
_DEF_MAX_NOISY = 2
def find_alleles(filelist, reportfile, tag_expr, tag_format, min_reads,
                 min_allele_pct, max_noise_pct, max_alleles, max_noisy,
                 stuttermark_column):
    """Call alleles in each input file, printing results to stdout.

    A header row is written first; every file in filelist is then
    processed by find_alleles_sample under the sample tag derived from
    its file name via tag_expr/tag_format.
    """
    # Header of the tab-separated output table.
    print("\t".join(["sample", "marker", "total", "allele"]))
    for handle in filelist:
        sample_tag = get_tag(handle.name, tag_expr, tag_format)
        find_alleles_sample(handle, reportfile, sample_tag, min_reads,
                            min_allele_pct, max_noise_pct, max_alleles,
                            max_noisy, stuttermark_column)
#find_alleles
def find_alleles_sample(infile, reportfile, tag, min_reads, min_allele_pct,
max_noise_pct, max_alleles, max_noisy,
stuttermark_column):
def find_alleles(infile, outfile, min_allele_pct, max_noise_pct,
stuttermark_column, max_alleles):
# Get column numbers.
column_names = infile.readline().rstrip("\r\n").split("\t")
colid_total, colid_allele, colid_name = get_column_ids(column_names,
......@@ -26,8 +72,8 @@ def find_alleles(infile, outfile, min_allele_pct, max_noise_pct,
if stuttermark_column is not None:
colid_stuttermark = get_column_ids(column_names, stuttermark_column)
highest_noise = {}
highest_allele = {}
top_noise = {}
top_allele = {}
alleles = {}
for line in infile:
line = line.rstrip("\r\n").split("\t")
......@@ -39,60 +85,119 @@ def find_alleles(infile, outfile, min_allele_pct, max_noise_pct,
allele = line[colid_allele]
reads = int(line[colid_total])
if marker in alleles:
if reads > highest_allele[marker]:
if marker not in alleles:
alleles[marker] = {allele: reads}
top_allele[marker] = reads
top_noise[marker] = ["-", 0]
else:
if reads > top_allele[marker]:
# New highest allele!
highest_allele[marker] = reads
for allele in alleles[marker]:
if (alleles[marker][allele] <
marker_max[marker] * (min_allele_pct/100.)):
if alleles[marker][allele] > highest_noise[marker]:
highest_noise[marker] = alleles[marker][allele]
del alleles[marker][allele]
elif reads >= highest_allele[marker]*(min_allele_pct/100.):
top_allele[marker] = reads
for allelex in alleles[marker].keys():
if (alleles[marker][allelex] <
top_allele[marker] * (min_allele_pct/100.)):
if alleles[marker][allelex] > top_noise[marker][1]:
top_noise[marker] = [
allelex, alleles[marker][allelex]]
del alleles[marker][allelex]
alleles[marker][allele] = reads
elif reads >= top_allele[marker]*(min_allele_pct/100.):
# New secondary allele!
alleles[marker][allele] = reads
elif reads >= highest_noise[marker]:
elif reads >= top_noise[marker][1]:
# New highest noise!
highest_noise[marker] = reads
else:
alleles[marker] = {allele: reads}
highest_allele[marker] = reads
highest_noise[marker] = 0
top_noise[marker] = [allele, reads]
outfile.write("\t".join(["marker", "allele"]) + "\n")
# Find and eliminate noisy markers in this sample first.
noisy_markers = 0
for marker in alleles:
if top_allele[marker] < min_reads:
if reportfile:
reportfile.write(
"Sample %s is not suitable for marker %s:\n"
"highest allele has only %i reads\n\n" %
(tag, marker, top_allele[marker]))
alleles[marker] = {}
continue
if len(alleles[marker]) > max_alleles:
allele_order = sorted(alleles[marker],
key=lambda x: -alleles[marker][x])
highest_noise[marker] = alleles[marker][allele_order[max_alleles]]
top_noise[marker] = [allele_order[max_alleles],
alleles[marker][allele_order[max_alleles]]]
alleles[marker] = {x: alleles[marker][x]
for x in allele_order[:max_alleles]}
for allele in alleles[marker]:
outfile.write("\t".join([marker, allele]) + "\n")
if highest_noise[marker] > highest_allele[marker]*(max_noise_pct/100.):
outfile.write("\t".join([marker, "NOISY"]) + "\n")
#find_alleles
if top_noise[marker][1] > top_allele[marker]*(max_noise_pct/100.):
if reportfile:
reportfile.write(
"Sample %s is not suitable for marker %s:\n"
"highest non-allele is %.1f%% of the highest allele\n" %
(tag, marker,
100.*top_noise[marker][1]/top_allele[marker]))
for allele in sorted(alleles[marker],
key=lambda x: -alleles[marker][x]):
reportfile.write("%i\tALLELE\t%s\n" %
(alleles[marker][allele], allele))
reportfile.write("%i\tNOISE\t%s\n\n" %
(top_noise[marker][1], top_noise[marker][0]))
noisy_markers += 1
alleles[marker] = {}
# Drop this sample completely if it has too many noisy markers.
if noisy_markers > max_noisy:
if reportfile:
reportfile.write("Sample %s appears to be contaminated!\n\n" % tag)
return
# The sample is OK, write out its alleles.
for marker in alleles:
for allele in sorted(alleles[marker],
key=lambda x: -alleles[marker][x]):
print("\t".join(
[tag, marker, str(alleles[marker][allele]), allele]))
#find_alleles_sample
def add_arguments(parser):
parser.add_argument('infile', nargs='?', metavar="IN", default=sys.stdin,
type=argparse.FileType('r'),
help="the CSV data file to process (default: read from stdin)")
parser.add_argument('outfile', nargs='?', metavar="OUT",
default=sys.stdout, type=argparse.FileType('w'),
help="the file to write the output to (default: write to stdout)")
parser.add_argument('filelist', nargs='*', metavar="FILE",
default=[sys.stdin], type=argparse.FileType('r'),
help="the data file(s) to process (default: read from stdin)")
parser.add_argument('-r', '--report', metavar="OUTFILE",
type=argparse.FileType("w"),
help="write a report to the given file, detailing possibly "
"contaminated or otherwise unsuitable samples")
parser.add_argument('-e', '--tag-expr', metavar="REGEX", type=re.compile,
default=_DEF_TAG_EXPR,
help="regular expression that captures (using one or more capturing "
"groups) the sample tags from the file names; by default, the "
"entire file name except for its extension (if any) is captured")
parser.add_argument('-f', '--tag-format', metavar="EXPR",
default=_DEF_TAG_FORMAT,
help="format of the sample tags produced; a capturing group reference "
"like '\\n' refers to the n-th capturing group in the regular "
"expression specified with -e/--tag-expr (the default of '\\1' "
"simply uses the first capturing group); with a single sample, "
"you can enter the samle tag here explicitly")
parser.add_argument('-n', '--min-reads', metavar="N", type=pos_int_arg,
default=_DEF_MIN_READS,
help="require at least this number of reads for the highest allele "
"(default: %(default)s)")
parser.add_argument('-m', '--min-allele-pct', metavar="N", type=float,
default=_DEF_MIN_ALLELE_PCT,
help="call heterozygous if the second allele is at least this "
"percentage of the highest allele (default: %(default)s)")
parser.add_argument('-M', '--max-noise-pct', metavar="N", type=float,
default=_DEF_MAX_NOISE_PCT, help="output additional \"NOISY\" allele "
"if the highest non-allelic sequence is at least this "
"percentage of the highest allele (default: %(default)s)")
default=_DEF_MAX_NOISE_PCT,
help="a sample is considered contaminated/unsuitable for a marker if "
"the highest non-allelic sequence is at least this percentage of "
"the highest allele (default: %(default)s)")
parser.add_argument('-a', '--max-alleles', metavar="N", type=pos_int_arg,
default=_DEF_MAX_ALLELES, help="allow no more than this number of "
"alleles per marker (default: %(default)s)")
default=_DEF_MAX_ALLELES,
help="allow no more than this number of alleles per marker (default: "
"%(default)s)")
parser.add_argument('-x', '--max-noisy', metavar="N", type=pos_int_arg,
default=_DEF_MAX_NOISY,
help="entirely reject a sample if more than this number of markers "
"have a high non-allelic sequence (default: %(default)s)")
parser.add_argument('-c', '--stuttermark-column', metavar="COLNAME",
default=None,
help="name of column with Stuttermark output; if specified, sequences "
......@@ -102,11 +207,13 @@ def add_arguments(parser):
def run(args):
if args.infile.isatty() and args.outfile.isatty():
if args.filelist == [sys.stdin] and sys.stdin.isatty():
raise ValueError("please specify an input file, or pipe in the output "
"of another program")
find_alleles(args.infile, args.outfile, args.min_allele_pct,
args.max_noise_pct, args.stuttermark_column, args.max_alleles)
find_alleles(args.filelist, args.report, args.tag_expr, args.tag_format,
args.min_reads, args.min_allele_pct, args.max_noise_pct,
args.max_alleles, args.max_noisy, args.stuttermark_column)
#run
......
#!/usr/bin/env python
"""
Convert between TSSV (tab-separated) and FDSTools (ini-style) library formats.
Convert between TSSV (tab-separated) and FDSTools (ini-style) library
formats. When no input is given, an empty FDSTools library is produced.
"""
import argparse
import sys
......@@ -8,10 +9,19 @@ import re
from ..lib import parse_library
from ConfigParser import RawConfigParser
from StringIO import StringIO
__version__ = "0.1dev"
# If no input is given, convert the following to FDSTools format.
# This is one TSSV-style library line with four tab-separated columns:
# marker name, left flank, right flank, and the repeat structure as
# space-separated (sequence, min, max) triples.
_DEFAULT_LIBRARY = "\t".join([
    "MyMarker",
    "ACTAGCTAGCGCTA",
    "GCTCGATCGATCGA",
    "TGAT 0 2 AGAT 3 20 ACCT 0 5"])
def convert_library(infile, outfile, aliases=False):
pattern_reverse = re.compile("\(([ACGT]+)\)\{(\d+),(\d+)\}")
library = parse_library(infile)
......@@ -36,7 +46,6 @@ def convert_library(infile, outfile, aliases=False):
else:
marker_aliases[marker].append(alias)
newline = ""
for marker in sorted(markers):
if marker in library["aliases"] and not aliases:
# Ignore this alias, it will be merged into its marker.
......@@ -126,10 +135,9 @@ def convert_library(infile, outfile, aliases=False):
for suffix in suffixes:
pattern.append((suffix, "0", "1"))
outfile.write(newline + "%s\t%s\t%s\t%s" % (
outfile.write("%s\t%s\t%s\t%s\n" % (
marker, flanks[0], flanks[1],
" ".join(map(lambda x: "%s %s %s" % x, pattern))))
newline = "\n"
else:
# TSSV -> FDSTools
......@@ -141,6 +149,11 @@ def convert_library(infile, outfile, aliases=False):
ini.add_section("aliases")
ini.set("aliases", "; Specify three comma-separated values: marker "
"name, sequence, and allele name.")
ini.set("aliases", "; You may use the alias name to specify flanks, "
"prefix, and suffix for this")
ini.set("aliases", "; allele specifically. You cannot specify a "
"repeat structure for an alias.")
ini.set("aliases", ";MyAlias = MyMarker, AGCTAGC, MySpecialAlleleName")
ini.add_section("flanks")
ini.set("flanks", "; Specify two comma-separated values: left flank "
"and right flank.")
......@@ -148,12 +161,18 @@ def convert_library(infile, outfile, aliases=False):
ini.set("prefix", "; Specify all possible prefix sequences separated "
"by commas. The first sequence")
ini.set("prefix", "; listed is used as the reference sequence when "
"generating allele names.")
"generating allele names. The")
ini.set("prefix", "; prefix is the sequence between the left flank "
"and the repeat and is omitted")
ini.set("prefix", "; from allele names. Deviations from the reference "
"are expressed as variants.")
ini.add_section("suffix")
ini.set("suffix", "; Specify all possible suffix sequences separated "
"by commas. The first sequence")
ini.set("suffix", "; listed is used as the reference sequence when "
"generating allele names.")
"generating allele names. The")
ini.set("suffix", "; suffix is the sequence between the repeat and "
"the right flank.")
ini.add_section("repeat")
ini.set("repeat", "; Specify the STR repeat structure in "
"space-separated triples of sequence,")
......@@ -165,7 +184,7 @@ def convert_library(infile, outfile, aliases=False):
ini.set("length_adjust", "; of the sequence (prefix+repeat+suffix) "
"minus the adjustment specified here.")
ini.add_section("block_length")
ini.set("block_length", "; Specify the core repeat unit lengths. The "
ini.set("block_length", "; Specify the core repeat unit length. The "
"default length is 4.")
# Enter flanking sequences and STR definitions.
......@@ -214,9 +233,9 @@ def add_arguments(parser):
def run(args):
if args.infile.isatty() and args.outfile.isatty():
raise ValueError("please specify an input file, or pipe in the output "
"of another program")
if args.infile.isatty():
# No input given. Produce a default FDSTools library.
args.infile = StringIO(_DEFAULT_LIBRARY)
convert_library(args.infile, args.outfile, args.aliases)
#run
......
......@@ -56,7 +56,8 @@ def convert_sequences(infile, outfile, to_format, libfile=None,
def add_arguments(parser):
parser.add_argument('format', metavar="FORMAT",
help="the format to convert to: one of 'raw', 'tssv', or 'allelename'")
choices=["raw", "tssv", "allelename"],
help="the format to convert to: one of %(choices)s")
parser.add_argument('infile', nargs='?', metavar="IN", default=sys.stdin,
type=argparse.FileType('r'),
help="the tab-separated data file to process (default: read from "
......@@ -77,7 +78,7 @@ def add_arguments(parser):
help="name of the column to write the output to "
"(default: '%(default)s')")
parser.add_argument('-M', '--marker', metavar="MARKER",
help="assume the specified marker for all sequences in the file")
help="assume the specified marker for all sequences")
parser.add_argument('-l', '--library', metavar="LIBRARY",
type=argparse.FileType('r'),
help="library file for sequence format conversion")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment