Commit 732e83ba authored by jhoogenboom's avatar jhoogenboom
Browse files

Moved sample data reading code into lib.py

parent 8685a304
...@@ -62,7 +62,8 @@ def main(): ...@@ -62,7 +62,8 @@ def main():
"tools")]): "tools")]):
module = importer.find_module(prefix + name).load_module(prefix + name) module = importer.find_module(prefix + name).load_module(prefix + name)
subparser = subparsers.add_parser( subparser = subparsers.add_parser(
name, help=module.__doc__, description=module.__doc__, name, help=module.__doc__.split("\n\n\n", 1)[0],
description=module.__doc__,
version=version(parser.prog, name, module.__version__)) version=version(parser.prog, name, module.__version__))
__tools__[name] = subparser __tools__[name] = subparser
subparser.add_argument('-d', "--debug", action="store_true", subparser.add_argument('-d', "--debug", action="store_true",
......
#!/usr/bin/env python #!/usr/bin/env python
import re, sys, argparse import re, sys, argparse, random
#import numpy as np # Imported only when calling nnls() #import numpy as np # Imported only when calling nnls()
from ConfigParser import RawConfigParser, MissingSectionHeaderError from ConfigParser import RawConfigParser, MissingSectionHeaderError
from StringIO import StringIO from StringIO import StringIO
from collections import defaultdict
# Patterns that match entire sequences. # Patterns that match entire sequences.
PAT_SEQ_RAW = re.compile("^[ACGT]*$") PAT_SEQ_RAW = re.compile("^[ACGT]*$")
...@@ -273,8 +272,8 @@ def parse_library_ini(handle): ...@@ -273,8 +272,8 @@ def parse_library_ini(handle):
"suffix": {}, "suffix": {},
"regex": {}, "regex": {},
"regex_middle": {}, "regex_middle": {},
"length_adjust": defaultdict(int), "length_adjust": {},
"block_length": defaultdict(lambda: 4), "block_length": {},
"aliases": {} "aliases": {}
} }
markers = set() markers = set()
...@@ -735,12 +734,8 @@ def convert_sequence_raw_allelename(seq, library, marker): ...@@ -735,12 +734,8 @@ def convert_sequence_raw_allelename(seq, library, marker):
# TODO: perhaps produce a more intelligent name if there is exactly # TODO: perhaps produce a more intelligent name if there is exactly
# one alias with the same length # one alias with the same length
blocknames = [] blocknames = []
if "block_length" in library: blocksize = library.get("block_length", {}).get(marker, 4)
blocksize = library["block_length"][marker] length -= library.get("length_adjust", {}).get(marker, 0)
else:
blocksize = 4
if "length_adjust" in library:
length -= library["length_adjust"][marker]
for block in blocks: for block in blocks:
blocknames.append("%s[%s]" % (block[0], block[1])) blocknames.append("%s[%s]" % (block[0], block[1]))
length += len(block[0]) * int(block[1]) length += len(block[0]) * int(block[1])
...@@ -875,6 +870,103 @@ def adjust_stats(value, stats=None): ...@@ -875,6 +870,103 @@ def adjust_stats(value, stats=None):
#adjust_stats #adjust_stats
def read_sample_data_file(infile, data, annotation_column=None, seqformat=None,
                          library=None, default_marker=None):
    """Add data from infile to data dict as data[marker, allele] = [fwd, rev].

    infile is a tab-separated table with at least the columns 'allele',
    'forward' and 'reverse'.  If a 'name' column is present it supplies the
    marker name of each row; otherwise default_marker is used.  If
    annotation_column names an existing column, rows whose value in that
    column starts with "ALLELE" are collected and returned as a list of
    (marker, allele) tuples.  When seqformat is given, allele sequences are
    converted to that format using library.
    """
    # Get column numbers from the header line.
    column_names = infile.readline().rstrip("\r\n").split("\t")
    colid_allele, colid_forward, colid_reverse = \
        get_column_ids(column_names, "allele", "forward", "reverse")

    # Get marker name column if it exists.
    try:
        colid_name = get_column_ids(column_names, "name")
    except Exception:  # Column is optional; fall back to default_marker.
        colid_name = None

    # Also try to get annotation column if we have one.
    if annotation_column is not None:
        try:
            colid_annotation = get_column_ids(column_names, annotation_column)
        except Exception:  # Annotation column is optional as well.
            annotation_column = None

    found_alleles = []
    for line in infile:
        line = line.rstrip("\r\n").split("\t")
        marker = line[colid_name] if colid_name is not None else default_marker
        allele = line[colid_allele] if seqformat is None \
            else ensure_sequence_format(line[colid_allele], seqformat, library)
        if (annotation_column is not None and
                line[colid_annotation].startswith("ALLELE")):
            # Bug fix: append a single (marker, allele) tuple; the original
            # two-argument list.append call raises TypeError.
            found_alleles.append((marker, allele))
        # Store counts as a real list (not a lazy map object on Python 3)
        # so that callers such as reduce_read_counts can mutate them.
        data[marker, allele] = [
            int(line[colid_forward]), int(line[colid_reverse])]
    return found_alleles
#read_sample_data_file
def reduce_read_counts(data, limit_reads):
    """Randomly drop reads until at most limit_reads reads remain.

    data maps (marker, allele) to a mutable [forward, reverse] count pair;
    counts are decremented in place.  If the total number of reads does not
    exceed limit_reads, data is left untouched.
    """
    sum_reads = sum(sum(data[markerallele]) for markerallele in data)
    if sum_reads <= limit_reads:
        return

    # Draw the (0-based) positions of the reads to drop, treating all reads
    # as one long sequence in dict iteration order.  (The original code had
    # an extra skip-ahead loop here that could never execute: seen was 0 and
    # all sampled positions are >= 0.  It has been removed as dead code.)
    remove = sorted(random.sample(range(sum_reads), sum_reads - limit_reads))
    i = 0
    seen = 0
    for markerallele in data:
        for direction in (0, 1):
            seen += data[markerallele][direction]
            # Drop every selected read position that falls within the reads
            # counted so far; each position hits exactly one count once.
            while i < len(remove) and seen > remove[i]:
                data[markerallele][direction] -= 1
                i += 1
#reduce_read_counts
def get_sample_data(tags_to_files, callback, allelelist=None,
                    annotation_column=None, seqformat=None, library=None,
                    marker=None, homozygotes=False, limit_reads=sys.maxint,
                    drop_samples=0):
    """Read all sample data files and pass each sample's data to callback.

    tags_to_files maps a sample tag to a list of open input files.  For each
    tag, the files are parsed with read_sample_data_file and callback(tag,
    data) is invoked with the combined data dict.  If allelelist is given,
    it is updated in place with the alleles found (per tag, per marker),
    optionally restricted to one marker and, with homozygotes=True, to
    markers having exactly one allele.  limit_reads caps the number of reads
    per sample (randomly subsampled); drop_samples randomly removes that
    fraction of samples from tags_to_files before processing.
    """
    if drop_samples:
        # Randomly drop the requested fraction of samples.
        sample_tags = list(tags_to_files)
        for i in random.sample(range(len(sample_tags)),
                               int(len(sample_tags) * drop_samples)):
            del tags_to_files[sample_tags[i]]

    for tag in tags_to_files:
        data = {}
        alleles = set()
        for infile in tags_to_files[tag]:
            alleles.update(read_sample_data_file(infile, data,
                annotation_column, seqformat, library, marker))
        if limit_reads < sys.maxint:
            reduce_read_counts(data, limit_reads)
        if allelelist is not None:
            if tag not in allelelist:
                allelelist[tag] = {}
            for found_marker, allele in alleles:
                if found_marker not in allelelist[tag]:
                    allelelist[tag][found_marker] = set()
                allelelist[tag][found_marker].add(allele)
            if marker:
                # Restrict to the single requested marker.
                if marker in allelelist[tag]:
                    allelelist[tag] = {marker: allelelist[tag][marker]}
                else:
                    allelelist[tag] = {}
            if homozygotes:
                # Bug fix: do not reuse the 'marker' parameter as the loop
                # variable here; it clobbered the requested marker for all
                # subsequent sample tags.  Iterate a list copy so keys can
                # be deleted safely during iteration.
                for mkr in list(allelelist[tag]):
                    if len(allelelist[tag][mkr]) > 1:
                        del allelelist[tag][mkr]
        callback(tag, data)
#get_sample_data
def get_column_ids(column_names, *names): def get_column_ids(column_names, *names):
"""Find all names in column_names and return their indices.""" """Find all names in column_names and return their indices."""
result = [] result = []
......
...@@ -5,10 +5,10 @@ contaminations. ...@@ -5,10 +5,10 @@ contaminations.
""" """
import argparse import argparse
import sys import sys
import re
from ..lib import get_column_ids, pos_int_arg, map_tags_to_files, \ from ..lib import get_column_ids, pos_int_arg, map_tags_to_files, \
add_sample_files_args, ensure_sequence_format add_sample_files_args, ensure_sequence_format, \
get_sample_data
__version__ = "0.1dev" __version__ = "0.1dev"
...@@ -43,45 +43,23 @@ _DEF_MAX_NOISY = 2 ...@@ -43,45 +43,23 @@ _DEF_MAX_NOISY = 2
def find_alleles(filelist, reportfile, tag_expr, tag_format, min_reads, def find_alleles(filelist, reportfile, tag_expr, tag_format, min_reads,
min_allele_pct, max_noise_pct, max_alleles, max_noisy, min_allele_pct, max_noise_pct, max_alleles, max_noisy,
stuttermark_column, seqformat, library): stuttermark_column, seqformat, library):
if seqformat is not None and library is not None: if seqformat is not None and library is not None:
library = parse_library(library) library = parse_library(library)
print("\t".join(["sample", "marker", "total", "allele"])) print("\t".join(["sample", "marker", "total", "allele"]))
tags_to_files = map_tags_to_files(filelist, tag_expr, tag_format) allelelist = {}
for tag in tags_to_files: get_sample_data(
data = {} map_tags_to_files(filelist, tag_expr, tag_format),
for infile in tags_to_files[tag]: lambda tag, data: find_alleles_sample(
get_sample_data(infile, data, stuttermark_column) data if stuttermark_column is None
find_alleles_sample(data, reportfile, tag, min_reads, min_allele_pct, else {key: data[key] for key in allelelist[tag]},
max_noise_pct, max_alleles, max_noisy, seqformat, library) reportfile, tag, min_reads, min_allele_pct, max_noise_pct,
max_alleles, max_noisy, seqformat, library),
allelelist,
stuttermark_column)
#find_alleles #find_alleles
def get_sample_data(infile, data, stuttermark_column):
    """Add data from infile to data dict as [marker, allele]=reads."""
    # Locate the required columns in the header line.
    header = infile.readline().rstrip("\r\n").split("\t")
    colid_total, colid_allele, colid_name = get_column_ids(
        header, "total", "allele", "name")

    # The stuttermark column is optional; disable filtering if missing.
    colid_stuttermark = None
    if stuttermark_column is not None:
        try:
            colid_stuttermark = get_column_ids(header, stuttermark_column)
        except:
            stuttermark_column = None

    for rawline in infile:
        fields = rawline.rstrip("\r\n").split("\t")
        # Keep only rows marked as ALLELE when filtering is active.
        if (stuttermark_column is None or
                fields[colid_stuttermark].startswith("ALLELE")):
            data[fields[colid_name], fields[colid_allele]] = \
                int(fields[colid_total])
#get_sample_data
def find_alleles_sample(data, reportfile, tag, min_reads, min_allele_pct, def find_alleles_sample(data, reportfile, tag, min_reads, min_allele_pct,
max_noise_pct, max_alleles, max_noisy, seqformat, max_noise_pct, max_alleles, max_noisy, seqformat,
library): library):
...@@ -89,7 +67,7 @@ def find_alleles_sample(data, reportfile, tag, min_reads, min_allele_pct, ...@@ -89,7 +67,7 @@ def find_alleles_sample(data, reportfile, tag, min_reads, min_allele_pct,
top_allele = {} top_allele = {}
alleles = {} alleles = {}
for marker, allele in data: for marker, allele in data:
reads = data[marker, allele] reads = sum(data[marker, allele])
if marker not in alleles: if marker not in alleles:
alleles[marker] = {allele: reads} alleles[marker] = {allele: reads}
......
...@@ -4,14 +4,14 @@ Estimate allele-centric background noise profiles (means). ...@@ -4,14 +4,14 @@ Estimate allele-centric background noise profiles (means).
""" """
import argparse import argparse
import sys import sys
import random
import time import time
import math import math
#import numpy as np # Only imported when actually running this tool. #import numpy as np # Only imported when actually running this tool.
from ..lib import get_column_ids, pos_int_arg, add_sample_files_args,\ from ..lib import get_column_ids, pos_int_arg, add_sample_files_args,\
add_allele_detection_args, map_tags_to_files, nnls,\ add_allele_detection_args, map_tags_to_files, nnls,\
ensure_sequence_format, parse_allelelist, parse_library ensure_sequence_format, parse_allelelist, parse_library,\
get_sample_data
__version__ = "0.1dev" __version__ = "0.1dev"
...@@ -246,57 +246,6 @@ def solve_profile_mixture_single(samples, genotypes, n, variance=False, ...@@ -246,57 +246,6 @@ def solve_profile_mixture_single(samples, genotypes, n, variance=False,
#solve_profile_mixture #solve_profile_mixture
def get_sample_data(infile, data, annotation_column, seqformat, library):
    """Add data from infile to data dict as data[marker, allele] = [fwd, rev].

    infile is a tab-separated table with columns 'name', 'allele',
    'forward' and 'reverse'.  Allele sequences are converted to seqformat
    using library.  If annotation_column names an existing column, rows
    whose value in that column starts with "ALLELE" are collected and
    returned as a list of (marker, allele) tuples.
    """
    # Get column numbers from the header line.
    column_names = infile.readline().rstrip("\r\n").split("\t")
    colid_name, colid_allele, colid_forward, colid_reverse = \
        get_column_ids(column_names, "name", "allele", "forward", "reverse")

    # Also try to get annotation column if we have one.
    if annotation_column is not None:
        try:
            colid_annotation = get_column_ids(column_names, annotation_column)
        except Exception:  # Annotation column is optional.
            annotation_column = None

    found_alleles = []
    for line in infile:
        line = line.rstrip("\r\n").split("\t")
        marker = line[colid_name]
        allele = ensure_sequence_format(line[colid_allele], seqformat, library)
        if (annotation_column is not None and
                line[colid_annotation].startswith("ALLELE")):
            # Bug fix: append one (marker, allele) tuple; the original
            # two-argument list.append call raises TypeError.
            found_alleles.append((marker, allele))
        data[marker, allele] = map(int,
            [line[colid_forward], line[colid_reverse]])
    return found_alleles
#get_sample_data
def reduce_read_counts(data, limit_reads):
    """Randomly drop reads in place until at most limit_reads remain.

    data maps (marker, allele) to a mutable [forward, reverse] read count
    pair.  Does nothing when the total is already within the limit.
    """
    sum_reads = 0
    for markerallele in data:
        sum_reads += sum(data[markerallele])
    if sum_reads <= limit_reads:
        return

    # Positions (0-based, over all reads in dict order) of reads to drop.
    remove = sorted(random.sample(xrange(sum_reads), sum_reads - limit_reads))
    i = 0
    seen = 0
    while i < len(remove) and seen > remove[i]:
        # Skip the reads filtered out above.
        # NOTE(review): seen is 0 here and remove[i] >= 0, so this loop can
        # never execute; it looks like dead code — confirm and remove.
        i += 1
    for markerallele in data:
        for direction in (0, 1):
            seen += data[markerallele][direction]
            # Decrement this count once for each selected read position
            # falling within the reads counted so far.
            while i < len(remove) and seen > remove[i]:
                data[markerallele][direction] -= 1
                i += 1
#reduce_read_counts
def ensure_min_samples(allelelist, min_samples): def ensure_min_samples(allelelist, min_samples):
if min_samples <= 1: if min_samples <= 1:
return return
...@@ -455,41 +404,15 @@ def generate_profiles(filelist, tag_expr, tag_format, allelefile, ...@@ -455,41 +404,15 @@ def generate_profiles(filelist, tag_expr, tag_format, allelefile,
allelelist = {} if allelefile is None \ allelelist = {} if allelefile is None \
else parse_allelelist(allelefile, seqformat, library) else parse_allelelist(allelefile, seqformat, library)
tags_to_files = map_tags_to_files(filelist, tag_expr, tag_format)
# Randomly drop some samples.
sample_tags = tags_to_files.keys()
for tag in random.sample(xrange(len(sample_tags)),
int(len(sample_tags) * drop_samples)):
del tags_to_files[sample_tags[tag]]
# Read sample data. # Read sample data.
sample_data = {} sample_data = {}
for tag in tags_to_files: get_sample_data(map_tags_to_files(filelist, tag_expr, tag_format),
sample_data[tag] = {} lambda tag, data: sample_data.update({tag: data}),
alleles = set() allelelist, annotation_column, seqformat, library, marker,
for infile in tags_to_files[tag]: homozygotes, limit_reads, drop_samples)
alleles.update(get_sample_data(infile, sample_data[tag],
annotation_column, seqformat, library))
if tag not in allelelist:
allelelist[tag] = {}
for markerx, allele in alleles:
if markerx not in allelelist[tag]:
allelelist[tag][markerx] = set()
allelelist[tag][markerx].add(allele)
reduce_read_counts(sample_data[tag], limit_reads)
if marker:
if marker in allelelist[tag]:
allelelist[tag] = {marker: allelelist[tag][marker]}
else:
allelelist[tag] = {}
if homozygotes:
for marker in allelelist[tag].keys():
if len(allelelist[tag][marker]) > 1:
del allelelist[tag][marker]
# Ensure minimum number of samples per allele. # Ensure minimum number of samples per allele.
allelelist = {tag: allelelist[tag] for tag in tags_to_files} allelelist = {tag: allelelist[tag] for tag in sample_data}
ensure_min_samples(allelelist, min_samples) ensure_min_samples(allelelist, min_samples)
# Combine data from all samples. # Combine data from all samples.
......
...@@ -5,11 +5,11 @@ Compute allele-centric statistics for background noise in homozygous samples ...@@ -5,11 +5,11 @@ Compute allele-centric statistics for background noise in homozygous samples
""" """
import argparse import argparse
import sys import sys
import random
from ..lib import get_column_ids, pos_int_arg, add_sample_files_args,\ from ..lib import get_column_ids, pos_int_arg, add_sample_files_args,\
add_allele_detection_args, map_tags_to_files, adjust_stats,\ add_allele_detection_args, map_tags_to_files, adjust_stats,\
ensure_sequence_format, parse_allelelist, parse_library ensure_sequence_format, parse_allelelist, parse_library,\
get_sample_data
__version__ = "0.1dev" __version__ = "0.1dev"
...@@ -36,58 +36,6 @@ _DEF_MIN_SAMPLES = 2 ...@@ -36,58 +36,6 @@ _DEF_MIN_SAMPLES = 2
_DEF_MIN_SAMPLE_PCT = 80. _DEF_MIN_SAMPLE_PCT = 80.
def get_sample_data(infile, data, annotation_column, seqformat, library):
    """Add data from infile to data dict as [marker, allele]=reads."""
    # Get column numbers.
    column_names = infile.readline().rstrip("\r\n").split("\t")
    colid_name, colid_allele, colid_forward, colid_reverse = \
        get_column_ids(column_names, "name", "allele", "forward", "reverse")

    # Also try to get allele column if we have one.
    # NOTE(review): the comment says "allele column" but this looks up the
    # caller-provided annotation column; the column is optional.
    if annotation_column is not None:
        try:
            colid_annotation = get_column_ids(column_names, annotation_column)
        except:
            annotation_column = None

    found_alleles = []
    for line in infile:
        line = line.rstrip("\r\n").split("\t")
        marker = line[colid_name]
        # Convert the sequence only when a target format was requested.
        allele = line[colid_allele] if seqformat is None \
            else ensure_sequence_format(line[colid_allele], seqformat, library)
        if (annotation_column is not None and
                line[colid_annotation].startswith("ALLELE")):
            # NOTE(review): list.append takes one argument; this call raises
            # TypeError.  Should be found_alleles.append((marker, allele)).
            found_alleles.append(marker, allele)
        # Store forward/reverse read counts keyed by (marker, allele).
        data[marker, allele] = map(int,
            [line[colid_forward], line[colid_reverse]])
    return found_alleles
#get_sample_data
def reduce_read_counts(data, limit_reads):
    """Randomly drop reads in place until at most limit_reads remain.

    data maps (marker, allele) to a mutable [forward, reverse] read count
    pair.  Does nothing when the total is already within the limit.
    """
    sum_reads = 0
    for markerallele in data:
        sum_reads += sum(data[markerallele])
    if sum_reads <= limit_reads:
        return

    # Positions (0-based, over all reads in dict order) of reads to drop.
    remove = sorted(random.sample(xrange(sum_reads), sum_reads - limit_reads))
    i = 0
    seen = 0
    while i < len(remove) and seen > remove[i]:
        # Skip the reads filtered out above.
        # NOTE(review): seen is 0 here and remove[i] >= 0, so this loop can
        # never execute; it looks like dead code — confirm and remove.
        i += 1
    for markerallele in data:
        for direction in (0, 1):
            seen += data[markerallele][direction]
            # Decrement this count once for each selected read position
            # falling within the reads counted so far.
            while i < len(remove) and seen > remove[i]:
                data[markerallele][direction] -= 1
                i += 1
#reduce_read_counts
def add_sample_data(data, sample_data, sample_alleles, min_pct, min_abs): def add_sample_data(data, sample_data, sample_alleles, min_pct, min_abs):
# Enter the read counts into data and check the thresholds. # Enter the read counts into data and check the thresholds.
for marker, sequence in sample_data: for marker, sequence in sample_data:
...@@ -145,37 +93,16 @@ def compute_stats(filelist, tag_expr, tag_format, allelefile, ...@@ -145,37 +93,16 @@ def compute_stats(filelist, tag_expr, tag_format, allelefile,
allelelist = {} if allelefile is None \ allelelist = {} if allelefile is None \
else parse_allelelist(allelefile, seqformat, library) else parse_allelelist(allelefile, seqformat, library)
tags_to_files = map_tags_to_files(filelist, tag_expr, tag_format)
# Randomly drop some samples.
sample_tags = tags_to_files.keys()
for tag in random.sample(xrange(len(sample_tags)),
int(len(sample_tags) * drop_samples)):
del tags_to_files[sample_tags[tag]]
# Read sample data. # Read sample data.
data = {} data = {}
for tag in tags_to_files: get_sample_data(
sample_data = {} map_tags_to_files(filelist, tag_expr, tag_format),
alleles = set() lambda tag, sample_data: add_sample_data(
for infile in tags_to_files[tag]: data, sample_data,
alleles.update(get_sample_data(infile, sample_data, {m: allelelist[tag][m].pop() for m in allelelist[tag]},
annotation_column, seqformat, library)) min_pct, min_abs),
if tag not in allelelist: allelelist, annotation_column, seqformat, library, marker, True,
allelelist[tag] = {} limit_reads, drop_samples)
for markerx, allele in alleles:
if markerx not in allelelist[tag]:
allelelist[tag][markerx] = set()
allelelist[tag][markerx].add(allele)
reduce_read_counts(sample_data, limit_reads)
if marker:
if marker in allelelist[tag]:
allelelist[tag] = {marker: allelelist[tag][marker]}
else:
allelelist[tag] = {}
allelelist[tag] = {marker: allelelist[tag][marker].pop()
for marker in allelelist[tag] if len(allelelist[tag][marker]) == 1}
add_sample_data(data, sample_data, allelelist[tag], min_pct, min_abs)
# Ensure minimum number of samples per allele and filter # Ensure minimum number of samples per allele and filter
# insignificant background products. # insignificant background products.
......
#!/usr/bin/env python #!/usr/bin/env python
""" """
Find dirty samples or recurring contaminating alleles. Find dirty reference samples or recurring contaminating alleles.
""" """
import argparse import argparse
import sys import sys
...@@ -9,7 +9,7 @@ import sys ...@@ -9,7 +9,7 @@ import sys
from ..lib import get_column_ids, pos_int_arg, add_sample_files_args,\ from ..lib import get_column_ids, pos_int_arg, add_sample_files_args,\
add_allele_detection_args, map_tags_to_files, nnls,\ add_allele_detection_args, map_tags_to_files, nnls,\
ensure_sequence_format, parse_allelelist, load_profiles,\ ensure_sequence_format, parse_allelelist, load_profiles,\
parse_library parse_library, get_sample_data
__version__ = "0.1dev" __version__ = "0.1dev"
...@@ -21,36 +21,6 @@ __version__ = "0.1dev" ...@@ -21,36 +21,6 @@ __version__ = "0.1dev"
_DEF_NUM = 5 _DEF_NUM = 5
def get_sample_data(infile, data, annotation_column, library):
"""Add data from infile to data dict as [marker, allele]=reads."""
# Get column numbers.
column_names = infile.readline().rstrip("\r\n").split("\t")
colid_name, colid_allele, colid_forward, colid_reverse = \
get_column_ids(column_names, "name", "allele", "forward", "reverse")
# Also try to get allele column if we have one.
if annotation_column is not None:
try:
colid_annotation = get_column_ids(column_names, annotation_column)
except:
annotation_column = None
found_alleles = []
for line in infile:
line = line.rstrip("\r\n").split("\t")
marker = line[colid_name]
allele = ensure_sequence_format(