Commit 732e83ba authored by jhoogenboom's avatar jhoogenboom

Moved sample data reading code into lib.py

parent 8685a304
......@@ -62,7 +62,8 @@ def main():
"tools")]):
module = importer.find_module(prefix + name).load_module(prefix + name)
subparser = subparsers.add_parser(
name, help=module.__doc__, description=module.__doc__,
name, help=module.__doc__.split("\n\n\n", 1)[0],
description=module.__doc__,
version=version(parser.prog, name, module.__version__))
__tools__[name] = subparser
subparser.add_argument('-d', "--debug", action="store_true",
......
#!/usr/bin/env python
import re, sys, argparse
import re, sys, argparse, random
#import numpy as np # Imported only when calling nnls()
from ConfigParser import RawConfigParser, MissingSectionHeaderError
from StringIO import StringIO
from collections import defaultdict
# Patterns that match entire sequences.
PAT_SEQ_RAW = re.compile("^[ACGT]*$")
......@@ -273,8 +272,8 @@ def parse_library_ini(handle):
"suffix": {},
"regex": {},
"regex_middle": {},
"length_adjust": defaultdict(int),
"block_length": defaultdict(lambda: 4),
"length_adjust": {},
"block_length": {},
"aliases": {}
}
markers = set()
......@@ -735,12 +734,8 @@ def convert_sequence_raw_allelename(seq, library, marker):
# TODO: perhaps produce a more intelligent name if there is exactly
# one alias with the same length
blocknames = []
if "block_length" in library:
blocksize = library["block_length"][marker]
else:
blocksize = 4
if "length_adjust" in library:
length -= library["length_adjust"][marker]
blocksize = library.get("block_length", {}).get(marker, 4)
length -= library.get("length_adjust", {}).get(marker, 0)
for block in blocks:
blocknames.append("%s[%s]" % (block[0], block[1]))
length += len(block[0]) * int(block[1])
......@@ -875,6 +870,103 @@ def adjust_stats(value, stats=None):
#adjust_stats
def read_sample_data_file(infile, data, annotation_column=None, seqformat=None,
                          library=None, default_marker=None):
    """Add data from infile to data dict as [marker, allele]=reads.

    infile is an open, tab-separated file whose header line contains at
    least 'allele', 'forward', and 'reverse' columns; a 'name' column,
    if present, supplies the marker name (default_marker is used
    otherwise).  If seqformat is given, each allele is converted with
    ensure_sequence_format() before use.  Read counts are stored into
    data as data[marker, allele] = [forward, reverse].

    Returns a list of (marker, allele) tuples for the rows whose
    annotation_column value starts with "ALLELE".
    """
    # Get column numbers.
    column_names = infile.readline().rstrip("\r\n").split("\t")
    colid_allele, colid_forward, colid_reverse = \
        get_column_ids(column_names, "allele", "forward", "reverse")

    # Get marker name column if it exists.
    try:
        colid_name = get_column_ids(column_names, "name")
    except Exception:
        # Narrowed from a bare 'except:' so SystemExit and
        # KeyboardInterrupt are not silently swallowed.
        colid_name = None

    # Also try to get annotation column if we have one.
    if annotation_column is not None:
        try:
            colid_annotation = get_column_ids(column_names, annotation_column)
        except Exception:
            annotation_column = None

    found_alleles = []
    for line in infile:
        line = line.rstrip("\r\n").split("\t")
        marker = line[colid_name] if colid_name is not None else default_marker
        allele = line[colid_allele] if seqformat is None \
            else ensure_sequence_format(line[colid_allele], seqformat, library)
        if (annotation_column is not None and
                line[colid_annotation].startswith("ALLELE")):
            # BUG FIX: list.append() takes exactly one argument; the
            # original 'append(marker, allele)' raised TypeError as soon
            # as an annotated allele was encountered.  Store a tuple.
            found_alleles.append((marker, allele))
        data[marker, allele] = map(int,
            (line[colid_forward], line[colid_reverse]))
    return found_alleles
#read_sample_data_file
def reduce_read_counts(data, limit_reads):
    """Randomly remove reads from data until limit_reads remain.

    data maps (marker, allele) to a [forward, reverse] pair of read
    counts.  If the grand total of all counts exceeds limit_reads,
    sum_reads - limit_reads individual read positions are chosen
    uniformly at random and the corresponding counts are decremented;
    data is modified in place.
    """
    sum_reads = 0
    for markerallele in data:
        sum_reads += sum(data[markerallele])
    if sum_reads <= limit_reads:
        return

    # 0-based positions (within the concatenated stream of all reads)
    # of the reads to discard, in ascending order.
    remove = sorted(random.sample(xrange(sum_reads), sum_reads - limit_reads))
    i = 0
    seen = 0
    # NOTE(review): with seen == 0 and non-negative sampled positions,
    # this loop body can never execute; it looks like leftover code --
    # confirm the intent before removing it.
    while i < len(remove) and seen > remove[i]:
        # Skip the reads filtered out above.
        i += 1
    for markerallele in data:
        for direction in (0, 1):
            seen += data[markerallele][direction]
            # Discard every selected position that falls within the
            # span of reads counted so far.
            while i < len(remove) and seen > remove[i]:
                data[markerallele][direction] -= 1
                i += 1
#reduce_read_counts
def get_sample_data(tags_to_files, callback, allelelist=None,
                    annotation_column=None, seqformat=None, library=None,
                    marker=None, homozygotes=False, limit_reads=sys.maxint,
                    drop_samples=0):
    """Read the data files of every sample and hand each to callback.

    For each sample tag in tags_to_files, reads all of its files with
    read_sample_data_file(), optionally subsamples the reads down to
    limit_reads, records the annotated alleles per marker in allelelist
    (if given), and finally invokes callback(tag, data).  When marker is
    given, allelelist entries are restricted to that marker; when
    homozygotes is true, markers with more than one allele are dropped.
    A fraction drop_samples of the samples is discarded at random first.
    """
    # Randomly drop a fraction of the samples, if requested.
    if drop_samples:
        sample_tags = tags_to_files.keys()
        drop_count = int(len(sample_tags) * drop_samples)
        for index in random.sample(xrange(len(sample_tags)), drop_count):
            del tags_to_files[sample_tags[index]]

    for tag in tags_to_files:
        data = {}
        found = set()
        for infile in tags_to_files[tag]:
            found.update(read_sample_data_file(
                infile, data, annotation_column, seqformat, library, marker))
        if limit_reads < sys.maxint:
            reduce_read_counts(data, limit_reads)
        if allelelist is not None:
            per_marker = allelelist.setdefault(tag, {})
            for found_marker, allele in found:
                per_marker.setdefault(found_marker, set()).add(allele)
            if marker:
                # Keep only the requested marker's alleles.
                allelelist[tag] = {marker: per_marker[marker]} \
                    if marker in per_marker else {}
            if homozygotes:
                # Drop markers that have more than one annotated allele.
                for marker_name in allelelist[tag].keys():
                    if len(allelelist[tag][marker_name]) > 1:
                        del allelelist[tag][marker_name]
        callback(tag, data)
#get_sample_data
def get_column_ids(column_names, *names):
"""Find all names in column_names and return their indices."""
result = []
......
......@@ -5,10 +5,10 @@ contaminations.
"""
import argparse
import sys
import re
from ..lib import get_column_ids, pos_int_arg, map_tags_to_files, \
add_sample_files_args, ensure_sequence_format
add_sample_files_args, ensure_sequence_format, \
get_sample_data
__version__ = "0.1dev"
......@@ -43,45 +43,23 @@ _DEF_MAX_NOISY = 2
def find_alleles(filelist, reportfile, tag_expr, tag_format, min_reads,
                 min_allele_pct, max_noise_pct, max_alleles, max_noisy,
                 stuttermark_column, seqformat, library):
    """Detect the alleles of each sample and print them as TSV rows."""
    if seqformat is not None and library is not None:
        library = parse_library(library)

    # Header row of the tab-separated output.
    print("\t".join(["sample", "marker", "total", "allele"]))

    # NOTE(review): the loop below and the get_sample_data() call that
    # follows appear to be two versions of the same per-sample dispatch
    # (this span looks like a merged diff hunk); confirm that only one
    # of the two is intended to remain.
    tags_to_files = map_tags_to_files(filelist, tag_expr, tag_format)
    for tag in tags_to_files:
        data = {}
        for infile in tags_to_files[tag]:
            get_sample_data(infile, data, stuttermark_column)
        find_alleles_sample(data, reportfile, tag, min_reads, min_allele_pct,
            max_noise_pct, max_alleles, max_noisy, seqformat, library)

    allelelist = {}
    get_sample_data(
        map_tags_to_files(filelist, tag_expr, tag_format),
        # When a stuttermark column is used, restrict each sample's data
        # to the alleles that get_sample_data recorded in allelelist.
        lambda tag, data: find_alleles_sample(
            data if stuttermark_column is None
            else {key: data[key] for key in allelelist[tag]},
            reportfile, tag, min_reads, min_allele_pct, max_noise_pct,
            max_alleles, max_noisy, seqformat, library),
        allelelist,
        stuttermark_column)
#find_alleles
def get_sample_data(infile, data, stuttermark_column):
    """Add data from infile to data dict as [marker, allele]=reads.

    infile is an open, tab-separated file whose header line contains
    'total', 'allele', and 'name' columns.  If stuttermark_column is
    given and present in the file, only rows whose value in that column
    starts with "ALLELE" are kept.  Each kept row is stored as
    data[name, allele] = total (an int).
    """
    # Get column numbers.
    column_names = infile.readline().rstrip("\r\n").split("\t")
    colid_total, colid_allele, colid_name = get_column_ids(column_names,
        "total", "allele", "name")

    # Also try to get stuttermark column if we have one.
    if stuttermark_column is not None:
        try:
            colid_stuttermark = get_column_ids(column_names,
                stuttermark_column)
        except Exception:
            # Narrowed from a bare 'except:' so SystemExit and
            # KeyboardInterrupt are not silently swallowed; a missing
            # column just disables the stuttermark filter.
            stuttermark_column = None

    for line in infile:
        line = line.rstrip("\r\n").split("\t")
        if (stuttermark_column is not None and
                not line[colid_stuttermark].startswith("ALLELE")):
            continue
        data[line[colid_name], line[colid_allele]] = int(line[colid_total])
#get_sample_data
def find_alleles_sample(data, reportfile, tag, min_reads, min_allele_pct,
max_noise_pct, max_alleles, max_noisy, seqformat,
library):
......@@ -89,7 +67,7 @@ def find_alleles_sample(data, reportfile, tag, min_reads, min_allele_pct,
top_allele = {}
alleles = {}
for marker, allele in data:
reads = data[marker, allele]
reads = sum(data[marker, allele])
if marker not in alleles:
alleles[marker] = {allele: reads}
......
......@@ -4,14 +4,14 @@ Estimate allele-centric background noise profiles (means).
"""
import argparse
import sys
import random
import time
import math
#import numpy as np # Only imported when actually running this tool.
from ..lib import get_column_ids, pos_int_arg, add_sample_files_args,\
add_allele_detection_args, map_tags_to_files, nnls,\
ensure_sequence_format, parse_allelelist, parse_library
ensure_sequence_format, parse_allelelist, parse_library,\
get_sample_data
__version__ = "0.1dev"
......@@ -246,57 +246,6 @@ def solve_profile_mixture_single(samples, genotypes, n, variance=False,
#solve_profile_mixture
def get_sample_data(infile, data, annotation_column, seqformat, library):
    """Add data from infile to data dict as [marker, allele]=reads.

    infile is an open, tab-separated file whose header line contains
    'name', 'allele', 'forward', and 'reverse' columns.  Each allele is
    converted with ensure_sequence_format() and read counts are stored
    as data[marker, allele] = [forward, reverse].

    Returns a list of (marker, allele) tuples for the rows whose
    annotation_column value starts with "ALLELE".
    """
    # Get column numbers.
    column_names = infile.readline().rstrip("\r\n").split("\t")
    colid_name, colid_allele, colid_forward, colid_reverse = \
        get_column_ids(column_names, "name", "allele", "forward", "reverse")

    # Also try to get the annotation column if we have one.
    # (The original comment said "allele column", which was misleading.)
    if annotation_column is not None:
        try:
            colid_annotation = get_column_ids(column_names, annotation_column)
        except Exception:
            # Narrowed from a bare 'except:' so SystemExit and
            # KeyboardInterrupt are not silently swallowed.
            annotation_column = None

    found_alleles = []
    for line in infile:
        line = line.rstrip("\r\n").split("\t")
        marker = line[colid_name]
        allele = ensure_sequence_format(line[colid_allele], seqformat, library)
        if (annotation_column is not None and
                line[colid_annotation].startswith("ALLELE")):
            # BUG FIX: list.append() takes exactly one argument; the
            # original 'append(marker, allele)' raised TypeError.
            found_alleles.append((marker, allele))
        data[marker, allele] = map(int,
            [line[colid_forward], line[colid_reverse]])
    return found_alleles
#get_sample_data
def reduce_read_counts(data, limit_reads):
    """Randomly remove reads from data until limit_reads remain.

    data maps (marker, allele) to a [forward, reverse] pair of read
    counts.  If the grand total of all counts exceeds limit_reads,
    sum_reads - limit_reads individual read positions are chosen
    uniformly at random and the corresponding counts are decremented;
    data is modified in place.
    """
    sum_reads = 0
    for markerallele in data:
        sum_reads += sum(data[markerallele])
    if sum_reads <= limit_reads:
        return

    # 0-based positions (within the concatenated stream of all reads)
    # of the reads to discard, in ascending order.
    remove = sorted(random.sample(xrange(sum_reads), sum_reads - limit_reads))
    i = 0
    seen = 0
    # NOTE(review): with seen == 0 and non-negative sampled positions,
    # this loop body can never execute; it looks like leftover code --
    # confirm the intent before removing it.
    while i < len(remove) and seen > remove[i]:
        # Skip the reads filtered out above.
        i += 1
    for markerallele in data:
        for direction in (0, 1):
            seen += data[markerallele][direction]
            # Discard every selected position that falls within the
            # span of reads counted so far.
            while i < len(remove) and seen > remove[i]:
                data[markerallele][direction] -= 1
                i += 1
#reduce_read_counts
def ensure_min_samples(allelelist, min_samples):
if min_samples <= 1:
return
......@@ -455,41 +404,15 @@ def generate_profiles(filelist, tag_expr, tag_format, allelefile,
allelelist = {} if allelefile is None \
else parse_allelelist(allelefile, seqformat, library)
tags_to_files = map_tags_to_files(filelist, tag_expr, tag_format)
# Randomly drop some samples.
sample_tags = tags_to_files.keys()
for tag in random.sample(xrange(len(sample_tags)),
int(len(sample_tags) * drop_samples)):
del tags_to_files[sample_tags[tag]]
# Read sample data.
sample_data = {}
for tag in tags_to_files:
sample_data[tag] = {}
alleles = set()
for infile in tags_to_files[tag]:
alleles.update(get_sample_data(infile, sample_data[tag],
annotation_column, seqformat, library))
if tag not in allelelist:
allelelist[tag] = {}
for markerx, allele in alleles:
if markerx not in allelelist[tag]:
allelelist[tag][markerx] = set()
allelelist[tag][markerx].add(allele)
reduce_read_counts(sample_data[tag], limit_reads)
if marker:
if marker in allelelist[tag]:
allelelist[tag] = {marker: allelelist[tag][marker]}
else:
allelelist[tag] = {}
if homozygotes:
for marker in allelelist[tag].keys():
if len(allelelist[tag][marker]) > 1:
del allelelist[tag][marker]
get_sample_data(map_tags_to_files(filelist, tag_expr, tag_format),
lambda tag, data: sample_data.update({tag: data}),
allelelist, annotation_column, seqformat, library, marker,
homozygotes, limit_reads, drop_samples)
# Ensure minimum number of samples per allele.
allelelist = {tag: allelelist[tag] for tag in tags_to_files}
allelelist = {tag: allelelist[tag] for tag in sample_data}
ensure_min_samples(allelelist, min_samples)
# Combine data from all samples.
......
......@@ -5,11 +5,11 @@ Compute allele-centric statistics for background noise in homozygous samples
"""
import argparse
import sys
import random
from ..lib import get_column_ids, pos_int_arg, add_sample_files_args,\
add_allele_detection_args, map_tags_to_files, adjust_stats,\
ensure_sequence_format, parse_allelelist, parse_library
ensure_sequence_format, parse_allelelist, parse_library,\
get_sample_data
__version__ = "0.1dev"
......@@ -36,58 +36,6 @@ _DEF_MIN_SAMPLES = 2
_DEF_MIN_SAMPLE_PCT = 80.
def get_sample_data(infile, data, annotation_column, seqformat, library):
    """Add data from infile to data dict as [marker, allele]=reads.

    infile is an open, tab-separated file whose header line contains
    'name', 'allele', 'forward', and 'reverse' columns.  If seqformat is
    given, each allele is converted with ensure_sequence_format().  Read
    counts are stored as data[marker, allele] = [forward, reverse].

    Returns a list of (marker, allele) tuples for the rows whose
    annotation_column value starts with "ALLELE".
    """
    # Get column numbers.
    column_names = infile.readline().rstrip("\r\n").split("\t")
    colid_name, colid_allele, colid_forward, colid_reverse = \
        get_column_ids(column_names, "name", "allele", "forward", "reverse")

    # Also try to get the annotation column if we have one.
    # (The original comment said "allele column", which was misleading.)
    if annotation_column is not None:
        try:
            colid_annotation = get_column_ids(column_names, annotation_column)
        except Exception:
            # Narrowed from a bare 'except:' so SystemExit and
            # KeyboardInterrupt are not silently swallowed.
            annotation_column = None

    found_alleles = []
    for line in infile:
        line = line.rstrip("\r\n").split("\t")
        marker = line[colid_name]
        allele = line[colid_allele] if seqformat is None \
            else ensure_sequence_format(line[colid_allele], seqformat, library)
        if (annotation_column is not None and
                line[colid_annotation].startswith("ALLELE")):
            # BUG FIX: list.append() takes exactly one argument; the
            # original 'append(marker, allele)' raised TypeError.
            found_alleles.append((marker, allele))
        data[marker, allele] = map(int,
            [line[colid_forward], line[colid_reverse]])
    return found_alleles
#get_sample_data
def reduce_read_counts(data, limit_reads):
    """Randomly remove reads from data until limit_reads remain.

    data maps (marker, allele) to a [forward, reverse] pair of read
    counts.  If the grand total of all counts exceeds limit_reads,
    sum_reads - limit_reads individual read positions are chosen
    uniformly at random and the corresponding counts are decremented;
    data is modified in place.
    """
    sum_reads = 0
    for markerallele in data:
        sum_reads += sum(data[markerallele])
    if sum_reads <= limit_reads:
        return

    # 0-based positions (within the concatenated stream of all reads)
    # of the reads to discard, in ascending order.
    remove = sorted(random.sample(xrange(sum_reads), sum_reads - limit_reads))
    i = 0
    seen = 0
    # NOTE(review): with seen == 0 and non-negative sampled positions,
    # this loop body can never execute; it looks like leftover code --
    # confirm the intent before removing it.
    while i < len(remove) and seen > remove[i]:
        # Skip the reads filtered out above.
        i += 1
    for markerallele in data:
        for direction in (0, 1):
            seen += data[markerallele][direction]
            # Discard every selected position that falls within the
            # span of reads counted so far.
            while i < len(remove) and seen > remove[i]:
                data[markerallele][direction] -= 1
                i += 1
#reduce_read_counts
def add_sample_data(data, sample_data, sample_alleles, min_pct, min_abs):
# Enter the read counts into data and check the thresholds.
for marker, sequence in sample_data:
......@@ -145,37 +93,16 @@ def compute_stats(filelist, tag_expr, tag_format, allelefile,
allelelist = {} if allelefile is None \
else parse_allelelist(allelefile, seqformat, library)
tags_to_files = map_tags_to_files(filelist, tag_expr, tag_format)
# Randomly drop some samples.
sample_tags = tags_to_files.keys()
for tag in random.sample(xrange(len(sample_tags)),
int(len(sample_tags) * drop_samples)):
del tags_to_files[sample_tags[tag]]
# Read sample data.
data = {}
for tag in tags_to_files:
sample_data = {}
alleles = set()
for infile in tags_to_files[tag]:
alleles.update(get_sample_data(infile, sample_data,
annotation_column, seqformat, library))
if tag not in allelelist:
allelelist[tag] = {}
for markerx, allele in alleles:
if markerx not in allelelist[tag]:
allelelist[tag][markerx] = set()
allelelist[tag][markerx].add(allele)
reduce_read_counts(sample_data, limit_reads)
if marker:
if marker in allelelist[tag]:
allelelist[tag] = {marker: allelelist[tag][marker]}
else:
allelelist[tag] = {}
allelelist[tag] = {marker: allelelist[tag][marker].pop()
for marker in allelelist[tag] if len(allelelist[tag][marker]) == 1}
add_sample_data(data, sample_data, allelelist[tag], min_pct, min_abs)
get_sample_data(
map_tags_to_files(filelist, tag_expr, tag_format),
lambda tag, sample_data: add_sample_data(
data, sample_data,
{m: allelelist[tag][m].pop() for m in allelelist[tag]},
min_pct, min_abs),
allelelist, annotation_column, seqformat, library, marker, True,
limit_reads, drop_samples)
# Ensure minimum number of samples per allele and filter
# insignificant background products.
......
#!/usr/bin/env python
"""
Find dirty samples or recurring contaminating alleles.
Find dirty reference samples or recurring contaminating alleles.
"""
import argparse
import sys
......@@ -9,7 +9,7 @@ import sys
from ..lib import get_column_ids, pos_int_arg, add_sample_files_args,\
add_allele_detection_args, map_tags_to_files, nnls,\
ensure_sequence_format, parse_allelelist, load_profiles,\
parse_library
parse_library, get_sample_data
__version__ = "0.1dev"
......@@ -21,36 +21,6 @@ __version__ = "0.1dev"
_DEF_NUM = 5
def get_sample_data(infile, data, annotation_column, library):
    """Add data from infile to data dict as [marker, allele]=reads.

    infile is an open, tab-separated file whose header line contains
    'name', 'allele', 'forward', and 'reverse' columns.  Each allele is
    converted to raw sequence format with ensure_sequence_format(), and
    read counts are stored as data[marker, allele] = [forward, reverse].

    Returns a list of (marker, allele) tuples for the rows whose
    annotation_column value starts with "ALLELE".
    """
    # Get column numbers.
    column_names = infile.readline().rstrip("\r\n").split("\t")
    colid_name, colid_allele, colid_forward, colid_reverse = \
        get_column_ids(column_names, "name", "allele", "forward", "reverse")

    # Also try to get the annotation column if we have one.
    # (The original comment said "allele column", which was misleading.)
    if annotation_column is not None:
        try:
            colid_annotation = get_column_ids(column_names, annotation_column)
        except Exception:
            # Narrowed from a bare 'except:' so SystemExit and
            # KeyboardInterrupt are not silently swallowed.
            annotation_column = None

    found_alleles = []
    for line in infile:
        line = line.rstrip("\r\n").split("\t")
        marker = line[colid_name]
        allele = ensure_sequence_format(
            line[colid_allele], "raw", library=library, marker=marker)
        if (annotation_column is not None and
                line[colid_annotation].startswith("ALLELE")):
            # BUG FIX: list.append() takes exactly one argument; the
            # original 'append(marker, allele)' raised TypeError.
            found_alleles.append((marker, allele))
        data[marker, allele] = map(int,
            [line[colid_forward], line[colid_reverse]])
    return found_alleles
#get_sample_data
def add_sample_data(data, sample_data, sample_tag, alleles):
# Add this sample to the data.
use_markers = set()
......@@ -96,24 +66,13 @@ def blame(filelist, tag_expr, tag_format, allelefile, annotation_column, mode,
data[marker]["genotypes"] = []
data[marker]["sample_tags"] = []
tags_to_files = map_tags_to_files(filelist, tag_expr, tag_format)
# Read sample data.
for tag in tags_to_files:
sample_data = {}
alleles = set()
for infile in tags_to_files[tag]:
alleles.update(get_sample_data(
infile, sample_data, annotation_column, library))
if tag not in allelelist:
allelelist[tag] = {}
for marker, allele in alleles:
if marker not in allelelist[tag]:
allelelist[tag][marker] = set()
allelelist[tag][marker].add(allele)
allelelist[tag] = {marker: allelelist[tag][marker] for marker in data
if marker in allelelist[tag]}
add_sample_data(data, sample_data, tag, allelelist[tag])
get_sample_data(
map_tags_to_files(filelist, tag_expr, tag_format),
lambda tag, sample_data: add_sample_data(
data, sample_data, tag,
{m: allelelist[tag][m] for m in data if m in allelelist[tag]}),
allelelist, annotation_column, "raw", library)
print("\t".join(["marker",
"allele" if mode == "common" else "sample",
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment