Commit 818ddd2b authored by jhoogenboom's avatar jhoogenboom
Browse files

Intoducing StutterModel

* New tool StutterModel fits polynomials to stutter ratio vs repeat
  length.
* Changed -R to -Q (--limit-reads) so that I can reassign -R to an
  option that is used more often.
* Changed -r to -R (--report) to make sure it will not collide with
  the -r option in Stuttermark, if I ever want to add report output
  to Stuttermark.
* BGHomStats now checks whether all alleles are detected
parent 7f23c2e0
......@@ -875,6 +875,17 @@ def adjust_stats(value, stats=None):
#adjust_stats
def get_repeat_pattern(seq):
"""Return compiled regular expression that finds repeats of seq."""
return re.compile("".join( # For AGAT, one obtains:
["(?:" * (len(seq)-1)] + # (?:(?:(?:
["%s)?" % x for x in seq[1:]] + # G)?A)?T)?
["(?:", seq, ")+"] + # (?AGAT)+
["(?:%s" % x for x in seq[:-1]] + # (?:A(?:G(?:A
[")?" * (len(seq)-1)])) # )?)?)?
#get_repeat_pattern
def read_sample_data_file(infile, data, annotation_column=None, seqformat=None,
library=None, default_marker=None):
"""Add data from infile to data dict as [marker, allele]=reads."""
......@@ -1032,7 +1043,7 @@ def add_allele_detection_args(parser):
def add_random_subsampling_args(parser):
group = parser.add_argument_group("random subsampling options (advanced)")
group.add_argument('-R', '--limit-reads', metavar="N", type=pos_int_arg,
group.add_argument('-Q', '--limit-reads', metavar="N", type=pos_int_arg,
default=sys.maxint,
help="simulate lower sequencing depth by randomly dropping reads down "
"to this maximum total number of reads for each sample")
......@@ -1113,7 +1124,7 @@ def add_input_output_args(parser, single_in=False, batch_support=False,
default=sys.stdout,
help="file to write output to (default: write to stdout)")
if report_out:
group.add_argument('-r', '--report', metavar="FILE",
group.add_argument('-R', '--report', metavar="FILE",
type=argparse.FileType('w'),
default=sys.stderr,
help="file to write a report to (default: write to stderr)")
......
......@@ -40,6 +40,16 @@ _DEF_MIN_SAMPLE_PCT = 80.
def add_sample_data(data, sample_data, sample_alleles, min_pct, min_abs):
# Check presence of all alleles.
for marker in sample_alleles:
allele = sample_alleles[marker]
if (marker, allele) not in sample_data:
raise ValueError(
"Missing allele %s of marker %s!" % (allele, marker))
elif 0 in sample_data[marker, allele]:
raise ValueError(
"Allele %s of marker %s has 0 reads!" % (allele, marker))
# Enter the read counts into data and check the thresholds.
for marker, sequence in sample_data:
if marker not in sample_alleles:
......@@ -55,9 +65,9 @@ def add_sample_data(data, sample_data, sample_alleles, min_pct, min_abs):
data[marker, allele][sequence][direction] = adjust_stats(
sample_data[marker, sequence][direction] * factors[direction],
data[marker, allele][sequence][direction])
if sum([count >= min_abs and count*factor >= min_pct
for count, factor in
zip(sample_data[marker, sequence], factors)]):
if sum(count >= min_abs and count*factor >= min_pct
for count, factor in
zip(sample_data[marker, sequence], factors)):
data[marker, allele][sequence][2] += 1
#add_sample_data
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment