Commit 05f10363 authored by van den Berg's avatar van den Berg
Browse files

Replace biocontainer wc with pywc.py

The wc which is present in the biocontainers has a maximum integer size
of just 2^32, which means the counter resets every 4 billion bases. This
commit adds a custom python script to solve issue #39.

While the python tool is slower than wc, it has the advantage that we
only have to parse the bam file once, since it can count both the lines
and the number of bases in one go. That should save a bit on the time.

This commit also updates samtools from version 1.6 to 1.7.
parent 160b4e21
......@@ -83,6 +83,7 @@ seq = fsrc_dir("src", "seqtk.sh")
fqpy = fsrc_dir("src", "fastqc_stats.py")
tsvpy = fsrc_dir("src", "stats_to_tsv.py")
fqsc = fsrc_dir("src", "safe_fastqc.sh")
pywc = fsrc_dir("src", "pywc.py")
# sample config parsing
with open(config.get("SAMPLE_CONFIG")) as handle:
......@@ -479,46 +480,30 @@ rule split_vcf:
## bam metrics
rule mapped_num:
rule mapped_reads_bases:
"""Calculate number of mapped reads"""
input:
bam="{sample}/bams/{sample}.sorted.bam"
output:
num="{sample}/bams/{sample}.mapped.num"
singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
shell: "samtools view -F 4 {input.bam} | wc -l > {output.num}"
rule mapped_basenum:
"""Calculate number of mapped bases"""
input:
bam="{sample}/bams/{sample}.sorted.bam"
bam="{sample}/bams/{sample}.sorted.bam",
pywc=pywc
output:
num="{sample}/bams/{sample}.mapped.basenum"
singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
shell: "samtools view -F 4 {input.bam} | cut -f10 | wc -c > {output.num}"
reads="{sample}/bams/{sample}.mapped.num",
bases="{sample}/bams/{sample}.mapped.basenum"
singularity: "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0"
shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
"--reads {output.reads} --bases {output.bases}"
rule unique_num:
rule unique_reads_bases:
"""Calculate number of unique reads"""
input:
bam="{sample}/bams/{sample}.markdup.bam"
output:
num="{sample}/bams/{sample}.unique.num"
singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
shell: "samtools view -F 4 -F 1024 {input.bam} | wc -l > {output.num}"
rule usable_basenum:
"""Calculate number of bases on unique reads"""
input:
bam="{sample}/bams/{sample}.markdup.bam"
bam="{sample}/bams/{sample}.markdup.bam",
pywc=pywc
output:
num="{sample}/bams/{sample}.usable.basenum"
singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f10 | wc -c > "
"{output.num}"
reads="{sample}/bams/{sample}.unique.num",
bases="{sample}/bams/{sample}.usable.basenum"
singularity: "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0"
shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | python {input.pywc} "
"--reads {output.reads} --bases {output.bases}"
## fastqc
......
#!/usr/bin/env python3
import argparse
import sys
def main(args):
bases = 0
for reads, line in enumerate(sys.stdin, 1):
bases += len(line.strip())
with open(args.reads, 'w') as fout:
print(reads, file=fout)
with open(args.bases, 'w') as fout:
print(bases, file=fout)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--reads', required=True)
parser.add_argument('--bases', required=True)
arguments = parser.parse_args()
main(arguments)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment