Commit c6cadb60 authored by van den Berg's avatar van den Berg
Browse files

Remove custom statistics in favor of picard

parent d563632b
......@@ -270,39 +270,6 @@ rule genotype_gather:
"--output {output.vcf} --output-type z 2> {log} && "
"bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
rule mapped_reads_bases:
"""Calculate number of mapped reads"""
input:
bam = rules.markdup.output.bam,
pywc = config["py_wordcount"]
output:
reads = "{sample}/bams/{sample}.mapped.num",
bases = "{sample}/bams/{sample}.mapped.basenum"
log:
"log/{sample}/mapped_reads_bases.log"
container:
containers["samtools-1.7-python-3.6"]
shell:
"samtools view -F 4 {input.bam} 2> {log} | cut -f 10 | python {input.pywc} "
"--reads {output.reads} --bases {output.bases}"
rule unique_reads_bases:
"""Calculate number of unique reads"""
input:
bam = rules.markdup.output.bam,
pywc = config["py_wordcount"]
output:
reads = "{sample}/bams/{sample}.unique.num",
bases = "{sample}/bams/{sample}.usable.basenum"
log:
"log/{sample}/unique_reads_bases.log"
container:
containers["samtools-1.7-python-3.6"]
shell:
"samtools view -F 4 -F 1024 {input.bam} 2> {log} | cut -f 10 | "
"python {input.pywc} --reads {output.reads} "
"--bases {output.bases} 2>> {log}"
rule fastqc_raw:
"""Run fastqc on raw fastq files"""
input:
......@@ -400,10 +367,6 @@ rule collect_cutadapt_summary:
rule collectstats:
"""Collect all stats for a particular sample"""
input:
mnum = rules.mapped_reads_bases.output.reads,
mbnum = rules.mapped_reads_bases.output.bases,
unum = rules.unique_reads_bases.output.reads,
ubnum = rules.unique_reads_bases.output.bases,
cov = rules.covstats.output.covj if "targetsfile" in config else [],
cutadapt = rules.collect_cutadapt_summary.output,
colpy = config["collect_stats"]
......@@ -417,8 +380,6 @@ rule collectstats:
containers["python3"]
shell:
"python {input.colpy} --sample-name {wildcards.sample} "
"--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
"--unique-num {input.unum} --usable-basenum {input.ubnum} "
"--female-threshold {params.fthresh} "
"--cutadapt {input.cutadapt} "
"--covstats {input.cov} > {output} 2> {log}"
......
......@@ -7,7 +7,6 @@ containers = {
'bcftools': 'docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4',
'bedtools-2.26-python-2.7': 'docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0',
'biopet-scatterregions': 'docker://quay.io/biocontainers/biopet-scatterregions:0.2--0',
'bwa-0.7.17-picard-2.22.8': 'docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:76d16eabff506ac13338d7f14644a0ad301b9d7e-0',
'bwa-0.7.17-samtools-1.10': 'docker://quay.io/biocontainers/mulled-v2-ad317f19f5881324e963f6a6d464d696a2825ab6:c59b7a73c87a9fe81737d5d628e10a3b5807f453-0',
'cutadapt': 'docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0',
'debian': 'docker://debian:buster-slim',
......@@ -17,7 +16,6 @@ containers = {
'multiqc': 'docker://quay.io/biocontainers/multiqc:1.8--py_2',
'picard': 'docker://quay.io/biocontainers/picard:2.22.8--0',
'python3': 'docker://python:3.6-slim',
'samtools-1.7-python-3.6': 'docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0',
'vtools': 'docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0'
}
......
......@@ -62,10 +62,6 @@ def determine_gender(covstat, fthresh):
def main(args):
mpnum = parse_num_file(args.mapped_num)
mpbnum = parse_num_file(args.mapped_basenum)
unum = parse_num_file(args.unique_num)
ubnum = parse_num_file(args.usable_basenum)
cutadapt = parse_json_file(args.cutadapt)
d = {
......@@ -73,11 +69,7 @@ def main(args):
"preqc_reads": cutadapt["preqc_reads"],
"preqc_bases": cutadapt["preqc_bases"],
"postqc_reads": cutadapt["postqc_reads"],
"postqc_bases": cutadapt["postqc_bases"],
"n_mapped_reads": mpnum,
"n_mapped_bases": mpbnum,
"n_usable_reads": unum,
"n_usable_bases": ubnum
"postqc_bases": cutadapt["postqc_bases"]
}
# If a covstats file was specified
......@@ -101,14 +93,6 @@ if __name__ == "__main__":
parser.add_argument("--sample-name", required=True,
help="Sample name")
parser.add_argument("--mapped-num", required=True,
help="Mapped num file")
parser.add_argument("--mapped-basenum", required=True,
help="Mapped basenum file")
parser.add_argument("--unique-num", required=True,
help="Unique num file")
parser.add_argument("--usable-basenum", required=True,
help="Usable basenum")
parser.add_argument("--female-threshold", default=0.6, type=float,
help="Female threshold of X/all cov")
parser.add_argument("--cutadapt", required=True,
......
......@@ -28,21 +28,6 @@ from collections import OrderedDict
from pathlib import Path
def get_vcf_stats(sample_name, vcfstats):
vcf_sample = next(x for x in vcfstats['samples'] if x['name'] == sample_name)
return {
"total_variants": vcf_sample['total_variants'],
"snps": vcf_sample['variant_types']['snps'],
"insertions": vcf_sample['variant_types']['insertions'],
"deletions": vcf_sample['variant_types']['deletions'],
"transversions": vcf_sample['transversions'],
"transitions": vcf_sample['transitions'],
"ti_tv_ratio": vcf_sample['ti_tv_ratio'],
"homozygous_variants": vcf_sample['genotypes']['hom_alt'],
"heterozygous_variants": vcf_sample['genotypes']['het']
}
def get_covstats(cov_d):
s_d = cov_d['_all']
return {
......@@ -79,10 +64,8 @@ if __name__ == "__main__":
"preqc_bases" : sample['preqc_bases'],
"postqc_reads": sample['postqc_reads'],
"postqc_bases": sample['postqc_bases'],
"mapped_reads": sample['n_mapped_reads'],
"mapped_bases": sample['n_mapped_bases'],
"usable_reads": sample['n_usable_reads'],
"usable_bases": sample['n_usable_bases'],
"mapped_reads": int(sample['picard_AlignmentSummaryMetrics']['PF_HQ_ALIGNED_READS']),
"mapped_bases": int(sample['picard_AlignmentSummaryMetrics']['PF_HQ_ALIGNED_BASES'])
})
if 'coverage' in sample:
sample_dict.update(get_covstats(sample['coverage']))
......
......@@ -123,8 +123,8 @@
- '"width_nonzero": 16569'
- path: stats.tsv
contains:
- "sample_name\tpreqc_reads\tpreqc_bases\tpostqc_reads\tpostqc_bases\tmapped_reads\tmapped_bases\tusable_reads\tusable_bases\tmedian_coverage"
- "micro\t15440\t2276743\t15440\t2274413\t15558\t2280294\t15515\t2275470\t136"
- "sample_name\tpreqc_reads\tpreqc_bases\tpostqc_reads\tpostqc_bases\tmapped_reads\tmapped_bases\tmedian_coverage"
- "micro\t15440\t2276743\t15440\t2274413\t15424\t2262944\t136"
- name: test-integration-gene-bedfile
tags:
......@@ -143,8 +143,8 @@
files:
- path: stats.tsv
contains:
- "sample_name\tpreqc_reads\tpreqc_bases\tpostqc_reads\tpostqc_bases\tmapped_reads\tmapped_bases\tusable_reads\tusable_bases\tmedian_coverage"
- "micro\t15440\t2276743\t15440\t2274413\t15558\t2280294\t15515\t2275470\t133.0"
- "sample_name\tpreqc_reads\tpreqc_bases\tpostqc_reads\tpostqc_bases\tmapped_reads\tmapped_bases\tmedian_coverage"
- "micro\t15440\t2276743\t15440\t2274413\t15424\t2262944\t133.0"
- name: test-integration-two-known-sites
tags:
......
......@@ -38,7 +38,7 @@ def test_stats_file_mapped_reads(workflow_dir):
values = next(fin).strip().split('\t')
data = dict(zip(header, values))
assert data['mapped_reads'] == '15558'
assert data['mapped_reads'] == '15424'
@pytest.mark.workflow('test-integration-no-cluster')
......@@ -52,35 +52,7 @@ def test_stats_file_mapped_bases(workflow_dir):
values = next(fin).strip().split('\t')
data = dict(zip(header, values))
assert data['mapped_bases'] == '2280294'
@pytest.mark.workflow('test-integration-no-cluster')
def test_stats_file_usable_reads(workflow_dir):
""" Read in the stats file and do some tests """
stats_file = 'stats.tsv'
full_path = workflow_dir / pathlib.Path(stats_file)
# Since we only have two lines in the test stats file, this works
with open(full_path, 'r') as fin:
header = next(fin).strip().split('\t')
values = next(fin).strip().split('\t')
data = dict(zip(header, values))
assert data['usable_reads'] == '15515'
@pytest.mark.workflow('test-integration-no-cluster')
def test_stats_file_usable_bases(workflow_dir):
""" Read in the stats file and do some tests """
stats_file = 'stats.tsv'
full_path = workflow_dir / pathlib.Path(stats_file)
# Since we only have two lines in the test stats file, this works
with open(full_path, 'r') as fin:
header = next(fin).strip().split('\t')
values = next(fin).strip().split('\t')
data = dict(zip(header, values))
assert data['usable_bases'] == '2275470'
assert data['mapped_bases'] == '2262944'
@pytest.mark.workflow('test-integration-no-cluster')
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment