Commit d51305c4 authored by van den Berg

Fix tests, remove vcf statistics from stats file

parent f5032967
@@ -70,7 +70,6 @@ containers = {
"bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
"cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
"debian": "docker://debian:buster-slim",
"fastq-count": "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0",
"fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
"gatk": "docker://broadinstitute/gatk3:3.7-0",
"multiqc": "docker://quay.io/biocontainers/multiqc:1.8--py_2",
@@ -126,7 +125,7 @@ rule all:
metrics_json = "metrics.json",
metrics_tsv = "metrics.tsv",
coverage_stats = coverage_stats,
- covstat_png=expand("{sample}/coverage/covstats.png", sample=settings['samples'])
+ #covstat_png=expand("{sample}/coverage/covstats.png", sample=settings['samples'])
rule create_markdup_tmp:
@@ -381,30 +380,6 @@ rule fastqc_postqc:
shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
- ## fastq-count
- rule fqcount_preqc:
-     """Calculate number of reads and bases before pre-processing"""
-     input:
-         r1=get_forward,
-         r2=get_reverse
-     output:
-         "{sample}/pre_process/{sample}-{read_group}.preqc_count.json"
-     singularity: containers["fastq-count"]
-     shell: "fastq-count {input.r1} {input.r2} > {output}"
-
- rule fqcount_postqc:
-     """Calculate number of reads and bases after pre-processing"""
-     input:
-         r1="{sample}/pre_process/{sample}-{read_group}_R1.fastq",
-         r2="{sample}/pre_process/{sample}-{read_group}_R2.fastq"
-     output:
-         "{sample}/pre_process/{sample}-{read_group}.postqc_count.json"
-     singularity: containers["fastq-count"]
-     shell: "fastq-count {input.r1} {input.r2} > {output}"
## coverages
rule covstats:
@@ -479,8 +454,6 @@ else:
rule collectstats:
"""Collect all stats for a particular sample without beds"""
input:
- preqc = "{sample}/pre_process/{sample}.preqc_count.json",
- postq = "{sample}/pre_process/{sample}.postqc_count.json",
mnum = "{sample}/bams/{sample}.mapped.num",
mbnum = "{sample}/bams/{sample}.mapped.basenum",
unum = "{sample}/bams/{sample}.unique.num",
@@ -494,7 +467,6 @@ else:
"{sample}/{sample}.stats.json"
singularity: containers["vtools"]
shell: "python {input.colpy} --sample-name {params.sample_name} "
"--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
"--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
"--unique-num {input.unum} --usable-basenum {input.ubnum} "
"--female-threshold {params.fthresh} "
@@ -534,7 +506,7 @@ rule merge_stats:
output:
stats="stats.json"
singularity: containers["vtools"]
shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
shell: "python {input.mpy} --vcfstats {input.vstat} --collectstats {input.cols} "
"> {output.stats}"
@@ -22,7 +22,7 @@ collect_stats.py
:license: AGPL-3.0
"""
- import click
+ import argparse
import json
@@ -31,24 +31,29 @@ def parse_json(path):
return json.load(handle)
- @click.command()
- @click.option("--vcfstats",
-               type=click.Path(exists=True, dir_okay=False, readable=True),
-               required=True,
-               help="Path to vcfstats json")
- @click.argument("collectstats",
-                 type=click.Path(exists=True, dir_okay=False, readable=True),
-                 nargs=-1,
-                 required=True)
def main(vcfstats, collectstats):
-     v = parse_json(vcfstats)
-     cs = [parse_json(x) for x in collectstats]
-     d = {
-         "sample_stats": cs,
-         "multisample_vcfstats": v
-     }
-     print(json.dumps(d))
+     data = dict()
+     data["sample_stats"] = list()
+     for vcf, stats in zip(vcfstats, collectstats):
+         v = parse_json(vcf)
+         cs = parse_json(stats)
+         cs['vcfstats'] = v
+         data["sample_stats"].append(cs)
+     print(json.dumps(data))
if __name__ == "__main__":
-     main()
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--vcfstats',
+                         nargs='+',
+                         required=True,
+                         help='Path to the vcfstats json for each sample')
+     parser.add_argument('--collectstats',
+                         nargs='+',
+                         required=True,
+                         help='Path to the collected stats for each sample')
+     args = parser.parse_args()
+     assert len(args.vcfstats) == len(args.collectstats)
+     main(args.vcfstats, args.collectstats)
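The rewrite changes the merge contract: instead of one multisample vcfstats file, the script now takes one vcfstats and one collectstats JSON per sample and nests the VCF statistics under each sample's entry. A minimal sketch of that merge logic, with made-up in-memory values standing in for the JSON files the real script loads from its command-line paths:

import json

# Illustrative per-sample inputs; the real script reads one JSON file
# per sample from the --vcfstats and --collectstats arguments (these
# values are invented for the sketch).
vcfstats = [{"total_variants": 17, "snps": 15}]
collectstats = [{"sample_name": "micro", "mapped_reads": 15558}]

# Pair each sample's VCF statistics with its collected stats and nest
# the former under the "vcfstats" key, mirroring the rewritten main().
data = {"sample_stats": []}
for v, cs in zip(vcfstats, collectstats):
    cs["vcfstats"] = v
    data["sample_stats"].append(cs)

print(json.dumps(data))

This is why the Snakefile's merge_stats rule above gains an explicit --collectstats flag: both argparse options are lists and must be the same length, which the assert enforces.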
@@ -72,7 +72,7 @@ if __name__ == "__main__":
sdicts = []
- vcfstats = orig_dict['multisample_vcfstats']
+ #vcfstats = orig_dict['multisample_vcfstats']
for sample in orig_dict['sample_stats']:
sname = sample['sample_name']
@@ -88,7 +88,7 @@ if __name__ == "__main__":
"usable_reads": sample['n_usable_reads'],
"usable_bases": sample['n_usable_bases'],
})
- sample_dict.update(get_vcf_stats(sname, vcfstats))
+ #sample_dict.update(get_vcf_stats(sname, vcfstats))
if "covstats" in sample:
for cov_d in sample['covstats']:
sample_dict.update(get_covstats(cov_d))
{
    "samples": {
        "micro": {
            "libraries": {
                "lib_01": {
                    "R1": "tests/data/fastq/micro_R1.fq.gz",
                    "R2": "tests/data/fastq/micro_R2.fq.gz"
                }
            }
        }
    },
    "reference": "tests/data/reference/ref.fa",
    "dbsnp": "tests/data/reference/database.vcf.gz",
    "known_sites": ["tests/data/reference/database.vcf.gz"],
    "bedfile": "tests/data/reference/target_genes.bed"
}
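For reference, the Snakefile's {sample} and {read_group} wildcards map directly onto the samples → libraries nesting in this configuration (e.g. the micro-lib_01 file names checked by the tests below). A small sketch of walking that structure; the config path is hypothetical, since the filename is not shown in this diff:

import json

# Hypothetical path: the test configs live under tests/data/, but the
# exact filename is not visible here.
with open("tests/data/config/sample_config.json") as handle:
    settings = json.load(handle)

# Each library becomes one {sample}-{read_group} unit in the pipeline.
for sample, props in settings["samples"].items():
    for read_group, fastqs in props["libraries"].items():
        print(f"{sample}-{read_group}", fastqs["R1"], fastqs["R2"])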
@@ -162,13 +162,12 @@
- "\"width_nonzero\": 16569"
- path: "stats.tsv"
contains:
- "sample_name\tpreqc_reads\tpreqc_bases\tpostqc_reads\tpostqc_bases\tmapped_reads\tmapped_bases\tusable_reads\tusable_bases\ttotal_variants\tsnps\tinsertions\tdeletions\ttransversions\ttransitions\tti_tv_ratio\thomozygous_variants\theterozygous_variants\tcovstats.json_median_coverage"
- "micro\t15440\t2276743\t15440\t2274413\t15558\t2280294\t15515\t2275470\t17\t15\t2\t0\t0\t15\tnan\t16\t1\t136"
- "sample_name\tpreqc_reads\tpreqc_bases\tpostqc_reads\tpostqc_bases\tmapped_reads\tmapped_bases\tusable_reads\tusable_bases\tcovstats.json_median_coverage"
- "micro\t15440\t2276743\t15440\t2274413\t15558\t2280294\t15515\t2275470\t136"
- name: test-integration-gene-bedfile
tags:
- integration
- - new
command: >
snakemake
--use-singularity
@@ -189,7 +188,7 @@
- path: "metrics.tsv"
contains:
- "sample_name\tpostqc_bases\tpostqc_reads\tpreqc_bases\tpreqc_reads"
- "micro\t2274413\t7720\t2276743\t7720"
- "micro\t2274413\t15440\t2276743\t15440"
- name: test-integration-two-known-sites
tags:
@@ -215,7 +214,6 @@
- name: test-integration-two-readgroups
tags:
- integration
- - new
command: >
snakemake
--use-singularity
@@ -233,8 +231,6 @@
must_not_contain:
- "rror"
files:
- #- path: "micro/coverage/covstats.png"
- #- path: "stats.tsv"
- path: "micro/pre_process/trimmed-micro-lib_01/micro-lib_01_R1_fastqc.zip"
- path: "micro/pre_process/trimmed-micro-lib_01/micro-lib_01_R2_fastqc.zip"
- path: "micro/pre_process/trimmed-micro-lib_02/micro-lib_02_R1_fastqc.zip"
@@ -250,15 +246,8 @@
- path: "micro/metrics.json"
contains:
- "sample_name\": \"micro"
- "preqc_reads\": 7720"
- "postqc_reads\": 7720"
#- path: "micro/coverage/covstats.json"
# contains:
# - "\"frac_min_100x\": 0.97"
# - "\"mean\": 137"
# - "\"width_nonzero\": 16569"
- "preqc_reads\": 15440"
- "postqc_reads\": 15440"
- path: "micro/pre_process/micro-lib_01.txt"
contains:
@@ -273,7 +262,6 @@
- name: test-integration-two-samples
tags:
- integration
- - new
command: >
snakemake
--use-singularity
@@ -303,4 +291,4 @@
contains:
- "sample_name\tpostqc_bases\tpostqc_reads\tpreqc_bases\tpreqc_reads"
- "micro1"
- "micro2\t1137997\t3860\t1139177\t3860"
- "micro2\t1137997\t7720\t1139177\t7720"
@@ -37,7 +37,7 @@ def test_stats_file_mapped_reads(workflow_dir):
values = next(fin).strip().split('\t')
data = dict(zip(header, values))
- assert data['mapped_reads'] == '15515'
+ assert data['mapped_reads'] == '15558'
@pytest.mark.workflow(name='test-integration-no-cluster')
@@ -51,7 +51,7 @@ def test_stats_file_mapped_bases(workflow_dir):
values = next(fin).strip().split('\t')
data = dict(zip(header, values))
- assert data['mapped_bases'] == '2275114'
+ assert data['mapped_bases'] == '2280294'
@pytest.mark.workflow(name='test-integration-no-cluster')
@@ -65,7 +65,7 @@ def test_stats_file_usable_reads(workflow_dir):
values = next(fin).strip().split('\t')
data = dict(zip(header, values))
- assert data['usable_reads'] == '15477'
+ assert data['usable_reads'] == '15515'
@pytest.mark.workflow(name='test-integration-no-cluster')
@@ -79,4 +79,4 @@ def test_stats_file_usable_bases(workflow_dir):
values = next(fin).strip().split('\t')
data = dict(zip(header, values))
- assert data['usable_bases'] == '2270739'
+ assert data['usable_bases'] == '2275470'