Commit 97809466 authored by van den Berg

Add collect metrics script

This script currently only collects data from the cutadapt log file,
which contains the number of reads and bases before and after trimming.
parent fcfa28c2
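For reference, a sketch of the input the script assumes: cutadapt's one-row summary (assuming the pipeline runs cutadapt with --report=minimal), a tab-separated header line followed by a single line of values. For paired-end data the header is roughly:

    status  in_reads  in_bp  too_short  too_long  too_many_n  out_reads  w/adapters  qualtrim_bp  out_bp  w/adapters2  qualtrim2_bp  out2_bp

The script picks up in_reads, in_bp, out_reads, out_bp and out2_bp from this table and sums them across read groups.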
@@ -60,6 +60,7 @@ set_default("merge_stats", "src/merge_stats.py")
set_default("fastq_stats", "src/fastqc_stats.py")
set_default("stats_to_tsv", "src/stats_to_tsv.py")
set_default("py_wordcount", "src/pywc.py")
set_default("collect_metrics", "src/collect_metrics.py")
containers = {
"bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
@@ -118,7 +119,9 @@ rule all:
gvcf_tbi=expand("{sample}/vcf/{sample}.g.vcf.gz.tbi", sample=settings['samples']),
fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
fastqc_trimmed = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
#coverage_stats = coverage_stats,
cutadapt = (f"{sample}/pre_process/{sample}-{read_group}.txt" for read_group, sample in get_readgroup_per_sample()),
metrics = expand("{sample}/metrics.json", sample=settings['samples']),
coverage_stats = coverage_stats,
rule create_markdup_tmp:
@@ -444,73 +447,85 @@ rule vcfstats:
## collection
if "bedfile" in settings:
rule collectstats:
"""Collect all stats for a particular sample with beds"""
input:
mnum="{sample}/bams/{sample}.mapped.num",
mbnum="{sample}/bams/{sample}.mapped.basenum",
unum="{sample}/bams/{sample}.unique.num",
ubnum="{sample}/bams/{sample}.usable.basenum",
cov="{sample}/coverage/covstats.json",
colpy=settings["collect_stats"]
params:
sample_name="{sample}",
fthresh=settings["female_threshold"]
output:
"{sample}/{sample}.stats.json"
singularity: containers["vtools"]
shell: "python {input.colpy} --sample-name {params.sample_name} "
"--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
"--unique-num {input.unum} --usable-basenum {input.ubnum} "
"--female-threshold {params.fthresh} "
"{input.cov} > {output}"
else:
rule collectstats:
"""Collect all stats for a particular sample without beds"""
input:
preqc = "{sample}/pre_process/{sample}.preqc_count.json",
postq = "{sample}/pre_process/{sample}.postqc_count.json",
mnum = "{sample}/bams/{sample}.mapped.num",
mbnum = "{sample}/bams/{sample}.mapped.basenum",
unum = "{sample}/bams/{sample}.unique.num",
ubnum = "{sample}/bams/{sample}.usable.basenum",
colpy = settings["collect_stats"]
params:
sample_name = "{sample}",
fthresh = settings["female_threshold"]
output:
"{sample}/{sample}.stats.json"
singularity: containers["vtools"]
shell: "python {input.colpy} --sample-name {params.sample_name} "
"--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
"--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
"--unique-num {input.unum} --usable-basenum {input.ubnum} "
"--female-threshold {params.fthresh} "
"> {output}"
rule merge_stats:
"""Merge all stats of all samples"""
input:
cols=expand("{sample}/{sample}.stats.json", sample=settings['samples']),
vstat=expand("{sample}/vcf/{sample}.vcfstats.json", sample=settings['samples']),
mpy=settings["merge_stats"]
output:
stats="stats.json"
singularity: containers["vtools"]
shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
"> {output.stats}"
rule stats_tsv:
"""Convert stats.json to tsv"""
#if "bedfile" in settings:
# rule collectstats:
# """Collect all stats for a particular sample with beds"""
# input:
# mnum="{sample}/bams/{sample}.mapped.num",
# mbnum="{sample}/bams/{sample}.mapped.basenum",
# unum="{sample}/bams/{sample}.unique.num",
# ubnum="{sample}/bams/{sample}.usable.basenum",
# cov="{sample}/coverage/covstats.json",
# colpy=settings["collect_stats"]
# params:
# sample_name="{sample}",
# fthresh=settings["female_threshold"]
# output:
# "{sample}/{sample}.stats.json"
# singularity: containers["vtools"]
# shell: "python {input.colpy} --sample-name {params.sample_name} "
# "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
# "--unique-num {input.unum} --usable-basenum {input.ubnum} "
# "--female-threshold {params.fthresh} "
# "{input.cov} > {output}"
#else:
# rule collectstats:
# """Collect all stats for a particular sample without beds"""
# input:
# preqc = "{sample}/pre_process/{sample}.preqc_count.json",
# postq = "{sample}/pre_process/{sample}.postqc_count.json",
# mnum = "{sample}/bams/{sample}.mapped.num",
# mbnum = "{sample}/bams/{sample}.mapped.basenum",
# unum = "{sample}/bams/{sample}.unique.num",
# ubnum = "{sample}/bams/{sample}.usable.basenum",
# colpy = settings["collect_stats"]
# params:
# sample_name = "{sample}",
# fthresh = settings["female_threshold"]
# output:
# "{sample}/{sample}.stats.json"
# singularity: containers["vtools"]
# shell: "python {input.colpy} --sample-name {params.sample_name} "
# "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
# "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
# "--unique-num {input.unum} --usable-basenum {input.ubnum} "
# "--female-threshold {params.fthresh} "
# "> {output}"
#
rule collect_metrics:
    """Collect metrics for each sample"""
    input:
        cutadapt = lambda wildcards:
            ("{sample}/pre_process/{sample}-{read_group}.txt".format(sample=wildcards.sample,
                read_group=read_group) for read_group in get_readgroup(wildcards)),
        collect_metrics = settings["collect_metrics"]
    output: "{sample}/metrics.json"
    singularity: containers["python3"]
    shell: "python {input.collect_metrics} --sample {wildcards.sample} "
           "--cutadapt-summary {input.cutadapt} > {output}"
#rule merge_stats:
# """Merge all stats of all samples"""
# input:
# cols=expand("{sample}/{sample}.stats.json", sample=settings['samples']),
# vstat=expand("{sample}/vcf/{sample}.vcfstats.json", sample=settings['samples']),
# mpy=settings["merge_stats"]
# output:
# stats="stats.json"
# singularity: containers["vtools"]
# shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
# "> {output.stats}"
#
#
#rule stats_tsv:
# """Convert stats.json to tsv"""
# input:
# stats="stats.json",
# sc=settings["stats_to_tsv"]
# output:
# stats="stats.tsv"
# singularity: containers["python3"]
# shell: "python {input.sc} -i {input.stats} > {output.stats}"
rule multiqc:
@@ -519,7 +534,7 @@ rule multiqc:
Depends on the per-sample metrics files to forcefully run at end of pipeline
"""
input:
stats="stats.tsv"
stats=expand("{sample}/metrics.json", sample=settings["samples"])
params:
odir=".",
rdir="multiqc_report"
src/collect_metrics.py
#!/usr/bin/env python3
import argparse
import json
from collections import defaultdict


def collect_cutadapt_summary(data, files):
    """Read the cutadapt summary files, sum the results and add the
    relevant metrics to the data dictionary"""
    for filename in files:
        cutadapt = read_cutadapt(filename)
        data['preqc_reads'] += cutadapt['in_reads']
        data['preqc_bases'] += cutadapt['in_bp']
        data['postqc_reads'] += cutadapt['out_reads']
        # cutadapt reports the output basepairs for the forward (out_bp) and
        # reverse (out2_bp) reads separately, so add both (this assumes
        # paired-end data)
        data['postqc_bases'] += cutadapt['out_bp'] + cutadapt['out2_bp']


def read_cutadapt(filename):
    """Read a cutadapt summary file and return its contents as a dict"""
    with open(filename, 'rt') as fin:
        # The summary is a tab-separated header line followed by a single
        # line of values
        header = next(fin).strip().split()
        values = next(fin).strip().split()
        data = dict()
        # Store each value as an int where possible, otherwise as a string
        # (such as the status field)
        for field, value in zip(header, values):
            try:
                data[field] = int(value)
            except ValueError:
                data[field] = value
    return data


def main(args):
    # Data structure to store all data
    data = defaultdict(int)

    # Set the sample name
    data['sample_name'] = args.sample

    # Collect cutadapt data
    collect_cutadapt_summary(data, args.cutadapt_summary)

    print(json.dumps(data, indent=2))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--cutadapt-summary',
                        required=True,
                        nargs='+',
                        help='Cutadapt summary output')
    parser.add_argument('--sample',
                        required=True,
                        help='Name of the sample')
    args = parser.parse_args()
    main(args)
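For the micro test sample, the resulting metrics.json should look roughly like the sketch below; the read counts are the values asserted in the functional test that follows, while the base counts (marked ...) are placeholders depending on the test data:

    {
      "sample_name": "micro",
      "preqc_reads": 7720,
      "preqc_bases": ...,
      "postqc_reads": 7720,
      "postqc_bases": ...
    }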
@@ -210,8 +210,8 @@
must_not_contain:
- "rror"
files:
- path: "micro/coverage/covstats.png"
- path: "stats.tsv"
#- path: "micro/coverage/covstats.png"
#- path: "stats.tsv"
- path: "micro/pre_process/trimmed-micro-lib_01/micro-lib_01_R1_fastqc.zip"
- path: "micro/pre_process/trimmed-micro-lib_01/micro-lib_01_R2_fastqc.zip"
- path: "micro/pre_process/trimmed-micro-lib_02/micro-lib_02_R1_fastqc.zip"
@@ -222,12 +222,18 @@
- path: "micro/pre_process/raw-micro-lib_02/micro_rg2_R2_fastqc.zip"
- path: "micro/pre_process/micro-lib_01.txt"
- path: "micro/pre_process/micro-lib_02.txt"
- path: "micro/coverage/covstats.json"
- path: "micro/metrics.json"
contains:
- "\"frac_min_100x\": 0.97"
- "\"mean\": 137"
- "\"width_nonzero\": 16569"
- "sample_name\": \"micro"
- "preqc_reads\": 7720"
- "postqc_reads\": 7720"
#- path: "micro/coverage/covstats.json"
# contains:
# - "\"frac_min_100x\": 0.97"
# - "\"mean\": 137"
# - "\"width_nonzero\": 16569"
- path: "micro/pre_process/micro-lib_01.txt"
contains:
......