Commit f5032967 authored by van den Berg
Browse files

Integrate cutadapt summary in stats.tsv

parent 00ee0dcd
......@@ -112,7 +112,8 @@ def coverage_stats(wildcards):
rule all:
input:
multiqc="multiqc_report/multiqc_report.html",
#stats = "stats.json",
stats = "stats.json",
stats_tsv = "stats.tsv",
bais=expand("{sample}/bams/{sample}.markdup.bam.bai", sample=settings['samples']),
vcfs=expand("{sample}/vcf/{sample}.vcf.gz", sample=settings['samples']),
vcf_tbi=expand("{sample}/vcf/{sample}.vcf.gz.tbi", sample=settings['samples']),
......@@ -125,6 +126,7 @@ rule all:
metrics_json = "metrics.json",
metrics_tsv = "metrics.tsv",
coverage_stats = coverage_stats,
covstat_png=expand("{sample}/coverage/covstats.png", sample=settings['samples'])
rule create_markdup_tmp:
......@@ -449,52 +451,56 @@ rule vcfstats:
shell: "vtools-stats -i {input.vcf} > {output.stats}"
## collection
#if "bedfile" in settings:
# rule collectstats:
# """Collect all stats for a particular sample with beds"""
# input:
# mnum="{sample}/bams/{sample}.mapped.num",
# mbnum="{sample}/bams/{sample}.mapped.basenum",
# unum="{sample}/bams/{sample}.unique.num",
# ubnum="{sample}/bams/{sample}.usable.basenum",
# cov="{sample}/coverage/covstats.json",
# colpy=settings["collect_stats"]
# params:
# sample_name="{sample}",
# fthresh=settings["female_threshold"]
# output:
# "{sample}/{sample}.stats.json"
# singularity: containers["vtools"]
# shell: "python {input.colpy} --sample-name {params.sample_name} "
# "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
# "--unique-num {input.unum} --usable-basenum {input.ubnum} "
# "--female-threshold {params.fthresh} "
# "{input.cov} > {output}"
#else:
# rule collectstats:
# """Collect all stats for a particular sample without beds"""
# input:
# preqc = "{sample}/pre_process/{sample}.preqc_count.json",
# postq = "{sample}/pre_process/{sample}.postqc_count.json",
# mnum = "{sample}/bams/{sample}.mapped.num",
# mbnum = "{sample}/bams/{sample}.mapped.basenum",
# unum = "{sample}/bams/{sample}.unique.num",
# ubnum = "{sample}/bams/{sample}.usable.basenum",
# colpy = settings["collect_stats"]
# params:
# sample_name = "{sample}",
# fthresh = settings["female_threshold"]
# output:
# "{sample}/{sample}.stats.json"
# singularity: containers["vtools"]
# shell: "python {input.colpy} --sample-name {params.sample_name} "
# "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
# "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
# "--unique-num {input.unum} --usable-basenum {input.ubnum} "
# "--female-threshold {params.fthresh} "
# "> {output}"
#
# collection
if "bedfile" in settings:
rule collectstats:
"""Collect all stats for a particular sample with beds"""
input:
mnum="{sample}/bams/{sample}.mapped.num",
mbnum="{sample}/bams/{sample}.mapped.basenum",
unum="{sample}/bams/{sample}.unique.num",
ubnum="{sample}/bams/{sample}.usable.basenum",
cov="{sample}/coverage/covstats.json",
cutadapt = "{sample}/metrics.json",
colpy=settings["collect_stats"]
params:
sample_name="{sample}",
fthresh=settings["female_threshold"]
output:
"{sample}/{sample}.stats.json"
singularity: containers["vtools"]
shell: "python {input.colpy} --sample-name {params.sample_name} "
"--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
"--unique-num {input.unum} --usable-basenum {input.ubnum} "
"--female-threshold {params.fthresh} "
"--cutadapt {input.cutadapt} "
"{input.cov} > {output}"
else:
rule collectstats:
"""Collect all stats for a particular sample without beds"""
input:
preqc = "{sample}/pre_process/{sample}.preqc_count.json",
postq = "{sample}/pre_process/{sample}.postqc_count.json",
mnum = "{sample}/bams/{sample}.mapped.num",
mbnum = "{sample}/bams/{sample}.mapped.basenum",
unum = "{sample}/bams/{sample}.unique.num",
ubnum = "{sample}/bams/{sample}.usable.basenum",
cutadapt = "{sample}/metrics.json",
colpy = settings["collect_stats"]
params:
sample_name = "{sample}",
fthresh = settings["female_threshold"]
output:
"{sample}/{sample}.stats.json"
singularity: containers["vtools"]
shell: "python {input.colpy} --sample-name {params.sample_name} "
"--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
"--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
"--unique-num {input.unum} --usable-basenum {input.ubnum} "
"--female-threshold {params.fthresh} "
"--cutadapt {input.cutadapt} "
"> {output}"
rule collect_metrics:
""" Collect metrics for each sample """
input:
......@@ -519,28 +525,28 @@ rule merge_metrics:
shell: "python {input.merge_metrics} --metrics {input.metrics} "
"--json {output.json} --tsv {output.tsv}"
#rule merge_stats:
# """Merge all stats of all samples"""
# input:
# cols=expand("{sample}/{sample}.stats.json", sample=settings['samples']),
# vstat=expand("{sample}/vcf/{sample}.vcfstats.json", sample=settings['samples']),
# mpy=settings["merge_stats"]
# output:
# stats="stats.json"
# singularity: containers["vtools"]
# shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
# "> {output.stats}"
#
#
#rule stats_tsv:
# """Convert stats.json to tsv"""
# input:
# stats="stats.json",
# sc=settings["stats_to_tsv"]
# output:
# stats="stats.tsv"
# singularity: containers["python3"]
# shell: "python {input.sc} -i {input.stats} > {output.stats}"
rule merge_stats:
"""Merge all stats of all samples"""
input:
cols=expand("{sample}/{sample}.stats.json", sample=settings['samples']),
vstat=expand("{sample}/vcf/{sample}.vcfstats.json", sample=settings['samples']),
mpy=settings["merge_stats"]
output:
stats="stats.json"
singularity: containers["vtools"]
shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
"> {output.stats}"
rule stats_tsv:
"""Convert stats.json to tsv"""
input:
stats="stats.json",
sc=settings["stats_to_tsv"]
output:
stats="stats.tsv"
singularity: containers["python3"]
shell: "python {input.sc} -i {input.stats} > {output.stats}"
rule multiqc:
......
......@@ -10,9 +10,11 @@ def collect_cutadapt_summary(data, files):
metrics to the data dictionary """
for filename in files:
cutadapt = read_cutadapt(filename)
data['preqc_reads'] += cutadapt['in_reads']
# Times 2 since cutadapt outputs read pairs
data['preqc_reads'] += cutadapt['in_reads']*2
data['preqc_bases'] += cutadapt['in_bp']
data['postqc_reads'] += cutadapt['out_reads']
# Times 2 since cutadapt outputs read pairs
data['postqc_reads'] += cutadapt['out_reads']*2
# For some reason, cutadapt reports the output basepairs separately for
# forward and reverse reads
data['postqc_bases'] += cutadapt['out_bp'] + cutadapt['out2_bp']
......
......@@ -86,17 +86,21 @@ def determine_gender(covstat, fthresh):
type=click.FLOAT,
default=0.6,
help="Female threshold of X/all cov")
@click.option("--cutadapt",
type=click.Path(dir_okay=False, exists=True, readable=True),
help="Cutadapt summary output")
@click.argument("covstats",
type=click.Path(dir_okay=False, exists=True, readable=True),
nargs=-1)
def main(sample_name, mapped_num, mapped_basenum,
unique_num, usable_basenum, female_threshold, covstats):
unique_num, usable_basenum, female_threshold, covstats, cutadapt):
mpnum = parse_num_file(mapped_num)
mpbnum = parse_num_file(mapped_basenum)
unum = parse_num_file(unique_num)
ubnum = parse_num_file(usable_basenum)
cutadapt = parse_json_file(cutadapt)
covl = []
for c in covstats:
......@@ -110,6 +114,10 @@ def main(sample_name, mapped_num, mapped_basenum,
d = {
"sample_name": sample_name,
"preqc_reads": cutadapt["preqc_reads"],
"preqc_bases": cutadapt["preqc_bases"],
"postqc_reads": cutadapt["postqc_reads"],
"postqc_bases": cutadapt["postqc_bases"],
"n_mapped_reads": mpnum,
"n_mapped_bases": mpbnum,
"n_usable_reads": unum,
......
......@@ -79,10 +79,14 @@ if __name__ == "__main__":
sample_dict = OrderedDict()
sample_dict.update({
"sample_name": sname,
"preqc_reads" : sample['preqc_reads'],
"preqc_bases" : sample['preqc_bases'],
"postqc_reads": sample['postqc_reads'],
"postqc_bases": sample['postqc_bases'],
"mapped_reads": sample['n_mapped_reads'],
"mapped_bases": sample['n_mapped_bases'],
"usable_reads": sample['n_usable_reads'],
"usable_bases": sample['n_usable_bases']
"usable_bases": sample['n_usable_bases'],
})
sample_dict.update(get_vcf_stats(sname, vcfstats))
if "covstats" in sample:
......
......@@ -163,10 +163,33 @@
- path: "stats.tsv"
contains:
- "sample_name\tpreqc_reads\tpreqc_bases\tpostqc_reads\tpostqc_bases\tmapped_reads\tmapped_bases\tusable_reads\tusable_bases\ttotal_variants\tsnps\tinsertions\tdeletions\ttransversions\ttransitions\tti_tv_ratio\thomozygous_variants\theterozygous_variants\tcovstats.json_median_coverage"
- "micro\t15440\t2276743\t15398\t2269171\t15515\t2275114\t15477\t2270739\t17\t15\t2\t0\t0\t15\tnan\t16\t1\t136"
- "micro\t15440\t2276743\t15440\t2274413\t15558\t2280294\t15515\t2275470\t17\t15\t2\t0\t0\t15\tnan\t16\t1\t136"
- "\tcovstats.json_modal_coverage\tcovstats.json_horizontal_coverage\t"
- "\t137\t1.0\t"
- name: test-integration-gene-bedfile
tags:
- integration
- new
command: >
snakemake
--use-singularity
--singularity-prefix /tmp/singularity
--singularity-args ' --cleanenv --bind /tmp'
--jobs 1 -w 120
-r -p -s Snakefile
--config
CONFIG_JSON=tests/data/config/sample_config_bed_target.json
stderr:
contains:
- "Job counts"
- "localrule all:"
- "(100%) done"
must_not_contain:
- "rror"
files:
- path: "metrics.tsv"
contains:
- "sample_name\tpostqc_bases\tpostqc_reads\tpreqc_bases\tpreqc_reads"
- "micro\t2274413\t7720\t2276743\t7720"
- name: test-integration-two-known-sites
tags:
......@@ -192,6 +215,7 @@
- name: test-integration-two-readgroups
tags:
- integration
- new
command: >
snakemake
--use-singularity
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment