Skip to content
Snippets Groups Projects
Snakefile 23.6 KiB
Newer Older
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
Main Snakefile for the pipeline.

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
import json
Sander Bollen's avatar
Sander Bollen committed
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
from os.path import join, basename
from pathlib import Path
import itertools

from pyfaidx import Fasta
Sander Bollen's avatar
Sander Bollen committed

REFERENCE = config.get("REFERENCE")
if REFERENCE is None:
    raise ValueError("You must set --config REFERENCE=<path>")
Sander Bollen's avatar
Sander Bollen committed
if not Path(REFERENCE).exists():
Sander Bollen's avatar
Sander Bollen committed
    raise FileNotFoundError("Reference file {0} "
                            "does not exist.".format(REFERENCE))
Sander Bollen's avatar
Sander Bollen committed
DBSNP = config.get("DBSNP")
if DBSNP is None:
    raise ValueError("You must set --config DBSNP=<path>")
Sander Bollen's avatar
Sander Bollen committed
if not Path(DBSNP).exists():
    raise FileNotFoundError("{0} does not exist".format(DBSNP))
SCONFIG = config.get("SAMPLE_CONFIG")
if SCONFIG is None:
    raise ValueError("You must set --config SAMPLE_CONFIG=<path>")
if not Path(SCONFIG).exists():
    raise FileNotFoundError("{0} does not exist".format(SCONFIG))

KNOWN_SITES = config.get("KNOWN_SITES")
if KNOWN_SITES is None:
    raise ValueError("You must set --config KNOWN_SITES=<path>")

# The user can specify multiple known sites
KNOWN_SITES = KNOWN_SITES.split(',')
for filename in KNOWN_SITES:
    if not Path(filename).exists():
        raise FileNotFoundError("{0} does not exist".format(filename))


# these are all optional
BED = config.get("BED", "")  # comma-separated list of BED files
REFFLAT = config.get("REFFLAT", "")  # comma-separated list of refFlat files
Sander Bollen's avatar
Sander Bollen committed
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
Sander Bollen's avatar
Sander Bollen committed
MAX_BASES = config.get("MAX_BASES", "")
Sander Bollen's avatar
Sander Bollen committed

# Generate the input string for basrecalibration
BSQR_known_sites = ''
for argument, filename in zip(itertools.repeat('-knownSites'), KNOWN_SITES):
    BSQR_known_sites +=' {} {}'.format(argument, filename)
def fsrc_dir(*args):
    """Wrapper around snakemake's srcdir to work like os.path.join"""
    if len(args) == 1:
        return srcdir(args[0])
    return srcdir(join(*args))
Sander Bollen's avatar
Sander Bollen committed

covpy = fsrc_dir("src", "covstats.py")
colpy = fsrc_dir("src", "collect_stats.py")
mpy = fsrc_dir("src", "merge_stats.py")
seq = fsrc_dir("src", "seqtk.sh")
Sander Bollen's avatar
Sander Bollen committed
fqpy = fsrc_dir("src", "fastqc_stats.py")
tsvpy = fsrc_dir("src", "stats_to_tsv.py")
Sander Bollen's avatar
Sander Bollen committed
fqsc = fsrc_dir("src", "safe_fastqc.sh")
pywc = fsrc_dir("src", "pywc.py")
Sander Bollen's avatar
Sander Bollen committed

Sander Bollen's avatar
Sander Bollen committed
# sample config parsing
Sander Bollen's avatar
Sander Bollen committed
with open(config.get("SAMPLE_CONFIG")) as handle:
    SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()

Sander Bollen's avatar
Sander Bollen committed
if BED != "":
    BEDS = BED.split(",")
else:
    BEDS = []

if REFFLAT != "":
    REFFLATS = REFFLAT.split(",")
else:
    REFFLATS = []
Sander Bollen's avatar
Sander Bollen committed

BASE_BEDS = [basename(x) for x in BEDS]
Sander Bollen's avatar
Sander Bollen committed
BASE_REFFLATS = [basename(x) for x in REFFLATS]
Sander Bollen's avatar
Sander Bollen committed

containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
van den Berg's avatar
van den Berg committed
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:1.14--py36_0",
    "debian": "docker://debian:buster-slim",
    "fastq-count": "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.5--py36_0",
    "picard-2.14": "docker://quay.io/biocontainers/picard:2.14--py36_0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "seqtk-1.2-jq-1.6": "docker://quay.io/biocontainers/mulled-v2-13686261ac0aa5682c680670ff8cda7b09637943:d143450dec169186731bb4df6f045a3c9ee08eb6-0",
    "sickle": "docker://quay.io/biocontainers/sickle-trim:1.33--ha92aebf_4",
    "tabix": "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
def get_r(strand, wildcards):
    """Get fastq files on a single strand for a sample"""
Sander Bollen's avatar
Sander Bollen committed
    s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
Sander Bollen's avatar
Sander Bollen committed
    rs = []
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
Sander Bollen committed
        rs.append(s['libraries'][l][strand])
    return rs
Sander Bollen's avatar
Sander Bollen committed

Sander Bollen's avatar
Sander Bollen committed
get_r1 = partial(get_r, "R1")
get_r2 = partial(get_r, "R2")
Sander Bollen's avatar
Sander Bollen committed
def get_bedpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
    """Get absolute path of a bed file"""
    return next(x for x in BEDS if basename(x) == wildcards.bed)
Sander Bollen's avatar
Sander Bollen committed


def get_refflatpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
    """Get absolute path of a refflat file"""
Sander Bollen's avatar
Sander Bollen committed
    return next(x for x in REFFLATS if basename(x) == wildcards.ref)
Sander Bollen's avatar
Sander Bollen committed


Sander Bollen's avatar
Sander Bollen committed
def sample_gender(wildcards):
Sander Bollen's avatar
Sander Bollen committed
    """Get sample gender"""
Sander Bollen's avatar
Sander Bollen committed
    sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
    return sam.get("gender", "null")


Sander Bollen's avatar
Sander Bollen committed
def metrics(do_metrics=True):
    if not do_metrics:
        return ""

Sander Bollen's avatar
Sander Bollen committed
    fqcr = expand("{sample}/pre_process/raw_fastqc/.done.txt",
                  sample=SAMPLES)
    fqcm = expand("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
                  sample=SAMPLES)
    fqcp = expand("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
                  sample=SAMPLES)
    if len(REFFLATS) >= 1:
Sander Bollen's avatar
Sander Bollen committed
        coverage_stats = expand("{sample}/coverage/{ref}.coverages.tsv",
                                sample=SAMPLES, ref=BASE_REFFLATS)
    else:
        coverage_stats = []
Sander Bollen's avatar
Sander Bollen committed
    stats = "stats.json"
    return  fqcr + fqcm + fqcp + coverage_stats + [stats]
Sander Bollen's avatar
Sander Bollen committed
rule all:
    input:
        report="multiqc_report/multiqc_report.html",
Sander Bollen's avatar
Sander Bollen committed
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai",sample=SAMPLES),
        vcfs=expand("{sample}/vcf/{sample}.vcf.gz",sample=SAMPLES),
        stats=metrics()
Sander Bollen's avatar
Sander Bollen committed

Sander Bollen's avatar
Sander Bollen committed

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
rule genome:
    """Create genome file as used by bedtools"""
Sander Bollen's avatar
Sander Bollen committed
    input: REFERENCE
Sander Bollen's avatar
Sander Bollen committed
    output: "current.genome"
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
    """Merge all forward fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
    input: get_r1
Sander Bollen's avatar
Sander Bollen committed
    output: temp("{sample}/pre_process/{sample}.merged_R1.fastq.gz")
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
    shell: "cat {input} > {output}"

rule merge_r2:
    """Merge all reverse fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
    input: get_r2
Sander Bollen's avatar
Sander Bollen committed
    output: temp("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
    shell: "cat {input} > {output}"

Sander Bollen's avatar
Sander Bollen committed

rule seqtk_r1:
    """Either subsample or link forward fastq file"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        stats="{sample}/pre_process/{sample}.preqc_count.json",
        fastq="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
    params:
Sander Bollen's avatar
Sander Bollen committed
        max_bases=str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        fastq=temp("{sample}/pre_process/{sample}.sampled_R1.fastq.gz")
    singularity: containers["seqtk-1.2-jq-1.6"]
Sander Bollen's avatar
Sander Bollen committed
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed


rule seqtk_r2:
    """Either subsample or link reverse fastq file"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        stats = "{sample}/pre_process/{sample}.preqc_count.json",
        fastq = "{sample}/pre_process/{sample}.merged_R2.fastq.gz",
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
    params:
Sander Bollen's avatar
Sander Bollen committed
        max_bases =str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        fastq = temp("{sample}/pre_process/{sample}.sampled_R2.fastq.gz")
    singularity: containers["seqtk-1.2-jq-1.6"]
Sander Bollen's avatar
Sander Bollen committed
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
# contains original merged fastq files as input to prevent them from being prematurely deleted
Sander Bollen's avatar
Sander Bollen committed
rule sickle:
    """Trim fastq files"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        r1 = "{sample}/pre_process/{sample}.sampled_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}.sampled_R2.fastq.gz",
        rr1 = "{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        rr2 = "{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        r1 = temp("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.trimmed_R2.fastq"),
Sander Bollen's avatar
Sander Bollen committed
        s = "{sample}/pre_process/{sample}.trimmed_singles.fastq"
    singularity: containers["sickle"]
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
           "-p {output.r2} -s {output.s}"

rule cutadapt:
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        r1 = "{sample}/pre_process/{sample}.trimmed_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.trimmed_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        r1 = temp("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
    singularity: containers["cutadapt"]
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        r1 = "{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.cutadapt_R2.fastq",
        ref = REFERENCE,
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
    params:
        rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
Sander Bollen's avatar
Sander Bollen committed
    output: temp("{sample}/bams/{sample}.sorted.bam")
    singularity: containers["bwa-0.7.17-picard-2.18.7"]
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

rule markdup:
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        bam = "{sample}/bams/{sample}.sorted.bam",
Sander Bollen's avatar
Sander Bollen committed
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
    singularity: containers["picard-2.14"]
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "INPUT={input.bam} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
rule baserecal:
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        bam = "{sample}/bams/{sample}.markdup.bam",
Sander Bollen's avatar
Sander Bollen committed
        ref = REFERENCE,
    output:
Sander Bollen's avatar
Sander Bollen committed
        grp = "{sample}/bams/{sample}.baserecal.grp"
        known_sites = BSQR_known_sites
    singularity: containers["gatk"]
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
           "BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites}"
Sander Bollen's avatar
Sander Bollen committed

checkpoint scatterregions:
van den Berg's avatar
van den Berg committed
    """Scatter the reference genome"""
    input:
        ref = REFERENCE,
    output:
        directory("scatter")
van den Berg's avatar
van den Berg committed
    singularity: containers["biopet-scatterregions"]
    shell: "mkdir -p {output} && "
van den Berg's avatar
van den Berg committed
           "biopet-scatterregions "
           "--referenceFasta {input.ref} --scatterSize 1000000000 "
van den Berg's avatar
van den Berg committed
           "--outputDir scatter"

rule gvcf_scatter:
    """Run HaplotypeCaller in GVCF mode by chunk"""
    input:
Sander Bollen's avatar
Sander Bollen committed
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
Sander Bollen's avatar
Sander Bollen committed
        dbsnp=DBSNP,
        ref=REFERENCE,
van den Berg's avatar
van den Berg committed
        region="scatter/scatter-{chunk}.bed"
Sander Bollen's avatar
Sander Bollen committed
    output:
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
    wildcard_constraints:
        chunk="[0-9]+"
    singularity: containers["gatk"]
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
           "-T HaplotypeCaller -ERC GVCF -I "
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
van den Berg's avatar
van den Berg committed
           "-L '{input.region}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
Sander Bollen's avatar
Sander Bollen committed
           "-BQSR {input.bqsr}"
Sander Bollen's avatar
Sander Bollen committed


rule genotype_scatter:
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
    input:
        gvcf = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz",
        tbi = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi",
        ref=REFERENCE
    output:
van den Berg's avatar
van den Berg committed
        vcf="{sample}/vcf/{sample}.{chunk}.vcf.gz",
        vcf_tbi="{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi"
    wildcard_constraints:
        chunk="[0-9]+"
    singularity: containers["gatk"]
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
           "GenotypeGVCFs -R {input.ref} "
           "-V {input.gvcf} -o '{output.vcf}'"
def aggregate_input(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


rule genotype_gather:
    """Gather all genotyping VCFs"""
    input:
        vcfs = aggregate_input
Sander Bollen's avatar
Sander Bollen committed
    output:
        vcf = "{sample}/vcf/{sample}.vcf.gz"
    singularity: containers["bcftools"]
    shell: "bcftools concat {input.vcfs} --allow-overlaps --output {output.vcf} "
           "--output-type z"


rule genotype_gather_tbi:
    """Index genotyped vcf file"""
    input:
        vcf = "{sample}/vcf/{sample}.vcf.gz"
        tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
    singularity: containers["tabix"]
    shell: "tabix -pvcf {input.vcf}"
Sander Bollen's avatar
Sander Bollen committed
## bam metrics
rule mapped_reads_bases:
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
    input:
        bam="{sample}/bams/{sample}.sorted.bam",
        pywc=pywc
Sander Bollen's avatar
Sander Bollen committed
    output:
        reads="{sample}/bams/{sample}.mapped.num",
        bases="{sample}/bams/{sample}.mapped.basenum"
    singularity: containers["samtools-1.7-python-3.6"]
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
rule unique_reads_bases:
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
    input:
        bam="{sample}/bams/{sample}.markdup.bam",
        pywc=pywc
Sander Bollen's avatar
Sander Bollen committed
    output:
        reads="{sample}/bams/{sample}.unique.num",
        bases="{sample}/bams/{sample}.usable.basenum"
    singularity: containers["samtools-1.7-python-3.6"]
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed


## fastqc
Sander Bollen's avatar
Sander Bollen committed
rule fastqc_raw:
    """
    Run fastqc on raw fastq files
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
        r2=get_r2
    params:
Sander Bollen's avatar
Sander Bollen committed
        odir="{sample}/pre_process/raw_fastqc"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        aux="{sample}/pre_process/raw_fastqc/.done.txt"
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
    shell: "fastqc --threads 4 --nogroup -o {params.odir} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
           "&& echo 'done' > {output.aux}"
Sander Bollen's avatar
Sander Bollen committed
rule fastqc_merged:
    Run fastqc on merged fastq files
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
    params:
Sander Bollen's avatar
Sander Bollen committed
        odir="{sample}/pre_process/merged_fastqc"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip"
    singularity: containers["fastqc"]
Sander Bollen's avatar
Sander Bollen committed
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
rule fastqc_postqc:
    """
    Run fastqc on fastq files post pre-processing
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
    perl missing in the container of 0.11.5
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
    params:
Sander Bollen's avatar
Sander Bollen committed
        odir="{sample}/pre_process/postqc_fastqc"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip"
    singularity: containers["fastqc"]
Sander Bollen's avatar
Sander Bollen committed
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed


## fastq-count

rule fqcount_preqc:
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        "{sample}/pre_process/{sample}.preqc_count.json"
    singularity: containers["fastq-count"]
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed


rule fqcount_postqc:
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        "{sample}/pre_process/{sample}.postqc_count.json"
    singularity: containers["fastq-count"]
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed


Sander Bollen's avatar
Sander Bollen committed
# fastqc stats
rule fastqc_stats:
    """Collect fastq stats for a sample in json format"""
    input:
Sander Bollen's avatar
Sander Bollen committed
        preqc_r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        preqc_r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip",
        postqc_r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        postqc_r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip",
Sander Bollen's avatar
Sander Bollen committed
        sc=fqpy
    singularity: containers["python3"]
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        "{sample}/pre_process/fastq_stats.json"
    shell: "python {input.sc} --preqc-r1 {input.preqc_r1} "
           "--preqc-r2 {input.preqc_r2} "
           "--postqc-r1 {input.postqc_r1} "
Sander Bollen's avatar
Sander Bollen committed
           "--postqc-r2 {input.postqc_r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
## coverages

rule covstats:
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
Sander Bollen's avatar
Sander Bollen committed
        covpy=covpy,
        bed=get_bedpath
    params:
        subt="Sample {sample}"
    output:
Sander Bollen's avatar
Sander Bollen committed
        covj="{sample}/coverage/{bed}.covstats.json",
        covp="{sample}/coverage/{bed}.covstats.png"
    singularity: containers["bedtools-2.26-python-2.7"]
Sander Bollen's avatar
Sander Bollen committed
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed


rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
        ref=get_refflatpath
    output:
Sander Bollen's avatar
Sander Bollen committed
        tsv="{sample}/coverage/{ref}.coverages.tsv"
    singularity: containers["vtools"]
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
## vcfstats

rule vcfstats:
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
    input:
        vcf="{sample}/vcf/{sample}.vcf.gz",
        tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
    output:
        stats="{sampel}/vcf/{sample}.vcfstats.json"
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
## collection
if len(BASE_BEDS) >= 1:
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
Sander Bollen's avatar
Sander Bollen committed
            preqc="{sample}/pre_process/{sample}.preqc_count.json",
            postq="{sample}/pre_process/{sample}.postqc_count.json",
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
            cov=expand("{{sample}}/coverage/{bed}.covstats.json",
                       bed=BASE_BEDS),
            colpy=colpy
        params:
            sample_name="{sample}",
            fthresh=FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
            "{sample}/{sample}.stats.json"
        singularity: containers["vtools"]
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc} {input.cov} > {output}"
Sander Bollen's avatar
Sander Bollen committed
        """Collect all stats for a particular sample without beds"""
Sander Bollen's avatar
Sander Bollen committed
        input:
Sander Bollen's avatar
Sander Bollen committed
            preqc = "{sample}/pre_process/{sample}.preqc_count.json",
            postq = "{sample}/pre_process/{sample}.postqc_count.json",
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
            colpy = colpy
        params:
            sample_name = "{sample}",
            fthresh = FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
            "{sample}/{sample}.stats.json"
        singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc}  > {output}"
Sander Bollen's avatar
Sander Bollen committed

rule merge_stats:
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        cols=expand("{sample}/{sample}.stats.json", sample=SAMPLES),
        vstat=expand("{sample}/vcf/{sample}.vcfstats.json", sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
        mpy=mpy
    output:
Sander Bollen's avatar
Sander Bollen committed
        stats="stats.json"
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
           "> {output.stats}"
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
Sander Bollen's avatar
Sander Bollen committed
        stats="stats.json",
        sc=tsvpy
    output:
Sander Bollen's avatar
Sander Bollen committed
        stats="stats.tsv"
    singularity: containers["python3"]
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
rule multiqc:
    """
    Create multiQC report
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
    """
    input:
Sander Bollen's avatar
Sander Bollen committed
        stats="stats.tsv"
Sander Bollen's avatar
Sander Bollen committed
    params:
Sander Bollen's avatar
Sander Bollen committed
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        report="multiqc_report/multiqc_report.html"
    singularity: containers["multiqc"]
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output.report}"