Skip to content
Snippets Groups Projects
Snakefile 26.9 KiB
Newer Older
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
Main Snakefile for the pipeline.

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
import json
Sander Bollen's avatar
Sander Bollen committed
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
from os.path import join, basename
from pathlib import Path
import itertools

from pyfaidx import Fasta
Sander Bollen's avatar
Sander Bollen committed

REFERENCE = config.get("REFERENCE")
if REFERENCE is None:
    raise ValueError("You must set --config REFERENCE=<path>")
Sander Bollen's avatar
Sander Bollen committed
if not Path(REFERENCE).exists():
Sander Bollen's avatar
Sander Bollen committed
    raise FileNotFoundError("Reference file {0} "
                            "does not exist.".format(REFERENCE))
Sander Bollen's avatar
Sander Bollen committed
DBSNP = config.get("DBSNP")
if DBSNP is None:
    raise ValueError("You must set --config DBSNP=<path>")
Sander Bollen's avatar
Sander Bollen committed
if not Path(DBSNP).exists():
    raise FileNotFoundError("{0} does not exist".format(DBSNP))
SCONFIG = config.get("SAMPLE_CONFIG")
if SCONFIG is None:
    raise ValueError("You must set --config SAMPLE_CONFIG=<path>")
if not Path(SCONFIG).exists():
    raise FileNotFoundError("{0} does not exist".format(SCONFIG))

KNOWN_SITES = config.get("KNOWN_SITES")
if KNOWN_SITES is None:
    raise ValueError("You must set --config KNOWN_SITES=<path>")

# The user can specify multiple known sites
KNOWN_SITES = KNOWN_SITES.split(',')
for filename in KNOWN_SITES:
    if not Path(filename).exists():
        raise FileNotFoundError("{0} does not exist".format(filename))


# these are all optional
BED = config.get("BED", "")  # comma-separated list of BED files
REFFLAT = config.get("REFFLAT", "")  # comma-separated list of refFlat files
Sander Bollen's avatar
Sander Bollen committed
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
FASTQ_COUNT = config.get("FASTQ_COUNT")
Sander Bollen's avatar
Sander Bollen committed
MAX_BASES = config.get("MAX_BASES", "")
Sander Bollen's avatar
Sander Bollen committed

# Generate the input string for basrecalibration
BSQR_known_sites = ''
for argument, filename in zip(itertools.repeat('-knownSites'), KNOWN_SITES):
    BSQR_known_sites +=' {} {}'.format(argument, filename)
def fsrc_dir(*args):
    """Wrapper around snakemake's srcdir to work like os.path.join"""
    if len(args) == 1:
        return srcdir(args[0])
    return srcdir(join(*args))
Sander Bollen's avatar
Sander Bollen committed

covpy = fsrc_dir("src", "covstats.py")
colpy = fsrc_dir("src", "collect_stats.py")
mpy = fsrc_dir("src", "merge_stats.py")
seq = fsrc_dir("src", "seqtk.sh")
Sander Bollen's avatar
Sander Bollen committed
fqpy = fsrc_dir("src", "fastqc_stats.py")
tsvpy = fsrc_dir("src", "stats_to_tsv.py")
Sander Bollen's avatar
Sander Bollen committed
fqsc = fsrc_dir("src", "safe_fastqc.sh")
Sander Bollen's avatar
Sander Bollen committed

Sander Bollen's avatar
Sander Bollen committed
# sample config parsing
Sander Bollen's avatar
Sander Bollen committed
with open(config.get("SAMPLE_CONFIG")) as handle:
    SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()

Sander Bollen's avatar
Sander Bollen committed
if BED != "":
    BEDS = BED.split(",")
else:
    BEDS = []

if REFFLAT != "":
    REFFLATS = REFFLAT.split(",")
else:
    REFFLATS = []
Sander Bollen's avatar
Sander Bollen committed

BASE_BEDS = [basename(x) for x in BEDS]
Sander Bollen's avatar
Sander Bollen committed
BASE_REFFLATS = [basename(x) for x in REFFLATS]
Sander Bollen's avatar
Sander Bollen committed


def split_genome(ref, approx_n_chunks=100):
    """
    Split genome in chunks.

    Chunks are strings in the format: `<ctg>:<start>-<end>`
    These follow the region string format as used by htslib,
    which uses _1_-based indexing.
    See: http://www.htslib.org/doc/tabix.html
    """
    fa = Fasta(ref)
    tot_size = sum([len(x) for x in fa.records.values()])
    chunk_size = tot_size//approx_n_chunks
    chunks = []
    for chrom_name, chrom_value in fa.records.items():
        pos = 1
        while pos <= len(chrom_value):
            end = pos+chunk_size
            if end <= len(chrom_value):
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, end)
            else:
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, len(chrom_value))
            chunks.append(chunk)
            pos = end
    return chunks

CHUNKS = split_genome(REFERENCE)


Sander Bollen's avatar
Sander Bollen committed

Sander Bollen's avatar
Sander Bollen committed
def get_r(strand, wildcards):
    """Get fastq files on a single strand for a sample"""
Sander Bollen's avatar
Sander Bollen committed
    s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
Sander Bollen's avatar
Sander Bollen committed
    rs = []
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
Sander Bollen committed
        rs.append(s['libraries'][l][strand])
    return rs
Sander Bollen's avatar
Sander Bollen committed

Sander Bollen's avatar
Sander Bollen committed
get_r1 = partial(get_r, "R1")
get_r2 = partial(get_r, "R2")
Sander Bollen's avatar
Sander Bollen committed
def get_bedpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
    """Get absolute path of a bed file"""
    return next(x for x in BEDS if basename(x) == wildcards.bed)
Sander Bollen's avatar
Sander Bollen committed


def get_refflatpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
    """Get absolute path of a refflat file"""
Sander Bollen's avatar
Sander Bollen committed
    return next(x for x in REFFLATS if basename(x) == wildcards.ref)
Sander Bollen's avatar
Sander Bollen committed


Sander Bollen's avatar
Sander Bollen committed
def sample_gender(wildcards):
Sander Bollen's avatar
Sander Bollen committed
    """Get sample gender"""
Sander Bollen's avatar
Sander Bollen committed
    sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
    return sam.get("gender", "null")


Sander Bollen's avatar
Sander Bollen committed
def metrics(do_metrics=True):
    if not do_metrics:
        return ""

Sander Bollen's avatar
Sander Bollen committed
    fqcr = expand("{sample}/pre_process/raw_fastqc/.done.txt",
                  sample=SAMPLES)
    fqcm = expand("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
                  sample=SAMPLES)
    fqcp = expand("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
                  sample=SAMPLES)
    if len(REFFLATS) >= 1:
Sander Bollen's avatar
Sander Bollen committed
        coverage_stats = expand("{sample}/coverage/{ref}.coverages.tsv",
                                sample=SAMPLES, ref=BASE_REFFLATS)
    else:
        coverage_stats = []
Sander Bollen's avatar
Sander Bollen committed
    stats = "stats.json"
    return  fqcr + fqcm + fqcp + coverage_stats + [stats]
localrules: gvcf_chunkfile, genotype_chunkfile


Sander Bollen's avatar
Sander Bollen committed
rule all:
    input:
Sander Bollen's avatar
Sander Bollen committed
        combined="multisample/genotyped.vcf.gz",
        report="multiqc_report/multiqc_report.html",
Sander Bollen's avatar
Sander Bollen committed
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai",sample=SAMPLES),
        vcfs=expand("{sample}/vcf/{sample}_single.vcf.gz",sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
        stats=metrics()
Sander Bollen's avatar
Sander Bollen committed

Sander Bollen's avatar
Sander Bollen committed

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
rule genome:
    """Create genome file as used by bedtools"""
Sander Bollen's avatar
Sander Bollen committed
    input: REFERENCE
Sander Bollen's avatar
Sander Bollen committed
    output: "current.genome"
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
    """Merge all forward fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
    input: get_r1
Sander Bollen's avatar
Sander Bollen committed
    output: temp("{sample}/pre_process/{sample}.merged_R1.fastq.gz")
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
    shell: "cat {input} > {output}"

rule merge_r2:
    """Merge all reverse fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
    input: get_r2
Sander Bollen's avatar
Sander Bollen committed
    output: temp("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
    shell: "cat {input} > {output}"

Sander Bollen's avatar
Sander Bollen committed

rule seqtk_r1:
    """Either subsample or link forward fastq file"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        stats="{sample}/pre_process/{sample}.preqc_count.json",
        fastq="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
    params:
Sander Bollen's avatar
Sander Bollen committed
        max_bases=str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        fastq=temp("{sample}/pre_process/{sample}.sampled_R1.fastq.gz")
Sander Bollen's avatar
Sander Bollen committed
    singularity: "docker://quay.io/biocontainers/mulled-v2-13686261ac0aa5682c680670ff8cda7b09637943:d143450dec169186731bb4df6f045a3c9ee08eb6-0"
Sander Bollen's avatar
Sander Bollen committed
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed


rule seqtk_r2:
    """Either subsample or link reverse fastq file"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        stats = "{sample}/pre_process/{sample}.preqc_count.json",
        fastq = "{sample}/pre_process/{sample}.merged_R2.fastq.gz",
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
    params:
Sander Bollen's avatar
Sander Bollen committed
        max_bases =str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        fastq = temp("{sample}/pre_process/{sample}.sampled_R2.fastq.gz")
Sander Bollen's avatar
Sander Bollen committed
    singularity: "docker://quay.io/biocontainers/mulled-v2-13686261ac0aa5682c680670ff8cda7b09637943:d143450dec169186731bb4df6f045a3c9ee08eb6-0"
Sander Bollen's avatar
Sander Bollen committed
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
# contains original merged fastq files as input to prevent them from being prematurely deleted
Sander Bollen's avatar
Sander Bollen committed
rule sickle:
    """Trim fastq files"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        r1 = "{sample}/pre_process/{sample}.sampled_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}.sampled_R2.fastq.gz",
        rr1 = "{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        rr2 = "{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        r1 = temp("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.trimmed_R2.fastq"),
Sander Bollen's avatar
Sander Bollen committed
        s = "{sample}/pre_process/{sample}.trimmed_singles.fastq"
Sander Bollen's avatar
Sander Bollen committed
    singularity: "docker://quay.io/biocontainers/sickle-trim:1.33--ha92aebf_4"
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
           "-p {output.r2} -s {output.s}"

rule cutadapt:
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        r1 = "{sample}/pre_process/{sample}.trimmed_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.trimmed_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        r1 = temp("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
Sander Bollen's avatar
Sander Bollen committed
    singularity: "docker://quay.io/biocontainers/cutadapt:1.14--py36_0"
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        r1 = "{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.cutadapt_R2.fastq",
        ref = REFERENCE,
        temp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
    params:
        rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
Sander Bollen's avatar
Sander Bollen committed
    output: temp("{sample}/bams/{sample}.sorted.bam")
    singularity: "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0"
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
           "| picard -Xmx4G SortSam CREATE_INDEX=TRUE TMP_DIR={input.temp} "
Sander Bollen's avatar
Sander Bollen committed
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

rule markdup:
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        bam = "{sample}/bams/{sample}.sorted.bam",
Sander Bollen's avatar
Sander Bollen committed
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
Sander Bollen's avatar
Sander Bollen committed
    singularity: "docker://quay.io/biocontainers/picard:2.14--py36_0"
    shell: "picard -Xmx4G MarkDuplicates CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "INPUT={input.bam} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
rule baserecal:
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        bam = "{sample}/bams/{sample}.markdup.bam",
Sander Bollen's avatar
Sander Bollen committed
        ref = REFERENCE,
    output:
Sander Bollen's avatar
Sander Bollen committed
        grp = "{sample}/bams/{sample}.baserecal.grp"
        known_sites = BSQR_known_sites
    singularity: "docker://broadinstitute/gatk3:3.7-0"
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
           "BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites}"
Sander Bollen's avatar
Sander Bollen committed

rule gvcf_scatter:
    """Run HaplotypeCaller in GVCF mode by chunk"""
    input:
Sander Bollen's avatar
Sander Bollen committed
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
Sander Bollen's avatar
Sander Bollen committed
        dbsnp=DBSNP,
        ref=REFERENCE,
Sander Bollen's avatar
Sander Bollen committed
    params:
        chunk="{chunk}"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.part.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.part.vcf.gz.tbi")
    singularity: "docker://broadinstitute/gatk3:3.7-0"
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
           "-T HaplotypeCaller -ERC GVCF -I "
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
           "-L '{params.chunk}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
Sander Bollen's avatar
Sander Bollen committed
           "-BQSR {input.bqsr}"
rule gvcf_chunkfile:
    """
    Create simple text file with paths to chunks for GVCF.
    This uses a run directive in stead of a shell directive because
    the amount of chunks may be so large the shell would error out with
    an "argument list too long" error.
    See https://unix.stackexchange.com/a/120842 for more info
    This also means this rule lives outside of singularity and is
    executed in snakemake's own environment.
    """
    params:
        chunkfiles = expand("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz",
                            chunk=CHUNKS)
    output:
        file = "{sample}/vcf/chunkfile.txt"
    run:
        with open(output.file, "w") as ohandle:
            for filename in params.chunkfiles:
                corrected = filename.format(sample=wildcards.sample)
                ohandle.write(corrected + "\n")
rule gvcf_gather:
    """Gather all GVCF scatters"""
Sander Bollen's avatar
Sander Bollen committed
    input:
        gvcfs = expand("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz",
                       chunk=CHUNKS),
        chunkfile = "{sample}/vcf/chunkfile.txt"
Sander Bollen's avatar
Sander Bollen committed
    output:
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz"
    singularity: "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4"
    shell: "bcftools concat -f {input.chunkfile} -n > {output.gvcf}"


rule gvcf_gather_tbi:
    """Index GVCF"""
    input:
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz"
    output:
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
    singularity: "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0"
    shell: "tabix -pvcf {input.gvcf}"


rule genotype_scatter:
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz", sample=SAMPLES),
        tbis = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi",
Sander Bollen's avatar
Sander Bollen committed
                      sample=SAMPLES),
    params:
Sander Bollen's avatar
Sander Bollen committed
        li=" -V ".join(expand("{sample}/vcf/{sample}.g.vcf.gz",
                              sample=SAMPLES)),
        chunk="{chunk}"
    output:
Sander Bollen's avatar
Sander Bollen committed
        vcf=temp("multisample/genotype.{chunk}.part.vcf.gz"),
        vcf_tbi=temp("multisample/genotype.{chunk}.part.vcf.gz.tbi")
    singularity: "docker://broadinstitute/gatk3:3.7-0"
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
           "GenotypeGVCFs -R {input.ref} "
           "-V {params.li} -L '{params.chunk}' -o '{output.vcf}'"
rule genotype_chunkfile:
    """
    Create simple text file with paths to chunks for genotyping
    This uses a run directive in stead of a shell directive because
    the amount of chunks may be so large the shell would error out with
    an "argument list too long" error.
    See https://unix.stackexchange.com/a/120842 for more info
    This also means this rule lives outside of singularity and is
    executed in snakemake's own environment.
Sander Bollen's avatar
Sander Bollen committed
        vcfs = expand("multisample/genotype.{chunk}.part.vcf.gz",
                      chunk=CHUNKS)
    output:
        file = "multisample/chunkfile.txt"
    run:
        with open(output.file, "w") as ohandle:
            for filename in params.vcfs:
                ohandle.write(filename + "\n")


rule genotype_gather:
    """Gather all genotyping VCFs"""
    input:
        vcfs = expand("multisample/genotype.{chunk}.part.vcf.gz",
                      chunk=CHUNKS),
        chunkfile = "multisample/chunkfile.txt"
Sander Bollen's avatar
Sander Bollen committed
    output:
        vcf = "multisample/genotyped.vcf.gz"
    singularity: "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4"
    shell: "bcftools concat -f {input.chunkfile} -n > {output.vcf}"


rule genotype_gather_tbi:
    """Index genotyped vcf file"""
    input:
        vcf = "multisample/genotyped.vcf.gz"
    output:
Sander Bollen's avatar
Sander Bollen committed
        tbi = "multisample/genotyped.vcf.gz.tbi"
    singularity: "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0"
    shell: "tabix -pvcf {input.vcf}"
rule split_vcf:
    """Split multisample VCF in single samples"""
    input:
Sander Bollen's avatar
Sander Bollen committed
        vcf="multisample/genotyped.vcf.gz",
        tbi = "multisample/genotyped.vcf.gz.tbi",
Sander Bollen's avatar
Sander Bollen committed
        splitted="{sample}/vcf/{sample}_single.vcf.gz"
    singularity: "docker://broadinstitute/gatk3:3.7-0"
    shell: "java -Xmx15G -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
           "-T SelectVariants -sn {params.s} -env -R {input.ref} -V "
           "{input.vcf} -o {output.splitted}"
Sander Bollen's avatar
Sander Bollen committed
## bam metrics

rule mapped_num:
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        bam="{sample}/bams/{sample}.sorted.bam"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        num="{sample}/bams/{sample}.mapped.num"
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
Sander Bollen's avatar
Sander Bollen committed
    shell: "samtools view -F 4 {input.bam} | wc -l > {output.num}"


rule mapped_basenum:
    """Calculate number of mapped bases"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        bam="{sample}/bams/{sample}.sorted.bam"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        num="{sample}/bams/{sample}.mapped.basenum"
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
Sander Bollen's avatar
Sander Bollen committed
    shell: "samtools view -F 4 {input.bam} | cut -f10 | wc -c > {output.num}"
Sander Bollen's avatar
Sander Bollen committed


rule unique_num:
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        bam="{sample}/bams/{sample}.markdup.bam"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        num="{sample}/bams/{sample}.unique.num"
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
    shell: "samtools view -F 4 -F 1024 {input.bam} | wc -l > {output.num}"
Sander Bollen's avatar
Sander Bollen committed


rule usable_basenum:
    """Calculate number of bases on unique reads"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        bam="{sample}/bams/{sample}.markdup.bam"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        num="{sample}/bams/{sample}.usable.basenum"
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
Sander Bollen's avatar
Sander Bollen committed
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f10 | wc -c > "
           "{output.num}"
Sander Bollen's avatar
Sander Bollen committed
rule fastqc_raw:
    """
    Run fastqc on raw fastq files
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
        r2=get_r2
    params:
Sander Bollen's avatar
Sander Bollen committed
        odir="{sample}/pre_process/raw_fastqc"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        aux="{sample}/pre_process/raw_fastqc/.done.txt"
    singularity: "docker://quay.io/biocontainers/fastqc:0.11.7--4"
Sander Bollen's avatar
Sander Bollen committed
    shell: "fastqc --nogroup -o {params.odir} {input.r1} {input.r2} "
           "&& echo 'done' > {output.aux}"
Sander Bollen's avatar
Sander Bollen committed
rule fastqc_merged:
    Run fastqc on merged fastq files
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
    params:
Sander Bollen's avatar
Sander Bollen committed
        odir="{sample}/pre_process/merged_fastqc"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip"
    singularity: "docker://quay.io/biocontainers/fastqc:0.11.7--4"
Sander Bollen's avatar
Sander Bollen committed
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
rule fastqc_postqc:
    """
    Run fastqc on fastq files post pre-processing
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
    perl missing in the container of 0.11.5
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
    params:
Sander Bollen's avatar
Sander Bollen committed
        odir="{sample}/pre_process/postqc_fastqc"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip"
    singularity: "docker://quay.io/biocontainers/fastqc:0.11.7--4"
Sander Bollen's avatar
Sander Bollen committed
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed


## fastq-count

rule fqcount_preqc:
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        "{sample}/pre_process/{sample}.preqc_count.json"
    singularity: "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0"
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed


rule fqcount_postqc:
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        "{sample}/pre_process/{sample}.postqc_count.json"
    singularity: "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0"
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed


Sander Bollen's avatar
Sander Bollen committed
# fastqc stats
rule fastqc_stats:
    """Collect fastq stats for a sample in json format"""
    input:
Sander Bollen's avatar
Sander Bollen committed
        preqc_r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        preqc_r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip",
        postqc_r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        postqc_r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip",
Sander Bollen's avatar
Sander Bollen committed
        sc=fqpy
    singularity: "docker://python:3.6-slim"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        "{sample}/pre_process/fastq_stats.json"
    shell: "python {input.sc} --preqc-r1 {input.preqc_r1} "
           "--preqc-r2 {input.preqc_r2} "
           "--postqc-r1 {input.postqc_r1} "
Sander Bollen's avatar
Sander Bollen committed
           "--postqc-r2 {input.postqc_r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
## coverages

rule covstats:
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
Sander Bollen's avatar
Sander Bollen committed
        covpy=covpy,
        bed=get_bedpath
    params:
        subt="Sample {sample}"
    output:
Sander Bollen's avatar
Sander Bollen committed
        covj="{sample}/coverage/{bed}.covstats.json",
        covp="{sample}/coverage/{bed}.covstats.png"
    singularity: "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0"
Sander Bollen's avatar
Sander Bollen committed
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed


rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
        ref=get_refflatpath
    output:
Sander Bollen's avatar
Sander Bollen committed
        tsv="{sample}/coverage/{ref}.coverages.tsv"
Sander Bollen's avatar
Sander Bollen committed
    singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
## vcfstats

rule vcfstats:
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        vcf="multisample/genotyped.vcf.gz",
        tbi = "multisample/genotyped.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        stats="multisample/vcfstats.json"
Sander Bollen's avatar
Sander Bollen committed
    singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
Sander Bollen's avatar
Sander Bollen committed
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
## collection
if len(BASE_BEDS) >= 1:
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
Sander Bollen's avatar
Sander Bollen committed
            preqc="{sample}/pre_process/{sample}.preqc_count.json",
            postq="{sample}/pre_process/{sample}.postqc_count.json",
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
            cov=expand("{{sample}}/coverage/{bed}.covstats.json",
                       bed=BASE_BEDS),
            colpy=colpy
        params:
            sample_name="{sample}",
            fthresh=FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
            "{sample}/{sample}.stats.json"
        singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc} {input.cov} > {output}"
Sander Bollen's avatar
Sander Bollen committed
        """Collect all stats for a particular sample without beds"""
Sander Bollen's avatar
Sander Bollen committed
        input:
Sander Bollen's avatar
Sander Bollen committed
            preqc = "{sample}/pre_process/{sample}.preqc_count.json",
            postq = "{sample}/pre_process/{sample}.postqc_count.json",
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
            colpy = colpy
        params:
            sample_name = "{sample}",
            fthresh = FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
            "{sample}/{sample}.stats.json"
        singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
Sander Bollen's avatar
Sander Bollen committed
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc}  > {output}"
Sander Bollen's avatar
Sander Bollen committed

rule merge_stats:
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
    input:
Sander Bollen's avatar
Sander Bollen committed
        cols=expand("{sample}/{sample}.stats.json", sample=SAMPLES),
        vstat="multisample/vcfstats.json",
Sander Bollen's avatar
Sander Bollen committed
        mpy=mpy
    output:
Sander Bollen's avatar
Sander Bollen committed
        stats="stats.json"
    singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
Sander Bollen's avatar
Sander Bollen committed
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
           "> {output.stats}"
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
Sander Bollen's avatar
Sander Bollen committed
        stats="stats.json",
        sc=tsvpy
    output:
Sander Bollen's avatar
Sander Bollen committed
        stats="stats.tsv"
    singularity: "docker://python:3.6-slim"
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
rule multiqc:
    """
    Create multiQC report
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
    """
    input:
Sander Bollen's avatar
Sander Bollen committed
        stats="stats.tsv"
Sander Bollen's avatar
Sander Bollen committed
    params:
Sander Bollen's avatar
Sander Bollen committed
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
    output:
Sander Bollen's avatar
Sander Bollen committed
        report="multiqc_report/multiqc_report.html"
Sander Bollen's avatar
Sander Bollen committed
    singularity: "docker://quay.io/biocontainers/multiqc:1.5--py36_0"
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output.report}"