Snakefile 27.9 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
Sander Bollen's avatar
Sander Bollen committed
25
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
26
from os.path import join, basename
27
from pathlib import Path
Sander Bollen's avatar
Sander Bollen committed
28
29

from pyfaidx import Fasta
Sander Bollen's avatar
Sander Bollen committed
30
31

REFERENCE = config.get("REFERENCE")
32
33
if REFERENCE is None:
    raise ValueError("You must set --config REFERENCE=<path>")
Sander Bollen's avatar
Sander Bollen committed
34
if not Path(REFERENCE).exists():
Sander Bollen's avatar
Sander Bollen committed
35
36
    raise FileNotFoundError("Reference file {0} "
                            "does not exist.".format(REFERENCE))
37

Sander Bollen's avatar
Sander Bollen committed
38
GATK = config.get("GATK")
39
40
if GATK is None:
    raise ValueError("You must set --config GATK=<path>")
Sander Bollen's avatar
Sander Bollen committed
41
42
if not Path(GATK).exists():
    raise FileNotFoundError("{0} does not exist.".format(GATK))
43
44

# these are all optional
45
46
BED = config.get("BED", "")  # comma-separated list of BED files
REFFLAT = config.get("REFFLAT", "")  # comma-separated list of refFlat files
Sander Bollen's avatar
Sander Bollen committed
47
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
48
FASTQ_COUNT = config.get("FASTQ_COUNT")
Sander Bollen's avatar
Sander Bollen committed
49
MAX_BASES = config.get("MAX_BASES", "")
Sander Bollen's avatar
Sander Bollen committed
50

51
52
53
54
55
56
57
58
59
60
61
# Make sure all files with known sites exist
known_sites = config.get("KNOWN_SITES").split(',')
for filename in known_sites:
    if not Path(filename).exists():
        raise FileNotFoundError("{0} does not exist".format(DBSNP))

# Generate the input string for basrecalibration
known_sites_argument = ''
for argument, filename in zip(repeat('-knownSites'), known_sites):
    known_sites_argument +=' {argument} {filename}'.format(argument, filename)

62
63
64
65
66
def fsrc_dir(*args):
    """Wrapper around snakemake's srcdir to work like os.path.join"""
    if len(args) == 1:
        return srcdir(args[0])
    return srcdir(join(*args))
Sander Bollen's avatar
Sander Bollen committed
67

68
69
70
covpy = fsrc_dir("src", "covstats.py")
colpy = fsrc_dir("src", "collect_stats.py")
mpy = fsrc_dir("src", "merge_stats.py")
Sander Bollen's avatar
Sander Bollen committed
71
seq = fsrc_dir("src", "seqtk.sh")
Sander Bollen's avatar
Sander Bollen committed
72
fqpy = fsrc_dir("src", "fastqc_stats.py")
Sander Bollen's avatar
Sander Bollen committed
73
tsvpy = fsrc_dir("src", "stats_to_tsv.py")
Sander Bollen's avatar
Sander Bollen committed
74
fqsc = fsrc_dir("src", "safe_fastqc.sh")
Sander Bollen's avatar
Sander Bollen committed
75

Sander Bollen's avatar
Sander Bollen committed
76
77
78
79
80
81
82
# sample config parsing
SCONFIG = config.get("SAMPLE_CONFIG")
if SCONFIG is None:
    raise ValueError("You must set --config SAMPLE_CONFIG=<path>")
if not Path(SCONFIG).exists():
    raise FileNotFoundError("{0} does not exist".format(SCONFIG))

Sander Bollen's avatar
Sander Bollen committed
83
84
85
86
with open(config.get("SAMPLE_CONFIG")) as handle:
    SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()

Sander Bollen's avatar
Sander Bollen committed
87
88
89
90
91
92
93
94
95
if BED != "":
    BEDS = BED.split(",")
else:
    BEDS = []

if REFFLAT != "":
    REFFLATS = REFFLAT.split(",")
else:
    REFFLATS = []
Sander Bollen's avatar
Sander Bollen committed
96
97

BASE_BEDS = [basename(x) for x in BEDS]
Sander Bollen's avatar
Sander Bollen committed
98
BASE_REFFLATS = [basename(x) for x in REFFLATS]
Sander Bollen's avatar
Sander Bollen committed
99

Sander Bollen's avatar
Sander Bollen committed
100
101

def split_genome(ref, approx_n_chunks=100):
Sander Bollen's avatar
Sander Bollen committed
102
103
104
105
106
107
108
109
    """
    Split genome in chunks.

    Chunks are strings in the format: `<ctg>:<start>-<end>`
    These follow the region string format as used by htslib,
    which uses _1_-based indexing.
    See: http://www.htslib.org/doc/tabix.html
    """
Sander Bollen's avatar
Sander Bollen committed
110
111
112
113
114
    fa = Fasta(ref)
    tot_size = sum([len(x) for x in fa.records.values()])
    chunk_size = tot_size//approx_n_chunks
    chunks = []
    for chrom_name, chrom_value in fa.records.items():
Sander Bollen's avatar
Sander Bollen committed
115
        pos = 1
Sander Bollen's avatar
Sander Bollen committed
116
117
118
119
120
121
122
123
124
125
126
127
128
        while pos <= len(chrom_value):
            end = pos+chunk_size
            if end <= len(chrom_value):
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, end)
            else:
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, len(chrom_value))
            chunks.append(chunk)
            pos = end
    return chunks

CHUNKS = split_genome(REFERENCE)


Sander Bollen's avatar
Sander Bollen committed
129

Sander Bollen's avatar
Sander Bollen committed
130
131
def get_r(strand, wildcards):
    """Get fastq files on a single strand for a sample"""
Sander Bollen's avatar
Sander Bollen committed
132
    s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
Sander Bollen's avatar
Sander Bollen committed
133
    rs = []
134
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
Sander Bollen committed
135
136
        rs.append(s['libraries'][l][strand])
    return rs
Sander Bollen's avatar
Sander Bollen committed
137

Sander Bollen's avatar
Sander Bollen committed
138
139
get_r1 = partial(get_r, "R1")
get_r2 = partial(get_r, "R2")
Sander Bollen's avatar
Sander Bollen committed
140
141


Sander Bollen's avatar
Sander Bollen committed
142
def get_bedpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
143
144
    """Get absolute path of a bed file"""
    return next(x for x in BEDS if basename(x) == wildcards.bed)
Sander Bollen's avatar
Sander Bollen committed
145
146
147


def get_refflatpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
148
    """Get absolute path of a refflat file"""
Sander Bollen's avatar
Sander Bollen committed
149
    return next(x for x in REFFLATS if basename(x) == wildcards.ref)
Sander Bollen's avatar
Sander Bollen committed
150
151


Sander Bollen's avatar
Sander Bollen committed
152
def sample_gender(wildcards):
Sander Bollen's avatar
Sander Bollen committed
153
    """Get sample gender"""
Sander Bollen's avatar
Sander Bollen committed
154
155
156
157
    sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
    return sam.get("gender", "null")


Sander Bollen's avatar
Sander Bollen committed
158
159
160
161
def metrics(do_metrics=True):
    if not do_metrics:
        return ""

Sander Bollen's avatar
Sander Bollen committed
162
163
164
165
166
167
    fqcr = expand("{sample}/pre_process/raw_fastqc/.done.txt",
                  sample=SAMPLES)
    fqcm = expand("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
                  sample=SAMPLES)
    fqcp = expand("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
                  sample=SAMPLES)
168
    if len(REFFLATS) >= 1:
Sander Bollen's avatar
Sander Bollen committed
169
170
        coverage_stats = expand("{sample}/coverage/{ref}.coverages.tsv",
                                sample=SAMPLES, ref=BASE_REFFLATS)
171
172
    else:
        coverage_stats = []
Sander Bollen's avatar
Sander Bollen committed
173
    stats = "stats.json"
Sander Bollen's avatar
Sander Bollen committed
174
    return  fqcr + fqcm + fqcp + coverage_stats + [stats]
Sander Bollen's avatar
Sander Bollen committed
175
176


Sander Bollen's avatar
Sander Bollen committed
177
178
179
localrules: gvcf_chunkfile, genotype_chunkfile


Sander Bollen's avatar
Sander Bollen committed
180
181
rule all:
    input:
Sander Bollen's avatar
Sander Bollen committed
182
183
        combined="multisample/genotyped.vcf.gz",
        report="multiqc_report/multiqc_report.html",
Sander Bollen's avatar
Sander Bollen committed
184
185
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai",sample=SAMPLES),
        vcfs=expand("{sample}/vcf/{sample}_single.vcf.gz",sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
186
        stats=metrics()
Sander Bollen's avatar
Sander Bollen committed
187

Sander Bollen's avatar
Sander Bollen committed
188
189
190

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
191
    output: directory("tmp")
192
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
193
194
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
195
rule genome:
Sander Bollen's avatar
Sander Bollen committed
196
    """Create genome file as used by bedtools"""
Sander Bollen's avatar
Sander Bollen committed
197
    input: REFERENCE
Sander Bollen's avatar
Sander Bollen committed
198
    output: "current.genome"
199
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
200
201
202
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
Sander Bollen's avatar
Sander Bollen committed
203
    """Merge all forward fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
204
    input: get_r1
Sander Bollen's avatar
Sander Bollen committed
205
    output: temp("{sample}/pre_process/{sample}.merged_R1.fastq.gz")
206
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
207
208
209
    shell: "cat {input} > {output}"

rule merge_r2:
Sander Bollen's avatar
Sander Bollen committed
210
    """Merge all reverse fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
211
    input: get_r2
Sander Bollen's avatar
Sander Bollen committed
212
    output: temp("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
213
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
214
215
    shell: "cat {input} > {output}"

Sander Bollen's avatar
Sander Bollen committed
216
217

rule seqtk_r1:
Sander Bollen's avatar
Sander Bollen committed
218
    """Either subsample or link forward fastq file"""
Sander Bollen's avatar
Sander Bollen committed
219
    input:
Sander Bollen's avatar
Sander Bollen committed
220
221
        stats="{sample}/pre_process/{sample}.preqc_count.json",
        fastq="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
222
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
223
    params:
Sander Bollen's avatar
Sander Bollen committed
224
        max_bases=str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
225
    output:
Sander Bollen's avatar
Sander Bollen committed
226
        fastq=temp("{sample}/pre_process/{sample}.sampled_R1.fastq.gz")
Sander Bollen's avatar
Sander Bollen committed
227
    conda: "envs/seqtk.yml"
Sander Bollen's avatar
Sander Bollen committed
228
    singularity: "docker://quay.io/biocontainers/mulled-v2-13686261ac0aa5682c680670ff8cda7b09637943:d143450dec169186731bb4df6f045a3c9ee08eb6-0"
Sander Bollen's avatar
Sander Bollen committed
229
230
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
231
232
233


rule seqtk_r2:
Sander Bollen's avatar
Sander Bollen committed
234
    """Either subsample or link reverse fastq file"""
Sander Bollen's avatar
Sander Bollen committed
235
    input:
Sander Bollen's avatar
Sander Bollen committed
236
237
        stats = "{sample}/pre_process/{sample}.preqc_count.json",
        fastq = "{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
238
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
239
    params:
Sander Bollen's avatar
Sander Bollen committed
240
        max_bases =str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
241
    output:
Sander Bollen's avatar
Sander Bollen committed
242
        fastq = temp("{sample}/pre_process/{sample}.sampled_R2.fastq.gz")
Sander Bollen's avatar
Sander Bollen committed
243
    conda: "envs/seqtk.yml"
Sander Bollen's avatar
Sander Bollen committed
244
    singularity: "docker://quay.io/biocontainers/mulled-v2-13686261ac0aa5682c680670ff8cda7b09637943:d143450dec169186731bb4df6f045a3c9ee08eb6-0"
Sander Bollen's avatar
Sander Bollen committed
245
246
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
247
248


249
# contains original merged fastq files as input to prevent them from being prematurely deleted
Sander Bollen's avatar
Sander Bollen committed
250
rule sickle:
Sander Bollen's avatar
Sander Bollen committed
251
    """Trim fastq files"""
Sander Bollen's avatar
Sander Bollen committed
252
    input:
Sander Bollen's avatar
Sander Bollen committed
253
254
255
256
        r1 = "{sample}/pre_process/{sample}.sampled_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}.sampled_R2.fastq.gz",
        rr1 = "{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        rr2 = "{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
257
    output:
Sander Bollen's avatar
Sander Bollen committed
258
259
        r1 = temp("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.trimmed_R2.fastq"),
Sander Bollen's avatar
Sander Bollen committed
260
        s = "{sample}/pre_process/{sample}.trimmed_singles.fastq"
Sander Bollen's avatar
Sander Bollen committed
261
    singularity: "docker://quay.io/biocontainers/sickle-trim:1.33--ha92aebf_4"
Sander Bollen's avatar
Sander Bollen committed
262
    conda: "envs/sickle.yml"
Sander Bollen's avatar
Sander Bollen committed
263
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
264
265
266
           "-p {output.r2} -s {output.s}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
267
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
268
    input:
Sander Bollen's avatar
Sander Bollen committed
269
270
        r1 = "{sample}/pre_process/{sample}.trimmed_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.trimmed_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
271
    output:
Sander Bollen's avatar
Sander Bollen committed
272
273
        r1 = temp("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
Sander Bollen's avatar
Sander Bollen committed
274
    singularity: "docker://quay.io/biocontainers/cutadapt:1.14--py36_0"
Sander Bollen's avatar
Sander Bollen committed
275
    conda: "envs/cutadapt.yml"
Sander Bollen's avatar
Sander Bollen committed
276
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
277
278
279
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
Sander Bollen's avatar
Sander Bollen committed
280
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
281
    input:
Sander Bollen's avatar
Sander Bollen committed
282
283
        r1 = "{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
284
285
        ref = REFERENCE,
        temp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
286
287
    params:
        rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
Sander Bollen's avatar
Sander Bollen committed
288
    output: temp("{sample}/bams/{sample}.sorted.bam")
289
    singularity: "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0"
Sander Bollen's avatar
Sander Bollen committed
290
    conda: "envs/bwa.yml"
Sander Bollen's avatar
Sander Bollen committed
291
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
292
           "| picard -Xmx4G SortSam CREATE_INDEX=TRUE TMP_DIR={input.temp} "
Sander Bollen's avatar
Sander Bollen committed
293
294
295
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

rule markdup:
Sander Bollen's avatar
Sander Bollen committed
296
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
297
    input:
Sander Bollen's avatar
Sander Bollen committed
298
        bam = "{sample}/bams/{sample}.sorted.bam",
Sander Bollen's avatar
Sander Bollen committed
299
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
300
    output:
Sander Bollen's avatar
Sander Bollen committed
301
302
303
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
Sander Bollen's avatar
Sander Bollen committed
304
    singularity: "docker://quay.io/biocontainers/picard:2.14--py36_0"
Sander Bollen's avatar
Sander Bollen committed
305
    conda: "envs/picard.yml"
306
    shell: "picard -Xmx4G MarkDuplicates CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
307
308
           "INPUT={input.bam} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
309
310
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
311
312
313
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
314
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
315
    output:
Sander Bollen's avatar
Sander Bollen committed
316
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
317
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
318
319
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
320
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
321
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
322
    input:
Sander Bollen's avatar
Sander Bollen committed
323
        bam = "{sample}/bams/{sample}.markdup.bam",
Sander Bollen's avatar
Sander Bollen committed
324
325
326
327
328
329
        gatk = GATK,
        ref = REFERENCE,
        dbsnp = DBSNP,
        one1kg = ONETHOUSAND,
        hapmap = HAPMAP
    output:
Sander Bollen's avatar
Sander Bollen committed
330
        grp = "{sample}/bams/{sample}.baserecal.grp"
331
332
    params:
        knownsites = known_sites_argument
Sander Bollen's avatar
Sander Bollen committed
333
    singularity: "docker://quay.io/biocontainers/gatk:3.7--py36_1"
Sander Bollen's avatar
Sander Bollen committed
334
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
335
    shell: "java -XX:ParallelGCThreads=1 -jar {input.gatk} -T "
Sander Bollen's avatar
Sander Bollen committed
336
337
           "BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
338
           "-cov CycleCovariate -cov ContextCovariate {params.knownsites}"
Sander Bollen's avatar
Sander Bollen committed
339

Sander Bollen's avatar
Sander Bollen committed
340
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
341
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
342
    input:
Sander Bollen's avatar
Sander Bollen committed
343
344
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
Sander Bollen's avatar
Sander Bollen committed
345
        dbsnp=DBSNP,
Sander Bollen's avatar
Sander Bollen committed
346
347
        ref=REFERENCE,
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
348
    params:
Sander Bollen's avatar
Sander Bollen committed
349
        chunk="{chunk}"
Sander Bollen's avatar
Sander Bollen committed
350
    output:
Sander Bollen's avatar
Sander Bollen committed
351
352
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.part.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.part.vcf.gz.tbi")
Sander Bollen's avatar
Sander Bollen committed
353
    singularity: "docker://quay.io/biocontainers/gatk:3.7--py36_1"
Sander Bollen's avatar
Sander Bollen committed
354
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
355
356
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 {input.gatk} "
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
357
358
359
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
           "-L '{params.chunk}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
Sander Bollen's avatar
Sander Bollen committed
360
           "-BQSR {input.bqsr}"
Sander Bollen's avatar
Sander Bollen committed
361
362


Sander Bollen's avatar
Sander Bollen committed
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
rule gvcf_chunkfile:
    """
    Create simple text file with paths to chunks for GVCF.
    
    This uses a run directive in stead of a shell directive because
    the amount of chunks may be so large the shell would error out with
    an "argument list too long" error. 
    See https://unix.stackexchange.com/a/120842 for more info
    
    This also means this rule lives outside of singularity/conda and is
    executed in snakemake's own environment. 
    """
    params:
        chunkfiles = expand("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz",
                            chunk=CHUNKS)
    output:
        file = "{sample}/vcf/chunkfile.txt"
    run:
381
        with open(output.file, "w") as ohandle:
Sander Bollen's avatar
Sander Bollen committed
382
            for filename in params.chunkfiles:
383
384
                corrected = filename.format(sample=wildcards.sample)
                ohandle.write(corrected + "\n")
Sander Bollen's avatar
Sander Bollen committed
385

Sander Bollen's avatar
Sander Bollen committed
386
rule gvcf_gather:
Sander Bollen's avatar
Sander Bollen committed
387
    """Gather all GVCF scatters"""
Sander Bollen's avatar
Sander Bollen committed
388
    input:
Sander Bollen's avatar
Sander Bollen committed
389
390
391
        gvcfs = expand("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz",
                       chunk=CHUNKS),
        chunkfile = "{sample}/vcf/chunkfile.txt"
Sander Bollen's avatar
Sander Bollen committed
392
    output:
Sander Bollen's avatar
Sander Bollen committed
393
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz"
Sander Bollen's avatar
Sander Bollen committed
394
    conda: "envs/bcftools.yml"
Sander Bollen's avatar
Sander Bollen committed
395
396
397
398
399
400
401
402
403
404
    singularity: "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4"
    shell: "bcftools concat -f {input.chunkfile} -n > {output.gvcf}"


rule gvcf_gather_tbi:
    """Index GVCF"""
    input:
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz"
    output:
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
405
    conda: "envs/tabix.yml"
Sander Bollen's avatar
Sander Bollen committed
406
407
    singularity: "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0"
    shell: "tabix -pvcf {input.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
408
409
410


rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
411
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
412
    input:
Sander Bollen's avatar
Sander Bollen committed
413
        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz", sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
414
        tbis = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi",
Sander Bollen's avatar
Sander Bollen committed
415
                      sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
416
        ref=REFERENCE,
Sander Bollen's avatar
Sander Bollen committed
417
418
        gatk=GATK
    params:
Sander Bollen's avatar
Sander Bollen committed
419
420
        li=" -V ".join(expand("{sample}/vcf/{sample}.g.vcf.gz",
                              sample=SAMPLES)),
Sander Bollen's avatar
Sander Bollen committed
421
422
        chunk="{chunk}"
    output:
Sander Bollen's avatar
Sander Bollen committed
423
424
        vcf=temp("multisample/genotype.{chunk}.part.vcf.gz"),
        vcf_tbi=temp("multisample/genotype.{chunk}.part.vcf.gz.tbi")
Sander Bollen's avatar
Sander Bollen committed
425
    singularity: "docker://quay.io/biocontainers/gatk:3.7--py36_1"
Sander Bollen's avatar
Sander Bollen committed
426
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
427
428
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 {input.gatk} -T "
           "GenotypeGVCFs -R {input.ref} "
Sander Bollen's avatar
Sander Bollen committed
429
           "-V {params.li} -L '{params.chunk}' -o '{output.vcf}'"
Sander Bollen's avatar
Sander Bollen committed
430
431


432
433
434
435
436
437
438
439
440
441
442
443
444
rule genotype_chunkfile:
    """
    Create simple text file with paths to chunks for genotyping
    
    This uses a run directive in stead of a shell directive because
    the amount of chunks may be so large the shell would error out with
    an "argument list too long" error. 
    See https://unix.stackexchange.com/a/120842 for more info
    
    This also means this rule lives outside of singularity/conda and is
    executed in snakemake's own environment. 
    """
    params:
Sander Bollen's avatar
Sander Bollen committed
445
446
        vcfs = expand("multisample/genotype.{chunk}.part.vcf.gz",
                      chunk=CHUNKS)
447
448
449
450
451
452
453
454
    output:
        file = "multisample/chunkfile.txt"
    run:
        with open(output.file, "w") as ohandle:
            for filename in params.vcfs:
                ohandle.write(filename + "\n")


Sander Bollen's avatar
Sander Bollen committed
455
rule genotype_gather:
456
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
457
    input:
458
459
460
        vcfs = expand("multisample/genotype.{chunk}.part.vcf.gz",
                      chunk=CHUNKS),
        chunkfile = "multisample/chunkfile.txt"
Sander Bollen's avatar
Sander Bollen committed
461
    output:
462
        vcf = "multisample/genotyped.vcf.gz"
Sander Bollen's avatar
Sander Bollen committed
463
    conda: "envs/bcftools.yml"
464
465
466
467
468
469
470
471
472
    singularity: "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4"
    shell: "bcftools concat -f {input.chunkfile} -n > {output.vcf}"


rule genotype_gather_tbi:
    """Index genotyped vcf file"""
    input:
        vcf = "multisample/genotyped.vcf.gz"
    output:
Sander Bollen's avatar
Sander Bollen committed
473
        tbi = "multisample/genotyped.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
474
    conda: "envs/tabix.yml"
475
476
    singularity: "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0"
    shell: "tabix -pvcf {input.vcf}"
Sander Bollen's avatar
Sander Bollen committed
477
478


479
480
481
rule split_vcf:
    """Split multisample VCF in single samples"""
    input:
Sander Bollen's avatar
Sander Bollen committed
482
        vcf="multisample/genotyped.vcf.gz",
483
        tbi = "multisample/genotyped.vcf.gz.tbi",
484
485
486
487
488
        gatk=GATK,
        ref=REFERENCE
    params:
        s="{sample}"
    output:
Sander Bollen's avatar
Sander Bollen committed
489
        splitted="{sample}/vcf/{sample}_single.vcf.gz"
Sander Bollen's avatar
Sander Bollen committed
490
    singularity: "docker://quay.io/biocontainers/gatk:3.7--py36_1"
491
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
492
493
494
    shell: "java -Xmx15G -XX:ParallelGCThreads=1 -jar {input.gatk} "
           "-T SelectVariants -sn {params.s} -env -R {input.ref} -V "
           "{input.vcf} -o {output.splitted}"
495
496


Sander Bollen's avatar
Sander Bollen committed
497
498
499
## bam metrics

rule mapped_num:
Sander Bollen's avatar
Sander Bollen committed
500
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
501
    input:
Sander Bollen's avatar
Sander Bollen committed
502
        bam="{sample}/bams/{sample}.sorted.bam"
Sander Bollen's avatar
Sander Bollen committed
503
    output:
Sander Bollen's avatar
Sander Bollen committed
504
        num="{sample}/bams/{sample}.mapped.num"
505
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
Sander Bollen's avatar
Sander Bollen committed
506
507
508
509
510
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 {input.bam} | wc -l > {output.num}"


rule mapped_basenum:
Sander Bollen's avatar
Sander Bollen committed
511
    """Calculate number of mapped bases"""
Sander Bollen's avatar
Sander Bollen committed
512
    input:
Sander Bollen's avatar
Sander Bollen committed
513
        bam="{sample}/bams/{sample}.sorted.bam"
Sander Bollen's avatar
Sander Bollen committed
514
    output:
Sander Bollen's avatar
Sander Bollen committed
515
        num="{sample}/bams/{sample}.mapped.basenum"
516
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
Sander Bollen's avatar
Sander Bollen committed
517
    conda: "envs/samtools.yml"
Sander Bollen's avatar
Sander Bollen committed
518
    shell: "samtools view -F 4 {input.bam} | cut -f10 | wc -c > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
519
520
521


rule unique_num:
Sander Bollen's avatar
Sander Bollen committed
522
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
523
    input:
Sander Bollen's avatar
Sander Bollen committed
524
        bam="{sample}/bams/{sample}.markdup.bam"
Sander Bollen's avatar
Sander Bollen committed
525
    output:
Sander Bollen's avatar
Sander Bollen committed
526
        num="{sample}/bams/{sample}.unique.num"
527
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
Sander Bollen's avatar
Sander Bollen committed
528
    conda: "envs/samtools.yml"
529
    shell: "samtools view -F 4 -F 1024 {input.bam} | wc -l > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
530
531
532


rule usable_basenum:
Sander Bollen's avatar
Sander Bollen committed
533
    """Calculate number of bases on unique reads"""
Sander Bollen's avatar
Sander Bollen committed
534
    input:
Sander Bollen's avatar
Sander Bollen committed
535
        bam="{sample}/bams/{sample}.markdup.bam"
Sander Bollen's avatar
Sander Bollen committed
536
    output:
Sander Bollen's avatar
Sander Bollen committed
537
        num="{sample}/bams/{sample}.usable.basenum"
538
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
Sander Bollen's avatar
Sander Bollen committed
539
    conda: "envs/samtools.yml"
Sander Bollen's avatar
Sander Bollen committed
540
541
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f10 | wc -c > "
           "{output.num}"
Sander Bollen's avatar
Sander Bollen committed
542
543
544
545


## fastqc

Sander Bollen's avatar
Sander Bollen committed
546
rule fastqc_raw:
547
548
549
550
551
    """
    Run fastqc on raw fastq files
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to 
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
552
    input:
Sander Bollen's avatar
Sander Bollen committed
553
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
554
555
        r2=get_r2
    params:
Sander Bollen's avatar
Sander Bollen committed
556
        odir="{sample}/pre_process/raw_fastqc"
Sander Bollen's avatar
Sander Bollen committed
557
    output:
Sander Bollen's avatar
Sander Bollen committed
558
        aux="{sample}/pre_process/raw_fastqc/.done.txt"
559
    singularity: "docker://quay.io/biocontainers/fastqc:0.11.7--4"
560
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
561
562
    shell: "fastqc --nogroup -o {params.odir} {input.r1} {input.r2} "
           "&& echo 'done' > {output.aux}"
Sander Bollen's avatar
Sander Bollen committed
563
564


Sander Bollen's avatar
Sander Bollen committed
565
rule fastqc_merged:
566
567
568
569
570
    """
    Run fastqc on merged fastq files    
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to 
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
571
    input:
Sander Bollen's avatar
Sander Bollen committed
572
573
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
574
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
575
    params:
Sander Bollen's avatar
Sander Bollen committed
576
        odir="{sample}/pre_process/merged_fastqc"
Sander Bollen's avatar
Sander Bollen committed
577
    output:
Sander Bollen's avatar
Sander Bollen committed
578
579
        r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip"
580
    singularity: "docker://quay.io/biocontainers/fastqc:0.11.7--4"
581
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
582
583
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
584
585


Sander Bollen's avatar
Sander Bollen committed
586
rule fastqc_postqc:
587
588
589
590
591
    """
    Run fastqc on fastq files post pre-processing
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to 
    perl missing in the container of 0.11.5     
    """
Sander Bollen's avatar
Sander Bollen committed
592
    input:
Sander Bollen's avatar
Sander Bollen committed
593
594
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
595
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
596
    params:
Sander Bollen's avatar
Sander Bollen committed
597
        odir="{sample}/pre_process/postqc_fastqc"
Sander Bollen's avatar
Sander Bollen committed
598
    output:
Sander Bollen's avatar
Sander Bollen committed
599
600
        r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip"
601
    singularity: "docker://quay.io/biocontainers/fastqc:0.11.7--4"
602
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
603
604
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
605
606
607
608
609


## fastq-count

rule fqcount_preqc:
Sander Bollen's avatar
Sander Bollen committed
610
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
611
    input:
Sander Bollen's avatar
Sander Bollen committed
612
613
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
614
    output:
Sander Bollen's avatar
Sander Bollen committed
615
        "{sample}/pre_process/{sample}.preqc_count.json"
Sander Bollen's avatar
Sander Bollen committed
616
    singularity: "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0"
Sander Bollen's avatar
Sander Bollen committed
617
618
    conda: "envs/fastq-count.yml"
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
619
620
621


rule fqcount_postqc:
Sander Bollen's avatar
Sander Bollen committed
622
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
623
    input:
Sander Bollen's avatar
Sander Bollen committed
624
625
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
626
    output:
Sander Bollen's avatar
Sander Bollen committed
627
        "{sample}/pre_process/{sample}.postqc_count.json"
Sander Bollen's avatar
Sander Bollen committed
628
    singularity: "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0"
Sander Bollen's avatar
Sander Bollen committed
629
630
    conda: "envs/fastq-count.yml"
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
631
632


Sander Bollen's avatar
Sander Bollen committed
633
634
635
636
# fastqc stats
rule fastqc_stats:
    """Collect fastq stats for a sample in json format"""
    input:
Sander Bollen's avatar
Sander Bollen committed
637
638
639
640
        preqc_r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        preqc_r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip",
        postqc_r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        postqc_r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip",
Sander Bollen's avatar
Sander Bollen committed
641
        sc=fqpy
Sander Bollen's avatar
Sander Bollen committed
642
    singularity: "docker://python:3.6-slim"
643
    conda: "envs/collectstats.yml"
Sander Bollen's avatar
Sander Bollen committed
644
    output:
Sander Bollen's avatar
Sander Bollen committed
645
        "{sample}/pre_process/fastq_stats.json"
646
647
648
    shell: "python {input.sc} --preqc-r1 {input.preqc_r1} "
           "--preqc-r2 {input.preqc_r2} "
           "--postqc-r1 {input.postqc_r1} "
Sander Bollen's avatar
Sander Bollen committed
649
           "--postqc-r2 {input.postqc_r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
650

Sander Bollen's avatar
Sander Bollen committed
651
652
653
## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
654
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
655
    input:
Sander Bollen's avatar
Sander Bollen committed
656
657
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
Sander Bollen's avatar
Sander Bollen committed
658
659
660
661
662
        covpy=covpy,
        bed=get_bedpath
    params:
        subt="Sample {sample}"
    output:
Sander Bollen's avatar
Sander Bollen committed
663
664
        covj="{sample}/coverage/{bed}.covstats.json",
        covp="{sample}/coverage/{bed}.covstats.png"
665
    singularity: "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0"
Sander Bollen's avatar
Sander Bollen committed
666
    conda: "envs/covstat.yml"
Sander Bollen's avatar
Sander Bollen committed
667
668
669
670
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
671
672


Sander Bollen's avatar
Sander Bollen committed
673
674
675
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
676
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
677
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
Sander Bollen's avatar
Sander Bollen committed
678
679
        ref=get_refflatpath
    output:
Sander Bollen's avatar
Sander Bollen committed
680
        tsv="{sample}/coverage/{ref}.coverages.tsv"
Sander Bollen's avatar
Sander Bollen committed
681
    singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
Sander Bollen's avatar
Sander Bollen committed
682
683
684
685
    conda: "envs/vcfstats.yml"
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
686
687
688
## vcfstats

rule vcfstats:
Sander Bollen's avatar
Sander Bollen committed
689
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
690
    input:
Sander Bollen's avatar
Sander Bollen committed
691
692
        vcf="multisample/genotyped.vcf.gz",
        tbi = "multisample/genotyped.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
693
    output:
Sander Bollen's avatar
Sander Bollen committed
694
        stats="multisample/vcfstats.json"
Sander Bollen's avatar
Sander Bollen committed
695
    singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
Sander Bollen's avatar
Sander Bollen committed
696
    conda: "envs/vcfstats.yml"
Sander Bollen's avatar
Sander Bollen committed
697
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
698
699


Sander Bollen's avatar
Sander Bollen committed
700
## collection
701
702
703
704
if len(BASE_BEDS) >= 1:
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
Sander Bollen's avatar
Sander Bollen committed
705
706
707
708
709
710
711
            preqc="{sample}/pre_process/{sample}.preqc_count.json",
            postq="{sample}/pre_process/{sample}.postqc_count.json",
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
712
713
            cov=expand("{{sample}}/coverage/{bed}.covstats.json",
                       bed=BASE_BEDS),
714
715
716
717
718
            colpy=colpy
        params:
            sample_name="{sample}",
            fthresh=FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
719
            "{sample}/{sample}.stats.json"
Sander Bollen's avatar
Sander Bollen committed
720
        singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
721
722
723
724
725
        conda: "envs/collectstats.yml"
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
726
727
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc} {input.cov} > {output}"
728
729
else:
    rule collectstats:
Sander Bollen's avatar
Sander Bollen committed
730
        """Collect all stats for a particular sample without beds"""
Sander Bollen's avatar
Sander Bollen committed
731
        input:
Sander Bollen's avatar
Sander Bollen committed
732
733
734
735
736
737
738
            preqc = "{sample}/pre_process/{sample}.preqc_count.json",
            postq = "{sample}/pre_process/{sample}.postqc_count.json",
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
739
740
741
742
743
            colpy = colpy
        params:
            sample_name = "{sample}",
            fthresh = FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
744
            "{sample}/{sample}.stats.json"
Sander Bollen's avatar
Sander Bollen committed
745
        singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
Sander Bollen's avatar
Sander Bollen committed
746
747
748
749
750
        conda: "envs/collectstats.yml"
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
751
752
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc}  > {output}"
Sander Bollen's avatar
Sander Bollen committed
753
754

rule merge_stats:
Sander Bollen's avatar
Sander Bollen committed
755
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
756
    input:
Sander Bollen's avatar
Sander Bollen committed
757
758
        cols=expand("{sample}/{sample}.stats.json", sample=SAMPLES),
        vstat="multisample/vcfstats.json",
Sander Bollen's avatar
Sander Bollen committed
759
760
        mpy=mpy
    output:
Sander Bollen's avatar
Sander Bollen committed
761
        stats="stats.json"
762
    singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
Sander Bollen's avatar
Sander Bollen committed
763
    conda: "envs/collectstats.yml"
Sander Bollen's avatar
Sander Bollen committed
764
765
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
           "> {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
766
767


Sander Bollen's avatar
Sander Bollen committed
768
769
770
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
Sander Bollen's avatar
Sander Bollen committed
771
        stats="stats.json",
Sander Bollen's avatar
Sander Bollen committed
772
773
        sc=tsvpy
    output:
Sander Bollen's avatar
Sander Bollen committed
774
        stats="stats.tsv"
775
    singularity: "docker://python:3.6-slim"
Sander Bollen's avatar
Sander Bollen committed
776
777
778
779
    conda: "envs/collectstats.yml"
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
780
781
782
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
783
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
784
785
    """
    input:
Sander Bollen's avatar
Sander Bollen committed
786
        stats="stats.tsv"
Sander Bollen's avatar
Sander Bollen committed
787
    params:
Sander Bollen's avatar
Sander Bollen committed
788
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
789
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
790
    output:
Sander Bollen's avatar
Sander Bollen committed
791
        report="multiqc_report/multiqc_report.html"
Sander Bollen's avatar
Sander Bollen committed
792
    singularity: "docker://quay.io/biocontainers/multiqc:1.5--py36_0"
Sander Bollen's avatar
Sander Bollen committed
793
    conda: "envs/multiqc.yml"
794
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output.report}"