Snakefile 24.5 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
Sander Bollen's avatar
Sander Bollen committed
25
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
26
from os.path import join, basename
27
from pathlib import Path
28
import itertools
Sander Bollen's avatar
Sander Bollen committed
29
30

from pyfaidx import Fasta
Sander Bollen's avatar
Sander Bollen committed
31
32

REFERENCE = config.get("REFERENCE")
33
34
if REFERENCE is None:
    raise ValueError("You must set --config REFERENCE=<path>")
Sander Bollen's avatar
Sander Bollen committed
35
if not Path(REFERENCE).exists():
Sander Bollen's avatar
Sander Bollen committed
36
37
    raise FileNotFoundError("Reference file {0} "
                            "does not exist.".format(REFERENCE))
Sander Bollen's avatar
Sander Bollen committed
38
DBSNP = config.get("DBSNP")
39
40
if DBSNP is None:
    raise ValueError("You must set --config DBSNP=<path>")
Sander Bollen's avatar
Sander Bollen committed
41
42
if not Path(DBSNP).exists():
    raise FileNotFoundError("{0} does not exist".format(DBSNP))
43

44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
SCONFIG = config.get("SAMPLE_CONFIG")
if SCONFIG is None:
    raise ValueError("You must set --config SAMPLE_CONFIG=<path>")
if not Path(SCONFIG).exists():
    raise FileNotFoundError("{0} does not exist".format(SCONFIG))

KNOWN_SITES = config.get("KNOWN_SITES")
if KNOWN_SITES is None:
    raise ValueError("You must set --config KNOWN_SITES=<path>")

# The user can specify multiple known sites
KNOWN_SITES = KNOWN_SITES.split(',')
for filename in KNOWN_SITES:
    if not Path(filename).exists():
        raise FileNotFoundError("{0} does not exist".format(filename))

60
61

# these are all optional
62
63
BED = config.get("BED", "")  # comma-separated list of BED files
REFFLAT = config.get("REFFLAT", "")  # comma-separated list of refFlat files
Sander Bollen's avatar
Sander Bollen committed
64
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
Sander Bollen's avatar
Sander Bollen committed
65
MAX_BASES = config.get("MAX_BASES", "")
Sander Bollen's avatar
Sander Bollen committed
66

67
# Generate the input string for basrecalibration
68
69
70
BSQR_known_sites = ''
for argument, filename in zip(itertools.repeat('-knownSites'), KNOWN_SITES):
    BSQR_known_sites +=' {} {}'.format(argument, filename)
71

72
73
74
75
76
def fsrc_dir(*args):
    """Wrapper around snakemake's srcdir to work like os.path.join"""
    if len(args) == 1:
        return srcdir(args[0])
    return srcdir(join(*args))
Sander Bollen's avatar
Sander Bollen committed
77

78
79
80
covpy = fsrc_dir("src", "covstats.py")
colpy = fsrc_dir("src", "collect_stats.py")
mpy = fsrc_dir("src", "merge_stats.py")
Sander Bollen's avatar
Sander Bollen committed
81
seq = fsrc_dir("src", "seqtk.sh")
Sander Bollen's avatar
Sander Bollen committed
82
fqpy = fsrc_dir("src", "fastqc_stats.py")
Sander Bollen's avatar
Sander Bollen committed
83
tsvpy = fsrc_dir("src", "stats_to_tsv.py")
Sander Bollen's avatar
Sander Bollen committed
84
fqsc = fsrc_dir("src", "safe_fastqc.sh")
85
pywc = fsrc_dir("src", "pywc.py")
Sander Bollen's avatar
Sander Bollen committed
86

Sander Bollen's avatar
Sander Bollen committed
87
# sample config parsing
Sander Bollen's avatar
Sander Bollen committed
88
89
90
91
with open(config.get("SAMPLE_CONFIG")) as handle:
    SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()

Sander Bollen's avatar
Sander Bollen committed
92
93
94
95
96
97
98
99
100
if BED != "":
    BEDS = BED.split(",")
else:
    BEDS = []

if REFFLAT != "":
    REFFLATS = REFFLAT.split(",")
else:
    REFFLATS = []
Sander Bollen's avatar
Sander Bollen committed
101
102

BASE_BEDS = [basename(x) for x in BEDS]
Sander Bollen's avatar
Sander Bollen committed
103
BASE_REFFLATS = [basename(x) for x in REFFLATS]
Sander Bollen's avatar
Sander Bollen committed
104

105
106
107
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
van den Berg's avatar
van den Berg committed
108
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:1.14--py36_0",
    "debian": "docker://debian:buster-slim",
    "fastq-count": "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.5--py36_0",
    "picard-2.14": "docker://quay.io/biocontainers/picard:2.14--py36_0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "seqtk-1.2-jq-1.6": "docker://quay.io/biocontainers/mulled-v2-13686261ac0aa5682c680670ff8cda7b09637943:d143450dec169186731bb4df6f045a3c9ee08eb6-0",
    "sickle": "docker://quay.io/biocontainers/sickle-trim:1.33--ha92aebf_4",
    "tabix": "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
124

Sander Bollen's avatar
Sander Bollen committed
125
126
def get_r(strand, wildcards):
    """Get fastq files on a single strand for a sample"""
Sander Bollen's avatar
Sander Bollen committed
127
    s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
Sander Bollen's avatar
Sander Bollen committed
128
    rs = []
129
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
Sander Bollen committed
130
131
        rs.append(s['libraries'][l][strand])
    return rs
Sander Bollen's avatar
Sander Bollen committed
132

Sander Bollen's avatar
Sander Bollen committed
133
134
get_r1 = partial(get_r, "R1")
get_r2 = partial(get_r, "R2")
Sander Bollen's avatar
Sander Bollen committed
135
136


Sander Bollen's avatar
Sander Bollen committed
137
def get_bedpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
138
139
    """Get absolute path of a bed file"""
    return next(x for x in BEDS if basename(x) == wildcards.bed)
Sander Bollen's avatar
Sander Bollen committed
140
141
142


def get_refflatpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
143
    """Get absolute path of a refflat file"""
Sander Bollen's avatar
Sander Bollen committed
144
    return next(x for x in REFFLATS if basename(x) == wildcards.ref)
Sander Bollen's avatar
Sander Bollen committed
145
146


Sander Bollen's avatar
Sander Bollen committed
147
def sample_gender(wildcards):
Sander Bollen's avatar
Sander Bollen committed
148
    """Get sample gender"""
Sander Bollen's avatar
Sander Bollen committed
149
150
151
152
    sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
    return sam.get("gender", "null")


Sander Bollen's avatar
Sander Bollen committed
153
154
155
156
def metrics(do_metrics=True):
    if not do_metrics:
        return ""

Sander Bollen's avatar
Sander Bollen committed
157
158
159
160
161
162
    fqcr = expand("{sample}/pre_process/raw_fastqc/.done.txt",
                  sample=SAMPLES)
    fqcm = expand("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
                  sample=SAMPLES)
    fqcp = expand("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
                  sample=SAMPLES)
163
    if len(REFFLATS) >= 1:
Sander Bollen's avatar
Sander Bollen committed
164
165
        coverage_stats = expand("{sample}/coverage/{ref}.coverages.tsv",
                                sample=SAMPLES, ref=BASE_REFFLATS)
166
167
    else:
        coverage_stats = []
Sander Bollen's avatar
Sander Bollen committed
168
    stats = "stats.json"
Sander Bollen's avatar
Sander Bollen committed
169
    return  fqcr + fqcm + fqcp + coverage_stats + [stats]
Sander Bollen's avatar
Sander Bollen committed
170
171


Sander Bollen's avatar
Sander Bollen committed
172
173
rule all:
    input:
174
        report="multiqc_report/multiqc_report.html",
Sander Bollen's avatar
Sander Bollen committed
175
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai",sample=SAMPLES),
176
        vcfs=expand("{sample}/vcf/{sample}.vcf.gz",sample=SAMPLES),
177
        gvcfs=expand("{sample}/vcf/{sample}.g.vcf.gz",sample=SAMPLES),
178
        stats=metrics()
Sander Bollen's avatar
Sander Bollen committed
179

Sander Bollen's avatar
Sander Bollen committed
180
181
182

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
183
    output: directory("tmp")
184
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
185
186
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
187
rule genome:
Sander Bollen's avatar
Sander Bollen committed
188
    """Create genome file as used by bedtools"""
Sander Bollen's avatar
Sander Bollen committed
189
    input: REFERENCE
Sander Bollen's avatar
Sander Bollen committed
190
    output: "current.genome"
191
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
192
193
194
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
Sander Bollen's avatar
Sander Bollen committed
195
    """Merge all forward fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
196
    input: get_r1
Sander Bollen's avatar
Sander Bollen committed
197
    output: temp("{sample}/pre_process/{sample}.merged_R1.fastq.gz")
198
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
199
200
201
    shell: "cat {input} > {output}"

rule merge_r2:
Sander Bollen's avatar
Sander Bollen committed
202
    """Merge all reverse fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
203
    input: get_r2
Sander Bollen's avatar
Sander Bollen committed
204
    output: temp("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
205
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
206
207
    shell: "cat {input} > {output}"

Sander Bollen's avatar
Sander Bollen committed
208
209

rule seqtk_r1:
Sander Bollen's avatar
Sander Bollen committed
210
    """Either subsample or link forward fastq file"""
Sander Bollen's avatar
Sander Bollen committed
211
    input:
Sander Bollen's avatar
Sander Bollen committed
212
213
        stats="{sample}/pre_process/{sample}.preqc_count.json",
        fastq="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
214
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
215
    params:
Sander Bollen's avatar
Sander Bollen committed
216
        max_bases=str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
217
    output:
Sander Bollen's avatar
Sander Bollen committed
218
        fastq=temp("{sample}/pre_process/{sample}.sampled_R1.fastq.gz")
219
    singularity: containers["seqtk-1.2-jq-1.6"]
Sander Bollen's avatar
Sander Bollen committed
220
221
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
222
223
224


rule seqtk_r2:
Sander Bollen's avatar
Sander Bollen committed
225
    """Either subsample or link reverse fastq file"""
Sander Bollen's avatar
Sander Bollen committed
226
    input:
Sander Bollen's avatar
Sander Bollen committed
227
228
        stats = "{sample}/pre_process/{sample}.preqc_count.json",
        fastq = "{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
229
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
230
    params:
Sander Bollen's avatar
Sander Bollen committed
231
        max_bases =str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
232
    output:
Sander Bollen's avatar
Sander Bollen committed
233
        fastq = temp("{sample}/pre_process/{sample}.sampled_R2.fastq.gz")
234
    singularity: containers["seqtk-1.2-jq-1.6"]
Sander Bollen's avatar
Sander Bollen committed
235
236
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
237
238


239
# contains original merged fastq files as input to prevent them from being prematurely deleted
Sander Bollen's avatar
Sander Bollen committed
240
rule sickle:
Sander Bollen's avatar
Sander Bollen committed
241
    """Trim fastq files"""
Sander Bollen's avatar
Sander Bollen committed
242
    input:
Sander Bollen's avatar
Sander Bollen committed
243
244
245
246
        r1 = "{sample}/pre_process/{sample}.sampled_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}.sampled_R2.fastq.gz",
        rr1 = "{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        rr2 = "{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
247
    output:
Sander Bollen's avatar
Sander Bollen committed
248
249
        r1 = temp("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.trimmed_R2.fastq"),
Sander Bollen's avatar
Sander Bollen committed
250
        s = "{sample}/pre_process/{sample}.trimmed_singles.fastq"
251
    singularity: containers["sickle"]
Sander Bollen's avatar
Sander Bollen committed
252
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
253
254
255
           "-p {output.r2} -s {output.s}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
256
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
257
    input:
Sander Bollen's avatar
Sander Bollen committed
258
259
        r1 = "{sample}/pre_process/{sample}.trimmed_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.trimmed_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
260
    output:
Sander Bollen's avatar
Sander Bollen committed
261
262
        r1 = temp("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
263
    singularity: containers["cutadapt"]
Sander Bollen's avatar
Sander Bollen committed
264
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
265
266
267
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
Sander Bollen's avatar
Sander Bollen committed
268
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
269
    input:
Sander Bollen's avatar
Sander Bollen committed
270
271
        r1 = "{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
272
        ref = REFERENCE,
273
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
274
275
    params:
        rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
Sander Bollen's avatar
Sander Bollen committed
276
    output: temp("{sample}/bams/{sample}.sorted.bam")
277
    singularity: containers["bwa-0.7.17-picard-2.18.7"]
Sander Bollen's avatar
Sander Bollen committed
278
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
279
280
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
281
282
283
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

rule markdup:
Sander Bollen's avatar
Sander Bollen committed
284
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
285
    input:
Sander Bollen's avatar
Sander Bollen committed
286
        bam = "{sample}/bams/{sample}.sorted.bam",
Sander Bollen's avatar
Sander Bollen committed
287
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
288
    output:
Sander Bollen's avatar
Sander Bollen committed
289
290
291
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
292
    singularity: containers["picard-2.14"]
293
294
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
295
296
           "INPUT={input.bam} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
297
298
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
299
300
301
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
302
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
303
    output:
Sander Bollen's avatar
Sander Bollen committed
304
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
305
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
306
307
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
308
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
309
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
310
    input:
Sander Bollen's avatar
Sander Bollen committed
311
        bam = "{sample}/bams/{sample}.markdup.bam",
Sander Bollen's avatar
Sander Bollen committed
312
313
        ref = REFERENCE,
    output:
Sander Bollen's avatar
Sander Bollen committed
314
        grp = "{sample}/bams/{sample}.baserecal.grp"
315
    params:
316
        known_sites = BSQR_known_sites
317
    singularity: containers["gatk"]
318
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
319
320
           "BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
321
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites}"
Sander Bollen's avatar
Sander Bollen committed
322

323
checkpoint scatterregions:
van den Berg's avatar
van den Berg committed
324
325
326
327
    """Scatter the reference genome"""
    input:
        ref = REFERENCE,
    output:
328
        directory("scatter")
van den Berg's avatar
van den Berg committed
329
    singularity: containers["biopet-scatterregions"]
330
    shell: "mkdir -p {output} && "
van den Berg's avatar
van den Berg committed
331
           "biopet-scatterregions "
332
           "--referenceFasta {input.ref} --scatterSize 1000000000 "
van den Berg's avatar
van den Berg committed
333
334
           "--outputDir scatter"

Sander Bollen's avatar
Sander Bollen committed
335
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
336
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
337
    input:
Sander Bollen's avatar
Sander Bollen committed
338
339
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
Sander Bollen's avatar
Sander Bollen committed
340
        dbsnp=DBSNP,
Sander Bollen's avatar
Sander Bollen committed
341
        ref=REFERENCE,
van den Berg's avatar
van den Berg committed
342
        region="scatter/scatter-{chunk}.bed"
Sander Bollen's avatar
Sander Bollen committed
343
    output:
344
345
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
346
347
    wildcard_constraints:
        chunk="[0-9]+"
348
    singularity: containers["gatk"]
349
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
350
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
351
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
van den Berg's avatar
van den Berg committed
352
           "-L '{input.region}' -o '{output.gvcf}' "
Sander Bollen's avatar
Sander Bollen committed
353
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
Sander Bollen's avatar
Sander Bollen committed
354
           "-BQSR {input.bqsr}"
Sander Bollen's avatar
Sander Bollen committed
355

356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
def aggregate_gvcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_gvcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule gvcf_gather:
    """ Join the gvcf files together """
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz"
    singularity: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps --output {output.gvcf} "
           "--output-type z"
Sander Bollen's avatar
Sander Bollen committed
376
377

rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
378
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
379
    input:
380
381
382
        gvcf = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz",
        tbi = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi",
        ref=REFERENCE
Sander Bollen's avatar
Sander Bollen committed
383
    output:
van den Berg's avatar
van den Berg committed
384
385
386
387
        vcf="{sample}/vcf/{sample}.{chunk}.vcf.gz",
        vcf_tbi="{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi"
    wildcard_constraints:
        chunk="[0-9]+"
388
    singularity: containers["gatk"]
389
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
390
           "GenotypeGVCFs -R {input.ref} "
391
           "-V {input.gvcf} -o '{output.vcf}'"
392
393


394
def aggregate_vcf(wildcards):
395
396
397
398
399
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


Sander Bollen's avatar
Sander Bollen committed
400
rule genotype_gather:
401
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
402
    input:
403
        vcfs = aggregate_vcf
Sander Bollen's avatar
Sander Bollen committed
404
    output:
405
        vcf = "{sample}/vcf/{sample}.vcf.gz"
406
    singularity: containers["bcftools"]
407
408
    shell: "bcftools concat {input.vcfs} --allow-overlaps --output {output.vcf} "
           "--output-type z"
409
410
411
412
413


rule genotype_gather_tbi:
    """Index genotyped vcf file"""
    input:
414
        vcf = "{sample}/vcf/{sample}.vcf.gz"
415
    output:
416
        tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
417
    singularity: containers["tabix"]
418
    shell: "tabix -pvcf {input.vcf}"
Sander Bollen's avatar
Sander Bollen committed
419
420


Sander Bollen's avatar
Sander Bollen committed
421
## bam metrics
422
rule mapped_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
423
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
424
    input:
425
426
        bam="{sample}/bams/{sample}.sorted.bam",
        pywc=pywc
Sander Bollen's avatar
Sander Bollen committed
427
    output:
428
429
        reads="{sample}/bams/{sample}.mapped.num",
        bases="{sample}/bams/{sample}.mapped.basenum"
430
    singularity: containers["samtools-1.7-python-3.6"]
431
432
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
433
434


435
rule unique_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
436
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
437
    input:
438
439
        bam="{sample}/bams/{sample}.markdup.bam",
        pywc=pywc
Sander Bollen's avatar
Sander Bollen committed
440
    output:
441
442
        reads="{sample}/bams/{sample}.unique.num",
        bases="{sample}/bams/{sample}.usable.basenum"
443
    singularity: containers["samtools-1.7-python-3.6"]
444
445
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
446
447
448


## fastqc
Sander Bollen's avatar
Sander Bollen committed
449
rule fastqc_raw:
450
451
    """
    Run fastqc on raw fastq files
452
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
453
454
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
455
    input:
Sander Bollen's avatar
Sander Bollen committed
456
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
457
458
        r2=get_r2
    params:
Sander Bollen's avatar
Sander Bollen committed
459
        odir="{sample}/pre_process/raw_fastqc"
Sander Bollen's avatar
Sander Bollen committed
460
    output:
Sander Bollen's avatar
Sander Bollen committed
461
        aux="{sample}/pre_process/raw_fastqc/.done.txt"
462
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
463
    shell: "fastqc --threads 4 --nogroup -o {params.odir} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
464
           "&& echo 'done' > {output.aux}"
Sander Bollen's avatar
Sander Bollen committed
465
466


Sander Bollen's avatar
Sander Bollen committed
467
rule fastqc_merged:
468
    """
469
470
    Run fastqc on merged fastq files
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
471
472
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
473
    input:
Sander Bollen's avatar
Sander Bollen committed
474
475
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
476
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
477
    params:
Sander Bollen's avatar
Sander Bollen committed
478
        odir="{sample}/pre_process/merged_fastqc"
Sander Bollen's avatar
Sander Bollen committed
479
    output:
Sander Bollen's avatar
Sander Bollen committed
480
481
        r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip"
482
    singularity: containers["fastqc"]
Sander Bollen's avatar
Sander Bollen committed
483
484
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
485
486


Sander Bollen's avatar
Sander Bollen committed
487
rule fastqc_postqc:
488
489
    """
    Run fastqc on fastq files post pre-processing
490
491
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
    perl missing in the container of 0.11.5
492
    """
Sander Bollen's avatar
Sander Bollen committed
493
    input:
Sander Bollen's avatar
Sander Bollen committed
494
495
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
496
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
497
    params:
Sander Bollen's avatar
Sander Bollen committed
498
        odir="{sample}/pre_process/postqc_fastqc"
Sander Bollen's avatar
Sander Bollen committed
499
    output:
Sander Bollen's avatar
Sander Bollen committed
500
501
        r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip"
502
    singularity: containers["fastqc"]
Sander Bollen's avatar
Sander Bollen committed
503
504
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
505
506
507
508
509


## fastq-count

rule fqcount_preqc:
Sander Bollen's avatar
Sander Bollen committed
510
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
511
    input:
Sander Bollen's avatar
Sander Bollen committed
512
513
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
514
    output:
Sander Bollen's avatar
Sander Bollen committed
515
        "{sample}/pre_process/{sample}.preqc_count.json"
516
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
517
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
518
519
520


rule fqcount_postqc:
Sander Bollen's avatar
Sander Bollen committed
521
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
522
    input:
Sander Bollen's avatar
Sander Bollen committed
523
524
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
525
    output:
Sander Bollen's avatar
Sander Bollen committed
526
        "{sample}/pre_process/{sample}.postqc_count.json"
527
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
528
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
529
530


Sander Bollen's avatar
Sander Bollen committed
531
532
533
534
# fastqc stats
rule fastqc_stats:
    """Collect fastq stats for a sample in json format"""
    input:
Sander Bollen's avatar
Sander Bollen committed
535
536
537
538
        preqc_r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        preqc_r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip",
        postqc_r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        postqc_r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip",
Sander Bollen's avatar
Sander Bollen committed
539
        sc=fqpy
540
    singularity: containers["python3"]
Sander Bollen's avatar
Sander Bollen committed
541
    output:
Sander Bollen's avatar
Sander Bollen committed
542
        "{sample}/pre_process/fastq_stats.json"
543
544
545
    shell: "python {input.sc} --preqc-r1 {input.preqc_r1} "
           "--preqc-r2 {input.preqc_r2} "
           "--postqc-r1 {input.postqc_r1} "
Sander Bollen's avatar
Sander Bollen committed
546
           "--postqc-r2 {input.postqc_r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
547

Sander Bollen's avatar
Sander Bollen committed
548
549
550
## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
551
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
552
    input:
Sander Bollen's avatar
Sander Bollen committed
553
554
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
Sander Bollen's avatar
Sander Bollen committed
555
556
557
558
559
        covpy=covpy,
        bed=get_bedpath
    params:
        subt="Sample {sample}"
    output:
Sander Bollen's avatar
Sander Bollen committed
560
561
        covj="{sample}/coverage/{bed}.covstats.json",
        covp="{sample}/coverage/{bed}.covstats.png"
562
    singularity: containers["bedtools-2.26-python-2.7"]
Sander Bollen's avatar
Sander Bollen committed
563
564
565
566
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
567
568


Sander Bollen's avatar
Sander Bollen committed
569
570
571
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
572
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
573
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
Sander Bollen's avatar
Sander Bollen committed
574
575
        ref=get_refflatpath
    output:
Sander Bollen's avatar
Sander Bollen committed
576
        tsv="{sample}/coverage/{ref}.coverages.tsv"
577
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
578
579
580
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
581
582
583
## vcfstats

rule vcfstats:
Sander Bollen's avatar
Sander Bollen committed
584
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
585
    input:
586
        vcf="{sample}/vcf/{sample}.vcf.gz",
587
        tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
588
    output:
589
        stats="{sampel}/vcf/{sample}.vcfstats.json"
590
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
591
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
592
593


Sander Bollen's avatar
Sander Bollen committed
594
## collection
595
596
597
598
if len(BASE_BEDS) >= 1:
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
Sander Bollen's avatar
Sander Bollen committed
599
600
601
602
603
604
605
            preqc="{sample}/pre_process/{sample}.preqc_count.json",
            postq="{sample}/pre_process/{sample}.postqc_count.json",
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
606
607
            cov=expand("{{sample}}/coverage/{bed}.covstats.json",
                       bed=BASE_BEDS),
608
609
610
611
612
            colpy=colpy
        params:
            sample_name="{sample}",
            fthresh=FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
613
            "{sample}/{sample}.stats.json"
614
        singularity: containers["vtools"]
615
616
617
618
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
619
620
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc} {input.cov} > {output}"
621
622
else:
    rule collectstats:
Sander Bollen's avatar
Sander Bollen committed
623
        """Collect all stats for a particular sample without beds"""
Sander Bollen's avatar
Sander Bollen committed
624
        input:
Sander Bollen's avatar
Sander Bollen committed
625
626
627
628
629
630
631
            preqc = "{sample}/pre_process/{sample}.preqc_count.json",
            postq = "{sample}/pre_process/{sample}.postqc_count.json",
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
632
633
634
635
636
            colpy = colpy
        params:
            sample_name = "{sample}",
            fthresh = FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
637
            "{sample}/{sample}.stats.json"
638
        singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
639
640
641
642
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
643
644
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc}  > {output}"
Sander Bollen's avatar
Sander Bollen committed
645
646

rule merge_stats:
Sander Bollen's avatar
Sander Bollen committed
647
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
648
    input:
Sander Bollen's avatar
Sander Bollen committed
649
        cols=expand("{sample}/{sample}.stats.json", sample=SAMPLES),
650
        vstat=expand("{sample}/vcf/{sample}.vcfstats.json", sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
651
652
        mpy=mpy
    output:
Sander Bollen's avatar
Sander Bollen committed
653
        stats="stats.json"
654
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
655
656
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
           "> {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
657
658


Sander Bollen's avatar
Sander Bollen committed
659
660
661
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
Sander Bollen's avatar
Sander Bollen committed
662
        stats="stats.json",
Sander Bollen's avatar
Sander Bollen committed
663
664
        sc=tsvpy
    output:
Sander Bollen's avatar
Sander Bollen committed
665
        stats="stats.tsv"
666
    singularity: containers["python3"]
Sander Bollen's avatar
Sander Bollen committed
667
668
669
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
670
671
672
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
673
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
674
675
    """
    input:
Sander Bollen's avatar
Sander Bollen committed
676
        stats="stats.tsv"
Sander Bollen's avatar
Sander Bollen committed
677
    params:
Sander Bollen's avatar
Sander Bollen committed
678
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
679
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
680
    output:
Sander Bollen's avatar
Sander Bollen committed
681
        report="multiqc_report/multiqc_report.html"
682
    singularity: containers["multiqc"]
683
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output.report}"