Snakefile 25.1 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
Sander Bollen's avatar
Sander Bollen committed
25
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
26
from os.path import join, basename
27
from pathlib import Path
28
import itertools
Sander Bollen's avatar
Sander Bollen committed
29

Sander Bollen's avatar
Sander Bollen committed
30
REFERENCE = config.get("REFERENCE")
31
32
if REFERENCE is None:
    raise ValueError("You must set --config REFERENCE=<path>")
Sander Bollen's avatar
Sander Bollen committed
33
if not Path(REFERENCE).exists():
Sander Bollen's avatar
Sander Bollen committed
34
35
    raise FileNotFoundError("Reference file {0} "
                            "does not exist.".format(REFERENCE))
Sander Bollen's avatar
Sander Bollen committed
36
DBSNP = config.get("DBSNP")
37
38
if DBSNP is None:
    raise ValueError("You must set --config DBSNP=<path>")
Sander Bollen's avatar
Sander Bollen committed
39
40
if not Path(DBSNP).exists():
    raise FileNotFoundError("{0} does not exist".format(DBSNP))
41

42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
SCONFIG = config.get("SAMPLE_CONFIG")
if SCONFIG is None:
    raise ValueError("You must set --config SAMPLE_CONFIG=<path>")
if not Path(SCONFIG).exists():
    raise FileNotFoundError("{0} does not exist".format(SCONFIG))

KNOWN_SITES = config.get("KNOWN_SITES")
if KNOWN_SITES is None:
    raise ValueError("You must set --config KNOWN_SITES=<path>")

# The user can specify multiple known sites
KNOWN_SITES = KNOWN_SITES.split(',')
for filename in KNOWN_SITES:
    if not Path(filename).exists():
        raise FileNotFoundError("{0} does not exist".format(filename))

58
59
60
61
62
63
64
65
# Set default values
def set_default(key, value):
    """ Set default config values """
    if key not in config:
        config[key] = value

# Set the default scatter size to 1 billion
set_default('SCATTER_SIZE', 1000000000)
66
67

# these are all optional
68
69
BED = config.get("BED", "")  # comma-separated list of BED files
REFFLAT = config.get("REFFLAT", "")  # comma-separated list of refFlat files
Sander Bollen's avatar
Sander Bollen committed
70
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
Sander Bollen's avatar
Sander Bollen committed
71
MAX_BASES = config.get("MAX_BASES", "")
Sander Bollen's avatar
Sander Bollen committed
72

73
# Generate the input string for basrecalibration
74
75
76
BSQR_known_sites = ''
for argument, filename in zip(itertools.repeat('-knownSites'), KNOWN_SITES):
    BSQR_known_sites +=' {} {}'.format(argument, filename)
77

78
79
80
81
82
def fsrc_dir(*args):
    """Wrapper around snakemake's srcdir to work like os.path.join"""
    if len(args) == 1:
        return srcdir(args[0])
    return srcdir(join(*args))
Sander Bollen's avatar
Sander Bollen committed
83

84
85
86
covpy = fsrc_dir("src", "covstats.py")
colpy = fsrc_dir("src", "collect_stats.py")
mpy = fsrc_dir("src", "merge_stats.py")
Sander Bollen's avatar
Sander Bollen committed
87
seq = fsrc_dir("src", "seqtk.sh")
Sander Bollen's avatar
Sander Bollen committed
88
fqpy = fsrc_dir("src", "fastqc_stats.py")
Sander Bollen's avatar
Sander Bollen committed
89
tsvpy = fsrc_dir("src", "stats_to_tsv.py")
Sander Bollen's avatar
Sander Bollen committed
90
fqsc = fsrc_dir("src", "safe_fastqc.sh")
91
pywc = fsrc_dir("src", "pywc.py")
Sander Bollen's avatar
Sander Bollen committed
92

Sander Bollen's avatar
Sander Bollen committed
93
# sample config parsing
Sander Bollen's avatar
Sander Bollen committed
94
95
96
97
with open(config.get("SAMPLE_CONFIG")) as handle:
    SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()

Sander Bollen's avatar
Sander Bollen committed
98
99
100
101
102
103
104
105
106
if BED != "":
    BEDS = BED.split(",")
else:
    BEDS = []

if REFFLAT != "":
    REFFLATS = REFFLAT.split(",")
else:
    REFFLATS = []
Sander Bollen's avatar
Sander Bollen committed
107
108

BASE_BEDS = [basename(x) for x in BEDS]
Sander Bollen's avatar
Sander Bollen committed
109
BASE_REFFLATS = [basename(x) for x in REFFLATS]
Sander Bollen's avatar
Sander Bollen committed
110

111
112
113
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
van den Berg's avatar
van den Berg committed
114
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:1.14--py36_0",
    "debian": "docker://debian:buster-slim",
    "fastq-count": "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.5--py36_0",
    "picard-2.14": "docker://quay.io/biocontainers/picard:2.14--py36_0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "seqtk-1.2-jq-1.6": "docker://quay.io/biocontainers/mulled-v2-13686261ac0aa5682c680670ff8cda7b09637943:d143450dec169186731bb4df6f045a3c9ee08eb6-0",
    "sickle": "docker://quay.io/biocontainers/sickle-trim:1.33--ha92aebf_4",
    "tabix": "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
130

Sander Bollen's avatar
Sander Bollen committed
131
132
def get_r(strand, wildcards):
    """Get fastq files on a single strand for a sample"""
Sander Bollen's avatar
Sander Bollen committed
133
    s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
Sander Bollen's avatar
Sander Bollen committed
134
    rs = []
135
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
Sander Bollen committed
136
137
        rs.append(s['libraries'][l][strand])
    return rs
Sander Bollen's avatar
Sander Bollen committed
138

Sander Bollen's avatar
Sander Bollen committed
139
140
get_r1 = partial(get_r, "R1")
get_r2 = partial(get_r, "R2")
Sander Bollen's avatar
Sander Bollen committed
141
142


Sander Bollen's avatar
Sander Bollen committed
143
def get_bedpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
144
145
    """Get absolute path of a bed file"""
    return next(x for x in BEDS if basename(x) == wildcards.bed)
Sander Bollen's avatar
Sander Bollen committed
146
147
148


def get_refflatpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
149
    """Get absolute path of a refflat file"""
Sander Bollen's avatar
Sander Bollen committed
150
    return next(x for x in REFFLATS if basename(x) == wildcards.ref)
Sander Bollen's avatar
Sander Bollen committed
151
152


Sander Bollen's avatar
Sander Bollen committed
153
def sample_gender(wildcards):
Sander Bollen's avatar
Sander Bollen committed
154
    """Get sample gender"""
Sander Bollen's avatar
Sander Bollen committed
155
156
157
158
    sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
    return sam.get("gender", "null")


Sander Bollen's avatar
Sander Bollen committed
159
160
161
162
def metrics(do_metrics=True):
    if not do_metrics:
        return ""

Sander Bollen's avatar
Sander Bollen committed
163
164
165
166
167
168
    fqcr = expand("{sample}/pre_process/raw_fastqc/.done.txt",
                  sample=SAMPLES)
    fqcm = expand("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
                  sample=SAMPLES)
    fqcp = expand("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
                  sample=SAMPLES)
169
    if len(REFFLATS) >= 1:
Sander Bollen's avatar
Sander Bollen committed
170
171
        coverage_stats = expand("{sample}/coverage/{ref}.coverages.tsv",
                                sample=SAMPLES, ref=BASE_REFFLATS)
172
173
    else:
        coverage_stats = []
Sander Bollen's avatar
Sander Bollen committed
174
    stats = "stats.json"
Sander Bollen's avatar
Sander Bollen committed
175
    return  fqcr + fqcm + fqcp + coverage_stats + [stats]
Sander Bollen's avatar
Sander Bollen committed
176
177


Sander Bollen's avatar
Sander Bollen committed
178
179
rule all:
    input:
180
        report="multiqc_report/multiqc_report.html",
Sander Bollen's avatar
Sander Bollen committed
181
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai",sample=SAMPLES),
182
        vcfs=expand("{sample}/vcf/{sample}.vcf.gz",sample=SAMPLES),
183
        gvcfs=expand("{sample}/vcf/{sample}.g.vcf.gz",sample=SAMPLES),
184
        stats=metrics()
Sander Bollen's avatar
Sander Bollen committed
185

Sander Bollen's avatar
Sander Bollen committed
186
187
188

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
189
    output: directory("tmp")
190
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
191
192
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
193
rule genome:
Sander Bollen's avatar
Sander Bollen committed
194
    """Create genome file as used by bedtools"""
Sander Bollen's avatar
Sander Bollen committed
195
    input: REFERENCE
Sander Bollen's avatar
Sander Bollen committed
196
    output: "current.genome"
197
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
198
199
200
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
Sander Bollen's avatar
Sander Bollen committed
201
    """Merge all forward fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
202
    input: get_r1
Sander Bollen's avatar
Sander Bollen committed
203
    output: temp("{sample}/pre_process/{sample}.merged_R1.fastq.gz")
204
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
205
206
207
    shell: "cat {input} > {output}"

rule merge_r2:
Sander Bollen's avatar
Sander Bollen committed
208
    """Merge all reverse fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
209
    input: get_r2
Sander Bollen's avatar
Sander Bollen committed
210
    output: temp("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
211
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
212
213
    shell: "cat {input} > {output}"

Sander Bollen's avatar
Sander Bollen committed
214
215

rule seqtk_r1:
Sander Bollen's avatar
Sander Bollen committed
216
    """Either subsample or link forward fastq file"""
Sander Bollen's avatar
Sander Bollen committed
217
    input:
Sander Bollen's avatar
Sander Bollen committed
218
219
        stats="{sample}/pre_process/{sample}.preqc_count.json",
        fastq="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
220
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
221
    params:
Sander Bollen's avatar
Sander Bollen committed
222
        max_bases=str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
223
    output:
Sander Bollen's avatar
Sander Bollen committed
224
        fastq=temp("{sample}/pre_process/{sample}.sampled_R1.fastq.gz")
225
    singularity: containers["seqtk-1.2-jq-1.6"]
Sander Bollen's avatar
Sander Bollen committed
226
227
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
228
229
230


rule seqtk_r2:
Sander Bollen's avatar
Sander Bollen committed
231
    """Either subsample or link reverse fastq file"""
Sander Bollen's avatar
Sander Bollen committed
232
    input:
Sander Bollen's avatar
Sander Bollen committed
233
234
        stats = "{sample}/pre_process/{sample}.preqc_count.json",
        fastq = "{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
235
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
236
    params:
Sander Bollen's avatar
Sander Bollen committed
237
        max_bases =str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
238
    output:
Sander Bollen's avatar
Sander Bollen committed
239
        fastq = temp("{sample}/pre_process/{sample}.sampled_R2.fastq.gz")
240
    singularity: containers["seqtk-1.2-jq-1.6"]
Sander Bollen's avatar
Sander Bollen committed
241
242
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
243
244


245
# contains original merged fastq files as input to prevent them from being prematurely deleted
Sander Bollen's avatar
Sander Bollen committed
246
rule sickle:
Sander Bollen's avatar
Sander Bollen committed
247
    """Trim fastq files"""
Sander Bollen's avatar
Sander Bollen committed
248
    input:
Sander Bollen's avatar
Sander Bollen committed
249
250
251
252
        r1 = "{sample}/pre_process/{sample}.sampled_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}.sampled_R2.fastq.gz",
        rr1 = "{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        rr2 = "{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
253
    output:
Sander Bollen's avatar
Sander Bollen committed
254
255
        r1 = temp("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.trimmed_R2.fastq"),
Sander Bollen's avatar
Sander Bollen committed
256
        s = "{sample}/pre_process/{sample}.trimmed_singles.fastq"
257
    singularity: containers["sickle"]
Sander Bollen's avatar
Sander Bollen committed
258
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
259
260
261
           "-p {output.r2} -s {output.s}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
262
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
263
    input:
Sander Bollen's avatar
Sander Bollen committed
264
265
        r1 = "{sample}/pre_process/{sample}.trimmed_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.trimmed_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
266
    output:
Sander Bollen's avatar
Sander Bollen committed
267
268
        r1 = temp("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
269
    singularity: containers["cutadapt"]
Sander Bollen's avatar
Sander Bollen committed
270
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
271
272
273
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
Sander Bollen's avatar
Sander Bollen committed
274
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
275
    input:
Sander Bollen's avatar
Sander Bollen committed
276
277
        r1 = "{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
278
        ref = REFERENCE,
279
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
280
281
    params:
        rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
Sander Bollen's avatar
Sander Bollen committed
282
    output: temp("{sample}/bams/{sample}.sorted.bam")
283
    singularity: containers["bwa-0.7.17-picard-2.18.7"]
Sander Bollen's avatar
Sander Bollen committed
284
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
285
286
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
287
288
289
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

rule markdup:
Sander Bollen's avatar
Sander Bollen committed
290
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
291
    input:
Sander Bollen's avatar
Sander Bollen committed
292
        bam = "{sample}/bams/{sample}.sorted.bam",
Sander Bollen's avatar
Sander Bollen committed
293
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
294
    output:
Sander Bollen's avatar
Sander Bollen committed
295
296
297
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
298
    singularity: containers["picard-2.14"]
299
300
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
301
302
           "INPUT={input.bam} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
303
304
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
305
306
307
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
308
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
309
    output:
Sander Bollen's avatar
Sander Bollen committed
310
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
311
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
312
313
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
314
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
315
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
316
    input:
Sander Bollen's avatar
Sander Bollen committed
317
        bam = "{sample}/bams/{sample}.markdup.bam",
Sander Bollen's avatar
Sander Bollen committed
318
319
        ref = REFERENCE,
    output:
Sander Bollen's avatar
Sander Bollen committed
320
        grp = "{sample}/bams/{sample}.baserecal.grp"
321
    params:
322
        known_sites = BSQR_known_sites
323
    singularity: containers["gatk"]
324
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
325
326
           "BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
327
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites}"
Sander Bollen's avatar
Sander Bollen committed
328

329
checkpoint scatterregions:
van den Berg's avatar
van den Berg committed
330
331
332
    """Scatter the reference genome"""
    input:
        ref = REFERENCE,
333
334
    params:
        size = config['SCATTER_SIZE']
van den Berg's avatar
van den Berg committed
335
    output:
336
        directory("scatter")
van den Berg's avatar
van den Berg committed
337
    singularity: containers["biopet-scatterregions"]
338
    shell: "mkdir -p {output} && "
van den Berg's avatar
van den Berg committed
339
           "biopet-scatterregions "
340
           "--referenceFasta {input.ref} --scatterSize {params.size} "
van den Berg's avatar
van den Berg committed
341
342
           "--outputDir scatter"

Sander Bollen's avatar
Sander Bollen committed
343
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
344
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
345
    input:
Sander Bollen's avatar
Sander Bollen committed
346
347
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
Sander Bollen's avatar
Sander Bollen committed
348
        dbsnp=DBSNP,
Sander Bollen's avatar
Sander Bollen committed
349
        ref=REFERENCE,
van den Berg's avatar
van den Berg committed
350
        region="scatter/scatter-{chunk}.bed"
Sander Bollen's avatar
Sander Bollen committed
351
    output:
352
353
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
354
355
    wildcard_constraints:
        chunk="[0-9]+"
356
    singularity: containers["gatk"]
357
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
358
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
359
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
van den Berg's avatar
van den Berg committed
360
           "-L '{input.region}' -o '{output.gvcf}' "
Sander Bollen's avatar
Sander Bollen committed
361
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
Sander Bollen's avatar
Sander Bollen committed
362
           "-BQSR {input.bqsr}"
Sander Bollen's avatar
Sander Bollen committed
363

364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
def aggregate_gvcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_gvcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule gvcf_gather:
    """ Join the gvcf files together """
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz"
    singularity: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps --output {output.gvcf} "
           "--output-type z"
Sander Bollen's avatar
Sander Bollen committed
384
385

rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
386
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
387
    input:
388
389
390
        gvcf = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz",
        tbi = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi",
        ref=REFERENCE
Sander Bollen's avatar
Sander Bollen committed
391
    output:
392
393
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
394
395
    wildcard_constraints:
        chunk="[0-9]+"
396
    singularity: containers["gatk"]
397
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
398
           "GenotypeGVCFs -R {input.ref} "
399
           "-V {input.gvcf} -o '{output.vcf}'"
400
401


402
def aggregate_vcf(wildcards):
403
404
405
406
407
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


408
409
410
411
412
413
def aggregate_vcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


Sander Bollen's avatar
Sander Bollen committed
414
rule genotype_gather:
415
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
416
    input:
417
418
        vcfs = aggregate_vcf,
        tbi = aggregate_vcf_tbi
Sander Bollen's avatar
Sander Bollen committed
419
    output:
420
        vcf = "{sample}/vcf/{sample}.vcf.gz"
421
    singularity: containers["bcftools"]
422
423
    shell: "bcftools concat {input.vcfs} --allow-overlaps --output {output.vcf} "
           "--output-type z"
424
425
426
427
428


rule genotype_gather_tbi:
    """Index genotyped vcf file"""
    input:
429
        vcf = "{sample}/vcf/{sample}.vcf.gz"
430
    output:
431
        tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
432
    singularity: containers["tabix"]
433
    shell: "tabix -pvcf {input.vcf}"
Sander Bollen's avatar
Sander Bollen committed
434
435


Sander Bollen's avatar
Sander Bollen committed
436
## bam metrics
437
rule mapped_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
438
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
439
    input:
440
441
        bam="{sample}/bams/{sample}.sorted.bam",
        pywc=pywc
Sander Bollen's avatar
Sander Bollen committed
442
    output:
443
444
        reads="{sample}/bams/{sample}.mapped.num",
        bases="{sample}/bams/{sample}.mapped.basenum"
445
    singularity: containers["samtools-1.7-python-3.6"]
446
447
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
448
449


450
rule unique_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
451
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
452
    input:
453
454
        bam="{sample}/bams/{sample}.markdup.bam",
        pywc=pywc
Sander Bollen's avatar
Sander Bollen committed
455
    output:
456
457
        reads="{sample}/bams/{sample}.unique.num",
        bases="{sample}/bams/{sample}.usable.basenum"
458
    singularity: containers["samtools-1.7-python-3.6"]
459
460
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
461
462
463


## fastqc
Sander Bollen's avatar
Sander Bollen committed
464
rule fastqc_raw:
465
466
    """
    Run fastqc on raw fastq files
467
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
468
469
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
470
    input:
Sander Bollen's avatar
Sander Bollen committed
471
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
472
473
        r2=get_r2
    params:
Sander Bollen's avatar
Sander Bollen committed
474
        odir="{sample}/pre_process/raw_fastqc"
Sander Bollen's avatar
Sander Bollen committed
475
    output:
Sander Bollen's avatar
Sander Bollen committed
476
        aux="{sample}/pre_process/raw_fastqc/.done.txt"
477
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
478
    shell: "fastqc --threads 4 --nogroup -o {params.odir} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
479
           "&& echo 'done' > {output.aux}"
Sander Bollen's avatar
Sander Bollen committed
480
481


Sander Bollen's avatar
Sander Bollen committed
482
rule fastqc_merged:
483
    """
484
485
    Run fastqc on merged fastq files
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
486
487
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
488
    input:
Sander Bollen's avatar
Sander Bollen committed
489
490
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
491
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
492
    params:
Sander Bollen's avatar
Sander Bollen committed
493
        odir="{sample}/pre_process/merged_fastqc"
Sander Bollen's avatar
Sander Bollen committed
494
    output:
Sander Bollen's avatar
Sander Bollen committed
495
496
        r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip"
497
    singularity: containers["fastqc"]
Sander Bollen's avatar
Sander Bollen committed
498
499
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
500
501


Sander Bollen's avatar
Sander Bollen committed
502
rule fastqc_postqc:
503
504
    """
    Run fastqc on fastq files post pre-processing
505
506
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
    perl missing in the container of 0.11.5
507
    """
Sander Bollen's avatar
Sander Bollen committed
508
    input:
Sander Bollen's avatar
Sander Bollen committed
509
510
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
511
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
512
    params:
Sander Bollen's avatar
Sander Bollen committed
513
        odir="{sample}/pre_process/postqc_fastqc"
Sander Bollen's avatar
Sander Bollen committed
514
    output:
Sander Bollen's avatar
Sander Bollen committed
515
516
        r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip"
517
    singularity: containers["fastqc"]
Sander Bollen's avatar
Sander Bollen committed
518
519
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
520
521
522
523
524


## fastq-count

rule fqcount_preqc:
Sander Bollen's avatar
Sander Bollen committed
525
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
526
    input:
Sander Bollen's avatar
Sander Bollen committed
527
528
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
529
    output:
Sander Bollen's avatar
Sander Bollen committed
530
        "{sample}/pre_process/{sample}.preqc_count.json"
531
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
532
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
533
534
535


rule fqcount_postqc:
Sander Bollen's avatar
Sander Bollen committed
536
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
537
    input:
Sander Bollen's avatar
Sander Bollen committed
538
539
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
540
    output:
Sander Bollen's avatar
Sander Bollen committed
541
        "{sample}/pre_process/{sample}.postqc_count.json"
542
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
543
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
544
545


Sander Bollen's avatar
Sander Bollen committed
546
547
548
549
# fastqc stats
rule fastqc_stats:
    """Collect fastq stats for a sample in json format"""
    input:
Sander Bollen's avatar
Sander Bollen committed
550
551
552
553
        preqc_r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        preqc_r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip",
        postqc_r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        postqc_r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip",
Sander Bollen's avatar
Sander Bollen committed
554
        sc=fqpy
555
    singularity: containers["python3"]
Sander Bollen's avatar
Sander Bollen committed
556
    output:
Sander Bollen's avatar
Sander Bollen committed
557
        "{sample}/pre_process/fastq_stats.json"
558
559
560
    shell: "python {input.sc} --preqc-r1 {input.preqc_r1} "
           "--preqc-r2 {input.preqc_r2} "
           "--postqc-r1 {input.postqc_r1} "
Sander Bollen's avatar
Sander Bollen committed
561
           "--postqc-r2 {input.postqc_r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
562

Sander Bollen's avatar
Sander Bollen committed
563
564
565
## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
566
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
567
    input:
Sander Bollen's avatar
Sander Bollen committed
568
569
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
Sander Bollen's avatar
Sander Bollen committed
570
571
572
573
574
        covpy=covpy,
        bed=get_bedpath
    params:
        subt="Sample {sample}"
    output:
Sander Bollen's avatar
Sander Bollen committed
575
576
        covj="{sample}/coverage/{bed}.covstats.json",
        covp="{sample}/coverage/{bed}.covstats.png"
577
    singularity: containers["bedtools-2.26-python-2.7"]
Sander Bollen's avatar
Sander Bollen committed
578
579
580
581
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
582
583


Sander Bollen's avatar
Sander Bollen committed
584
585
586
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
587
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
588
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
Sander Bollen's avatar
Sander Bollen committed
589
590
        ref=get_refflatpath
    output:
Sander Bollen's avatar
Sander Bollen committed
591
        tsv="{sample}/coverage/{ref}.coverages.tsv"
592
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
593
594
595
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
596
597
598
## vcfstats

rule vcfstats:
Sander Bollen's avatar
Sander Bollen committed
599
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
600
    input:
601
        vcf="{sample}/vcf/{sample}.vcf.gz",
602
        tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
603
    output:
604
        stats="{sampel}/vcf/{sample}.vcfstats.json"
605
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
606
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
607
608


Sander Bollen's avatar
Sander Bollen committed
609
## collection
610
611
612
613
if len(BASE_BEDS) >= 1:
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
Sander Bollen's avatar
Sander Bollen committed
614
615
616
617
618
619
620
            preqc="{sample}/pre_process/{sample}.preqc_count.json",
            postq="{sample}/pre_process/{sample}.postqc_count.json",
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
621
622
            cov=expand("{{sample}}/coverage/{bed}.covstats.json",
                       bed=BASE_BEDS),
623
624
625
626
627
            colpy=colpy
        params:
            sample_name="{sample}",
            fthresh=FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
628
            "{sample}/{sample}.stats.json"
629
        singularity: containers["vtools"]
630
631
632
633
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
634
635
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc} {input.cov} > {output}"
636
637
else:
    rule collectstats:
Sander Bollen's avatar
Sander Bollen committed
638
        """Collect all stats for a particular sample without beds"""
Sander Bollen's avatar
Sander Bollen committed
639
        input:
Sander Bollen's avatar
Sander Bollen committed
640
641
642
643
644
645
646
            preqc = "{sample}/pre_process/{sample}.preqc_count.json",
            postq = "{sample}/pre_process/{sample}.postqc_count.json",
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
647
648
649
650
651
            colpy = colpy
        params:
            sample_name = "{sample}",
            fthresh = FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
652
            "{sample}/{sample}.stats.json"
653
        singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
654
655
656
657
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
658
659
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc}  > {output}"
Sander Bollen's avatar
Sander Bollen committed
660
661

rule merge_stats:
Sander Bollen's avatar
Sander Bollen committed
662
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
663
    input:
Sander Bollen's avatar
Sander Bollen committed
664
        cols=expand("{sample}/{sample}.stats.json", sample=SAMPLES),
665
        vstat=expand("{sample}/vcf/{sample}.vcfstats.json", sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
666
667
        mpy=mpy
    output:
Sander Bollen's avatar
Sander Bollen committed
668
        stats="stats.json"
669
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
670
671
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
           "> {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
672
673


Sander Bollen's avatar
Sander Bollen committed
674
675
676
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
Sander Bollen's avatar
Sander Bollen committed
677
        stats="stats.json",
Sander Bollen's avatar
Sander Bollen committed
678
679
        sc=tsvpy
    output:
Sander Bollen's avatar
Sander Bollen committed
680
        stats="stats.tsv"
681
    singularity: containers["python3"]
Sander Bollen's avatar
Sander Bollen committed
682
683
684
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
685
686
687
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
688
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
689
690
    """
    input:
Sander Bollen's avatar
Sander Bollen committed
691
        stats="stats.tsv"
Sander Bollen's avatar
Sander Bollen committed
692
    params:
Sander Bollen's avatar
Sander Bollen committed
693
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
694
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
695
    output:
Sander Bollen's avatar
Sander Bollen committed
696
        report="multiqc_report/multiqc_report.html"
697
    singularity: containers["multiqc"]
698
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output.report}"