Snakefile 25.3 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
Sander Bollen's avatar
Sander Bollen committed
25
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
26
from os.path import join, basename
27
from pathlib import Path
28
import itertools
Sander Bollen's avatar
Sander Bollen committed
29

Sander Bollen's avatar
Sander Bollen committed
30
REFERENCE = config.get("REFERENCE")
31
32
if REFERENCE is None:
    raise ValueError("You must set --config REFERENCE=<path>")
Sander Bollen's avatar
Sander Bollen committed
33
if not Path(REFERENCE).exists():
Sander Bollen's avatar
Sander Bollen committed
34
35
    raise FileNotFoundError("Reference file {0} "
                            "does not exist.".format(REFERENCE))
Sander Bollen's avatar
Sander Bollen committed
36
DBSNP = config.get("DBSNP")
37
38
if DBSNP is None:
    raise ValueError("You must set --config DBSNP=<path>")
Sander Bollen's avatar
Sander Bollen committed
39
40
if not Path(DBSNP).exists():
    raise FileNotFoundError("{0} does not exist".format(DBSNP))
41

42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
SCONFIG = config.get("SAMPLE_CONFIG")
if SCONFIG is None:
    raise ValueError("You must set --config SAMPLE_CONFIG=<path>")
if not Path(SCONFIG).exists():
    raise FileNotFoundError("{0} does not exist".format(SCONFIG))

KNOWN_SITES = config.get("KNOWN_SITES")
if KNOWN_SITES is None:
    raise ValueError("You must set --config KNOWN_SITES=<path>")

# The user can specify multiple known sites
KNOWN_SITES = KNOWN_SITES.split(',')
for filename in KNOWN_SITES:
    if not Path(filename).exists():
        raise FileNotFoundError("{0} does not exist".format(filename))

58
59
60
61
62
63
64
65
# Set default values
def set_default(key, value):
    """ Set default config values """
    if key not in config:
        config[key] = value

# Set the default scatter size to 1 billion
set_default('SCATTER_SIZE', 1000000000)
66
67

# these are all optional
68
69
BED = config.get("BED", "")  # comma-separated list of BED files
REFFLAT = config.get("REFFLAT", "")  # comma-separated list of refFlat files
Sander Bollen's avatar
Sander Bollen committed
70
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
Sander Bollen's avatar
Sander Bollen committed
71
MAX_BASES = config.get("MAX_BASES", "")
Sander Bollen's avatar
Sander Bollen committed
72

73
# Generate the input string for basrecalibration
74
75
76
BSQR_known_sites = ''
for argument, filename in zip(itertools.repeat('-knownSites'), KNOWN_SITES):
    BSQR_known_sites +=' {} {}'.format(argument, filename)
77

78
79
80
81
82
def fsrc_dir(*args):
    """Wrapper around snakemake's srcdir to work like os.path.join"""
    if len(args) == 1:
        return srcdir(args[0])
    return srcdir(join(*args))
Sander Bollen's avatar
Sander Bollen committed
83

84
85
86
covpy = fsrc_dir("src", "covstats.py")
colpy = fsrc_dir("src", "collect_stats.py")
mpy = fsrc_dir("src", "merge_stats.py")
Sander Bollen's avatar
Sander Bollen committed
87
seq = fsrc_dir("src", "seqtk.sh")
Sander Bollen's avatar
Sander Bollen committed
88
fqpy = fsrc_dir("src", "fastqc_stats.py")
Sander Bollen's avatar
Sander Bollen committed
89
tsvpy = fsrc_dir("src", "stats_to_tsv.py")
Sander Bollen's avatar
Sander Bollen committed
90
fqsc = fsrc_dir("src", "safe_fastqc.sh")
91
pywc = fsrc_dir("src", "pywc.py")
Sander Bollen's avatar
Sander Bollen committed
92

Sander Bollen's avatar
Sander Bollen committed
93
# sample config parsing
Sander Bollen's avatar
Sander Bollen committed
94
95
96
97
with open(config.get("SAMPLE_CONFIG")) as handle:
    SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()

Sander Bollen's avatar
Sander Bollen committed
98
99
100
101
102
103
104
105
106
if BED != "":
    BEDS = BED.split(",")
else:
    BEDS = []

if REFFLAT != "":
    REFFLATS = REFFLAT.split(",")
else:
    REFFLATS = []
Sander Bollen's avatar
Sander Bollen committed
107
108

BASE_BEDS = [basename(x) for x in BEDS]
Sander Bollen's avatar
Sander Bollen committed
109
BASE_REFFLATS = [basename(x) for x in REFFLATS]
Sander Bollen's avatar
Sander Bollen committed
110

111
112
113
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
van den Berg's avatar
van den Berg committed
114
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:1.14--py36_0",
    "debian": "docker://debian:buster-slim",
    "fastq-count": "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.5--py36_0",
    "picard-2.14": "docker://quay.io/biocontainers/picard:2.14--py36_0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "seqtk-1.2-jq-1.6": "docker://quay.io/biocontainers/mulled-v2-13686261ac0aa5682c680670ff8cda7b09637943:d143450dec169186731bb4df6f045a3c9ee08eb6-0",
    "sickle": "docker://quay.io/biocontainers/sickle-trim:1.33--ha92aebf_4",
    "tabix": "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
130

Sander Bollen's avatar
Sander Bollen committed
131
132
def get_r(strand, wildcards):
    """Get fastq files on a single strand for a sample"""
Sander Bollen's avatar
Sander Bollen committed
133
    s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
Sander Bollen's avatar
Sander Bollen committed
134
    rs = []
135
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
Sander Bollen committed
136
137
        rs.append(s['libraries'][l][strand])
    return rs
Sander Bollen's avatar
Sander Bollen committed
138

Sander Bollen's avatar
Sander Bollen committed
139
140
get_r1 = partial(get_r, "R1")
get_r2 = partial(get_r, "R2")
Sander Bollen's avatar
Sander Bollen committed
141
142


Sander Bollen's avatar
Sander Bollen committed
143
def get_bedpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
144
145
    """Get absolute path of a bed file"""
    return next(x for x in BEDS if basename(x) == wildcards.bed)
Sander Bollen's avatar
Sander Bollen committed
146
147
148


def get_refflatpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
149
    """Get absolute path of a refflat file"""
Sander Bollen's avatar
Sander Bollen committed
150
    return next(x for x in REFFLATS if basename(x) == wildcards.ref)
Sander Bollen's avatar
Sander Bollen committed
151
152


Sander Bollen's avatar
Sander Bollen committed
153
def sample_gender(wildcards):
Sander Bollen's avatar
Sander Bollen committed
154
    """Get sample gender"""
Sander Bollen's avatar
Sander Bollen committed
155
156
157
158
    sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
    return sam.get("gender", "null")


Sander Bollen's avatar
Sander Bollen committed
159
160
161
162
def metrics(do_metrics=True):
    if not do_metrics:
        return ""

Sander Bollen's avatar
Sander Bollen committed
163
164
165
166
167
168
    fqcr = expand("{sample}/pre_process/raw_fastqc/.done.txt",
                  sample=SAMPLES)
    fqcm = expand("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
                  sample=SAMPLES)
    fqcp = expand("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
                  sample=SAMPLES)
169
    if len(REFFLATS) >= 1:
Sander Bollen's avatar
Sander Bollen committed
170
171
        coverage_stats = expand("{sample}/coverage/{ref}.coverages.tsv",
                                sample=SAMPLES, ref=BASE_REFFLATS)
172
173
    else:
        coverage_stats = []
Sander Bollen's avatar
Sander Bollen committed
174
    stats = "stats.json"
Sander Bollen's avatar
Sander Bollen committed
175
    return  fqcr + fqcm + fqcp + coverage_stats + [stats]
Sander Bollen's avatar
Sander Bollen committed
176
177


Sander Bollen's avatar
Sander Bollen committed
178
179
rule all:
    input:
180
        report="multiqc_report/multiqc_report.html",
Sander Bollen's avatar
Sander Bollen committed
181
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai",sample=SAMPLES),
182
        vcfs=expand("{sample}/vcf/{sample}.vcf.gz",sample=SAMPLES),
183
        vcf_tbi=expand("{sample}/vcf/{sample}.vcf.gz.tbi",sample=SAMPLES),
184
        gvcfs=expand("{sample}/vcf/{sample}.g.vcf.gz",sample=SAMPLES),
185
        gvcf_tbi=expand("{sample}/vcf/{sample}.g.vcf.gz.tbi",sample=SAMPLES),
186
        stats=metrics()
Sander Bollen's avatar
Sander Bollen committed
187

Sander Bollen's avatar
Sander Bollen committed
188
189
190

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
191
    output: directory("tmp")
192
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
193
194
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
195
rule genome:
Sander Bollen's avatar
Sander Bollen committed
196
    """Create genome file as used by bedtools"""
Sander Bollen's avatar
Sander Bollen committed
197
    input: REFERENCE
Sander Bollen's avatar
Sander Bollen committed
198
    output: "current.genome"
199
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
200
201
202
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
Sander Bollen's avatar
Sander Bollen committed
203
    """Merge all forward fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
204
    input: get_r1
Sander Bollen's avatar
Sander Bollen committed
205
    output: temp("{sample}/pre_process/{sample}.merged_R1.fastq.gz")
206
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
207
208
209
    shell: "cat {input} > {output}"

rule merge_r2:
Sander Bollen's avatar
Sander Bollen committed
210
    """Merge all reverse fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
211
    input: get_r2
Sander Bollen's avatar
Sander Bollen committed
212
    output: temp("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
213
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
214
215
    shell: "cat {input} > {output}"

Sander Bollen's avatar
Sander Bollen committed
216
217

rule seqtk_r1:
Sander Bollen's avatar
Sander Bollen committed
218
    """Either subsample or link forward fastq file"""
Sander Bollen's avatar
Sander Bollen committed
219
    input:
Sander Bollen's avatar
Sander Bollen committed
220
221
        stats="{sample}/pre_process/{sample}.preqc_count.json",
        fastq="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
222
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
223
    params:
Sander Bollen's avatar
Sander Bollen committed
224
        max_bases=str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
225
    output:
Sander Bollen's avatar
Sander Bollen committed
226
        fastq=temp("{sample}/pre_process/{sample}.sampled_R1.fastq.gz")
227
    singularity: containers["seqtk-1.2-jq-1.6"]
Sander Bollen's avatar
Sander Bollen committed
228
229
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
230
231
232


rule seqtk_r2:
Sander Bollen's avatar
Sander Bollen committed
233
    """Either subsample or link reverse fastq file"""
Sander Bollen's avatar
Sander Bollen committed
234
    input:
Sander Bollen's avatar
Sander Bollen committed
235
236
        stats = "{sample}/pre_process/{sample}.preqc_count.json",
        fastq = "{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
237
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
238
    params:
Sander Bollen's avatar
Sander Bollen committed
239
        max_bases =str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
240
    output:
Sander Bollen's avatar
Sander Bollen committed
241
        fastq = temp("{sample}/pre_process/{sample}.sampled_R2.fastq.gz")
242
    singularity: containers["seqtk-1.2-jq-1.6"]
Sander Bollen's avatar
Sander Bollen committed
243
244
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
245
246


247
# contains original merged fastq files as input to prevent them from being prematurely deleted
Sander Bollen's avatar
Sander Bollen committed
248
rule sickle:
Sander Bollen's avatar
Sander Bollen committed
249
    """Trim fastq files"""
Sander Bollen's avatar
Sander Bollen committed
250
    input:
Sander Bollen's avatar
Sander Bollen committed
251
252
253
254
        r1 = "{sample}/pre_process/{sample}.sampled_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}.sampled_R2.fastq.gz",
        rr1 = "{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        rr2 = "{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
255
    output:
Sander Bollen's avatar
Sander Bollen committed
256
257
        r1 = temp("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.trimmed_R2.fastq"),
Sander Bollen's avatar
Sander Bollen committed
258
        s = "{sample}/pre_process/{sample}.trimmed_singles.fastq"
259
    singularity: containers["sickle"]
Sander Bollen's avatar
Sander Bollen committed
260
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
261
262
263
           "-p {output.r2} -s {output.s}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
264
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
265
    input:
Sander Bollen's avatar
Sander Bollen committed
266
267
        r1 = "{sample}/pre_process/{sample}.trimmed_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.trimmed_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
268
    output:
Sander Bollen's avatar
Sander Bollen committed
269
270
        r1 = temp("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
271
    singularity: containers["cutadapt"]
Sander Bollen's avatar
Sander Bollen committed
272
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
273
274
275
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
Sander Bollen's avatar
Sander Bollen committed
276
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
277
    input:
Sander Bollen's avatar
Sander Bollen committed
278
279
        r1 = "{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
280
        ref = REFERENCE,
281
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
282
283
    params:
        rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
Sander Bollen's avatar
Sander Bollen committed
284
    output: temp("{sample}/bams/{sample}.sorted.bam")
285
    singularity: containers["bwa-0.7.17-picard-2.18.7"]
Sander Bollen's avatar
Sander Bollen committed
286
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
287
288
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
289
290
291
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

rule markdup:
Sander Bollen's avatar
Sander Bollen committed
292
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
293
    input:
Sander Bollen's avatar
Sander Bollen committed
294
        bam = "{sample}/bams/{sample}.sorted.bam",
Sander Bollen's avatar
Sander Bollen committed
295
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
296
    output:
Sander Bollen's avatar
Sander Bollen committed
297
298
299
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
300
    singularity: containers["picard-2.14"]
301
302
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
303
304
           "INPUT={input.bam} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
305
306
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
307
308
309
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
310
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
311
    output:
Sander Bollen's avatar
Sander Bollen committed
312
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
313
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
314
315
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
316
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
317
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
318
    input:
Sander Bollen's avatar
Sander Bollen committed
319
        bam = "{sample}/bams/{sample}.markdup.bam",
Sander Bollen's avatar
Sander Bollen committed
320
321
        ref = REFERENCE,
    output:
Sander Bollen's avatar
Sander Bollen committed
322
        grp = "{sample}/bams/{sample}.baserecal.grp"
323
    params:
324
        known_sites = BSQR_known_sites
325
    singularity: containers["gatk"]
326
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
327
328
           "BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
329
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites}"
Sander Bollen's avatar
Sander Bollen committed
330

331
checkpoint scatterregions:
van den Berg's avatar
van den Berg committed
332
333
334
    """Scatter the reference genome"""
    input:
        ref = REFERENCE,
335
336
    params:
        size = config['SCATTER_SIZE']
van den Berg's avatar
van den Berg committed
337
    output:
338
        directory("scatter")
van den Berg's avatar
van den Berg committed
339
    singularity: containers["biopet-scatterregions"]
340
    shell: "mkdir -p {output} && "
van den Berg's avatar
van den Berg committed
341
           "biopet-scatterregions "
342
           "--referenceFasta {input.ref} --scatterSize {params.size} "
van den Berg's avatar
van den Berg committed
343
344
           "--outputDir scatter"

Sander Bollen's avatar
Sander Bollen committed
345
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
346
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
347
    input:
Sander Bollen's avatar
Sander Bollen committed
348
349
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
Sander Bollen's avatar
Sander Bollen committed
350
        dbsnp=DBSNP,
Sander Bollen's avatar
Sander Bollen committed
351
        ref=REFERENCE,
van den Berg's avatar
van den Berg committed
352
        region="scatter/scatter-{chunk}.bed"
Sander Bollen's avatar
Sander Bollen committed
353
    output:
354
355
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
356
357
    wildcard_constraints:
        chunk="[0-9]+"
358
    singularity: containers["gatk"]
359
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
360
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
361
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
van den Berg's avatar
van den Berg committed
362
           "-L '{input.region}' -o '{output.gvcf}' "
Sander Bollen's avatar
Sander Bollen committed
363
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
Sander Bollen's avatar
Sander Bollen committed
364
           "-BQSR {input.bqsr}"
Sander Bollen's avatar
Sander Bollen committed
365

366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
def aggregate_gvcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_gvcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule gvcf_gather:
    """ Join the gvcf files together """
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
382
383
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
384
385
    singularity: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps --output {output.gvcf} "
386
387
           "--output-type z && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
388
389

rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
390
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
391
    input:
392
393
394
        gvcf = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz",
        tbi = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi",
        ref=REFERENCE
Sander Bollen's avatar
Sander Bollen committed
395
    output:
396
397
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
398
399
    wildcard_constraints:
        chunk="[0-9]+"
400
    singularity: containers["gatk"]
401
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
402
           "GenotypeGVCFs -R {input.ref} "
403
           "-V {input.gvcf} -o '{output.vcf}'"
404
405


406
def aggregate_vcf(wildcards):
407
408
409
410
411
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


412
413
414
415
416
417
def aggregate_vcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


Sander Bollen's avatar
Sander Bollen committed
418
rule genotype_gather:
419
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
420
    input:
421
        vcfs = aggregate_vcf,
422
        vcfs_tbi = aggregate_vcf_tbi
Sander Bollen's avatar
Sander Bollen committed
423
    output:
424
425
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
426
    singularity: containers["bcftools"]
427
    shell: "bcftools concat {input.vcfs} --allow-overlaps --output {output.vcf} "
428
429
           "--output-type z && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
430
431


Sander Bollen's avatar
Sander Bollen committed
432
## bam metrics
433
rule mapped_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
434
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
435
    input:
436
437
        bam="{sample}/bams/{sample}.sorted.bam",
        pywc=pywc
Sander Bollen's avatar
Sander Bollen committed
438
    output:
439
440
        reads="{sample}/bams/{sample}.mapped.num",
        bases="{sample}/bams/{sample}.mapped.basenum"
441
    singularity: containers["samtools-1.7-python-3.6"]
442
443
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
444
445


446
rule unique_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
447
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
448
    input:
449
450
        bam="{sample}/bams/{sample}.markdup.bam",
        pywc=pywc
Sander Bollen's avatar
Sander Bollen committed
451
    output:
452
453
        reads="{sample}/bams/{sample}.unique.num",
        bases="{sample}/bams/{sample}.usable.basenum"
454
    singularity: containers["samtools-1.7-python-3.6"]
455
456
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
457
458
459


## fastqc
Sander Bollen's avatar
Sander Bollen committed
460
rule fastqc_raw:
461
462
    """
    Run fastqc on raw fastq files
463
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
464
465
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
466
    input:
Sander Bollen's avatar
Sander Bollen committed
467
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
468
469
        r2=get_r2
    params:
Sander Bollen's avatar
Sander Bollen committed
470
        odir="{sample}/pre_process/raw_fastqc"
Sander Bollen's avatar
Sander Bollen committed
471
    output:
Sander Bollen's avatar
Sander Bollen committed
472
        aux="{sample}/pre_process/raw_fastqc/.done.txt"
473
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
474
    shell: "fastqc --threads 4 --nogroup -o {params.odir} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
475
           "&& echo 'done' > {output.aux}"
Sander Bollen's avatar
Sander Bollen committed
476
477


Sander Bollen's avatar
Sander Bollen committed
478
rule fastqc_merged:
479
    """
480
481
    Run fastqc on merged fastq files
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
482
483
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
484
    input:
Sander Bollen's avatar
Sander Bollen committed
485
486
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
487
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
488
    params:
Sander Bollen's avatar
Sander Bollen committed
489
        odir="{sample}/pre_process/merged_fastqc"
Sander Bollen's avatar
Sander Bollen committed
490
    output:
Sander Bollen's avatar
Sander Bollen committed
491
492
        r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip"
493
    singularity: containers["fastqc"]
Sander Bollen's avatar
Sander Bollen committed
494
495
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
496
497


Sander Bollen's avatar
Sander Bollen committed
498
rule fastqc_postqc:
499
500
    """
    Run fastqc on fastq files post pre-processing
501
502
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
    perl missing in the container of 0.11.5
503
    """
Sander Bollen's avatar
Sander Bollen committed
504
    input:
Sander Bollen's avatar
Sander Bollen committed
505
506
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
507
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
508
    params:
Sander Bollen's avatar
Sander Bollen committed
509
        odir="{sample}/pre_process/postqc_fastqc"
Sander Bollen's avatar
Sander Bollen committed
510
    output:
Sander Bollen's avatar
Sander Bollen committed
511
512
        r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip"
513
    singularity: containers["fastqc"]
Sander Bollen's avatar
Sander Bollen committed
514
515
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
516
517
518
519
520


## fastq-count

rule fqcount_preqc:
Sander Bollen's avatar
Sander Bollen committed
521
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
522
    input:
Sander Bollen's avatar
Sander Bollen committed
523
524
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
525
    output:
Sander Bollen's avatar
Sander Bollen committed
526
        "{sample}/pre_process/{sample}.preqc_count.json"
527
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
528
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
529
530
531


rule fqcount_postqc:
Sander Bollen's avatar
Sander Bollen committed
532
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
533
    input:
Sander Bollen's avatar
Sander Bollen committed
534
535
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
536
    output:
Sander Bollen's avatar
Sander Bollen committed
537
        "{sample}/pre_process/{sample}.postqc_count.json"
538
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
539
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
540
541


Sander Bollen's avatar
Sander Bollen committed
542
543
544
545
# fastqc stats
rule fastqc_stats:
    """Collect fastq stats for a sample in json format"""
    input:
Sander Bollen's avatar
Sander Bollen committed
546
547
548
549
        preqc_r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        preqc_r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip",
        postqc_r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        postqc_r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip",
Sander Bollen's avatar
Sander Bollen committed
550
        sc=fqpy
551
    singularity: containers["python3"]
Sander Bollen's avatar
Sander Bollen committed
552
    output:
Sander Bollen's avatar
Sander Bollen committed
553
        "{sample}/pre_process/fastq_stats.json"
554
555
556
    shell: "python {input.sc} --preqc-r1 {input.preqc_r1} "
           "--preqc-r2 {input.preqc_r2} "
           "--postqc-r1 {input.postqc_r1} "
Sander Bollen's avatar
Sander Bollen committed
557
           "--postqc-r2 {input.postqc_r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
558

Sander Bollen's avatar
Sander Bollen committed
559
560
561
## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
562
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
563
    input:
Sander Bollen's avatar
Sander Bollen committed
564
565
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
Sander Bollen's avatar
Sander Bollen committed
566
567
568
569
570
        covpy=covpy,
        bed=get_bedpath
    params:
        subt="Sample {sample}"
    output:
Sander Bollen's avatar
Sander Bollen committed
571
572
        covj="{sample}/coverage/{bed}.covstats.json",
        covp="{sample}/coverage/{bed}.covstats.png"
573
    singularity: containers["bedtools-2.26-python-2.7"]
Sander Bollen's avatar
Sander Bollen committed
574
575
576
577
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
578
579


Sander Bollen's avatar
Sander Bollen committed
580
581
582
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
583
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
584
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
Sander Bollen's avatar
Sander Bollen committed
585
586
        ref=get_refflatpath
    output:
Sander Bollen's avatar
Sander Bollen committed
587
        tsv="{sample}/coverage/{ref}.coverages.tsv"
588
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
589
590
591
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
592
593
594
## vcfstats

rule vcfstats:
Sander Bollen's avatar
Sander Bollen committed
595
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
596
    input:
597
        vcf="{sample}/vcf/{sample}.vcf.gz",
598
        tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
599
    output:
600
        stats="{sampel}/vcf/{sample}.vcfstats.json"
601
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
602
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
603
604


Sander Bollen's avatar
Sander Bollen committed
605
## collection
606
607
608
609
if len(BASE_BEDS) >= 1:
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
Sander Bollen's avatar
Sander Bollen committed
610
611
612
613
614
615
616
            preqc="{sample}/pre_process/{sample}.preqc_count.json",
            postq="{sample}/pre_process/{sample}.postqc_count.json",
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
617
618
            cov=expand("{{sample}}/coverage/{bed}.covstats.json",
                       bed=BASE_BEDS),
619
620
621
622
623
            colpy=colpy
        params:
            sample_name="{sample}",
            fthresh=FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
624
            "{sample}/{sample}.stats.json"
625
        singularity: containers["vtools"]
626
627
628
629
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
630
631
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc} {input.cov} > {output}"
632
633
else:
    rule collectstats:
Sander Bollen's avatar
Sander Bollen committed
634
        """Collect all stats for a particular sample without beds"""
Sander Bollen's avatar
Sander Bollen committed
635
        input:
Sander Bollen's avatar
Sander Bollen committed
636
637
638
639
640
641
642
            preqc = "{sample}/pre_process/{sample}.preqc_count.json",
            postq = "{sample}/pre_process/{sample}.postqc_count.json",
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
643
644
645
646
647
            colpy = colpy
        params:
            sample_name = "{sample}",
            fthresh = FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
648
            "{sample}/{sample}.stats.json"
649
        singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
650
651
652
653
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
654
655
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc}  > {output}"
Sander Bollen's avatar
Sander Bollen committed
656
657

rule merge_stats:
Sander Bollen's avatar
Sander Bollen committed
658
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
659
    input:
Sander Bollen's avatar
Sander Bollen committed
660
        cols=expand("{sample}/{sample}.stats.json", sample=SAMPLES),
661
        vstat=expand("{sample}/vcf/{sample}.vcfstats.json", sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
662
663
        mpy=mpy
    output:
Sander Bollen's avatar
Sander Bollen committed
664
        stats="stats.json"
665
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
666
667
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
           "> {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
668
669


Sander Bollen's avatar
Sander Bollen committed
670
671
672
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
Sander Bollen's avatar
Sander Bollen committed
673
        stats="stats.json",
Sander Bollen's avatar
Sander Bollen committed
674
675
        sc=tsvpy
    output:
Sander Bollen's avatar
Sander Bollen committed
676
        stats="stats.tsv"
677
    singularity: containers["python3"]
Sander Bollen's avatar
Sander Bollen committed
678
679
680
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
681
682
683
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
684
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
685
686
    """
    input:
Sander Bollen's avatar
Sander Bollen committed
687
        stats="stats.tsv"
Sander Bollen's avatar
Sander Bollen committed
688
    params:
Sander Bollen's avatar
Sander Bollen committed
689
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
690
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
691
    output:
Sander Bollen's avatar
Sander Bollen committed
692
        report="multiqc_report/multiqc_report.html"
693
    singularity: containers["multiqc"]
694
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output.report}"