Snakefile 23.6 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
Sander Bollen's avatar
Sander Bollen committed
25
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
26
from os.path import join, basename
27
from pathlib import Path
28
import itertools
Sander Bollen's avatar
Sander Bollen committed
29
30

from pyfaidx import Fasta
Sander Bollen's avatar
Sander Bollen committed
31
32

REFERENCE = config.get("REFERENCE")
33
34
if REFERENCE is None:
    raise ValueError("You must set --config REFERENCE=<path>")
Sander Bollen's avatar
Sander Bollen committed
35
if not Path(REFERENCE).exists():
Sander Bollen's avatar
Sander Bollen committed
36
37
    raise FileNotFoundError("Reference file {0} "
                            "does not exist.".format(REFERENCE))
Sander Bollen's avatar
Sander Bollen committed
38
DBSNP = config.get("DBSNP")
39
40
if DBSNP is None:
    raise ValueError("You must set --config DBSNP=<path>")
Sander Bollen's avatar
Sander Bollen committed
41
42
if not Path(DBSNP).exists():
    raise FileNotFoundError("{0} does not exist".format(DBSNP))
43

44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
SCONFIG = config.get("SAMPLE_CONFIG")
if SCONFIG is None:
    raise ValueError("You must set --config SAMPLE_CONFIG=<path>")
if not Path(SCONFIG).exists():
    raise FileNotFoundError("{0} does not exist".format(SCONFIG))

KNOWN_SITES = config.get("KNOWN_SITES")
if KNOWN_SITES is None:
    raise ValueError("You must set --config KNOWN_SITES=<path>")

# The user can specify multiple known sites
KNOWN_SITES = KNOWN_SITES.split(',')
for filename in KNOWN_SITES:
    if not Path(filename).exists():
        raise FileNotFoundError("{0} does not exist".format(filename))

60
61

# these are all optional
62
63
BED = config.get("BED", "")  # comma-separated list of BED files
REFFLAT = config.get("REFFLAT", "")  # comma-separated list of refFlat files
Sander Bollen's avatar
Sander Bollen committed
64
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
Sander Bollen's avatar
Sander Bollen committed
65
MAX_BASES = config.get("MAX_BASES", "")
Sander Bollen's avatar
Sander Bollen committed
66

67
# Generate the input string for basrecalibration
68
69
70
BSQR_known_sites = ''
for argument, filename in zip(itertools.repeat('-knownSites'), KNOWN_SITES):
    BSQR_known_sites +=' {} {}'.format(argument, filename)
71

72
73
74
75
76
def fsrc_dir(*args):
    """Wrapper around snakemake's srcdir to work like os.path.join"""
    if len(args) == 1:
        return srcdir(args[0])
    return srcdir(join(*args))
Sander Bollen's avatar
Sander Bollen committed
77

78
79
80
covpy = fsrc_dir("src", "covstats.py")
colpy = fsrc_dir("src", "collect_stats.py")
mpy = fsrc_dir("src", "merge_stats.py")
Sander Bollen's avatar
Sander Bollen committed
81
seq = fsrc_dir("src", "seqtk.sh")
Sander Bollen's avatar
Sander Bollen committed
82
fqpy = fsrc_dir("src", "fastqc_stats.py")
Sander Bollen's avatar
Sander Bollen committed
83
tsvpy = fsrc_dir("src", "stats_to_tsv.py")
Sander Bollen's avatar
Sander Bollen committed
84
fqsc = fsrc_dir("src", "safe_fastqc.sh")
85
pywc = fsrc_dir("src", "pywc.py")
Sander Bollen's avatar
Sander Bollen committed
86

Sander Bollen's avatar
Sander Bollen committed
87
# sample config parsing
Sander Bollen's avatar
Sander Bollen committed
88
89
90
91
with open(config.get("SAMPLE_CONFIG")) as handle:
    SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()

Sander Bollen's avatar
Sander Bollen committed
92
93
94
95
96
97
98
99
100
if BED != "":
    BEDS = BED.split(",")
else:
    BEDS = []

if REFFLAT != "":
    REFFLATS = REFFLAT.split(",")
else:
    REFFLATS = []
Sander Bollen's avatar
Sander Bollen committed
101
102

BASE_BEDS = [basename(x) for x in BEDS]
Sander Bollen's avatar
Sander Bollen committed
103
BASE_REFFLATS = [basename(x) for x in REFFLATS]
Sander Bollen's avatar
Sander Bollen committed
104

105
106
107
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
van den Berg's avatar
van den Berg committed
108
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:1.14--py36_0",
    "debian": "docker://debian:buster-slim",
    "fastq-count": "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.5--py36_0",
    "picard-2.14": "docker://quay.io/biocontainers/picard:2.14--py36_0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "seqtk-1.2-jq-1.6": "docker://quay.io/biocontainers/mulled-v2-13686261ac0aa5682c680670ff8cda7b09637943:d143450dec169186731bb4df6f045a3c9ee08eb6-0",
    "sickle": "docker://quay.io/biocontainers/sickle-trim:1.33--ha92aebf_4",
    "tabix": "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
124

Sander Bollen's avatar
Sander Bollen committed
125
126
def get_r(strand, wildcards):
    """Get fastq files on a single strand for a sample"""
Sander Bollen's avatar
Sander Bollen committed
127
    s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
Sander Bollen's avatar
Sander Bollen committed
128
    rs = []
129
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
Sander Bollen committed
130
131
        rs.append(s['libraries'][l][strand])
    return rs
Sander Bollen's avatar
Sander Bollen committed
132

Sander Bollen's avatar
Sander Bollen committed
133
134
get_r1 = partial(get_r, "R1")
get_r2 = partial(get_r, "R2")
Sander Bollen's avatar
Sander Bollen committed
135
136


Sander Bollen's avatar
Sander Bollen committed
137
def get_bedpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
138
139
    """Get absolute path of a bed file"""
    return next(x for x in BEDS if basename(x) == wildcards.bed)
Sander Bollen's avatar
Sander Bollen committed
140
141
142


def get_refflatpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
143
    """Get absolute path of a refflat file"""
Sander Bollen's avatar
Sander Bollen committed
144
    return next(x for x in REFFLATS if basename(x) == wildcards.ref)
Sander Bollen's avatar
Sander Bollen committed
145
146


Sander Bollen's avatar
Sander Bollen committed
147
def sample_gender(wildcards):
Sander Bollen's avatar
Sander Bollen committed
148
    """Get sample gender"""
Sander Bollen's avatar
Sander Bollen committed
149
150
151
152
    sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
    return sam.get("gender", "null")


Sander Bollen's avatar
Sander Bollen committed
153
154
155
156
def metrics(do_metrics=True):
    if not do_metrics:
        return ""

Sander Bollen's avatar
Sander Bollen committed
157
158
159
160
161
162
    fqcr = expand("{sample}/pre_process/raw_fastqc/.done.txt",
                  sample=SAMPLES)
    fqcm = expand("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
                  sample=SAMPLES)
    fqcp = expand("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
                  sample=SAMPLES)
163
    if len(REFFLATS) >= 1:
Sander Bollen's avatar
Sander Bollen committed
164
165
        coverage_stats = expand("{sample}/coverage/{ref}.coverages.tsv",
                                sample=SAMPLES, ref=BASE_REFFLATS)
166
167
    else:
        coverage_stats = []
Sander Bollen's avatar
Sander Bollen committed
168
    stats = "stats.json"
Sander Bollen's avatar
Sander Bollen committed
169
    return  fqcr + fqcm + fqcp + coverage_stats + [stats]
Sander Bollen's avatar
Sander Bollen committed
170
171


Sander Bollen's avatar
Sander Bollen committed
172
173
rule all:
    input:
174
        report="multiqc_report/multiqc_report.html",
Sander Bollen's avatar
Sander Bollen committed
175
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai",sample=SAMPLES),
176
        vcfs=expand("{sample}/vcf/{sample}.vcf.gz",sample=SAMPLES),
177
        stats=metrics()
Sander Bollen's avatar
Sander Bollen committed
178

Sander Bollen's avatar
Sander Bollen committed
179
180
181

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
182
    output: directory("tmp")
183
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
184
185
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
186
rule genome:
Sander Bollen's avatar
Sander Bollen committed
187
    """Create genome file as used by bedtools"""
Sander Bollen's avatar
Sander Bollen committed
188
    input: REFERENCE
Sander Bollen's avatar
Sander Bollen committed
189
    output: "current.genome"
190
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
191
192
193
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
Sander Bollen's avatar
Sander Bollen committed
194
    """Merge all forward fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
195
    input: get_r1
Sander Bollen's avatar
Sander Bollen committed
196
    output: temp("{sample}/pre_process/{sample}.merged_R1.fastq.gz")
197
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
198
199
200
    shell: "cat {input} > {output}"

rule merge_r2:
Sander Bollen's avatar
Sander Bollen committed
201
    """Merge all reverse fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
202
    input: get_r2
Sander Bollen's avatar
Sander Bollen committed
203
    output: temp("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
204
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
205
206
    shell: "cat {input} > {output}"

Sander Bollen's avatar
Sander Bollen committed
207
208

rule seqtk_r1:
Sander Bollen's avatar
Sander Bollen committed
209
    """Either subsample or link forward fastq file"""
Sander Bollen's avatar
Sander Bollen committed
210
    input:
Sander Bollen's avatar
Sander Bollen committed
211
212
        stats="{sample}/pre_process/{sample}.preqc_count.json",
        fastq="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
213
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
214
    params:
Sander Bollen's avatar
Sander Bollen committed
215
        max_bases=str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
216
    output:
Sander Bollen's avatar
Sander Bollen committed
217
        fastq=temp("{sample}/pre_process/{sample}.sampled_R1.fastq.gz")
218
    singularity: containers["seqtk-1.2-jq-1.6"]
Sander Bollen's avatar
Sander Bollen committed
219
220
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
221
222
223


rule seqtk_r2:
Sander Bollen's avatar
Sander Bollen committed
224
    """Either subsample or link reverse fastq file"""
Sander Bollen's avatar
Sander Bollen committed
225
    input:
Sander Bollen's avatar
Sander Bollen committed
226
227
        stats = "{sample}/pre_process/{sample}.preqc_count.json",
        fastq = "{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
228
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
229
    params:
Sander Bollen's avatar
Sander Bollen committed
230
        max_bases =str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
231
    output:
Sander Bollen's avatar
Sander Bollen committed
232
        fastq = temp("{sample}/pre_process/{sample}.sampled_R2.fastq.gz")
233
    singularity: containers["seqtk-1.2-jq-1.6"]
Sander Bollen's avatar
Sander Bollen committed
234
235
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
236
237


238
# contains original merged fastq files as input to prevent them from being prematurely deleted
Sander Bollen's avatar
Sander Bollen committed
239
rule sickle:
Sander Bollen's avatar
Sander Bollen committed
240
    """Trim fastq files"""
Sander Bollen's avatar
Sander Bollen committed
241
    input:
Sander Bollen's avatar
Sander Bollen committed
242
243
244
245
        r1 = "{sample}/pre_process/{sample}.sampled_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}.sampled_R2.fastq.gz",
        rr1 = "{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        rr2 = "{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
246
    output:
Sander Bollen's avatar
Sander Bollen committed
247
248
        r1 = temp("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.trimmed_R2.fastq"),
Sander Bollen's avatar
Sander Bollen committed
249
        s = "{sample}/pre_process/{sample}.trimmed_singles.fastq"
250
    singularity: containers["sickle"]
Sander Bollen's avatar
Sander Bollen committed
251
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
252
253
254
           "-p {output.r2} -s {output.s}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
255
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
256
    input:
Sander Bollen's avatar
Sander Bollen committed
257
258
        r1 = "{sample}/pre_process/{sample}.trimmed_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.trimmed_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
259
    output:
Sander Bollen's avatar
Sander Bollen committed
260
261
        r1 = temp("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
262
    singularity: containers["cutadapt"]
Sander Bollen's avatar
Sander Bollen committed
263
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
264
265
266
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
Sander Bollen's avatar
Sander Bollen committed
267
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
268
    input:
Sander Bollen's avatar
Sander Bollen committed
269
270
        r1 = "{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
271
        ref = REFERENCE,
272
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
273
274
    params:
        rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
Sander Bollen's avatar
Sander Bollen committed
275
    output: temp("{sample}/bams/{sample}.sorted.bam")
276
    singularity: containers["bwa-0.7.17-picard-2.18.7"]
Sander Bollen's avatar
Sander Bollen committed
277
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
278
279
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
280
281
282
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

rule markdup:
Sander Bollen's avatar
Sander Bollen committed
283
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
284
    input:
Sander Bollen's avatar
Sander Bollen committed
285
        bam = "{sample}/bams/{sample}.sorted.bam",
Sander Bollen's avatar
Sander Bollen committed
286
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
287
    output:
Sander Bollen's avatar
Sander Bollen committed
288
289
290
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
291
    singularity: containers["picard-2.14"]
292
293
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
294
295
           "INPUT={input.bam} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
296
297
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
298
299
300
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
301
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
302
    output:
Sander Bollen's avatar
Sander Bollen committed
303
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
304
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
305
306
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
307
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
308
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
309
    input:
Sander Bollen's avatar
Sander Bollen committed
310
        bam = "{sample}/bams/{sample}.markdup.bam",
Sander Bollen's avatar
Sander Bollen committed
311
312
        ref = REFERENCE,
    output:
Sander Bollen's avatar
Sander Bollen committed
313
        grp = "{sample}/bams/{sample}.baserecal.grp"
314
    params:
315
        known_sites = BSQR_known_sites
316
    singularity: containers["gatk"]
317
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
318
319
           "BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
320
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites}"
Sander Bollen's avatar
Sander Bollen committed
321

322
checkpoint scatterregions:
van den Berg's avatar
van den Berg committed
323
324
325
326
    """Scatter the reference genome"""
    input:
        ref = REFERENCE,
    output:
327
        directory("scatter")
van den Berg's avatar
van den Berg committed
328
    singularity: containers["biopet-scatterregions"]
329
    shell: "mkdir -p {output} && "
van den Berg's avatar
van den Berg committed
330
           "biopet-scatterregions "
331
           "--referenceFasta {input.ref} --scatterSize 1000000000 "
van den Berg's avatar
van den Berg committed
332
333
           "--outputDir scatter"

Sander Bollen's avatar
Sander Bollen committed
334
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
335
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
336
    input:
Sander Bollen's avatar
Sander Bollen committed
337
338
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
Sander Bollen's avatar
Sander Bollen committed
339
        dbsnp=DBSNP,
Sander Bollen's avatar
Sander Bollen committed
340
        ref=REFERENCE,
van den Berg's avatar
van den Berg committed
341
        region="scatter/scatter-{chunk}.bed"
Sander Bollen's avatar
Sander Bollen committed
342
    output:
343
344
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
345
346
    wildcard_constraints:
        chunk="[0-9]+"
347
    singularity: containers["gatk"]
348
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
349
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
350
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
van den Berg's avatar
van den Berg committed
351
           "-L '{input.region}' -o '{output.gvcf}' "
Sander Bollen's avatar
Sander Bollen committed
352
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
Sander Bollen's avatar
Sander Bollen committed
353
           "-BQSR {input.bqsr}"
Sander Bollen's avatar
Sander Bollen committed
354

Sander Bollen's avatar
Sander Bollen committed
355
356

rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
357
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
358
    input:
359
360
361
        gvcf = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz",
        tbi = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi",
        ref=REFERENCE
Sander Bollen's avatar
Sander Bollen committed
362
    output:
van den Berg's avatar
van den Berg committed
363
364
365
366
        vcf="{sample}/vcf/{sample}.{chunk}.vcf.gz",
        vcf_tbi="{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi"
    wildcard_constraints:
        chunk="[0-9]+"
367
    singularity: containers["gatk"]
368
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
369
           "GenotypeGVCFs -R {input.ref} "
370
           "-V {input.gvcf} -o '{output.vcf}'"
371
372


373
374
375
376
377
378
def aggregate_input(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


Sander Bollen's avatar
Sander Bollen committed
379
rule genotype_gather:
380
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
381
    input:
382
        vcfs = aggregate_input
Sander Bollen's avatar
Sander Bollen committed
383
    output:
384
        vcf = "{sample}/vcf/{sample}.vcf.gz"
385
    singularity: containers["bcftools"]
386
387
    shell: "bcftools concat {input.vcfs} --allow-overlaps --output {output.vcf} "
           "--output-type z"
388
389
390
391
392


rule genotype_gather_tbi:
    """Index genotyped vcf file"""
    input:
393
        vcf = "{sample}/vcf/{sample}.vcf.gz"
394
    output:
395
        tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
396
    singularity: containers["tabix"]
397
    shell: "tabix -pvcf {input.vcf}"
Sander Bollen's avatar
Sander Bollen committed
398
399


Sander Bollen's avatar
Sander Bollen committed
400
## bam metrics
401
rule mapped_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
402
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
403
    input:
404
405
        bam="{sample}/bams/{sample}.sorted.bam",
        pywc=pywc
Sander Bollen's avatar
Sander Bollen committed
406
    output:
407
408
        reads="{sample}/bams/{sample}.mapped.num",
        bases="{sample}/bams/{sample}.mapped.basenum"
409
    singularity: containers["samtools-1.7-python-3.6"]
410
411
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
412
413


414
rule unique_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
415
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
416
    input:
417
418
        bam="{sample}/bams/{sample}.markdup.bam",
        pywc=pywc
Sander Bollen's avatar
Sander Bollen committed
419
    output:
420
421
        reads="{sample}/bams/{sample}.unique.num",
        bases="{sample}/bams/{sample}.usable.basenum"
422
    singularity: containers["samtools-1.7-python-3.6"]
423
424
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
425
426
427


## fastqc
Sander Bollen's avatar
Sander Bollen committed
428
rule fastqc_raw:
429
430
    """
    Run fastqc on raw fastq files
431
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
432
433
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
434
    input:
Sander Bollen's avatar
Sander Bollen committed
435
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
436
437
        r2=get_r2
    params:
Sander Bollen's avatar
Sander Bollen committed
438
        odir="{sample}/pre_process/raw_fastqc"
Sander Bollen's avatar
Sander Bollen committed
439
    output:
Sander Bollen's avatar
Sander Bollen committed
440
        aux="{sample}/pre_process/raw_fastqc/.done.txt"
441
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
442
    shell: "fastqc --threads 4 --nogroup -o {params.odir} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
443
           "&& echo 'done' > {output.aux}"
Sander Bollen's avatar
Sander Bollen committed
444
445


Sander Bollen's avatar
Sander Bollen committed
446
rule fastqc_merged:
447
    """
448
449
    Run fastqc on merged fastq files
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
450
451
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
452
    input:
Sander Bollen's avatar
Sander Bollen committed
453
454
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
455
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
456
    params:
Sander Bollen's avatar
Sander Bollen committed
457
        odir="{sample}/pre_process/merged_fastqc"
Sander Bollen's avatar
Sander Bollen committed
458
    output:
Sander Bollen's avatar
Sander Bollen committed
459
460
        r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip"
461
    singularity: containers["fastqc"]
Sander Bollen's avatar
Sander Bollen committed
462
463
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
464
465


Sander Bollen's avatar
Sander Bollen committed
466
rule fastqc_postqc:
467
468
    """
    Run fastqc on fastq files post pre-processing
469
470
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
    perl missing in the container of 0.11.5
471
    """
Sander Bollen's avatar
Sander Bollen committed
472
    input:
Sander Bollen's avatar
Sander Bollen committed
473
474
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
475
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
476
    params:
Sander Bollen's avatar
Sander Bollen committed
477
        odir="{sample}/pre_process/postqc_fastqc"
Sander Bollen's avatar
Sander Bollen committed
478
    output:
Sander Bollen's avatar
Sander Bollen committed
479
480
        r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip"
481
    singularity: containers["fastqc"]
Sander Bollen's avatar
Sander Bollen committed
482
483
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
484
485
486
487
488


## fastq-count

rule fqcount_preqc:
Sander Bollen's avatar
Sander Bollen committed
489
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
490
    input:
Sander Bollen's avatar
Sander Bollen committed
491
492
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
493
    output:
Sander Bollen's avatar
Sander Bollen committed
494
        "{sample}/pre_process/{sample}.preqc_count.json"
495
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
496
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
497
498
499


rule fqcount_postqc:
Sander Bollen's avatar
Sander Bollen committed
500
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
501
    input:
Sander Bollen's avatar
Sander Bollen committed
502
503
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
504
    output:
Sander Bollen's avatar
Sander Bollen committed
505
        "{sample}/pre_process/{sample}.postqc_count.json"
506
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
507
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
508
509


Sander Bollen's avatar
Sander Bollen committed
510
511
512
513
# fastqc stats
rule fastqc_stats:
    """Collect fastq stats for a sample in json format"""
    input:
Sander Bollen's avatar
Sander Bollen committed
514
515
516
517
        preqc_r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        preqc_r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip",
        postqc_r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        postqc_r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip",
Sander Bollen's avatar
Sander Bollen committed
518
        sc=fqpy
519
    singularity: containers["python3"]
Sander Bollen's avatar
Sander Bollen committed
520
    output:
Sander Bollen's avatar
Sander Bollen committed
521
        "{sample}/pre_process/fastq_stats.json"
522
523
524
    shell: "python {input.sc} --preqc-r1 {input.preqc_r1} "
           "--preqc-r2 {input.preqc_r2} "
           "--postqc-r1 {input.postqc_r1} "
Sander Bollen's avatar
Sander Bollen committed
525
           "--postqc-r2 {input.postqc_r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
526

Sander Bollen's avatar
Sander Bollen committed
527
528
529
## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
530
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
531
    input:
Sander Bollen's avatar
Sander Bollen committed
532
533
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
Sander Bollen's avatar
Sander Bollen committed
534
535
536
537
538
        covpy=covpy,
        bed=get_bedpath
    params:
        subt="Sample {sample}"
    output:
Sander Bollen's avatar
Sander Bollen committed
539
540
        covj="{sample}/coverage/{bed}.covstats.json",
        covp="{sample}/coverage/{bed}.covstats.png"
541
    singularity: containers["bedtools-2.26-python-2.7"]
Sander Bollen's avatar
Sander Bollen committed
542
543
544
545
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
546
547


Sander Bollen's avatar
Sander Bollen committed
548
549
550
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
551
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
552
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
Sander Bollen's avatar
Sander Bollen committed
553
554
        ref=get_refflatpath
    output:
Sander Bollen's avatar
Sander Bollen committed
555
        tsv="{sample}/coverage/{ref}.coverages.tsv"
556
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
557
558
559
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
560
561
562
## vcfstats

rule vcfstats:
Sander Bollen's avatar
Sander Bollen committed
563
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
564
    input:
565
        vcf="{sample}/vcf/{sample}.vcf.gz",
566
        tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
567
    output:
568
        stats="{sampel}/vcf/{sample}.vcfstats.json"
569
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
570
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
571
572


Sander Bollen's avatar
Sander Bollen committed
573
## collection
574
575
576
577
if len(BASE_BEDS) >= 1:
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
Sander Bollen's avatar
Sander Bollen committed
578
579
580
581
582
583
584
            preqc="{sample}/pre_process/{sample}.preqc_count.json",
            postq="{sample}/pre_process/{sample}.postqc_count.json",
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
585
586
            cov=expand("{{sample}}/coverage/{bed}.covstats.json",
                       bed=BASE_BEDS),
587
588
589
590
591
            colpy=colpy
        params:
            sample_name="{sample}",
            fthresh=FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
592
            "{sample}/{sample}.stats.json"
593
        singularity: containers["vtools"]
594
595
596
597
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
598
599
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc} {input.cov} > {output}"
600
601
else:
    rule collectstats:
Sander Bollen's avatar
Sander Bollen committed
602
        """Collect all stats for a particular sample without beds"""
Sander Bollen's avatar
Sander Bollen committed
603
        input:
Sander Bollen's avatar
Sander Bollen committed
604
605
606
607
608
609
610
            preqc = "{sample}/pre_process/{sample}.preqc_count.json",
            postq = "{sample}/pre_process/{sample}.postqc_count.json",
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
611
612
613
614
615
            colpy = colpy
        params:
            sample_name = "{sample}",
            fthresh = FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
616
            "{sample}/{sample}.stats.json"
617
        singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
618
619
620
621
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
622
623
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc}  > {output}"
Sander Bollen's avatar
Sander Bollen committed
624
625

rule merge_stats:
Sander Bollen's avatar
Sander Bollen committed
626
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
627
    input:
Sander Bollen's avatar
Sander Bollen committed
628
        cols=expand("{sample}/{sample}.stats.json", sample=SAMPLES),
629
        vstat=expand("{sample}/vcf/{sample}.vcfstats.json", sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
630
631
        mpy=mpy
    output:
Sander Bollen's avatar
Sander Bollen committed
632
        stats="stats.json"
633
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
634
635
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
           "> {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
636
637


Sander Bollen's avatar
Sander Bollen committed
638
639
640
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
Sander Bollen's avatar
Sander Bollen committed
641
        stats="stats.json",
Sander Bollen's avatar
Sander Bollen committed
642
643
        sc=tsvpy
    output:
Sander Bollen's avatar
Sander Bollen committed
644
        stats="stats.tsv"
645
    singularity: containers["python3"]
Sander Bollen's avatar
Sander Bollen committed
646
647
648
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
649
650
651
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
652
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
653
654
    """
    input:
Sander Bollen's avatar
Sander Bollen committed
655
        stats="stats.tsv"
Sander Bollen's avatar
Sander Bollen committed
656
    params:
Sander Bollen's avatar
Sander Bollen committed
657
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
658
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
659
    output:
Sander Bollen's avatar
Sander Bollen committed
660
        report="multiqc_report/multiqc_report.html"
661
    singularity: containers["multiqc"]
662
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output.report}"