Snakefile 22.4 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

van den Berg's avatar
van den Berg committed
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# Read the config json
with open(config['CONFIG_JSON'], 'rt') as fin:
    settings = json.load(fin)

# Read the json schema
with open('config/schema.json', 'rt') as fin:
    schema = json.load(fin)

# Validate the settings against the schema
try:
    jsonschema.validate(settings, schema)
except jsonschema.ValidationError as e:
    raise jsonschema.ValidationError(f'Invalid CONFIG_JSON: {e}')

45

46
47
48
# Set default values
def set_default(key, value):
    """ Set default config values """
van den Berg's avatar
van den Berg committed
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
    if key not in settings:
        settings[key] = value

# Set the default settings
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Set the script paths
set_default("covstats", "src/covstats.py")
set_default("collect_stats", "src/collect_stats.py")
set_default("merge_stats", "src/merge_stats.py")
set_default("fastq_stats", "src/fastqc_stats.py")
set_default("stats_to_tsv", "src/stats_to_tsv.py")
set_default("safe_fastqc", "src/safe_fastqc.sh")
set_default("py_wordcount", "src/pywc.py")
Sander Bollen's avatar
Sander Bollen committed
64

65
66
67
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
van den Berg's avatar
van den Berg committed
68
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
69
70
71
72
73
74
75
76
77
78
79
80
81
82
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:1.14--py36_0",
    "debian": "docker://debian:buster-slim",
    "fastq-count": "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.5--py36_0",
    "picard-2.14": "docker://quay.io/biocontainers/picard:2.14--py36_0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "sickle": "docker://quay.io/biocontainers/sickle-trim:1.33--ha92aebf_4",
    "tabix": "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
83

Sander Bollen's avatar
Sander Bollen committed
84
85
def get_r(strand, wildcards):
    """Get fastq files on a single strand for a sample"""
van den Berg's avatar
van den Berg committed
86
    s = settings['samples'].get(wildcards.sample)
Sander Bollen's avatar
Sander Bollen committed
87
    rs = []
88
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
Sander Bollen committed
89
90
        rs.append(s['libraries'][l][strand])
    return rs
Sander Bollen's avatar
Sander Bollen committed
91

Sander Bollen's avatar
Sander Bollen committed
92
93
get_r1 = partial(get_r, "R1")
get_r2 = partial(get_r, "R2")
Sander Bollen's avatar
Sander Bollen committed
94
95


96
97
98
def coverage_stats(wildcards):
    files = expand("{sample}/coverage/refFlat_coverage.tsv", sample=settings['samples'])
    return files if "refflat" in settings else []
Sander Bollen's avatar
Sander Bollen committed
99

Sander Bollen's avatar
Sander Bollen committed
100
101
rule all:
    input:
van den Berg's avatar
van den Berg committed
102
        multiqc="multiqc_report/multiqc_report.html",
103
        stats = "stats.json",
van den Berg's avatar
van den Berg committed
104
105
106
107
108
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai", sample=settings['samples']),
        vcfs=expand("{sample}/vcf/{sample}.vcf.gz", sample=settings['samples']),
        vcf_tbi=expand("{sample}/vcf/{sample}.vcf.gz.tbi", sample=settings['samples']),
        gvcfs=expand("{sample}/vcf/{sample}.g.vcf.gz", sample=settings['samples']),
        gvcf_tbi=expand("{sample}/vcf/{sample}.g.vcf.gz.tbi", sample=settings['samples']),
109
110
111
112
        fqcr = expand("{sample}/pre_process/raw_fastqc/.done.txt", sample=settings['samples']),
        fqcm = expand("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip", sample=settings['samples']),
        fqcp = expand("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip", sample=settings['samples']),
        coverage_stats = coverage_stats,
Sander Bollen's avatar
Sander Bollen committed
113

Sander Bollen's avatar
Sander Bollen committed
114
115
116

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
117
    output: directory("tmp")
118
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
119
120
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
121
rule genome:
Sander Bollen's avatar
Sander Bollen committed
122
    """Create genome file as used by bedtools"""
van den Berg's avatar
van den Berg committed
123
    input: settings["reference"]
Sander Bollen's avatar
Sander Bollen committed
124
    output: "current.genome"
125
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
126
127
128
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
Sander Bollen's avatar
Sander Bollen committed
129
    """Merge all forward fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
130
    input: get_r1
Sander Bollen's avatar
Sander Bollen committed
131
    output: temp("{sample}/pre_process/{sample}.merged_R1.fastq.gz")
132
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
133
134
135
    shell: "cat {input} > {output}"

rule merge_r2:
Sander Bollen's avatar
Sander Bollen committed
136
    """Merge all reverse fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
137
    input: get_r2
Sander Bollen's avatar
Sander Bollen committed
138
    output: temp("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
139
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
140
141
    shell: "cat {input} > {output}"

142
# contains original merged fastq files as input to prevent them from being prematurely deleted
Sander Bollen's avatar
Sander Bollen committed
143
rule sickle:
Sander Bollen's avatar
Sander Bollen committed
144
    """Trim fastq files"""
Sander Bollen's avatar
Sander Bollen committed
145
    input:
van den Berg's avatar
van den Berg committed
146
147
        r1 = "{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
148
    output:
Sander Bollen's avatar
Sander Bollen committed
149
150
        r1 = temp("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.trimmed_R2.fastq"),
Sander Bollen's avatar
Sander Bollen committed
151
        s = "{sample}/pre_process/{sample}.trimmed_singles.fastq"
152
    singularity: containers["sickle"]
Sander Bollen's avatar
Sander Bollen committed
153
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
154
155
156
           "-p {output.r2} -s {output.s}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
157
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
158
    input:
Sander Bollen's avatar
Sander Bollen committed
159
160
        r1 = "{sample}/pre_process/{sample}.trimmed_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.trimmed_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
161
    output:
Sander Bollen's avatar
Sander Bollen committed
162
163
        r1 = temp("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
164
    singularity: containers["cutadapt"]
Sander Bollen's avatar
Sander Bollen committed
165
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
166
167
168
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
Sander Bollen's avatar
Sander Bollen committed
169
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
170
    input:
Sander Bollen's avatar
Sander Bollen committed
171
172
        r1 = "{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.cutadapt_R2.fastq",
van den Berg's avatar
van den Berg committed
173
        ref = settings["reference"],
174
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
175
176
    params:
        rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
Sander Bollen's avatar
Sander Bollen committed
177
    output: temp("{sample}/bams/{sample}.sorted.bam")
178
    singularity: containers["bwa-0.7.17-picard-2.18.7"]
Sander Bollen's avatar
Sander Bollen committed
179
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
180
181
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
182
183
184
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

rule markdup:
Sander Bollen's avatar
Sander Bollen committed
185
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
186
    input:
Sander Bollen's avatar
Sander Bollen committed
187
        bam = "{sample}/bams/{sample}.sorted.bam",
Sander Bollen's avatar
Sander Bollen committed
188
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
189
    output:
Sander Bollen's avatar
Sander Bollen committed
190
191
192
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
193
    singularity: containers["picard-2.14"]
194
195
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
196
197
           "INPUT={input.bam} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
198
199
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
200
201
202
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
203
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
204
    output:
Sander Bollen's avatar
Sander Bollen committed
205
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
206
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
207
208
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
209
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
210
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
211
    input:
Sander Bollen's avatar
Sander Bollen committed
212
        bam = "{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
213
        ref = settings["reference"],
214
        vcfs = settings["known_sites"]
Sander Bollen's avatar
Sander Bollen committed
215
    output:
Sander Bollen's avatar
Sander Bollen committed
216
        grp = "{sample}/bams/{sample}.baserecal.grp"
217
    params: " ".join(expand("-knownSites {vcf}", vcf=settings["known_sites"]))
218
    singularity: containers["gatk"]
219
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
220
221
           "BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
222
           "-cov CycleCovariate -cov ContextCovariate {params}"
Sander Bollen's avatar
Sander Bollen committed
223

224
checkpoint scatterregions:
van den Berg's avatar
van den Berg committed
225
226
    """Scatter the reference genome"""
    input:
van den Berg's avatar
van den Berg committed
227
        ref = settings["reference"],
228
    params:
van den Berg's avatar
van den Berg committed
229
        size = settings['scatter_size']
van den Berg's avatar
van den Berg committed
230
    output:
231
        directory("scatter")
van den Berg's avatar
van den Berg committed
232
    singularity: containers["biopet-scatterregions"]
233
    shell: "mkdir -p {output} && "
van den Berg's avatar
van den Berg committed
234
           "biopet-scatterregions "
235
           "--referenceFasta {input.ref} --scatterSize {params.size} "
van den Berg's avatar
van den Berg committed
236
237
           "--outputDir scatter"

Sander Bollen's avatar
Sander Bollen committed
238
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
239
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
240
    input:
Sander Bollen's avatar
Sander Bollen committed
241
242
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
van den Berg's avatar
van den Berg committed
243
244
        dbsnp=settings["dbsnp"],
        ref=settings["reference"],
van den Berg's avatar
van den Berg committed
245
        region="scatter/scatter-{chunk}.bed"
Sander Bollen's avatar
Sander Bollen committed
246
    output:
247
248
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
249
250
    wildcard_constraints:
        chunk="[0-9]+"
251
    singularity: containers["gatk"]
252
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
253
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
254
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
van den Berg's avatar
van den Berg committed
255
           "-L '{input.region}' -o '{output.gvcf}' "
Sander Bollen's avatar
Sander Bollen committed
256
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
257
258
259
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 "
Sander Bollen's avatar
Sander Bollen committed
260

261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
def aggregate_gvcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_gvcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule gvcf_gather:
    """ Join the gvcf files together """
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
277
278
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
279
280
    singularity: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps --output {output.gvcf} "
281
282
           "--output-type z && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
283
284

rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
285
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
286
    input:
287
288
        gvcf = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz",
        tbi = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
289
        ref= settings["reference"]
Sander Bollen's avatar
Sander Bollen committed
290
    output:
291
292
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
293
294
    wildcard_constraints:
        chunk="[0-9]+"
295
    singularity: containers["gatk"]
296
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
297
           "GenotypeGVCFs -R {input.ref} "
298
           "-V {input.gvcf} -o '{output.vcf}'"
299
300


301
def aggregate_vcf(wildcards):
302
303
304
305
306
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


307
308
309
310
311
312
def aggregate_vcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


Sander Bollen's avatar
Sander Bollen committed
313
rule genotype_gather:
314
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
315
    input:
316
        vcfs = aggregate_vcf,
317
        vcfs_tbi = aggregate_vcf_tbi
Sander Bollen's avatar
Sander Bollen committed
318
    output:
319
320
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
321
    singularity: containers["bcftools"]
322
    shell: "bcftools concat {input.vcfs} --allow-overlaps --output {output.vcf} "
323
324
           "--output-type z && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
325
326


Sander Bollen's avatar
Sander Bollen committed
327
## bam metrics
328
rule mapped_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
329
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
330
    input:
331
        bam="{sample}/bams/{sample}.sorted.bam",
van den Berg's avatar
van den Berg committed
332
        pywc=settings["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
333
    output:
334
335
        reads="{sample}/bams/{sample}.mapped.num",
        bases="{sample}/bams/{sample}.mapped.basenum"
336
    singularity: containers["samtools-1.7-python-3.6"]
337
338
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
339
340


341
rule unique_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
342
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
343
    input:
344
        bam="{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
345
        pywc=settings["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
346
    output:
347
348
        reads="{sample}/bams/{sample}.unique.num",
        bases="{sample}/bams/{sample}.usable.basenum"
349
    singularity: containers["samtools-1.7-python-3.6"]
350
351
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
352
353
354


## fastqc
Sander Bollen's avatar
Sander Bollen committed
355
rule fastqc_raw:
356
357
    """
    Run fastqc on raw fastq files
358
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
359
360
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
361
    input:
Sander Bollen's avatar
Sander Bollen committed
362
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
363
364
        r2=get_r2
    params:
Sander Bollen's avatar
Sander Bollen committed
365
        odir="{sample}/pre_process/raw_fastqc"
Sander Bollen's avatar
Sander Bollen committed
366
    output:
Sander Bollen's avatar
Sander Bollen committed
367
        aux="{sample}/pre_process/raw_fastqc/.done.txt"
368
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
369
    shell: "fastqc --threads 4 --nogroup -o {params.odir} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
370
           "&& echo 'done' > {output.aux}"
Sander Bollen's avatar
Sander Bollen committed
371
372


Sander Bollen's avatar
Sander Bollen committed
373
rule fastqc_merged:
374
    """
375
376
    Run fastqc on merged fastq files
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
377
378
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
379
    input:
Sander Bollen's avatar
Sander Bollen committed
380
381
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz",
van den Berg's avatar
van den Berg committed
382
        fq=settings["safe_fastqc"]
Sander Bollen's avatar
Sander Bollen committed
383
    params:
Sander Bollen's avatar
Sander Bollen committed
384
        odir="{sample}/pre_process/merged_fastqc"
Sander Bollen's avatar
Sander Bollen committed
385
    output:
Sander Bollen's avatar
Sander Bollen committed
386
387
        r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip"
388
    singularity: containers["fastqc"]
Sander Bollen's avatar
Sander Bollen committed
389
390
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
391
392


Sander Bollen's avatar
Sander Bollen committed
393
rule fastqc_postqc:
394
395
    """
    Run fastqc on fastq files post pre-processing
396
397
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
    perl missing in the container of 0.11.5
398
    """
Sander Bollen's avatar
Sander Bollen committed
399
    input:
Sander Bollen's avatar
Sander Bollen committed
400
401
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq",
van den Berg's avatar
van den Berg committed
402
        fq=settings["safe_fastqc"]
Sander Bollen's avatar
Sander Bollen committed
403
    params:
Sander Bollen's avatar
Sander Bollen committed
404
        odir="{sample}/pre_process/postqc_fastqc"
Sander Bollen's avatar
Sander Bollen committed
405
    output:
Sander Bollen's avatar
Sander Bollen committed
406
407
        r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip"
408
    singularity: containers["fastqc"]
Sander Bollen's avatar
Sander Bollen committed
409
410
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
411
412
413
414
415


## fastq-count

rule fqcount_preqc:
Sander Bollen's avatar
Sander Bollen committed
416
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
417
    input:
Sander Bollen's avatar
Sander Bollen committed
418
419
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
420
    output:
Sander Bollen's avatar
Sander Bollen committed
421
        "{sample}/pre_process/{sample}.preqc_count.json"
422
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
423
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
424
425
426


rule fqcount_postqc:
Sander Bollen's avatar
Sander Bollen committed
427
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
428
    input:
Sander Bollen's avatar
Sander Bollen committed
429
430
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
431
    output:
Sander Bollen's avatar
Sander Bollen committed
432
        "{sample}/pre_process/{sample}.postqc_count.json"
433
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
434
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
435
436


Sander Bollen's avatar
Sander Bollen committed
437
438
439
440
# fastqc stats
rule fastqc_stats:
    """Collect fastq stats for a sample in json format"""
    input:
Sander Bollen's avatar
Sander Bollen committed
441
442
443
444
        preqc_r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        preqc_r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip",
        postqc_r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        postqc_r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip",
van den Berg's avatar
van den Berg committed
445
        sc=settings["fastq_stats"]
446
    singularity: containers["python3"]
Sander Bollen's avatar
Sander Bollen committed
447
    output:
Sander Bollen's avatar
Sander Bollen committed
448
        "{sample}/pre_process/fastq_stats.json"
449
450
451
    shell: "python {input.sc} --preqc-r1 {input.preqc_r1} "
           "--preqc-r2 {input.preqc_r2} "
           "--postqc-r1 {input.postqc_r1} "
Sander Bollen's avatar
Sander Bollen committed
452
           "--postqc-r2 {input.postqc_r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
453

Sander Bollen's avatar
Sander Bollen committed
454
455
456
## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
457
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
458
    input:
Sander Bollen's avatar
Sander Bollen committed
459
460
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
van den Berg's avatar
van den Berg committed
461
        covpy=settings["covstats"],
van den Berg's avatar
van den Berg committed
462
        bed=settings.get("bedfile","")
Sander Bollen's avatar
Sander Bollen committed
463
464
465
    params:
        subt="Sample {sample}"
    output:
van den Berg's avatar
van den Berg committed
466
467
        covj="{sample}/coverage/covstats.json",
        covp="{sample}/coverage/covstats.png"
468
    singularity: containers["bedtools-2.26-python-2.7"]
Sander Bollen's avatar
Sander Bollen committed
469
470
471
472
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
473
474


Sander Bollen's avatar
Sander Bollen committed
475
476
477
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
478
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
479
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
480
        ref = settings.get('refflat', "")
Sander Bollen's avatar
Sander Bollen committed
481
    output:
482
        tsv="{sample}/coverage/refFlat_coverage.tsv"
483
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
484
485
486
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
487
488
489
## vcfstats

rule vcfstats:
Sander Bollen's avatar
Sander Bollen committed
490
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
491
    input:
492
        vcf="{sample}/vcf/{sample}.vcf.gz",
493
        tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
494
    output:
495
        stats="{sampel}/vcf/{sample}.vcfstats.json"
496
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
497
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
498
499


Sander Bollen's avatar
Sander Bollen committed
500
## collection
van den Berg's avatar
van den Berg committed
501
if "bedfile" in settings:
502
503
504
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
Sander Bollen's avatar
Sander Bollen committed
505
506
507
508
509
510
511
            preqc="{sample}/pre_process/{sample}.preqc_count.json",
            postq="{sample}/pre_process/{sample}.postqc_count.json",
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
van den Berg's avatar
van den Berg committed
512
            cov="{sample}/coverage/covstats.json",
van den Berg's avatar
van den Berg committed
513
            colpy=settings["collect_stats"]
514
515
        params:
            sample_name="{sample}",
van den Berg's avatar
van den Berg committed
516
            fthresh=settings["female_threshold"]
517
        output:
Sander Bollen's avatar
Sander Bollen committed
518
            "{sample}/{sample}.stats.json"
519
        singularity: containers["vtools"]
520
521
522
523
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
524
525
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc} {input.cov} > {output}"
526
527
else:
    rule collectstats:
Sander Bollen's avatar
Sander Bollen committed
528
        """Collect all stats for a particular sample without beds"""
Sander Bollen's avatar
Sander Bollen committed
529
        input:
Sander Bollen's avatar
Sander Bollen committed
530
531
532
533
534
535
536
            preqc = "{sample}/pre_process/{sample}.preqc_count.json",
            postq = "{sample}/pre_process/{sample}.postqc_count.json",
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
van den Berg's avatar
van den Berg committed
537
            colpy = settings["collect_stats"]
Sander Bollen's avatar
Sander Bollen committed
538
539
        params:
            sample_name = "{sample}",
van den Berg's avatar
van den Berg committed
540
            fthresh = settings["female_threshold"]
Sander Bollen's avatar
Sander Bollen committed
541
        output:
Sander Bollen's avatar
Sander Bollen committed
542
            "{sample}/{sample}.stats.json"
543
        singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
544
545
546
547
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
548
549
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc}  > {output}"
Sander Bollen's avatar
Sander Bollen committed
550
551

rule merge_stats:
Sander Bollen's avatar
Sander Bollen committed
552
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
553
    input:
van den Berg's avatar
van den Berg committed
554
555
556
        cols=expand("{sample}/{sample}.stats.json", sample=settings['samples']),
        vstat=expand("{sample}/vcf/{sample}.vcfstats.json", sample=settings['samples']),
        mpy=settings["merge_stats"]
Sander Bollen's avatar
Sander Bollen committed
557
    output:
Sander Bollen's avatar
Sander Bollen committed
558
        stats="stats.json"
559
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
560
561
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
           "> {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
562
563


Sander Bollen's avatar
Sander Bollen committed
564
565
566
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
Sander Bollen's avatar
Sander Bollen committed
567
        stats="stats.json",
van den Berg's avatar
van den Berg committed
568
        sc=settings["stats_to_tsv"]
Sander Bollen's avatar
Sander Bollen committed
569
    output:
Sander Bollen's avatar
Sander Bollen committed
570
        stats="stats.tsv"
571
    singularity: containers["python3"]
Sander Bollen's avatar
Sander Bollen committed
572
573
574
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
575
576
577
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
578
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
579
580
    """
    input:
Sander Bollen's avatar
Sander Bollen committed
581
        stats="stats.tsv"
Sander Bollen's avatar
Sander Bollen committed
582
    params:
Sander Bollen's avatar
Sander Bollen committed
583
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
584
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
585
    output:
van den Berg's avatar
van den Berg committed
586
        "multiqc_report/multiqc_report.html"
587
    singularity: containers["multiqc"]
van den Berg's avatar
van den Berg committed
588
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output}"