Snakefile 22.9 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

van den Berg's avatar
van den Berg committed
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# Read the config json
with open(config['CONFIG_JSON'], 'rt') as fin:
    settings = json.load(fin)

# Read the json schema
with open('config/schema.json', 'rt') as fin:
    schema = json.load(fin)

# Validate the settings against the schema
try:
    jsonschema.validate(settings, schema)
except jsonschema.ValidationError as e:
    raise jsonschema.ValidationError(f'Invalid CONFIG_JSON: {e}')

45

46
47
48
# Set default values
def set_default(key, value):
    """ Set default config values """
van den Berg's avatar
van den Berg committed
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
    if key not in settings:
        settings[key] = value

# Set the default settings
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Set the script paths
set_default("covstats", "src/covstats.py")
set_default("collect_stats", "src/collect_stats.py")
set_default("merge_stats", "src/merge_stats.py")
set_default("fastq_stats", "src/fastqc_stats.py")
set_default("stats_to_tsv", "src/stats_to_tsv.py")
set_default("safe_fastqc", "src/safe_fastqc.sh")
set_default("py_wordcount", "src/pywc.py")
Sander Bollen's avatar
Sander Bollen committed
64

65
# Generate the input string for basrecalibration
66
BSQR_known_sites = ''
van den Berg's avatar
van den Berg committed
67
for argument, filename in zip(itertools.repeat('-knownSites'), settings["known_sites"]):
68
    BSQR_known_sites +=' {} {}'.format(argument, filename)
69

70
71
72
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
van den Berg's avatar
van den Berg committed
73
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
74
75
76
77
78
79
80
81
82
83
84
85
86
87
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:1.14--py36_0",
    "debian": "docker://debian:buster-slim",
    "fastq-count": "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.5--py36_0",
    "picard-2.14": "docker://quay.io/biocontainers/picard:2.14--py36_0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "sickle": "docker://quay.io/biocontainers/sickle-trim:1.33--ha92aebf_4",
    "tabix": "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
88

Sander Bollen's avatar
Sander Bollen committed
89
90
def get_r(strand, wildcards):
    """Get fastq files on a single strand for a sample"""
van den Berg's avatar
van den Berg committed
91
    s = settings['samples'].get(wildcards.sample)
Sander Bollen's avatar
Sander Bollen committed
92
    rs = []
93
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
Sander Bollen committed
94
95
        rs.append(s['libraries'][l][strand])
    return rs
Sander Bollen's avatar
Sander Bollen committed
96

Sander Bollen's avatar
Sander Bollen committed
97
98
get_r1 = partial(get_r, "R1")
get_r2 = partial(get_r, "R2")
Sander Bollen's avatar
Sander Bollen committed
99
100


Sander Bollen's avatar
Sander Bollen committed
101
def get_bedpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
102
103
    """Get absolute path of a bed file"""
    return next(x for x in BEDS if basename(x) == wildcards.bed)
Sander Bollen's avatar
Sander Bollen committed
104
105


Sander Bollen's avatar
Sander Bollen committed
106
def sample_gender(wildcards):
Sander Bollen's avatar
Sander Bollen committed
107
    """Get sample gender"""
van den Berg's avatar
van den Berg committed
108
    sam = settings['samples'].get(wildcards.sample)
Sander Bollen's avatar
Sander Bollen committed
109
110
111
    return sam.get("gender", "null")


Sander Bollen's avatar
Sander Bollen committed
112
113
114
115
def metrics(do_metrics=True):
    if not do_metrics:
        return ""

Sander Bollen's avatar
Sander Bollen committed
116
    fqcr = expand("{sample}/pre_process/raw_fastqc/.done.txt",
van den Berg's avatar
van den Berg committed
117
                  sample=settings['samples']),
Sander Bollen's avatar
Sander Bollen committed
118
    fqcm = expand("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
van den Berg's avatar
van den Berg committed
119
                  sample=settings['samples']),
Sander Bollen's avatar
Sander Bollen committed
120
    fqcp = expand("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
van den Berg's avatar
van den Berg committed
121
122
                  sample=settings['samples']),
    if "refflat" in settings:
123
        coverage_stats = tuple(expand("{sample}/coverage/refFlat_coverage.tsv", sample=settings['samples']))
124
    else:
van den Berg's avatar
van den Berg committed
125
126
127
128
        coverage_stats = tuple()
    stats = "stats.json",
    print(coverage_stats)
    return  fqcr + fqcm + fqcp + coverage_stats + stats
Sander Bollen's avatar
Sander Bollen committed
129
130


Sander Bollen's avatar
Sander Bollen committed
131
132
rule all:
    input:
van den Berg's avatar
van den Berg committed
133
134
135
136
137
138
        multiqc="multiqc_report/multiqc_report.html",
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai", sample=settings['samples']),
        vcfs=expand("{sample}/vcf/{sample}.vcf.gz", sample=settings['samples']),
        vcf_tbi=expand("{sample}/vcf/{sample}.vcf.gz.tbi", sample=settings['samples']),
        gvcfs=expand("{sample}/vcf/{sample}.g.vcf.gz", sample=settings['samples']),
        gvcf_tbi=expand("{sample}/vcf/{sample}.g.vcf.gz.tbi", sample=settings['samples']),
139
        stats=metrics()
Sander Bollen's avatar
Sander Bollen committed
140

Sander Bollen's avatar
Sander Bollen committed
141
142
143

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
144
    output: directory("tmp")
145
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
146
147
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
148
rule genome:
Sander Bollen's avatar
Sander Bollen committed
149
    """Create genome file as used by bedtools"""
van den Berg's avatar
van den Berg committed
150
    input: settings["reference"]
Sander Bollen's avatar
Sander Bollen committed
151
    output: "current.genome"
152
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
153
154
155
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
Sander Bollen's avatar
Sander Bollen committed
156
    """Merge all forward fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
157
    input: get_r1
Sander Bollen's avatar
Sander Bollen committed
158
    output: temp("{sample}/pre_process/{sample}.merged_R1.fastq.gz")
159
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
160
161
162
    shell: "cat {input} > {output}"

rule merge_r2:
Sander Bollen's avatar
Sander Bollen committed
163
    """Merge all reverse fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
164
    input: get_r2
Sander Bollen's avatar
Sander Bollen committed
165
    output: temp("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
166
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
167
168
    shell: "cat {input} > {output}"

169
# contains original merged fastq files as input to prevent them from being prematurely deleted
Sander Bollen's avatar
Sander Bollen committed
170
rule sickle:
Sander Bollen's avatar
Sander Bollen committed
171
    """Trim fastq files"""
Sander Bollen's avatar
Sander Bollen committed
172
    input:
van den Berg's avatar
van den Berg committed
173
174
        r1 = "{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
175
    output:
Sander Bollen's avatar
Sander Bollen committed
176
177
        r1 = temp("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.trimmed_R2.fastq"),
Sander Bollen's avatar
Sander Bollen committed
178
        s = "{sample}/pre_process/{sample}.trimmed_singles.fastq"
179
    singularity: containers["sickle"]
Sander Bollen's avatar
Sander Bollen committed
180
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
181
182
183
           "-p {output.r2} -s {output.s}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
184
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
185
    input:
Sander Bollen's avatar
Sander Bollen committed
186
187
        r1 = "{sample}/pre_process/{sample}.trimmed_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.trimmed_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
188
    output:
Sander Bollen's avatar
Sander Bollen committed
189
190
        r1 = temp("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
191
    singularity: containers["cutadapt"]
Sander Bollen's avatar
Sander Bollen committed
192
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
193
194
195
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
Sander Bollen's avatar
Sander Bollen committed
196
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
197
    input:
Sander Bollen's avatar
Sander Bollen committed
198
199
        r1 = "{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.cutadapt_R2.fastq",
van den Berg's avatar
van den Berg committed
200
        ref = settings["reference"],
201
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
202
203
    params:
        rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
Sander Bollen's avatar
Sander Bollen committed
204
    output: temp("{sample}/bams/{sample}.sorted.bam")
205
    singularity: containers["bwa-0.7.17-picard-2.18.7"]
Sander Bollen's avatar
Sander Bollen committed
206
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
207
208
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
209
210
211
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

rule markdup:
Sander Bollen's avatar
Sander Bollen committed
212
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
213
    input:
Sander Bollen's avatar
Sander Bollen committed
214
        bam = "{sample}/bams/{sample}.sorted.bam",
Sander Bollen's avatar
Sander Bollen committed
215
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
216
    output:
Sander Bollen's avatar
Sander Bollen committed
217
218
219
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
220
    singularity: containers["picard-2.14"]
221
222
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
223
224
           "INPUT={input.bam} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
225
226
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
227
228
229
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
230
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
231
    output:
Sander Bollen's avatar
Sander Bollen committed
232
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
233
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
234
235
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
236
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
237
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
238
    input:
Sander Bollen's avatar
Sander Bollen committed
239
        bam = "{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
240
        ref = settings["reference"],
Sander Bollen's avatar
Sander Bollen committed
241
    output:
Sander Bollen's avatar
Sander Bollen committed
242
        grp = "{sample}/bams/{sample}.baserecal.grp"
243
    params:
244
        known_sites = BSQR_known_sites
245
    singularity: containers["gatk"]
246
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
247
248
           "BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
249
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites}"
Sander Bollen's avatar
Sander Bollen committed
250

251
checkpoint scatterregions:
van den Berg's avatar
van den Berg committed
252
253
    """Scatter the reference genome"""
    input:
van den Berg's avatar
van den Berg committed
254
        ref = settings["reference"],
255
    params:
van den Berg's avatar
van den Berg committed
256
        size = settings['scatter_size']
van den Berg's avatar
van den Berg committed
257
    output:
258
        directory("scatter")
van den Berg's avatar
van den Berg committed
259
    singularity: containers["biopet-scatterregions"]
260
    shell: "mkdir -p {output} && "
van den Berg's avatar
van den Berg committed
261
           "biopet-scatterregions "
262
           "--referenceFasta {input.ref} --scatterSize {params.size} "
van den Berg's avatar
van den Berg committed
263
264
           "--outputDir scatter"

Sander Bollen's avatar
Sander Bollen committed
265
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
266
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
267
    input:
Sander Bollen's avatar
Sander Bollen committed
268
269
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
van den Berg's avatar
van den Berg committed
270
271
        dbsnp=settings["dbsnp"],
        ref=settings["reference"],
van den Berg's avatar
van den Berg committed
272
        region="scatter/scatter-{chunk}.bed"
Sander Bollen's avatar
Sander Bollen committed
273
    output:
274
275
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
276
277
    wildcard_constraints:
        chunk="[0-9]+"
278
    singularity: containers["gatk"]
279
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
280
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
281
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
van den Berg's avatar
van den Berg committed
282
           "-L '{input.region}' -o '{output.gvcf}' "
Sander Bollen's avatar
Sander Bollen committed
283
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
Sander Bollen's avatar
Sander Bollen committed
284
           "-BQSR {input.bqsr}"
Sander Bollen's avatar
Sander Bollen committed
285

286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
def aggregate_gvcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_gvcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule gvcf_gather:
    """ Join the gvcf files together """
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
302
303
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
304
305
    singularity: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps --output {output.gvcf} "
306
307
           "--output-type z && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
308
309

rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
310
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
311
    input:
312
313
        gvcf = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz",
        tbi = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
314
        ref= settings["reference"]
Sander Bollen's avatar
Sander Bollen committed
315
    output:
316
317
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
318
319
    wildcard_constraints:
        chunk="[0-9]+"
320
    singularity: containers["gatk"]
321
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
322
           "GenotypeGVCFs -R {input.ref} "
323
           "-V {input.gvcf} -o '{output.vcf}'"
324
325


326
def aggregate_vcf(wildcards):
327
328
329
330
331
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


332
333
334
335
336
337
def aggregate_vcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


Sander Bollen's avatar
Sander Bollen committed
338
rule genotype_gather:
339
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
340
    input:
341
        vcfs = aggregate_vcf,
342
        vcfs_tbi = aggregate_vcf_tbi
Sander Bollen's avatar
Sander Bollen committed
343
    output:
344
345
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
346
    singularity: containers["bcftools"]
347
    shell: "bcftools concat {input.vcfs} --allow-overlaps --output {output.vcf} "
348
349
           "--output-type z && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
350
351


Sander Bollen's avatar
Sander Bollen committed
352
## bam metrics
353
rule mapped_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
354
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
355
    input:
356
        bam="{sample}/bams/{sample}.sorted.bam",
van den Berg's avatar
van den Berg committed
357
        pywc=settings["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
358
    output:
359
360
        reads="{sample}/bams/{sample}.mapped.num",
        bases="{sample}/bams/{sample}.mapped.basenum"
361
    singularity: containers["samtools-1.7-python-3.6"]
362
363
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
364
365


366
rule unique_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
367
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
368
    input:
369
        bam="{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
370
        pywc=settings["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
371
    output:
372
373
        reads="{sample}/bams/{sample}.unique.num",
        bases="{sample}/bams/{sample}.usable.basenum"
374
    singularity: containers["samtools-1.7-python-3.6"]
375
376
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
377
378
379


## fastqc
Sander Bollen's avatar
Sander Bollen committed
380
rule fastqc_raw:
381
382
    """
    Run fastqc on raw fastq files
383
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
384
385
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
386
    input:
Sander Bollen's avatar
Sander Bollen committed
387
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
388
389
        r2=get_r2
    params:
Sander Bollen's avatar
Sander Bollen committed
390
        odir="{sample}/pre_process/raw_fastqc"
Sander Bollen's avatar
Sander Bollen committed
391
    output:
Sander Bollen's avatar
Sander Bollen committed
392
        aux="{sample}/pre_process/raw_fastqc/.done.txt"
393
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
394
    shell: "fastqc --threads 4 --nogroup -o {params.odir} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
395
           "&& echo 'done' > {output.aux}"
Sander Bollen's avatar
Sander Bollen committed
396
397


Sander Bollen's avatar
Sander Bollen committed
398
rule fastqc_merged:
399
    """
400
401
    Run fastqc on merged fastq files
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
402
403
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
404
    input:
Sander Bollen's avatar
Sander Bollen committed
405
406
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz",
van den Berg's avatar
van den Berg committed
407
        fq=settings["safe_fastqc"]
Sander Bollen's avatar
Sander Bollen committed
408
    params:
Sander Bollen's avatar
Sander Bollen committed
409
        odir="{sample}/pre_process/merged_fastqc"
Sander Bollen's avatar
Sander Bollen committed
410
    output:
Sander Bollen's avatar
Sander Bollen committed
411
412
        r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip"
413
    singularity: containers["fastqc"]
Sander Bollen's avatar
Sander Bollen committed
414
415
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
416
417


Sander Bollen's avatar
Sander Bollen committed
418
rule fastqc_postqc:
419
420
    """
    Run fastqc on fastq files post pre-processing
421
422
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
    perl missing in the container of 0.11.5
423
    """
Sander Bollen's avatar
Sander Bollen committed
424
    input:
Sander Bollen's avatar
Sander Bollen committed
425
426
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq",
van den Berg's avatar
van den Berg committed
427
        fq=settings["safe_fastqc"]
Sander Bollen's avatar
Sander Bollen committed
428
    params:
Sander Bollen's avatar
Sander Bollen committed
429
        odir="{sample}/pre_process/postqc_fastqc"
Sander Bollen's avatar
Sander Bollen committed
430
    output:
Sander Bollen's avatar
Sander Bollen committed
431
432
        r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip"
433
    singularity: containers["fastqc"]
Sander Bollen's avatar
Sander Bollen committed
434
435
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
436
437
438
439
440


## fastq-count

rule fqcount_preqc:
Sander Bollen's avatar
Sander Bollen committed
441
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
442
    input:
Sander Bollen's avatar
Sander Bollen committed
443
444
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
445
    output:
Sander Bollen's avatar
Sander Bollen committed
446
        "{sample}/pre_process/{sample}.preqc_count.json"
447
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
448
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
449
450
451


rule fqcount_postqc:
Sander Bollen's avatar
Sander Bollen committed
452
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
453
    input:
Sander Bollen's avatar
Sander Bollen committed
454
455
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
456
    output:
Sander Bollen's avatar
Sander Bollen committed
457
        "{sample}/pre_process/{sample}.postqc_count.json"
458
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
459
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
460
461


Sander Bollen's avatar
Sander Bollen committed
462
463
464
465
# fastqc stats
rule fastqc_stats:
    """Collect fastq stats for a sample in json format"""
    input:
Sander Bollen's avatar
Sander Bollen committed
466
467
468
469
        preqc_r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        preqc_r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip",
        postqc_r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        postqc_r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip",
van den Berg's avatar
van den Berg committed
470
        sc=settings["fastq_stats"]
471
    singularity: containers["python3"]
Sander Bollen's avatar
Sander Bollen committed
472
    output:
Sander Bollen's avatar
Sander Bollen committed
473
        "{sample}/pre_process/fastq_stats.json"
474
475
476
    shell: "python {input.sc} --preqc-r1 {input.preqc_r1} "
           "--preqc-r2 {input.preqc_r2} "
           "--postqc-r1 {input.postqc_r1} "
Sander Bollen's avatar
Sander Bollen committed
477
           "--postqc-r2 {input.postqc_r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
478

Sander Bollen's avatar
Sander Bollen committed
479
480
481
## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
482
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
483
    input:
Sander Bollen's avatar
Sander Bollen committed
484
485
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
van den Berg's avatar
van den Berg committed
486
        covpy=settings["covstats"],
van den Berg's avatar
van den Berg committed
487
        bed=settings.get("bedfile","")
Sander Bollen's avatar
Sander Bollen committed
488
489
490
    params:
        subt="Sample {sample}"
    output:
van den Berg's avatar
van den Berg committed
491
492
        covj="{sample}/coverage/covstats.json",
        covp="{sample}/coverage/covstats.png"
493
    singularity: containers["bedtools-2.26-python-2.7"]
Sander Bollen's avatar
Sander Bollen committed
494
495
496
497
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
498
499


Sander Bollen's avatar
Sander Bollen committed
500
501
502
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
503
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
504
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
505
        ref = settings.get('refflat', "")
Sander Bollen's avatar
Sander Bollen committed
506
    output:
507
        tsv="{sample}/coverage/refFlat_coverage.tsv"
508
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
509
510
511
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
512
513
514
## vcfstats

rule vcfstats:
Sander Bollen's avatar
Sander Bollen committed
515
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
516
    input:
517
        vcf="{sample}/vcf/{sample}.vcf.gz",
518
        tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
519
    output:
520
        stats="{sampel}/vcf/{sample}.vcfstats.json"
521
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
522
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
523
524


Sander Bollen's avatar
Sander Bollen committed
525
## collection
van den Berg's avatar
van den Berg committed
526
if "bedfile" in settings:
527
528
529
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
Sander Bollen's avatar
Sander Bollen committed
530
531
532
533
534
535
536
            preqc="{sample}/pre_process/{sample}.preqc_count.json",
            postq="{sample}/pre_process/{sample}.postqc_count.json",
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
van den Berg's avatar
van den Berg committed
537
            cov="{sample}/coverage/covstats.json",
van den Berg's avatar
van den Berg committed
538
            colpy=settings["collect_stats"]
539
540
        params:
            sample_name="{sample}",
van den Berg's avatar
van den Berg committed
541
            fthresh=settings["female_threshold"]
542
        output:
Sander Bollen's avatar
Sander Bollen committed
543
            "{sample}/{sample}.stats.json"
544
        singularity: containers["vtools"]
545
546
547
548
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
549
550
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc} {input.cov} > {output}"
551
552
else:
    rule collectstats:
Sander Bollen's avatar
Sander Bollen committed
553
        """Collect all stats for a particular sample without beds"""
Sander Bollen's avatar
Sander Bollen committed
554
        input:
Sander Bollen's avatar
Sander Bollen committed
555
556
557
558
559
560
561
            preqc = "{sample}/pre_process/{sample}.preqc_count.json",
            postq = "{sample}/pre_process/{sample}.postqc_count.json",
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
van den Berg's avatar
van den Berg committed
562
            colpy = settings["collect_stats"]
Sander Bollen's avatar
Sander Bollen committed
563
564
        params:
            sample_name = "{sample}",
van den Berg's avatar
van den Berg committed
565
            fthresh = settings["female_threshold"]
Sander Bollen's avatar
Sander Bollen committed
566
        output:
Sander Bollen's avatar
Sander Bollen committed
567
            "{sample}/{sample}.stats.json"
568
        singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
569
570
571
572
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
573
574
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc}  > {output}"
Sander Bollen's avatar
Sander Bollen committed
575
576

rule merge_stats:
Sander Bollen's avatar
Sander Bollen committed
577
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
578
    input:
van den Berg's avatar
van den Berg committed
579
580
581
        cols=expand("{sample}/{sample}.stats.json", sample=settings['samples']),
        vstat=expand("{sample}/vcf/{sample}.vcfstats.json", sample=settings['samples']),
        mpy=settings["merge_stats"]
Sander Bollen's avatar
Sander Bollen committed
582
    output:
Sander Bollen's avatar
Sander Bollen committed
583
        stats="stats.json"
584
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
585
586
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
           "> {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
587
588


Sander Bollen's avatar
Sander Bollen committed
589
590
591
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
Sander Bollen's avatar
Sander Bollen committed
592
        stats="stats.json",
van den Berg's avatar
van den Berg committed
593
        sc=settings["stats_to_tsv"]
Sander Bollen's avatar
Sander Bollen committed
594
    output:
Sander Bollen's avatar
Sander Bollen committed
595
        stats="stats.tsv"
596
    singularity: containers["python3"]
Sander Bollen's avatar
Sander Bollen committed
597
598
599
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
600
601
602
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
603
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
604
605
    """
    input:
Sander Bollen's avatar
Sander Bollen committed
606
        stats="stats.tsv"
Sander Bollen's avatar
Sander Bollen committed
607
    params:
Sander Bollen's avatar
Sander Bollen committed
608
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
609
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
610
    output:
van den Berg's avatar
van den Berg committed
611
        "multiqc_report/multiqc_report.html"
612
    singularity: containers["multiqc"]
van den Berg's avatar
van den Berg committed
613
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output}"