Snakefile 23.2 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

van den Berg's avatar
van den Berg committed
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# Read the config json
with open(config['CONFIG_JSON'], 'rt') as fin:
    settings = json.load(fin)

# Read the json schema
with open('config/schema.json', 'rt') as fin:
    schema = json.load(fin)

# Validate the settings against the schema
try:
    jsonschema.validate(settings, schema)
except jsonschema.ValidationError as e:
    raise jsonschema.ValidationError(f'Invalid CONFIG_JSON: {e}')

45

46
47
48
# Set default values
def set_default(key, value):
    """ Set default config values """
van den Berg's avatar
van den Berg committed
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
    if key not in settings:
        settings[key] = value

# Set the default settings
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Set the script paths
set_default("covstats", "src/covstats.py")
set_default("collect_stats", "src/collect_stats.py")
set_default("merge_stats", "src/merge_stats.py")
set_default("fastq_stats", "src/fastqc_stats.py")
set_default("stats_to_tsv", "src/stats_to_tsv.py")
set_default("safe_fastqc", "src/safe_fastqc.sh")
set_default("py_wordcount", "src/pywc.py")
Sander Bollen's avatar
Sander Bollen committed
64

65
66
67
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
van den Berg's avatar
van den Berg committed
68
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
69
70
71
72
73
74
75
76
77
78
79
80
81
82
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:1.14--py36_0",
    "debian": "docker://debian:buster-slim",
    "fastq-count": "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.5--py36_0",
    "picard-2.14": "docker://quay.io/biocontainers/picard:2.14--py36_0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "sickle": "docker://quay.io/biocontainers/sickle-trim:1.33--ha92aebf_4",
    "tabix": "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
83

Sander Bollen's avatar
Sander Bollen committed
84
85
def get_r(strand, wildcards):
    """Get fastq files on a single strand for a sample"""
van den Berg's avatar
van den Berg committed
86
    s = settings['samples'].get(wildcards.sample)
Sander Bollen's avatar
Sander Bollen committed
87
    rs = []
88
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
Sander Bollen committed
89
90
        rs.append(s['libraries'][l][strand])
    return rs
Sander Bollen's avatar
Sander Bollen committed
91

Sander Bollen's avatar
Sander Bollen committed
92
93
get_r1 = partial(get_r, "R1")
get_r2 = partial(get_r, "R2")
Sander Bollen's avatar
Sander Bollen committed
94

95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
def get_forward(wildcards):
    """ Get the forward fastq file from the config """
    return (
        settings["samples"][wildcards.sample]["libraries"]
                [wildcards.read_group]["R1"]
    )

def get_reverse(wildcards):
    """ Get the reverse fastq file from the config """
    return (
        settings["samples"][wildcards.sample]["libraries"]
            [wildcards.read_group]["R2"]
    )

def get_readgroup(wildcards):
    return settings["samples"][wildcards.sample]["libraries"]
Sander Bollen's avatar
Sander Bollen committed
111

112
113
114
def coverage_stats(wildcards):
    files = expand("{sample}/coverage/refFlat_coverage.tsv", sample=settings['samples'])
    return files if "refflat" in settings else []
Sander Bollen's avatar
Sander Bollen committed
115

Sander Bollen's avatar
Sander Bollen committed
116
117
rule all:
    input:
van den Berg's avatar
van den Berg committed
118
        multiqc="multiqc_report/multiqc_report.html",
119
        stats = "stats.json",
van den Berg's avatar
van den Berg committed
120
121
122
123
124
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai", sample=settings['samples']),
        vcfs=expand("{sample}/vcf/{sample}.vcf.gz", sample=settings['samples']),
        vcf_tbi=expand("{sample}/vcf/{sample}.vcf.gz.tbi", sample=settings['samples']),
        gvcfs=expand("{sample}/vcf/{sample}.g.vcf.gz", sample=settings['samples']),
        gvcf_tbi=expand("{sample}/vcf/{sample}.g.vcf.gz.tbi", sample=settings['samples']),
125
126
127
        #fqcr = expand("{sample}/pre_process/raw_fastqc/.done.txt", sample=settings['samples']),
        #fqcm = expand("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip", sample=settings['samples']),
        #fqcp = expand("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip", sample=settings['samples']),
128
        coverage_stats = coverage_stats,
Sander Bollen's avatar
Sander Bollen committed
129

Sander Bollen's avatar
Sander Bollen committed
130
131
132

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
133
    output: directory("tmp")
134
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
135
136
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
137
rule genome:
Sander Bollen's avatar
Sander Bollen committed
138
    """Create genome file as used by bedtools"""
van den Berg's avatar
van den Berg committed
139
    input: settings["reference"]
Sander Bollen's avatar
Sander Bollen committed
140
    output: "current.genome"
141
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
142
143
144
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
Sander Bollen's avatar
Sander Bollen committed
145
    """Merge all forward fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
146
    input: get_r1
Sander Bollen's avatar
Sander Bollen committed
147
    output: temp("{sample}/pre_process/{sample}.merged_R1.fastq.gz")
148
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
149
150
151
    shell: "cat {input} > {output}"

rule merge_r2:
Sander Bollen's avatar
Sander Bollen committed
152
    """Merge all reverse fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
153
    input: get_r2
Sander Bollen's avatar
Sander Bollen committed
154
    output: temp("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
155
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
156
157
158
    shell: "cat {input} > {output}"

rule sickle:
Sander Bollen's avatar
Sander Bollen committed
159
    """Trim fastq files"""
Sander Bollen's avatar
Sander Bollen committed
160
    input:
161
162
        r1=get_forward,
        r2=get_reverse
Sander Bollen's avatar
Sander Bollen committed
163
    output:
164
165
166
        r1 = "{sample}/pre_process/{sample}-{read_group}.trimmed_R1.fastq",
        r2 = "{sample}/pre_process/{sample}-{read_group}.trimmed_R2.fastq",
        s = "{sample}/pre_process/{sample}-{read_group}.trimmed_singles.fastq"
167
    singularity: containers["sickle"]
Sander Bollen's avatar
Sander Bollen committed
168
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
169
170
171
           "-p {output.r2} -s {output.s}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
172
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
173
    input:
174
175
        r1 = "{sample}/pre_process/{sample}-{read_group}.trimmed_R1.fastq",
        r2 = "{sample}/pre_process/{sample}-{read_group}.trimmed_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
176
    output:
177
178
        r1 = "{sample}/pre_process/{sample}-{read_group}.cutadapt_R1.fastq",
        r2 = "{sample}/pre_process/{sample}-{read_group}.cutadapt_R2.fastq"
179
    singularity: containers["cutadapt"]
Sander Bollen's avatar
Sander Bollen committed
180
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
181
182
183
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
Sander Bollen's avatar
Sander Bollen committed
184
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
185
    input:
186
187
        r1 = "{sample}/pre_process/{sample}-{read_group}.cutadapt_R1.fastq",
        r2 = "{sample}/pre_process/{sample}-{read_group}.cutadapt_R2.fastq",
van den Berg's avatar
van den Berg committed
188
        ref = settings["reference"],
189
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
190
    params:
191
192
        rg = "@RG\\tID:{read_group}\\tSM:{sample}\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
193
    singularity: containers["bwa-0.7.17-picard-2.18.7"]
Sander Bollen's avatar
Sander Bollen committed
194
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
195
196
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
197
198
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

199
200
201
202
203
def markdup_bam_input(wildcards):
    """ Generate the INPUT for each bam file """
    return ["INPUT={sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
            read_group=rg) for rg in get_readgroup(wildcards)]

Sander Bollen's avatar
Sander Bollen committed
204
rule markdup:
Sander Bollen's avatar
Sander Bollen committed
205
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
206
    input:
207
208
209
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
        read_group=rg) for rg in get_readgroup(wildcards)),
Sander Bollen's avatar
Sander Bollen committed
210
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
211
    output:
Sander Bollen's avatar
Sander Bollen committed
212
213
214
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
215
216
    params:
        bams=markdup_bam_input
217
    singularity: containers["picard-2.14"]
218
219
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
220
           "{params.bams} OUTPUT={output.bam} "
Sander Bollen's avatar
Sander Bollen committed
221
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
222
223
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
224
225
226
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
227
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
228
    output:
Sander Bollen's avatar
Sander Bollen committed
229
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
230
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
231
232
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
233
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
234
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
235
    input:
Sander Bollen's avatar
Sander Bollen committed
236
        bam = "{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
237
        ref = settings["reference"],
238
        vcfs = settings["known_sites"]
Sander Bollen's avatar
Sander Bollen committed
239
    output:
Sander Bollen's avatar
Sander Bollen committed
240
        grp = "{sample}/bams/{sample}.baserecal.grp"
241
    params: " ".join(expand("-knownSites {vcf}", vcf=settings["known_sites"]))
242
    singularity: containers["gatk"]
243
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
244
245
           "BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
246
           "-cov CycleCovariate -cov ContextCovariate {params}"
Sander Bollen's avatar
Sander Bollen committed
247

248
checkpoint scatterregions:
van den Berg's avatar
van den Berg committed
249
250
    """Scatter the reference genome"""
    input:
van den Berg's avatar
van den Berg committed
251
        ref = settings["reference"],
252
    params:
van den Berg's avatar
van den Berg committed
253
        size = settings['scatter_size']
van den Berg's avatar
van den Berg committed
254
    output:
255
        directory("scatter")
van den Berg's avatar
van den Berg committed
256
    singularity: containers["biopet-scatterregions"]
257
    shell: "mkdir -p {output} && "
van den Berg's avatar
van den Berg committed
258
           "biopet-scatterregions "
259
           "--referenceFasta {input.ref} --scatterSize {params.size} "
van den Berg's avatar
van den Berg committed
260
261
           "--outputDir scatter"

Sander Bollen's avatar
Sander Bollen committed
262
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
263
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
264
    input:
Sander Bollen's avatar
Sander Bollen committed
265
266
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
van den Berg's avatar
van den Berg committed
267
268
        dbsnp=settings["dbsnp"],
        ref=settings["reference"],
van den Berg's avatar
van den Berg committed
269
        region="scatter/scatter-{chunk}.bed"
Sander Bollen's avatar
Sander Bollen committed
270
    output:
271
272
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
273
274
    wildcard_constraints:
        chunk="[0-9]+"
275
    singularity: containers["gatk"]
276
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
277
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
278
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
van den Berg's avatar
van den Berg committed
279
           "-L '{input.region}' -o '{output.gvcf}' "
Sander Bollen's avatar
Sander Bollen committed
280
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
281
282
283
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 "
Sander Bollen's avatar
Sander Bollen committed
284

285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
def aggregate_gvcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_gvcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule gvcf_gather:
    """ Join the gvcf files together """
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
301
302
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
303
304
    singularity: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps --output {output.gvcf} "
305
306
           "--output-type z && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
307
308

rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
309
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
310
    input:
311
312
        gvcf = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz",
        tbi = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
313
        ref= settings["reference"]
Sander Bollen's avatar
Sander Bollen committed
314
    output:
315
316
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
317
318
    wildcard_constraints:
        chunk="[0-9]+"
319
    singularity: containers["gatk"]
320
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
321
           "GenotypeGVCFs -R {input.ref} "
322
           "-V {input.gvcf} -o '{output.vcf}'"
323
324


325
def aggregate_vcf(wildcards):
326
327
328
329
330
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


331
332
333
334
335
336
def aggregate_vcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


Sander Bollen's avatar
Sander Bollen committed
337
rule genotype_gather:
338
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
339
    input:
340
        vcfs = aggregate_vcf,
341
        vcfs_tbi = aggregate_vcf_tbi
Sander Bollen's avatar
Sander Bollen committed
342
    output:
343
344
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
345
    singularity: containers["bcftools"]
346
    shell: "bcftools concat {input.vcfs} --allow-overlaps --output {output.vcf} "
347
348
           "--output-type z && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
349
350


Sander Bollen's avatar
Sander Bollen committed
351
## bam metrics
352
rule mapped_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
353
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
354
    input:
355
        bam="{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
356
        pywc=settings["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
357
    output:
358
359
        reads="{sample}/bams/{sample}.mapped.num",
        bases="{sample}/bams/{sample}.mapped.basenum"
360
    singularity: containers["samtools-1.7-python-3.6"]
361
362
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
363
364


365
rule unique_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
366
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
367
    input:
368
        bam="{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
369
        pywc=settings["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
370
    output:
371
372
        reads="{sample}/bams/{sample}.unique.num",
        bases="{sample}/bams/{sample}.usable.basenum"
373
    singularity: containers["samtools-1.7-python-3.6"]
374
375
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
376
377
378


## fastqc
Sander Bollen's avatar
Sander Bollen committed
379
rule fastqc_raw:
380
381
    """
    Run fastqc on raw fastq files
382
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
383
384
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
385
    input:
Sander Bollen's avatar
Sander Bollen committed
386
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
387
388
        r2=get_r2
    params:
389
        odir="{sample}/pre_process/raw_fastqc-{read_group}"
Sander Bollen's avatar
Sander Bollen committed
390
    output:
391
        aux="{sample}/pre_process/raw_fastqc-{read_group}/.done.txt"
392
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
393
    shell: "fastqc --threads 4 --nogroup -o {params.odir} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
394
           "&& echo 'done' > {output.aux}"
Sander Bollen's avatar
Sander Bollen committed
395
396


Sander Bollen's avatar
Sander Bollen committed
397
rule fastqc_merged:
398
    """
399
    Run fastqc on merged fastq files
400
    """
Sander Bollen's avatar
Sander Bollen committed
401
    input:
Sander Bollen's avatar
Sander Bollen committed
402
403
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz",
van den Berg's avatar
van den Berg committed
404
        fq=settings["safe_fastqc"]
Sander Bollen's avatar
Sander Bollen committed
405
    params:
Sander Bollen's avatar
Sander Bollen committed
406
        odir="{sample}/pre_process/merged_fastqc"
Sander Bollen's avatar
Sander Bollen committed
407
    output:
Sander Bollen's avatar
Sander Bollen committed
408
409
        r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip"
410
    singularity: containers["fastqc"]
Sander Bollen's avatar
Sander Bollen committed
411
412
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
413
414


Sander Bollen's avatar
Sander Bollen committed
415
rule fastqc_postqc:
416
417
    """
    Run fastqc on fastq files post pre-processing
418
419
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
    perl missing in the container of 0.11.5
420
    """
Sander Bollen's avatar
Sander Bollen committed
421
    input:
422
423
        r1="{sample}/pre_process/{sample}-{read_group}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}-{read_group}.cutadapt_R2.fastq",
van den Berg's avatar
van den Berg committed
424
        fq=settings["safe_fastqc"]
Sander Bollen's avatar
Sander Bollen committed
425
    params:
Sander Bollen's avatar
Sander Bollen committed
426
        odir="{sample}/pre_process/postqc_fastqc"
Sander Bollen's avatar
Sander Bollen committed
427
    output:
428
429
        r1="{sample}/pre_process/postqc_fastqc/{sample}-{read_group}.cutadapt_R1_fastqc.zip",
        r2="{sample}/pre_process/postqc_fastqc/{sample}-{read_group}.cutadapt_R2_fastqc.zip"
430
    singularity: containers["fastqc"]
Sander Bollen's avatar
Sander Bollen committed
431
432
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
433
434
435
436
437


## fastq-count

rule fqcount_preqc:
Sander Bollen's avatar
Sander Bollen committed
438
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
439
    input:
440
441
        r1=get_forward,
        r2=get_reverse
Sander Bollen's avatar
Sander Bollen committed
442
    output:
443
        "{sample}/pre_process/{sample}-{read_group}.preqc_count.json"
444
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
445
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
446
447
448


rule fqcount_postqc:
Sander Bollen's avatar
Sander Bollen committed
449
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
450
    input:
451
452
        r1="{sample}/pre_process/{sample}-{read_group}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}-{read_group}.cutadapt_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
453
    output:
454
        "{sample}/pre_process/{sample}-{read_group}.postqc_count.json"
455
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
456
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
457
458


Sander Bollen's avatar
Sander Bollen committed
459
460
461
462
# fastqc stats
rule fastqc_stats:
    """Collect fastq stats for a sample in json format"""
    input:
Sander Bollen's avatar
Sander Bollen committed
463
464
        preqc_r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        preqc_r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip",
465
466
        postqc_r1="{sample}/pre_process/postqc_fastqc/{sample}-{read_group}.cutadapt_R1_fastqc.zip",
        postqc_r2="{sample}/pre_process/postqc_fastqc/{sample}-{read_group}.cutadapt_R2_fastqc.zip",
van den Berg's avatar
van den Berg committed
467
        sc=settings["fastq_stats"]
468
    singularity: containers["python3"]
Sander Bollen's avatar
Sander Bollen committed
469
    output:
470
        "{sample}/pre_process/{read_group}-fastq_stats.json"
471
472
473
    shell: "python {input.sc} --preqc-r1 {input.preqc_r1} "
           "--preqc-r2 {input.preqc_r2} "
           "--postqc-r1 {input.postqc_r1} "
Sander Bollen's avatar
Sander Bollen committed
474
           "--postqc-r2 {input.postqc_r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
475

Sander Bollen's avatar
Sander Bollen committed
476
477
478
## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
479
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
480
    input:
Sander Bollen's avatar
Sander Bollen committed
481
482
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
van den Berg's avatar
van den Berg committed
483
        covpy=settings["covstats"],
van den Berg's avatar
van den Berg committed
484
        bed=settings.get("bedfile","")
Sander Bollen's avatar
Sander Bollen committed
485
486
487
    params:
        subt="Sample {sample}"
    output:
van den Berg's avatar
van den Berg committed
488
489
        covj="{sample}/coverage/covstats.json",
        covp="{sample}/coverage/covstats.png"
490
    singularity: containers["bedtools-2.26-python-2.7"]
Sander Bollen's avatar
Sander Bollen committed
491
492
493
494
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
495
496


Sander Bollen's avatar
Sander Bollen committed
497
498
499
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
500
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
501
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
502
        ref = settings.get('refflat', "")
Sander Bollen's avatar
Sander Bollen committed
503
    output:
504
        tsv="{sample}/coverage/refFlat_coverage.tsv"
505
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
506
507
508
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
509
510
511
## vcfstats

rule vcfstats:
Sander Bollen's avatar
Sander Bollen committed
512
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
513
    input:
514
        vcf="{sample}/vcf/{sample}.vcf.gz",
515
        tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
516
    output:
517
        stats="{sampel}/vcf/{sample}.vcfstats.json"
518
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
519
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
520
521


Sander Bollen's avatar
Sander Bollen committed
522
## collection
van den Berg's avatar
van den Berg committed
523
if "bedfile" in settings:
524
525
526
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
527
528
            #preqc="{sample}/pre_process/{sample}.preqc_count.json",
            #postq="{sample}/pre_process/{sample}.postqc_count.json",
Sander Bollen's avatar
Sander Bollen committed
529
530
531
532
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
533
            #fastqc="{sample}/pre_process/{read_group}-fastq_stats.json",
van den Berg's avatar
van den Berg committed
534
            cov="{sample}/coverage/covstats.json",
van den Berg's avatar
van den Berg committed
535
            colpy=settings["collect_stats"]
536
537
        params:
            sample_name="{sample}",
van den Berg's avatar
van den Berg committed
538
            fthresh=settings["female_threshold"]
539
        output:
Sander Bollen's avatar
Sander Bollen committed
540
            "{sample}/{sample}.stats.json"
541
        singularity: containers["vtools"]
542
543
544
545
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
546
547
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc} {input.cov} > {output}"
548
549
else:
    rule collectstats:
Sander Bollen's avatar
Sander Bollen committed
550
        """Collect all stats for a particular sample without beds"""
Sander Bollen's avatar
Sander Bollen committed
551
        input:
Sander Bollen's avatar
Sander Bollen committed
552
553
554
555
556
557
558
            preqc = "{sample}/pre_process/{sample}.preqc_count.json",
            postq = "{sample}/pre_process/{sample}.postqc_count.json",
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
van den Berg's avatar
van den Berg committed
559
            colpy = settings["collect_stats"]
Sander Bollen's avatar
Sander Bollen committed
560
561
        params:
            sample_name = "{sample}",
van den Berg's avatar
van den Berg committed
562
            fthresh = settings["female_threshold"]
Sander Bollen's avatar
Sander Bollen committed
563
        output:
Sander Bollen's avatar
Sander Bollen committed
564
            "{sample}/{sample}.stats.json"
565
        singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
566
567
568
569
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
570
571
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc}  > {output}"
Sander Bollen's avatar
Sander Bollen committed
572
573

rule merge_stats:
Sander Bollen's avatar
Sander Bollen committed
574
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
575
    input:
van den Berg's avatar
van den Berg committed
576
577
578
        cols=expand("{sample}/{sample}.stats.json", sample=settings['samples']),
        vstat=expand("{sample}/vcf/{sample}.vcfstats.json", sample=settings['samples']),
        mpy=settings["merge_stats"]
Sander Bollen's avatar
Sander Bollen committed
579
    output:
Sander Bollen's avatar
Sander Bollen committed
580
        stats="stats.json"
581
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
582
583
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
           "> {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
584
585


Sander Bollen's avatar
Sander Bollen committed
586
587
588
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
Sander Bollen's avatar
Sander Bollen committed
589
        stats="stats.json",
van den Berg's avatar
van den Berg committed
590
        sc=settings["stats_to_tsv"]
Sander Bollen's avatar
Sander Bollen committed
591
    output:
Sander Bollen's avatar
Sander Bollen committed
592
        stats="stats.tsv"
593
    singularity: containers["python3"]
Sander Bollen's avatar
Sander Bollen committed
594
595
596
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
597
598
599
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
600
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
601
602
    """
    input:
Sander Bollen's avatar
Sander Bollen committed
603
        stats="stats.tsv"
Sander Bollen's avatar
Sander Bollen committed
604
    params:
Sander Bollen's avatar
Sander Bollen committed
605
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
606
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
607
    output:
van den Berg's avatar
van den Berg committed
608
        "multiqc_report/multiqc_report.html"
609
    singularity: containers["multiqc"]
van den Berg's avatar
van den Berg committed
610
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output}"