Snakefile 22 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
#   hutspot - a DNAseq variant calling pipeline
van den Berg's avatar
van den Berg committed
2
#   Copyright (C) 2017-2019, Leiden University Medical Center
Sander Bollen's avatar
Sander Bollen committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

31
include: "common.smk"
van den Berg's avatar
van den Berg committed
32

33
process_config()
Sander Bollen's avatar
Sander Bollen committed
34

35
def get_readgroup(wildcards):
    """Return the read groups defined for the sample named in *wildcards*."""
    sample_config = config["samples"][wildcards.sample]
    return sample_config["read_groups"]
Sander Bollen's avatar
Sander Bollen committed
37

van den Berg's avatar
van den Berg committed
38
def get_readgroup_per_sample():
    """Yield (read_group, sample) pairs for every read group of every sample."""
    for sample, properties in config["samples"].items():
        yield from ((read_group, sample)
                    for read_group in properties["read_groups"])

43
def coverage_stats(wildcards):
    """Return the refFlat coverage files per sample, or [] when no refflat
    is configured (the vtools_coverage rule then never runs)."""
    if "refflat" not in config:
        return []
    return expand("{sample}/coverage/refFlat_coverage.tsv",
                  sample=config["samples"])
Sander Bollen's avatar
Sander Bollen committed
47

Sander Bollen's avatar
Sander Bollen committed
48
49
rule all:
    """Default target: request every final output of the pipeline."""
    input:
        multiqc = "multiqc_report/multiqc_report.html",
        stats = "stats.json",
        stats_tsv = "stats.tsv",

        # Per-sample alignment and variant calling results.
        bam = expand("{sample}/bams/{sample}.bam",
                     sample=config["samples"]),

        vcfs = expand("{sample}/vcf/{sample}.vcf.gz",
                      sample=config["samples"]),

        vcf_tbi = expand("{sample}/vcf/{sample}.vcf.gz.tbi",
                         sample=config["samples"]),

        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz",
                       sample=config["samples"]),

        gvcf_tbi = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi",
                          sample=config["samples"]),

        # FastQC sentinel files, per read group, before and after trimming.
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/.done"
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/.done"
                      for read_group, sample in get_readgroup_per_sample()),

        # Cutadapt summary per read group (the cutadapt rule's log file).
        cutadapt = (f"{sample}/pre_process/{sample}-{read_group}.txt"
                    for read_group, sample in get_readgroup_per_sample()),

        # Optional outputs, only requested when configured.
        coverage_stats = coverage_stats,
        coverage_files = (f"{sample}/vcf/{sample}_{threshold}.bed"
                          for sample, threshold in itertools.product(
                              config['samples'], config['coverage_threshold'])
                          ) if 'coverage_threshold' in config else []
Sander Bollen's avatar
Sander Bollen committed
82

83
84
85
rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
    # Consumers declare this input with ancient("tmp") so a changed mtime on
    # the directory does not retrigger the whole pipeline.
    output: directory("tmp")
    log: "log/create_markdup_tmp.log"
    container: containers["debian"]
    shell: "mkdir -p {output} 2> {log}"
89

Sander Bollen's avatar
Sander Bollen committed
90
rule genome:
    """Create genome file as used by bedtools"""
    input: config["reference"]
    # Two-column file: contig name and length, taken from the reference's
    # .fai index (the .fai must therefore already exist next to the fasta).
    output: "current.genome"
    log: "log/genome.log"
    container: containers["debian"]
    # NOTE(review): the awk program is quoted unconventionally — after
    # Snakemake unescapes {{ }} the shell still hands awk the literal
    # program {print $1,$2}, but conventional quoting would be
    # awk '{{print $1,$2}}'.
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output} 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
97
98

rule cutadapt:
    """Clip fastq files"""
    input:
        # Raw fastq paths come straight from the per-read-group config.
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    output:
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
    # The log file doubles as the per-read-group summary (stdout of
    # --report=minimal) consumed by collect_cutadapt_summary.
    log:
        "{sample}/pre_process/{sample}-{read_group}.txt"
    container: containers["cutadapt"]
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--compression-level=1 -j 8 "
           "--output {output.r1} --paired-output {output.r2} -Z "
           "{input.r1} {input.r2} "
           "--report=minimal > {log}"
Sander Bollen's avatar
Sander Bollen committed
115
116

rule align:
    """Align fastq files"""
    input:
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2,
        ref = config["reference"],
        # ancient(): do not rerun when only the tmp dir's mtime changed.
        tmp = ancient("tmp")
    params:
        # Read-group header attached by bwa mem; ID is unique per
        # sample/read-group combination.
        rg = "@RG\\tID:{sample}-library-{read_group}\\tSM:{sample}\\tLB:library\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
    log: "log/{sample}/align.{read_group}.log"
    container: containers["bwa-0.7.17-picard-2.22.8"]
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
132

133
def markdup_bam_input(wildcards):
    """Build one picard ``INPUT=`` argument per read group of the sample."""
    template = "INPUT={sample}/bams/{sample}-{read_group}.sorted.bam"
    arguments = []
    for read_group in get_readgroup(wildcards):
        arguments.append(template.format(sample=wildcards.sample,
                                         read_group=read_group))
    return arguments
138

Sander Bollen's avatar
Sander Bollen committed
139
rule markdup:
    """Mark duplicates in BAM file"""
    input:
        # One sorted per-read-group BAM per read group; the lambda only
        # declares the dependencies, params.bams renders the INPUT= flags.
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
        tmp = ancient("tmp")
    output:
        bam = "{sample}/bams/{sample}.bam",
        bai = "{sample}/bams/{sample}.bai",
        metrics = "{sample}/bams/{sample}.metrics"
    log: "log/{sample}/markdup.log"
    params:
        bams=markdup_bam_input
    container: containers["picard"]
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "{params.bams} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
160

161
def bqsr_bam_input(wildcards):
    """Render the GATK ``-I`` flags for every read group as one string."""
    flags = ("-I {sample}/bams/{sample}-{read_group}.sorted.bam".format(
                 sample=wildcards.sample, read_group=read_group)
             for read_group in get_readgroup(wildcards))
    return " ".join(flags)

Sander Bollen's avatar
Sander Bollen committed
167
rule baserecal:
    """Base recalibrated BAM files"""
    input:
        # All per-read-group BAMs of the sample; the lambda declares the
        # dependencies, params.bams renders the matching -I flags.
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
        ref = config["reference"],
        vcfs = config["known_sites"]
    output: "{sample}/bams/{sample}.baserecal.grp"
    log: "log/{sample}/baserecal.log"
    params:
        # GATK3-style flag spelling (-knownSites), one per configured VCF.
        known_sites = " ".join(
                expand("-knownSites {vcf}", vcf=config["known_sites"])
        ),
        # Optional region restriction to speed up recalibration.
        region = f"-L {config['restrict_BQSR']}" if "restrict_BQSR" in config else "",
        gatk_jar = config["gatk_jar"],
        bams = bqsr_bam_input
    container: containers["gatk"]
    shell: "java -XX:ParallelGCThreads=1 -jar {params.gatk_jar} -T "
           "BaseRecalibrator {params.bams} -o {output} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites} "
           "{params.region} 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
191

192
# A checkpoint (not a rule) because the number of scatter-{chunk}.bed files
# is only known after execution; the aggregate_* input functions re-glob
# the output directory once this has run.
checkpoint scatterregions:
    """Scatter the reference genome"""
    input:
        ref = config["reference"],
    params:
        size = config['scatter_size']
    output:
        directory("scatter")
    log: "log/scatterregions.log"
    container: containers["biopet-scatterregions"]
    shell: "mkdir -p {output} && "
           "biopet-scatterregions -Xmx24G "
           "--referenceFasta {input.ref} --scatterSize {params.size} "
           "--outputDir scatter 2> {log}"
van den Berg's avatar
van den Berg committed
206

Sander Bollen's avatar
Sander Bollen committed
207
rule gvcf_scatter:
    """Run HaplotypeCaller in GVCF mode by chunk"""
    input:
        bam = rules.markdup.output.bam,
        # Recalibration table is applied on the fly via -BQSR below.
        bqsr = rules.baserecal.output,
        dbsnp = config["dbsnp"],
        ref = config["reference"],
        region = "scatter/scatter-{chunk}.bed"
    output:
        # temp(): per-chunk gvcfs are deleted once gvcf_gather has merged them.
        gvcf = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
    params:
        gatk_jar = config["gatk_jar"],
    log: "log/{sample}/gvcf_scatter.{chunk}.log"
    # Keep {chunk} numeric so it cannot swallow parts of the sample name.
    wildcard_constraints:
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 {params.gatk_jar} -T "
           "HaplotypeCaller -ERC GVCF -I "
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
           "-L '{input.region}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
232

233
234
235
def aggregate_gvcf(wildcards):
    """Return every per-chunk gvcf produced by the scatterregions checkpoint.

    Blocks until the checkpoint has run, then globs the scatter directory.
    Uses the top-level ``from os.path import join``; the bare ``os`` module
    is not imported in this file, so ``os.path.join`` risked a NameError.
    """
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
       i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
237
238
239
240

def aggregate_gvcf_tbi(wildcards):
    """Return the .tbi index for every per-chunk gvcf of the sample.

    Mirrors aggregate_gvcf. Uses the top-level ``from os.path import join``;
    the bare ``os`` module is not imported in this file, so ``os.path.join``
    risked a NameError.
    """
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
       i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
242
243

rule gvcf_gather:
    """Join the gvcf files together"""
    input:
        # Both inputs come from the scatterregions checkpoint aggregation.
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
    log:
        concat = "log/{sample}/gvcf_gather_concat.log",
        index = "log/{sample}/gvcf_gather_index.log"
    container: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps "
           "--output {output.gvcf} --output-type z 2> {log.concat} && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} "
           "{output.gvcf} 2> {log.index}"
Sander Bollen's avatar
Sander Bollen committed
259
260

rule genotype_scatter:
    """Run GATK's GenotypeGVCFs by chunk"""
    input:
        gvcf = rules.gvcf_scatter.output.gvcf,
        tbi = rules.gvcf_scatter.output.gvcf_tbi,
        ref= config["reference"]
    output:
        # temp(): per-chunk vcfs are deleted once genotype_gather merged them.
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
    params:
        gatk_jar = config["gatk_jar"]
    log: "log/{sample}/genotype_scatter.{chunk}.log"
    # Keep {chunk} numeric so it cannot swallow parts of the sample name.
    wildcard_constraints:
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 {params.gatk_jar} -T "
           "GenotypeGVCFs -R {input.ref} "
           "-V {input.gvcf} -o '{output.vcf}' 2> {log}"
278

279
def aggregate_vcf(wildcards):
    """Return every per-chunk genotyped vcf of the sample.

    Mirrors aggregate_gvcf. Uses the top-level ``from os.path import join``;
    the bare ``os`` module is not imported in this file, so ``os.path.join``
    risked a NameError.
    """
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
       i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
283

284
285
286
def aggregate_vcf_tbi(wildcards):
    """Return the .tbi index for every per-chunk vcf of the sample.

    Mirrors aggregate_vcf. Uses the top-level ``from os.path import join``;
    the bare ``os`` module is not imported in this file, so ``os.path.join``
    risked a NameError.
    """
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
       i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
288

Sander Bollen's avatar
Sander Bollen committed
289
rule genotype_gather:
    """Gather all genotyping VCFs"""
    input:
        vcfs = aggregate_vcf,
        vcfs_tbi = aggregate_vcf_tbi
    output:
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
    log: "log/{sample}/genotype_gather.log"
    container: containers["bcftools"]
    # Fixed: stderr of the index step was previously not captured; append
    # it to the same log (cf. gvcf_gather, which logs both steps).
    shell: "bcftools concat {input.vcfs} --allow-overlaps "
           "--output {output.vcf} --output-type z 2> {log} && "
           "bcftools index --tbi --output-file {output.vcf_tbi} "
           "{output.vcf} 2>> {log}"
Sander Bollen's avatar
Sander Bollen committed
302

Sander Bollen's avatar
Sander Bollen committed
303
## bam metrics
304
rule mapped_reads_bases:
    """Calculate number of mapped reads"""
    input:
        bam = rules.markdup.output.bam,
        # Helper script counting reads and bases from sequence strings.
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.mapped.num",
        bases = "{sample}/bams/{sample}.mapped.basenum"
    log: "log/{sample}/mapped_reads_bases.log"
    container: containers["samtools-1.7-python-3.6"]
    # -F 4 excludes unmapped reads; column 10 is the read sequence.
    # Fixed: the wordcount script's stderr was previously not captured;
    # append it to the log (cf. unique_reads_bases).
    shell: "samtools view -F 4 {input.bam} 2> {log} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases} 2>> {log}"
Sander Bollen's avatar
Sander Bollen committed
316

317
rule unique_reads_bases:
    """Calculate number of unique reads"""
    input:
        bam = rules.markdup.output.bam,
        # Helper script counting reads and bases from sequence strings.
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.unique.num",
        bases = "{sample}/bams/{sample}.usable.basenum"
    log: "log/{sample}/unique_reads_bases.log"
    container: containers["samtools-1.7-python-3.6"]
    # -F 4 excludes unmapped reads, -F 1024 excludes marked duplicates;
    # column 10 is the read sequence.
    shell: "samtools view -F 4 -F 1024 {input.bam} 2> {log} | cut -f 10 | "
           "python {input.pywc} --reads {output.reads} "
           "--bases {output.bases} 2>> {log}"
Sander Bollen's avatar
Sander Bollen committed
330
331

## fastqc
Sander Bollen's avatar
Sander Bollen committed
332
rule fastqc_raw:
    """Run fastqc on raw fastq files"""
    input:
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    output:
        # Sentinel file: fastqc derives its own output names from the input
        # files, so a fixed marker is used to track completion instead.
        done = "{sample}/pre_process/raw-{sample}-{read_group}/.done"
    log: "log/{sample}/fastqc_raw.{read_group}.log"
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o $(dirname {output.done}) "
           "{input.r1} {input.r2} 2> {log} && "
           "touch {output.done}"
Sander Bollen's avatar
Sander Bollen committed
344

Sander Bollen's avatar
Sander Bollen committed
345
rule fastqc_postqc:
    """Run fastqc on fastq files post pre-processing"""
    input:
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2
    output:
        # Sentinel file, same rationale as in fastqc_raw.
        done = "{sample}/pre_process/trimmed-{sample}-{read_group}/.done"
    log: "log/{sample}/fastqc_postqc.{read_group}.log"
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o $(dirname {output.done}) "
           "{input.r1} {input.r2} 2> {log} && "
           "touch {output.done}"
Sander Bollen's avatar
Sander Bollen committed
357

358
## coverage
Sander Bollen's avatar
Sander Bollen committed
359
rule covstats:
    """Calculate coverage statistics on bam file"""
    input:
        bam = rules.markdup.output.bam,
        # Contig/length file from the genome rule, required by -sorted.
        genome = "current.genome",
        covpy = config["covstats"],
        # Only requested by collectstats when "targetsfile" is configured;
        # the empty-string default keeps the rule parseable otherwise.
        bed = config.get("targetsfile","")
    params:
        subt = "Sample {sample}"
    output:
        covj = "{sample}/coverage/covstats.json",
        covp = "{sample}/coverage/covstats.png"
    log:
        bedtools = "log/{sample}/covstats_bedtools.log",
        covpy = "log/{sample}/covstats_covpy.log"
    container: containers["bedtools-2.26-python-2.7"]
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  2> {log.bedtools} | python {input.covpy} - "
           "--plot {output.covp} --title 'Targets coverage' "
           "--subtitle '{params.subt}' > {output.covj} 2> {log.covpy}"
Sander Bollen's avatar
Sander Bollen committed
379

Sander Bollen's avatar
Sander Bollen committed
380
381
382
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
        gvcf = rules.gvcf_gather.output.gvcf,
        # Not referenced in the shell command; presumably declared so the
        # index exists when vtools-gcoverage reads the gvcf — TODO confirm.
        tbi = rules.gvcf_gather.output.gvcf_tbi,
        # refFlat transcript annotation; only requested when configured
        # (see coverage_stats), hence the empty-string default.
        ref = config.get('refflat', "")
    output: "{sample}/coverage/refFlat_coverage.tsv"
    log: "log/{sample}/vtools_coverage.log"
    container: containers["vtools"]
    shell: "vtools-gcoverage -I {input.gvcf} "
           "-R {input.ref} > {output} 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
391

392
rule collect_cutadapt_summary:
    """Collect cutadapt summary from each readgroup per sample"""
    input:
        # One cutadapt report (the cutadapt rule's log) per read group.
        cutadapt = lambda wildcards:
        ("{sample}/pre_process/{sample}-{read_group}.txt".format(
            sample=wildcards.sample, read_group=read_group)
            for read_group in get_readgroup(wildcards)),
        cutadapt_summary= config["cutadapt_summary"]
    output: "{sample}/cutadapt.json"
    log: "log/{sample}/collect_cutadapt_summary.log"
    container: containers["python3"]
    # Fixed: the declared log file was never written; redirect stderr to it.
    shell: "python {input.cutadapt_summary} --sample {wildcards.sample} "
           "--cutadapt-summary {input.cutadapt} > {output} 2> {log}"

406
407
408
409
410
411
412
rule collectstats:
    """Collect all stats for a particular sample"""
    input:
        mnum = rules.mapped_reads_bases.output.reads,
        mbnum = rules.mapped_reads_bases.output.bases,
        unum = rules.unique_reads_bases.output.reads,
        ubnum = rules.unique_reads_bases.output.bases,
        # Coverage stats only exist when a targets file is configured.
        cov = rules.covstats.output.covj if "targetsfile" in config else [],
        cutadapt = rules.collect_cutadapt_summary.output,
        colpy = config["collect_stats"]
    params:
        # Threshold used by the collect script for sex determination.
        fthresh = config["female_threshold"]
    output: "{sample}/{sample}.stats.json"
    log: "log/{sample}/collectstats.log"
    container: containers["python3"]
    shell: "python {input.colpy} --sample-name {wildcards.sample} "
           "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
           "--unique-num {input.unum} --usable-basenum {input.ubnum} "
           "--female-threshold {params.fthresh} "
           "--cutadapt {input.cutadapt} "
           "--covstats {input.cov} > {output} 2> {log}"
427

van den Berg's avatar
van den Berg committed
428
429
430
rule multiple_metrics:
    """Run picard CollectMultipleMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
    params:
        # Picard derives its output filenames from this prefix: strip the
        # 26-character ".alignment_summary_metrics" suffix from the output.
        prefix = lambda wildcards, output: output.alignment[:-26]
    output:
        alignment = "{sample}/bams/{sample}.alignment_summary_metrics",
        insertMetrics = "{sample}/bams/{sample}.insert_size_metrics"
    log: "log/{sample}/multiple_metrics.log"
    container: containers["picard"]
    shell: "picard -Xmx8G -XX:CompressedClassSpaceSize=256m CollectMultipleMetrics "
           "I={input.bam} O={params.prefix} "
           "R={input.ref} "
           "PROGRAM=CollectAlignmentSummaryMetrics "
           "PROGRAM=CollectInsertSizeMetrics 2> {log}"
van den Berg's avatar
van den Berg committed
445

446
447
448
449
450
451
452
rule bed_to_interval:
    """Run picard BedToIntervalList

    This is needed to convert the bed files for the capture kit to a format
    picard can read
    """
    input:
        targets = config.get("targetsfile",""),
        baits = config.get("baitsfile",""),
        ref = config["reference"]
    output:
        target_interval = "target.interval",
        baits_interval = "baits.interval"
    log:
        target = "log/bed_to_interval_target.log",
        # Fixed: this previously pointed at the same file as log.target,
        # so the baits conversion overwrote the target conversion log.
        baits = "log/bed_to_interval_baits.log"
    container: containers["picard"]
    shell: "picard -Xmx4G BedToIntervalList "
            "I={input.targets} O={output.target_interval} "
            "SD={input.ref} 2> {log.target} && "
            "picard BedToIntervalList "
            "I={input.baits} O={output.baits_interval} "
            "SD={input.ref} 2> {log.baits}"
469
470
471
472
473
474
475
476
477

rule hs_metrics:
    """Run picard CollectHsMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
        targets = rules.bed_to_interval.output.target_interval,
        baits = rules.bed_to_interval.output.baits_interval,
    output: "{sample}/bams/{sample}.hs_metrics.txt"
    log: "log/{sample}/hs_metrics.log"
    container: containers["picard"]
    # Fixed: a log file was declared but stderr was never redirected to it.
    shell: "picard -Xmx8G -XX:CompressedClassSpaceSize=256m CollectHsMetrics "
           "I={input.bam} O={output} "
           "R={input.ref} "
           "BAIT_INTERVALS={input.baits} "
           "TARGET_INTERVALS={input.targets} 2> {log}"

Sander Bollen's avatar
Sander Bollen committed
486
487
488
rule multiqc:
    """
    Create multiQC report
    Depends on stats.tsv to forcefully run at end of pipeline
    """
    # NOTE(review): the docstring mentions stats.tsv, but no such input is
    # declared below — verify whether that ordering claim still holds.
    input:
        bam = expand("{sample}/bams/{sample}.bam", sample=config["samples"]),
        metric = expand("{sample}/bams/{sample}.metrics",
                        sample=config["samples"]),
        alignment_metrics = expand(
                "{sample}/bams/{sample}.alignment_summary_metrics",
                sample=config["samples"]
        ),
        insert_metrics = expand(
                "{sample}/bams/{sample}.insert_size_metrics",
                sample=config["samples"]
        ),
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/.done"
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/.done"
                      for read_group, sample in get_readgroup_per_sample()),

        # HsMetrics only exist when a baits file is configured.
        hs_metric = expand("{sample}/bams/{sample}.hs_metrics.txt",
                           sample=config["samples"]) if "baitsfile" in config else []
    output:
        html = "multiqc_report/multiqc_report.html",
        # The JSON data files below are parsed again by merge_stats.
        insertSize = "multiqc_report/multiqc_data/multiqc_picard_insertSize.json",
        AlignmentMetrics = "multiqc_report/multiqc_data/multiqc_picard_AlignmentSummaryMetrics.json",
        DuplicationMetrics = "multiqc_report/multiqc_data/multiqc_picard_dups.json",
        HsMetrics = "multiqc_report/multiqc_data/multiqc_picard_HsMetrics.json" if "baitsfile" in config else []
    log: "log/multiqc.log"
    container: containers["multiqc"]
    # Best-effort: if multiqc fails, create empty outputs so the pipeline
    # can still finish ("|| touch {output}").
    shell: "multiqc --data-format json --force "
           "--outdir multiqc_report . 2> {log} "
           "|| touch {output}"
523
524
525
526
527
528
529

rule merge_stats:
    """Merge all stats of all samples"""
    input:
        cols = expand("{sample}/{sample}.stats.json",
                      sample=config['samples']),
        mpy = config["merge_stats"],
        # MultiQC's machine-readable picard summaries.
        insertSize = rules.multiqc.output.insertSize,
        AlignmentMetrics = rules.multiqc.output.AlignmentMetrics,
        DuplicationMetrics = rules.multiqc.output.DuplicationMetrics,
        HsMetrics = rules.multiqc.output.HsMetrics
    output: "stats.json"
    # Fixed: this rule previously logged to "log/stats.tsv.log", the same
    # file used by the stats_tsv rule, so one log clobbered the other.
    log: "log/merge_stats.log"
    container: containers["vtools"]
    shell: "python {input.mpy} --collectstats {input.cols} "
           "--picard-insertSize {input.insertSize} "
           "--picard-AlignmentMetrics {input.AlignmentMetrics} "
           "--picard-DuplicationMetrics {input.DuplicationMetrics} "
           "--picard-HsMetrics {input.HsMetrics} > {output} 2> {log}"
542
543
544
545
546
547
548

rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
        stats = rules.merge_stats.output,
        sc = config["stats_to_tsv"]
    output: "stats.tsv"
    log: "log/stats.tsv.log"
    container: containers["python3"]
    shell: "python {input.sc} -i {input.stats} > {output} 2> {log}"
552

553
554
555
556
557

rule gvcf2coverage:
    """ Determine coverage from gvcf files """
    input: rules.gvcf_gather.output.gvcf
    # One bed file per (sample, threshold) pair; thresholds come from the
    # optional config key "coverage_threshold" (see rule all).
    output: "{sample}/vcf/{sample}_{threshold}.bed"
    log: "log/{sample}/gvcf2coverage.{threshold}.log"
    container: containers["gvcf2coverage"]
    shell: "gvcf2coverage -t {wildcards.threshold} < {input} 2> {log} | cut -f 1,2,3 > {output}"