#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
Main Snakefile for the pipeline.

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

import json
import jsonschema
from functools import partial
from os.path import join, basename
from pathlib import Path
import itertools

include: "common.smk"

process_config()

def get_readgroup(wildcards):
    return config["samples"][wildcards.sample]["read_groups"]

def get_readgroup_per_sample():
    for sample in config["samples"]:
        for rg in config["samples"][sample]["read_groups"]:
            yield rg, sample

def coverage_stats(wildcards):
    files = expand("{sample}/coverage/refFlat_coverage.tsv",
                   sample=config["samples"])
    return files if "refflat" in config else []
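
# Illustrative sketch (not part of the pipeline): after process_config() the
# samples section of the config is assumed to look roughly like this; it is
# the structure get_readgroup() and get_readgroup_per_sample() walk through
# (sample, read group and file names below are made up):
#
#   config["samples"] = {
#       "sample1": {
#           "read_groups": {
#               "lib_01": {"R1": "reads/sample1_lib_01_R1.fastq.gz",
#                          "R2": "reads/sample1_lib_01_R2.fastq.gz"},
#           }
#       }
#   }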

rule all:
    input:
        multiqc = "multiqc_report/multiqc_report.html",
        stats = "stats.json",
        stats_tsv = "stats.tsv",
        bam = expand("{s}/bams/{s}.bam", s=config["samples"]),
        vcfs = expand("{s}/vcf/{s}.vcf.gz", s=config["samples"]),
        vcf_tbi = expand("{s}/vcf/{s}.vcf.gz.tbi", s=config["samples"]),
        gvcfs = expand("{s}/vcf/{s}.g.vcf.gz", s=config["samples"]),
        gvcf_tbi = expand("{s}/vcf/{s}.g.vcf.gz.tbi", s=config["samples"]),
        coverage_stats = coverage_stats,
        coverage_files = coverage_files

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
    output: directory("tmp")
    log: "log/create_markdup_tmp.log"
    container: containers["debian"]
    shell: "mkdir -p {output} 2> {log}"

rule genome:
    """Create genome file as used by bedtools"""
    input: config["reference"]
    output: "current.genome"
    log: "log/genome.log"
    container: containers["debian"]
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output} 2> {log}"
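
# The awk one-liner keeps the first two columns of the reference .fai index,
# so the resulting genome file is the two-column, tab-separated format
# bedtools expects, e.g. (illustrative values):
#   chr1    248956422
#   chr2    242193529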

rule cutadapt:
    """Clip fastq files"""
    input:
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    output:
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
    log:
        "{sample}/pre_process/{sample}-{read_group}.txt"
    container: containers["cutadapt"]
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--compression-level=1 -j 8 "
           "--output {output.r1} --paired-output {output.r2} -Z "
           "{input.r1} {input.r2} "
           "--report=minimal > {log}"

rule align:
    """Align fastq files"""
    input:
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2,
        ref = config["reference"],
        tmp = ancient("tmp")
    params:
        rg = "@RG\\tID:{sample}-library-{read_group}\\tSM:{sample}\\tLB:library\\tPL:ILLUMINA"
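        # @RG fields: ID must be unique per read group, SM is the sample name
        # used by downstream GATK steps, LB the library and PL the platform.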
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
    log: "log/{sample}/align.{read_group}.log"
    container: containers["bwa-0.7.17-picard-2.22.8"]
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate 2> {log}"

rule markdup:
    """Mark duplicates in BAM file"""
    input:
        bam = sample_bamfiles,
        tmp = ancient("tmp")
    output:
        bam = "{sample}/bams/{sample}.bam",
        bai = "{sample}/bams/{sample}.bai",
        metrics = "{sample}/bams/{sample}.metrics"
    log: "log/{sample}/markdup.log"
    params:
        bams = lambda wc: expand('INPUT={bam}', bam=sample_bamfiles(wc))
    container: containers["picard"]
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "{params.bams} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500 2> {log}"

def bqsr_bam_input(wildcards):
    """Generate the bam input string for each read group for BQSR"""
    template = "-I {sample}/bams/{sample}-{read_group}.sorted.bam"
    return " ".join([template.format(sample=wildcards.sample,
            read_group=rg) for rg in get_readgroup(wildcards)])
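
# For example (illustrative), a sample "s1" with read groups "rg1" and "rg2"
# yields: "-I s1/bams/s1-rg1.sorted.bam -I s1/bams/s1-rg2.sorted.bam".
# Note that rule baserecal below builds its "-I ..." arguments from
# sample_bamfiles() in its params instead.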

rule baserecal:
    """Base recalibrated BAM files"""
    input:
        bam = sample_bamfiles,
        ref = config["reference"],
        vcfs = config["known_sites"]
    output: "{sample}/bams/{sample}.baserecal.grp"
    log: "log/{sample}/baserecal.log"
    params:
        known_sites = expand("-knownSites {vcf}", vcf=config["known_sites"]),
        region = f"-L {config['restrict_BQSR']}" if "restrict_BQSR" in config else "",
        gatk_jar = config["gatk_jar"],
        bams = lambda wc: expand("-I {bam}", bam=sample_bamfiles(wc))
    container: containers["gatk"]
    shell: "java -XX:ParallelGCThreads=1 -jar {params.gatk_jar} -T "
           "BaseRecalibrator {params.bams} -o {output} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites} "
           "{params.region} 2> {log}"

checkpoint scatterregions:
    """Scatter the reference genome"""
    input:
        ref = config["reference"],
    params:
        size = config['scatter_size']
    output:
        directory("scatter")
    log: "log/scatterregions.log"
    container: containers["biopet-scatterregions"]
    shell: "mkdir -p {output} && "
           "biopet-scatterregions -Xmx24G "
           "--referenceFasta {input.ref} --scatterSize {params.size} "
           "--outputDir scatter 2> {log}"

rule gvcf_scatter:
    """Run HaplotypeCaller in GVCF mode by chunk"""
    input:
        bam = rules.markdup.output.bam,
        bqsr = rules.baserecal.output,
        dbsnp = config["dbsnp"],
        ref = config["reference"],
        region = "scatter/scatter-{chunk}.bed"
    output:
        gvcf = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
    params:
        gatk_jar = config["gatk_jar"],
    log: "log/{sample}/gvcf_scatter.{chunk}.log"
    wildcard_constraints:
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 {params.gatk_jar} -T "
           "HaplotypeCaller -ERC GVCF -I "
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
           "-L '{input.region}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 2> {log}"

def aggregate_gvcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_gvcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
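
# Sketch of how the checkpoint resolves: once scatterregions has produced e.g.
# scatter/scatter-0.bed and scatter/scatter-1.bed, the glob above expands to
# ["{sample}/vcf/{sample}.0.g.vcf.gz", "{sample}/vcf/{sample}.1.g.vcf.gz"]
# (and the matching .tbi files), which gvcf_gather then concatenates.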

rule gvcf_gather:
    """Join the gvcf files together"""
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
    log:
        concat = "log/{sample}/gvcf_gather_concat.log",
        index = "log/{sample}/gvcf_gather_index.log"
    container: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps "
           "--output {output.gvcf} --output-type z 2> {log.concat} && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} "
           "{output.gvcf} 2> {log.index}"

rule genotype_scatter:
    """Run GATK's GenotypeGVCFs by chunk"""
    input:
        gvcf = rules.gvcf_scatter.output.gvcf,
        tbi = rules.gvcf_scatter.output.gvcf_tbi,
        ref = config["reference"]
    output:
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
    params:
        gatk_jar = config["gatk_jar"]
    log: "log/{sample}/genotype_scatter.{chunk}.log"
    wildcard_constraints:
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 {params.gatk_jar} -T "
           "GenotypeGVCFs -R {input.ref} "
           "-V {input.gvcf} -o '{output.vcf}' 2> {log}"

def aggregate_vcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_vcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule genotype_gather:
    """Gather all genotyping VCFs"""
    input:
        vcfs = aggregate_vcf,
        vcfs_tbi = aggregate_vcf_tbi
    output:
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
    log: "log/{sample}/genotype_gather.log"
    container: containers["bcftools"]
    shell: "bcftools concat {input.vcfs} --allow-overlaps "
           "--output {output.vcf} --output-type z 2> {log} && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"

## bam metrics
rule mapped_reads_bases:
    """Calculate number of mapped reads"""
    input:
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.mapped.num",
        bases = "{sample}/bams/{sample}.mapped.basenum"
    log: "log/{sample}/mapped_reads_bases.log"
    container: containers["samtools-1.7-python-3.6"]
    shell: "samtools view -F 4 {input.bam} 2> {log} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"

rule unique_reads_bases:
    """Calculate number of unique reads"""
    input:
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.unique.num",
        bases = "{sample}/bams/{sample}.usable.basenum"
    log: "log/{sample}/unique_reads_bases.log"
    container: containers["samtools-1.7-python-3.6"]
    shell: "samtools view -F 4 -F 1024 {input.bam} 2> {log} | cut -f 10 | "
           "python {input.pywc} --reads {output.reads} "
           "--bases {output.bases} 2>> {log}"

## fastqc
rule fastqc_raw:
    """Run fastqc on raw fastq files"""
    input:
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    output:
        done = "{sample}/pre_process/raw-{sample}-{read_group}/.done"
    log: "log/{sample}/fastqc_raw.{read_group}.log"
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o $(dirname {output.done}) "
           "{input.r1} {input.r2} 2> {log} && "
           "touch {output.done}"

rule fastqc_postqc:
    """Run fastqc on fastq files post pre-processing"""
    input:
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2
    output:
        done = "{sample}/pre_process/trimmed-{sample}-{read_group}/.done"
    log: "log/{sample}/fastqc_postqc.{read_group}.log"
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o $(dirname {output.done}) "
           "{input.r1} {input.r2} 2> {log} && "
           "touch {output.done}"

## coverage
rule covstats:
    """Calculate coverage statistics on bam file"""
    input:
        bam = rules.markdup.output.bam,
        genome = "current.genome",
        covpy = config["covstats"],
        bed = config.get("targetsfile","")
    params:
        subt = "Sample {sample}"
    output:
        covj = "{sample}/coverage/covstats.json",
        covp = "{sample}/coverage/covstats.png"
    log:
        bedtools = "log/{sample}/covstats_bedtools.log",
        covpy = "log/{sample}/covstats_covpy.log"
    container: containers["bedtools-2.26-python-2.7"]
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  2> {log.bedtools} | python {input.covpy} - "
           "--plot {output.covp} --title 'Targets coverage' "
           "--subtitle '{params.subt}' > {output.covj} 2> {log.covpy}"

rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
        gvcf = rules.gvcf_gather.output.gvcf,
        tbi = rules.gvcf_gather.output.gvcf_tbi,
        ref = config.get('refflat', "")
    output: "{sample}/coverage/refFlat_coverage.tsv"
    log: "log/{sample}/vtools_coverage.log"
    container: containers["vtools"]
    shell: "vtools-gcoverage -I {input.gvcf} "
           "-R {input.ref} > {output} 2> {log}"

rule collect_cutadapt_summary:
    """Colect cutadapt summary from each readgroup per sample """
    input:
        cutadapt = lambda wildcards:
        ("{sample}/pre_process/{sample}-{read_group}.txt".format(
            sample=wildcards.sample, read_group=read_group)
            for read_group in get_readgroup(wildcards)),
        cutadapt_summary = config["cutadapt_summary"]
    output: "{sample}/cutadapt.json"
    log: "log/{sample}/collect_cutadapt_summary.log"
    container: containers["python3"]
    shell: "python {input.cutadapt_summary} --sample {wildcards.sample} "
           "--cutadapt-summary {input.cutadapt} > {output}"

rule collectstats:
    """Collect all stats for a particular sample"""
    input:
        mnum = rules.mapped_reads_bases.output.reads,
        mbnum = rules.mapped_reads_bases.output.bases,
        unum = rules.unique_reads_bases.output.reads,
        ubnum = rules.unique_reads_bases.output.bases,
        cov = rules.covstats.output.covj if "targetsfile" in config else [],
        cutadapt = rules.collect_cutadapt_summary.output,
        colpy = config["collect_stats"]
    params:
        fthresh = config["female_threshold"]
    output: "{sample}/{sample}.stats.json"
    log: "log/{sample}/collectstats.log"
    container: containers["python3"]
    shell: "python {input.colpy} --sample-name {wildcards.sample} "
           "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
           "--unique-num {input.unum} --usable-basenum {input.ubnum} "
           "--female-threshold {params.fthresh} "
           "--cutadapt {input.cutadapt} "
           "--covstats {input.cov} > {output} 2> {log}"

rule multiple_metrics:
    """Run picard CollectMultipleMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
    params:
        prefix = lambda wildcards, output: output.alignment[:-26]
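        # [:-26] strips the ".alignment_summary_metrics" suffix, so picard
        # receives "{sample}/bams/{sample}" as its output prefix.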
    output:
        alignment = "{sample}/bams/{sample}.alignment_summary_metrics",
        insertMetrics = "{sample}/bams/{sample}.insert_size_metrics"
    log: "log/{sample}/multiple_metrics.log"
    container: containers["picard"]
    shell: "picard -Xmx8G -XX:CompressedClassSpaceSize=256m CollectMultipleMetrics "
           "I={input.bam} O={params.prefix} "
           "R={input.ref} "
           "PROGRAM=CollectAlignmentSummaryMetrics "
           "PROGRAM=CollectInsertSizeMetrics 2> {log}"

rule bed_to_interval:
    """Run picard BedToIntervalList

    This is needed to convert the bed files for the capture kit to a format
    picard can read
    """
    input:
        targets = config.get("targetsfile",""),
        baits = config.get("baitsfile",""),
        ref = config["reference"]
    output:
        target_interval = "target.interval",
        baits_interval = "baits.interval"
    log:
        target = "log/bed_to_interval_target.log",
        baits = "log/bed_to_interval_target.log"
    container: containers["picard"]
    shell: "picard -Xmx4G BedToIntervalList "
            "I={input.targets} O={output.target_interval} "
            "SD={input.ref} 2> {log.target} && "
            "picard BedToIntervalList "
            "I={input.baits} O={output.baits_interval} "
            "SD={input.ref} 2> {log.baits}"

rule hs_metrics:
    """Run picard CollectHsMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
        targets = rules.bed_to_interval.output.target_interval,
        baits = rules.bed_to_interval.output.baits_interval,
    output: "{sample}/bams/{sample}.hs_metrics.txt"
    log: "log/{sample}/hs_metrics.log"
    container: containers["picard"]
    shell: "picard -Xmx8G -XX:CompressedClassSpaceSize=256m CollectHsMetrics "
           "I={input.bam} O={output} "
           "R={input.ref} "
           "BAIT_INTERVALS={input.baits} "
           "TARGET_INTERVALS={input.targets}"

rule multiqc:
    """
    Create multiQC report
    Aggregates the bam and QC metrics of all samples into a single report
    """
    input:
        bam = expand("{sample}/bams/{sample}.bam", sample=config["samples"]),
        metric = expand("{sample}/bams/{sample}.metrics",
                        sample=config["samples"]),
        alignment_metrics = expand(
                "{sample}/bams/{sample}.alignment_summary_metrics",
                sample=config["samples"]
        ),
        insert_metrics = expand(
                "{sample}/bams/{sample}.insert_size_metrics",
                sample=config["samples"]
        ),
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/.done"
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/.done"
                      for read_group, sample in get_readgroup_per_sample()),

        hs_metric = expand("{sample}/bams/{sample}.hs_metrics.txt",
                           sample=config["samples"]) if "baitsfile" in config else []

    output:
        html = "multiqc_report/multiqc_report.html",
        insertSize = "multiqc_report/multiqc_data/multiqc_picard_insertSize.json",
        AlignmentMetrics = "multiqc_report/multiqc_data/multiqc_picard_AlignmentSummaryMetrics.json",
        DuplicationMetrics = "multiqc_report/multiqc_data/multiqc_picard_dups.json",
        HsMetrics = "multiqc_report/multiqc_data/multiqc_picard_HsMetrics.json" if "baitsfile" in config else []
    log: "log/multiqc.log"
    container: containers["multiqc"]
    shell: "multiqc --data-format json --force "
           "--outdir multiqc_report . 2> {log} "
           "|| touch {output}"

rule merge_stats:
    """Merge all stats of all samples"""
    input:
        cols = expand("{sample}/{sample}.stats.json",
                      sample=config['samples']),
        mpy = config["merge_stats"],
        insertSize = rules.multiqc.output.insertSize,
        AlignmentMetrics = rules.multiqc.output.AlignmentMetrics,
        DuplicationMetrics = rules.multiqc.output.DuplicationMetrics,
        HsMetrics = rules.multiqc.output.HsMetrics
    output: "stats.json"
    log: "log/stats.tsv.log"
    container: containers["vtools"]
    shell: "python {input.mpy} --collectstats {input.cols} "
           "--picard-insertSize {input.insertSize} "
           "--picard-AlignmentMetrics {input.AlignmentMetrics} "
           "--picard-DuplicationMetrics {input.DuplicationMetrics} "
           "--picard-HsMetrics {input.HsMetrics} > {output} 2> {log}"

rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
        stats = rules.merge_stats.output,
        sc = config["stats_to_tsv"]
    output: "stats.tsv"
    log: "log/stats.tsv.log"
    container: containers["python3"]
    shell: "python {input.sc} -i {input.stats} > {output} 2> {log}"


rule gvcf2coverage:
    """ Determine coverage from gvcf files """
    input: rules.gvcf_gather.output.gvcf
    output: "{sample}/vcf/{sample}_{threshold}.bed"
    log: "log/{sample}/gvcf2coverage.{threshold}.log"
    container: containers["gvcf2coverage"]
    shell: "gvcf2coverage -t {wildcards.threshold} < {input} 2> {log} | cut -f 1,2,3 > {output}"