#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
Main Snakefile for the pipeline.

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

31
include: "common.smk"
van den Berg's avatar
van den Berg committed
32

33
process_config()
Sander Bollen's avatar
Sander Bollen committed
34

35
def get_readgroup(wildcards):
    """Return the read groups mapping for the sample in *wildcards*.

    `config` is injected into this module's namespace by Snakemake.
    """
    return config["samples"][wildcards.sample]["read_groups"]
def get_readgroup_per_sample():
    """Yield (read_group, sample) pairs for every read group of every sample.

    `config` is injected into this module's namespace by Snakemake.
    """
    for sample in config["samples"]:
        for rg in config["samples"][sample]["read_groups"]:
            yield rg, sample
def coverage_stats(wildcards):
    """Return refFlat coverage targets, or [] when no refflat is configured.

    `config` and `expand` are injected by Snakemake.
    """
    files = expand("{sample}/coverage/refFlat_coverage.tsv",
                   sample=config["samples"])
    return files if "refflat" in config else []
rule all:
    """Default target: every final deliverable of the pipeline"""
    input:
        multiqc = "multiqc_report/multiqc_report.html",
        stats = "stats.json",
        stats_tsv = "stats.tsv",
        bam = expand("{s}/bams/{s}.bam", s=config["samples"]),
        vcfs = expand("{s}/vcf/{s}.vcf.gz", s=config["samples"]),
        vcf_tbi = expand("{s}/vcf/{s}.vcf.gz.tbi", s=config["samples"]),
        gvcfs = expand("{s}/vcf/{s}.g.vcf.gz", s=config["samples"]),
        gvcf_tbi = expand("{s}/vcf/{s}.g.vcf.gz.tbi", s=config["samples"]),
        # coverage_stats is an input function (empty unless "refflat" is set);
        # coverage_files is defined in common.smk.
        coverage_stats = coverage_stats,
        coverage_files = coverage_files
rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
    output: directory("tmp")
    log: "log/create_markdup_tmp.log"
    container: containers["debian"]
    shell: "mkdir -p {output} 2> {log}"
rule genome:
    """Create genome file as used by bedtools

    Reads contig names and lengths from the reference's .fai index.
    """
    input: config["reference"]
    output: "current.genome"
    log: "log/genome.log"
    container: containers["debian"]
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output} 2> {log}"
rule cutadapt:
    """Clip fastq files

    Trims Illumina adapters and low-quality (Q<20) ends per read group.
    """
    input:
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    output:
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
    log:
        "{sample}/pre_process/{sample}-{read_group}.txt"
    container: containers["cutadapt"]
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--compression-level=1 -j 8 "
           "--output {output.r1} --paired-output {output.r2} -Z "
           "{input.r1} {input.r2} "
           "--report=minimal > {log}"
rule align:
    """Align fastq files

    bwa-mem output is piped straight into picard SortSam to produce a
    coordinate-sorted, indexed BAM per read group.
    """
    input:
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2,
        ref = config["reference"],
        # ancient(): the tmp dir's mtime must not retrigger alignment.
        tmp = ancient("tmp")
    params:
        rg = "@RG\\tID:{sample}-library-{read_group}\\tSM:{sample}\\tLB:library\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
    log: "log/{sample}/align.{read_group}.log"
    container: containers["bwa-0.7.17-picard-2.22.8"]
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate 2> {log}"
rule markdup:
    """Mark duplicates in BAM file

    Merges all per-read-group BAMs of a sample (markdup_input_files /
    markdup_input_string come from common.smk) into one duplicate-marked BAM.
    """
    input:
        bam = markdup_input_files,
        tmp = ancient("tmp")
    output:
        bam = "{sample}/bams/{sample}.bam",
        bai = "{sample}/bams/{sample}.bai",
        metrics = "{sample}/bams/{sample}.metrics"
    log: "log/{sample}/markdup.log"
    params:
        bams = markdup_input_string
    container: containers["picard"]
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "{params.bams} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500 2> {log}"
def bqsr_bam_input(wildcards):
    """Generate the bam input string for each read group for BQSR

    Produces "-I <bam>" arguments, one per read group of the sample.
    """
    template = "-I {sample}/bams/{sample}-{read_group}.sorted.bam"
    return " ".join(template.format(sample=wildcards.sample, read_group=rg)
                    for rg in get_readgroup(wildcards))
rule baserecal:
    """Base recalibrated BAM files

    Runs GATK3 BaseRecalibrator over all read-group BAMs of a sample.
    """
    input:
        # All per-read-group sorted BAMs for this sample.
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
        ref = config["reference"],
        vcfs = config["known_sites"]
    output: "{sample}/bams/{sample}.baserecal.grp"
    log: "log/{sample}/baserecal.log"
    params:
        known_sites = " ".join(
                expand("-knownSites {vcf}", vcf=config["known_sites"])
        ),
        # Optionally restrict recalibration to a region (e.g. for exomes).
        region = f"-L {config['restrict_BQSR']}" if "restrict_BQSR" in config else "",
        gatk_jar = config["gatk_jar"],
        bams = bqsr_bam_input
    container: containers["gatk"]
    shell: "java -XX:ParallelGCThreads=1 -jar {params.gatk_jar} -T "
           "BaseRecalibrator {params.bams} -o {output} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites} "
           "{params.region} 2> {log}"
checkpoint scatterregions:
    """Scatter the reference genome

    A checkpoint: the number of scatter-*.bed chunks is only known after
    this rule runs; the aggregate_* input functions re-evaluate afterwards.
    """
    input:
        ref = config["reference"],
    params:
        size = config['scatter_size']
    output:
        directory("scatter")
    log: "log/scatterregions.log"
    container: containers["biopet-scatterregions"]
    shell: "mkdir -p {output} && "
           "biopet-scatterregions -Xmx24G "
           "--referenceFasta {input.ref} --scatterSize {params.size} "
           "--outputDir scatter 2> {log}"
rule gvcf_scatter:
    """Run HaplotypeCaller in GVCF mode by chunk"""
    input:
        bam = rules.markdup.output.bam,
        bqsr = rules.baserecal.output,
        dbsnp = config["dbsnp"],
        ref = config["reference"],
        region = "scatter/scatter-{chunk}.bed"
    output:
        # temp(): per-chunk gvcfs are removed once gvcf_gather has run.
        gvcf = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
    params:
        gatk_jar = config["gatk_jar"],
    log: "log/{sample}/gvcf_scatter.{chunk}.log"
    wildcard_constraints:
        # Keep {chunk} numeric so it cannot swallow parts of the sample name.
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 {params.gatk_jar} -T "
           "HaplotypeCaller -ERC GVCF -I "
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
           "-L '{input.region}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 2> {log}"
def aggregate_gvcf(wildcards):
    """List the per-chunk gvcfs once the scatterregions checkpoint has run."""
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
def aggregate_gvcf_tbi(wildcards):
    """List the per-chunk gvcf indexes once scatterregions has run."""
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
rule gvcf_gather:
    """Join the gvcf files together"""
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
    log:
        concat = "log/{sample}/gvcf_gather_concat.log",
        index = "log/{sample}/gvcf_gather_index.log"
    container: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps "
           "--output {output.gvcf} --output-type z 2> {log.concat} && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} "
           "{output.gvcf} 2> {log.index}"
rule genotype_scatter:
    """Run GATK's GenotypeGVCFs by chunk"""
    input:
        gvcf = rules.gvcf_scatter.output.gvcf,
        tbi = rules.gvcf_scatter.output.gvcf_tbi,
        ref = config["reference"]
    output:
        # temp(): per-chunk vcfs are removed once genotype_gather has run.
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
    params:
        gatk_jar = config["gatk_jar"]
    log: "log/{sample}/genotype_scatter.{chunk}.log"
    wildcard_constraints:
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 {params.gatk_jar} -T "
           "GenotypeGVCFs -R {input.ref} "
           "-V {input.gvcf} -o '{output.vcf}' 2> {log}"
def aggregate_vcf(wildcards):
    """List the per-chunk vcfs once the scatterregions checkpoint has run."""
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
def aggregate_vcf_tbi(wildcards):
    """List the per-chunk vcf indexes once scatterregions has run."""
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
rule genotype_gather:
    """Gather all genotyping VCFs"""
    input:
        vcfs = aggregate_vcf,
        vcfs_tbi = aggregate_vcf_tbi
    output:
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
    log: "log/{sample}/genotype_gather.log"
    container: containers["bcftools"]
    shell: "bcftools concat {input.vcfs} --allow-overlaps "
           "--output {output.vcf} --output-type z 2> {log} && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
## bam metrics
rule mapped_reads_bases:
    """Calculate number of mapped reads

    -F 4 excludes unmapped reads; column 10 is the read sequence.
    """
    input:
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.mapped.num",
        bases = "{sample}/bams/{sample}.mapped.basenum"
    log: "log/{sample}/mapped_reads_bases.log"
    container: containers["samtools-1.7-python-3.6"]
    shell: "samtools view -F 4 {input.bam} 2> {log} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
rule unique_reads_bases:
    """Calculate number of unique reads

    -F 4 excludes unmapped reads, -F 1024 excludes PCR/optical duplicates.
    """
    input:
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.unique.num",
        bases = "{sample}/bams/{sample}.usable.basenum"
    log: "log/{sample}/unique_reads_bases.log"
    container: containers["samtools-1.7-python-3.6"]
    shell: "samtools view -F 4 -F 1024 {input.bam} 2> {log} | cut -f 10 | "
           "python {input.pywc} --reads {output.reads} "
           "--bases {output.bases} 2>> {log}"
## fastqc
rule fastqc_raw:
    """Run fastqc on raw fastq files

    A .done sentinel is used because fastqc output names depend on the
    input file names.
    """
    input:
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    output:
        done = "{sample}/pre_process/raw-{sample}-{read_group}/.done"
    log: "log/{sample}/fastqc_raw.{read_group}.log"
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o $(dirname {output.done}) "
           "{input.r1} {input.r2} 2> {log} && "
           "touch {output.done}"
rule fastqc_postqc:
    """Run fastqc on fastq files post pre-processing"""
    input:
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2
    output:
        done = "{sample}/pre_process/trimmed-{sample}-{read_group}/.done"
    log: "log/{sample}/fastqc_postqc.{read_group}.log"
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o $(dirname {output.done}) "
           "{input.r1} {input.r2} 2> {log} && "
           "touch {output.done}"
## coverage
rule covstats:
    """Calculate coverage statistics on bam file"""
    input:
        bam = rules.markdup.output.bam,
        genome = "current.genome",
        covpy = config["covstats"],
        # Empty default: the rule is only requested when "targetsfile" is set
        # (see collectstats).
        bed = config.get("targetsfile", "")
    params:
        subt = "Sample {sample}"
    output:
        covj = "{sample}/coverage/covstats.json",
        covp = "{sample}/coverage/covstats.png"
    log:
        bedtools = "log/{sample}/covstats_bedtools.log",
        covpy = "log/{sample}/covstats_covpy.log"
    container: containers["bedtools-2.26-python-2.7"]
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  2> {log.bedtools} | python {input.covpy} - "
           "--plot {output.covp} --title 'Targets coverage' "
           "--subtitle '{params.subt}' > {output.covj} 2> {log.covpy}"
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
        gvcf = rules.gvcf_gather.output.gvcf,
        tbi = rules.gvcf_gather.output.gvcf_tbi,
        # Empty default: only requested when "refflat" is set (coverage_stats).
        ref = config.get('refflat', "")
    output: "{sample}/coverage/refFlat_coverage.tsv"
    log: "log/{sample}/vtools_coverage.log"
    container: containers["vtools"]
    shell: "vtools-gcoverage -I {input.gvcf} "
           "-R {input.ref} > {output} 2> {log}"
rule collect_cutadapt_summary:
    """Collect cutadapt summary from each readgroup per sample"""
    input:
        # One cutadapt report per read group of the sample.
        cutadapt = lambda wildcards:
        ("{sample}/pre_process/{sample}-{read_group}.txt".format(
            sample=wildcards.sample, read_group=read_group)
            for read_group in get_readgroup(wildcards)),
        cutadapt_summary = config["cutadapt_summary"]
    output: "{sample}/cutadapt.json"
    log: "log/{sample}/collect_cutadapt_summary.log"
    container: containers["python3"]
    shell: "python {input.cutadapt_summary} --sample {wildcards.sample} "
           "--cutadapt-summary {input.cutadapt} > {output}"
rule collectstats:
    """Collect all stats for a particular sample"""
    input:
        mnum = rules.mapped_reads_bases.output.reads,
        mbnum = rules.mapped_reads_bases.output.bases,
        unum = rules.unique_reads_bases.output.reads,
        ubnum = rules.unique_reads_bases.output.bases,
        # Coverage stats only exist when a targets file was configured.
        cov = rules.covstats.output.covj if "targetsfile" in config else [],
        cutadapt = rules.collect_cutadapt_summary.output,
        colpy = config["collect_stats"]
    params:
        fthresh = config["female_threshold"]
    output: "{sample}/{sample}.stats.json"
    log: "log/{sample}/collectstats.log"
    container: containers["python3"]
    shell: "python {input.colpy} --sample-name {wildcards.sample} "
           "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
           "--unique-num {input.unum} --usable-basenum {input.ubnum} "
           "--female-threshold {params.fthresh} "
           "--cutadapt {input.cutadapt} "
           "--covstats {input.cov} > {output} 2> {log}"
rule multiple_metrics:
    """Run picard CollectMultipleMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
    params:
        # Strip the 26-char ".alignment_summary_metrics" suffix to get the
        # output prefix picard expects.
        prefix = lambda wildcards, output: output.alignment[:-26]
    output:
        alignment = "{sample}/bams/{sample}.alignment_summary_metrics",
        insertMetrics = "{sample}/bams/{sample}.insert_size_metrics"
    log: "log/{sample}/multiple_metrics.log"
    container: containers["picard"]
    shell: "picard -Xmx8G -XX:CompressedClassSpaceSize=256m CollectMultipleMetrics "
           "I={input.bam} O={params.prefix} "
           "R={input.ref} "
           "PROGRAM=CollectAlignmentSummaryMetrics "
           "PROGRAM=CollectInsertSizeMetrics 2> {log}"
rule bed_to_interval:
    """Run picard BedToIntervalList

    This is needed to convert the bed files for the capture kit to a format
    picard can read
    """
    input:
        targets = config.get("targetsfile", ""),
        baits = config.get("baitsfile", ""),
        ref = config["reference"]
    output:
        target_interval = "target.interval",
        baits_interval = "baits.interval"
    log:
        target = "log/bed_to_interval_target.log",
        # FIX: previously pointed at the same file as log.target, so the
        # baits log clobbered the target log.
        baits = "log/bed_to_interval_baits.log"
    container: containers["picard"]
    shell: "picard -Xmx4G BedToIntervalList "
            "I={input.targets} O={output.target_interval} "
            "SD={input.ref} 2> {log.target} && "
            "picard BedToIntervalList "
            "I={input.baits} O={output.baits_interval} "
            "SD={input.ref} 2> {log.baits}"
rule hs_metrics:
    """Run picard CollectHsMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
        targets = rules.bed_to_interval.output.target_interval,
        baits = rules.bed_to_interval.output.baits_interval,
    output: "{sample}/bams/{sample}.hs_metrics.txt"
    log: "log/{sample}/hs_metrics.log"
    container: containers["picard"]
    shell: "picard -Xmx8G -XX:CompressedClassSpaceSize=256m CollectHsMetrics "
           "I={input.bam} O={output} "
           "R={input.ref} "
           "BAIT_INTERVALS={input.baits} "
           # FIX: the declared log file was never written; redirect stderr
           # like every other rule in this workflow.
           "TARGET_INTERVALS={input.targets} 2> {log}"
rule multiqc:
    """
    Create multiQC report

    Depends on stats.tsv to forcefully run at end of pipeline
    """
    input:
        bam = expand("{sample}/bams/{sample}.bam", sample=config["samples"]),
        metric = expand("{sample}/bams/{sample}.metrics",
                        sample=config["samples"]),
        alignment_metrics = expand(
                "{sample}/bams/{sample}.alignment_summary_metrics",
                sample=config["samples"]
        ),
        insert_metrics = expand(
                "{sample}/bams/{sample}.insert_size_metrics",
                sample=config["samples"]
        ),
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/.done"
                      for read_group, sample in get_readgroup_per_sample()),
        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/.done"
                       for read_group, sample in get_readgroup_per_sample()),
        # HsMetrics only exist when a baits file was configured.
        hs_metric = expand("{sample}/bams/{sample}.hs_metrics.txt",
                           sample=config["samples"]) if "baitsfile" in config else []
    output:
        html = "multiqc_report/multiqc_report.html",
        insertSize = "multiqc_report/multiqc_data/multiqc_picard_insertSize.json",
        AlignmentMetrics = "multiqc_report/multiqc_data/multiqc_picard_AlignmentSummaryMetrics.json",
        DuplicationMetrics = "multiqc_report/multiqc_data/multiqc_picard_dups.json",
        HsMetrics = "multiqc_report/multiqc_data/multiqc_picard_HsMetrics.json" if "baitsfile" in config else []
    log: "log/multiqc.log"
    container: containers["multiqc"]
    # "|| touch {output}" keeps the pipeline going when multiqc itself fails.
    shell: "multiqc --data-format json --force "
           "--outdir multiqc_report . 2> {log} "
           "|| touch {output}"
rule merge_stats:
    """Merge all stats of all samples"""
    input:
        cols = expand("{sample}/{sample}.stats.json",
                      sample=config['samples']),
        mpy = config["merge_stats"],
        insertSize = rules.multiqc.output.insertSize,
        AlignmentMetrics = rules.multiqc.output.AlignmentMetrics,
        DuplicationMetrics = rules.multiqc.output.DuplicationMetrics,
        HsMetrics = rules.multiqc.output.HsMetrics
    output: "stats.json"
    # FIX: previously "log/stats.tsv.log", which collided with rule
    # stats_tsv's log file.
    log: "log/merge_stats.log"
    container: containers["vtools"]
    shell: "python {input.mpy} --collectstats {input.cols} "
           "--picard-insertSize {input.insertSize} "
           "--picard-AlignmentMetrics {input.AlignmentMetrics} "
           "--picard-DuplicationMetrics {input.DuplicationMetrics} "
           "--picard-HsMetrics {input.HsMetrics} > {output} 2> {log}"
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
        stats = rules.merge_stats.output,
        sc = config["stats_to_tsv"]
    output: "stats.tsv"
    log: "log/stats.tsv.log"
    container: containers["python3"]
    shell: "python {input.sc} -i {input.stats} > {output} 2> {log}"
rule gvcf2coverage:
    """Determine coverage from gvcf files"""
    input: rules.gvcf_gather.output.gvcf
    output: "{sample}/vcf/{sample}_{threshold}.bed"
    log: "log/{sample}/gvcf2coverage.{threshold}.log"
    container: containers["gvcf2coverage"]
    shell: "gvcf2coverage -t {wildcards.threshold} < {input} 2> {log} | cut -f 1,2,3 > {output}"