#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
Main Snakefile for the pipeline.

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

import itertools
import json
import os
from functools import partial
from os.path import join, basename
from pathlib import Path

import jsonschema

van den Berg's avatar
van den Berg committed
31
# Read the json schema
with open(srcdir('config/schema.json'), 'rt') as fin:
    schema = json.load(fin)

# Validate the config against the schema
try:
    jsonschema.validate(config, schema)
except jsonschema.ValidationError as e:
    raise jsonschema.ValidationError(f'Invalid --configfile: {e.message}')

# If you specify a baitsfile, you also have to specify a targets file for
# picard
if "baitsfile" in config and "targetsfile" not in config:
    msg = 'Invalid --configfile: "baitsfile" specified without "targetsfile"'
    raise jsonschema.ValidationError(msg)

# A sample name cannot be a substring of another sample, since that breaks
# picard metrics parsing by multiqc
msg = 'Invalid --configfile: sample names should not overlap ("{s1}" is contained in "{s2}")'
for s1, s2 in itertools.permutations(config['samples'], 2):
    if s1 in s2:
        raise jsonschema.ValidationError(msg.format(s1=s1, s2=s2))

54
55
# Set default values
def set_default(key, value):
    """Insert *value* into the global config unless *key* is already set."""
    config.setdefault(key, value)
van den Berg's avatar
van den Berg committed
59

van den Berg's avatar
van den Berg committed
60
# Set the default config
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Hide the absolute path so the snakemake linter doesn't cry about it
set_default('gatk_jar', os.path.join(os.path.sep,'usr','GenomeAnalysisTK.jar'))

# Set the script paths (helper scripts shipped with the pipeline; overridable
# via the config)
set_default("covstats", srcdir("src/covstats.py"))
set_default("collect_stats", srcdir("src/collect_stats.py"))
set_default("merge_stats", srcdir("src/merge_stats.py"))
set_default("stats_to_tsv", srcdir("src/stats_to_tsv.py"))
set_default("py_wordcount", srcdir("src/pywc.py"))
set_default("cutadapt_summary", srcdir("src/cutadapt_summary.py"))
Sander Bollen's avatar
Sander Bollen committed
74

75
76
77
# Pinned container images used by the rules below (`container:` directives).
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
    "bwa-0.7.17-picard-2.22.8": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:76d16eabff506ac13338d7f14644a0ad301b9d7e-0",
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
    "debian": "docker://debian:buster-slim",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "gvcf2coverage": "docker://lumc/gvcf2coverage:0.1-dirty-2",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.8--py_2",
    "picard": "docker://quay.io/biocontainers/picard:2.22.8--0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
91

92
def get_readgroup(wildcards):
    """Return the read-group mapping configured for the current sample."""
    sample_config = config["samples"][wildcards.sample]
    return sample_config["read_groups"]
Sander Bollen's avatar
Sander Bollen committed
94

van den Berg's avatar
van den Berg committed
95
def get_readgroup_per_sample():
    """Yield (read_group, sample) pairs for every read group of every sample."""
    for sample, properties in config["samples"].items():
        yield from ((rg, sample) for rg in properties["read_groups"])

100
def coverage_stats(wildcards):
    """Return per-sample refFlat coverage targets, or [] when no refflat
    file is configured."""
    if "refflat" not in config:
        return []
    return expand("{sample}/coverage/refFlat_coverage.tsv",
                  sample=config["samples"])
Sander Bollen's avatar
Sander Bollen committed
104

Sander Bollen's avatar
Sander Bollen committed
105
106
rule all:
    """Default target: every final output of the pipeline"""
    input:
        multiqc = "multiqc_report/multiqc_report.html",
        stats = "stats.json",
        stats_tsv = "stats.tsv",

        bam = expand("{sample}/bams/{sample}.bam",
                     sample=config["samples"]),

        vcfs = expand("{sample}/vcf/{sample}.vcf.gz",
                      sample=config["samples"]),

        vcf_tbi = expand("{sample}/vcf/{sample}.vcf.gz.tbi",
                         sample=config["samples"]),

        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz",
                       sample=config["samples"]),

        gvcf_tbi = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi",
                          sample=config["samples"]),

        # fastqc writes its output next to a per-readgroup ".done" flag file
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/.done"
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/.done"
                      for read_group, sample in get_readgroup_per_sample()),

        cutadapt = (f"{sample}/pre_process/{sample}-{read_group}.txt"
                    for read_group, sample in get_readgroup_per_sample()),

        # Empty list when no refflat file is configured
        coverage_stats = coverage_stats,

        # Per-sample coverage BED files, only when coverage_threshold is set
        coverage_files = (f"{sample}/vcf/{sample}_{threshold}.bed"
                          for sample, threshold in itertools.product(
                              config['samples'], config['coverage_threshold'])
                          ) if 'coverage_threshold' in config else []
Sander Bollen's avatar
Sander Bollen committed
139

140
141
142
rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
    output: directory("tmp")
    log: "log/create_markdup_tmp.log"
    container: containers["debian"]
    shell: "mkdir -p {output} 2> {log}"
146

Sander Bollen's avatar
Sander Bollen committed
147
rule genome:
    """Create genome file as used by bedtools"""
    input: config["reference"]
    output: "current.genome"
    log: "log/genome.log"
    container: containers["debian"]
    # Reads the .fai index next to the reference: column 1 (contig name) and
    # column 2 (contig length) make up the bedtools "genome" file.
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output} 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
154
155

rule cutadapt:
    """Clip fastq files"""
    input:
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    output:
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
    log:
        # The "log" is cutadapt's minimal report, consumed downstream by
        # collect_cutadapt_summary
        "{sample}/pre_process/{sample}-{read_group}.txt"
    container: containers["cutadapt"]
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--compression-level=1 -j 8 "
           "--output {output.r1} --paired-output {output.r2} -Z "
           "{input.r1} {input.r2} "
           "--report=minimal > {log}"
Sander Bollen's avatar
Sander Bollen committed
172
173

rule align:
    """Align fastq files"""
    input:
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2,
        ref = config["reference"],
        # ancient(): do not re-align when only the tmp dir's mtime changed
        tmp = ancient("tmp")
    params:
        rg = "@RG\\tID:{sample}-library-{read_group}\\tSM:{sample}\\tLB:library\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
    log: "log/{sample}/align.{read_group}.log"
    container: containers["bwa-0.7.17-picard-2.22.8"]
    # bwa output is piped straight into picard SortSam to produce a
    # coordinate-sorted, indexed BAM per read group
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
189

190
def markdup_bam_input(wildcards):
    """Build the list of INPUT= arguments for picard MarkDuplicates,
    one per read group of the sample."""
    template = "INPUT={sample}/bams/{sample}-{read_group}.sorted.bam"
    arguments = []
    for rg in get_readgroup(wildcards):
        arguments.append(template.format(sample=wildcards.sample,
                                         read_group=rg))
    return arguments
195

Sander Bollen's avatar
Sander Bollen committed
196
rule markdup:
    """Mark duplicates in BAM file"""
    input:
        # One sorted BAM per read group of the sample
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
        tmp = ancient("tmp")
    output:
        bam = "{sample}/bams/{sample}.bam",
        bai = "{sample}/bams/{sample}.bai",
        metrics = "{sample}/bams/{sample}.metrics"
    log: "log/{sample}/markdup.log"
    params:
        # Expands to "INPUT=... INPUT=..." for the shell command
        bams=markdup_bam_input
    container: containers["picard"]
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "{params.bams} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
217

218
def bqsr_bam_input(wildcards):
    """Build the space-separated -I arguments for BaseRecalibrator,
    one per read group of the sample."""
    per_readgroup = ("-I {sample}/bams/{sample}-{read_group}.sorted.bam"
                     .format(sample=wildcards.sample, read_group=rg)
                     for rg in get_readgroup(wildcards))
    return " ".join(per_readgroup)

Sander Bollen's avatar
Sander Bollen committed
224
rule baserecal:
    """Base recalibrated BAM files"""
    input:
        # One sorted BAM per read group of the sample
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
        ref = config["reference"],
        vcfs = config["known_sites"]
    output: "{sample}/bams/{sample}.baserecal.grp"
    log: "log/{sample}/baserecal.log"
    params:
        # "-knownSites a.vcf -knownSites b.vcf ..." for GATK3
        known_sites = " ".join(
                expand("-knownSites {vcf}", vcf=config["known_sites"])
        ),
        # Optionally restrict BQSR to a region to speed it up
        region = f"-L {config['restrict_BQSR']}" if "restrict_BQSR" in config else "",
        gatk_jar = config["gatk_jar"],
        bams = bqsr_bam_input
    container: containers["gatk"]
    shell: "java -XX:ParallelGCThreads=1 -jar {params.gatk_jar} -T "
           "BaseRecalibrator {params.bams} -o {output} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites} "
           "{params.region} 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
248

249
# A checkpoint (not a plain rule): the number of scatter-*.bed chunks is only
# known after it runs; the aggregate_* input functions re-evaluate the DAG.
checkpoint scatterregions:
    """Scatter the reference genome"""
    input:
        ref = config["reference"],
    params:
        size = config['scatter_size']
    output:
        directory("scatter")
    log: "log/scatterregions.log"
    container: containers["biopet-scatterregions"]
    shell: "mkdir -p {output} && "
           "biopet-scatterregions -Xmx24G "
           "--referenceFasta {input.ref} --scatterSize {params.size} "
           "--outputDir scatter 2> {log}"
van den Berg's avatar
van den Berg committed
263

Sander Bollen's avatar
Sander Bollen committed
264
rule gvcf_scatter:
    """Run HaplotypeCaller in GVCF mode by chunk"""
    input:
        bam = rules.markdup.output.bam,
        bqsr = rules.baserecal.output,
        dbsnp = config["dbsnp"],
        ref = config["reference"],
        region = "scatter/scatter-{chunk}.bed"
    output:
        # Per-chunk gvcfs are intermediate; removed after gvcf_gather
        gvcf = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
    params:
        gatk_jar = config["gatk_jar"],
    log: "log/{sample}/gvcf_scatter.{chunk}.log"
    wildcard_constraints:
        # Keep {chunk} numeric so it cannot swallow parts of the sample name
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 {params.gatk_jar} -T "
           "HaplotypeCaller -ERC GVCF -I "
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
           "-L '{input.region}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
289

290
291
292
def aggregate_gvcf(wildcards):
    """List the per-chunk gvcf files once the scatterregions checkpoint ran."""
    scatter_dir = checkpoints.scatterregions.get(**wildcards).output[0]
    chunks = glob_wildcards(os.path.join(scatter_dir, 'scatter-{i}.bed')).i
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz", i=chunks)
294
295
296
297

def aggregate_gvcf_tbi(wildcards):
    """List the per-chunk gvcf index files once scatterregions ran."""
    scatter_dir = checkpoints.scatterregions.get(**wildcards).output[0]
    chunks = glob_wildcards(os.path.join(scatter_dir, 'scatter-{i}.bed')).i
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi", i=chunks)
299
300

rule gvcf_gather:
    """Join the gvcf files together"""
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
    log:
        concat = "log/{sample}/gvcf_gather_concat.log",
        index = "log/{sample}/gvcf_gather_index.log"
    container: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps "
           "--output {output.gvcf} --output-type z 2> {log.concat} && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} "
           "{output.gvcf} 2> {log.index}"
Sander Bollen's avatar
Sander Bollen committed
316
317

rule genotype_scatter:
    """Run GATK's GenotypeGVCFs by chunk"""
    input:
        gvcf = rules.gvcf_scatter.output.gvcf,
        tbi = rules.gvcf_scatter.output.gvcf_tbi,
        ref= config["reference"]
    output:
        # Per-chunk vcfs are intermediate; removed after genotype_gather
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
    params:
        gatk_jar = config["gatk_jar"]
    log: "log/{sample}/genotype_scatter.{chunk}.log"
    wildcard_constraints:
        # Keep {chunk} numeric so it cannot swallow parts of the sample name
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 {params.gatk_jar} -T "
           "GenotypeGVCFs -R {input.ref} "
           "-V {input.gvcf} -o '{output.vcf}' 2> {log}"
335

336
def aggregate_vcf(wildcards):
    """List the per-chunk vcf files once the scatterregions checkpoint ran."""
    scatter_dir = checkpoints.scatterregions.get(**wildcards).output[0]
    chunks = glob_wildcards(os.path.join(scatter_dir, 'scatter-{i}.bed')).i
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz", i=chunks)
340

341
342
343
def aggregate_vcf_tbi(wildcards):
    """List the per-chunk vcf index files once scatterregions ran."""
    scatter_dir = checkpoints.scatterregions.get(**wildcards).output[0]
    chunks = glob_wildcards(os.path.join(scatter_dir, 'scatter-{i}.bed')).i
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi", i=chunks)
345

Sander Bollen's avatar
Sander Bollen committed
346
rule genotype_gather:
    """Gather all genotyping VCFs"""
    input:
        vcfs = aggregate_vcf,
        vcfs_tbi = aggregate_vcf_tbi
    output:
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
    log: "log/{sample}/genotype_gather.log"
    container: containers["bcftools"]
    # NOTE(review): only the concat step writes to the log; the index step's
    # stderr is not captured
    shell: "bcftools concat {input.vcfs} --allow-overlaps "
           "--output {output.vcf} --output-type z 2> {log} && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
359

Sander Bollen's avatar
Sander Bollen committed
360
## bam metrics
rule mapped_reads_bases:
    """Calculate number of mapped reads"""
    input:
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.mapped.num",
        bases = "{sample}/bams/{sample}.mapped.basenum"
    log: "log/{sample}/mapped_reads_bases.log"
    container: containers["samtools-1.7-python-3.6"]
    # -F 4 excludes unmapped reads; column 10 is the read sequence
    shell: "samtools view -F 4 {input.bam} 2> {log} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
373

374
rule unique_reads_bases:
    """Calculate number of unique reads"""
    input:
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.unique.num",
        bases = "{sample}/bams/{sample}.usable.basenum"
    log: "log/{sample}/unique_reads_bases.log"
    container: containers["samtools-1.7-python-3.6"]
    # -F 4 excludes unmapped reads, -F 1024 excludes PCR/optical duplicates
    shell: "samtools view -F 4 -F 1024 {input.bam} 2> {log} | cut -f 10 | "
           "python {input.pywc} --reads {output.reads} "
           "--bases {output.bases} 2>> {log}"
Sander Bollen's avatar
Sander Bollen committed
387
388

## fastqc
rule fastqc_raw:
    """Run fastqc on raw fastq files"""
    input:
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    output:
        # fastqc names its own output files; the ".done" flag marks success
        done = "{sample}/pre_process/raw-{sample}-{read_group}/.done"
    log: "log/{sample}/fastqc_raw.{read_group}.log"
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o $(dirname {output.done}) "
           "{input.r1} {input.r2} 2> {log} && "
           "touch {output.done}"
Sander Bollen's avatar
Sander Bollen committed
401

Sander Bollen's avatar
Sander Bollen committed
402
rule fastqc_postqc:
    """Run fastqc on fastq files post pre-processing"""
    input:
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2
    output:
        # fastqc names its own output files; the ".done" flag marks success
        done = "{sample}/pre_process/trimmed-{sample}-{read_group}/.done"
    log: "log/{sample}/fastqc_postqc.{read_group}.log"
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o $(dirname {output.done}) "
           "{input.r1} {input.r2} 2> {log} && "
           "touch {output.done}"
Sander Bollen's avatar
Sander Bollen committed
414

415
## coverage
rule covstats:
    """Calculate coverage statistics on bam file"""
    input:
        bam = rules.markdup.output.bam,
        genome = "current.genome",
        covpy = config["covstats"],
        # Empty string when no targetsfile is configured; this rule is only
        # requested via collectstats when "targetsfile" is present
        bed = config.get("targetsfile","")
    params:
        subt = "Sample {sample}"
    output:
        covj = "{sample}/coverage/covstats.json",
        covp = "{sample}/coverage/covstats.png"
    log:
        bedtools = "log/{sample}/covstats_bedtools.log",
        covpy = "log/{sample}/covstats_covpy.log"
    container: containers["bedtools-2.26-python-2.7"]
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  2> {log.bedtools} | python {input.covpy} - "
           "--plot {output.covp} --title 'Targets coverage' "
           "--subtitle '{params.subt}' > {output.covj} 2> {log.covpy}"
Sander Bollen's avatar
Sander Bollen committed
436

Sander Bollen's avatar
Sander Bollen committed
437
438
439
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
        gvcf = rules.gvcf_gather.output.gvcf,
        tbi = rules.gvcf_gather.output.gvcf_tbi,
        # Empty string when no refflat is configured; this rule is only
        # requested via coverage_stats when "refflat" is present
        ref = config.get('refflat', "")
    output: "{sample}/coverage/refFlat_coverage.tsv"
    log: "log/{sample}/vtools_coverage.log"
    container: containers["vtools"]
    shell: "vtools-gcoverage -I {input.gvcf} "
           "-R {input.ref} > {output} 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
448

449
rule collect_cutadapt_summary:
van den Berg's avatar
van den Berg committed
450
    """Colect cutadapt summary from each readgroup per sample """
451
452
    input:
        cutadapt = lambda wildcards:
van den Berg's avatar
van den Berg committed
453
454
455
        ("{sample}/pre_process/{sample}-{read_group}.txt".format(
            sample=wildcards.sample, read_group=read_group)
            for read_group in get_readgroup(wildcards)),
456
457
        cutadapt_summary= config["cutadapt_summary"]
    output: "{sample}/cutadapt.json"
458
    log: "log/{sample}/collect_cutadapt_summary.log"
van den Berg's avatar
van den Berg committed
459
    container: containers["python3"]
460
461
462
    shell: "python {input.cutadapt_summary} --sample {wildcards.sample} "
           "--cutadapt-summary {input.cutadapt} > {output}"

463
464
465
466
467
468
469
rule collectstats:
    """Collect all stats for a particular sample"""
    input:
        mnum = rules.mapped_reads_bases.output.reads,
        mbnum = rules.mapped_reads_bases.output.bases,
        unum = rules.unique_reads_bases.output.reads,
        ubnum = rules.unique_reads_bases.output.bases,
        # Coverage stats only exist when a targetsfile is configured
        cov = rules.covstats.output.covj if "targetsfile" in config else [],
        cutadapt = rules.collect_cutadapt_summary.output,
        colpy = config["collect_stats"]
    params:
        fthresh = config["female_threshold"]
    output: "{sample}/{sample}.stats.json"
    log: "log/{sample}/collectstats.log"
    container: containers["python3"]
    shell: "python {input.colpy} --sample-name {wildcards.sample} "
           "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
           "--unique-num {input.unum} --usable-basenum {input.ubnum} "
           "--female-threshold {params.fthresh} "
           "--cutadapt {input.cutadapt} "
           "--covstats {input.cov} > {output} 2> {log}"
484

van den Berg's avatar
van den Berg committed
485
486
487
rule multiple_metrics:
    """Run picard CollectMultipleMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
    params:
        # Strip the ".alignment_summary_metrics" suffix (26 characters) to
        # recover the output prefix picard expects
        prefix = lambda wildcards, output: output.alignment[:-26]
    output:
        alignment = "{sample}/bams/{sample}.alignment_summary_metrics",
        insertMetrics = "{sample}/bams/{sample}.insert_size_metrics"
    log: "log/{sample}/multiple_metrics.log"
    container: containers["picard"]
    shell: "picard -Xmx8G -XX:CompressedClassSpaceSize=256m CollectMultipleMetrics "
           "I={input.bam} O={params.prefix} "
           "R={input.ref} "
           "PROGRAM=CollectAlignmentSummaryMetrics "
           "PROGRAM=CollectInsertSizeMetrics 2> {log}"
van den Berg's avatar
van den Berg committed
502

503
504
505
506
507
508
509
rule bed_to_interval:
    """Run picard BedToIntervalList

    This is needed to convert the bed files for the capture kit to a format
    picard can read
    """
    input:
van den Berg's avatar
van den Berg committed
510
        targets = config.get("targetsfile",""),
511
512
513
514
515
        baits = config.get("baitsfile",""),
        ref = config["reference"]
    output:
        target_interval = "target.interval",
        baits_interval = "baits.interval"
516
517
518
    log:
        target = "log/bed_to_interval_target.log",
        baits = "log/bed_to_interval_target.log"
519
    container: containers["picard"]
520
    shell: "picard -Xmx4G BedToIntervalList "
521
            "I={input.targets} O={output.target_interval} "
522
            "SD={input.ref} 2> {log.target} && "
523
524
            "picard BedToIntervalList "
            "I={input.baits} O={output.baits_interval} "
525
            "SD={input.ref} 2> {log.baits}"
526
527
528
529
530
531
532
533
534

rule hs_metrics:
    """Run picard CollectHsMetrics

    Computes hybrid-selection (capture) metrics for the markdup BAM, using
    the interval lists produced by bed_to_interval.
    """
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
        targets = rules.bed_to_interval.output.target_interval,
        baits = rules.bed_to_interval.output.baits_interval,
    output: "{sample}/bams/{sample}.hs_metrics.txt"
    log: "log/{sample}/hs_metrics.log"
    container: containers["picard"]
    shell: "picard -Xmx8G -XX:CompressedClassSpaceSize=256m CollectHsMetrics "
           "I={input.bam} O={output} "
           "R={input.ref} "
           "BAIT_INTERVALS={input.baits} "
           # Redirect stderr to the declared log file; it was previously
           # declared but never written.
           "TARGET_INTERVALS={input.targets} 2> {log}"

rule multiqc:
    """
    Create multiQC report

    Aggregates the per-sample BAMs, picard metrics and FastQC results into a
    single report; the JSON data files it writes are consumed again by the
    merge_stats rule.
    """
    input:
        # Depend on every per-sample artefact so multiqc only runs once all
        # samples are completely processed.
        bam = expand("{sample}/bams/{sample}.bam", sample=config["samples"]),
        metric = expand("{sample}/bams/{sample}.metrics",
                        sample=config["samples"]),
        alignment_metrics = expand(
                "{sample}/bams/{sample}.alignment_summary_metrics",
                sample=config["samples"]
        ),
        insert_metrics = expand(
                "{sample}/bams/{sample}.insert_size_metrics",
                sample=config["samples"]
        ),
        # The FastQC rules are tracked via ".done" flag files, one per
        # read group, for both the raw and the trimmed FastQC passes.
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/.done"
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/.done"
                      for read_group, sample in get_readgroup_per_sample()),

        # HsMetrics only exist when a capture kit (baitsfile) is configured,
        # so the dependency is dropped otherwise.
        hs_metric = expand("{sample}/bams/{sample}.hs_metrics.txt",
                           sample=config["samples"]) if "baitsfile" in config else []

    output:
        html = "multiqc_report/multiqc_report.html",
        # JSON data tables read back by the merge_stats rule
        insertSize = "multiqc_report/multiqc_data/multiqc_picard_insertSize.json",
        AlignmentMetrics = "multiqc_report/multiqc_data/multiqc_picard_AlignmentSummaryMetrics.json",
        DuplicationMetrics = "multiqc_report/multiqc_data/multiqc_picard_dups.json",
        HsMetrics = "multiqc_report/multiqc_data/multiqc_picard_HsMetrics.json" if "baitsfile" in config else []
    log: "log/multiqc.log"
    container: containers["multiqc"]
    # On multiqc failure the outputs are created empty via touch so the
    # pipeline can still finish. NOTE(review): merge_stats consumes these
    # files — confirm it tolerates empty (zero-byte) inputs.
    shell: "multiqc --data-format json --force "
           "--outdir multiqc_report . 2> {log} "
           "|| touch {output}"
rule merge_stats:
    """Merge all stats of all samples

    Combines the per-sample stats.json files with the picard tables from the
    multiqc data directory into a single stats.json.
    """
    input:
        cols = expand("{sample}/{sample}.stats.json",
                      sample=config['samples']),
        # Merge script supplied through the configuration
        mpy = config["merge_stats"],
        insertSize = rules.multiqc.output.insertSize,
        AlignmentMetrics = rules.multiqc.output.AlignmentMetrics,
        DuplicationMetrics = rules.multiqc.output.DuplicationMetrics,
        # NOTE(review): this is [] when no baitsfile is configured, which
        # leaves --picard-HsMetrics without a value — confirm the merge
        # script accepts that.
        HsMetrics = rules.multiqc.output.HsMetrics
    output: "stats.json"
    # Own log file; previously "log/stats.tsv.log", which collided with the
    # stats_tsv rule's log and let the two rules overwrite each other.
    log: "log/merge_stats.log"
    container: containers["vtools"]
    shell: "python {input.mpy} --collectstats {input.cols} "
           "--picard-insertSize {input.insertSize} "
           "--picard-AlignmentMetrics {input.AlignmentMetrics} "
           "--picard-DuplicationMetrics {input.DuplicationMetrics} "
           "--picard-HsMetrics {input.HsMetrics} > {output} 2> {log}"
rule stats_tsv:
    """Flatten the merged stats.json into a tab-separated stats.tsv table"""
    input:
        # Conversion script supplied through the configuration
        sc = config["stats_to_tsv"],
        stats = rules.merge_stats.output
    output: "stats.tsv"
    log: "log/stats.tsv.log"
    container: containers["python3"]
    shell: "python {input.sc} -i {input.stats} "
           "> {output} 2> {log}"
rule gvcf2coverage:
    """Determine the regions covered at a given threshold from a gVCF.

    The threshold comes from the {threshold} wildcard; only the first three
    columns (chrom, start, end) are kept for the output BED file.
    """
    input: rules.gvcf_gather.output.gvcf
    output: "{sample}/vcf/{sample}_{threshold}.bed"
    log: "log/{sample}/gvcf2coverage.{threshold}.log"
    container: containers["gvcf2coverage"]
    shell: "gvcf2coverage -t {wildcards.threshold} < {input} 2> {log} "
           "| cut -f 1,2,3 > {output}"