Snakefile 24.7 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
#   hutspot - a DNAseq variant calling pipeline
van den Berg's avatar
van den Berg committed
2
#   Copyright (C) 2017-2019, Leiden University Medical Center
Sander Bollen's avatar
Sander Bollen committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

van den Berg's avatar
van den Berg committed
31
# Read the json schema that describes a valid --configfile
with open(srcdir('config/schema.json'), 'rt') as fin:
    schema = json.load(fin)

# Validate the config against the schema
try:
    jsonschema.validate(config, schema)
except jsonschema.ValidationError as e:
    # Re-raise with a message that points the user at their --configfile
    raise jsonschema.ValidationError(f'Invalid --configfile: {e.message}')
van den Berg's avatar
van den Berg committed
40

van den Berg's avatar
van den Berg committed
41
42
43
44
45
# If you specify a baitsfile, you also have to specify a targets file for
# picard
if "baitsfile" in config and "targetsfile" not in config:
    msg = 'Invalid --configfile: "baitsfile" specified without "targetsfile"'
    raise jsonschema.ValidationError(msg)
46

47
48
49
50
51
52
53
# A sample name cannot be a substring of another sample, since that breaks picard
# metrics parsing by multiqc
msg = 'Invalid --configfile: sample names should not overlap ("{s1}" is contained in "{s2}")'
for s1, s2 in itertools.permutations(config['samples'], 2):
    if s1 in s2:
        raise jsonschema.ValidationError(msg.format(s1=s1, s2=s2))

54
55
# Set default values
def set_default(key, value):
    """Store *value* under *key* in the global config, unless already set."""
    config.setdefault(key, value)
van den Berg's avatar
van den Berg committed
59

van den Berg's avatar
van den Berg committed
60
# Set the default config values (only applied when absent from --configfile)
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Set the script paths, resolved relative to this Snakefile via srcdir
set_default("covstats", srcdir("src/covstats.py"))
set_default("collect_stats", srcdir("src/collect_stats.py"))
set_default("merge_stats", srcdir("src/merge_stats.py"))
set_default("stats_to_tsv", srcdir("src/stats_to_tsv.py"))
set_default("py_wordcount", srcdir("src/pywc.py"))
set_default("cutadapt_summary", srcdir("src/cutadapt_summary.py"))
Sander Bollen's avatar
Sander Bollen committed
71

72
73
74
# Pinned container images used by the rules below (keyed by tool name/version)
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
    "bwa-0.7.17-picard-2.22.8": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:76d16eabff506ac13338d7f14644a0ad301b9d7e-0",
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
    "debian": "docker://debian:buster-slim",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "gvcf2coverage": "docker://lumc/gvcf2coverage:0.1-dirty-2",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.8--py_2",
    "picard": "docker://quay.io/biocontainers/picard:2.22.8--0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
88

89
def get_readgroup(wildcards):
    """Look up the read groups configured for the current sample."""
    sample_config = config["samples"][wildcards.sample]
    return sample_config["read_groups"]
Sander Bollen's avatar
Sander Bollen committed
91

van den Berg's avatar
van den Berg committed
92
def get_readgroup_per_sample():
    """Yield (read_group, sample) pairs for every read group of every sample."""
    for sample, properties in config["samples"].items():
        for read_group in properties["read_groups"]:
            yield read_group, sample

97
def coverage_stats(wildcards):
    """Return the refFlat coverage targets, or nothing when no refflat is set."""
    if "refflat" not in config:
        return []
    return expand("{sample}/coverage/refFlat_coverage.tsv",
                  sample=config["samples"])
Sander Bollen's avatar
Sander Bollen committed
101

Sander Bollen's avatar
Sander Bollen committed
102
103
rule all:
    """Default target: every final output of the pipeline"""
    input:
        multiqc = "multiqc_report/multiqc_report.html",
        stats = "stats.json",
        stats_tsv = "stats.tsv",

        bam = expand("{sample}/bams/{sample}.bam",
                     sample=config["samples"]),

        vcfs = expand("{sample}/vcf/{sample}.vcf.gz",
                      sample=config["samples"]),

        vcf_tbi = expand("{sample}/vcf/{sample}.vcf.gz.tbi",
                         sample=config["samples"]),

        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz",
                       sample=config["samples"]),

        gvcf_tbi = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi",
                          sample=config["samples"]),

        # fastqc runs per read group, hence the generator over
        # get_readgroup_per_sample() rather than expand over samples
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/.done"
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/.done"
                      for read_group, sample in get_readgroup_per_sample()),

        cutadapt = (f"{sample}/pre_process/{sample}-{read_group}.txt"
                    for read_group, sample in get_readgroup_per_sample()),
        # input function; resolves to [] when "refflat" is not configured
        coverage_stats = coverage_stats,
        # per-sample coverage beds, only when coverage_threshold is configured
        coverage_files = (f"{sample}/vcf/{sample}_{threshold}.bed"
                          for sample, threshold in itertools.product(
                              config['samples'], config['coverage_threshold'])
                          ) if 'coverage_threshold' in config else []
Sander Bollen's avatar
Sander Bollen committed
136

137
138
139
rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
    output: directory("tmp")
    log: "log/create_markdup_tmp.log"
    container: containers["debian"]
    shell: "mkdir -p {output} 2> {log}"
143

Sander Bollen's avatar
Sander Bollen committed
144
rule genome:
    """Create genome file as used by bedtools

    Reads the .fai index next to the reference, keeping only the
    sequence-name and length columns.
    """
    input: config["reference"]
    output: "current.genome"
    log: "log/genome.log"
    container: containers["debian"]
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output} 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
151
152

rule cutadapt:
    """Clip fastq files

    Trims the Illumina adapter from both mates and applies a
    quality cutoff; the minimal report is written to the log file,
    which doubles as input for collect_cutadapt_summary.
    """
    input:
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    output:
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
    log:
        "{sample}/pre_process/{sample}-{read_group}.txt"
    container: containers["cutadapt"]
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--compression-level=1 -j 8 "
           "--output {output.r1} --paired-output {output.r2} -Z "
           "{input.r1} {input.r2} "
           "--report=minimal > {log}"
Sander Bollen's avatar
Sander Bollen committed
169
170

rule align:
    """Align fastq files

    bwa mem output is piped straight into picard SortSam to produce a
    coordinate-sorted, indexed BAM per read group.
    """
    input:
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2,
        ref = config["reference"],
        # ancient(): do not re-align when only the tmp dir's mtime changed
        tmp = ancient("tmp")
    params:
        rg = "@RG\\tID:{sample}-library-{read_group}\\tSM:{sample}\\tLB:library\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
    log: "log/{sample}/align.{read_group}.log"
    container: containers["bwa-0.7.17-picard-2.22.8"]
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
186

187
def markdup_bam_input(wildcards):
    """Build the INPUT= argument for picard MarkDuplicates, one per read group."""
    template = "INPUT={sample}/bams/{sample}-{read_group}.sorted.bam"
    return [template.format(sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)]
192

Sander Bollen's avatar
Sander Bollen committed
193
rule markdup:
    """Mark duplicates in BAM file

    Merges all per-read-group sorted BAMs of the sample into one
    duplicate-marked BAM. The same files appear twice: as `input.bam`
    (dependency tracking) and via params.bams (formatted INPUT= flags).
    """
    input:
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
        tmp = ancient("tmp")
    output:
        bam = "{sample}/bams/{sample}.bam",
        bai = "{sample}/bams/{sample}.bai",
        metrics = "{sample}/bams/{sample}.metrics"
    log: "log/{sample}/markdup.log"
    params:
        bams=markdup_bam_input
    container: containers["picard"]
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "{params.bams} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
214

215
def bqsr_bam_input(wildcards):
    """Build the -I argument string for BaseRecalibrator, one per read group."""
    flags = ("-I {sample}/bams/{sample}-{read_group}.sorted.bam"
             .format(sample=wildcards.sample, read_group=rg)
             for rg in get_readgroup(wildcards))
    return " ".join(flags)

Sander Bollen's avatar
Sander Bollen committed
221
rule baserecal:
    """Base recalibrated BAM files

    Runs GATK3 BaseRecalibrator over all per-read-group BAMs of the
    sample and writes a single recalibration table.
    """
    input:
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
        ref = config["reference"],
        vcfs = config["known_sites"]
    output: "{sample}/bams/{sample}.baserecal.grp"
    log: "log/{sample}/baserecal.log"
    params:
        # one -knownSites flag per configured VCF
        known_sites = " ".join(
                expand("-knownSites {vcf}", vcf=config["known_sites"])
        ),
        # optionally restrict recalibration to a region
        region = "-L "+ config["restrict_BQSR"] if "restrict_BQSR" in config else "",
        bams = bqsr_bam_input
    container: containers["gatk"]
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
           "BaseRecalibrator {params.bams} -o {output} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites} "
           "{params.region} 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
244

245
checkpoint scatterregions:
    """Scatter the reference genome

    A checkpoint (not a rule): the number of scatter-*.bed chunks is
    only known after it runs; the aggregate_* input functions re-read
    this output via checkpoints.scatterregions.get().
    """
    input:
        ref = config["reference"],
    params:
        size = config['scatter_size']
    output:
        directory("scatter")
    log: "log/scatterregions.log"
    container: containers["biopet-scatterregions"]
    shell: "mkdir -p {output} && "
           "biopet-scatterregions -Xmx24G "
           "--referenceFasta {input.ref} --scatterSize {params.size} "
           "--outputDir scatter 2> {log}"
van den Berg's avatar
van den Berg committed
259

Sander Bollen's avatar
Sander Bollen committed
260
rule gvcf_scatter:
    """Run HaplotypeCaller in GVCF mode by chunk

    Produces one temporary gvcf (+ index) per scatter region; these
    are merged again by gvcf_gather.
    """
    input:
        bam = rules.markdup.output.bam,
        bqsr = rules.baserecal.output,
        dbsnp = config["dbsnp"],
        ref = config["reference"],
        region = "scatter/scatter-{chunk}.bed"
    output:
        gvcf = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
    log: "log/{sample}/gvcf_scatter.{chunk}.log"
    wildcard_constraints:
        # keep {chunk} numeric so it cannot swallow parts of the sample name
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
           "-T HaplotypeCaller -ERC GVCF -I "
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
           "-L '{input.region}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
283

284
285
286
def aggregate_gvcf(wildcards):
    """Return the per-chunk gvcf files produced by the scatterregions checkpoint.

    Blocks until the checkpoint has run, then globs the scatter beds to
    learn the chunk numbers.
    """
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    # use the `join` imported from os.path; this file never imports `os` itself
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
                  i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
288
289
290
291

def aggregate_gvcf_tbi(wildcards):
    """Return the per-chunk gvcf index files, based on the scatterregions checkpoint."""
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    # use the `join` imported from os.path; this file never imports `os` itself
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
                  i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
293
294

rule gvcf_gather:
    """Join the gvcf files together

    Concatenates every per-chunk gvcf into one bgzipped gvcf and
    creates its tabix index; each step logs to its own file.
    """
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
    log:
        concat = "log/{sample}/gvcf_gather_concat.log",
        index = "log/{sample}/gvcf_gather_index.log"
    container: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps "
           "--output {output.gvcf} --output-type z 2> {log.concat} && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} "
           "{output.gvcf} 2> {log.index}"
Sander Bollen's avatar
Sander Bollen committed
310
311

rule genotype_scatter:
    """Run GATK's GenotypeGVCFs by chunk"""
    input:
        gvcf = rules.gvcf_scatter.output.gvcf,
        tbi = rules.gvcf_scatter.output.gvcf_tbi,
        ref= config["reference"]
    output:
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
    log: "log/{sample}/genotype_scatter.{chunk}.log"
    wildcard_constraints:
        # keep {chunk} numeric, matching gvcf_scatter
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 "
           "/usr/GenomeAnalysisTK.jar -T "
           "GenotypeGVCFs -R {input.ref} "
           "-V {input.gvcf} -o '{output.vcf}' 2> {log}"
328

329
def aggregate_vcf(wildcards):
    """Return the per-chunk vcf files, based on the scatterregions checkpoint."""
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    # use the `join` imported from os.path; this file never imports `os` itself
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
                  i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
333

334
335
336
def aggregate_vcf_tbi(wildcards):
    """Return the per-chunk vcf index files, based on the scatterregions checkpoint."""
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    # use the `join` imported from os.path; this file never imports `os` itself
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
                  i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
338

Sander Bollen's avatar
Sander Bollen committed
339
rule genotype_gather:
340
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
341
    input:
342
        vcfs = aggregate_vcf,
343
        vcfs_tbi = aggregate_vcf_tbi
Sander Bollen's avatar
Sander Bollen committed
344
    output:
345
346
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
347
    log: "log/{sample}/genotype_gather.log"
van den Berg's avatar
van den Berg committed
348
349
    container: containers["bcftools"]
    shell: "bcftools concat {input.vcfs} --allow-overlaps "
350
           "--output {output.vcf} --output-type z 2> {log} && "
351
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
352

Sander Bollen's avatar
Sander Bollen committed
353
## bam metrics
354
rule mapped_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
355
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
356
    input:
357
358
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
359
    output:
van den Berg's avatar
van den Berg committed
360
361
        reads = "{sample}/bams/{sample}.mapped.num",
        bases = "{sample}/bams/{sample}.mapped.basenum"
362
    log: "log/{sample}/mapped_reads_bases.log"
van den Berg's avatar
van den Berg committed
363
    container: containers["samtools-1.7-python-3.6"]
364
    shell: "samtools view -F 4 {input.bam} 2> {log} | cut -f 10 | python {input.pywc} "
365
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
366

367
rule unique_reads_bases:
    """Calculate number of unique reads

    Like mapped_reads_bases, but -F 1024 additionally drops reads
    flagged as duplicates by markdup.
    """
    input:
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.unique.num",
        bases = "{sample}/bams/{sample}.usable.basenum"
    log: "log/{sample}/unique_reads_bases.log"
    container: containers["samtools-1.7-python-3.6"]
    shell: "samtools view -F 4 -F 1024 {input.bam} 2> {log} | cut -f 10 | "
           "python {input.pywc} --reads {output.reads} "
           "--bases {output.bases} 2>> {log}"
Sander Bollen's avatar
Sander Bollen committed
380
381

## fastqc
Sander Bollen's avatar
Sander Bollen committed
382
## fastqc
rule fastqc_raw:
    """Run fastqc on raw fastq files

    The .done marker is the tracked output; the report files land in
    params.folder (created by snakemake for the marker output).
    """
    input:
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    params:
        folder = "{sample}/pre_process/raw-{sample}-{read_group}"
    output:
        done = "{sample}/pre_process/raw-{sample}-{read_group}/.done"
    log: "log/{sample}/fastqc_raw.{read_group}.log"
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o {params.folder} "
           "{input.r1} {input.r2} 2> {log} && "
           "touch {output.done}"
Sander Bollen's avatar
Sander Bollen committed
396

Sander Bollen's avatar
Sander Bollen committed
397
rule fastqc_postqc:
    """Run fastqc on fastq files post pre-processing

    Same pattern as fastqc_raw, but on the cutadapt-trimmed reads.
    """
    input:
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2
    params:
        folder = "{sample}/pre_process/trimmed-{sample}-{read_group}"
    output:
        done = "{sample}/pre_process/trimmed-{sample}-{read_group}/.done"
    log: "log/{sample}/fastqc_postqc.{read_group}.log"
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o {params.folder} "
           "{input.r1} {input.r2} 2> {log} && "
           "touch {output.done}"
Sander Bollen's avatar
Sander Bollen committed
411

412
## coverage
Sander Bollen's avatar
Sander Bollen committed
413
## coverage
rule covstats:
    """Calculate coverage statistics on bam file

    bedtools coverage over the targets bed is piped into the covstats
    helper, which writes a JSON summary and a coverage plot.
    """
    input:
        bam = rules.markdup.output.bam,
        genome = "current.genome",
        covpy = config["covstats"],
        # "" when no targetsfile is configured; collectstats only
        # requests this rule's output when "targetsfile" is set
        bed = config.get("targetsfile","")
    params:
        subt = "Sample {sample}"
    output:
        covj = "{sample}/coverage/covstats.json",
        covp = "{sample}/coverage/covstats.png"
    log:
        bedtools = "log/{sample}/covstats_bedtools.log",
        covpy = "log/{sample}/covstats_covpy.log"
    container: containers["bedtools-2.26-python-2.7"]
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  2> {log.bedtools} | python {input.covpy} - "
           "--plot {output.covp} --title 'Targets coverage' "
           "--subtitle '{params.subt}' > {output.covj} 2> {log.covpy}"
Sander Bollen's avatar
Sander Bollen committed
433

Sander Bollen's avatar
Sander Bollen committed
434
435
436
rule vtools_coverage:
    """Calculate coverage statistics per transcript

    Only requested (via the coverage_stats input function) when a
    refflat file is configured.
    """
    input:
        gvcf = rules.gvcf_gather.output.gvcf,
        tbi = rules.gvcf_gather.output.gvcf_tbi,
        ref = config.get('refflat', "")
    output: "{sample}/coverage/refFlat_coverage.tsv"
    log: "log/{sample}/vtools_coverage.log"
    container: containers["vtools"]
    shell: "vtools-gcoverage -I {input.gvcf} "
           "-R {input.ref} > {output} 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
445

446
rule collect_cutadapt_summary:
van den Berg's avatar
van den Berg committed
447
    """Colect cutadapt summary from each readgroup per sample """
448
449
    input:
        cutadapt = lambda wildcards:
van den Berg's avatar
van den Berg committed
450
451
452
        ("{sample}/pre_process/{sample}-{read_group}.txt".format(
            sample=wildcards.sample, read_group=read_group)
            for read_group in get_readgroup(wildcards)),
453
454
        cutadapt_summary= config["cutadapt_summary"]
    output: "{sample}/cutadapt.json"
455
    log: "log/{sample}/collect_cutadapt_summary.log"
van den Berg's avatar
van den Berg committed
456
    container: containers["python3"]
457
458
459
    shell: "python {input.cutadapt_summary} --sample {wildcards.sample} "
           "--cutadapt-summary {input.cutadapt} > {output}"

460
461
462
463
464
465
466
rule collectstats:
    """Collect all stats for a particular sample"""
    input:
        mnum = rules.mapped_reads_bases.output.reads,
        mbnum = rules.mapped_reads_bases.output.bases,
        unum = rules.unique_reads_bases.output.reads,
        ubnum = rules.unique_reads_bases.output.bases,
        # coverage stats only exist when a targetsfile was configured
        cov = rules.covstats.output.covj if "targetsfile" in config else [],
        cutadapt = rules.collect_cutadapt_summary.output,
        colpy = config["collect_stats"]
    params:
        fthresh = config["female_threshold"]
    output: "{sample}/{sample}.stats.json"
    log: "log/{sample}/collectstats.log"
    container: containers["python3"]
    shell: "python {input.colpy} --sample-name {wildcards.sample} "
           "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
           "--unique-num {input.unum} --usable-basenum {input.ubnum} "
           "--female-threshold {params.fthresh} "
           "--cutadapt {input.cutadapt} "
           "--covstats {input.cov} > {output} 2> {log}"
481

van den Berg's avatar
van den Berg committed
482
483
484
rule multiple_metrics:
    """Run picard CollectMultipleMetrics

    Writes alignment-summary and insert-size metrics next to the BAM,
    using params.prefix as the picard output basename.
    """
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
    params:
        prefix = "{sample}/bams/{sample}",
    output:
        alignment = "{sample}/bams/{sample}.alignment_summary_metrics",
        insertMetrics = "{sample}/bams/{sample}.insert_size_metrics"
    log: "log/{sample}/multiple_metrics.log"
    container: containers["picard"]
    shell: "picard -Xmx8G -XX:CompressedClassSpaceSize=256m CollectMultipleMetrics "
           "I={input.bam} O={params.prefix} "
           "R={input.ref} "
           "PROGRAM=CollectAlignmentSummaryMetrics "
           "PROGRAM=CollectInsertSizeMetrics 2> {log}"
van den Berg's avatar
van den Berg committed
499

500
501
502
503
504
505
506
rule bed_to_interval:
    """Run picard BedToIntervalList

    This is needed to convert the bed files for the capture kit to a format
    picard can read
    """
    input:
van den Berg's avatar
van den Berg committed
507
        targets = config.get("targetsfile",""),
508
509
510
511
512
        baits = config.get("baitsfile",""),
        ref = config["reference"]
    output:
        target_interval = "target.interval",
        baits_interval = "baits.interval"
513
514
515
    log:
        target = "log/bed_to_interval_target.log",
        baits = "log/bed_to_interval_target.log"
516
    container: containers["picard"]
517
    shell: "picard -Xmx4G BedToIntervalList "
518
            "I={input.targets} O={output.target_interval} "
519
            "SD={input.ref} 2> {log.target} && "
520
521
            "picard BedToIntervalList "
            "I={input.baits} O={output.baits_interval} "
522
            "SD={input.ref} 2> {log.baits}"
523
524
525
526
527
528
529
530
531

rule hs_metrics:
    """Run picard CollectHsMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
        targets = rules.bed_to_interval.output.target_interval,
        baits = rules.bed_to_interval.output.baits_interval,
    output: "{sample}/bams/{sample}.hs_metrics.txt"
532
    log: "log/{sample}/hs_metrics.log"
533
    container: containers["picard"]
534
    shell: "picard -Xmx8G -XX:CompressedClassSpaceSize=256m CollectHsMetrics "
535
536
537
538
539
           "I={input.bam} O={output} "
           "R={input.ref} "
           "BAIT_INTERVALS={input.baits} "
           "TARGET_INTERVALS={input.targets}"

Sander Bollen's avatar
Sander Bollen committed
540
541
542
rule multiqc:
    """
    Create multiQC report

    Depends on stats.tsv to forcefully run at end of pipeline
    """
    input:
        # Per-sample alignment products and picard metrics; listing them
        # here forces multiqc to run only after all samples are done.
        bam = expand("{sample}/bams/{sample}.bam", sample=config["samples"]),
        metric = expand("{sample}/bams/{sample}.metrics",
                        sample=config["samples"]),
        alignment_metrics = expand(
                "{sample}/bams/{sample}.alignment_summary_metrics",
                sample=config["samples"]
        ),
        insert_metrics = expand(
                "{sample}/bams/{sample}.insert_size_metrics",
                sample=config["samples"]
        ),
        # FastQC sentinel files, one per (sample, read group), before and
        # after trimming.
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/.done"
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/.done"
                      for read_group, sample in get_readgroup_per_sample()),

        # HsMetrics only exist when a capture kit (baitsfile) is configured.
        hs_metric = expand("{sample}/bams/{sample}.hs_metrics.txt",
                           sample=config["samples"]) if "baitsfile" in config else []

    output:
        html = "multiqc_report/multiqc_report.html",
        # JSON data files written by multiqc (--data-format json); these are
        # consumed downstream by the merge_stats rule.
        insertSize = "multiqc_report/multiqc_data/multiqc_picard_insertSize.json",
        AlignmentMetrics = "multiqc_report/multiqc_data/multiqc_picard_AlignmentSummaryMetrics.json",
        DuplicationMetrics = "multiqc_report/multiqc_data/multiqc_picard_dups.json",
        HsMetrics = "multiqc_report/multiqc_data/multiqc_picard_HsMetrics.json" if "baitsfile" in config else []
    log: "log/multiqc.log"
    container: containers["multiqc"]
    # Best-effort: if multiqc fails, create empty output files so the
    # pipeline can still finish (downstream consumers tolerate empty JSON).
    shell: "multiqc --data-format json --force "
           "--outdir multiqc_report . 2> {log} "
           "|| touch {output}"
577
578
579
580
581
582
583

rule merge_stats:
    """Merge all stats of all samples"""
    input:
        cols = expand("{sample}/{sample}.stats.json",
                      sample=config['samples']),
        mpy = config["merge_stats"],
584
        insertSize = rules.multiqc.output.insertSize,
585
        AlignmentMetrics = rules.multiqc.output.AlignmentMetrics,
586
        DuplicationMetrics = rules.multiqc.output.DuplicationMetrics,
587
        HsMetrics = rules.multiqc.output.HsMetrics
588
    output: "stats.json"
589
    log: "log/stats.tsv.log"
590
591
    container: containers["vtools"]
    shell: "python {input.mpy} --collectstats {input.cols} "
592
           "--picard-insertSize {input.insertSize} "
593
           "--picard-AlignmentMetrics {input.AlignmentMetrics} "
594
           "--picard-DuplicationMetrics {input.DuplicationMetrics} "
595
           "--picard-HsMetrics {input.HsMetrics} > {output} 2> {log}"
596
597
598
599
600
601
602

rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
        stats = rules.merge_stats.output,
        sc = config["stats_to_tsv"]
    output: "stats.tsv"
603
    log: "log/stats.tsv.log"
604
    container: containers["python3"]
605
    shell: "python {input.sc} -i {input.stats} > {output} 2> {log}"
606

607
608
609
610
611

rule gvcf2coverage:
    """ Determine coverage from gvcf files """
    input: rules.gvcf_gather.output.gvcf
    output: "{sample}/vcf/{sample}_{threshold}.bed"
612
    log: "log/{sample}/gvcf2coverage.{threshold}.log"
613
    container: containers["gvcf2coverage"]
614
    shell: "gvcf2coverage -t {wildcards.threshold} < {input} 2> {log} | cut -f 1,2,3 > {output}"