#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
Main Snakefile for the pipeline.

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

import json
import jsonschema
from functools import partial
from os.path import join, basename
from pathlib import Path
import itertools

# Read the json schema
with open(srcdir('config/schema.json'), 'rt') as fin:
    schema = json.load(fin)

# Validate the config against the schema
try:
    jsonschema.validate(config, schema)
except jsonschema.ValidationError as e:
    raise jsonschema.ValidationError(f'Invalid --configfile: {e.message}')

# If you specify a baitsfile, you also have to specify a targets file for
# picard
if "baitsfile" in config and "targetsfile" not in config:
    msg = 'Invalid --configfile: "baitsfile" specified without "targetsfile"'
    raise jsonschema.ValidationError(msg)

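# Illustrative sketch of a --configfile, inferred from how the config is used
# below. config/schema.json remains the authoritative definition; all paths
# and sample names here are hypothetical.
#
# {
#     "reference": "/path/to/reference.fasta",
#     "dbsnp": "/path/to/dbsnp.vcf.gz",
#     "known_sites": ["/path/to/known_sites.vcf.gz"],
#     "targetsfile": "/path/to/targets.bed",
#     "baitsfile": "/path/to/baits.bed",
#     "refflat": "/path/to/refFlat.txt",
#     "samples": {
#         "sample1": {
#             "read_groups": {
#                 "lib1": {
#                     "R1": "/path/to/sample1-lib1_R1.fastq.gz",
#                     "R2": "/path/to/sample1-lib1_R2.fastq.gz"
#                 }
#             }
#         }
#     }
# }
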
# Set default values
def set_default(key, value):
    """Set default config values"""
    if key not in config:
        config[key] = value

# Set the default config
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Set the script paths
set_default("covstats", srcdir("src/covstats.py"))
set_default("collect_stats", srcdir("src/collect_stats.py"))
set_default("merge_stats", srcdir("src/merge_stats.py"))
set_default("stats_to_tsv", srcdir("src/stats_to_tsv.py"))
set_default("py_wordcount", srcdir("src/pywc.py"))
set_default("cutadapt_summary", srcdir("src/cutadapt_summary.py"))

containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
    "debian": "docker://debian:buster-slim",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.8--py_2",
    "picard": "docker://quay.io/biocontainers/picard:2.22.8--0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
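
# A typical invocation might look something like this (illustrative only;
# adjust cores, paths, and Snakemake options to your environment):
#
#   snakemake --configfile config.json --use-singularity --cores 8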

def get_readgroup(wildcards):
    return config["samples"][wildcards.sample]["read_groups"]

def get_readgroup_per_sample():
    for sample in config["samples"]:
        for rg in config["samples"][sample]["read_groups"]:
            yield rg, sample

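# For the hypothetical config sketched above, get_readgroup_per_sample()
# would yield the pair ("lib1", "sample1").
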
def coverage_stats(wildcards):
    files = expand("{sample}/coverage/refFlat_coverage.tsv",
                   sample=config["samples"])
    return files if "refflat" in config else []

rule all:
    input:
        multiqc = "multiqc_report/multiqc_report.html",
        stats = "stats.json",
        stats_tsv = "stats.tsv",

        bam = expand("{sample}/bams/{sample}.bam",
                     sample=config["samples"]),

        vcfs = expand("{sample}/vcf/{sample}.vcf.gz",
                      sample=config["samples"]),

        vcf_tbi = expand("{sample}/vcf/{sample}.vcf.gz.tbi",
                         sample=config["samples"]),

        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz",
                       sample=config["samples"]),

        gvcf_tbi = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi",
                          sample=config["samples"]),

        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        cutadapt = (f"{sample}/pre_process/{sample}-{read_group}.txt"
                    for read_group, sample in get_readgroup_per_sample()),
        coverage_stats = coverage_stats,

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
    output: directory("tmp")
    container: containers["debian"]
    shell: "mkdir -p {output}"

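# Note: the "tmp" directory from create_markdup_tmp is used as ancient() input
# by the align and markdup rules below, so an already existing tmp directory
# never triggers those rules to re-run on its timestamp alone.
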
rule genome:
    """Create genome file as used by bedtools"""
    input: config["reference"]
    output: "current.genome"
    container: containers["debian"]
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule cutadapt:
    """Clip fastq files"""
    input:
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    output:
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
    log:
        "{sample}/pre_process/{sample}-{read_group}.txt"
    container: containers["cutadapt"]
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--output {output.r1} --paired-output {output.r2} -Z "
           "{input.r1} {input.r2} "
           "--report=minimal > {log}"

rule align:
    """Align fastq files"""
    input:
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2,
        ref = config["reference"],
        tmp = ancient("tmp")
    params:
        rg = "@RG\\tID:{read_group}\\tSM:{sample}\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
    container: containers["bwa-0.7.17-picard-2.18.7"]
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

def markdup_bam_input(wildcards):
    """Generate the INPUT for each bam file"""
    return ["INPUT={sample}/bams/{sample}-{read_group}.sorted.bam".format(
                sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)]

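# For example (hypothetical names): a sample "s1" with read groups "lib1" and
# "lib2" makes markdup_bam_input() return
# ["INPUT=s1/bams/s1-lib1.sorted.bam", "INPUT=s1/bams/s1-lib2.sorted.bam"],
# which picard MarkDuplicates accepts as repeated INPUT= arguments.
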
rule markdup:
    """Mark duplicates in BAM file"""
    input:
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
        tmp = ancient("tmp")
    output:
        bam = "{sample}/bams/{sample}.bam",
        bai = "{sample}/bams/{sample}.bai",
        metrics = "{sample}/bams/{sample}.metrics"
    params:
        bams = markdup_bam_input
    container: containers["picard"]
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "{params.bams} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

rule bai:
    """Copy bai files as some genome browsers can only use .bam.bai files"""
    input:
        bai = rules.markdup.output.bai
    output:
        bai = "{sample}/bams/{sample}.bam.bai"
    container: containers["debian"]
    shell: "cp {input.bai} {output.bai}"

def bqsr_bam_input(wildcards):
    """Generate the bam input string for each read group for BQSR"""
    template = "-I {sample}/bams/{sample}-{read_group}.sorted.bam"
    return " ".join([template.format(sample=wildcards.sample,
            read_group=rg) for rg in get_readgroup(wildcards)])

rule baserecal:
    """Generate a base recalibration table for the BAM files"""
    input:
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
        ref = config["reference"],
        vcfs = config["known_sites"]
    output: "{sample}/bams/{sample}.baserecal.grp"
    params:
        known_sites = " ".join(
                expand("-knownSites {vcf}", vcf=config["known_sites"])
        ),
        bams = bqsr_bam_input
    container: containers["gatk"]
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
           "BaseRecalibrator {params.bams} -o {output} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites}"

checkpoint scatterregions:
    """Scatter the reference genome"""
    input:
        ref = config["reference"],
    params:
        size = config['scatter_size']
    output:
        directory("scatter")
    container: containers["biopet-scatterregions"]
    shell: "mkdir -p {output} && "
           "biopet-scatterregions -Xmx24G "
           "--referenceFasta {input.ref} --scatterSize {params.size} "
           "--outputDir scatter"

rule gvcf_scatter:
    """Run HaplotypeCaller in GVCF mode by chunk"""
    input:
        bam = rules.markdup.output.bam,
        bqsr = rules.baserecal.output,
        dbsnp = config["dbsnp"],
        ref = config["reference"],
        region = "scatter/scatter-{chunk}.bed"
    output:
        gvcf = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
    wildcard_constraints:
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
           "-T HaplotypeCaller -ERC GVCF -I "
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
           "-L '{input.region}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 "

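# The aggregate_* helpers below implement the usual Snakemake checkpoint
# pattern: once the scatterregions checkpoint has produced its
# scatter/scatter-{chunk}.bed files, they glob those files to list every
# per-chunk (g)VCF that must exist before the gather rules concatenate them.
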
def aggregate_gvcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_gvcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule gvcf_gather:
    """Join the gvcf files together"""
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
    container: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps "
           "--output {output.gvcf} --output-type z && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"

rule genotype_scatter:
    """Run GATK's GenotypeGVCFs by chunk"""
    input:
        gvcf = rules.gvcf_scatter.output.gvcf,
        tbi = rules.gvcf_scatter.output.gvcf_tbi,
        ref = config["reference"]
    output:
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
    wildcard_constraints:
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 "
           "/usr/GenomeAnalysisTK.jar -T "
           "GenotypeGVCFs -R {input.ref} "
           "-V {input.gvcf} -o '{output.vcf}'"

def aggregate_vcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_vcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule genotype_gather:
    """Gather all genotyping VCFs"""
    input:
        vcfs = aggregate_vcf,
        vcfs_tbi = aggregate_vcf_tbi
    output:
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
    container: containers["bcftools"]
    shell: "bcftools concat {input.vcfs} --allow-overlaps "
           "--output {output.vcf} --output-type z && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"

## bam metrics
rule mapped_reads_bases:
    """Calculate number of mapped reads"""
    input:
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.mapped.num",
        bases = "{sample}/bams/{sample}.mapped.basenum"
    container: containers["samtools-1.7-python-3.6"]
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"

rule unique_reads_bases:
    """Calculate number of unique reads"""
    input:
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.unique.num",
        bases = "{sample}/bams/{sample}.usable.basenum"
    container: containers["samtools-1.7-python-3.6"]
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | "
           "python {input.pywc} --reads {output.reads} --bases {output.bases}"

## fastqc
rule fastqc_raw:
    """Run fastqc on raw fastq files"""
    input:
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    output:
        directory("{sample}/pre_process/raw-{sample}-{read_group}/")
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "

rule fastqc_postqc:
    """Run fastqc on fastq files post pre-processing"""
    input:
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2
    output:
        directory("{sample}/pre_process/trimmed-{sample}-{read_group}/")
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "

## coverage
rule covstats:
    """Calculate coverage statistics on bam file"""
    input:
        bam = rules.markdup.output.bam,
        genome = "current.genome",
        covpy = config["covstats"],
        bed = config.get("targetsfile","")
    params:
        subt = "Sample {sample}"
    output:
        covj = "{sample}/coverage/covstats.json",
        covp = "{sample}/coverage/covstats.png"
    container: containers["bedtools-2.26-python-2.7"]
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"

rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
        gvcf = rules.gvcf_gather.output.gvcf,
        tbi = rules.gvcf_gather.output.gvcf_tbi,
        ref = config.get('refflat', "")
    output:
        tsv = "{sample}/coverage/refFlat_coverage.tsv"
    container: containers["vtools"]
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"

rule collect_cutadapt_summary:
    """Collect cutadapt summary from each readgroup per sample"""
    input:
        cutadapt = lambda wildcards:
        ("{sample}/pre_process/{sample}-{read_group}.txt".format(
            sample=wildcards.sample, read_group=read_group)
            for read_group in get_readgroup(wildcards)),
        cutadapt_summary = config["cutadapt_summary"]
    output: "{sample}/cutadapt.json"
    container: containers["python3"]
    shell: "python {input.cutadapt_summary} --sample {wildcards.sample} "
           "--cutadapt-summary {input.cutadapt} > {output}"

rule collectstats:
    """Collect all stats for a particular sample"""
    input:
        mnum = rules.mapped_reads_bases.output.reads,
        mbnum = rules.mapped_reads_bases.output.bases,
        unum = rules.unique_reads_bases.output.reads,
        ubnum = rules.unique_reads_bases.output.bases,
        cov = rules.covstats.output.covj if "targetsfile" in config else [],
        cutadapt = rules.collect_cutadapt_summary.output,
        colpy = config["collect_stats"]
    params:
        fthresh = config["female_threshold"]
    output: "{sample}/{sample}.stats.json"
    container: containers["python3"]
    shell: "python {input.colpy} --sample-name {wildcards.sample} "
           "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
           "--unique-num {input.unum} --usable-basenum {input.ubnum} "
           "--female-threshold {params.fthresh} "
           "--cutadapt {input.cutadapt} "
           "--covstats {input.cov} > {output}"

rule multiple_metrics:
    """Run picard CollectMultipleMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
    params:
        prefix = "{sample}/bams/{sample}",
    output:
        alignment = "{sample}/bams/{sample}.alignment_summary_metrics",
        insert = "{sample}/bams/{sample}.insert_size_metrics"
    container: containers["picard"]
    shell: "picard CollectMultipleMetrics "
           "I={input.bam} O={params.prefix} "
           "R={input.ref} "
           "PROGRAM=CollectAlignmentSummaryMetrics "
           "PROGRAM=CollectInsertSizeMetrics "

rule bed_to_interval:
    """Run picard BedToIntervalList

    This is needed to convert the bed files for the capture kit to a format
    picard can read
    """
    input:
        targets = config.get("targetsfile",""),
        baits = config.get("baitsfile",""),
        ref = config["reference"]
    output:
        target_interval = "target.interval",
        baits_interval = "baits.interval"
    container: containers["picard"]
    shell: "picard BedToIntervalList "
            "I={input.targets} O={output.target_interval} "
            "SD={input.ref} && "
            "picard BedToIntervalList "
            "I={input.baits} O={output.baits_interval} "
            "SD={input.ref} "

rule hs_metrics:
    """Run picard CollectHsMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
        targets = rules.bed_to_interval.output.target_interval,
        baits = rules.bed_to_interval.output.baits_interval,
    output: "{sample}/bams/{sample}.hs_metrics.txt"
    container: containers["picard"]
    shell: "picard CollectHsMetrics "
           "I={input.bam} O={output} "
           "R={input.ref} "
           "BAIT_INTERVALS={input.baits} "
           "TARGET_INTERVALS={input.targets}"

rule multiqc:
    """
    Create multiQC report
    Depends on stats.tsv to force it to run at the end of the pipeline
    """
    input:
        bam = expand("{sample}/bams/{sample}.bam", sample=config["samples"]),
        metric = expand("{sample}/bams/{sample}.metrics",
                        sample=config["samples"]),
        alignment_metrics = expand(
                "{sample}/bams/{sample}.alignment_summary_metrics",
                sample=config["samples"]
        ),
        insert_metrics = expand(
                "{sample}/bams/{sample}.insert_size_metrics",
                sample=config["samples"]
        ),
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        hs_metric = expand("{sample}/bams/{sample}.hs_metrics.txt",
                           sample=config["samples"]) if "baitsfile" in config else []

    output:
        html = "multiqc_report/multiqc_report.html",
        insert = "multiqc_report/multiqc_data/multiqc_picard_insertSize.json",
        HsMetrics = "multiqc_report/multiqc_data/multiqc_picard_HsMetrics.json" if "baitsfile" in config else []
    container: containers["multiqc"]
    shell: "multiqc --data-format json --force --outdir multiqc_report . "
           "|| touch {output}"

rule merge_stats:
    """Merge all stats of all samples"""
    input:
        cols = expand("{sample}/{sample}.stats.json",
                      sample=config['samples']),
        mpy = config["merge_stats"],
        insertSize = rules.multiqc.output.insert,
        HsMetrics = rules.multiqc.output.HsMetrics
    output: "stats.json"
    container: containers["vtools"]
    shell: "python {input.mpy} --collectstats {input.cols} "
           "--picard-insertSize {input.insertSize} "
           "--picard-HsMetrics {input.HsMetrics} > {output}"

rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
        stats = rules.merge_stats.output,
        sc = config["stats_to_tsv"]
    output: "stats.tsv"
    container: containers["python3"]
    shell: "python {input.sc} -i {input.stats} > {output}"