#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
Main Snakefile for the pipeline.

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

import json
import jsonschema
from functools import partial
import os
from os.path import join, basename
from pathlib import Path
import itertools

# Read the json schema
with open(srcdir('config/schema.json'), 'rt') as fin:
    schema = json.load(fin)

# Validate the config against the schema
try:
    jsonschema.validate(config, schema)
except jsonschema.ValidationError as e:
    raise jsonschema.ValidationError(f'Invalid --configfile: {e.message}')
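
# For reference, a sketch of a --configfile with the keys this Snakefile reads
# (paths and names below are illustrative only; the authoritative definition
# lives in config/schema.json, and JSON works as well as YAML):
#
#   samples:
#     sample1:
#       read_groups:
#         lib1:
#           R1: /path/to/sample1_lib1_R1.fastq.gz
#           R2: /path/to/sample1_lib1_R2.fastq.gz
#   reference: /path/to/reference.fasta
#   dbsnp: /path/to/dbsnp.vcf.gz
#   known_sites:
#     - /path/to/known_sites.vcf.gz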

# If you specify a baitsfile, you also have to specify a targets file for
# picard
if "baitsfile" in config and "targetsfile" not in config:
    msg = 'Invalid --configfile: "baitsfile" specified without "targetsfile"'
    raise jsonschema.ValidationError(msg)

# A sample name cannot be a substring of another sample, since that breaks picard
# metrics parsing by multiqc
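# (for example, "sample_1" is contained in "sample_10", so such a pair of
# sample names would be rejected here)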
msg = 'Invalid --configfile: sample names should not overlap ("{s1}" is contained in "{s2}")'
for s1, s2 in itertools.permutations(config['samples'], 2):
    if s1 in s2:
        raise jsonschema.ValidationError(msg.format(s1=s1, s2=s2))

# Set default values
def set_default(key, value):
    """Set default config values"""
    if key not in config:
        config[key] = value

# Set the default config
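# set_default only fills in keys that are absent, so anything supplied via
# --configfile (or --config) takes precedence over these defaults.
# scatter_size is the approximate number of bases per chunk created by the
# scatterregions checkpoint; female_threshold is forwarded to
# src/collect_stats.py as --female-threshold.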
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Set the script paths
set_default("covstats", srcdir("src/covstats.py"))
set_default("collect_stats", srcdir("src/collect_stats.py"))
set_default("merge_stats", srcdir("src/merge_stats.py"))
set_default("stats_to_tsv", srcdir("src/stats_to_tsv.py"))
set_default("py_wordcount", srcdir("src/pywc.py"))
set_default("cutadapt_summary", srcdir("src/cutadapt_summary.py"))

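# Every rule runs inside one of the pinned container images below; run the
# pipeline with a container backend enabled (e.g. snakemake --use-singularity).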
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
    "bwa-0.7.17-picard-2.22.8": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:76d16eabff506ac13338d7f14644a0ad301b9d7e-0",
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
    "debian": "docker://debian:buster-slim",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "gvcf2coverage": "docker://lumc/gvcf2coverage:0.1-dirty-2",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.8--py_2",
    "picard": "docker://quay.io/biocontainers/picard:2.22.8--0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}

def get_readgroup(wildcards):
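    """Return the read_groups mapping for wildcards.sample (read group name -> R1/R2 fastq files)"""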
    return config["samples"][wildcards.sample]["read_groups"]

def get_readgroup_per_sample():
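    """Yield (read_group, sample) pairs for every read group of every sample"""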
    for sample in config["samples"]:
        for rg in config["samples"][sample]["read_groups"]:
            yield rg, sample

def coverage_stats(wildcards):
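    """Return the refFlat coverage files, but only if a refflat file was configured"""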
    files = expand("{sample}/coverage/refFlat_coverage.tsv",
                   sample=config["samples"])
    return files if "refflat" in config else []

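# The all rule lists every final target of the pipeline; optional targets
# (refFlat coverage, per-threshold coverage BED files) are only requested when
# the corresponding config keys are present.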
rule all:
    input:
        multiqc = "multiqc_report/multiqc_report.html",
        stats = "stats.json",
        stats_tsv = "stats.tsv",

        bam = expand("{sample}/bams/{sample}.bam",
                     sample=config["samples"]),

        vcfs = expand("{sample}/vcf/{sample}.vcf.gz",
                      sample=config["samples"]),

        vcf_tbi = expand("{sample}/vcf/{sample}.vcf.gz.tbi",
                         sample=config["samples"]),

        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz",
                       sample=config["samples"]),

        gvcf_tbi = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi",
                          sample=config["samples"]),

        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        cutadapt = (f"{sample}/pre_process/{sample}-{read_group}.txt"
                    for read_group, sample in get_readgroup_per_sample()),
        coverage_stats = coverage_stats,
        coverage_files = (f"{sample}/vcf/{sample}_{threshold}.bed"
                          for sample, threshold in itertools.product(
                              config['samples'], config['coverage_threshold'])
                          ) if 'coverage_threshold' in config else []

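# A bedtools genome file is simply "<contig>\t<length>", which is exactly the
# first two columns of the fasta index (.fai) of the reference.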
rule genome:
    """Create genome file as used by bedtools"""
    input: config["reference"]
    output: "current.genome"
    container: containers["debian"]
    shell: "awk -v OFS='\t' '{{print $1,$2}}' {input}.fai > {output}"

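# AGATCGGAAGAG is the shared prefix of the Illumina TruSeq adapters, which is
# why the same sequence is passed to both -a (R1) and -A (R2) below.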
rule cutadapt:
    """Clip fastq files"""
    input:
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    output:
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
    log:
        "{sample}/pre_process/{sample}-{read_group}.txt"
    container: containers["cutadapt"]
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--output {output.r1} --paired-output {output.r2} -Z "
           "{input.r1} {input.r2} "
           "--report=minimal > {log}"

rule align:
    """Align fastq files"""
    input:
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2,
        ref = config["reference"],
    params:
        rg = "@RG\\tID:{sample}-library-{read_group}\\tSM:{sample}\\tLB:library\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
    container: containers["bwa-0.7.17-picard-2.22.8"]
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
           "| picard -Xmx4G SortSam "
           "CREATE_INDEX=TRUE "
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

def markdup_bam_input(wildcards):
    """Generate the INPUT= arguments for MarkDuplicates, one per read group"""
    return ["INPUT={sample}/bams/{sample}-{read_group}.sorted.bam".format(
                sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)]

rule markdup:
    """Mark duplicates in BAM file"""
    input:
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
    output:
        bam = "{sample}/bams/{sample}.bam",
        bai = "{sample}/bams/{sample}.bai",
        metrics = "{sample}/bams/{sample}.metrics"
    params:
        bams = markdup_bam_input
    container: containers["picard"]
    shell: "picard -Xmx4G MarkDuplicates "
           "CREATE_INDEX=TRUE "
           "{params.bams} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

rule bai:
    """Copy bai files, as some genome browsers can only use .bam.bai files"""
    input:
        bai = rules.markdup.output.bai
    output:
        bai = "{sample}/bams/{sample}.bam.bai"
    container: containers["debian"]
    shell: "cp {input.bai} {output.bai}"

def bqsr_bam_input(wildcards):
    """Generate the bam input string for each read group for BQSR"""
    template = "-I {sample}/bams/{sample}-{read_group}.sorted.bam"
    return " ".join([template.format(sample=wildcards.sample,
            read_group=rg) for rg in get_readgroup(wildcards)])

rule baserecal:
    """Generate a base recalibration table for the BAM files"""
    input:
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
        ref = config["reference"],
        vcfs = config["known_sites"]
    output: "{sample}/bams/{sample}.baserecal.grp"
    params:
        known_sites = " ".join(
                expand("-knownSites {vcf}", vcf=config["known_sites"])
        ),
        bams = bqsr_bam_input
    container: containers["gatk"]
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
           "BaseRecalibrator {params.bams} -o {output} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites}"

checkpoint scatterregions:
    """Scatter the reference genome"""
    input:
        ref = config["reference"],
    params:
        size = config['scatter_size']
    output:
        directory("scatter")
    container: containers["biopet-scatterregions"]
    shell: "mkdir -p {output} && "
           "biopet-scatterregions -Xmx24G "
           "--referenceFasta {input.ref} --scatterSize {params.size} "
           "--outputDir scatter"

rule gvcf_scatter:
    """Run HaplotypeCaller in GVCF mode by chunk"""
    input:
        bam = rules.markdup.output.bam,
        bqsr = rules.baserecal.output,
        dbsnp = config["dbsnp"],
        ref = config["reference"],
        region = "scatter/scatter-{chunk}.bed"
    output:
        gvcf = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
    wildcard_constraints:
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
           "-T HaplotypeCaller -ERC GVCF -I "
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
           "-L '{input.region}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 "

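# The helpers below resolve the number of scatter chunks at runtime: they ask
# the scatterregions checkpoint for its output directory and glob the
# scatter-{i}.bed files it produced.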
def aggregate_gvcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_gvcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule gvcf_gather:
    """Join the gvcf files together"""
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
    container: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps "
           "--output {output.gvcf} --output-type z && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"

rule genotype_scatter:
    """Run GATK's GenotypeGVCFs by chunk"""
    input:
        gvcf = rules.gvcf_scatter.output.gvcf,
        tbi = rules.gvcf_scatter.output.gvcf_tbi,
        ref = config["reference"]
    output:
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
    wildcard_constraints:
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 "
           "/usr/GenomeAnalysisTK.jar -T "
           "GenotypeGVCFs -R {input.ref} "
           "-V {input.gvcf} -o '{output.vcf}'"

def aggregate_vcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_vcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule genotype_gather:
    """Gather all genotyping VCFs"""
    input:
        vcfs = aggregate_vcf,
        vcfs_tbi = aggregate_vcf_tbi
    output:
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
    container: containers["bcftools"]
    shell: "bcftools concat {input.vcfs} --allow-overlaps "
           "--output {output.vcf} --output-type z && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"

## bam metrics
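# samtools view -F 4 excludes unmapped reads; adding -F 1024 additionally
# excludes reads flagged as duplicates, which is the only difference between
# the two rules below.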
rule mapped_reads_bases:
    """Calculate number of mapped reads"""
    input:
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.mapped.num",
        bases = "{sample}/bams/{sample}.mapped.basenum"
    container: containers["samtools-1.7-python-3.6"]
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"

rule unique_reads_bases:
    """Calculate number of unique reads"""
    input:
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.unique.num",
        bases = "{sample}/bams/{sample}.usable.basenum"
    container: containers["samtools-1.7-python-3.6"]
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | "
           "python {input.pywc} --reads {output.reads} --bases {output.bases}"

## fastqc
rule fastqc_raw:
    """Run fastqc on raw fastq files"""
    input:
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    output:
        directory("{sample}/pre_process/raw-{sample}-{read_group}/")
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "

rule fastqc_postqc:
    """Run fastqc on fastq files post pre-processing"""
    input:
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2
    output:
        directory("{sample}/pre_process/trimmed-{sample}-{read_group}/")
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "

## coverage
Sander Bollen's avatar
Sander Bollen committed
385
rule covstats:
Sander Bollen's avatar
Sander Bollen committed
386
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
387
    input:
388
        bam = rules.markdup.output.bam,
van den Berg's avatar
van den Berg committed
389
390
        genome = "current.genome",
        covpy = config["covstats"],
van den Berg's avatar
van den Berg committed
391
        bed = config.get("targetsfile","")
Sander Bollen's avatar
Sander Bollen committed
392
    params:
van den Berg's avatar
van den Berg committed
393
        subt = "Sample {sample}"
Sander Bollen's avatar
Sander Bollen committed
394
    output:
van den Berg's avatar
van den Berg committed
395
396
397
        covj = "{sample}/coverage/covstats.json",
        covp = "{sample}/coverage/covstats.png"
    container: containers["bedtools-2.26-python-2.7"]
Sander Bollen's avatar
Sander Bollen committed
398
399
400
401
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
402

rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
        gvcf = rules.gvcf_gather.output.gvcf,
        tbi = rules.gvcf_gather.output.gvcf_tbi,
        ref = config.get('refflat', "")
    output:
        tsv = "{sample}/coverage/refFlat_coverage.tsv"
    container: containers["vtools"]
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"

rule collect_cutadapt_summary:
    """Collect cutadapt summary from each readgroup per sample"""
    input:
        cutadapt = lambda wildcards:
        ("{sample}/pre_process/{sample}-{read_group}.txt".format(
            sample=wildcards.sample, read_group=read_group)
            for read_group in get_readgroup(wildcards)),
        cutadapt_summary = config["cutadapt_summary"]
    output: "{sample}/cutadapt.json"
    container: containers["python3"]
    shell: "python {input.cutadapt_summary} --sample {wildcards.sample} "
           "--cutadapt-summary {input.cutadapt} > {output}"

rule collectstats:
    """Collect all stats for a particular sample"""
    input:
        mnum = rules.mapped_reads_bases.output.reads,
        mbnum = rules.mapped_reads_bases.output.bases,
        unum = rules.unique_reads_bases.output.reads,
        ubnum = rules.unique_reads_bases.output.bases,
        cov = rules.covstats.output.covj if "targetsfile" in config else [],
        cutadapt = rules.collect_cutadapt_summary.output,
        colpy = config["collect_stats"]
    params:
        fthresh = config["female_threshold"]
    output: "{sample}/{sample}.stats.json"
    container: containers["python3"]
    shell: "python {input.colpy} --sample-name {wildcards.sample} "
           "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
           "--unique-num {input.unum} --usable-basenum {input.ubnum} "
           "--female-threshold {params.fthresh} "
           "--cutadapt {input.cutadapt} "
           "--covstats {input.cov} > {output}"

rule multiple_metrics:
    """Run picard CollectMultipleMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
    params:
        prefix = "{sample}/bams/{sample}",
    output:
        alignment = "{sample}/bams/{sample}.alignment_summary_metrics",
        insertMetrics = "{sample}/bams/{sample}.insert_size_metrics"
    container: containers["picard"]
    shell: "picard CollectMultipleMetrics "
           "I={input.bam} O={params.prefix} "
           "R={input.ref} "
           "PROGRAM=CollectAlignmentSummaryMetrics "
           "PROGRAM=CollectInsertSizeMetrics "

rule bed_to_interval:
    """Run picard BedToIntervalList

    This is needed to convert the bed files for the capture kit to a format
    picard can read
    """
    input:
        targets = config.get("targetsfile", ""),
        baits = config.get("baitsfile", ""),
        ref = config["reference"]
    output:
        target_interval = "target.interval",
        baits_interval = "baits.interval"
    container: containers["picard"]
    shell: "picard -Xmx4G BedToIntervalList "
           "I={input.targets} O={output.target_interval} "
           "SD={input.ref} && "
           "picard BedToIntervalList "
           "I={input.baits} O={output.baits_interval} "
           "SD={input.ref} "

rule hs_metrics:
    """Run picard CollectHsMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
        targets = rules.bed_to_interval.output.target_interval,
        baits = rules.bed_to_interval.output.baits_interval,
    output: "{sample}/bams/{sample}.hs_metrics.txt"
    container: containers["picard"]
    shell: "picard CollectHsMetrics "
           "I={input.bam} O={output} "
           "R={input.ref} "
           "BAIT_INTERVALS={input.baits} "
           "TARGET_INTERVALS={input.targets}"

rule multiqc:
    """
    Create multiQC report

    The multiqc JSON outputs are used as input by the merge_stats rule
    """
    input:
        bam = expand("{sample}/bams/{sample}.bam", sample=config["samples"]),
        metric = expand("{sample}/bams/{sample}.metrics",
                        sample=config["samples"]),
        alignment_metrics = expand(
                "{sample}/bams/{sample}.alignment_summary_metrics",
                sample=config["samples"]
        ),
        insert_metrics = expand(
                "{sample}/bams/{sample}.insert_size_metrics",
                sample=config["samples"]
        ),
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        hs_metric = expand("{sample}/bams/{sample}.hs_metrics.txt",
                           sample=config["samples"]) if "baitsfile" in config else []

    output:
        html = "multiqc_report/multiqc_report.html",
        insertSize = "multiqc_report/multiqc_data/multiqc_picard_insertSize.json",
        AlignmentMetrics = "multiqc_report/multiqc_data/multiqc_picard_AlignmentSummaryMetrics.json",
        DuplicationMetrics = "multiqc_report/multiqc_data/multiqc_picard_dups.json",
        HsMetrics = "multiqc_report/multiqc_data/multiqc_picard_HsMetrics.json" if "baitsfile" in config else []
    container: containers["multiqc"]
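    # The "|| touch {output}" fallback creates the declared outputs even when
    # multiqc exits non-zero (presumably when a module has no data), so the
    # downstream merge_stats rule always finds its inputs.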
    shell: "multiqc --data-format json --force --outdir multiqc_report . "
           "|| touch {output}"

rule merge_stats:
    """Merge all stats of all samples"""
    input:
        cols = expand("{sample}/{sample}.stats.json",
                      sample=config['samples']),
        mpy = config["merge_stats"],
        insertSize = rules.multiqc.output.insertSize,
        AlignmentMetrics = rules.multiqc.output.AlignmentMetrics,
        DuplicationMetrics = rules.multiqc.output.DuplicationMetrics,
        HsMetrics = rules.multiqc.output.HsMetrics
    output: "stats.json"
    container: containers["vtools"]
    shell: "python {input.mpy} --collectstats {input.cols} "
           "--picard-insertSize {input.insertSize} "
           "--picard-AlignmentMetrics {input.AlignmentMetrics} "
           "--picard-DuplicationMetrics {input.DuplicationMetrics} "
           "--picard-HsMetrics {input.HsMetrics} > {output}"

rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
        stats = rules.merge_stats.output,
        sc = config["stats_to_tsv"]
    output: "stats.tsv"
    container: containers["python3"]
    shell: "python {input.sc} -i {input.stats} > {output}"

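# One BED file is produced per sample and per value in
# config["coverage_threshold"], matching the coverage_files targets requested
# in the all rule.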
rule gvcf2coverage:
    """ Determine coverage from gvcf files """
    input: rules.gvcf_gather.output.gvcf
    output: "{sample}/vcf/{sample}_{threshold}.bed"
    container: containers["gvcf2coverage"]
    shell: "gvcf2coverage -t {wildcards.threshold} < {input} | cut -f 1,2,3 > {output}"