Snakefile 22.2 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
#   hutspot - a DNAseq variant calling pipeline
van den Berg's avatar
van den Berg committed
2
#   Copyright (C) 2017-2019, Leiden University Medical Center
Sander Bollen's avatar
Sander Bollen committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

van den Berg's avatar
van den Berg committed
31
# Read the json schema
with open(srcdir('config/schema.json'), 'rt') as fin:
    schema = json.load(fin)

# Validate the config against the schema
try:
    jsonschema.validate(config, schema)
except jsonschema.ValidationError as e:
    # Re-raise with a message that points the user at their --configfile
    raise jsonschema.ValidationError(f'Invalid --configfile: {e.message}')

# If you specify a baitsfile, you also have to specify a targets file for
# picard
if "baitsfile" in config and "targetsfile" not in config:
    msg = 'Invalid --configfile: "baitsfile" specified without "targetsfile"'
    raise jsonschema.ValidationError(msg)

# A sample name cannot be a substring of another sample, since that breaks picard
# metrics parsing by multiqc
msg = 'Invalid --configfile: sample names should not overlap ("{s1}" is contained in "{s2}")'
# permutations checks every ordered pair, so both containment directions are tested
for s1, s2 in itertools.permutations(config['samples'], 2):
    if s1 in s2:
        raise jsonschema.ValidationError(msg.format(s1=s1, s2=s2))

54
55
# Set default values
def set_default(key, value):
    """Insert *value* into the global config unless *key* is already set.

    Thin wrapper around dict.setdefault so user-supplied config values
    always win over pipeline defaults.
    """
    config.setdefault(key, value)

# Set the default config
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Set the script paths
set_default("covstats", srcdir("src/covstats.py"))
set_default("collect_stats", srcdir("src/collect_stats.py"))
set_default("merge_stats", srcdir("src/merge_stats.py"))
set_default("stats_to_tsv", srcdir("src/stats_to_tsv.py"))
set_default("py_wordcount", srcdir("src/pywc.py"))
set_default("cutadapt_summary", srcdir("src/cutadapt_summary.py"))
Sander Bollen's avatar
Sander Bollen committed
71

72
73
74
# Pinned container images for every rule; versions are encoded in the tags so
# the pipeline is reproducible.
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
    "debian": "docker://debian:buster-slim",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.8--py_2",
    "picard": "docker://quay.io/biocontainers/picard:2.22.8--0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
87

88
def get_readgroup(wildcards):
    """Return the read-group mapping configured for the wildcard sample."""
    sample_config = config["samples"][wildcards.sample]
    return sample_config["read_groups"]
Sander Bollen's avatar
Sander Bollen committed
90

van den Berg's avatar
van den Berg committed
91
def get_readgroup_per_sample():
    """Yield (read_group, sample) pairs for every read group of every sample."""
    for sample, properties in config["samples"].items():
        for read_group in properties["read_groups"]:
            yield read_group, sample

96
def coverage_stats(wildcards):
    """Return the refFlat coverage targets, or [] when no refflat is configured."""
    if "refflat" not in config:
        return []
    return expand("{sample}/coverage/refFlat_coverage.tsv",
                  sample=config["samples"])
Sander Bollen's avatar
Sander Bollen committed
100

Sander Bollen's avatar
Sander Bollen committed
101
102
rule all:
    """Default target: every final output of the pipeline"""
    input:
        multiqc = "multiqc_report/multiqc_report.html",
        stats = "stats.json",
        stats_tsv = "stats.tsv",

        bam = expand("{sample}/bams/{sample}.bam",
                     sample=config["samples"]),

        vcfs = expand("{sample}/vcf/{sample}.vcf.gz",
                      sample=config["samples"]),

        vcf_tbi = expand("{sample}/vcf/{sample}.vcf.gz.tbi",
                         sample=config["samples"]),

        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz",
                       sample=config["samples"]),

        gvcf_tbi = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi",
                          sample=config["samples"]),

        # FastQC output directories, one per (sample, read group) pair
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        cutadapt = (f"{sample}/pre_process/{sample}-{read_group}.txt"
                    for read_group, sample in get_readgroup_per_sample()),
        # Input function: empty list unless "refflat" is configured
        coverage_stats = coverage_stats,
Sander Bollen's avatar
Sander Bollen committed
131

Sander Bollen's avatar
Sander Bollen committed
132
133
rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
    # Consumers declare this directory with ancient() so its mtime never
    # retriggers downstream rules.
    output: directory("tmp")
    container: containers["debian"]
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
138
rule genome:
    """Create genome file as used by bedtools"""
    input: config["reference"]
    output: "current.genome"
    container: containers["debian"]
    # Chromosome name and length come from the fasta index ({input}.fai);
    # the doubled braces escape literal awk braces from snakemake formatting.
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule cutadapt:
    """Clip fastq files"""
    input:
        # Raw fastq paths are looked up per sample/read group from the config
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    output:
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
    log:
        # The minimal report written here is later parsed by
        # collect_cutadapt_summary
        "{sample}/pre_process/{sample}-{read_group}.txt"
    container: containers["cutadapt"]
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--output {output.r1} --paired-output {output.r2} -Z "
           "{input.r1} {input.r2} "
           "--report=minimal > {log}"
Sander Bollen's avatar
Sander Bollen committed
161
162

rule align:
    """Align fastq files"""
    input:
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2,
        ref = config["reference"],
        # ancient(): don't re-align when only the tmp dir's mtime changed
        tmp = ancient("tmp")
    params:
        # Read-group header line injected into the BAM via bwa mem -R
        rg = "@RG\\tID:{sample}-library-{read_group}\\tSM:{sample}\\tLB:library\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
    container: containers["bwa-0.7.17-picard-2.18.7"]
    # Pipe bwa straight into picard SortSam to avoid an intermediate file
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

178
def markdup_bam_input(wildcards):
    """Build the INPUT=... arguments for MarkDuplicates, one per read group."""
    sample = wildcards.sample
    return [f"INPUT={sample}/bams/{sample}-{rg}.sorted.bam"
            for rg in get_readgroup(wildcards)]
183

Sander Bollen's avatar
Sander Bollen committed
184
rule markdup:
    """Mark duplicates in BAM file"""
    input:
        # All per-read-group sorted BAMs of this sample
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
        tmp = ancient("tmp")
    output:
        bam = "{sample}/bams/{sample}.bam",
        bai = "{sample}/bams/{sample}.bai",
        metrics = "{sample}/bams/{sample}.metrics"
    params:
        # Same BAMs as input.bam, but pre-formatted as INPUT=... arguments
        bams=markdup_bam_input
    container: containers["picard"]
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "{params.bams} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
205
206
207
rule bai:
    """Copy bai files as some genome browsers can only use .bam.bai files"""
    input:
        bai = rules.markdup.output.bai
    output:
        bai = "{sample}/bams/{sample}.bam.bai"
    container: containers["debian"]
    shell: "cp {input.bai} {output.bai}"

214
def bqsr_bam_input(wildcards):
    """Build the space-separated -I arguments for BQSR, one per read group."""
    sample = wildcards.sample
    arguments = (f"-I {sample}/bams/{sample}-{rg}.sorted.bam"
                 for rg in get_readgroup(wildcards))
    return " ".join(arguments)

Sander Bollen's avatar
Sander Bollen committed
220
rule baserecal:
    """Base recalibrated BAM files"""
    input:
        # All per-read-group sorted BAMs of this sample
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
        ref = config["reference"],
        vcfs = config["known_sites"]
    output: "{sample}/bams/{sample}.baserecal.grp"
    params:
        # Each known-sites VCF becomes its own -knownSites argument
        known_sites = " ".join(
                expand("-knownSites {vcf}", vcf=config["known_sites"])
        ),
        bams = bqsr_bam_input
    container: containers["gatk"]
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
           "BaseRecalibrator {params.bams} -o {output} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites}"
Sander Bollen's avatar
Sander Bollen committed
240

241
# Checkpoint (not a rule): the number of scatter-*.bed chunks is only known
# after execution; aggregate_* input functions re-evaluate it via
# checkpoints.scatterregions.get().
checkpoint scatterregions:
    """Scatter the reference genome"""
    input:
        ref = config["reference"],
    params:
        size = config['scatter_size']
    output:
        directory("scatter")
    container: containers["biopet-scatterregions"]
    shell: "mkdir -p {output} && "
           "biopet-scatterregions -Xmx24G "
           "--referenceFasta {input.ref} --scatterSize {params.size} "
           "--outputDir scatter"

Sander Bollen's avatar
Sander Bollen committed
255
rule gvcf_scatter:
    """Run HaplotypeCaller in GVCF mode by chunk"""
    input:
        bam = rules.markdup.output.bam,
        bqsr = rules.baserecal.output,
        dbsnp = config["dbsnp"],
        ref = config["reference"],
        region = "scatter/scatter-{chunk}.bed"
    output:
        # temp(): per-chunk g.vcfs are deleted after gvcf_gather merges them
        gvcf = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
    wildcard_constraints:
        # Restrict chunk to digits so "{sample}.{chunk}" parses unambiguously
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
           "-T HaplotypeCaller -ERC GVCF -I "
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
           "-L '{input.region}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 "
Sander Bollen's avatar
Sander Bollen committed
277

278
279
280
def aggregate_gvcf(wildcards):
    """Return the per-chunk g.vcf files produced by gvcf_scatter.

    Evaluated lazily: checkpoints.scatterregions.get() raises until the
    scatterregions checkpoint has run, after which the chunk indices are
    discovered from the scatter-*.bed files on disk.
    """
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    # Use the explicitly imported os.path.join (the bare `os` module is
    # never imported by this file)
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
       i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
282
283
284
285

def aggregate_gvcf_tbi(wildcards):
    """Return the per-chunk g.vcf index files produced by gvcf_scatter.

    Evaluated lazily after the scatterregions checkpoint; chunk indices are
    discovered from the scatter-*.bed files on disk.
    """
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    # Use the explicitly imported os.path.join (the bare `os` module is
    # never imported by this file)
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
       i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
287
288

rule gvcf_gather:
    """Join the gvcf files together"""
    input:
        # Input functions resolve the chunk list via the scatterregions
        # checkpoint
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
    container: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps "
           "--output {output.gvcf} --output-type z && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
300
301

rule genotype_scatter:
    """Run GATK's GenotypeGVCFs by chunk"""
    input:
        gvcf = rules.gvcf_scatter.output.gvcf,
        tbi = rules.gvcf_scatter.output.gvcf_tbi,
        ref= config["reference"]
    output:
        # temp(): per-chunk vcfs are deleted after genotype_gather merges them
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
    wildcard_constraints:
        # Restrict chunk to digits so "{sample}.{chunk}" parses unambiguously
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 "
           "/usr/GenomeAnalysisTK.jar -T "
           "GenotypeGVCFs -R {input.ref} "
           "-V {input.gvcf} -o '{output.vcf}'"
317

318
def aggregate_vcf(wildcards):
    """Return the per-chunk vcf files produced by genotype_scatter.

    Evaluated lazily after the scatterregions checkpoint; chunk indices are
    discovered from the scatter-*.bed files on disk.
    """
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    # Use the explicitly imported os.path.join (the bare `os` module is
    # never imported by this file)
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
       i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
322

323
324
325
def aggregate_vcf_tbi(wildcards):
    """Return the per-chunk vcf index files produced by genotype_scatter.

    Evaluated lazily after the scatterregions checkpoint; chunk indices are
    discovered from the scatter-*.bed files on disk.
    """
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    # Use the explicitly imported os.path.join (the bare `os` module is
    # never imported by this file)
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
       i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
327

Sander Bollen's avatar
Sander Bollen committed
328
rule genotype_gather:
    """Gather all genotyping VCFs"""
    input:
        # Input functions resolve the chunk list via the scatterregions
        # checkpoint
        vcfs = aggregate_vcf,
        vcfs_tbi = aggregate_vcf_tbi
    output:
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
    container: containers["bcftools"]
    shell: "bcftools concat {input.vcfs} --allow-overlaps "
           "--output {output.vcf} --output-type z && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
340

Sander Bollen's avatar
Sander Bollen committed
341
## bam metrics
342
## bam metrics
rule mapped_reads_bases:
    """Calculate number of mapped reads"""
    input:
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.mapped.num",
        bases = "{sample}/bams/{sample}.mapped.basenum"
    container: containers["samtools-1.7-python-3.6"]
    # -F 4 drops unmapped reads; column 10 of SAM is the read sequence
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
353

354
rule unique_reads_bases:
    """Calculate number of unique reads"""
    input:
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.unique.num",
        bases = "{sample}/bams/{sample}.usable.basenum"
    container: containers["samtools-1.7-python-3.6"]
    # -F 4 drops unmapped reads, -F 1024 drops PCR/optical duplicates;
    # column 10 of SAM is the read sequence
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | "
           "python {input.pywc} --reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
365
366

## fastqc
Sander Bollen's avatar
Sander Bollen committed
367
## fastqc
rule fastqc_raw:
    """Run fastqc on raw fastq files"""
    input:
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    output:
        directory("{sample}/pre_process/raw-{sample}-{read_group}/")
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
376

Sander Bollen's avatar
Sander Bollen committed
377
rule fastqc_postqc:
    """Run fastqc on fastq files post pre-processing"""
    input:
        # Trimmed fastqs from cutadapt
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2
    output:
        directory("{sample}/pre_process/trimmed-{sample}-{read_group}/")
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
386

387
## coverage
Sander Bollen's avatar
Sander Bollen committed
388
## coverage
rule covstats:
    """Calculate coverage statistics on bam file"""
    input:
        bam = rules.markdup.output.bam,
        genome = "current.genome",
        covpy = config["covstats"],
        # Empty string when no targetsfile is configured; collectstats only
        # requests this rule when "targetsfile" is present
        bed = config.get("targetsfile","")
    params:
        subt = "Sample {sample}"
    output:
        covj = "{sample}/coverage/covstats.json",
        covp = "{sample}/coverage/covstats.png"
    container: containers["bedtools-2.26-python-2.7"]
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
405

Sander Bollen's avatar
Sander Bollen committed
406
407
408
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
        gvcf = rules.gvcf_gather.output.gvcf,
        tbi = rules.gvcf_gather.output.gvcf_tbi,
        # Empty string when no refflat is configured; this rule is only
        # requested (via coverage_stats) when "refflat" is present
        ref = config.get('refflat', "")
    output:
        tsv = "{sample}/coverage/refFlat_coverage.tsv"
    container: containers["vtools"]
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"

417
rule collect_cutadapt_summary:
    """Collect cutadapt summary from each readgroup per sample """
    input:
        # The per-read-group cutadapt log files written by the cutadapt rule
        cutadapt = lambda wildcards:
        ("{sample}/pre_process/{sample}-{read_group}.txt".format(
            sample=wildcards.sample, read_group=read_group)
            for read_group in get_readgroup(wildcards)),
        cutadapt_summary= config["cutadapt_summary"]
    output: "{sample}/cutadapt.json"
    container: containers["python3"]
    shell: "python {input.cutadapt_summary} --sample {wildcards.sample} "
           "--cutadapt-summary {input.cutadapt} > {output}"

430
431
432
433
434
435
436
rule collectstats:
    """Collect all stats for a particular sample"""
    input:
        mnum = rules.mapped_reads_bases.output.reads,
        mbnum = rules.mapped_reads_bases.output.bases,
        unum = rules.unique_reads_bases.output.reads,
        ubnum = rules.unique_reads_bases.output.bases,
        # Coverage stats only exist when a targetsfile was configured
        cov = rules.covstats.output.covj if "targetsfile" in config else [],
        cutadapt = rules.collect_cutadapt_summary.output,
        colpy = config["collect_stats"]
    params:
        fthresh = config["female_threshold"]
    output: "{sample}/{sample}.stats.json"
    container: containers["python3"]
    shell: "python {input.colpy} --sample-name {wildcards.sample} "
           "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
           "--unique-num {input.unum} --usable-basenum {input.ubnum} "
           "--female-threshold {params.fthresh} "
           "--cutadapt {input.cutadapt} "
           "--covstats {input.cov} > {output}"
450

van den Berg's avatar
van den Berg committed
451
452
453
rule multiple_metrics:
    """Run picard CollectMultipleMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
    params:
        # Picard derives the actual metric filenames from this prefix
        prefix = "{sample}/bams/{sample}",
    output:
        alignment = "{sample}/bams/{sample}.alignment_summary_metrics",
        insertMetrics = "{sample}/bams/{sample}.insert_size_metrics"
    container: containers["picard"]
    shell: "picard CollectMultipleMetrics "
           "I={input.bam} O={params.prefix} "
           "R={input.ref} "
           "PROGRAM=CollectAlignmentSummaryMetrics "
           "PROGRAM=CollectInsertSizeMetrics "

468
469
470
471
472
473
474
rule bed_to_interval:
    """Run picard BedToIntervalList

    This is needed to convert the bed files for the capture kit to a format
    picard can read
    """
    input:
van den Berg's avatar
van den Berg committed
475
        targets = config.get("targetsfile",""),
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
        baits = config.get("baitsfile",""),
        ref = config["reference"]
    output:
        target_interval = "target.interval",
        baits_interval = "baits.interval"
    output:
    container: containers["picard"]
    shell: "picard BedToIntervalList "
            "I={input.targets} O={output.target_interval} "
            "SD={input.ref} && "
            "picard BedToIntervalList "
            "I={input.baits} O={output.baits_interval} "
            "SD={input.ref} "

rule hs_metrics:
    """Run picard CollectHsMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
        targets = rules.bed_to_interval.output.target_interval,
        baits = rules.bed_to_interval.output.baits_interval,
    output: "{sample}/bams/{sample}.hs_metrics.txt"
    container: containers["picard"]
    shell: "picard CollectHsMetrics "
           "I={input.bam} O={output} "
           "R={input.ref} "
           "BAIT_INTERVALS={input.baits} "
           "TARGET_INTERVALS={input.targets}"

Sander Bollen's avatar
Sander Bollen committed
505
506
507
rule multiqc:
    """
    Create multiQC report

    Depends on stats.tsv to forcefully run at end of pipeline
    """
    # NOTE(review): the docstring mentions stats.tsv but it is not listed as
    # an input below — confirm whether that dependency was dropped on purpose.
    input:
        bam = expand("{sample}/bams/{sample}.bam", sample=config["samples"]),
        metric = expand("{sample}/bams/{sample}.metrics",
                        sample=config["samples"]),
        alignment_metrics = expand(
                "{sample}/bams/{sample}.alignment_summary_metrics",
                sample=config["samples"]
        ),
        insert_metrics = expand(
                "{sample}/bams/{sample}.insert_size_metrics",
                sample=config["samples"]
        ),
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        # HsMetrics are only produced when a baitsfile was configured
        hs_metric = expand("{sample}/bams/{sample}.hs_metrics.txt",
                           sample=config["samples"]) if "baitsfile" in config else []

    output:
        html = "multiqc_report/multiqc_report.html",
        insertSize = "multiqc_report/multiqc_data/multiqc_picard_insertSize.json",
        AlignmentMetrics = "multiqc_report/multiqc_data/multiqc_picard_AlignmentSummaryMetrics.json",
        DuplicationMetrics = "multiqc_report/multiqc_data/multiqc_picard_dups.json",
        HsMetrics = "multiqc_report/multiqc_data/multiqc_picard_HsMetrics.json" if "baitsfile" in config else []
    container: containers["multiqc"]
    # Best effort: if multiqc fails, create empty outputs so downstream
    # rules (merge_stats) can still run.
    shell: "multiqc --data-format json --force --outdir multiqc_report . "
           "|| touch {output}"
540
541
542
543
544
545
546

rule merge_stats:
    """Merge all stats of all samples"""
    input:
        cols = expand("{sample}/{sample}.stats.json",
                      sample=config['samples']),
        mpy = config["merge_stats"],
        # The json files multiqc extracted from the picard metrics
        insertSize = rules.multiqc.output.insertSize,
        AlignmentMetrics = rules.multiqc.output.AlignmentMetrics,
        DuplicationMetrics = rules.multiqc.output.DuplicationMetrics,
        HsMetrics = rules.multiqc.output.HsMetrics
    output: "stats.json"
    container: containers["vtools"]
    shell: "python {input.mpy} --collectstats {input.cols} "
           "--picard-insertSize {input.insertSize} "
           "--picard-AlignmentMetrics {input.AlignmentMetrics} "
           "--picard-DuplicationMetrics {input.DuplicationMetrics} "
           "--picard-HsMetrics {input.HsMetrics} > {output}"
558
559
560
561
562
563
564
565
566
567

rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
        stats = rules.merge_stats.output,
        sc = config["stats_to_tsv"]
    output: "stats.tsv"
    container: containers["python3"]
    shell: "python {input.sc} -i {input.stats} > {output}"