Snakefile 23.4 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
#   hutspot - a DNAseq variant calling pipeline
van den Berg's avatar
van den Berg committed
2
#   Copyright (C) 2017-2019, Leiden University Medical Center
Sander Bollen's avatar
Sander Bollen committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

van den Berg's avatar
van den Berg committed
31
# Read the json schema that describes a valid --configfile
with open(srcdir('config/schema.json'), 'rt') as fin:
    schema = json.load(fin)

# Validate the config against the schema
try:
    jsonschema.validate(config, schema)
except jsonschema.ValidationError as e:
    # Re-raise with a friendlier message that points the user at the config
    raise jsonschema.ValidationError(f'Invalid --configfile: {e.message}')


# If you specify a baitsfile, you also have to specify a targets file for
# picard
if "baitsfile" in config and "targetsfile" not in config:
    msg = 'Invalid --configfile: "baitsfile" specified without "targetsfile"'
    raise jsonschema.ValidationError(msg)

# A sample name cannot be a substring of another sample, since that breaks picard
# metrics parsing by multiqc
msg = 'Invalid --configfile: sample names should not overlap ("{s1}" is contained in "{s2}")'
for s1, s2 in itertools.permutations(config['samples'], 2):
    if s1 in s2:
        raise jsonschema.ValidationError(msg.format(s1=s1, s2=s2))

54
55
# Set default values
def set_default(key, value):
    """Store *value* under *key* in the global config, unless already set."""
    config.setdefault(key, value)
van den Berg's avatar
van den Berg committed
59

van den Berg's avatar
van den Berg committed
60
# Set the default config
set_default('scatter_size', 1000000000)  # bases per scatter chunk (biopet-scatterregions --scatterSize)
set_default('female_threshold', 0.6)  # passed to collect_stats.py as --female-threshold

# Set the script paths (can be overridden from the config)
set_default("covstats", srcdir("src/covstats.py"))
set_default("collect_stats", srcdir("src/collect_stats.py"))
set_default("merge_stats", srcdir("src/merge_stats.py"))
set_default("stats_to_tsv", srcdir("src/stats_to_tsv.py"))
set_default("py_wordcount", srcdir("src/pywc.py"))
set_default("cutadapt_summary", srcdir("src/cutadapt_summary.py"))
Sander Bollen's avatar
Sander Bollen committed
71

72
73
74
# Pinned container images; every rule runs inside one of these
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
    "bwa-0.7.17-picard-2.22.8": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:76d16eabff506ac13338d7f14644a0ad301b9d7e-0",
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
    "debian": "docker://debian:buster-slim",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "gvcf2coverage": "docker://lumc/gvcf2coverage:0.1-dirty-2",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.8--py_2",
    "picard": "docker://quay.io/biocontainers/picard:2.22.8--0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
88

89
def get_readgroup(wildcards):
    """Return the read groups configured for the current sample."""
    sample = config["samples"][wildcards.sample]
    return sample["read_groups"]
Sander Bollen's avatar
Sander Bollen committed
91

van den Berg's avatar
van den Berg committed
92
def get_readgroup_per_sample():
    """Yield every (read_group, sample) pair defined in the config."""
    for sample, properties in config["samples"].items():
        for read_group in properties["read_groups"]:
            yield read_group, sample

97
def coverage_stats(wildcards):
    """Return the refFlat coverage files, or nothing when no refflat is set."""
    if "refflat" not in config:
        return []
    return expand("{sample}/coverage/refFlat_coverage.tsv",
                  sample=config["samples"])
Sander Bollen's avatar
Sander Bollen committed
101

Sander Bollen's avatar
Sander Bollen committed
102
103
# Top-level target: everything the pipeline should produce
rule all:
    input:
        multiqc = "multiqc_report/multiqc_report.html",
        stats = "stats.json",
        stats_tsv = "stats.tsv",

        bam = expand("{sample}/bams/{sample}.bam",
                     sample=config["samples"]),

        vcfs = expand("{sample}/vcf/{sample}.vcf.gz",
                      sample=config["samples"]),

        vcf_tbi = expand("{sample}/vcf/{sample}.vcf.gz.tbi",
                         sample=config["samples"]),

        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz",
                       sample=config["samples"]),

        gvcf_tbi = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi",
                          sample=config["samples"]),

        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/.done"
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/.done"
                      for read_group, sample in get_readgroup_per_sample()),

        cutadapt = (f"{sample}/pre_process/{sample}-{read_group}.txt"
                    for read_group, sample in get_readgroup_per_sample()),
        coverage_stats = coverage_stats,
        # Per-sample coverage bed files, only when coverage_threshold is set
        coverage_files = (f"{sample}/vcf/{sample}_{threshold}.bed"
                          for sample, threshold in itertools.product(
                              config['samples'], config['coverage_threshold'])
                          ) if 'coverage_threshold' in config else []
Sander Bollen's avatar
Sander Bollen committed
136

137
138
139
140
141
142
rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
    output: directory("tmp")
    container: containers["debian"]
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
143
rule genome:
    """Create genome file as used by bedtools"""
    input: config["reference"]
    output: "current.genome"
    container: containers["debian"]
    # Reads the fasta index next to the reference ({input}.fai), so the
    # reference must already be indexed
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule cutadapt:
    """Clip fastq files"""
    input:
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    output:
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
    log:
        # The minimal cutadapt report; parsed later by collect_cutadapt_summary
        "{sample}/pre_process/{sample}-{read_group}.txt"
    container: containers["cutadapt"]
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--compression-level=1 -j 8 "
           "--output {output.r1} --paired-output {output.r2} -Z "
           "{input.r1} {input.r2} "
           "--report=minimal > {log}"
Sander Bollen's avatar
Sander Bollen committed
167
168

rule align:
    """Align fastq files"""
    input:
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2,
        ref = config["reference"],
        # ancient(): do not re-align when only the tmp dir timestamp changed
        tmp = ancient("tmp")
    params:
        rg = "@RG\\tID:{sample}-library-{read_group}\\tSM:{sample}\\tLB:library\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
    container: containers["bwa-0.7.17-picard-2.22.8"]
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

184
def markdup_bam_input(wildcards):
    """Build the INPUT=... argument for every read group bam of a sample."""
    template = "INPUT={sample}/bams/{sample}-{read_group}.sorted.bam"
    return [template.format(sample=wildcards.sample, read_group=read_group)
            for read_group in get_readgroup(wildcards)]
189

Sander Bollen's avatar
Sander Bollen committed
190
rule markdup:
    """Mark duplicates in BAM file"""
    input:
        # One sorted bam per read group of this sample
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
        tmp = ancient("tmp")
    output:
        bam = "{sample}/bams/{sample}.bam",
        bai = "{sample}/bams/{sample}.bai",
        metrics = "{sample}/bams/{sample}.metrics"
    params:
        # The same bams as input.bam, but formatted as INPUT=... arguments
        bams=markdup_bam_input
    container: containers["picard"]
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "{params.bams} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
211
212
213
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
        bai = rules.markdup.output.bai
    output:
        bai = "{sample}/bams/{sample}.bam.bai"
    container: containers["debian"]
    shell: "cp {input.bai} {output.bai}"

220
def bqsr_bam_input(wildcards):
    """Build the GATK -I flags for every read group bam of a sample."""
    flags = []
    for read_group in get_readgroup(wildcards):
        flags.append("-I {sample}/bams/{sample}-{read_group}.sorted.bam"
                     .format(sample=wildcards.sample, read_group=read_group))
    return " ".join(flags)

Sander Bollen's avatar
Sander Bollen committed
226
rule baserecal:
    """Base recalibrated BAM files"""
    input:
        # One sorted bam per read group of this sample
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
        ref = config["reference"],
        vcfs = config["known_sites"]
    output: "{sample}/bams/{sample}.baserecal.grp"
    params:
        known_sites = " ".join(
                expand("-knownSites {vcf}", vcf=config["known_sites"])
        ),
        # Optionally restrict BQSR to a region, e.g. a single chromosome
        region = "-L "+ config["restrict_BQSR"] if "restrict_BQSR" in config else "",
        bams = bqsr_bam_input
    container: containers["gatk"]
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
           "BaseRecalibrator {params.bams} -o {output} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
           "{params.region} "
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites}"
Sander Bollen's avatar
Sander Bollen committed
248

249
# A checkpoint (not a rule): the number of scatter-*.bed chunks is only
# known after it runs, and is queried by the aggregate_* input functions
checkpoint scatterregions:
    """Scatter the reference genome"""
    input:
        ref = config["reference"],
    params:
        size = config['scatter_size']
    output:
        directory("scatter")
    container: containers["biopet-scatterregions"]
    shell: "mkdir -p {output} && "
           "biopet-scatterregions -Xmx24G "
           "--referenceFasta {input.ref} --scatterSize {params.size} "
           "--outputDir scatter"

Sander Bollen's avatar
Sander Bollen committed
263
rule gvcf_scatter:
    """Run HaplotypeCaller in GVCF mode by chunk"""
    input:
        bam = rules.markdup.output.bam,
        bqsr = rules.baserecal.output,
        dbsnp = config["dbsnp"],
        ref = config["reference"],
        region = "scatter/scatter-{chunk}.bed"
    output:
        # temp(): per-chunk g.vcf files are removed after gvcf_gather
        gvcf = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
    wildcard_constraints:
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
           "-T HaplotypeCaller -ERC GVCF -I "
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
           "-L '{input.region}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 "
Sander Bollen's avatar
Sander Bollen committed
285

286
287
288
def aggregate_gvcf(wildcards):
    """Return the per-chunk g.vcf files, once scatterregions has run.

    Calling checkpoints.scatterregions.get() defers evaluation until the
    checkpoint is complete, so the scatter-{i}.bed files exist on disk.
    """
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    # Use 'join' (imported from os.path at the top of this file) instead of
    # relying on the implicitly available 'os' module
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
       i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
290
291
292
293

def aggregate_gvcf_tbi(wildcards):
    """Return the per-chunk g.vcf index files, once scatterregions has run."""
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    # Use 'join' (imported from os.path at the top of this file) instead of
    # relying on the implicitly available 'os' module
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
       i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
295
296

rule gvcf_gather:
    """Join the gvcf files together"""
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
    container: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps "
           "--output {output.gvcf} --output-type z && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
308
309

rule genotype_scatter:
    """Run GATK's GenotypeGVCFs by chunk"""
    input:
        gvcf = rules.gvcf_scatter.output.gvcf,
        tbi = rules.gvcf_scatter.output.gvcf_tbi,
        ref= config["reference"]
    output:
        # temp(): per-chunk vcf files are removed after genotype_gather
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
    wildcard_constraints:
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 "
           "/usr/GenomeAnalysisTK.jar -T "
           "GenotypeGVCFs -R {input.ref} "
           "-V {input.gvcf} -o '{output.vcf}'"
325

326
def aggregate_vcf(wildcards):
    """Return the per-chunk vcf files, once scatterregions has run."""
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    # Use 'join' (imported from os.path at the top of this file) instead of
    # relying on the implicitly available 'os' module
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
       i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
330

331
332
333
def aggregate_vcf_tbi(wildcards):
    """Return the per-chunk vcf index files, once scatterregions has run."""
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    # Use 'join' (imported from os.path at the top of this file) instead of
    # relying on the implicitly available 'os' module
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
       i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
335

Sander Bollen's avatar
Sander Bollen committed
336
rule genotype_gather:
    """Gather all genotyping VCFs"""
    input:
        vcfs = aggregate_vcf,
        vcfs_tbi = aggregate_vcf_tbi
    output:
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
    container: containers["bcftools"]
    shell: "bcftools concat {input.vcfs} --allow-overlaps "
           "--output {output.vcf} --output-type z && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
348

Sander Bollen's avatar
Sander Bollen committed
349
## bam metrics
350
## bam metrics
rule mapped_reads_bases:
    """Calculate number of mapped reads"""
    input:
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.mapped.num",
        bases = "{sample}/bams/{sample}.mapped.basenum"
    container: containers["samtools-1.7-python-3.6"]
    # -F 4: exclude unmapped reads; column 10 holds the read sequence
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
361

362
rule unique_reads_bases:
    """Calculate number of unique reads"""
    input:
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.unique.num",
        bases = "{sample}/bams/{sample}.usable.basenum"
    container: containers["samtools-1.7-python-3.6"]
    # -F 4: exclude unmapped reads; -F 1024: exclude PCR/optical duplicates
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | "
           "python {input.pywc} --reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
373
374

## fastqc
Sander Bollen's avatar
Sander Bollen committed
375
## fastqc
rule fastqc_raw:
    """Run fastqc on raw fastq files"""
    input:
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    params:
        folder = "{sample}/pre_process/raw-{sample}-{read_group}"
    output:
        # Sentinel file, since the fastqc output names depend on the input names
        done = "{sample}/pre_process/raw-{sample}-{read_group}/.done"
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o {params.folder} {input.r1} {input.r2} && "
           "touch {output.done}"
Sander Bollen's avatar
Sander Bollen committed
387

Sander Bollen's avatar
Sander Bollen committed
388
rule fastqc_postqc:
    """Run fastqc on fastq files post pre-processing"""
    input:
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2
    params:
        folder = "{sample}/pre_process/trimmed-{sample}-{read_group}"
    output:
        # Sentinel file, since the fastqc output names depend on the input names
        done = "{sample}/pre_process/trimmed-{sample}-{read_group}/.done"
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o {params.folder} {input.r1} {input.r2} && "
           "touch {output.done}"
Sander Bollen's avatar
Sander Bollen committed
400

401
## coverage
Sander Bollen's avatar
Sander Bollen committed
402
## coverage
rule covstats:
    """Calculate coverage statistics on bam file"""
    input:
        bam = rules.markdup.output.bam,
        genome = "current.genome",
        covpy = config["covstats"],
        # Empty string when no targetsfile is configured; this rule is then
        # never requested (see rule collectstats)
        bed = config.get("targetsfile","")
    params:
        subt = "Sample {sample}"
    output:
        covj = "{sample}/coverage/covstats.json",
        covp = "{sample}/coverage/covstats.png"
    container: containers["bedtools-2.26-python-2.7"]
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
419

Sander Bollen's avatar
Sander Bollen committed
420
421
422
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
        gvcf = rules.gvcf_gather.output.gvcf,
        tbi = rules.gvcf_gather.output.gvcf_tbi,
        # Empty string when no refflat is configured; this rule is then never
        # requested (see coverage_stats)
        ref = config.get('refflat', "")
    output:
        tsv = "{sample}/coverage/refFlat_coverage.tsv"
    container: containers["vtools"]
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"

431
rule collect_cutadapt_summary:
    """Collect cutadapt summary from each readgroup per sample """
    input:
        # One cutadapt log per read group of this sample
        cutadapt = lambda wildcards:
        ("{sample}/pre_process/{sample}-{read_group}.txt".format(
            sample=wildcards.sample, read_group=read_group)
            for read_group in get_readgroup(wildcards)),
        cutadapt_summary= config["cutadapt_summary"]
    output: "{sample}/cutadapt.json"
    container: containers["python3"]
    shell: "python {input.cutadapt_summary} --sample {wildcards.sample} "
           "--cutadapt-summary {input.cutadapt} > {output}"

444
445
446
447
448
449
450
rule collectstats:
    """Collect all stats for a particular sample"""
    input:
        mnum = rules.mapped_reads_bases.output.reads,
        mbnum = rules.mapped_reads_bases.output.bases,
        unum = rules.unique_reads_bases.output.reads,
        ubnum = rules.unique_reads_bases.output.bases,
        # Coverage stats only apply when a targetsfile is configured
        cov = rules.covstats.output.covj if "targetsfile" in config else [],
        cutadapt = rules.collect_cutadapt_summary.output,
        colpy = config["collect_stats"]
    params:
        fthresh = config["female_threshold"]
    output: "{sample}/{sample}.stats.json"
    container: containers["python3"]
    shell: "python {input.colpy} --sample-name {wildcards.sample} "
           "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
           "--unique-num {input.unum} --usable-basenum {input.ubnum} "
           "--female-threshold {params.fthresh} "
           "--cutadapt {input.cutadapt} "
           "--covstats {input.cov} > {output}"
464

van den Berg's avatar
van den Berg committed
465
466
467
rule multiple_metrics:
    """Run picard CollectMultipleMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
    params:
        # Picard appends the metric-specific suffixes to this prefix
        prefix = "{sample}/bams/{sample}",
    output:
        alignment = "{sample}/bams/{sample}.alignment_summary_metrics",
        insertMetrics = "{sample}/bams/{sample}.insert_size_metrics"
    container: containers["picard"]
    shell: "picard -Xmx8G -XX:CompressedClassSpaceSize=256m CollectMultipleMetrics "
           "I={input.bam} O={params.prefix} "
           "R={input.ref} "
           "PROGRAM=CollectAlignmentSummaryMetrics "
           "PROGRAM=CollectInsertSizeMetrics "

482
483
484
485
486
487
488
rule bed_to_interval:
    """Run picard BedToIntervalList

    This is needed to convert the bed files for the capture kit to a format
    picard can read
    """
    input:
van den Berg's avatar
van den Berg committed
489
        targets = config.get("targetsfile",""),
490
491
492
493
494
495
496
        baits = config.get("baitsfile",""),
        ref = config["reference"]
    output:
        target_interval = "target.interval",
        baits_interval = "baits.interval"
    output:
    container: containers["picard"]
497
    shell: "picard -Xmx4G BedToIntervalList "
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
            "I={input.targets} O={output.target_interval} "
            "SD={input.ref} && "
            "picard BedToIntervalList "
            "I={input.baits} O={output.baits_interval} "
            "SD={input.ref} "

rule hs_metrics:
    """Run picard CollectHsMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
        targets = rules.bed_to_interval.output.target_interval,
        baits = rules.bed_to_interval.output.baits_interval,
    output: "{sample}/bams/{sample}.hs_metrics.txt"
    container: containers["picard"]
    shell: "picard -Xmx8G -XX:CompressedClassSpaceSize=256m CollectHsMetrics "
           "I={input.bam} O={output} "
           "R={input.ref} "
           "BAIT_INTERVALS={input.baits} "
           "TARGET_INTERVALS={input.targets}"

Sander Bollen's avatar
Sander Bollen committed
519
520
521
rule multiqc:
    """
    Create multiQC report
    Depends on stats.tsv to forcefully run at end of pipeline
    """
    # NOTE(review): the docstring mentions stats.tsv but it is not listed as an
    # input here — confirm the intended ordering dependency
    input:
        bam = expand("{sample}/bams/{sample}.bam", sample=config["samples"]),
        metric = expand("{sample}/bams/{sample}.metrics",
                        sample=config["samples"]),
        alignment_metrics = expand(
                "{sample}/bams/{sample}.alignment_summary_metrics",
                sample=config["samples"]
        ),
        insert_metrics = expand(
                "{sample}/bams/{sample}.insert_size_metrics",
                sample=config["samples"]
        ),
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/.done"
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/.done"
                      for read_group, sample in get_readgroup_per_sample()),

        hs_metric = expand("{sample}/bams/{sample}.hs_metrics.txt",
                           sample=config["samples"]) if "baitsfile" in config else []

    output:
        html = "multiqc_report/multiqc_report.html",
        insertSize = "multiqc_report/multiqc_data/multiqc_picard_insertSize.json",
        AlignmentMetrics = "multiqc_report/multiqc_data/multiqc_picard_AlignmentSummaryMetrics.json",
        DuplicationMetrics = "multiqc_report/multiqc_data/multiqc_picard_dups.json",
        HsMetrics = "multiqc_report/multiqc_data/multiqc_picard_HsMetrics.json" if "baitsfile" in config else []
    container: containers["multiqc"]
    # "|| touch {output}" creates empty outputs when multiqc itself fails, so
    # downstream rules can still run
    shell: "multiqc --data-format json --force --outdir multiqc_report . "
           "|| touch {output}"
554
555
556
557
558
559
560

rule merge_stats:
    """Merge all stats of all samples"""
    input:
        cols = expand("{sample}/{sample}.stats.json",
                      sample=config['samples']),
        mpy = config["merge_stats"],
        insertSize = rules.multiqc.output.insertSize,
        AlignmentMetrics = rules.multiqc.output.AlignmentMetrics,
        DuplicationMetrics = rules.multiqc.output.DuplicationMetrics,
        HsMetrics = rules.multiqc.output.HsMetrics
    output: "stats.json"
    container: containers["vtools"]
    shell: "python {input.mpy} --collectstats {input.cols} "
           "--picard-insertSize {input.insertSize} "
           "--picard-AlignmentMetrics {input.AlignmentMetrics} "
           "--picard-DuplicationMetrics {input.DuplicationMetrics} "
           "--picard-HsMetrics {input.HsMetrics} > {output}"
572
573
574
575
576
577
578
579
580
581

rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
        stats = rules.merge_stats.output,
        sc = config["stats_to_tsv"]
    output: "stats.tsv"
    container: containers["python3"]
    shell: "python {input.sc} -i {input.stats} > {output}"

582
583
584
585
586
587

rule gvcf2coverage:
    """ Determine coverage from gvcf files """
    input: rules.gvcf_gather.output.gvcf
    output: "{sample}/vcf/{sample}_{threshold}.bed"
    container: containers["gvcf2coverage"]
    # Keep only chrom, start, end columns of the coverage bed
    shell: "gvcf2coverage -t {wildcards.threshold} < {input} | cut -f 1,2,3 > {output}"