#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
Main Snakefile for the pipeline.

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

van den Berg's avatar
van den Berg committed
31
# Read the json schema
32
with open(srcdir('config/schema.json'), 'rt') as fin:
van den Berg's avatar
van den Berg committed
33
34
    schema = json.load(fin)

van den Berg's avatar
van den Berg committed
35
# Validate the config against the schema
van den Berg's avatar
van den Berg committed
36
try:
van den Berg's avatar
van den Berg committed
37
    jsonschema.validate(config, schema)
van den Berg's avatar
van den Berg committed
38
except jsonschema.ValidationError as e:
van den Berg's avatar
van den Berg committed
39
    raise jsonschema.ValidationError(f'Invalid --configfile: {e.message}')
van den Berg's avatar
van den Berg committed
40

van den Berg's avatar
van den Berg committed
41
42
43
44
45
# If you specify a baitsfile, you also have to specify a targets file for
# picard
if "baitsfile" in config and "targetsfile" not in config:
    msg = 'Invalid --configfile: "baitsfile" specified without "targetsfile"'
    raise jsonschema.ValidationError(msg)
46

47
48
49
50
51
52
53
# A sample name cannot be a substring of another sample, since that breaks picard
# metrics parsing by multiqc
msg = 'Invalid --configfile: sample names should not overlap ("{s1}" is contained in "{s2}")'
for s1, s2 in itertools.permutations(config['samples'], 2):
    if s1 in s2:
        raise jsonschema.ValidationError(msg.format(s1=s1, s2=s2))

54
55
# Set default values
def set_default(key, value):
van den Berg's avatar
van den Berg committed
56
    """Set default config values"""
van den Berg's avatar
van den Berg committed
57
58
    if key not in config:
        config[key] = value
van den Berg's avatar
van den Berg committed
59

van den Berg's avatar
van den Berg committed
60
# Set the default config
van den Berg's avatar
van den Berg committed
61
62
63
64
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Set the script paths
65
66
67
68
69
70
set_default("covstats", srcdir("src/covstats.py"))
set_default("collect_stats", srcdir("src/collect_stats.py"))
set_default("merge_stats", srcdir("src/merge_stats.py"))
set_default("stats_to_tsv", srcdir("src/stats_to_tsv.py"))
set_default("py_wordcount", srcdir("src/pywc.py"))
set_default("cutadapt_summary", srcdir("src/cutadapt_summary.py"))
# Container images used by the rules below, keyed by (tool, version).
# The "mulled" images bundle multiple tools (see key names for contents).
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
    "bwa-0.7.17-picard-2.22.8": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:76d16eabff506ac13338d7f14644a0ad301b9d7e-0",
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
    "debian": "docker://debian:buster-slim",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "gvcf2coverage": "docker://lumc/gvcf2coverage:0.1-dirty-2",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.8--py_2",
    "picard": "docker://quay.io/biocontainers/picard:2.22.8--0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}

89
def get_readgroup(wildcards):
90
    return config["samples"][wildcards.sample]["read_groups"]
Sander Bollen's avatar
Sander Bollen committed
91

van den Berg's avatar
van den Berg committed
92
def get_readgroup_per_sample():
van den Berg's avatar
van den Berg committed
93
    for sample in config["samples"]:
94
        for rg in config["samples"][sample]["read_groups"]:
van den Berg's avatar
van den Berg committed
95
96
            yield rg, sample

def coverage_stats(wildcards):
    """Return the per-sample refFlat coverage files, or an empty list when
    no refflat file was configured."""
    if "refflat" not in config:
        return []
    return expand("{sample}/coverage/refFlat_coverage.tsv",
                  sample=config["samples"])

Sander Bollen's avatar
Sander Bollen committed
102
103
rule all:
    input:
van den Berg's avatar
van den Berg committed
104
        multiqc = "multiqc_report/multiqc_report.html",
105
106
        stats = "stats.json",
        stats_tsv = "stats.tsv",
van den Berg's avatar
van den Berg committed
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130

        bam = expand("{sample}/bams/{sample}.bam",
                     sample=config["samples"]),

        vcfs = expand("{sample}/vcf/{sample}.vcf.gz",
                      sample=config["samples"]),

        vcf_tbi = expand("{sample}/vcf/{sample}.vcf.gz.tbi",
                         sample=config["samples"]),

        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz",
                       sample=config["samples"]),

        gvcf_tbi = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi",
                          sample=config["samples"]),

        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        cutadapt = (f"{sample}/pre_process/{sample}-{read_group}.txt"
                    for read_group, sample in get_readgroup_per_sample()),
van den Berg's avatar
van den Berg committed
131
        coverage_stats = coverage_stats,
132
133
134
135
        coverage_files = (f"{sample}/vcf/{sample}_{threshold}.bed"
                          for sample, threshold in itertools.product(
                              config['samples'], config['coverage_threshold'])
                          ) if 'coverage_threshold' in config else []
Sander Bollen's avatar
Sander Bollen committed
136

137
138
139
140
141
142
rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
    output: directory("tmp")
    container: containers["debian"]
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
143
rule genome:
Sander Bollen's avatar
Sander Bollen committed
144
    """Create genome file as used by bedtools"""
van den Berg's avatar
van den Berg committed
145
    input: config["reference"]
Sander Bollen's avatar
Sander Bollen committed
146
    output: "current.genome"
van den Berg's avatar
van den Berg committed
147
    container: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
148
149
150
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
151
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
152
    input:
153
154
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
Sander Bollen's avatar
Sander Bollen committed
155
    output:
van den Berg's avatar
van den Berg committed
156
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
van den Berg's avatar
van den Berg committed
157
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
158
159
    log:
        "{sample}/pre_process/{sample}-{read_group}.txt"
van den Berg's avatar
van den Berg committed
160
    container: containers["cutadapt"]
van den Berg's avatar
van den Berg committed
161
162
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
van den Berg's avatar
van den Berg committed
163
           "--compression-level=1 -j 8 "
van den Berg's avatar
van den Berg committed
164
           "--output {output.r1} --paired-output {output.r2} -Z "
van den Berg's avatar
van den Berg committed
165
           "{input.r1} {input.r2} "
166
           "--report=minimal > {log}"
Sander Bollen's avatar
Sander Bollen committed
167
168

rule align:
Sander Bollen's avatar
Sander Bollen committed
169
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
170
    input:
171
172
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2,
van den Berg's avatar
van den Berg committed
173
        ref = config["reference"],
174
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
175
    params:
176
        rg = "@RG\\tID:{sample}-library-{read_group}\\tSM:{sample}\\tLB:library\\tPL:ILLUMINA"
177
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
van den Berg's avatar
van den Berg committed
178
    container: containers["bwa-0.7.17-picard-2.22.8"]
Sander Bollen's avatar
Sander Bollen committed
179
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
180
181
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
182
183
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

184
def markdup_bam_input(wildcards):
van den Berg's avatar
van den Berg committed
185
186
187
188
    """Generate the INPUT for each bam file """
    return ["INPUT={sample}/bams/{sample}-{read_group}.sorted.bam".format(
                sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)]
189

Sander Bollen's avatar
Sander Bollen committed
190
rule markdup:
Sander Bollen's avatar
Sander Bollen committed
191
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
192
    input:
193
        bam = lambda wildcards:
van den Berg's avatar
van den Berg committed
194
195
196
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
197
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
198
    output:
199
200
201
        bam = "{sample}/bams/{sample}.bam",
        bai = "{sample}/bams/{sample}.bai",
        metrics = "{sample}/bams/{sample}.metrics"
202
203
    params:
        bams=markdup_bam_input
van den Berg's avatar
van den Berg committed
204
    container: containers["picard"]
205
206
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
207
           "{params.bams} OUTPUT={output.bam} "
Sander Bollen's avatar
Sander Bollen committed
208
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
209
210
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
        bai = rules.markdup.output.bai
    output:
        bai = "{sample}/bams/{sample}.bam.bai"
    container: containers["debian"]
    shell: "cp {input.bai} {output.bai}"
220
def bqsr_bam_input(wildcards):
van den Berg's avatar
van den Berg committed
221
222
223
    """Generate the bam input string for each read group for BQSR"""
    template = "-I {sample}/bams/{sample}-{read_group}.sorted.bam"
    return " ".join([template.format(sample=wildcards.sample,
224
225
            read_group=rg) for rg in get_readgroup(wildcards)])

Sander Bollen's avatar
Sander Bollen committed
226
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
227
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
228
    input:
229
        bam = lambda wildcards:
van den Berg's avatar
van den Berg committed
230
231
232
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
van den Berg's avatar
van den Berg committed
233
234
        ref = config["reference"],
        vcfs = config["known_sites"]
235
    output: "{sample}/bams/{sample}.baserecal.grp"
236
    params:
van den Berg's avatar
van den Berg committed
237
238
239
        known_sites = " ".join(
                expand("-knownSites {vcf}", vcf=config["known_sites"])
        ),
240
        region = "-L "+ config["restrict_BQSR"] if "restrict_BQSR" in config else "",
241
        bams = bqsr_bam_input
van den Berg's avatar
van den Berg committed
242
    container: containers["gatk"]
243
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
244
           "BaseRecalibrator {params.bams} -o {output} -nct 8 "
Sander Bollen's avatar
Sander Bollen committed
245
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
246
           "{params.region} "
247
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites}"
Sander Bollen's avatar
Sander Bollen committed
248

249
checkpoint scatterregions:
van den Berg's avatar
van den Berg committed
250
251
    """Scatter the reference genome"""
    input:
van den Berg's avatar
van den Berg committed
252
        ref = config["reference"],
253
    params:
van den Berg's avatar
van den Berg committed
254
        size = config['scatter_size']
van den Berg's avatar
van den Berg committed
255
    output:
256
        directory("scatter")
van den Berg's avatar
van den Berg committed
257
    container: containers["biopet-scatterregions"]
258
    shell: "mkdir -p {output} && "
259
           "biopet-scatterregions -Xmx24G "
260
           "--referenceFasta {input.ref} --scatterSize {params.size} "
van den Berg's avatar
van den Berg committed
261
262
           "--outputDir scatter"

Sander Bollen's avatar
Sander Bollen committed
263
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
264
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
265
    input:
266
        bam = rules.markdup.output.bam,
267
        bqsr = rules.baserecal.output,
van den Berg's avatar
van den Berg committed
268
269
270
        dbsnp = config["dbsnp"],
        ref = config["reference"],
        region = "scatter/scatter-{chunk}.bed"
Sander Bollen's avatar
Sander Bollen committed
271
    output:
van den Berg's avatar
van den Berg committed
272
273
        gvcf = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
274
    wildcard_constraints:
van den Berg's avatar
van den Berg committed
275
        chunk = "[0-9]+"
van den Berg's avatar
van den Berg committed
276
    container: containers["gatk"]
277
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
278
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
279
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
van den Berg's avatar
van den Berg committed
280
           "-L '{input.region}' -o '{output.gvcf}' "
Sander Bollen's avatar
Sander Bollen committed
281
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
282
283
284
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 "
Sander Bollen's avatar
Sander Bollen committed
285

def aggregate_gvcf(wildcards):
    """Return every per-chunk gvcf for this sample.

    The chunk numbers only exist after the scatterregions checkpoint has
    run, hence the checkpoint lookup instead of a static expand.
    """
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    # NOTE(review): relies on `os` being present in the Snakefile namespace
    # (only os.path.join/basename are imported explicitly) — confirm.
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
def aggregate_gvcf_tbi(wildcards):
    """Return the tbi index for every per-chunk gvcf of this sample
    (mirror of aggregate_gvcf)."""
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule gvcf_gather:
van den Berg's avatar
van den Berg committed
297
    """Join the gvcf files together"""
298
299
300
301
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
302
303
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
van den Berg's avatar
van den Berg committed
304
305
306
    container: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps "
           "--output {output.gvcf} --output-type z && "
307
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
308
309

rule genotype_scatter:
    """Run GATK's GenotypeGVCFs by chunk"""
    input:
        gvcf = rules.gvcf_scatter.output.gvcf,
        tbi = rules.gvcf_scatter.output.gvcf_tbi,
        ref= config["reference"]
    output:
        # temp(): per-chunk vcfs are deleted after genotype_gather merges them
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
    wildcard_constraints:
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 "
           "/usr/GenomeAnalysisTK.jar -T "
           "GenotypeGVCFs -R {input.ref} "
           "-V {input.gvcf} -o '{output.vcf}'"
def aggregate_vcf(wildcards):
    """Return every per-chunk genotyped vcf for this sample (chunk numbers
    come from the scatterregions checkpoint)."""
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
def aggregate_vcf_tbi(wildcards):
    """Return the tbi index for every per-chunk vcf of this sample
    (mirror of aggregate_vcf)."""
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
Sander Bollen's avatar
Sander Bollen committed
336
rule genotype_gather:
337
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
338
    input:
339
        vcfs = aggregate_vcf,
340
        vcfs_tbi = aggregate_vcf_tbi
Sander Bollen's avatar
Sander Bollen committed
341
    output:
342
343
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
van den Berg's avatar
van den Berg committed
344
345
346
    container: containers["bcftools"]
    shell: "bcftools concat {input.vcfs} --allow-overlaps "
           "--output {output.vcf} --output-type z && "
347
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
348

Sander Bollen's avatar
Sander Bollen committed
349
## bam metrics
350
rule mapped_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
351
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
352
    input:
353
354
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
355
    output:
van den Berg's avatar
van den Berg committed
356
357
358
        reads = "{sample}/bams/{sample}.mapped.num",
        bases = "{sample}/bams/{sample}.mapped.basenum"
    container: containers["samtools-1.7-python-3.6"]
359
360
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
361

rule unique_reads_bases:
    """Calculate number of unique reads"""
    input:
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.unique.num",
        bases = "{sample}/bams/{sample}.usable.basenum"
    container: containers["samtools-1.7-python-3.6"]
    # -F 4 drops unmapped reads, -F 1024 additionally drops PCR/optical
    # duplicates flagged by markdup.
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | "
           "python {input.pywc} --reads {output.reads} --bases {output.bases}"
## fastqc
rule fastqc_raw:
    """Run fastqc on raw fastq files"""
    input:
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    output:
        directory("{sample}/pre_process/raw-{sample}-{read_group}/")
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
385
rule fastqc_postqc:
van den Berg's avatar
van den Berg committed
386
    """Run fastqc on fastq files post pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
387
    input:
388
389
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2
Sander Bollen's avatar
Sander Bollen committed
390
    output:
van den Berg's avatar
van den Berg committed
391
        directory("{sample}/pre_process/trimmed-{sample}-{read_group}/")
van den Berg's avatar
van den Berg committed
392
    container: containers["fastqc"]
van den Berg's avatar
van den Berg committed
393
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
394

## coverage
rule covstats:
    """Calculate coverage statistics on bam file"""
    input:
        bam = rules.markdup.output.bam,
        genome = "current.genome",
        covpy = config["covstats"],
        # Empty string when no targetsfile is configured; collectstats only
        # requests this rule's output when "targetsfile" is set.
        bed = config.get("targetsfile","")
    params:
        subt = "Sample {sample}"
    output:
        covj = "{sample}/coverage/covstats.json",
        covp = "{sample}/coverage/covstats.png"
    container: containers["bedtools-2.26-python-2.7"]
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
414
415
416
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
417
418
        gvcf = rules.gvcf_gather.output.gvcf,
        tbi = rules.gvcf_gather.output.gvcf_tbi,
van den Berg's avatar
van den Berg committed
419
        ref = config.get('refflat', "")
Sander Bollen's avatar
Sander Bollen committed
420
    output:
van den Berg's avatar
van den Berg committed
421
422
        tsv = "{sample}/coverage/refFlat_coverage.tsv"
    container: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
423
424
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"

425
rule collect_cutadapt_summary:
van den Berg's avatar
van den Berg committed
426
    """Colect cutadapt summary from each readgroup per sample """
427
428
    input:
        cutadapt = lambda wildcards:
van den Berg's avatar
van den Berg committed
429
430
431
        ("{sample}/pre_process/{sample}-{read_group}.txt".format(
            sample=wildcards.sample, read_group=read_group)
            for read_group in get_readgroup(wildcards)),
432
433
        cutadapt_summary= config["cutadapt_summary"]
    output: "{sample}/cutadapt.json"
van den Berg's avatar
van den Berg committed
434
    container: containers["python3"]
435
436
437
    shell: "python {input.cutadapt_summary} --sample {wildcards.sample} "
           "--cutadapt-summary {input.cutadapt} > {output}"

rule collectstats:
    """Collect all stats for a particular sample"""
    input:
        mnum = rules.mapped_reads_bases.output.reads,
        mbnum = rules.mapped_reads_bases.output.bases,
        unum = rules.unique_reads_bases.output.reads,
        ubnum = rules.unique_reads_bases.output.bases,
        # Coverage stats only exist when a targetsfile was configured
        cov = rules.covstats.output.covj if "targetsfile" in config else [],
        cutadapt = rules.collect_cutadapt_summary.output,
        colpy = config["collect_stats"]
    params:
        fthresh = config["female_threshold"]
    output: "{sample}/{sample}.stats.json"
    container: containers["python3"]
    shell: "python {input.colpy} --sample-name {wildcards.sample} "
           "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
           "--unique-num {input.unum} --usable-basenum {input.ubnum} "
           "--female-threshold {params.fthresh} "
           "--cutadapt {input.cutadapt} "
           "--covstats {input.cov} > {output}"
van den Berg's avatar
van den Berg committed
459
460
461
rule multiple_metrics:
    """Run picard CollectMultipleMetrics"""
    input:
462
        bam = rules.markdup.output.bam,
van den Berg's avatar
van den Berg committed
463
464
        ref = config["reference"],
    params:
van den Berg's avatar
van den Berg committed
465
        prefix = "{sample}/bams/{sample}",
van den Berg's avatar
van den Berg committed
466
    output:
van den Berg's avatar
van den Berg committed
467
        alignment = "{sample}/bams/{sample}.alignment_summary_metrics",
468
        insertMetrics = "{sample}/bams/{sample}.insert_size_metrics"
van den Berg's avatar
van den Berg committed
469
    container: containers["picard"]
470
    shell: "picard -Xmx8G -XX:CompressedClassSpaceSize=256m CollectMultipleMetrics "
van den Berg's avatar
van den Berg committed
471
472
473
474
475
           "I={input.bam} O={params.prefix} "
           "R={input.ref} "
           "PROGRAM=CollectAlignmentSummaryMetrics "
           "PROGRAM=CollectInsertSizeMetrics "

476
477
478
479
480
481
482
rule bed_to_interval:
    """Run picard BedToIntervalList

    This is needed to convert the bed files for the capture kit to a format
    picard can read
    """
    input:
van den Berg's avatar
van den Berg committed
483
        targets = config.get("targetsfile",""),
484
485
486
487
488
489
490
        baits = config.get("baitsfile",""),
        ref = config["reference"]
    output:
        target_interval = "target.interval",
        baits_interval = "baits.interval"
    output:
    container: containers["picard"]
491
    shell: "picard -Xmx4G BedToIntervalList "
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
            "I={input.targets} O={output.target_interval} "
            "SD={input.ref} && "
            "picard BedToIntervalList "
            "I={input.baits} O={output.baits_interval} "
            "SD={input.ref} "

rule hs_metrics:
    """Run picard CollectHsMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
        targets = rules.bed_to_interval.output.target_interval,
        baits = rules.bed_to_interval.output.baits_interval,
    output: "{sample}/bams/{sample}.hs_metrics.txt"
    container: containers["picard"]
    shell: "picard -Xmx8G -XX:CompressedClassSpaceSize=256m CollectHsMetrics "
           "I={input.bam} O={output} "
           "R={input.ref} "
           "BAIT_INTERVALS={input.baits} "
           "TARGET_INTERVALS={input.targets}"
Sander Bollen's avatar
Sander Bollen committed
513
514
515
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
516
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
517
518
    """
    input:
van den Berg's avatar
van den Berg committed
519
520
521
522
523
524
525
526
527
528
529
        bam = expand("{sample}/bams/{sample}.bam", sample=config["samples"]),
        metric = expand("{sample}/bams/{sample}.metrics",
                        sample=config["samples"]),
        alignment_metrics = expand(
                "{sample}/bams/{sample}.alignment_summary_metrics",
                sample=config["samples"]
        ),
        insert_metrics = expand(
                "{sample}/bams/{sample}.insert_size_metrics",
                sample=config["samples"]
        ),
530
        fastqc_raw = (directory(f"{sample}/pre_process/raw-{sample}-{read_group}/")
531
532
                      for read_group, sample in get_readgroup_per_sample()),

533
        fastqc_trim = (directory(f"{sample}/pre_process/trimmed-{sample}-{read_group}/")
534
535
536
537
538
                      for read_group, sample in get_readgroup_per_sample()),

        hs_metric = expand("{sample}/bams/{sample}.hs_metrics.txt",
                           sample=config["samples"]) if "baitsfile" in config else []

539
540
    output:
        html = "multiqc_report/multiqc_report.html",
541
        insertSize = "multiqc_report/multiqc_data/multiqc_picard_insertSize.json",
542
        AlignmentMetrics = "multiqc_report/multiqc_data/multiqc_picard_AlignmentSummaryMetrics.json",
543
        DuplicationMetrics = "multiqc_report/multiqc_data/multiqc_picard_dups.json",
544
        HsMetrics = "multiqc_report/multiqc_data/multiqc_picard_HsMetrics.json" if "baitsfile" in config else []
van den Berg's avatar
van den Berg committed
545
546
547
    container: containers["multiqc"]
    shell: "multiqc --data-format json --force --outdir multiqc_report . "
           "|| touch {output}"
548
549
550
551
552
553
554

rule merge_stats:
    """Merge all stats of all samples"""
    input:
        cols = expand("{sample}/{sample}.stats.json",
                      sample=config['samples']),
        mpy = config["merge_stats"],
        # The multiqc json exports feed the picard sections of the merge
        insertSize = rules.multiqc.output.insertSize,
        AlignmentMetrics = rules.multiqc.output.AlignmentMetrics,
        DuplicationMetrics = rules.multiqc.output.DuplicationMetrics,
        HsMetrics = rules.multiqc.output.HsMetrics
    output: "stats.json"
    # NOTE(review): runs in the vtools container, presumably for its Python
    # environment — confirm the merge script's dependencies.
    container: containers["vtools"]
    shell: "python {input.mpy} --collectstats {input.cols} "
           "--picard-insertSize {input.insertSize} "
           "--picard-AlignmentMetrics {input.AlignmentMetrics} "
           "--picard-DuplicationMetrics {input.DuplicationMetrics} "
           "--picard-HsMetrics {input.HsMetrics} > {output}"
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
        stats = rules.merge_stats.output,
        sc = config["stats_to_tsv"]
    output: "stats.tsv"
    container: containers["python3"]
    shell: "python {input.sc} -i {input.stats} > {output}"
rule gvcf2coverage:
    """ Determine coverage from gvcf files """
    input: rules.gvcf_gather.output.gvcf
    output: "{sample}/vcf/{sample}_{threshold}.bed"
    container: containers["gvcf2coverage"]
    # Keep only chrom/start/end columns of the regions at/above threshold
    shell: "gvcf2coverage -t {wildcards.threshold} < {input} | cut -f 1,2,3 > {output}"