Snakefile 24.4 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
#   hutspot - a DNAseq variant calling pipeline
van den Berg's avatar
van den Berg committed
2
#   Copyright (C) 2017-2019, Leiden University Medical Center
Sander Bollen's avatar
Sander Bollen committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

van den Berg's avatar
van den Berg committed
31
# Load the json schema that describes a valid configuration
with open(srcdir('config/schema.json'), 'rt') as fin:
    schema = json.load(fin)

# Reject the pipeline configuration early when it does not match the schema
try:
    jsonschema.validate(config, schema)
except jsonschema.ValidationError as e:
    raise jsonschema.ValidationError(f'Invalid --configfile: {e.message}')

# If you specify a baitsfile, you also have to specify a targets file for
# picard
if "baitsfile" in config and "targetsfile" not in config:
    msg = 'Invalid --configfile: "baitsfile" specified without "targetsfile"'
    raise jsonschema.ValidationError(msg)

# A sample name cannot be a substring of another sample, since that breaks
# picard metrics parsing by multiqc
msg = 'Invalid --configfile: sample names should not overlap ("{s1}" is contained in "{s2}")'
for first, second in itertools.permutations(config['samples'], 2):
    if first in second:
        raise jsonschema.ValidationError(msg.format(s1=first, s2=second))

54
55
def set_default(key, value):
    """Store *value* under *key* unless the config already defines it."""
    config.setdefault(key, value)

# Default numeric settings
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Default locations of the helper scripts shipped with the pipeline
set_default("covstats", srcdir("src/covstats.py"))
set_default("collect_stats", srcdir("src/collect_stats.py"))
set_default("merge_stats", srcdir("src/merge_stats.py"))
set_default("stats_to_tsv", srcdir("src/stats_to_tsv.py"))
set_default("py_wordcount", srcdir("src/pywc.py"))
set_default("cutadapt_summary", srcdir("src/cutadapt_summary.py"))
Sander Bollen's avatar
Sander Bollen committed
71

72
73
74
# Pinned container images for every tool used by the pipeline
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
    "bwa-0.7.17-picard-2.22.8": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:76d16eabff506ac13338d7f14644a0ad301b9d7e-0",
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
    "debian": "docker://debian:buster-slim",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "gvcf2coverage": "docker://lumc/gvcf2coverage:0.1-dirty-2",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.8--py_2",
    "picard": "docker://quay.io/biocontainers/picard:2.22.8--0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0",
}
Sander Bollen's avatar
Sander Bollen committed
88

89
def get_readgroup(wildcards):
    """Return the read groups defined for the sample in *wildcards*."""
    sample = config["samples"][wildcards.sample]
    return sample["read_groups"]
Sander Bollen's avatar
Sander Bollen committed
91

van den Berg's avatar
van den Berg committed
92
def get_readgroup_per_sample():
    """Yield (read_group, sample) pairs for every read group of every sample."""
    for sample, properties in config["samples"].items():
        for read_group in properties["read_groups"]:
            yield read_group, sample

97
def coverage_stats(wildcards):
    """Return the refFlat coverage files, or [] when no refflat is configured."""
    if "refflat" not in config:
        return []
    return expand("{sample}/coverage/refFlat_coverage.tsv",
                  sample=config["samples"])
Sander Bollen's avatar
Sander Bollen committed
101

Sander Bollen's avatar
Sander Bollen committed
102
103
rule all:
    input:
van den Berg's avatar
van den Berg committed
104
        multiqc = "multiqc_report/multiqc_report.html",
105
106
        stats = "stats.json",
        stats_tsv = "stats.tsv",
van den Berg's avatar
van den Berg committed
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122

        bam = expand("{sample}/bams/{sample}.bam",
                     sample=config["samples"]),

        vcfs = expand("{sample}/vcf/{sample}.vcf.gz",
                      sample=config["samples"]),

        vcf_tbi = expand("{sample}/vcf/{sample}.vcf.gz.tbi",
                         sample=config["samples"]),

        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz",
                       sample=config["samples"]),

        gvcf_tbi = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi",
                          sample=config["samples"]),

123
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/.done"
van den Berg's avatar
van den Berg committed
124
125
                      for read_group, sample in get_readgroup_per_sample()),

126
        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/.done"
van den Berg's avatar
van den Berg committed
127
128
129
130
                      for read_group, sample in get_readgroup_per_sample()),

        cutadapt = (f"{sample}/pre_process/{sample}-{read_group}.txt"
                    for read_group, sample in get_readgroup_per_sample()),
van den Berg's avatar
van den Berg committed
131
        coverage_stats = coverage_stats,
132
133
134
135
        coverage_files = (f"{sample}/vcf/{sample}_{threshold}.bed"
                          for sample, threshold in itertools.product(
                              config['samples'], config['coverage_threshold'])
                          ) if 'coverage_threshold' in config else []
Sander Bollen's avatar
Sander Bollen committed
136

137
138
139
rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
    output: directory("tmp")
140
    log: "log/create_markdup_tmp.log"
141
    container: containers["debian"]
142
    shell: "mkdir -p {output} 2> {log}"
143

Sander Bollen's avatar
Sander Bollen committed
144
rule genome:
Sander Bollen's avatar
Sander Bollen committed
145
    """Create genome file as used by bedtools"""
van den Berg's avatar
van den Berg committed
146
    input: config["reference"]
Sander Bollen's avatar
Sander Bollen committed
147
    output: "current.genome"
148
    log: "log/genome.log"
van den Berg's avatar
van den Berg committed
149
    container: containers["debian"]
150
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output} 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
151
152

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
153
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
154
    input:
155
156
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
Sander Bollen's avatar
Sander Bollen committed
157
    output:
van den Berg's avatar
van den Berg committed
158
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
van den Berg's avatar
van den Berg committed
159
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
160
161
    log:
        "{sample}/pre_process/{sample}-{read_group}.txt"
van den Berg's avatar
van den Berg committed
162
    container: containers["cutadapt"]
van den Berg's avatar
van den Berg committed
163
164
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
van den Berg's avatar
van den Berg committed
165
           "--compression-level=1 -j 8 "
van den Berg's avatar
van den Berg committed
166
           "--output {output.r1} --paired-output {output.r2} -Z "
van den Berg's avatar
van den Berg committed
167
           "{input.r1} {input.r2} "
168
           "--report=minimal > {log}"
Sander Bollen's avatar
Sander Bollen committed
169
170

rule align:
Sander Bollen's avatar
Sander Bollen committed
171
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
172
    input:
173
174
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2,
van den Berg's avatar
van den Berg committed
175
        ref = config["reference"],
176
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
177
    params:
178
        rg = "@RG\\tID:{sample}-library-{read_group}\\tSM:{sample}\\tLB:library\\tPL:ILLUMINA"
179
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
180
    log: "log/{sample}/align.{read_group}.log"
van den Berg's avatar
van den Berg committed
181
    container: containers["bwa-0.7.17-picard-2.22.8"]
Sander Bollen's avatar
Sander Bollen committed
182
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
183
184
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
185
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
186

187
def markdup_bam_input(wildcards):
    """Generate the INPUT= argument for every read-group bam of a sample."""
    template = "INPUT={sample}/bams/{sample}-{read_group}.sorted.bam"
    return [template.format(sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)]
192

Sander Bollen's avatar
Sander Bollen committed
193
rule markdup:
Sander Bollen's avatar
Sander Bollen committed
194
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
195
    input:
196
        bam = lambda wildcards:
van den Berg's avatar
van den Berg committed
197
198
199
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
200
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
201
    output:
202
203
204
        bam = "{sample}/bams/{sample}.bam",
        bai = "{sample}/bams/{sample}.bai",
        metrics = "{sample}/bams/{sample}.metrics"
205
    log: "log/{sample}/markdup.log"
206
207
    params:
        bams=markdup_bam_input
van den Berg's avatar
van den Berg committed
208
    container: containers["picard"]
209
210
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
211
           "{params.bams} OUTPUT={output.bam} "
Sander Bollen's avatar
Sander Bollen committed
212
           "METRICS_FILE={output.metrics} "
213
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
214

215
def bqsr_bam_input(wildcards):
    """Generate the "-I <bam>" arguments for every read group, for BQSR."""
    template = "-I {sample}/bams/{sample}-{read_group}.sorted.bam"
    parts = (template.format(sample=wildcards.sample, read_group=rg)
             for rg in get_readgroup(wildcards))
    return " ".join(parts)

Sander Bollen's avatar
Sander Bollen committed
221
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
222
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
223
    input:
224
        bam = lambda wildcards:
van den Berg's avatar
van den Berg committed
225
226
227
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
van den Berg's avatar
van den Berg committed
228
229
        ref = config["reference"],
        vcfs = config["known_sites"]
230
    output: "{sample}/bams/{sample}.baserecal.grp"
231
    log: "log/{sample}/baserecal.log"
232
    params:
van den Berg's avatar
van den Berg committed
233
234
235
        known_sites = " ".join(
                expand("-knownSites {vcf}", vcf=config["known_sites"])
        ),
236
        region = "-L "+ config["restrict_BQSR"] if "restrict_BQSR" in config else "",
237
        bams = bqsr_bam_input
van den Berg's avatar
van den Berg committed
238
    container: containers["gatk"]
239
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
240
           "BaseRecalibrator {params.bams} -o {output} -nct 8 "
Sander Bollen's avatar
Sander Bollen committed
241
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
242
243
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites} "
           "{params.region} 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
244

245
checkpoint scatterregions:
van den Berg's avatar
van den Berg committed
246
247
    """Scatter the reference genome"""
    input:
van den Berg's avatar
van den Berg committed
248
        ref = config["reference"],
249
    params:
van den Berg's avatar
van den Berg committed
250
        size = config['scatter_size']
van den Berg's avatar
van den Berg committed
251
    output:
252
        directory("scatter")
253
    log: "log/scatterregions.log"
van den Berg's avatar
van den Berg committed
254
    container: containers["biopet-scatterregions"]
255
    shell: "mkdir -p {output} && "
256
           "biopet-scatterregions -Xmx24G "
257
           "--referenceFasta {input.ref} --scatterSize {params.size} "
258
           "--outputDir scatter 2> {log}"
van den Berg's avatar
van den Berg committed
259

Sander Bollen's avatar
Sander Bollen committed
260
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
261
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
262
    input:
263
        bam = rules.markdup.output.bam,
264
        bqsr = rules.baserecal.output,
van den Berg's avatar
van den Berg committed
265
266
267
        dbsnp = config["dbsnp"],
        ref = config["reference"],
        region = "scatter/scatter-{chunk}.bed"
Sander Bollen's avatar
Sander Bollen committed
268
    output:
van den Berg's avatar
van den Berg committed
269
270
        gvcf = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
271
    log: "log/{sample}/gvcf_scatter.{chunk}.log"
van den Berg's avatar
van den Berg committed
272
    wildcard_constraints:
van den Berg's avatar
van den Berg committed
273
        chunk = "[0-9]+"
van den Berg's avatar
van den Berg committed
274
    container: containers["gatk"]
275
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
276
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
277
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
van den Berg's avatar
van den Berg committed
278
           "-L '{input.region}' -o '{output.gvcf}' "
Sander Bollen's avatar
Sander Bollen committed
279
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
280
281
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
282
           "--GVCFGQBands 80 --GVCFGQBands 100 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
283

284
285
286
def aggregate_gvcf(wildcards):
    """Return every per-chunk gvcf, once the scatterregions checkpoint ran."""
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    # Use the explicitly imported os.path.join ("join"); the bare `os` module
    # is never imported in this file and only resolves via Snakemake's
    # injected globals.
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
       i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
288
289
290
291

def aggregate_gvcf_tbi(wildcards):
    """Return every per-chunk gvcf index, once scatterregions ran."""
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    # Use the explicitly imported os.path.join ("join") instead of relying on
    # an `os` name this file never imports.
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
       i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
293
294

rule gvcf_gather:
van den Berg's avatar
van den Berg committed
295
    """Join the gvcf files together"""
296
297
298
299
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
300
301
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
302
    log: "log/{sample}/gvcf_gather.log"
van den Berg's avatar
van den Berg committed
303
304
    container: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps "
305
306
307
           "--output {output.gvcf} --output-type z 2> {log} && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} "
           "{output.gvcf} 2>> {log}"
Sander Bollen's avatar
Sander Bollen committed
308
309

rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
310
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
311
    input:
312
313
        gvcf = rules.gvcf_scatter.output.gvcf,
        tbi = rules.gvcf_scatter.output.gvcf_tbi,
van den Berg's avatar
van den Berg committed
314
        ref= config["reference"]
Sander Bollen's avatar
Sander Bollen committed
315
    output:
316
317
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
318
    log: "log/{sample}/genotype_scatter.{chunk}.log"
van den Berg's avatar
van den Berg committed
319
    wildcard_constraints:
van den Berg's avatar
van den Berg committed
320
321
322
323
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 "
           "/usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
324
           "GenotypeGVCFs -R {input.ref} "
325
           "-V {input.gvcf} -o '{output.vcf}' 2> {log}"
326

327
def aggregate_vcf(wildcards):
    """Return every per-chunk vcf, once the scatterregions checkpoint ran."""
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    # Use the explicitly imported os.path.join ("join"); `os` itself is never
    # imported in this file.
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
       i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
331

332
333
334
def aggregate_vcf_tbi(wildcards):
    """Return every per-chunk vcf index, once scatterregions ran."""
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    # Use the explicitly imported os.path.join ("join") instead of relying on
    # an `os` name this file never imports.
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
       i=glob_wildcards(join(checkpoint_output, 'scatter-{i}.bed')).i)
336

Sander Bollen's avatar
Sander Bollen committed
337
rule genotype_gather:
338
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
339
    input:
340
        vcfs = aggregate_vcf,
341
        vcfs_tbi = aggregate_vcf_tbi
Sander Bollen's avatar
Sander Bollen committed
342
    output:
343
344
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
345
    log: "log/{sample}/genotype_gather.log"
van den Berg's avatar
van den Berg committed
346
347
    container: containers["bcftools"]
    shell: "bcftools concat {input.vcfs} --allow-overlaps "
348
           "--output {output.vcf} --output-type z 2> {log} && "
349
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
350

Sander Bollen's avatar
Sander Bollen committed
351
## bam metrics
352
rule mapped_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
353
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
354
    input:
355
356
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
357
    output:
van den Berg's avatar
van den Berg committed
358
359
        reads = "{sample}/bams/{sample}.mapped.num",
        bases = "{sample}/bams/{sample}.mapped.basenum"
360
    log: "log/{sample}/mapped_reads_bases.log"
van den Berg's avatar
van den Berg committed
361
    container: containers["samtools-1.7-python-3.6"]
362
    shell: "samtools view -F 4 {input.bam} 2> {log} | cut -f 10 | python {input.pywc} "
363
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
364

365
rule unique_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
366
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
367
    input:
368
        bam = rules.markdup.output.bam,
van den Berg's avatar
van den Berg committed
369
        pywc = config["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
370
    output:
van den Berg's avatar
van den Berg committed
371
372
        reads = "{sample}/bams/{sample}.unique.num",
        bases = "{sample}/bams/{sample}.usable.basenum"
373
    log: "log/{sample}/unique_reads_bases.log"
van den Berg's avatar
van den Berg committed
374
    container: containers["samtools-1.7-python-3.6"]
375
376
377
    shell: "samtools view -F 4 -F 1024 {input.bam} 2> {log} | cut -f 10 | "
           "python {input.pywc} --reads {output.reads} "
           "--bases {output.bases} 2>> {log}"
Sander Bollen's avatar
Sander Bollen committed
378
379

## fastqc
Sander Bollen's avatar
Sander Bollen committed
380
rule fastqc_raw:
van den Berg's avatar
van den Berg committed
381
    """Run fastqc on raw fastq files"""
Sander Bollen's avatar
Sander Bollen committed
382
    input:
383
384
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
385
386
    params:
        folder = "{sample}/pre_process/raw-{sample}-{read_group}"
Sander Bollen's avatar
Sander Bollen committed
387
    output:
388
        done = "{sample}/pre_process/raw-{sample}-{read_group}/.done"
389
    log: "log/{sample}/fastqc_raw.{read_group}.log"
van den Berg's avatar
van den Berg committed
390
    container: containers["fastqc"]
391
392
    shell: "fastqc --threads 4 --nogroup -o {params.folder} "
           "{input.r1} {input.r2} 2> {log} && "
393
           "touch {output.done}"
Sander Bollen's avatar
Sander Bollen committed
394

Sander Bollen's avatar
Sander Bollen committed
395
rule fastqc_postqc:
van den Berg's avatar
van den Berg committed
396
    """Run fastqc on fastq files post pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
397
    input:
398
399
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2
400
401
    params:
        folder = "{sample}/pre_process/trimmed-{sample}-{read_group}"
Sander Bollen's avatar
Sander Bollen committed
402
    output:
403
        done = "{sample}/pre_process/trimmed-{sample}-{read_group}/.done"
404
    log: "log/{sample}/fastqc_postqc.{read_group}.log"
van den Berg's avatar
van den Berg committed
405
    container: containers["fastqc"]
406
407
    shell: "fastqc --threads 4 --nogroup -o {params.folder} "
           "{input.r1} {input.r2} 2> {log} && "
408
           "touch {output.done}"
Sander Bollen's avatar
Sander Bollen committed
409

410
## coverage
Sander Bollen's avatar
Sander Bollen committed
411
rule covstats:
Sander Bollen's avatar
Sander Bollen committed
412
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
413
    input:
414
        bam = rules.markdup.output.bam,
van den Berg's avatar
van den Berg committed
415
416
        genome = "current.genome",
        covpy = config["covstats"],
van den Berg's avatar
van den Berg committed
417
        bed = config.get("targetsfile","")
Sander Bollen's avatar
Sander Bollen committed
418
    params:
van den Berg's avatar
van den Berg committed
419
        subt = "Sample {sample}"
Sander Bollen's avatar
Sander Bollen committed
420
    output:
van den Berg's avatar
van den Berg committed
421
422
        covj = "{sample}/coverage/covstats.json",
        covp = "{sample}/coverage/covstats.png"
423
    log: "log/{sample}/covstats.log"
van den Berg's avatar
van den Berg committed
424
    container: containers["bedtools-2.26-python-2.7"]
Sander Bollen's avatar
Sander Bollen committed
425
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
426
427
428
           "-b {input.bam} -d  2> {log} | python {input.covpy} - "
           "--plot {output.covp} --title 'Targets coverage' "
           "--subtitle '{params.subt}' > {output.covj} 2>> {log}"
Sander Bollen's avatar
Sander Bollen committed
429

Sander Bollen's avatar
Sander Bollen committed
430
431
432
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
433
434
        gvcf = rules.gvcf_gather.output.gvcf,
        tbi = rules.gvcf_gather.output.gvcf_tbi,
van den Berg's avatar
van den Berg committed
435
        ref = config.get('refflat', "")
436
437
    output: "{sample}/coverage/refFlat_coverage.tsv"
    log: "log/{sample}/vtools_coverage.log"
van den Berg's avatar
van den Berg committed
438
    container: containers["vtools"]
439
440
    shell: "vtools-gcoverage -I {input.gvcf} "
           "-R {input.ref} > {output} 2> {log}"
Sander Bollen's avatar
Sander Bollen committed
441

442
rule collect_cutadapt_summary:
van den Berg's avatar
van den Berg committed
443
    """Colect cutadapt summary from each readgroup per sample """
444
445
    input:
        cutadapt = lambda wildcards:
van den Berg's avatar
van den Berg committed
446
447
448
        ("{sample}/pre_process/{sample}-{read_group}.txt".format(
            sample=wildcards.sample, read_group=read_group)
            for read_group in get_readgroup(wildcards)),
449
450
        cutadapt_summary= config["cutadapt_summary"]
    output: "{sample}/cutadapt.json"
451
    log: "log/{sample}/collect_cutadapt_summary.log"
van den Berg's avatar
van den Berg committed
452
    container: containers["python3"]
453
454
455
    shell: "python {input.cutadapt_summary} --sample {wildcards.sample} "
           "--cutadapt-summary {input.cutadapt} > {output}"

456
457
458
459
460
461
462
rule collectstats:
    """Collect all stats for a particular sample"""
    input:
        mnum = rules.mapped_reads_bases.output.reads,
        mbnum = rules.mapped_reads_bases.output.bases,
        unum = rules.unique_reads_bases.output.reads,
        ubnum = rules.unique_reads_bases.output.bases,
van den Berg's avatar
van den Berg committed
463
        cov = rules.covstats.output.covj if "targetsfile" in config else [],
464
465
466
467
468
        cutadapt = rules.collect_cutadapt_summary.output,
        colpy = config["collect_stats"]
    params:
        fthresh = config["female_threshold"]
    output: "{sample}/{sample}.stats.json"
469
    log: "log/{sample}/collectstats.log"
470
471
472
473
474
475
    container: containers["python3"]
    shell: "python {input.colpy} --sample-name {wildcards.sample} "
           "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
           "--unique-num {input.unum} --usable-basenum {input.ubnum} "
           "--female-threshold {params.fthresh} "
           "--cutadapt {input.cutadapt} "
476
           "--covstats {input.cov} > {output} 2> {log}"
477

van den Berg's avatar
van den Berg committed
478
479
480
rule multiple_metrics:
    """Run picard CollectMultipleMetrics"""
    input:
481
        bam = rules.markdup.output.bam,
van den Berg's avatar
van den Berg committed
482
483
        ref = config["reference"],
    params:
van den Berg's avatar
van den Berg committed
484
        prefix = "{sample}/bams/{sample}",
van den Berg's avatar
van den Berg committed
485
    output:
van den Berg's avatar
van den Berg committed
486
        alignment = "{sample}/bams/{sample}.alignment_summary_metrics",
487
        insertMetrics = "{sample}/bams/{sample}.insert_size_metrics"
488
    log: "log/{sample}/multiple_metrics.log"
van den Berg's avatar
van den Berg committed
489
    container: containers["picard"]
490
    shell: "picard -Xmx8G -XX:CompressedClassSpaceSize=256m CollectMultipleMetrics "
van den Berg's avatar
van den Berg committed
491
492
493
           "I={input.bam} O={params.prefix} "
           "R={input.ref} "
           "PROGRAM=CollectAlignmentSummaryMetrics "
494
           "PROGRAM=CollectInsertSizeMetrics 2> {log}"
van den Berg's avatar
van den Berg committed
495

496
497
498
499
500
501
502
rule bed_to_interval:
    """Run picard BedToIntervalList

    This is needed to convert the bed files for the capture kit to a format
    picard can read
    """
    input:
van den Berg's avatar
van den Berg committed
503
        targets = config.get("targetsfile",""),
504
505
506
507
508
        baits = config.get("baitsfile",""),
        ref = config["reference"]
    output:
        target_interval = "target.interval",
        baits_interval = "baits.interval"
509
    log: "log/bed_to_interval.log"
510
    container: containers["picard"]
511
    shell: "picard -Xmx4G BedToIntervalList "
512
            "I={input.targets} O={output.target_interval} "
513
            "SD={input.ref} 2> {log} && "
514
515
            "picard BedToIntervalList "
            "I={input.baits} O={output.baits_interval} "
516
            "SD={input.ref} 2>> {log}"
517
518
519
520
521
522
523
524
525

rule hs_metrics:
    """Run picard CollectHsMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
        targets = rules.bed_to_interval.output.target_interval,
        baits = rules.bed_to_interval.output.baits_interval,
    output: "{sample}/bams/{sample}.hs_metrics.txt"
526
    log: "log/{sample}/hs_metrics.log"
527
    container: containers["picard"]
528
    shell: "picard -Xmx8G -XX:CompressedClassSpaceSize=256m CollectHsMetrics "
529
530
531
532
533
           "I={input.bam} O={output} "
           "R={input.ref} "
           "BAIT_INTERVALS={input.baits} "
           "TARGET_INTERVALS={input.targets}"

Sander Bollen's avatar
Sander Bollen committed
534
535
536
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
537
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
538
539
    """
    input:
van den Berg's avatar
van den Berg committed
540
541
542
543
544
545
546
547
548
549
550
        bam = expand("{sample}/bams/{sample}.bam", sample=config["samples"]),
        metric = expand("{sample}/bams/{sample}.metrics",
                        sample=config["samples"]),
        alignment_metrics = expand(
                "{sample}/bams/{sample}.alignment_summary_metrics",
                sample=config["samples"]
        ),
        insert_metrics = expand(
                "{sample}/bams/{sample}.insert_size_metrics",
                sample=config["samples"]
        ),
551
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/.done"
552
553
                      for read_group, sample in get_readgroup_per_sample()),

554
        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/.done"
555
556
557
558
559
                      for read_group, sample in get_readgroup_per_sample()),

        hs_metric = expand("{sample}/bams/{sample}.hs_metrics.txt",
                           sample=config["samples"]) if "baitsfile" in config else []

560
561
    output:
        html = "multiqc_report/multiqc_report.html",
562
        insertSize = "multiqc_report/multiqc_data/multiqc_picard_insertSize.json",
563
        AlignmentMetrics = "multiqc_report/multiqc_data/multiqc_picard_AlignmentSummaryMetrics.json",
564
        DuplicationMetrics = "multiqc_report/multiqc_data/multiqc_picard_dups.json",
565
        HsMetrics = "multiqc_report/multiqc_data/multiqc_picard_HsMetrics.json" if "baitsfile" in config else []
566
    log: "log/multiqc.log"
van den Berg's avatar
van den Berg committed
567
    container: containers["multiqc"]
568
569
    shell: "multiqc --data-format json --force "
           "--outdir multiqc_report . 2> {log} "
van den Berg's avatar
van den Berg committed
570
           "|| touch {output}"
571
572
573
574
575
576
577

rule merge_stats:
    """Merge all stats of all samples"""
    input:
        cols = expand("{sample}/{sample}.stats.json",
                      sample=config['samples']),
        mpy = config["merge_stats"],
578
        insertSize = rules.multiqc.output.insertSize,
579
        AlignmentMetrics = rules.multiqc.output.AlignmentMetrics,
580
        DuplicationMetrics = rules.multiqc.output.DuplicationMetrics,
581
        HsMetrics = rules.multiqc.output.HsMetrics
582
    output: "stats.json"
583
    log: "log/stats.tsv.log"
584
585
    container: containers["vtools"]
    shell: "python {input.mpy} --collectstats {input.cols} "
586
           "--picard-insertSize {input.insertSize} "
587
           "--picard-AlignmentMetrics {input.AlignmentMetrics} "
588
           "--picard-DuplicationMetrics {input.DuplicationMetrics} "
589
           "--picard-HsMetrics {input.HsMetrics} > {output} 2> {log}"
590
591
592
593
594
595
596

rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
        stats = rules.merge_stats.output,
        sc = config["stats_to_tsv"]
    output: "stats.tsv"
597
    log: "log/stats.tsv.log"
598
    container: containers["python3"]
599
    shell: "python {input.sc} -i {input.stats} > {output} 2> {log}"
600

601
602
603
604
605

rule gvcf2coverage:
    """ Determine coverage from gvcf files """
    input: rules.gvcf_gather.output.gvcf
    output: "{sample}/vcf/{sample}_{threshold}.bed"
606
    log: "log/{sample}/gvcf2coverage.{threshold}.log"
607
    container: containers["gvcf2coverage"]
608
    shell: "gvcf2coverage -t {wildcards.threshold} < {input} 2> {log} | cut -f 1,2,3 > {output}"