#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

van den Berg's avatar
van den Berg committed
31
# Read the json schema
with open(srcdir('config/schema.json'), 'rt') as fin:
    schema = json.load(fin)

# Validate the config against the schema
try:
    jsonschema.validate(config, schema)
except jsonschema.ValidationError as e:
    # Re-raise with a message that points the user at the offending file
    raise jsonschema.ValidationError(f'Invalid --configfile: {e.message}')
van den Berg's avatar
van den Berg committed
40

van den Berg's avatar
van den Berg committed
41
42
43
44
45
# If you specify a baitsfile, you also have to specify a targets file for
# picard
if "baitsfile" in config and "targetsfile" not in config:
    raise jsonschema.ValidationError(
        'Invalid --configfile: "baitsfile" specified without "targetsfile"'
    )
46

47
48
49
50
51
52
53
# A sample name cannot be a substring of another sample, since that breaks
# picard metrics parsing by multiqc
msg = 'Invalid --configfile: sample names should not overlap ("{s1}" is contained in "{s2}")'
for name, other in itertools.permutations(config['samples'], 2):
    if name in other:
        raise jsonschema.ValidationError(msg.format(s1=name, s2=other))

54
55
# Set default values
def set_default(key, value):
    """Store *value* under *key* in the config, unless the key is already set."""
    config.setdefault(key, value)
van den Berg's avatar
van den Berg committed
59

van den Berg's avatar
van den Berg committed
60
# Set the default config
# scatter_size is passed to biopet-scatterregions --scatterSize;
# female_threshold is passed to collect_stats --female-threshold
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Set the script paths
set_default("covstats", srcdir("src/covstats.py"))
set_default("collect_stats", srcdir("src/collect_stats.py"))
set_default("merge_stats", srcdir("src/merge_stats.py"))
set_default("stats_to_tsv", srcdir("src/stats_to_tsv.py"))
set_default("py_wordcount", srcdir("src/pywc.py"))
set_default("cutadapt_summary", srcdir("src/cutadapt_summary.py"))
Sander Bollen's avatar
Sander Bollen committed
71

72
73
74
# Version-pinned containers used by the rules below
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
    "bwa-0.7.17-picard-2.22.8": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:76d16eabff506ac13338d7f14644a0ad301b9d7e-0",
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
    "debian": "docker://debian:buster-slim",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "gvcf2coverage": "docker://lumc/gvcf2coverage:0.1-dirty-2",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.8--py_2",
    "picard": "docker://quay.io/biocontainers/picard:2.22.8--0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
88

89
def get_readgroup(wildcards):
    """Return the read groups configured for the sample in *wildcards*."""
    sample_config = config["samples"][wildcards.sample]
    return sample_config["read_groups"]
Sander Bollen's avatar
Sander Bollen committed
91

van den Berg's avatar
van den Berg committed
92
def get_readgroup_per_sample():
    """Yield (read_group, sample) for every read group of every sample."""
    for sample, properties in config["samples"].items():
        for read_group in properties["read_groups"]:
            yield read_group, sample

97
def coverage_stats(wildcards):
    """Return the refFlat coverage files, only when "refflat" is configured."""
    if "refflat" not in config:
        return []
    return expand("{sample}/coverage/refFlat_coverage.tsv",
                  sample=config["samples"])
Sander Bollen's avatar
Sander Bollen committed
101

Sander Bollen's avatar
Sander Bollen committed
102
103
rule all:
    """Declare every final output of the pipeline"""
    input:
        multiqc = "multiqc_report/multiqc_report.html",
        stats = "stats.json",
        stats_tsv = "stats.tsv",

        bam = expand("{sample}/bams/{sample}.bam",
                     sample=config["samples"]),

        vcfs = expand("{sample}/vcf/{sample}.vcf.gz",
                      sample=config["samples"]),

        vcf_tbi = expand("{sample}/vcf/{sample}.vcf.gz.tbi",
                         sample=config["samples"]),

        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz",
                       sample=config["samples"]),

        gvcf_tbi = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi",
                          sample=config["samples"]),

        # One fastqc folder and one cutadapt log per (sample, read group)
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        cutadapt = (f"{sample}/pre_process/{sample}-{read_group}.txt"
                    for read_group, sample in get_readgroup_per_sample()),

        # Only generated when "refflat" is configured (see coverage_stats)
        coverage_stats = coverage_stats,

        # One bed file per (sample, threshold), only when
        # "coverage_threshold" is configured
        coverage_files = (f"{sample}/vcf/{sample}_{threshold}.bed"
                          for sample, threshold in itertools.product(
                              config['samples'], config['coverage_threshold'])
                          ) if 'coverage_threshold' in config else []
Sander Bollen's avatar
Sander Bollen committed
136

137
138
139
140
141
142
rule create_markdup_tmp:
    """Create tmp directory for mark duplicates

    Consumed via ancient("tmp") by the align, markdup and baserecal rules,
    so a changing mtime on the directory never retriggers those jobs.
    """
    output: directory("tmp")
    container: containers["debian"]
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
143
rule genome:
    """Create genome file as used by bedtools

    Reads the fasta index ({input}.fai) and keeps only the contig name and
    length columns.
    """
    input: config["reference"]
    output: "current.genome"
    container: containers["debian"]
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule cutadapt:
    """Clip fastq files

    Trims Illumina adapters and low-quality ends; the minimal cutadapt
    report is written to the log and later parsed by collect_cutadapt_summary.
    """
    input:
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    output:
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
    log:
        "{sample}/pre_process/{sample}-{read_group}.txt"
    container: containers["cutadapt"]
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--output {output.r1} --paired-output {output.r2} -Z "
           "{input.r1} {input.r2} "
           "--report=minimal > {log}"
Sander Bollen's avatar
Sander Bollen committed
166
167

rule align:
    """Align fastq files

    bwa mem output is piped straight into picard SortSam, producing one
    coordinate-sorted bam per (sample, read group).
    """
    input:
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2,
        ref = config["reference"],
        tmp = ancient("tmp")
    params:
        # Read group header line attached by bwa mem
        rg = "@RG\\tID:{sample}-library-{read_group}\\tSM:{sample}\\tLB:library\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
    container: containers["bwa-0.7.17-picard-2.22.8"]
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

183
def markdup_bam_input(wildcards):
    """Build the INPUT= arguments for picard MarkDuplicates, one per read group."""
    sample = wildcards.sample
    return [f"INPUT={sample}/bams/{sample}-{rg}.sorted.bam"
            for rg in get_readgroup(wildcards)]
188

Sander Bollen's avatar
Sander Bollen committed
189
rule markdup:
    """Mark duplicates in BAM file

    Merges all per-read-group bams of a sample (via the INPUT= params built
    by markdup_bam_input) into a single duplicate-marked bam.
    """
    input:
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
        tmp = ancient("tmp")
    output:
        bam = "{sample}/bams/{sample}.bam",
        bai = "{sample}/bams/{sample}.bai",
        metrics = "{sample}/bams/{sample}.metrics"
    params:
        bams=markdup_bam_input
    container: containers["picard"]
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "{params.bams} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
210
211
212
rule bai:
    """Copy bai files, as some genome browsers can only use .bam.bai files"""
    input:
        bai = rules.markdup.output.bai
    output:
        bai = "{sample}/bams/{sample}.bam.bai"
    container: containers["debian"]
    shell: "cp {input.bai} {output.bai}"

219
def bqsr_bam_input(wildcards):
    """Build the -I arguments for GATK BaseRecalibrator, one per read group."""
    sample = wildcards.sample
    return " ".join(f"-I {sample}/bams/{sample}-{rg}.sorted.bam"
                    for rg in get_readgroup(wildcards))

Sander Bollen's avatar
Sander Bollen committed
225
rule baserecal:
    """Base recalibrated BAM files

    Runs GATK3 BaseRecalibrator over all per-read-group bams of a sample,
    producing one recalibration table used later by HaplotypeCaller (-BQSR).
    """
    input:
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
        ref = config["reference"],
        vcfs = config["known_sites"]
    output: "{sample}/bams/{sample}.baserecal.grp"
    params:
        # One -knownSites flag per configured vcf
        known_sites = " ".join(
                expand("-knownSites {vcf}", vcf=config["known_sites"])
        ),
        bams = bqsr_bam_input
    container: containers["gatk"]
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
           "BaseRecalibrator {params.bams} -o {output} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites}"
Sander Bollen's avatar
Sander Bollen committed
245

246
checkpoint scatterregions:
    """Scatter the reference genome

    Writes scatter-{i}.bed chunks of at most scatter_size bases each; a
    checkpoint so the aggregate_* functions can glob the chunks afterwards.
    """
    input:
        ref = config["reference"],
    params:
        size = config['scatter_size']
    output:
        directory("scatter")
    container: containers["biopet-scatterregions"]
    shell: "mkdir -p {output} && "
           "biopet-scatterregions -Xmx24G "
           "--referenceFasta {input.ref} --scatterSize {params.size} "
           "--outputDir scatter"

Sander Bollen's avatar
Sander Bollen committed
260
rule gvcf_scatter:
    """Run HaplotypeCaller in GVCF mode by chunk"""
    input:
        bam = rules.markdup.output.bam,
        bqsr = rules.baserecal.output,
        dbsnp = config["dbsnp"],
        ref = config["reference"],
        region = "scatter/scatter-{chunk}.bed"
    output:
        # temp(): per-chunk files are removed after gvcf_gather
        gvcf = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
    wildcard_constraints:
        # Restrict to digits so {sample}.{chunk} parses unambiguously
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
           "-T HaplotypeCaller -ERC GVCF -I "
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
           "-L '{input.region}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 "
Sander Bollen's avatar
Sander Bollen committed
282

283
284
285
def aggregate_gvcf(wildcards):
    """List the per-chunk gvcf files created by the scatterregions checkpoint."""
    scatter_dir = checkpoints.scatterregions.get(**wildcards).output[0]
    chunks = glob_wildcards(join(scatter_dir, 'scatter-{i}.bed')).i
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz", i=chunks)
287
288
289
290

def aggregate_gvcf_tbi(wildcards):
    """List the per-chunk gvcf indexes created by the scatterregions checkpoint."""
    scatter_dir = checkpoints.scatterregions.get(**wildcards).output[0]
    chunks = glob_wildcards(join(scatter_dir, 'scatter-{i}.bed')).i
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi", i=chunks)
292
293

rule gvcf_gather:
    """Join the gvcf files together"""
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
    container: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps "
           "--output {output.gvcf} --output-type z && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
305
306

rule genotype_scatter:
    """Run GATK's GenotypeGVCFs by chunk"""
    input:
        gvcf = rules.gvcf_scatter.output.gvcf,
        tbi = rules.gvcf_scatter.output.gvcf_tbi,
        ref= config["reference"]
    output:
        # temp(): per-chunk files are removed after genotype_gather
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
    wildcard_constraints:
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 "
           "/usr/GenomeAnalysisTK.jar -T "
           "GenotypeGVCFs -R {input.ref} "
           "-V {input.gvcf} -o '{output.vcf}'"
322

323
def aggregate_vcf(wildcards):
    """List the per-chunk vcf files created by the scatterregions checkpoint."""
    scatter_dir = checkpoints.scatterregions.get(**wildcards).output[0]
    chunks = glob_wildcards(join(scatter_dir, 'scatter-{i}.bed')).i
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz", i=chunks)
327

328
329
330
def aggregate_vcf_tbi(wildcards):
    """List the per-chunk vcf indexes created by the scatterregions checkpoint."""
    scatter_dir = checkpoints.scatterregions.get(**wildcards).output[0]
    chunks = glob_wildcards(join(scatter_dir, 'scatter-{i}.bed')).i
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi", i=chunks)
332

Sander Bollen's avatar
Sander Bollen committed
333
rule genotype_gather:
    """Gather all genotyping VCFs"""
    input:
        vcfs = aggregate_vcf,
        vcfs_tbi = aggregate_vcf_tbi
    output:
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
    container: containers["bcftools"]
    shell: "bcftools concat {input.vcfs} --allow-overlaps "
           "--output {output.vcf} --output-type z && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
345

Sander Bollen's avatar
Sander Bollen committed
346
## bam metrics
rule mapped_reads_bases:
    """Calculate number of mapped reads

    -F 4 excludes unmapped reads; the read sequences (column 10) are counted
    by the pywc helper script.
    """
    input:
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.mapped.num",
        bases = "{sample}/bams/{sample}.mapped.basenum"
    container: containers["samtools-1.7-python-3.6"]
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
358

359
rule unique_reads_bases:
    """Calculate number of unique reads

    -F 4 excludes unmapped reads, -F 1024 additionally excludes reads
    flagged as duplicates.
    """
    input:
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
    output:
        reads = "{sample}/bams/{sample}.unique.num",
        bases = "{sample}/bams/{sample}.usable.basenum"
    container: containers["samtools-1.7-python-3.6"]
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | "
           "python {input.pywc} --reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
370
371

## fastqc
rule fastqc_raw:
    """Run fastqc on raw fastq files"""
    input:
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
    output:
        directory("{sample}/pre_process/raw-{sample}-{read_group}/")
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
381

Sander Bollen's avatar
Sander Bollen committed
382
rule fastqc_postqc:
    """Run fastqc on fastq files post pre-processing"""
    input:
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2
    output:
        directory("{sample}/pre_process/trimmed-{sample}-{read_group}/")
    container: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
391

392
## coverage
rule covstats:
    """Calculate coverage statistics on bam file"""
    input:
        bam = rules.markdup.output.bam,
        genome = "current.genome",
        covpy = config["covstats"],
        # Empty string when no targetsfile is configured; this rule is only
        # requested (via collectstats) when "targetsfile" is in the config
        bed = config.get("targetsfile","")
    params:
        subt = "Sample {sample}"
    output:
        covj = "{sample}/coverage/covstats.json",
        covp = "{sample}/coverage/covstats.png"
    container: containers["bedtools-2.26-python-2.7"]
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
410

Sander Bollen's avatar
Sander Bollen committed
411
412
413
rule vtools_coverage:
    """Calculate coverage statistics per transcript

    Only requested (via coverage_stats) when "refflat" is in the config.
    """
    input:
        gvcf = rules.gvcf_gather.output.gvcf,
        tbi = rules.gvcf_gather.output.gvcf_tbi,
        ref = config.get('refflat', "")
    output:
        tsv = "{sample}/coverage/refFlat_coverage.tsv"
    container: containers["vtools"]
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"

422
rule collect_cutadapt_summary:
    """Collect cutadapt summary from each readgroup per sample"""
    input:
        cutadapt = lambda wildcards:
        ("{sample}/pre_process/{sample}-{read_group}.txt".format(
            sample=wildcards.sample, read_group=read_group)
            for read_group in get_readgroup(wildcards)),
        cutadapt_summary= config["cutadapt_summary"]
    output: "{sample}/cutadapt.json"
    container: containers["python3"]
    shell: "python {input.cutadapt_summary} --sample {wildcards.sample} "
           "--cutadapt-summary {input.cutadapt} > {output}"

435
436
437
438
439
440
441
rule collectstats:
    """Collect all stats for a particular sample"""
    input:
        mnum = rules.mapped_reads_bases.output.reads,
        mbnum = rules.mapped_reads_bases.output.bases,
        unum = rules.unique_reads_bases.output.reads,
        ubnum = rules.unique_reads_bases.output.bases,
        # Coverage stats only exist when a targetsfile was configured
        cov = rules.covstats.output.covj if "targetsfile" in config else [],
        cutadapt = rules.collect_cutadapt_summary.output,
        colpy = config["collect_stats"]
    params:
        fthresh = config["female_threshold"]
    output: "{sample}/{sample}.stats.json"
    container: containers["python3"]
    shell: "python {input.colpy} --sample-name {wildcards.sample} "
           "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
           "--unique-num {input.unum} --usable-basenum {input.ubnum} "
           "--female-threshold {params.fthresh} "
           "--cutadapt {input.cutadapt} "
           "--covstats {input.cov} > {output}"
455

van den Berg's avatar
van den Berg committed
456
457
458
rule multiple_metrics:
    """Run picard CollectMultipleMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
    params:
        # Picard appends the metric-specific suffixes to this prefix
        prefix = "{sample}/bams/{sample}",
    output:
        alignment = "{sample}/bams/{sample}.alignment_summary_metrics",
        insertMetrics = "{sample}/bams/{sample}.insert_size_metrics"
    container: containers["picard"]
    shell: "picard -Xmx8G -XX:CompressedClassSpaceSize=256m CollectMultipleMetrics "
           "I={input.bam} O={params.prefix} "
           "R={input.ref} "
           "PROGRAM=CollectAlignmentSummaryMetrics "
           "PROGRAM=CollectInsertSizeMetrics "

473
474
475
476
477
478
479
rule bed_to_interval:
    """Run picard BedToIntervalList

    This is needed to convert the bed files for the capture kit to a format
    picard can read
    """
    input:
        targets = config.get("targetsfile",""),
        baits = config.get("baitsfile",""),
        ref = config["reference"]
    output:
        target_interval = "target.interval",
        baits_interval = "baits.interval"
    container: containers["picard"]
    shell: "picard -Xmx4G BedToIntervalList "
            "I={input.targets} O={output.target_interval} "
            "SD={input.ref} && "
            "picard BedToIntervalList "
            "I={input.baits} O={output.baits_interval} "
            "SD={input.ref} "

rule hs_metrics:
    """Run picard CollectHsMetrics"""
    input:
        bam = rules.markdup.output.bam,
        ref = config["reference"],
        targets = rules.bed_to_interval.output.target_interval,
        baits = rules.bed_to_interval.output.baits_interval,
    output: "{sample}/bams/{sample}.hs_metrics.txt"
    container: containers["picard"]
    shell: "picard -Xmx8G -XX:CompressedClassSpaceSize=256m CollectHsMetrics "
           "I={input.bam} O={output} "
           "R={input.ref} "
           "BAIT_INTERVALS={input.baits} "
           "TARGET_INTERVALS={input.targets}"

Sander Bollen's avatar
Sander Bollen committed
510
511
512
rule multiqc:
    """
    Create multiQC report

    Depends on stats.tsv to forcefully run at end of pipeline
    """
    input:
        bam = expand("{sample}/bams/{sample}.bam", sample=config["samples"]),
        metric = expand("{sample}/bams/{sample}.metrics",
                        sample=config["samples"]),
        alignment_metrics = expand(
                "{sample}/bams/{sample}.alignment_summary_metrics",
                sample=config["samples"]
        ),
        insert_metrics = expand(
                "{sample}/bams/{sample}.insert_size_metrics",
                sample=config["samples"]
        ),
        fastqc_raw = (directory(f"{sample}/pre_process/raw-{sample}-{read_group}/")
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (directory(f"{sample}/pre_process/trimmed-{sample}-{read_group}/")
                      for read_group, sample in get_readgroup_per_sample()),

        # HsMetrics only exist when a baitsfile was configured
        hs_metric = expand("{sample}/bams/{sample}.hs_metrics.txt",
                           sample=config["samples"]) if "baitsfile" in config else []
    output:
        html = "multiqc_report/multiqc_report.html",
        insertSize = "multiqc_report/multiqc_data/multiqc_picard_insertSize.json",
        AlignmentMetrics = "multiqc_report/multiqc_data/multiqc_picard_AlignmentSummaryMetrics.json",
        DuplicationMetrics = "multiqc_report/multiqc_data/multiqc_picard_dups.json",
        HsMetrics = "multiqc_report/multiqc_data/multiqc_picard_HsMetrics.json" if "baitsfile" in config else []
    container: containers["multiqc"]
    # "|| touch {output}" keeps the pipeline going when multiqc itself fails
    shell: "multiqc --data-format json --force --outdir multiqc_report . "
           "|| touch {output}"
545
546
547
548
549
550
551

rule merge_stats:
    """Merge all stats of all samples"""
    input:
        cols = expand("{sample}/{sample}.stats.json",
                      sample=config['samples']),
        mpy = config["merge_stats"],
        insertSize = rules.multiqc.output.insertSize,
        AlignmentMetrics = rules.multiqc.output.AlignmentMetrics,
        DuplicationMetrics = rules.multiqc.output.DuplicationMetrics,
        HsMetrics = rules.multiqc.output.HsMetrics
    output: "stats.json"
    container: containers["vtools"]
    shell: "python {input.mpy} --collectstats {input.cols} "
           "--picard-insertSize {input.insertSize} "
           "--picard-AlignmentMetrics {input.AlignmentMetrics} "
           "--picard-DuplicationMetrics {input.DuplicationMetrics} "
           "--picard-HsMetrics {input.HsMetrics} > {output}"
563
564
565
566
567
568
569
570
571
572

rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
        stats = rules.merge_stats.output,
        sc = config["stats_to_tsv"]
    output: "stats.tsv"
    container: containers["python3"]
    shell: "python {input.sc} -i {input.stats} > {output}"

573
574
575
576
577
578

rule gvcf2coverage:
    """Determine coverage from gvcf files

    Writes one bed file per configured coverage threshold (see the
    coverage_files entry of rule all).
    """
    input: rules.gvcf_gather.output.gvcf
    output: "{sample}/vcf/{sample}_{threshold}.bed"
    container: containers["gvcf2coverage"]
    shell: "gvcf2coverage -t {wildcards.threshold} < {input} | cut -f 1,2,3 > {output}"