Snakefile 20.3 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

van den Berg's avatar
van den Berg committed
31
# Read the json schema
32
with open(srcdir('config/schema.json'), 'rt') as fin:
van den Berg's avatar
van den Berg committed
33
34
    schema = json.load(fin)

van den Berg's avatar
van den Berg committed
35
# Validate the config against the schema
van den Berg's avatar
van den Berg committed
36
try:
van den Berg's avatar
van den Berg committed
37
    jsonschema.validate(config, schema)
van den Berg's avatar
van den Berg committed
38
39
40
except jsonschema.ValidationError as e:
    raise jsonschema.ValidationError(f'Invalid CONFIG_JSON: {e}')

41

42
43
# Set default values
def set_default(key, value):
van den Berg's avatar
van den Berg committed
44
    """Set default config values"""
van den Berg's avatar
van den Berg committed
45
46
    if key not in config:
        config[key] = value
van den Berg's avatar
van den Berg committed
47

van den Berg's avatar
van den Berg committed
48
# Set the default config
van den Berg's avatar
van den Berg committed
49
50
51
52
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Set the script paths
53
54
55
56
57
58
set_default("covstats", srcdir("src/covstats.py"))
set_default("collect_stats", srcdir("src/collect_stats.py"))
set_default("merge_stats", srcdir("src/merge_stats.py"))
set_default("stats_to_tsv", srcdir("src/stats_to_tsv.py"))
set_default("py_wordcount", srcdir("src/pywc.py"))
set_default("cutadapt_summary", srcdir("src/cutadapt_summary.py"))
Sander Bollen's avatar
Sander Bollen committed
59

60
61
62
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
van den Berg's avatar
van den Berg committed
63
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
64
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
van den Berg's avatar
van den Berg committed
65
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
66
67
68
    "debian": "docker://debian:buster-slim",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
69
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.8--py_2",
van den Berg's avatar
van den Berg committed
70
    "picard": "docker://quay.io/biocontainers/picard:2.22.8--0",
71
72
73
74
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
75

76
def get_forward(wildcards):
van den Berg's avatar
van den Berg committed
77
    """Get the forward fastq file from the config"""
78
    return (
79
        config["samples"][wildcards.sample]["read_groups"]
80
81
82
83
                [wildcards.read_group]["R1"]
    )

def get_reverse(wildcards):
van den Berg's avatar
van den Berg committed
84
    """Get the reverse fastq file from the config"""
85
    return (
86
        config["samples"][wildcards.sample]["read_groups"]
87
88
89
90
            [wildcards.read_group]["R2"]
    )

def get_readgroup(wildcards):
91
    return config["samples"][wildcards.sample]["read_groups"]
Sander Bollen's avatar
Sander Bollen committed
92

van den Berg's avatar
van den Berg committed
93
def get_readgroup_per_sample():
van den Berg's avatar
van den Berg committed
94
    for sample in config["samples"]:
95
        for rg in config["samples"][sample]["read_groups"]:
van den Berg's avatar
van den Berg committed
96
97
98
            yield rg, sample


99
def coverage_stats(wildcards):
van den Berg's avatar
van den Berg committed
100
101
    files = expand("{sample}/coverage/refFlat_coverage.tsv",
                   sample=config["samples"])
van den Berg's avatar
van den Berg committed
102
    return files if "refflat" in config else []
Sander Bollen's avatar
Sander Bollen committed
103

Sander Bollen's avatar
Sander Bollen committed
104
105
rule all:
    input:
van den Berg's avatar
van den Berg committed
106
        multiqc = "multiqc_report/multiqc_report.html",
107
108
        stats = "stats.json",
        stats_tsv = "stats.tsv",
van den Berg's avatar
van den Berg committed
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132

        bam = expand("{sample}/bams/{sample}.bam",
                     sample=config["samples"]),

        vcfs = expand("{sample}/vcf/{sample}.vcf.gz",
                      sample=config["samples"]),

        vcf_tbi = expand("{sample}/vcf/{sample}.vcf.gz.tbi",
                         sample=config["samples"]),

        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz",
                       sample=config["samples"]),

        gvcf_tbi = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi",
                          sample=config["samples"]),

        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        cutadapt = (f"{sample}/pre_process/{sample}-{read_group}.txt"
                    for read_group, sample in get_readgroup_per_sample()),
van den Berg's avatar
van den Berg committed
133
        coverage_stats = coverage_stats,
Sander Bollen's avatar
Sander Bollen committed
134

Sander Bollen's avatar
Sander Bollen committed
135
136
137

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
138
    output: directory("tmp")
van den Berg's avatar
van den Berg committed
139
    container: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
140
141
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
142
rule genome:
Sander Bollen's avatar
Sander Bollen committed
143
    """Create genome file as used by bedtools"""
van den Berg's avatar
van den Berg committed
144
    input: config["reference"]
Sander Bollen's avatar
Sander Bollen committed
145
    output: "current.genome"
van den Berg's avatar
van den Berg committed
146
    container: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
147
148
149
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
150
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
151
    input:
van den Berg's avatar
van den Berg committed
152
153
        r1 = get_forward,
        r2 = get_reverse
Sander Bollen's avatar
Sander Bollen committed
154
    output:
van den Berg's avatar
van den Berg committed
155
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
van den Berg's avatar
van den Berg committed
156
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
157
158
    log:
        "{sample}/pre_process/{sample}-{read_group}.txt"
van den Berg's avatar
van den Berg committed
159
    container: containers["cutadapt"]
van den Berg's avatar
van den Berg committed
160
161
162
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--output {output.r1} --paired-output {output.r2} -Z "
van den Berg's avatar
van den Berg committed
163
           "{input.r1} {input.r2} "
164
           "--report=minimal > {log}"
Sander Bollen's avatar
Sander Bollen committed
165
166

rule align:
Sander Bollen's avatar
Sander Bollen committed
167
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
168
    input:
169
170
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2,
van den Berg's avatar
van den Berg committed
171
        ref = config["reference"],
172
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
173
    params:
174
175
        rg = "@RG\\tID:{read_group}\\tSM:{sample}\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
van den Berg's avatar
van den Berg committed
176
    container: containers["bwa-0.7.17-picard-2.18.7"]
Sander Bollen's avatar
Sander Bollen committed
177
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
178
179
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
180
181
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

182
def markdup_bam_input(wildcards):
van den Berg's avatar
van den Berg committed
183
184
185
186
    """Generate the INPUT for each bam file """
    return ["INPUT={sample}/bams/{sample}-{read_group}.sorted.bam".format(
                sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)]
187

Sander Bollen's avatar
Sander Bollen committed
188
rule markdup:
Sander Bollen's avatar
Sander Bollen committed
189
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
190
    input:
191
        bam = lambda wildcards:
van den Berg's avatar
van den Berg committed
192
193
194
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
Sander Bollen's avatar
Sander Bollen committed
195
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
196
    output:
197
198
199
        bam = "{sample}/bams/{sample}.bam",
        bai = "{sample}/bams/{sample}.bai",
        metrics = "{sample}/bams/{sample}.metrics"
200
201
    params:
        bams=markdup_bam_input
van den Berg's avatar
van den Berg committed
202
    container: containers["picard"]
203
204
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
205
           "{params.bams} OUTPUT={output.bam} "
Sander Bollen's avatar
Sander Bollen committed
206
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
207
208
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
209
210
211
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
212
        bai = rules.markdup.output.bai
Sander Bollen's avatar
Sander Bollen committed
213
    output:
214
        bai = "{sample}/bams/{sample}.bam.bai"
van den Berg's avatar
van den Berg committed
215
    container: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
216
217
    shell: "cp {input.bai} {output.bai}"

218
219

def bqsr_bam_input(wildcards):
van den Berg's avatar
van den Berg committed
220
221
222
    """Generate the bam input string for each read group for BQSR"""
    template = "-I {sample}/bams/{sample}-{read_group}.sorted.bam"
    return " ".join([template.format(sample=wildcards.sample,
223
224
            read_group=rg) for rg in get_readgroup(wildcards)])

Sander Bollen's avatar
Sander Bollen committed
225
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
226
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
227
    input:
228
        bam = lambda wildcards:
van den Berg's avatar
van den Berg committed
229
230
231
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
van den Berg's avatar
van den Berg committed
232
233
        ref = config["reference"],
        vcfs = config["known_sites"]
234
    output: "{sample}/bams/{sample}.baserecal.grp"
235
    params:
van den Berg's avatar
van den Berg committed
236
237
238
        known_sites = " ".join(
                expand("-knownSites {vcf}", vcf=config["known_sites"])
        ),
239
        bams = bqsr_bam_input
van den Berg's avatar
van den Berg committed
240
    container: containers["gatk"]
241
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
242
           "BaseRecalibrator {params.bams} -o {output} -nct 8 "
Sander Bollen's avatar
Sander Bollen committed
243
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
244
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites}"
Sander Bollen's avatar
Sander Bollen committed
245

246
checkpoint scatterregions:
van den Berg's avatar
van den Berg committed
247
248
    """Scatter the reference genome"""
    input:
van den Berg's avatar
van den Berg committed
249
        ref = config["reference"],
250
    params:
van den Berg's avatar
van den Berg committed
251
        size = config['scatter_size']
van den Berg's avatar
van den Berg committed
252
    output:
253
        directory("scatter")
van den Berg's avatar
van den Berg committed
254
    container: containers["biopet-scatterregions"]
255
    shell: "mkdir -p {output} && "
256
           "biopet-scatterregions -Xmx24G "
257
           "--referenceFasta {input.ref} --scatterSize {params.size} "
van den Berg's avatar
van den Berg committed
258
259
           "--outputDir scatter"

260

Sander Bollen's avatar
Sander Bollen committed
261
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
262
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
263
    input:
264
        bam = rules.markdup.output.bam,
265
        bqsr = rules.baserecal.output,
van den Berg's avatar
van den Berg committed
266
267
268
        dbsnp = config["dbsnp"],
        ref = config["reference"],
        region = "scatter/scatter-{chunk}.bed"
Sander Bollen's avatar
Sander Bollen committed
269
    output:
van den Berg's avatar
van den Berg committed
270
271
        gvcf = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
272
    wildcard_constraints:
van den Berg's avatar
van den Berg committed
273
        chunk = "[0-9]+"
van den Berg's avatar
van den Berg committed
274
    container: containers["gatk"]
275
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
276
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
277
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
van den Berg's avatar
van den Berg committed
278
           "-L '{input.region}' -o '{output.gvcf}' "
Sander Bollen's avatar
Sander Bollen committed
279
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
280
281
282
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 "
Sander Bollen's avatar
Sander Bollen committed
283

284
285
286
def aggregate_gvcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
van den Berg's avatar
van den Berg committed
287
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
288
289
290
291

def aggregate_gvcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
292
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
293
294

rule gvcf_gather:
van den Berg's avatar
van den Berg committed
295
    """Join the gvcf files together"""
296
297
298
299
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
300
301
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
van den Berg's avatar
van den Berg committed
302
303
304
    container: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps "
           "--output {output.gvcf} --output-type z && "
305
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
306
307

rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
308
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
309
    input:
310
311
        gvcf = rules.gvcf_scatter.output.gvcf,
        tbi = rules.gvcf_scatter.output.gvcf_tbi,
van den Berg's avatar
van den Berg committed
312
        ref= config["reference"]
Sander Bollen's avatar
Sander Bollen committed
313
    output:
314
315
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
316
    wildcard_constraints:
van den Berg's avatar
van den Berg committed
317
318
319
320
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 "
           "/usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
321
           "GenotypeGVCFs -R {input.ref} "
322
           "-V {input.gvcf} -o '{output.vcf}'"
323
324


325
def aggregate_vcf(wildcards):
326
327
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
van den Berg's avatar
van den Berg committed
328
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
329
330


331
332
333
def aggregate_vcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
334
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
335
336


Sander Bollen's avatar
Sander Bollen committed
337
rule genotype_gather:
338
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
339
    input:
340
        vcfs = aggregate_vcf,
341
        vcfs_tbi = aggregate_vcf_tbi
Sander Bollen's avatar
Sander Bollen committed
342
    output:
343
344
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
van den Berg's avatar
van den Berg committed
345
346
347
    container: containers["bcftools"]
    shell: "bcftools concat {input.vcfs} --allow-overlaps "
           "--output {output.vcf} --output-type z && "
348
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
349
350


Sander Bollen's avatar
Sander Bollen committed
351
## bam metrics
352
rule mapped_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
353
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
354
    input:
355
356
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
357
    output:
van den Berg's avatar
van den Berg committed
358
359
360
        reads = "{sample}/bams/{sample}.mapped.num",
        bases = "{sample}/bams/{sample}.mapped.basenum"
    container: containers["samtools-1.7-python-3.6"]
361
362
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
363
364


365
rule unique_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
366
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
367
    input:
368
        bam = rules.markdup.output.bam,
van den Berg's avatar
van den Berg committed
369
        pywc = config["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
370
    output:
van den Berg's avatar
van den Berg committed
371
372
373
374
375
        reads = "{sample}/bams/{sample}.unique.num",
        bases = "{sample}/bams/{sample}.usable.basenum"
    container: containers["samtools-1.7-python-3.6"]
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | "
           "python {input.pywc} --reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
376
377
378


## fastqc
Sander Bollen's avatar
Sander Bollen committed
379
rule fastqc_raw:
van den Berg's avatar
van den Berg committed
380
    """Run fastqc on raw fastq files"""
Sander Bollen's avatar
Sander Bollen committed
381
    input:
van den Berg's avatar
van den Berg committed
382
383
        r1 = get_forward,
        r2 = get_reverse
Sander Bollen's avatar
Sander Bollen committed
384
    output:
van den Berg's avatar
van den Berg committed
385
        directory("{sample}/pre_process/raw-{sample}-{read_group}/")
van den Berg's avatar
van den Berg committed
386
    container: containers["fastqc"]
van den Berg's avatar
van den Berg committed
387
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
388
389


Sander Bollen's avatar
Sander Bollen committed
390
rule fastqc_postqc:
van den Berg's avatar
van den Berg committed
391
    """Run fastqc on fastq files post pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
392
    input:
393
394
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2
Sander Bollen's avatar
Sander Bollen committed
395
    output:
van den Berg's avatar
van den Berg committed
396
        directory("{sample}/pre_process/trimmed-{sample}-{read_group}/")
van den Berg's avatar
van den Berg committed
397
    container: containers["fastqc"]
van den Berg's avatar
van den Berg committed
398
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
399
400


401
## coverage
Sander Bollen's avatar
Sander Bollen committed
402
rule covstats:
Sander Bollen's avatar
Sander Bollen committed
403
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
404
    input:
405
        bam = rules.markdup.output.bam,
van den Berg's avatar
van den Berg committed
406
407
408
        genome = "current.genome",
        covpy = config["covstats"],
        bed = config.get("bedfile","")
Sander Bollen's avatar
Sander Bollen committed
409
    params:
van den Berg's avatar
van den Berg committed
410
        subt = "Sample {sample}"
Sander Bollen's avatar
Sander Bollen committed
411
    output:
van den Berg's avatar
van den Berg committed
412
413
414
        covj = "{sample}/coverage/covstats.json",
        covp = "{sample}/coverage/covstats.png"
    container: containers["bedtools-2.26-python-2.7"]
Sander Bollen's avatar
Sander Bollen committed
415
416
417
418
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
419
420


Sander Bollen's avatar
Sander Bollen committed
421
422
423
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
424
425
        gvcf = rules.gvcf_gather.output.gvcf,
        tbi = rules.gvcf_gather.output.gvcf_tbi,
van den Berg's avatar
van den Berg committed
426
        ref = config.get('refflat', "")
Sander Bollen's avatar
Sander Bollen committed
427
    output:
van den Berg's avatar
van den Berg committed
428
429
        tsv = "{sample}/coverage/refFlat_coverage.tsv"
    container: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
430
431
432
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


433
rule collect_cutadapt_summary:
van den Berg's avatar
van den Berg committed
434
    """Colect cutadapt summary from each readgroup per sample """
435
436
    input:
        cutadapt = lambda wildcards:
van den Berg's avatar
van den Berg committed
437
438
439
        ("{sample}/pre_process/{sample}-{read_group}.txt".format(
            sample=wildcards.sample, read_group=read_group)
            for read_group in get_readgroup(wildcards)),
440
441
        cutadapt_summary= config["cutadapt_summary"]
    output: "{sample}/cutadapt.json"
van den Berg's avatar
van den Berg committed
442
    container: containers["python3"]
443
444
445
446
    shell: "python {input.cutadapt_summary} --sample {wildcards.sample} "
           "--cutadapt-summary {input.cutadapt} > {output}"


447
## collection
van den Berg's avatar
van den Berg committed
448
if "bedfile" in config:
449
450
451
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
452
453
454
455
456
457
            mnum = rules.mapped_reads_bases.output.reads,
            mbnum = rules.mapped_reads_bases.output.bases,
            unum = rules.unique_reads_bases.output.reads,
            ubnum = rules.unique_reads_bases.output.bases,
            cov = rules.covstats.output.covj,
            cutadapt = rules.collect_cutadapt_summary.output,
van den Berg's avatar
van den Berg committed
458
            colpy = config["collect_stats"]
459
        params:
van den Berg's avatar
van den Berg committed
460
            fthresh = config["female_threshold"]
461
        output: "{sample}/{sample}.stats.json"
van den Berg's avatar
van den Berg committed
462
463
        container: containers["vtools"]
        shell: "python {input.colpy} --sample-name {wildcards.sample} "
464
465
466
467
468
469
470
471
472
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
               "--female-threshold {params.fthresh} "
               "--cutadapt {input.cutadapt} "
               "{input.cov} > {output}"
else:
    rule collectstats:
        """Collect all stats for a particular sample without beds"""
        input:
473
474
475
476
477
            mnum = rules.mapped_reads_bases.output.reads,
            mbnum = rules.mapped_reads_bases.output.bases,
            unum = rules.unique_reads_bases.output.reads,
            ubnum = rules.unique_reads_bases.output.bases,
            cutadapt = rules.collect_cutadapt_summary.output,
van den Berg's avatar
van den Berg committed
478
            colpy = config["collect_stats"]
479
        params:
van den Berg's avatar
van den Berg committed
480
            fthresh = config["female_threshold"]
481
        output: "{sample}/{sample}.stats.json"
van den Berg's avatar
van den Berg committed
482
483
        container: containers["vtools"]
        shell: "python {input.colpy} --sample-name {wildcards.sample} "
484
485
486
487
488
489
490
491
492
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
               "--female-threshold {params.fthresh} "
               "--cutadapt {input.cutadapt} "
               "> {output}"

rule merge_stats:
    """Merge all stats of all samples"""
    input:
van den Berg's avatar
van den Berg committed
493
494
495
        cols = expand("{sample}/{sample}.stats.json",
                      sample=config['samples']),
        mpy = config["merge_stats"]
496
    output: "stats.json"
van den Berg's avatar
van den Berg committed
497
    container: containers["vtools"]
498
    shell: "python {input.mpy} --collectstats {input.cols} "
499
           "> {output}"
500
501
502
503
504


rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
505
        stats = rules.merge_stats.output,
van den Berg's avatar
van den Berg committed
506
507
508
509
        sc = config["stats_to_tsv"]
    output: "stats.tsv"
    container: containers["python3"]
    shell: "python {input.sc} -i {input.stats} > {output}"
Sander Bollen's avatar
Sander Bollen committed
510
511


van den Berg's avatar
van den Berg committed
512
513
514
rule multiple_metrics:
    """Run picard CollectMultipleMetrics"""
    input:
515
        bam = rules.markdup.output.bam,
van den Berg's avatar
van den Berg committed
516
517
        ref = config["reference"],
    params:
van den Berg's avatar
van den Berg committed
518
        prefix = "{sample}/bams/{sample}",
van den Berg's avatar
van den Berg committed
519
    output:
van den Berg's avatar
van den Berg committed
520
521
522
        alignment = "{sample}/bams/{sample}.alignment_summary_metrics",
        insert = "{sample}/bams/{sample}.insert_size_metrics"
    container: containers["picard"]
van den Berg's avatar
van den Berg committed
523
524
525
526
527
528
    shell: "picard CollectMultipleMetrics "
           "I={input.bam} O={params.prefix} "
           "R={input.ref} "
           "PROGRAM=CollectAlignmentSummaryMetrics "
           "PROGRAM=CollectInsertSizeMetrics "

Sander Bollen's avatar
Sander Bollen committed
529
530
531
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
532
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
533
534
    """
    input:
van den Berg's avatar
van den Berg committed
535
536
537
538
539
540
541
542
543
544
545
546
        stats = rules.stats_tsv.output,
        bam = expand("{sample}/bams/{sample}.bam", sample=config["samples"]),
        metric = expand("{sample}/bams/{sample}.metrics",
                        sample=config["samples"]),
        alignment_metrics = expand(
                "{sample}/bams/{sample}.alignment_summary_metrics",
                sample=config["samples"]
        ),
        insert_metrics = expand(
                "{sample}/bams/{sample}.insert_size_metrics",
                sample=config["samples"]
        ),
547
    output: "multiqc_report/multiqc_report.html"
van den Berg's avatar
van den Berg committed
548
549
550
    container: containers["multiqc"]
    shell: "multiqc --data-format json --force --outdir multiqc_report . "
           "|| touch {output}"