Snakefile 19.3 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

van den Berg's avatar
van den Berg committed
31
# Read the json schema
32
with open(srcdir('config/schema.json'), 'rt') as fin:
van den Berg's avatar
van den Berg committed
33
34
    schema = json.load(fin)

van den Berg's avatar
van den Berg committed
35
# Validate the config against the schema
van den Berg's avatar
van den Berg committed
36
try:
van den Berg's avatar
van den Berg committed
37
    jsonschema.validate(config, schema)
van den Berg's avatar
van den Berg committed
38
39
40
except jsonschema.ValidationError as e:
    raise jsonschema.ValidationError(f'Invalid CONFIG_JSON: {e}')

41

42
43
# Set default values
def set_default(key, value):
van den Berg's avatar
van den Berg committed
44
    """Set default config values"""
van den Berg's avatar
van den Berg committed
45
46
    if key not in config:
        config[key] = value
van den Berg's avatar
van den Berg committed
47

van den Berg's avatar
van den Berg committed
48
# Set the default config
van den Berg's avatar
van den Berg committed
49
50
51
52
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Set the script paths
53
54
55
56
57
58
set_default("covstats", srcdir("src/covstats.py"))
set_default("collect_stats", srcdir("src/collect_stats.py"))
set_default("merge_stats", srcdir("src/merge_stats.py"))
set_default("stats_to_tsv", srcdir("src/stats_to_tsv.py"))
set_default("py_wordcount", srcdir("src/pywc.py"))
set_default("cutadapt_summary", srcdir("src/cutadapt_summary.py"))
Sander Bollen's avatar
Sander Bollen committed
59

60
61
62
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
van den Berg's avatar
van den Berg committed
63
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
64
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
van den Berg's avatar
van den Berg committed
65
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
66
67
68
    "debian": "docker://debian:buster-slim",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
69
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.8--py_2",
van den Berg's avatar
van den Berg committed
70
    "picard": "docker://quay.io/biocontainers/picard:2.22.8--0",
71
72
73
74
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
75

76
def get_readgroup(wildcards):
77
    return config["samples"][wildcards.sample]["read_groups"]
Sander Bollen's avatar
Sander Bollen committed
78

van den Berg's avatar
van den Berg committed
79
def get_readgroup_per_sample():
van den Berg's avatar
van den Berg committed
80
    for sample in config["samples"]:
81
        for rg in config["samples"][sample]["read_groups"]:
van den Berg's avatar
van den Berg committed
82
83
            yield rg, sample

84
def coverage_stats(wildcards):
van den Berg's avatar
van den Berg committed
85
86
    files = expand("{sample}/coverage/refFlat_coverage.tsv",
                   sample=config["samples"])
van den Berg's avatar
van den Berg committed
87
    return files if "refflat" in config else []
Sander Bollen's avatar
Sander Bollen committed
88

Sander Bollen's avatar
Sander Bollen committed
89
90
rule all:
    input:
van den Berg's avatar
van den Berg committed
91
        multiqc = "multiqc_report/multiqc_report.html",
92
93
        stats = "stats.json",
        stats_tsv = "stats.tsv",
van den Berg's avatar
van den Berg committed
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117

        bam = expand("{sample}/bams/{sample}.bam",
                     sample=config["samples"]),

        vcfs = expand("{sample}/vcf/{sample}.vcf.gz",
                      sample=config["samples"]),

        vcf_tbi = expand("{sample}/vcf/{sample}.vcf.gz.tbi",
                         sample=config["samples"]),

        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz",
                       sample=config["samples"]),

        gvcf_tbi = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi",
                          sample=config["samples"]),

        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        fastqc_trim = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/"
                      for read_group, sample in get_readgroup_per_sample()),

        cutadapt = (f"{sample}/pre_process/{sample}-{read_group}.txt"
                    for read_group, sample in get_readgroup_per_sample()),
van den Berg's avatar
van den Berg committed
118
        coverage_stats = coverage_stats,
Sander Bollen's avatar
Sander Bollen committed
119

Sander Bollen's avatar
Sander Bollen committed
120
121
rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
122
    output: directory("tmp")
van den Berg's avatar
van den Berg committed
123
    container: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
124
125
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
126
rule genome:
Sander Bollen's avatar
Sander Bollen committed
127
    """Create genome file as used by bedtools"""
van den Berg's avatar
van den Berg committed
128
    input: config["reference"]
Sander Bollen's avatar
Sander Bollen committed
129
    output: "current.genome"
van den Berg's avatar
van den Berg committed
130
    container: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
131
132
133
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
134
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
135
    input:
136
137
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
Sander Bollen's avatar
Sander Bollen committed
138
    output:
van den Berg's avatar
van den Berg committed
139
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
van den Berg's avatar
van den Berg committed
140
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
141
142
    log:
        "{sample}/pre_process/{sample}-{read_group}.txt"
van den Berg's avatar
van den Berg committed
143
    container: containers["cutadapt"]
van den Berg's avatar
van den Berg committed
144
145
146
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--output {output.r1} --paired-output {output.r2} -Z "
van den Berg's avatar
van den Berg committed
147
           "{input.r1} {input.r2} "
148
           "--report=minimal > {log}"
Sander Bollen's avatar
Sander Bollen committed
149
150

rule align:
Sander Bollen's avatar
Sander Bollen committed
151
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
152
    input:
153
154
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2,
van den Berg's avatar
van den Berg committed
155
        ref = config["reference"],
156
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
157
    params:
158
159
        rg = "@RG\\tID:{read_group}\\tSM:{sample}\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
van den Berg's avatar
van den Berg committed
160
    container: containers["bwa-0.7.17-picard-2.18.7"]
Sander Bollen's avatar
Sander Bollen committed
161
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
162
163
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
164
165
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

166
def markdup_bam_input(wildcards):
van den Berg's avatar
van den Berg committed
167
168
169
170
    """Generate the INPUT for each bam file """
    return ["INPUT={sample}/bams/{sample}-{read_group}.sorted.bam".format(
                sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)]
171

Sander Bollen's avatar
Sander Bollen committed
172
rule markdup:
Sander Bollen's avatar
Sander Bollen committed
173
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
174
    input:
175
        bam = lambda wildcards:
van den Berg's avatar
van den Berg committed
176
177
178
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
179
        #bam = lambda wc: ('f{wc.sample}/bams/{wc.sample}-{readgroup}' for read_group in config['samples'][wc.sample}]['read_groups']),
Sander Bollen's avatar
Sander Bollen committed
180
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
181
    output:
182
183
184
        bam = "{sample}/bams/{sample}.bam",
        bai = "{sample}/bams/{sample}.bai",
        metrics = "{sample}/bams/{sample}.metrics"
185
186
    params:
        bams=markdup_bam_input
van den Berg's avatar
van den Berg committed
187
    container: containers["picard"]
188
189
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
190
           "{params.bams} OUTPUT={output.bam} "
Sander Bollen's avatar
Sander Bollen committed
191
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
192
193
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
194
195
196
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
197
        bai = rules.markdup.output.bai
Sander Bollen's avatar
Sander Bollen committed
198
    output:
199
        bai = "{sample}/bams/{sample}.bam.bai"
van den Berg's avatar
van den Berg committed
200
    container: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
201
202
    shell: "cp {input.bai} {output.bai}"

203
def bqsr_bam_input(wildcards):
van den Berg's avatar
van den Berg committed
204
205
206
    """Generate the bam input string for each read group for BQSR"""
    template = "-I {sample}/bams/{sample}-{read_group}.sorted.bam"
    return " ".join([template.format(sample=wildcards.sample,
207
208
            read_group=rg) for rg in get_readgroup(wildcards)])

Sander Bollen's avatar
Sander Bollen committed
209
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
210
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
211
    input:
212
        bam = lambda wildcards:
van den Berg's avatar
van den Berg committed
213
214
215
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(
            sample=wildcards.sample, read_group=rg)
            for rg in get_readgroup(wildcards)),
van den Berg's avatar
van den Berg committed
216
217
        ref = config["reference"],
        vcfs = config["known_sites"]
218
    output: "{sample}/bams/{sample}.baserecal.grp"
219
    params:
van den Berg's avatar
van den Berg committed
220
221
222
        known_sites = " ".join(
                expand("-knownSites {vcf}", vcf=config["known_sites"])
        ),
223
        bams = bqsr_bam_input
van den Berg's avatar
van den Berg committed
224
    container: containers["gatk"]
225
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
226
           "BaseRecalibrator {params.bams} -o {output} -nct 8 "
Sander Bollen's avatar
Sander Bollen committed
227
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
228
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites}"
Sander Bollen's avatar
Sander Bollen committed
229

230
checkpoint scatterregions:
van den Berg's avatar
van den Berg committed
231
232
    """Scatter the reference genome"""
    input:
van den Berg's avatar
van den Berg committed
233
        ref = config["reference"],
234
    params:
van den Berg's avatar
van den Berg committed
235
        size = config['scatter_size']
van den Berg's avatar
van den Berg committed
236
    output:
237
        directory("scatter")
van den Berg's avatar
van den Berg committed
238
    container: containers["biopet-scatterregions"]
239
    shell: "mkdir -p {output} && "
240
           "biopet-scatterregions -Xmx24G "
241
           "--referenceFasta {input.ref} --scatterSize {params.size} "
van den Berg's avatar
van den Berg committed
242
243
           "--outputDir scatter"

Sander Bollen's avatar
Sander Bollen committed
244
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
245
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
246
    input:
247
        bam = rules.markdup.output.bam,
248
        bqsr = rules.baserecal.output,
van den Berg's avatar
van den Berg committed
249
250
251
        dbsnp = config["dbsnp"],
        ref = config["reference"],
        region = "scatter/scatter-{chunk}.bed"
Sander Bollen's avatar
Sander Bollen committed
252
    output:
van den Berg's avatar
van den Berg committed
253
254
        gvcf = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
255
    wildcard_constraints:
van den Berg's avatar
van den Berg committed
256
        chunk = "[0-9]+"
van den Berg's avatar
van den Berg committed
257
    container: containers["gatk"]
258
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
259
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
260
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
van den Berg's avatar
van den Berg committed
261
           "-L '{input.region}' -o '{output.gvcf}' "
Sander Bollen's avatar
Sander Bollen committed
262
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
263
264
265
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 "
Sander Bollen's avatar
Sander Bollen committed
266

267
268
269
def aggregate_gvcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
van den Berg's avatar
van den Berg committed
270
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
271
272
273
274

def aggregate_gvcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
275
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
276
277

rule gvcf_gather:
van den Berg's avatar
van den Berg committed
278
    """Join the gvcf files together"""
279
280
281
282
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
283
284
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
van den Berg's avatar
van den Berg committed
285
286
287
    container: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps "
           "--output {output.gvcf} --output-type z && "
288
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
289
290

rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
291
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
292
    input:
293
294
        gvcf = rules.gvcf_scatter.output.gvcf,
        tbi = rules.gvcf_scatter.output.gvcf_tbi,
van den Berg's avatar
van den Berg committed
295
        ref= config["reference"]
Sander Bollen's avatar
Sander Bollen committed
296
    output:
297
298
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
299
    wildcard_constraints:
van den Berg's avatar
van den Berg committed
300
301
302
303
        chunk = "[0-9]+"
    container: containers["gatk"]
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 "
           "/usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
304
           "GenotypeGVCFs -R {input.ref} "
305
           "-V {input.gvcf} -o '{output.vcf}'"
306

307
def aggregate_vcf(wildcards):
308
309
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
van den Berg's avatar
van den Berg committed
310
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
311

312
313
314
def aggregate_vcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
315
       i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
316

Sander Bollen's avatar
Sander Bollen committed
317
rule genotype_gather:
318
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
319
    input:
320
        vcfs = aggregate_vcf,
321
        vcfs_tbi = aggregate_vcf_tbi
Sander Bollen's avatar
Sander Bollen committed
322
    output:
323
324
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
van den Berg's avatar
van den Berg committed
325
326
327
    container: containers["bcftools"]
    shell: "bcftools concat {input.vcfs} --allow-overlaps "
           "--output {output.vcf} --output-type z && "
328
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
329

Sander Bollen's avatar
Sander Bollen committed
330
## bam metrics
331
rule mapped_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
332
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
333
    input:
334
335
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
336
    output:
van den Berg's avatar
van den Berg committed
337
338
339
        reads = "{sample}/bams/{sample}.mapped.num",
        bases = "{sample}/bams/{sample}.mapped.basenum"
    container: containers["samtools-1.7-python-3.6"]
340
341
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
342

343
rule unique_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
344
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
345
    input:
346
        bam = rules.markdup.output.bam,
van den Berg's avatar
van den Berg committed
347
        pywc = config["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
348
    output:
van den Berg's avatar
van den Berg committed
349
350
351
352
353
        reads = "{sample}/bams/{sample}.unique.num",
        bases = "{sample}/bams/{sample}.usable.basenum"
    container: containers["samtools-1.7-python-3.6"]
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | "
           "python {input.pywc} --reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
354
355

## fastqc
Sander Bollen's avatar
Sander Bollen committed
356
rule fastqc_raw:
van den Berg's avatar
van den Berg committed
357
    """Run fastqc on raw fastq files"""
Sander Bollen's avatar
Sander Bollen committed
358
    input:
359
360
        r1 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R1']),
        r2 = lambda wc: (config['samples'][wc.sample]['read_groups'][wc.read_group]['R2']),
Sander Bollen's avatar
Sander Bollen committed
361
    output:
van den Berg's avatar
van den Berg committed
362
        directory("{sample}/pre_process/raw-{sample}-{read_group}/")
van den Berg's avatar
van den Berg committed
363
    container: containers["fastqc"]
van den Berg's avatar
van den Berg committed
364
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
365

Sander Bollen's avatar
Sander Bollen committed
366
rule fastqc_postqc:
van den Berg's avatar
van den Berg committed
367
    """Run fastqc on fastq files post pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
368
    input:
369
370
        r1 = rules.cutadapt.output.r1,
        r2 = rules.cutadapt.output.r2
Sander Bollen's avatar
Sander Bollen committed
371
    output:
van den Berg's avatar
van den Berg committed
372
        directory("{sample}/pre_process/trimmed-{sample}-{read_group}/")
van den Berg's avatar
van den Berg committed
373
    container: containers["fastqc"]
van den Berg's avatar
van den Berg committed
374
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
375

376
## coverage
Sander Bollen's avatar
Sander Bollen committed
377
rule covstats:
Sander Bollen's avatar
Sander Bollen committed
378
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
379
    input:
380
        bam = rules.markdup.output.bam,
van den Berg's avatar
van den Berg committed
381
382
383
        genome = "current.genome",
        covpy = config["covstats"],
        bed = config.get("bedfile","")
Sander Bollen's avatar
Sander Bollen committed
384
    params:
van den Berg's avatar
van den Berg committed
385
        subt = "Sample {sample}"
Sander Bollen's avatar
Sander Bollen committed
386
    output:
van den Berg's avatar
van den Berg committed
387
388
389
        covj = "{sample}/coverage/covstats.json",
        covp = "{sample}/coverage/covstats.png"
    container: containers["bedtools-2.26-python-2.7"]
Sander Bollen's avatar
Sander Bollen committed
390
391
392
393
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
394

Sander Bollen's avatar
Sander Bollen committed
395
396
397
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
398
399
        gvcf = rules.gvcf_gather.output.gvcf,
        tbi = rules.gvcf_gather.output.gvcf_tbi,
van den Berg's avatar
van den Berg committed
400
        ref = config.get('refflat', "")
Sander Bollen's avatar
Sander Bollen committed
401
    output:
van den Berg's avatar
van den Berg committed
402
403
        tsv = "{sample}/coverage/refFlat_coverage.tsv"
    container: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
404
405
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"

406
rule collect_cutadapt_summary:
van den Berg's avatar
van den Berg committed
407
    """Colect cutadapt summary from each readgroup per sample """
408
409
    input:
        cutadapt = lambda wildcards:
van den Berg's avatar
van den Berg committed
410
411
412
        ("{sample}/pre_process/{sample}-{read_group}.txt".format(
            sample=wildcards.sample, read_group=read_group)
            for read_group in get_readgroup(wildcards)),
413
414
        cutadapt_summary= config["cutadapt_summary"]
    output: "{sample}/cutadapt.json"
van den Berg's avatar
van den Berg committed
415
    container: containers["python3"]
416
417
418
    shell: "python {input.cutadapt_summary} --sample {wildcards.sample} "
           "--cutadapt-summary {input.cutadapt} > {output}"

419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
rule collectstats:
    """Collect all stats for a particular sample"""
    input:
        mnum = rules.mapped_reads_bases.output.reads,
        mbnum = rules.mapped_reads_bases.output.bases,
        unum = rules.unique_reads_bases.output.reads,
        ubnum = rules.unique_reads_bases.output.bases,
        cov = rules.covstats.output.covj if "bedfile" in config else ".",
        cutadapt = rules.collect_cutadapt_summary.output,
        colpy = config["collect_stats"]
    params:
        fthresh = config["female_threshold"]
    output: "{sample}/{sample}.stats.json"
    container: containers["python3"]
    shell: "python {input.colpy} --sample-name {wildcards.sample} "
           "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
           "--unique-num {input.unum} --usable-basenum {input.ubnum} "
           "--female-threshold {params.fthresh} "
           "--cutadapt {input.cutadapt} "
           "{input.cov} > {output}"
439
440
441
442

rule merge_stats:
    """Merge all stats of all samples"""
    input:
van den Berg's avatar
van den Berg committed
443
444
445
        cols = expand("{sample}/{sample}.stats.json",
                      sample=config['samples']),
        mpy = config["merge_stats"]
446
    output: "stats.json"
van den Berg's avatar
van den Berg committed
447
    container: containers["vtools"]
448
    shell: "python {input.mpy} --collectstats {input.cols} "
449
           "> {output}"
450
451
452
453

rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
454
        stats = rules.merge_stats.output,
van den Berg's avatar
van den Berg committed
455
456
457
458
        sc = config["stats_to_tsv"]
    output: "stats.tsv"
    container: containers["python3"]
    shell: "python {input.sc} -i {input.stats} > {output}"
Sander Bollen's avatar
Sander Bollen committed
459

van den Berg's avatar
van den Berg committed
460
461
462
rule multiple_metrics:
    """Run picard CollectMultipleMetrics"""
    input:
463
        bam = rules.markdup.output.bam,
van den Berg's avatar
van den Berg committed
464
465
        ref = config["reference"],
    params:
van den Berg's avatar
van den Berg committed
466
        prefix = "{sample}/bams/{sample}",
van den Berg's avatar
van den Berg committed
467
    output:
van den Berg's avatar
van den Berg committed
468
469
470
        alignment = "{sample}/bams/{sample}.alignment_summary_metrics",
        insert = "{sample}/bams/{sample}.insert_size_metrics"
    container: containers["picard"]
van den Berg's avatar
van den Berg committed
471
472
473
474
475
476
    shell: "picard CollectMultipleMetrics "
           "I={input.bam} O={params.prefix} "
           "R={input.ref} "
           "PROGRAM=CollectAlignmentSummaryMetrics "
           "PROGRAM=CollectInsertSizeMetrics "

Sander Bollen's avatar
Sander Bollen committed
477
478
479
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
480
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
481
482
    """
    input:
van den Berg's avatar
van den Berg committed
483
484
485
486
487
488
489
490
491
492
493
494
        stats = rules.stats_tsv.output,
        bam = expand("{sample}/bams/{sample}.bam", sample=config["samples"]),
        metric = expand("{sample}/bams/{sample}.metrics",
                        sample=config["samples"]),
        alignment_metrics = expand(
                "{sample}/bams/{sample}.alignment_summary_metrics",
                sample=config["samples"]
        ),
        insert_metrics = expand(
                "{sample}/bams/{sample}.insert_size_metrics",
                sample=config["samples"]
        ),
495
    output: "multiqc_report/multiqc_report.html"
van den Berg's avatar
van den Berg committed
496
497
498
    container: containers["multiqc"]
    shell: "multiqc --data-format json --force --outdir multiqc_report . "
           "|| touch {output}"