Snakefile 20.2 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

van den Berg's avatar
van den Berg committed
31
# Read the json schema
32
with open(srcdir('config/schema.json'), 'rt') as fin:
van den Berg's avatar
van den Berg committed
33
34
    schema = json.load(fin)

van den Berg's avatar
van den Berg committed
35
# Validate the config against the schema
van den Berg's avatar
van den Berg committed
36
try:
van den Berg's avatar
van den Berg committed
37
    jsonschema.validate(config, schema)
van den Berg's avatar
van den Berg committed
38
39
40
except jsonschema.ValidationError as e:
    raise jsonschema.ValidationError(f'Invalid CONFIG_JSON: {e}')

41

42
43
44
# Set default values
def set_default(key, value):
    """ Set default config values """
van den Berg's avatar
van den Berg committed
45
46
    if key not in config:
        config[key] = value
van den Berg's avatar
van den Berg committed
47

van den Berg's avatar
van den Berg committed
48
# Set the default config
van den Berg's avatar
van den Berg committed
49
50
51
52
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Set the script paths
53
54
55
56
57
58
set_default("covstats", srcdir("src/covstats.py"))
set_default("collect_stats", srcdir("src/collect_stats.py"))
set_default("merge_stats", srcdir("src/merge_stats.py"))
set_default("stats_to_tsv", srcdir("src/stats_to_tsv.py"))
set_default("py_wordcount", srcdir("src/pywc.py"))
set_default("cutadapt_summary", srcdir("src/cutadapt_summary.py"))
Sander Bollen's avatar
Sander Bollen committed
59

60
61
62
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
van den Berg's avatar
van den Berg committed
63
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
64
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
van den Berg's avatar
van den Berg committed
65
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
66
67
68
    "debian": "docker://debian:buster-slim",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
69
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.8--py_2",
van den Berg's avatar
van den Berg committed
70
    "picard": "docker://quay.io/biocontainers/picard:2.22.8--0",
71
72
73
74
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
75

76
77
78
def get_forward(wildcards):
    """ Get the forward fastq file from the config """
    return (
79
        config["samples"][wildcards.sample]["read_groups"]
80
81
82
83
84
85
                [wildcards.read_group]["R1"]
    )

def get_reverse(wildcards):
    """ Get the reverse fastq file from the config """
    return (
86
        config["samples"][wildcards.sample]["read_groups"]
87
88
89
90
            [wildcards.read_group]["R2"]
    )

def get_readgroup(wildcards):
91
    return config["samples"][wildcards.sample]["read_groups"]
Sander Bollen's avatar
Sander Bollen committed
92

van den Berg's avatar
van den Berg committed
93
def get_readgroup_per_sample():
van den Berg's avatar
van den Berg committed
94
    for sample in config["samples"]:
95
        for rg in config["samples"][sample]["read_groups"]:
van den Berg's avatar
van den Berg committed
96
97
98
            yield rg, sample


99
def coverage_stats(wildcards):
van den Berg's avatar
van den Berg committed
100
101
    files = expand("{sample}/coverage/refFlat_coverage.tsv", sample=config["samples"])
    return files if "refflat" in config else []
Sander Bollen's avatar
Sander Bollen committed
102

Sander Bollen's avatar
Sander Bollen committed
103
104
rule all:
    input:
van den Berg's avatar
van den Berg committed
105
        multiqc = "multiqc_report/multiqc_report.html",
106
107
        stats = "stats.json",
        stats_tsv = "stats.tsv",
108
        bam = expand("{sample}/bams/{sample}.bam", sample=config["samples"]),
van den Berg's avatar
van den Berg committed
109
110
111
112
        vcfs = expand("{sample}/vcf/{sample}.vcf.gz", sample=config["samples"]),
        vcf_tbi = expand("{sample}/vcf/{sample}.vcf.gz.tbi", sample=config["samples"]),
        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz", sample=config["samples"]),
        gvcf_tbi = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi", sample=config["samples"]),
van den Berg's avatar
van den Berg committed
113
114
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
        fastqc_trimmed = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
van den Berg's avatar
van den Berg committed
115
116
        cutadapt = (f"{sample}/pre_process/{sample}-{read_group}.txt" for read_group, sample in get_readgroup_per_sample()),
        coverage_stats = coverage_stats,
Sander Bollen's avatar
Sander Bollen committed
117

Sander Bollen's avatar
Sander Bollen committed
118
119
120

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
121
    output: directory("tmp")
122
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
123
124
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
125
rule genome:
Sander Bollen's avatar
Sander Bollen committed
126
    """Create genome file as used by bedtools"""
van den Berg's avatar
van den Berg committed
127
    input: config["reference"]
Sander Bollen's avatar
Sander Bollen committed
128
    output: "current.genome"
129
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
130
131
132
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
133
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
134
    input:
van den Berg's avatar
van den Berg committed
135
136
        r1 = get_forward,
        r2 = get_reverse
Sander Bollen's avatar
Sander Bollen committed
137
    output:
van den Berg's avatar
van den Berg committed
138
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
van den Berg's avatar
van den Berg committed
139
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
140
141
    log:
        "{sample}/pre_process/{sample}-{read_group}.txt"
142
    singularity: containers["cutadapt"]
van den Berg's avatar
van den Berg committed
143
144
145
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--output {output.r1} --paired-output {output.r2} -Z "
van den Berg's avatar
van den Berg committed
146
           "{input.r1} {input.r2} "
147
           "--report=minimal > {log}"
Sander Bollen's avatar
Sander Bollen committed
148
149

rule align:
Sander Bollen's avatar
Sander Bollen committed
150
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
151
    input:
van den Berg's avatar
van den Berg committed
152
153
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
van den Berg's avatar
van den Berg committed
154
        ref = config["reference"],
155
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
156
    params:
157
158
        rg = "@RG\\tID:{read_group}\\tSM:{sample}\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
159
    singularity: containers["bwa-0.7.17-picard-2.18.7"]
Sander Bollen's avatar
Sander Bollen committed
160
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
161
162
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
163
164
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

165
166
167
168
169
def markdup_bam_input(wildcards):
    """ Generate the INPUT for each bam file """
    return ["INPUT={sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
            read_group=rg) for rg in get_readgroup(wildcards)]

Sander Bollen's avatar
Sander Bollen committed
170
rule markdup:
Sander Bollen's avatar
Sander Bollen committed
171
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
172
    input:
173
174
175
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
        read_group=rg) for rg in get_readgroup(wildcards)),
Sander Bollen's avatar
Sander Bollen committed
176
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
177
    output:
178
179
180
        bam = "{sample}/bams/{sample}.bam",
        bai = "{sample}/bams/{sample}.bai",
        metrics = "{sample}/bams/{sample}.metrics"
181
182
    params:
        bams=markdup_bam_input
van den Berg's avatar
van den Berg committed
183
    singularity: containers["picard"]
184
185
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
186
           "{params.bams} OUTPUT={output.bam} "
Sander Bollen's avatar
Sander Bollen committed
187
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
188
189
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
190
191
192
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
193
        bai = rules.markdup.output.bai
Sander Bollen's avatar
Sander Bollen committed
194
    output:
195
        bai = "{sample}/bams/{sample}.bam.bai"
196
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
197
198
    shell: "cp {input.bai} {output.bai}"

199
200
201
202
203
204

def bqsr_bam_input(wildcards):
    """ Generate the bam input string for each read group for BQSR"""
    return " ".join(["-I {sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
            read_group=rg) for rg in get_readgroup(wildcards)])

Sander Bollen's avatar
Sander Bollen committed
205
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
206
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
207
    input:
208
209
210
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
        read_group=rg) for rg in get_readgroup(wildcards)),
van den Berg's avatar
van den Berg committed
211
212
        ref = config["reference"],
        vcfs = config["known_sites"]
Sander Bollen's avatar
Sander Bollen committed
213
    output:
Sander Bollen's avatar
Sander Bollen committed
214
        grp = "{sample}/bams/{sample}.baserecal.grp"
215
    params:
van den Berg's avatar
van den Berg committed
216
        known_sites = " ".join(expand("-knownSites {vcf}", vcf=config["known_sites"])),
217
        bams = bqsr_bam_input
218
    singularity: containers["gatk"]
219
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
220
           "BaseRecalibrator {params.bams} -o {output.grp} -nct 8 "
Sander Bollen's avatar
Sander Bollen committed
221
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
222
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites}"
Sander Bollen's avatar
Sander Bollen committed
223

224
checkpoint scatterregions:
van den Berg's avatar
van den Berg committed
225
226
    """Scatter the reference genome"""
    input:
van den Berg's avatar
van den Berg committed
227
        ref = config["reference"],
228
    params:
van den Berg's avatar
van den Berg committed
229
        size = config['scatter_size']
van den Berg's avatar
van den Berg committed
230
    output:
231
        directory("scatter")
van den Berg's avatar
van den Berg committed
232
    singularity: containers["biopet-scatterregions"]
233
    shell: "mkdir -p {output} && "
234
           "biopet-scatterregions -Xmx24G "
235
           "--referenceFasta {input.ref} --scatterSize {params.size} "
van den Berg's avatar
van den Berg committed
236
237
           "--outputDir scatter"

238

Sander Bollen's avatar
Sander Bollen committed
239
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
240
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
241
    input:
242
        bam = rules.markdup.output.bam,
243
        bqsr = "{sample}/bams/{sample}.baserecal.grp",
van den Berg's avatar
van den Berg committed
244
245
246
        dbsnp = config["dbsnp"],
        ref = config["reference"],
        region = "scatter/scatter-{chunk}.bed"
Sander Bollen's avatar
Sander Bollen committed
247
    output:
van den Berg's avatar
van den Berg committed
248
249
        gvcf = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
250
    wildcard_constraints:
van den Berg's avatar
van den Berg committed
251
        chunk = "[0-9]+"
252
    singularity: containers["gatk"]
253
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
254
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
255
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
van den Berg's avatar
van den Berg committed
256
           "-L '{input.region}' -o '{output.gvcf}' "
Sander Bollen's avatar
Sander Bollen committed
257
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
258
259
260
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 "
Sander Bollen's avatar
Sander Bollen committed
261

262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
def aggregate_gvcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_gvcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule gvcf_gather:
    """ Join the gvcf files together """
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
278
279
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
280
281
    singularity: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps --output {output.gvcf} "
282
283
           "--output-type z && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
284
285

rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
286
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
287
    input:
288
289
        gvcf = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz",
        tbi = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
290
        ref= config["reference"]
Sander Bollen's avatar
Sander Bollen committed
291
    output:
292
293
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
294
295
    wildcard_constraints:
        chunk="[0-9]+"
296
    singularity: containers["gatk"]
297
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
298
           "GenotypeGVCFs -R {input.ref} "
299
           "-V {input.gvcf} -o '{output.vcf}'"
300
301


302
def aggregate_vcf(wildcards):
303
304
305
306
307
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


308
309
310
311
312
313
def aggregate_vcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


Sander Bollen's avatar
Sander Bollen committed
314
rule genotype_gather:
315
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
316
    input:
317
        vcfs = aggregate_vcf,
318
        vcfs_tbi = aggregate_vcf_tbi
Sander Bollen's avatar
Sander Bollen committed
319
    output:
320
321
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
322
    singularity: containers["bcftools"]
323
    shell: "bcftools concat {input.vcfs} --allow-overlaps --output {output.vcf} "
324
325
           "--output-type z && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
326
327


Sander Bollen's avatar
Sander Bollen committed
328
## bam metrics
329
rule mapped_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
330
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
331
    input:
332
333
        bam = rules.markdup.output.bam,
        pywc = config["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
334
    output:
335
336
        reads="{sample}/bams/{sample}.mapped.num",
        bases="{sample}/bams/{sample}.mapped.basenum"
337
    singularity: containers["samtools-1.7-python-3.6"]
338
339
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
340
341


342
rule unique_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
343
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
344
    input:
345
        bam = rules.markdup.output.bam,
van den Berg's avatar
van den Berg committed
346
        pywc=config["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
347
    output:
348
349
        reads="{sample}/bams/{sample}.unique.num",
        bases="{sample}/bams/{sample}.usable.basenum"
350
    singularity: containers["samtools-1.7-python-3.6"]
351
352
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
353
354
355


## fastqc
Sander Bollen's avatar
Sander Bollen committed
356
rule fastqc_raw:
357
358
359
    """
    Run fastqc on raw fastq files
    """
Sander Bollen's avatar
Sander Bollen committed
360
    input:
van den Berg's avatar
van den Berg committed
361
362
        r1=get_forward,
        r2=get_reverse
Sander Bollen's avatar
Sander Bollen committed
363
    output:
van den Berg's avatar
van den Berg committed
364
        directory("{sample}/pre_process/raw-{sample}-{read_group}/")
365
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
366
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
367
368


Sander Bollen's avatar
Sander Bollen committed
369
rule fastqc_postqc:
370
371
372
    """
    Run fastqc on fastq files post pre-processing
    """
Sander Bollen's avatar
Sander Bollen committed
373
    input:
van den Berg's avatar
van den Berg committed
374
375
        r1="{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
376
    output:
van den Berg's avatar
van den Berg committed
377
        directory("{sample}/pre_process/trimmed-{sample}-{read_group}/")
378
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
379
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
380
381


382
## coverage
Sander Bollen's avatar
Sander Bollen committed
383
rule covstats:
Sander Bollen's avatar
Sander Bollen committed
384
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
385
    input:
386
        bam = rules.markdup.output.bam,
Sander Bollen's avatar
Sander Bollen committed
387
        genome="current.genome",
van den Berg's avatar
van den Berg committed
388
389
        covpy=config["covstats"],
        bed=config.get("bedfile","")
Sander Bollen's avatar
Sander Bollen committed
390
391
392
    params:
        subt="Sample {sample}"
    output:
van den Berg's avatar
van den Berg committed
393
394
        covj="{sample}/coverage/covstats.json",
        covp="{sample}/coverage/covstats.png"
395
    singularity: containers["bedtools-2.26-python-2.7"]
Sander Bollen's avatar
Sander Bollen committed
396
397
398
399
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
400
401


Sander Bollen's avatar
Sander Bollen committed
402
403
404
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
405
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
406
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
407
        ref = config.get('refflat', "")
Sander Bollen's avatar
Sander Bollen committed
408
    output:
409
        tsv="{sample}/coverage/refFlat_coverage.tsv"
410
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
411
412
413
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


414
## collection
van den Berg's avatar
van den Berg committed
415
if "bedfile" in config:
416
417
418
419
420
421
422
423
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
            cov="{sample}/coverage/covstats.json",
424
            cutadapt = "{sample}/cutadapt.json",
van den Berg's avatar
van den Berg committed
425
            colpy=config["collect_stats"]
426
427
        params:
            sample_name="{sample}",
van den Berg's avatar
van den Berg committed
428
            fthresh=config["female_threshold"]
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
        output:
            "{sample}/{sample}.stats.json"
        singularity: containers["vtools"]
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
               "--female-threshold {params.fthresh} "
               "--cutadapt {input.cutadapt} "
               "{input.cov} > {output}"
else:
    rule collectstats:
        """Collect all stats for a particular sample without beds"""
        input:
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
446
            cutadapt = "{sample}/cutadapt.json",
van den Berg's avatar
van den Berg committed
447
            colpy = config["collect_stats"]
448
449
        params:
            sample_name = "{sample}",
van den Berg's avatar
van den Berg committed
450
            fthresh = config["female_threshold"]
451
452
453
454
455
456
457
458
459
460
        output:
            "{sample}/{sample}.stats.json"
        singularity: containers["vtools"]
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
               "--female-threshold {params.fthresh} "
               "--cutadapt {input.cutadapt} "
               "> {output}"

461
462
rule collect_cutadapt_summary:
    """ Colect cutadapt summary from each readgroup per sample """
Sander Bollen's avatar
Sander Bollen committed
463
    input:
van den Berg's avatar
van den Berg committed
464
465
466
        cutadapt = lambda wildcards:
        ("{sample}/pre_process/{sample}-{read_group}.txt".format(sample=wildcards.sample,
        read_group=read_group) for read_group in get_readgroup(wildcards)),
van den Berg's avatar
van den Berg committed
467
        cutadapt_summary= config["cutadapt_summary"]
468
    output: "{sample}/cutadapt.json"
469
    singularity: containers["python3"]
470
    shell: "python {input.cutadapt_summary} --sample {wildcards.sample} "
van den Berg's avatar
van den Berg committed
471
472
           "--cutadapt-summary {input.cutadapt} > {output}"

van den Berg's avatar
van den Berg committed
473

474
475
476
rule merge_stats:
    """Merge all stats of all samples"""
    input:
van den Berg's avatar
van den Berg committed
477
478
        cols=expand("{sample}/{sample}.stats.json", sample=config['samples']),
        mpy=config["merge_stats"]
479
480
481
    output:
        stats="stats.json"
    singularity: containers["vtools"]
482
    shell: "python {input.mpy} --collectstats {input.cols} "
483
484
485
486
487
488
489
           "> {output.stats}"


rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
        stats="stats.json",
van den Berg's avatar
van den Berg committed
490
        sc=config["stats_to_tsv"]
491
492
493
494
    output:
        stats="stats.tsv"
    singularity: containers["python3"]
    shell: "python {input.sc} -i {input.stats} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
495
496


van den Berg's avatar
van den Berg committed
497
498
499
rule multiple_metrics:
    """Run picard CollectMultipleMetrics"""
    input:
500
        bam = rules.markdup.output.bam,
van den Berg's avatar
van den Berg committed
501
502
503
504
505
506
507
508
509
510
511
512
513
        ref = config["reference"],
    params:
        prefix="{sample}/bams/{sample}",
    output:
        alignment="{sample}/bams/{sample}.alignment_summary_metrics",
        insert="{sample}/bams/{sample}.insert_size_metrics"
    singularity: containers["picard"]
    shell: "picard CollectMultipleMetrics "
           "I={input.bam} O={params.prefix} "
           "R={input.ref} "
           "PROGRAM=CollectAlignmentSummaryMetrics "
           "PROGRAM=CollectInsertSizeMetrics "

Sander Bollen's avatar
Sander Bollen committed
514
515
516
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
517
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
518
519
    """
    input:
van den Berg's avatar
van den Berg committed
520
        stats="stats.tsv",
521
522
        bam=expand("{sample}/bams/{sample}.bam", sample=config["samples"]),
        metric=expand("{sample}/bams/{sample}.metrics", sample=config["samples"]),
van den Berg's avatar
van den Berg committed
523
524
        alignment_metrics=expand("{sample}/bams/{sample}.alignment_summary_metrics", sample=config["samples"]),
        insert_metrics=expand("{sample}/bams/{sample}.insert_size_metrics", sample=config["samples"]),
Sander Bollen's avatar
Sander Bollen committed
525
    params:
Sander Bollen's avatar
Sander Bollen committed
526
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
527
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
528
    output:
van den Berg's avatar
van den Berg committed
529
        "multiqc_report/multiqc_report.html"
530
    singularity: containers["multiqc"]
van den Berg's avatar
van den Berg committed
531
    shell: "multiqc --data-format json -f -o {params.rdir} {params.odir} || touch {output}"