Snakefile 19.7 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

van den Berg's avatar
van den Berg committed
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# Read the config json
with open(config['CONFIG_JSON'], 'rt') as fin:
    settings = json.load(fin)

# Read the json schema
with open('config/schema.json', 'rt') as fin:
    schema = json.load(fin)

# Validate the settings against the schema
try:
    jsonschema.validate(settings, schema)
except jsonschema.ValidationError as e:
    raise jsonschema.ValidationError(f'Invalid CONFIG_JSON: {e}')

45

46
47
48
# Set default values
def set_default(key, value):
    """ Set default config values """
van den Berg's avatar
van den Berg committed
49
50
51
52
53
54
55
56
57
58
59
60
61
62
    if key not in settings:
        settings[key] = value

# Set the default settings
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Set the script paths
set_default("covstats", "src/covstats.py")
set_default("collect_stats", "src/collect_stats.py")
set_default("merge_stats", "src/merge_stats.py")
set_default("fastq_stats", "src/fastqc_stats.py")
set_default("stats_to_tsv", "src/stats_to_tsv.py")
set_default("py_wordcount", "src/pywc.py")
Sander Bollen's avatar
Sander Bollen committed
63

64
65
66
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
van den Berg's avatar
van den Berg committed
67
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
68
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
van den Berg's avatar
van den Berg committed
69
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
70
71
72
73
74
75
76
77
78
79
80
81
    "debian": "docker://debian:buster-slim",
    "fastq-count": "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.5--py36_0",
    "picard-2.14": "docker://quay.io/biocontainers/picard:2.14--py36_0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "sickle": "docker://quay.io/biocontainers/sickle-trim:1.33--ha92aebf_4",
    "tabix": "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
82

83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def get_forward(wildcards):
    """ Get the forward fastq file from the config """
    return (
        settings["samples"][wildcards.sample]["libraries"]
                [wildcards.read_group]["R1"]
    )

def get_reverse(wildcards):
    """ Get the reverse fastq file from the config """
    return (
        settings["samples"][wildcards.sample]["libraries"]
            [wildcards.read_group]["R2"]
    )

def get_readgroup(wildcards):
    return settings["samples"][wildcards.sample]["libraries"]
Sander Bollen's avatar
Sander Bollen committed
99

van den Berg's avatar
van den Berg committed
100
101
102
103
104
105
def get_readgroup_per_sample():
    for sample in settings["samples"]:
        for rg in settings["samples"][sample]["libraries"]:
            yield rg, sample


106
107
108
def coverage_stats(wildcards):
    files = expand("{sample}/coverage/refFlat_coverage.tsv", sample=settings['samples'])
    return files if "refflat" in settings else []
Sander Bollen's avatar
Sander Bollen committed
109

Sander Bollen's avatar
Sander Bollen committed
110
111
rule all:
    input:
van den Berg's avatar
van den Berg committed
112
        multiqc="multiqc_report/multiqc_report.html",
van den Berg's avatar
van den Berg committed
113
        #stats = "stats.json",
van den Berg's avatar
van den Berg committed
114
115
116
117
118
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai", sample=settings['samples']),
        vcfs=expand("{sample}/vcf/{sample}.vcf.gz", sample=settings['samples']),
        vcf_tbi=expand("{sample}/vcf/{sample}.vcf.gz.tbi", sample=settings['samples']),
        gvcfs=expand("{sample}/vcf/{sample}.g.vcf.gz", sample=settings['samples']),
        gvcf_tbi=expand("{sample}/vcf/{sample}.g.vcf.gz.tbi", sample=settings['samples']),
van den Berg's avatar
van den Berg committed
119
120
121
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
        fastqc_trimmed = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
        #coverage_stats = coverage_stats,
Sander Bollen's avatar
Sander Bollen committed
122

Sander Bollen's avatar
Sander Bollen committed
123
124
125

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
126
    output: directory("tmp")
127
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
128
129
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
130
rule genome:
Sander Bollen's avatar
Sander Bollen committed
131
    """Create genome file as used by bedtools"""
van den Berg's avatar
van den Berg committed
132
    input: settings["reference"]
Sander Bollen's avatar
Sander Bollen committed
133
    output: "current.genome"
134
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
135
136
137
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
138
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
139
    input:
van den Berg's avatar
van den Berg committed
140
141
        r1=get_forward,
        r2=get_reverse
Sander Bollen's avatar
Sander Bollen committed
142
    output:
van den Berg's avatar
van den Berg committed
143
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
van den Berg's avatar
van den Berg committed
144
145
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
        summary = "{sample}/pre_process/{sample}-{read_group}.txt"
146
    singularity: containers["cutadapt"]
van den Berg's avatar
van den Berg committed
147
148
149
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--output {output.r1} --paired-output {output.r2} -Z "
van den Berg's avatar
van den Berg committed
150
151
           "{input.r1} {input.r2} "
           "--report=minimal > {output.summary}"
Sander Bollen's avatar
Sander Bollen committed
152
153

rule align:
Sander Bollen's avatar
Sander Bollen committed
154
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
155
    input:
van den Berg's avatar
van den Berg committed
156
157
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
van den Berg's avatar
van den Berg committed
158
        ref = settings["reference"],
159
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
160
    params:
161
162
        rg = "@RG\\tID:{read_group}\\tSM:{sample}\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
163
    singularity: containers["bwa-0.7.17-picard-2.18.7"]
Sander Bollen's avatar
Sander Bollen committed
164
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
165
166
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
167
168
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

169
170
171
172
173
def markdup_bam_input(wildcards):
    """ Generate the INPUT for each bam file """
    return ["INPUT={sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
            read_group=rg) for rg in get_readgroup(wildcards)]

Sander Bollen's avatar
Sander Bollen committed
174
rule markdup:
Sander Bollen's avatar
Sander Bollen committed
175
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
176
    input:
177
178
179
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
        read_group=rg) for rg in get_readgroup(wildcards)),
Sander Bollen's avatar
Sander Bollen committed
180
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
181
    output:
Sander Bollen's avatar
Sander Bollen committed
182
183
184
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
185
186
    params:
        bams=markdup_bam_input
187
    singularity: containers["picard-2.14"]
188
189
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
190
           "{params.bams} OUTPUT={output.bam} "
Sander Bollen's avatar
Sander Bollen committed
191
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
192
193
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
194
195
196
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
197
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
198
    output:
Sander Bollen's avatar
Sander Bollen committed
199
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
200
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
201
202
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
203
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
204
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
205
    input:
Sander Bollen's avatar
Sander Bollen committed
206
        bam = "{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
207
        ref = settings["reference"],
208
        vcfs = settings["known_sites"]
Sander Bollen's avatar
Sander Bollen committed
209
    output:
Sander Bollen's avatar
Sander Bollen committed
210
        grp = "{sample}/bams/{sample}.baserecal.grp"
211
    params: " ".join(expand("-knownSites {vcf}", vcf=settings["known_sites"]))
212
    singularity: containers["gatk"]
213
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
214
215
           "BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
216
           "-cov CycleCovariate -cov ContextCovariate {params}"
Sander Bollen's avatar
Sander Bollen committed
217

218
checkpoint scatterregions:
van den Berg's avatar
van den Berg committed
219
220
    """Scatter the reference genome"""
    input:
van den Berg's avatar
van den Berg committed
221
        ref = settings["reference"],
222
    params:
van den Berg's avatar
van den Berg committed
223
        size = settings['scatter_size']
van den Berg's avatar
van den Berg committed
224
    output:
225
        directory("scatter")
van den Berg's avatar
van den Berg committed
226
    singularity: containers["biopet-scatterregions"]
227
    shell: "mkdir -p {output} && "
van den Berg's avatar
van den Berg committed
228
           "biopet-scatterregions "
229
           "--referenceFasta {input.ref} --scatterSize {params.size} "
van den Berg's avatar
van den Berg committed
230
231
           "--outputDir scatter"

Sander Bollen's avatar
Sander Bollen committed
232
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
233
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
234
    input:
Sander Bollen's avatar
Sander Bollen committed
235
236
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
van den Berg's avatar
van den Berg committed
237
238
        dbsnp=settings["dbsnp"],
        ref=settings["reference"],
van den Berg's avatar
van den Berg committed
239
        region="scatter/scatter-{chunk}.bed"
Sander Bollen's avatar
Sander Bollen committed
240
    output:
241
242
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
243
244
    wildcard_constraints:
        chunk="[0-9]+"
245
    singularity: containers["gatk"]
246
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
247
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
248
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
van den Berg's avatar
van den Berg committed
249
           "-L '{input.region}' -o '{output.gvcf}' "
Sander Bollen's avatar
Sander Bollen committed
250
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
251
252
253
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 "
Sander Bollen's avatar
Sander Bollen committed
254

255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
def aggregate_gvcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_gvcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule gvcf_gather:
    """ Join the gvcf files together """
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
271
272
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
273
274
    singularity: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps --output {output.gvcf} "
275
276
           "--output-type z && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
277
278

rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
279
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
280
    input:
281
282
        gvcf = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz",
        tbi = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
283
        ref= settings["reference"]
Sander Bollen's avatar
Sander Bollen committed
284
    output:
285
286
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
287
288
    wildcard_constraints:
        chunk="[0-9]+"
289
    singularity: containers["gatk"]
290
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
291
           "GenotypeGVCFs -R {input.ref} "
292
           "-V {input.gvcf} -o '{output.vcf}'"
293
294


295
def aggregate_vcf(wildcards):
296
297
298
299
300
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


301
302
303
304
305
306
def aggregate_vcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


Sander Bollen's avatar
Sander Bollen committed
307
rule genotype_gather:
308
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
309
    input:
310
        vcfs = aggregate_vcf,
311
        vcfs_tbi = aggregate_vcf_tbi
Sander Bollen's avatar
Sander Bollen committed
312
    output:
313
314
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
315
    singularity: containers["bcftools"]
316
    shell: "bcftools concat {input.vcfs} --allow-overlaps --output {output.vcf} "
317
318
           "--output-type z && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
319
320


Sander Bollen's avatar
Sander Bollen committed
321
## bam metrics
322
rule mapped_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
323
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
324
    input:
325
        bam="{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
326
        pywc=settings["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
327
    output:
328
329
        reads="{sample}/bams/{sample}.mapped.num",
        bases="{sample}/bams/{sample}.mapped.basenum"
330
    singularity: containers["samtools-1.7-python-3.6"]
331
332
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
333
334


335
rule unique_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
336
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
337
    input:
338
        bam="{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
339
        pywc=settings["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
340
    output:
341
342
        reads="{sample}/bams/{sample}.unique.num",
        bases="{sample}/bams/{sample}.usable.basenum"
343
    singularity: containers["samtools-1.7-python-3.6"]
344
345
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
346
347
348


## fastqc
Sander Bollen's avatar
Sander Bollen committed
349
rule fastqc_raw:
350
351
352
    """
    Run fastqc on raw fastq files
    """
Sander Bollen's avatar
Sander Bollen committed
353
    input:
van den Berg's avatar
van den Berg committed
354
355
        r1=get_forward,
        r2=get_reverse
Sander Bollen's avatar
Sander Bollen committed
356
    output:
van den Berg's avatar
van den Berg committed
357
        directory("{sample}/pre_process/raw-{sample}-{read_group}/")
358
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
359
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
360
361


Sander Bollen's avatar
Sander Bollen committed
362
rule fastqc_postqc:
363
364
365
    """
    Run fastqc on fastq files post pre-processing
    """
Sander Bollen's avatar
Sander Bollen committed
366
    input:
van den Berg's avatar
van den Berg committed
367
368
        r1="{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
369
    output:
van den Berg's avatar
van den Berg committed
370
        directory("{sample}/pre_process/trimmed-{sample}-{read_group}/")
371
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
372
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
373
374
375
376
377


## fastq-count

rule fqcount_preqc:
Sander Bollen's avatar
Sander Bollen committed
378
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
379
    input:
380
381
        r1=get_forward,
        r2=get_reverse
Sander Bollen's avatar
Sander Bollen committed
382
    output:
383
        "{sample}/pre_process/{sample}-{read_group}.preqc_count.json"
384
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
385
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
386
387
388


rule fqcount_postqc:
Sander Bollen's avatar
Sander Bollen committed
389
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
390
    input:
van den Berg's avatar
van den Berg committed
391
392
        r1="{sample}/pre_process/{sample}-{read_group}_R1.fastq",
        r2="{sample}/pre_process/{sample}-{read_group}_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
393
    output:
394
        "{sample}/pre_process/{sample}-{read_group}.postqc_count.json"
395
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
396
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
397
398
399
400
401


## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
402
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
403
    input:
Sander Bollen's avatar
Sander Bollen committed
404
405
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
van den Berg's avatar
van den Berg committed
406
        covpy=settings["covstats"],
van den Berg's avatar
van den Berg committed
407
        bed=settings.get("bedfile","")
Sander Bollen's avatar
Sander Bollen committed
408
409
410
    params:
        subt="Sample {sample}"
    output:
van den Berg's avatar
van den Berg committed
411
412
        covj="{sample}/coverage/covstats.json",
        covp="{sample}/coverage/covstats.png"
413
    singularity: containers["bedtools-2.26-python-2.7"]
Sander Bollen's avatar
Sander Bollen committed
414
415
416
417
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
418
419


Sander Bollen's avatar
Sander Bollen committed
420
421
422
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
423
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
424
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
425
        ref = settings.get('refflat', "")
Sander Bollen's avatar
Sander Bollen committed
426
    output:
427
        tsv="{sample}/coverage/refFlat_coverage.tsv"
428
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
429
430
431
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
432
433
434
## vcfstats

rule vcfstats:
Sander Bollen's avatar
Sander Bollen committed
435
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
436
    input:
437
        vcf="{sample}/vcf/{sample}.vcf.gz",
438
        tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
439
    output:
440
        stats="{sampel}/vcf/{sample}.vcfstats.json"
441
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
442
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
443
444


Sander Bollen's avatar
Sander Bollen committed
445
## collection
van den Berg's avatar
van den Berg committed
446
if "bedfile" in settings:
447
448
449
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
Sander Bollen's avatar
Sander Bollen committed
450
451
452
453
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
van den Berg's avatar
van den Berg committed
454
            cov="{sample}/coverage/covstats.json",
van den Berg's avatar
van den Berg committed
455
            colpy=settings["collect_stats"]
456
457
        params:
            sample_name="{sample}",
van den Berg's avatar
van den Berg committed
458
            fthresh=settings["female_threshold"]
459
        output:
Sander Bollen's avatar
Sander Bollen committed
460
            "{sample}/{sample}.stats.json"
461
        singularity: containers["vtools"]
462
463
464
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
465
               "--female-threshold {params.fthresh} "
van den Berg's avatar
van den Berg committed
466
               "{input.cov} > {output}"
467
468
else:
    rule collectstats:
Sander Bollen's avatar
Sander Bollen committed
469
        """Collect all stats for a particular sample without beds"""
Sander Bollen's avatar
Sander Bollen committed
470
        input:
Sander Bollen's avatar
Sander Bollen committed
471
472
473
474
475
476
            preqc = "{sample}/pre_process/{sample}.preqc_count.json",
            postq = "{sample}/pre_process/{sample}.postqc_count.json",
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
van den Berg's avatar
van den Berg committed
477
            colpy = settings["collect_stats"]
Sander Bollen's avatar
Sander Bollen committed
478
479
        params:
            sample_name = "{sample}",
van den Berg's avatar
van den Berg committed
480
            fthresh = settings["female_threshold"]
Sander Bollen's avatar
Sander Bollen committed
481
        output:
Sander Bollen's avatar
Sander Bollen committed
482
            "{sample}/{sample}.stats.json"
483
        singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
484
485
486
487
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
488
               "--female-threshold {params.fthresh} "
van den Berg's avatar
van den Berg committed
489
               "> {output}"
Sander Bollen's avatar
Sander Bollen committed
490
491

rule merge_stats:
Sander Bollen's avatar
Sander Bollen committed
492
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
493
    input:
van den Berg's avatar
van den Berg committed
494
495
496
        cols=expand("{sample}/{sample}.stats.json", sample=settings['samples']),
        vstat=expand("{sample}/vcf/{sample}.vcfstats.json", sample=settings['samples']),
        mpy=settings["merge_stats"]
Sander Bollen's avatar
Sander Bollen committed
497
    output:
Sander Bollen's avatar
Sander Bollen committed
498
        stats="stats.json"
499
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
500
501
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
           "> {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
502
503


Sander Bollen's avatar
Sander Bollen committed
504
505
506
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
Sander Bollen's avatar
Sander Bollen committed
507
        stats="stats.json",
van den Berg's avatar
van den Berg committed
508
        sc=settings["stats_to_tsv"]
Sander Bollen's avatar
Sander Bollen committed
509
    output:
Sander Bollen's avatar
Sander Bollen committed
510
        stats="stats.tsv"
511
    singularity: containers["python3"]
Sander Bollen's avatar
Sander Bollen committed
512
513
514
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
515
516
517
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
518
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
519
520
    """
    input:
Sander Bollen's avatar
Sander Bollen committed
521
        stats="stats.tsv"
Sander Bollen's avatar
Sander Bollen committed
522
    params:
Sander Bollen's avatar
Sander Bollen committed
523
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
524
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
525
    output:
van den Berg's avatar
van den Berg committed
526
        "multiqc_report/multiqc_report.html"
527
    singularity: containers["multiqc"]
van den Berg's avatar
van den Berg committed
528
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output}"