Snakefile 19.5 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

van den Berg's avatar
van den Berg committed
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# Read the config json
with open(config['CONFIG_JSON'], 'rt') as fin:
    settings = json.load(fin)

# Read the json schema
with open('config/schema.json', 'rt') as fin:
    schema = json.load(fin)

# Validate the settings against the schema
try:
    jsonschema.validate(settings, schema)
except jsonschema.ValidationError as e:
    raise jsonschema.ValidationError(f'Invalid CONFIG_JSON: {e}')

45

46
47
48
# Set default values
def set_default(key, value):
    """ Set default config values """
van den Berg's avatar
van den Berg committed
49
50
51
52
53
54
55
56
57
58
59
60
61
62
    if key not in settings:
        settings[key] = value

# Set the default settings
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Set the script paths
set_default("covstats", "src/covstats.py")
set_default("collect_stats", "src/collect_stats.py")
set_default("merge_stats", "src/merge_stats.py")
set_default("fastq_stats", "src/fastqc_stats.py")
set_default("stats_to_tsv", "src/stats_to_tsv.py")
set_default("py_wordcount", "src/pywc.py")
Sander Bollen's avatar
Sander Bollen committed
63

64
65
66
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
van den Berg's avatar
van den Berg committed
67
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
68
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
van den Berg's avatar
van den Berg committed
69
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
70
71
72
73
74
75
76
77
78
79
80
81
    "debian": "docker://debian:buster-slim",
    "fastq-count": "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.5--py36_0",
    "picard-2.14": "docker://quay.io/biocontainers/picard:2.14--py36_0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "sickle": "docker://quay.io/biocontainers/sickle-trim:1.33--ha92aebf_4",
    "tabix": "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
82

83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def get_forward(wildcards):
    """ Get the forward fastq file from the config """
    return (
        settings["samples"][wildcards.sample]["libraries"]
                [wildcards.read_group]["R1"]
    )

def get_reverse(wildcards):
    """ Get the reverse fastq file from the config """
    return (
        settings["samples"][wildcards.sample]["libraries"]
            [wildcards.read_group]["R2"]
    )

def get_readgroup(wildcards):
    return settings["samples"][wildcards.sample]["libraries"]
Sander Bollen's avatar
Sander Bollen committed
99

van den Berg's avatar
van den Berg committed
100
101
102
103
104
105
def get_readgroup_per_sample():
    for sample in settings["samples"]:
        for rg in settings["samples"][sample]["libraries"]:
            yield rg, sample


106
107
108
def coverage_stats(wildcards):
    files = expand("{sample}/coverage/refFlat_coverage.tsv", sample=settings['samples'])
    return files if "refflat" in settings else []
Sander Bollen's avatar
Sander Bollen committed
109

Sander Bollen's avatar
Sander Bollen committed
110
111
rule all:
    input:
van den Berg's avatar
van den Berg committed
112
        multiqc="multiqc_report/multiqc_report.html",
van den Berg's avatar
van den Berg committed
113
        #stats = "stats.json",
van den Berg's avatar
van den Berg committed
114
115
116
117
118
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai", sample=settings['samples']),
        vcfs=expand("{sample}/vcf/{sample}.vcf.gz", sample=settings['samples']),
        vcf_tbi=expand("{sample}/vcf/{sample}.vcf.gz.tbi", sample=settings['samples']),
        gvcfs=expand("{sample}/vcf/{sample}.g.vcf.gz", sample=settings['samples']),
        gvcf_tbi=expand("{sample}/vcf/{sample}.g.vcf.gz.tbi", sample=settings['samples']),
van den Berg's avatar
van den Berg committed
119
120
121
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
        fastqc_trimmed = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
        #coverage_stats = coverage_stats,
Sander Bollen's avatar
Sander Bollen committed
122

Sander Bollen's avatar
Sander Bollen committed
123
124
125

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
126
    output: directory("tmp")
127
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
128
129
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
130
rule genome:
Sander Bollen's avatar
Sander Bollen committed
131
    """Create genome file as used by bedtools"""
van den Berg's avatar
van den Berg committed
132
    input: settings["reference"]
Sander Bollen's avatar
Sander Bollen committed
133
    output: "current.genome"
134
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
135
136
137
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
138
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
139
    input:
van den Berg's avatar
van den Berg committed
140
141
        r1=get_forward,
        r2=get_reverse
Sander Bollen's avatar
Sander Bollen committed
142
    output:
van den Berg's avatar
van den Berg committed
143
144
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz"
145
    singularity: containers["cutadapt"]
van den Berg's avatar
van den Berg committed
146
147
148
149
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--output {output.r1} --paired-output {output.r2} -Z "
           "{input.r1} {input.r2}"
Sander Bollen's avatar
Sander Bollen committed
150
151

rule align:
Sander Bollen's avatar
Sander Bollen committed
152
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
153
    input:
van den Berg's avatar
van den Berg committed
154
155
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
van den Berg's avatar
van den Berg committed
156
        ref = settings["reference"],
157
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
158
    params:
159
160
        rg = "@RG\\tID:{read_group}\\tSM:{sample}\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
161
    singularity: containers["bwa-0.7.17-picard-2.18.7"]
Sander Bollen's avatar
Sander Bollen committed
162
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
163
164
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
165
166
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

167
168
169
170
171
def markdup_bam_input(wildcards):
    """ Generate the INPUT for each bam file """
    return ["INPUT={sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
            read_group=rg) for rg in get_readgroup(wildcards)]

Sander Bollen's avatar
Sander Bollen committed
172
rule markdup:
Sander Bollen's avatar
Sander Bollen committed
173
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
174
    input:
175
176
177
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
        read_group=rg) for rg in get_readgroup(wildcards)),
Sander Bollen's avatar
Sander Bollen committed
178
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
179
    output:
Sander Bollen's avatar
Sander Bollen committed
180
181
182
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
183
184
    params:
        bams=markdup_bam_input
185
    singularity: containers["picard-2.14"]
186
187
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
188
           "{params.bams} OUTPUT={output.bam} "
Sander Bollen's avatar
Sander Bollen committed
189
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
190
191
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
192
193
194
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
195
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
196
    output:
Sander Bollen's avatar
Sander Bollen committed
197
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
198
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
199
200
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
201
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
202
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
203
    input:
Sander Bollen's avatar
Sander Bollen committed
204
        bam = "{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
205
        ref = settings["reference"],
206
        vcfs = settings["known_sites"]
Sander Bollen's avatar
Sander Bollen committed
207
    output:
Sander Bollen's avatar
Sander Bollen committed
208
        grp = "{sample}/bams/{sample}.baserecal.grp"
209
    params: " ".join(expand("-knownSites {vcf}", vcf=settings["known_sites"]))
210
    singularity: containers["gatk"]
211
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
212
213
           "BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
214
           "-cov CycleCovariate -cov ContextCovariate {params}"
Sander Bollen's avatar
Sander Bollen committed
215

216
checkpoint scatterregions:
van den Berg's avatar
van den Berg committed
217
218
    """Scatter the reference genome"""
    input:
van den Berg's avatar
van den Berg committed
219
        ref = settings["reference"],
220
    params:
van den Berg's avatar
van den Berg committed
221
        size = settings['scatter_size']
van den Berg's avatar
van den Berg committed
222
    output:
223
        directory("scatter")
van den Berg's avatar
van den Berg committed
224
    singularity: containers["biopet-scatterregions"]
225
    shell: "mkdir -p {output} && "
van den Berg's avatar
van den Berg committed
226
           "biopet-scatterregions "
227
           "--referenceFasta {input.ref} --scatterSize {params.size} "
van den Berg's avatar
van den Berg committed
228
229
           "--outputDir scatter"

Sander Bollen's avatar
Sander Bollen committed
230
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
231
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
232
    input:
Sander Bollen's avatar
Sander Bollen committed
233
234
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
van den Berg's avatar
van den Berg committed
235
236
        dbsnp=settings["dbsnp"],
        ref=settings["reference"],
van den Berg's avatar
van den Berg committed
237
        region="scatter/scatter-{chunk}.bed"
Sander Bollen's avatar
Sander Bollen committed
238
    output:
239
240
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
241
242
    wildcard_constraints:
        chunk="[0-9]+"
243
    singularity: containers["gatk"]
244
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
245
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
246
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
van den Berg's avatar
van den Berg committed
247
           "-L '{input.region}' -o '{output.gvcf}' "
Sander Bollen's avatar
Sander Bollen committed
248
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
249
250
251
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 "
Sander Bollen's avatar
Sander Bollen committed
252

253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
def aggregate_gvcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_gvcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule gvcf_gather:
    """ Join the gvcf files together """
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
269
270
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
271
272
    singularity: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps --output {output.gvcf} "
273
274
           "--output-type z && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
275
276

rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
277
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
278
    input:
279
280
        gvcf = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz",
        tbi = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
281
        ref= settings["reference"]
Sander Bollen's avatar
Sander Bollen committed
282
    output:
283
284
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
285
286
    wildcard_constraints:
        chunk="[0-9]+"
287
    singularity: containers["gatk"]
288
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
289
           "GenotypeGVCFs -R {input.ref} "
290
           "-V {input.gvcf} -o '{output.vcf}'"
291
292


293
def aggregate_vcf(wildcards):
294
295
296
297
298
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


299
300
301
302
303
304
def aggregate_vcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


Sander Bollen's avatar
Sander Bollen committed
305
rule genotype_gather:
306
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
307
    input:
308
        vcfs = aggregate_vcf,
309
        vcfs_tbi = aggregate_vcf_tbi
Sander Bollen's avatar
Sander Bollen committed
310
    output:
311
312
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
313
    singularity: containers["bcftools"]
314
    shell: "bcftools concat {input.vcfs} --allow-overlaps --output {output.vcf} "
315
316
           "--output-type z && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
317
318


Sander Bollen's avatar
Sander Bollen committed
319
## bam metrics
320
rule mapped_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
321
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
322
    input:
323
        bam="{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
324
        pywc=settings["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
325
    output:
326
327
        reads="{sample}/bams/{sample}.mapped.num",
        bases="{sample}/bams/{sample}.mapped.basenum"
328
    singularity: containers["samtools-1.7-python-3.6"]
329
330
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
331
332


333
rule unique_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
334
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
335
    input:
336
        bam="{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
337
        pywc=settings["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
338
    output:
339
340
        reads="{sample}/bams/{sample}.unique.num",
        bases="{sample}/bams/{sample}.usable.basenum"
341
    singularity: containers["samtools-1.7-python-3.6"]
342
343
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
344
345
346


## fastqc
Sander Bollen's avatar
Sander Bollen committed
347
rule fastqc_raw:
348
349
350
    """
    Run fastqc on raw fastq files
    """
Sander Bollen's avatar
Sander Bollen committed
351
    input:
van den Berg's avatar
van den Berg committed
352
353
        r1=get_forward,
        r2=get_reverse
Sander Bollen's avatar
Sander Bollen committed
354
    output:
van den Berg's avatar
van den Berg committed
355
        directory("{sample}/pre_process/raw-{sample}-{read_group}/")
356
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
357
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
358
359


Sander Bollen's avatar
Sander Bollen committed
360
rule fastqc_postqc:
361
362
363
    """
    Run fastqc on fastq files post pre-processing
    """
Sander Bollen's avatar
Sander Bollen committed
364
    input:
van den Berg's avatar
van den Berg committed
365
366
        r1="{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
367
    output:
van den Berg's avatar
van den Berg committed
368
        directory("{sample}/pre_process/trimmed-{sample}-{read_group}/")
369
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
370
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
371
372
373
374
375


## fastq-count

rule fqcount_preqc:
Sander Bollen's avatar
Sander Bollen committed
376
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
377
    input:
378
379
        r1=get_forward,
        r2=get_reverse
Sander Bollen's avatar
Sander Bollen committed
380
    output:
381
        "{sample}/pre_process/{sample}-{read_group}.preqc_count.json"
382
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
383
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
384
385
386


rule fqcount_postqc:
Sander Bollen's avatar
Sander Bollen committed
387
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
388
    input:
van den Berg's avatar
van den Berg committed
389
390
        r1="{sample}/pre_process/{sample}-{read_group}_R1.fastq",
        r2="{sample}/pre_process/{sample}-{read_group}_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
391
    output:
392
        "{sample}/pre_process/{sample}-{read_group}.postqc_count.json"
393
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
394
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
395
396
397
398
399


## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
400
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
401
    input:
Sander Bollen's avatar
Sander Bollen committed
402
403
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
van den Berg's avatar
van den Berg committed
404
        covpy=settings["covstats"],
van den Berg's avatar
van den Berg committed
405
        bed=settings.get("bedfile","")
Sander Bollen's avatar
Sander Bollen committed
406
407
408
    params:
        subt="Sample {sample}"
    output:
van den Berg's avatar
van den Berg committed
409
410
        covj="{sample}/coverage/covstats.json",
        covp="{sample}/coverage/covstats.png"
411
    singularity: containers["bedtools-2.26-python-2.7"]
Sander Bollen's avatar
Sander Bollen committed
412
413
414
415
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
416
417


Sander Bollen's avatar
Sander Bollen committed
418
419
420
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
421
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
422
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
423
        ref = settings.get('refflat', "")
Sander Bollen's avatar
Sander Bollen committed
424
    output:
425
        tsv="{sample}/coverage/refFlat_coverage.tsv"
426
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
427
428
429
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
430
431
432
## vcfstats

rule vcfstats:
Sander Bollen's avatar
Sander Bollen committed
433
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
434
    input:
435
        vcf="{sample}/vcf/{sample}.vcf.gz",
436
        tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
437
    output:
438
        stats="{sampel}/vcf/{sample}.vcfstats.json"
439
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
440
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
441
442


Sander Bollen's avatar
Sander Bollen committed
443
## collection
van den Berg's avatar
van den Berg committed
444
if "bedfile" in settings:
445
446
447
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
Sander Bollen's avatar
Sander Bollen committed
448
449
450
451
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
van den Berg's avatar
van den Berg committed
452
            cov="{sample}/coverage/covstats.json",
van den Berg's avatar
van den Berg committed
453
            colpy=settings["collect_stats"]
454
455
        params:
            sample_name="{sample}",
van den Berg's avatar
van den Berg committed
456
            fthresh=settings["female_threshold"]
457
        output:
Sander Bollen's avatar
Sander Bollen committed
458
            "{sample}/{sample}.stats.json"
459
        singularity: containers["vtools"]
460
461
462
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
463
               "--female-threshold {params.fthresh} "
van den Berg's avatar
van den Berg committed
464
               "{input.cov} > {output}"
465
466
else:
    rule collectstats:
Sander Bollen's avatar
Sander Bollen committed
467
        """Collect all stats for a particular sample without beds"""
Sander Bollen's avatar
Sander Bollen committed
468
        input:
Sander Bollen's avatar
Sander Bollen committed
469
470
471
472
473
474
            preqc = "{sample}/pre_process/{sample}.preqc_count.json",
            postq = "{sample}/pre_process/{sample}.postqc_count.json",
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
van den Berg's avatar
van den Berg committed
475
            colpy = settings["collect_stats"]
Sander Bollen's avatar
Sander Bollen committed
476
477
        params:
            sample_name = "{sample}",
van den Berg's avatar
van den Berg committed
478
            fthresh = settings["female_threshold"]
Sander Bollen's avatar
Sander Bollen committed
479
        output:
Sander Bollen's avatar
Sander Bollen committed
480
            "{sample}/{sample}.stats.json"
481
        singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
482
483
484
485
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
486
               "--female-threshold {params.fthresh} "
van den Berg's avatar
van den Berg committed
487
               "> {output}"
Sander Bollen's avatar
Sander Bollen committed
488
489

rule merge_stats:
Sander Bollen's avatar
Sander Bollen committed
490
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
491
    input:
van den Berg's avatar
van den Berg committed
492
493
494
        cols=expand("{sample}/{sample}.stats.json", sample=settings['samples']),
        vstat=expand("{sample}/vcf/{sample}.vcfstats.json", sample=settings['samples']),
        mpy=settings["merge_stats"]
Sander Bollen's avatar
Sander Bollen committed
495
    output:
Sander Bollen's avatar
Sander Bollen committed
496
        stats="stats.json"
497
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
498
499
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
           "> {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
500
501


Sander Bollen's avatar
Sander Bollen committed
502
503
504
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
Sander Bollen's avatar
Sander Bollen committed
505
        stats="stats.json",
van den Berg's avatar
van den Berg committed
506
        sc=settings["stats_to_tsv"]
Sander Bollen's avatar
Sander Bollen committed
507
    output:
Sander Bollen's avatar
Sander Bollen committed
508
        stats="stats.tsv"
509
    singularity: containers["python3"]
Sander Bollen's avatar
Sander Bollen committed
510
511
512
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
513
514
515
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
516
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
517
518
    """
    input:
Sander Bollen's avatar
Sander Bollen committed
519
        stats="stats.tsv"
Sander Bollen's avatar
Sander Bollen committed
520
    params:
Sander Bollen's avatar
Sander Bollen committed
521
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
522
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
523
    output:
van den Berg's avatar
van den Berg committed
524
        "multiqc_report/multiqc_report.html"
525
    singularity: containers["multiqc"]
van den Berg's avatar
van den Berg committed
526
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output}"