Snakefile 19.6 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

van den Berg's avatar
van den Berg committed
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# Read the config json
with open(config['CONFIG_JSON'], 'rt') as fin:
    settings = json.load(fin)

# Read the json schema
with open('config/schema.json', 'rt') as fin:
    schema = json.load(fin)

# Validate the settings against the schema
try:
    jsonschema.validate(settings, schema)
except jsonschema.ValidationError as e:
    raise jsonschema.ValidationError(f'Invalid CONFIG_JSON: {e}')

45

46
47
48
# Set default values
def set_default(key, value):
    """ Set default config values """
van den Berg's avatar
van den Berg committed
49
50
51
52
53
54
55
56
57
58
59
60
61
62
    if key not in settings:
        settings[key] = value

# Set the default settings
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Set the script paths
set_default("covstats", "src/covstats.py")
set_default("collect_stats", "src/collect_stats.py")
set_default("merge_stats", "src/merge_stats.py")
set_default("fastq_stats", "src/fastqc_stats.py")
set_default("stats_to_tsv", "src/stats_to_tsv.py")
set_default("py_wordcount", "src/pywc.py")
Sander Bollen's avatar
Sander Bollen committed
63

64
65
66
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
van den Berg's avatar
van den Berg committed
67
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
68
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
van den Berg's avatar
van den Berg committed
69
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
70
71
72
73
    "debian": "docker://debian:buster-slim",
    "fastq-count": "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
74
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.8--py_2",
75
76
77
78
79
80
81
    "picard-2.14": "docker://quay.io/biocontainers/picard:2.14--py36_0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "sickle": "docker://quay.io/biocontainers/sickle-trim:1.33--ha92aebf_4",
    "tabix": "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
82

83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def get_forward(wildcards):
    """ Get the forward fastq file from the config """
    return (
        settings["samples"][wildcards.sample]["libraries"]
                [wildcards.read_group]["R1"]
    )

def get_reverse(wildcards):
    """ Get the reverse fastq file from the config """
    return (
        settings["samples"][wildcards.sample]["libraries"]
            [wildcards.read_group]["R2"]
    )

def get_readgroup(wildcards):
    return settings["samples"][wildcards.sample]["libraries"]
Sander Bollen's avatar
Sander Bollen committed
99

van den Berg's avatar
van den Berg committed
100
101
102
103
104
105
def get_readgroup_per_sample():
    for sample in settings["samples"]:
        for rg in settings["samples"][sample]["libraries"]:
            yield rg, sample


106
107
108
def coverage_stats(wildcards):
    files = expand("{sample}/coverage/refFlat_coverage.tsv", sample=settings['samples'])
    return files if "refflat" in settings else []
Sander Bollen's avatar
Sander Bollen committed
109

Sander Bollen's avatar
Sander Bollen committed
110
111
rule all:
    input:
van den Berg's avatar
van den Berg committed
112
        multiqc="multiqc_report/multiqc_report.html",
van den Berg's avatar
van den Berg committed
113
        #stats = "stats.json",
van den Berg's avatar
van den Berg committed
114
115
116
117
118
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai", sample=settings['samples']),
        vcfs=expand("{sample}/vcf/{sample}.vcf.gz", sample=settings['samples']),
        vcf_tbi=expand("{sample}/vcf/{sample}.vcf.gz.tbi", sample=settings['samples']),
        gvcfs=expand("{sample}/vcf/{sample}.g.vcf.gz", sample=settings['samples']),
        gvcf_tbi=expand("{sample}/vcf/{sample}.g.vcf.gz.tbi", sample=settings['samples']),
van den Berg's avatar
van den Berg committed
119
120
121
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
        fastqc_trimmed = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
        #coverage_stats = coverage_stats,
Sander Bollen's avatar
Sander Bollen committed
122

Sander Bollen's avatar
Sander Bollen committed
123
124
125

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
126
    output: directory("tmp")
127
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
128
129
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
130
rule genome:
Sander Bollen's avatar
Sander Bollen committed
131
    """Create genome file as used by bedtools"""
van den Berg's avatar
van den Berg committed
132
    input: settings["reference"]
Sander Bollen's avatar
Sander Bollen committed
133
    output: "current.genome"
134
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
135
136
137
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
138
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
139
    input:
van den Berg's avatar
van den Berg committed
140
141
        r1=get_forward,
        r2=get_reverse
Sander Bollen's avatar
Sander Bollen committed
142
    output:
van den Berg's avatar
van den Berg committed
143
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
van den Berg's avatar
van den Berg committed
144
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
145
146
    log:
        "{sample}/pre_process/{sample}-{read_group}.txt"
147
    singularity: containers["cutadapt"]
van den Berg's avatar
van den Berg committed
148
149
150
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--output {output.r1} --paired-output {output.r2} -Z "
van den Berg's avatar
van den Berg committed
151
           "{input.r1} {input.r2} "
152
           "--report=minimal > {log}"
Sander Bollen's avatar
Sander Bollen committed
153
154

rule align:
Sander Bollen's avatar
Sander Bollen committed
155
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
156
    input:
van den Berg's avatar
van den Berg committed
157
158
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
van den Berg's avatar
van den Berg committed
159
        ref = settings["reference"],
160
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
161
    params:
162
163
        rg = "@RG\\tID:{read_group}\\tSM:{sample}\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
164
    singularity: containers["bwa-0.7.17-picard-2.18.7"]
Sander Bollen's avatar
Sander Bollen committed
165
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
166
167
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
168
169
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

170
171
172
173
174
def markdup_bam_input(wildcards):
    """ Generate the INPUT for each bam file """
    return ["INPUT={sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
            read_group=rg) for rg in get_readgroup(wildcards)]

Sander Bollen's avatar
Sander Bollen committed
175
rule markdup:
Sander Bollen's avatar
Sander Bollen committed
176
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
177
    input:
178
179
180
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
        read_group=rg) for rg in get_readgroup(wildcards)),
Sander Bollen's avatar
Sander Bollen committed
181
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
182
    output:
Sander Bollen's avatar
Sander Bollen committed
183
184
185
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
186
187
    params:
        bams=markdup_bam_input
188
    singularity: containers["picard-2.14"]
189
190
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
191
           "{params.bams} OUTPUT={output.bam} "
Sander Bollen's avatar
Sander Bollen committed
192
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
193
194
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
195
196
197
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
198
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
199
    output:
Sander Bollen's avatar
Sander Bollen committed
200
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
201
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
202
203
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
204
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
205
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
206
    input:
Sander Bollen's avatar
Sander Bollen committed
207
        bam = "{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
208
        ref = settings["reference"],
209
        vcfs = settings["known_sites"]
Sander Bollen's avatar
Sander Bollen committed
210
    output:
Sander Bollen's avatar
Sander Bollen committed
211
        grp = "{sample}/bams/{sample}.baserecal.grp"
212
    params: " ".join(expand("-knownSites {vcf}", vcf=settings["known_sites"]))
213
    singularity: containers["gatk"]
214
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
215
216
           "BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
217
           "-cov CycleCovariate -cov ContextCovariate {params}"
Sander Bollen's avatar
Sander Bollen committed
218

219
checkpoint scatterregions:
van den Berg's avatar
van den Berg committed
220
221
    """Scatter the reference genome"""
    input:
van den Berg's avatar
van den Berg committed
222
        ref = settings["reference"],
223
    params:
van den Berg's avatar
van den Berg committed
224
        size = settings['scatter_size']
van den Berg's avatar
van den Berg committed
225
    output:
226
        directory("scatter")
van den Berg's avatar
van den Berg committed
227
    singularity: containers["biopet-scatterregions"]
228
    shell: "mkdir -p {output} && "
van den Berg's avatar
van den Berg committed
229
           "biopet-scatterregions "
230
           "--referenceFasta {input.ref} --scatterSize {params.size} "
van den Berg's avatar
van den Berg committed
231
232
           "--outputDir scatter"

Sander Bollen's avatar
Sander Bollen committed
233
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
234
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
235
    input:
Sander Bollen's avatar
Sander Bollen committed
236
237
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
van den Berg's avatar
van den Berg committed
238
239
        dbsnp=settings["dbsnp"],
        ref=settings["reference"],
van den Berg's avatar
van den Berg committed
240
        region="scatter/scatter-{chunk}.bed"
Sander Bollen's avatar
Sander Bollen committed
241
    output:
242
243
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
244
245
    wildcard_constraints:
        chunk="[0-9]+"
246
    singularity: containers["gatk"]
247
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
248
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
249
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
van den Berg's avatar
van den Berg committed
250
           "-L '{input.region}' -o '{output.gvcf}' "
Sander Bollen's avatar
Sander Bollen committed
251
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
252
253
254
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 "
Sander Bollen's avatar
Sander Bollen committed
255

256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
def aggregate_gvcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_gvcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule gvcf_gather:
    """ Join the gvcf files together """
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
272
273
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
274
275
    singularity: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps --output {output.gvcf} "
276
277
           "--output-type z && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
278
279

rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
280
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
281
    input:
282
283
        gvcf = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz",
        tbi = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
284
        ref= settings["reference"]
Sander Bollen's avatar
Sander Bollen committed
285
    output:
286
287
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
288
289
    wildcard_constraints:
        chunk="[0-9]+"
290
    singularity: containers["gatk"]
291
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
292
           "GenotypeGVCFs -R {input.ref} "
293
           "-V {input.gvcf} -o '{output.vcf}'"
294
295


296
def aggregate_vcf(wildcards):
297
298
299
300
301
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


302
303
304
305
306
307
def aggregate_vcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


Sander Bollen's avatar
Sander Bollen committed
308
rule genotype_gather:
309
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
310
    input:
311
        vcfs = aggregate_vcf,
312
        vcfs_tbi = aggregate_vcf_tbi
Sander Bollen's avatar
Sander Bollen committed
313
    output:
314
315
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
316
    singularity: containers["bcftools"]
317
    shell: "bcftools concat {input.vcfs} --allow-overlaps --output {output.vcf} "
318
319
           "--output-type z && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
320
321


Sander Bollen's avatar
Sander Bollen committed
322
## bam metrics
323
rule mapped_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
324
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
325
    input:
326
        bam="{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
327
        pywc=settings["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
328
    output:
329
330
        reads="{sample}/bams/{sample}.mapped.num",
        bases="{sample}/bams/{sample}.mapped.basenum"
331
    singularity: containers["samtools-1.7-python-3.6"]
332
333
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
334
335


336
rule unique_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
337
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
338
    input:
339
        bam="{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
340
        pywc=settings["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
341
    output:
342
343
        reads="{sample}/bams/{sample}.unique.num",
        bases="{sample}/bams/{sample}.usable.basenum"
344
    singularity: containers["samtools-1.7-python-3.6"]
345
346
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
347
348
349


## fastqc
Sander Bollen's avatar
Sander Bollen committed
350
rule fastqc_raw:
351
352
353
    """
    Run fastqc on raw fastq files
    """
Sander Bollen's avatar
Sander Bollen committed
354
    input:
van den Berg's avatar
van den Berg committed
355
356
        r1=get_forward,
        r2=get_reverse
Sander Bollen's avatar
Sander Bollen committed
357
    output:
van den Berg's avatar
van den Berg committed
358
        directory("{sample}/pre_process/raw-{sample}-{read_group}/")
359
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
360
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
361
362


Sander Bollen's avatar
Sander Bollen committed
363
rule fastqc_postqc:
364
365
366
    """
    Run fastqc on fastq files post pre-processing
    """
Sander Bollen's avatar
Sander Bollen committed
367
    input:
van den Berg's avatar
van den Berg committed
368
369
        r1="{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
370
    output:
van den Berg's avatar
van den Berg committed
371
        directory("{sample}/pre_process/trimmed-{sample}-{read_group}/")
372
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
373
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
374
375
376
377
378


## fastq-count

rule fqcount_preqc:
Sander Bollen's avatar
Sander Bollen committed
379
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
380
    input:
381
382
        r1=get_forward,
        r2=get_reverse
Sander Bollen's avatar
Sander Bollen committed
383
    output:
384
        "{sample}/pre_process/{sample}-{read_group}.preqc_count.json"
385
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
386
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
387
388
389


rule fqcount_postqc:
Sander Bollen's avatar
Sander Bollen committed
390
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
391
    input:
van den Berg's avatar
van den Berg committed
392
393
        r1="{sample}/pre_process/{sample}-{read_group}_R1.fastq",
        r2="{sample}/pre_process/{sample}-{read_group}_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
394
    output:
395
        "{sample}/pre_process/{sample}-{read_group}.postqc_count.json"
396
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
397
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
398
399
400
401
402


## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
403
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
404
    input:
Sander Bollen's avatar
Sander Bollen committed
405
406
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
van den Berg's avatar
van den Berg committed
407
        covpy=settings["covstats"],
van den Berg's avatar
van den Berg committed
408
        bed=settings.get("bedfile","")
Sander Bollen's avatar
Sander Bollen committed
409
410
411
    params:
        subt="Sample {sample}"
    output:
van den Berg's avatar
van den Berg committed
412
413
        covj="{sample}/coverage/covstats.json",
        covp="{sample}/coverage/covstats.png"
414
    singularity: containers["bedtools-2.26-python-2.7"]
Sander Bollen's avatar
Sander Bollen committed
415
416
417
418
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
419
420


Sander Bollen's avatar
Sander Bollen committed
421
422
423
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
424
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
425
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
426
        ref = settings.get('refflat', "")
Sander Bollen's avatar
Sander Bollen committed
427
    output:
428
        tsv="{sample}/coverage/refFlat_coverage.tsv"
429
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
430
431
432
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
433
434
435
## vcfstats

rule vcfstats:
Sander Bollen's avatar
Sander Bollen committed
436
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
437
    input:
438
        vcf="{sample}/vcf/{sample}.vcf.gz",
439
        tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
440
    output:
441
        stats="{sampel}/vcf/{sample}.vcfstats.json"
442
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
443
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
444
445


Sander Bollen's avatar
Sander Bollen committed
446
## collection
van den Berg's avatar
van den Berg committed
447
if "bedfile" in settings:
448
449
450
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
Sander Bollen's avatar
Sander Bollen committed
451
452
453
454
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
van den Berg's avatar
van den Berg committed
455
            cov="{sample}/coverage/covstats.json",
van den Berg's avatar
van den Berg committed
456
            colpy=settings["collect_stats"]
457
458
        params:
            sample_name="{sample}",
van den Berg's avatar
van den Berg committed
459
            fthresh=settings["female_threshold"]
460
        output:
Sander Bollen's avatar
Sander Bollen committed
461
            "{sample}/{sample}.stats.json"
462
        singularity: containers["vtools"]
463
464
465
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
466
               "--female-threshold {params.fthresh} "
van den Berg's avatar
van den Berg committed
467
               "{input.cov} > {output}"
468
469
else:
    rule collectstats:
Sander Bollen's avatar
Sander Bollen committed
470
        """Collect all stats for a particular sample without beds"""
Sander Bollen's avatar
Sander Bollen committed
471
        input:
Sander Bollen's avatar
Sander Bollen committed
472
473
474
475
476
477
            preqc = "{sample}/pre_process/{sample}.preqc_count.json",
            postq = "{sample}/pre_process/{sample}.postqc_count.json",
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
van den Berg's avatar
van den Berg committed
478
            colpy = settings["collect_stats"]
Sander Bollen's avatar
Sander Bollen committed
479
480
        params:
            sample_name = "{sample}",
van den Berg's avatar
van den Berg committed
481
            fthresh = settings["female_threshold"]
Sander Bollen's avatar
Sander Bollen committed
482
        output:
Sander Bollen's avatar
Sander Bollen committed
483
            "{sample}/{sample}.stats.json"
484
        singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
485
486
487
488
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
489
               "--female-threshold {params.fthresh} "
van den Berg's avatar
van den Berg committed
490
               "> {output}"
Sander Bollen's avatar
Sander Bollen committed
491
492

rule merge_stats:
Sander Bollen's avatar
Sander Bollen committed
493
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
494
    input:
van den Berg's avatar
van den Berg committed
495
496
497
        cols=expand("{sample}/{sample}.stats.json", sample=settings['samples']),
        vstat=expand("{sample}/vcf/{sample}.vcfstats.json", sample=settings['samples']),
        mpy=settings["merge_stats"]
Sander Bollen's avatar
Sander Bollen committed
498
    output:
Sander Bollen's avatar
Sander Bollen committed
499
        stats="stats.json"
500
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
501
502
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
           "> {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
503
504


Sander Bollen's avatar
Sander Bollen committed
505
506
507
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
Sander Bollen's avatar
Sander Bollen committed
508
        stats="stats.json",
van den Berg's avatar
van den Berg committed
509
        sc=settings["stats_to_tsv"]
Sander Bollen's avatar
Sander Bollen committed
510
    output:
Sander Bollen's avatar
Sander Bollen committed
511
        stats="stats.tsv"
512
    singularity: containers["python3"]
Sander Bollen's avatar
Sander Bollen committed
513
514
515
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
516
517
518
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
519
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
520
521
    """
    input:
Sander Bollen's avatar
Sander Bollen committed
522
        stats="stats.tsv"
Sander Bollen's avatar
Sander Bollen committed
523
    params:
Sander Bollen's avatar
Sander Bollen committed
524
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
525
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
526
    output:
van den Berg's avatar
van den Berg committed
527
        "multiqc_report/multiqc_report.html"
528
    singularity: containers["multiqc"]
van den Berg's avatar
van den Berg committed
529
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output}"