Snakefile 19 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

van den Berg's avatar
van den Berg committed
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# Read the config json
with open(config['CONFIG_JSON'], 'rt') as fin:
    settings = json.load(fin)

# Read the json schema
with open('config/schema.json', 'rt') as fin:
    schema = json.load(fin)

# Validate the settings against the schema
try:
    jsonschema.validate(settings, schema)
except jsonschema.ValidationError as e:
    raise jsonschema.ValidationError(f'Invalid CONFIG_JSON: {e}')

45

46
47
48
# Set default values
def set_default(key, value):
    """ Set default config values """
van den Berg's avatar
van den Berg committed
49
50
51
52
53
54
55
56
57
58
59
60
61
    if key not in settings:
        settings[key] = value

# Set the default settings
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Set the script paths
set_default("covstats", "src/covstats.py")
set_default("collect_stats", "src/collect_stats.py")
set_default("merge_stats", "src/merge_stats.py")
set_default("stats_to_tsv", "src/stats_to_tsv.py")
set_default("py_wordcount", "src/pywc.py")
62
set_default("cutadapt_summary", "src/cutadapt_summary.py")
Sander Bollen's avatar
Sander Bollen committed
63

64
65
66
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
van den Berg's avatar
van den Berg committed
67
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
68
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
van den Berg's avatar
van den Berg committed
69
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
70
71
72
    "debian": "docker://debian:buster-slim",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
73
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.8--py_2",
74
75
76
77
78
    "picard-2.14": "docker://quay.io/biocontainers/picard:2.14--py36_0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
79

80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def get_forward(wildcards):
    """ Get the forward fastq file from the config """
    return (
        settings["samples"][wildcards.sample]["libraries"]
                [wildcards.read_group]["R1"]
    )

def get_reverse(wildcards):
    """ Get the reverse fastq file from the config """
    return (
        settings["samples"][wildcards.sample]["libraries"]
            [wildcards.read_group]["R2"]
    )

def get_readgroup(wildcards):
    return settings["samples"][wildcards.sample]["libraries"]
Sander Bollen's avatar
Sander Bollen committed
96

van den Berg's avatar
van den Berg committed
97
98
99
100
101
102
def get_readgroup_per_sample():
    for sample in settings["samples"]:
        for rg in settings["samples"][sample]["libraries"]:
            yield rg, sample


103
104
105
def coverage_stats(wildcards):
    files = expand("{sample}/coverage/refFlat_coverage.tsv", sample=settings['samples'])
    return files if "refflat" in settings else []
Sander Bollen's avatar
Sander Bollen committed
106

Sander Bollen's avatar
Sander Bollen committed
107
108
rule all:
    input:
van den Berg's avatar
van den Berg committed
109
        multiqc="multiqc_report/multiqc_report.html",
110
111
        stats = "stats.json",
        stats_tsv = "stats.tsv",
van den Berg's avatar
van den Berg committed
112
113
114
115
116
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai", sample=settings['samples']),
        vcfs=expand("{sample}/vcf/{sample}.vcf.gz", sample=settings['samples']),
        vcf_tbi=expand("{sample}/vcf/{sample}.vcf.gz.tbi", sample=settings['samples']),
        gvcfs=expand("{sample}/vcf/{sample}.g.vcf.gz", sample=settings['samples']),
        gvcf_tbi=expand("{sample}/vcf/{sample}.g.vcf.gz.tbi", sample=settings['samples']),
van den Berg's avatar
van den Berg committed
117
118
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
        fastqc_trimmed = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
van den Berg's avatar
van den Berg committed
119
120
        cutadapt = (f"{sample}/pre_process/{sample}-{read_group}.txt" for read_group, sample in get_readgroup_per_sample()),
        coverage_stats = coverage_stats,
Sander Bollen's avatar
Sander Bollen committed
121

Sander Bollen's avatar
Sander Bollen committed
122
123
124

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
125
    output: directory("tmp")
126
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
127
128
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
129
rule genome:
Sander Bollen's avatar
Sander Bollen committed
130
    """Create genome file as used by bedtools"""
van den Berg's avatar
van den Berg committed
131
    input: settings["reference"]
Sander Bollen's avatar
Sander Bollen committed
132
    output: "current.genome"
133
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
134
135
136
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
137
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
138
    input:
van den Berg's avatar
van den Berg committed
139
140
        r1=get_forward,
        r2=get_reverse
Sander Bollen's avatar
Sander Bollen committed
141
    output:
van den Berg's avatar
van den Berg committed
142
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
van den Berg's avatar
van den Berg committed
143
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
144
145
    log:
        "{sample}/pre_process/{sample}-{read_group}.txt"
146
    singularity: containers["cutadapt"]
van den Berg's avatar
van den Berg committed
147
148
149
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--output {output.r1} --paired-output {output.r2} -Z "
van den Berg's avatar
van den Berg committed
150
           "{input.r1} {input.r2} "
151
           "--report=minimal > {log}"
Sander Bollen's avatar
Sander Bollen committed
152
153

rule align:
Sander Bollen's avatar
Sander Bollen committed
154
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
155
    input:
van den Berg's avatar
van den Berg committed
156
157
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
van den Berg's avatar
van den Berg committed
158
        ref = settings["reference"],
159
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
160
    params:
161
162
        rg = "@RG\\tID:{read_group}\\tSM:{sample}\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
163
    singularity: containers["bwa-0.7.17-picard-2.18.7"]
Sander Bollen's avatar
Sander Bollen committed
164
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
165
166
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
167
168
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

169
170
171
172
173
def markdup_bam_input(wildcards):
    """ Generate the INPUT for each bam file """
    return ["INPUT={sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
            read_group=rg) for rg in get_readgroup(wildcards)]

Sander Bollen's avatar
Sander Bollen committed
174
rule markdup:
Sander Bollen's avatar
Sander Bollen committed
175
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
176
    input:
177
178
179
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
        read_group=rg) for rg in get_readgroup(wildcards)),
Sander Bollen's avatar
Sander Bollen committed
180
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
181
    output:
Sander Bollen's avatar
Sander Bollen committed
182
183
184
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
185
186
    params:
        bams=markdup_bam_input
187
    singularity: containers["picard-2.14"]
188
189
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
190
           "{params.bams} OUTPUT={output.bam} "
Sander Bollen's avatar
Sander Bollen committed
191
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
192
193
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
194
195
196
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
197
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
198
    output:
Sander Bollen's avatar
Sander Bollen committed
199
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
200
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
201
202
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
203
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
204
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
205
    input:
Sander Bollen's avatar
Sander Bollen committed
206
        bam = "{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
207
        ref = settings["reference"],
208
        vcfs = settings["known_sites"]
Sander Bollen's avatar
Sander Bollen committed
209
    output:
Sander Bollen's avatar
Sander Bollen committed
210
        grp = "{sample}/bams/{sample}.baserecal.grp"
211
    params: " ".join(expand("-knownSites {vcf}", vcf=settings["known_sites"]))
212
    singularity: containers["gatk"]
213
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
214
215
           "BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
216
           "-cov CycleCovariate -cov ContextCovariate {params}"
Sander Bollen's avatar
Sander Bollen committed
217

218
checkpoint scatterregions:
van den Berg's avatar
van den Berg committed
219
220
    """Scatter the reference genome"""
    input:
van den Berg's avatar
van den Berg committed
221
        ref = settings["reference"],
222
    params:
van den Berg's avatar
van den Berg committed
223
        size = settings['scatter_size']
van den Berg's avatar
van den Berg committed
224
    output:
225
        directory("scatter")
van den Berg's avatar
van den Berg committed
226
    singularity: containers["biopet-scatterregions"]
227
    shell: "mkdir -p {output} && "
van den Berg's avatar
van den Berg committed
228
           "biopet-scatterregions "
229
           "--referenceFasta {input.ref} --scatterSize {params.size} "
van den Berg's avatar
van den Berg committed
230
231
           "--outputDir scatter"

Sander Bollen's avatar
Sander Bollen committed
232
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
233
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
234
    input:
Sander Bollen's avatar
Sander Bollen committed
235
236
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
van den Berg's avatar
van den Berg committed
237
238
        dbsnp=settings["dbsnp"],
        ref=settings["reference"],
van den Berg's avatar
van den Berg committed
239
        region="scatter/scatter-{chunk}.bed"
Sander Bollen's avatar
Sander Bollen committed
240
    output:
241
242
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
243
244
    wildcard_constraints:
        chunk="[0-9]+"
245
    singularity: containers["gatk"]
246
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
247
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
248
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
van den Berg's avatar
van den Berg committed
249
           "-L '{input.region}' -o '{output.gvcf}' "
Sander Bollen's avatar
Sander Bollen committed
250
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
251
252
253
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 "
Sander Bollen's avatar
Sander Bollen committed
254

255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
def aggregate_gvcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_gvcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule gvcf_gather:
    """ Join the gvcf files together """
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
271
272
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
273
274
    singularity: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps --output {output.gvcf} "
275
276
           "--output-type z && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
277
278

rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
279
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
280
    input:
281
282
        gvcf = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz",
        tbi = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
283
        ref= settings["reference"]
Sander Bollen's avatar
Sander Bollen committed
284
    output:
285
286
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
287
288
    wildcard_constraints:
        chunk="[0-9]+"
289
    singularity: containers["gatk"]
290
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
291
           "GenotypeGVCFs -R {input.ref} "
292
           "-V {input.gvcf} -o '{output.vcf}'"
293
294


295
def aggregate_vcf(wildcards):
296
297
298
299
300
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


301
302
303
304
305
306
def aggregate_vcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


Sander Bollen's avatar
Sander Bollen committed
307
rule genotype_gather:
308
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
309
    input:
310
        vcfs = aggregate_vcf,
311
        vcfs_tbi = aggregate_vcf_tbi
Sander Bollen's avatar
Sander Bollen committed
312
    output:
313
314
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
315
    singularity: containers["bcftools"]
316
    shell: "bcftools concat {input.vcfs} --allow-overlaps --output {output.vcf} "
317
318
           "--output-type z && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
319
320


Sander Bollen's avatar
Sander Bollen committed
321
## bam metrics
322
rule mapped_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
323
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
324
    input:
325
        bam="{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
326
        pywc=settings["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
327
    output:
328
329
        reads="{sample}/bams/{sample}.mapped.num",
        bases="{sample}/bams/{sample}.mapped.basenum"
330
    singularity: containers["samtools-1.7-python-3.6"]
331
332
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
333
334


335
rule unique_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
336
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
337
    input:
338
        bam="{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
339
        pywc=settings["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
340
    output:
341
342
        reads="{sample}/bams/{sample}.unique.num",
        bases="{sample}/bams/{sample}.usable.basenum"
343
    singularity: containers["samtools-1.7-python-3.6"]
344
345
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
346
347
348


## fastqc
Sander Bollen's avatar
Sander Bollen committed
349
rule fastqc_raw:
350
351
352
    """
    Run fastqc on raw fastq files
    """
Sander Bollen's avatar
Sander Bollen committed
353
    input:
van den Berg's avatar
van den Berg committed
354
355
        r1=get_forward,
        r2=get_reverse
Sander Bollen's avatar
Sander Bollen committed
356
    output:
van den Berg's avatar
van den Berg committed
357
        directory("{sample}/pre_process/raw-{sample}-{read_group}/")
358
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
359
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
360
361


Sander Bollen's avatar
Sander Bollen committed
362
rule fastqc_postqc:
363
364
365
    """
    Run fastqc on fastq files post pre-processing
    """
Sander Bollen's avatar
Sander Bollen committed
366
    input:
van den Berg's avatar
van den Berg committed
367
368
        r1="{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
369
    output:
van den Berg's avatar
van den Berg committed
370
        directory("{sample}/pre_process/trimmed-{sample}-{read_group}/")
371
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
372
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
373
374


375
## coverage
Sander Bollen's avatar
Sander Bollen committed
376
rule covstats:
Sander Bollen's avatar
Sander Bollen committed
377
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
378
    input:
Sander Bollen's avatar
Sander Bollen committed
379
380
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
van den Berg's avatar
van den Berg committed
381
        covpy=settings["covstats"],
van den Berg's avatar
van den Berg committed
382
        bed=settings.get("bedfile","")
Sander Bollen's avatar
Sander Bollen committed
383
384
385
    params:
        subt="Sample {sample}"
    output:
van den Berg's avatar
van den Berg committed
386
387
        covj="{sample}/coverage/covstats.json",
        covp="{sample}/coverage/covstats.png"
388
    singularity: containers["bedtools-2.26-python-2.7"]
Sander Bollen's avatar
Sander Bollen committed
389
390
391
392
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
393
394


Sander Bollen's avatar
Sander Bollen committed
395
396
397
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
398
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
399
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
400
        ref = settings.get('refflat', "")
Sander Bollen's avatar
Sander Bollen committed
401
    output:
402
        tsv="{sample}/coverage/refFlat_coverage.tsv"
403
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
404
405
406
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


407
## collection
408
409
410
411
412
413
414
415
416
if "bedfile" in settings:
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
            cov="{sample}/coverage/covstats.json",
417
            cutadapt = "{sample}/cutadapt.json",
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
            colpy=settings["collect_stats"]
        params:
            sample_name="{sample}",
            fthresh=settings["female_threshold"]
        output:
            "{sample}/{sample}.stats.json"
        singularity: containers["vtools"]
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
               "--female-threshold {params.fthresh} "
               "--cutadapt {input.cutadapt} "
               "{input.cov} > {output}"
else:
    rule collectstats:
        """Collect all stats for a particular sample without beds"""
        input:
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
439
            cutadapt = "{sample}/cutadapt.json",
440
441
442
443
444
445
446
447
448
449
450
451
452
453
            colpy = settings["collect_stats"]
        params:
            sample_name = "{sample}",
            fthresh = settings["female_threshold"]
        output:
            "{sample}/{sample}.stats.json"
        singularity: containers["vtools"]
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
               "--female-threshold {params.fthresh} "
               "--cutadapt {input.cutadapt} "
               "> {output}"

454
455
rule collect_cutadapt_summary:
    """ Colect cutadapt summary from each readgroup per sample """
Sander Bollen's avatar
Sander Bollen committed
456
    input:
van den Berg's avatar
van den Berg committed
457
458
459
        cutadapt = lambda wildcards:
        ("{sample}/pre_process/{sample}-{read_group}.txt".format(sample=wildcards.sample,
        read_group=read_group) for read_group in get_readgroup(wildcards)),
460
461
        cutadapt_summary= settings["cutadapt_summary"]
    output: "{sample}/cutadapt.json"
462
    singularity: containers["python3"]
463
    shell: "python {input.cutadapt_summary} --sample {wildcards.sample} "
van den Berg's avatar
van den Berg committed
464
465
           "--cutadapt-summary {input.cutadapt} > {output}"

van den Berg's avatar
van den Berg committed
466

467
468
469
470
471
472
473
474
rule merge_stats:
    """Merge all stats of all samples"""
    input:
        cols=expand("{sample}/{sample}.stats.json", sample=settings['samples']),
        mpy=settings["merge_stats"]
    output:
        stats="stats.json"
    singularity: containers["vtools"]
475
    shell: "python {input.mpy} --collectstats {input.cols} "
476
477
478
479
480
481
482
483
484
485
486
487
           "> {output.stats}"


rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
        stats="stats.json",
        sc=settings["stats_to_tsv"]
    output:
        stats="stats.tsv"
    singularity: containers["python3"]
    shell: "python {input.sc} -i {input.stats} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
488
489


Sander Bollen's avatar
Sander Bollen committed
490
491
492
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
493
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
494
495
    """
    input:
496
        stats="stats.tsv"
Sander Bollen's avatar
Sander Bollen committed
497
    params:
Sander Bollen's avatar
Sander Bollen committed
498
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
499
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
500
    output:
van den Berg's avatar
van den Berg committed
501
        "multiqc_report/multiqc_report.html"
502
    singularity: containers["multiqc"]
503
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output}"