Snakefile 28.1 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
Main Snakefile for the pipeline. 

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
Sander Bollen's avatar
Sander Bollen committed
25
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
26
from os.path import join, basename
27
from pathlib import Path
Sander Bollen's avatar
Sander Bollen committed
28
29

from pyfaidx import Fasta
Sander Bollen's avatar
Sander Bollen committed
30
31

REFERENCE = config.get("REFERENCE")
32
33
if REFERENCE is None:
    raise ValueError("You must set --config REFERENCE=<path>")
Sander Bollen's avatar
Sander Bollen committed
34
if not Path(REFERENCE).exists():
Sander Bollen's avatar
Sander Bollen committed
35
36
    raise FileNotFoundError("Reference file {0} "
                            "does not exist.".format(REFERENCE))
37

Sander Bollen's avatar
Sander Bollen committed
38
GATK = config.get("GATK")
39
40
if GATK is None:
    raise ValueError("You must set --config GATK=<path>")
Sander Bollen's avatar
Sander Bollen committed
41
42
if not Path(GATK).exists():
    raise FileNotFoundError("{0} does not exist.".format(GATK))
43

Sander Bollen's avatar
Sander Bollen committed
44
DBSNP = config.get("DBSNP")
45
46
if DBSNP is None:
    raise ValueError("You must set --config DBSNP=<path>")
Sander Bollen's avatar
Sander Bollen committed
47
48
if not Path(DBSNP).exists():
    raise FileNotFoundError("{0} does not exist".format(DBSNP))
49

Sander Bollen's avatar
Sander Bollen committed
50
ONETHOUSAND = config.get("ONETHOUSAND")
51
52
if ONETHOUSAND is None:
    raise ValueError("You must set --config ONETHOUSAND=<path>")
Sander Bollen's avatar
Sander Bollen committed
53
54
if not Path(ONETHOUSAND).exists():
    raise FileNotFoundError("{0} does not exist".format(ONETHOUSAND))
55

Sander Bollen's avatar
Sander Bollen committed
56
HAPMAP = config.get("HAPMAP")
57
if HAPMAP is None:
Sander Bollen's avatar
Sander Bollen committed
58
59
60
    raise ValueError("You must set --config HAPMAP=<path>")
if not Path(HAPMAP).exists():
    raise FileNotFoundError("{0} does not exist".format(HAPMAP))
61
62

# these are all optional
63
64
BED = config.get("BED", "")  # comma-separated list of BED files
REFFLAT = config.get("REFFLAT", "")  # comma-separated list of refFlat files
Sander Bollen's avatar
Sander Bollen committed
65
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
66
FASTQ_COUNT = config.get("FASTQ_COUNT")
Sander Bollen's avatar
Sander Bollen committed
67
MAX_BASES = config.get("MAX_BASES", "")
Sander Bollen's avatar
Sander Bollen committed
68

69
70
71
72
73
def fsrc_dir(*args):
    """Wrapper around snakemake's srcdir to work like os.path.join"""
    if len(args) == 1:
        return srcdir(args[0])
    return srcdir(join(*args))
Sander Bollen's avatar
Sander Bollen committed
74

75
76
77
covpy = fsrc_dir("src", "covstats.py")
colpy = fsrc_dir("src", "collect_stats.py")
mpy = fsrc_dir("src", "merge_stats.py")
Sander Bollen's avatar
Sander Bollen committed
78
seq = fsrc_dir("src", "seqtk.sh")
Sander Bollen's avatar
Sander Bollen committed
79
fqpy = fsrc_dir("src", "fastqc_stats.py")
Sander Bollen's avatar
Sander Bollen committed
80
tsvpy = fsrc_dir("src", "stats_to_tsv.py")
Sander Bollen's avatar
Sander Bollen committed
81
fqsc = fsrc_dir("src", "safe_fastqc.sh")
Sander Bollen's avatar
Sander Bollen committed
82

Sander Bollen's avatar
Sander Bollen committed
83
84
85
86
87
88
89
# sample config parsing
SCONFIG = config.get("SAMPLE_CONFIG")
if SCONFIG is None:
    raise ValueError("You must set --config SAMPLE_CONFIG=<path>")
if not Path(SCONFIG).exists():
    raise FileNotFoundError("{0} does not exist".format(SCONFIG))

Sander Bollen's avatar
Sander Bollen committed
90
91
92
93
with open(config.get("SAMPLE_CONFIG")) as handle:
    SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()

Sander Bollen's avatar
Sander Bollen committed
94
95
96
97
98
99
100
101
102
if BED != "":
    BEDS = BED.split(",")
else:
    BEDS = []

if REFFLAT != "":
    REFFLATS = REFFLAT.split(",")
else:
    REFFLATS = []
Sander Bollen's avatar
Sander Bollen committed
103
104

BASE_BEDS = [basename(x) for x in BEDS]
Sander Bollen's avatar
Sander Bollen committed
105
BASE_REFFLATS = [basename(x) for x in REFFLATS]
Sander Bollen's avatar
Sander Bollen committed
106

Sander Bollen's avatar
Sander Bollen committed
107
108

def split_genome(ref, approx_n_chunks=100):
Sander Bollen's avatar
Sander Bollen committed
109
110
111
112
113
114
115
116
    """
    Split genome in chunks.

    Chunks are strings in the format: `<ctg>:<start>-<end>`
    These follow the region string format as used by htslib,
    which uses _1_-based indexing.
    See: http://www.htslib.org/doc/tabix.html
    """
Sander Bollen's avatar
Sander Bollen committed
117
118
119
120
121
    fa = Fasta(ref)
    tot_size = sum([len(x) for x in fa.records.values()])
    chunk_size = tot_size//approx_n_chunks
    chunks = []
    for chrom_name, chrom_value in fa.records.items():
Sander Bollen's avatar
Sander Bollen committed
122
        pos = 1
Sander Bollen's avatar
Sander Bollen committed
123
124
125
126
127
128
129
130
131
132
133
134
135
        while pos <= len(chrom_value):
            end = pos+chunk_size
            if end <= len(chrom_value):
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, end)
            else:
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, len(chrom_value))
            chunks.append(chunk)
            pos = end
    return chunks

CHUNKS = split_genome(REFERENCE)


Sander Bollen's avatar
Sander Bollen committed
136

Sander Bollen's avatar
Sander Bollen committed
137
138
def get_r(strand, wildcards):
    """Get fastq files on a single strand for a sample"""
Sander Bollen's avatar
Sander Bollen committed
139
    s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
Sander Bollen's avatar
Sander Bollen committed
140
    rs = []
141
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
Sander Bollen committed
142
143
        rs.append(s['libraries'][l][strand])
    return rs
Sander Bollen's avatar
Sander Bollen committed
144

Sander Bollen's avatar
Sander Bollen committed
145
146
get_r1 = partial(get_r, "R1")
get_r2 = partial(get_r, "R2")
Sander Bollen's avatar
Sander Bollen committed
147
148


Sander Bollen's avatar
Sander Bollen committed
149
def get_bedpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
150
151
    """Get absolute path of a bed file"""
    return next(x for x in BEDS if basename(x) == wildcards.bed)
Sander Bollen's avatar
Sander Bollen committed
152
153
154


def get_refflatpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
155
    """Get absolute path of a refflat file"""
Sander Bollen's avatar
Sander Bollen committed
156
    return next(x for x in REFFLATS if basename(x) == wildcards.ref)
Sander Bollen's avatar
Sander Bollen committed
157
158


Sander Bollen's avatar
Sander Bollen committed
159
def sample_gender(wildcards):
Sander Bollen's avatar
Sander Bollen committed
160
    """Get sample gender"""
Sander Bollen's avatar
Sander Bollen committed
161
162
163
164
    sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
    return sam.get("gender", "null")


Sander Bollen's avatar
Sander Bollen committed
165
166
167
168
def metrics(do_metrics=True):
    if not do_metrics:
        return ""

Sander Bollen's avatar
Sander Bollen committed
169
170
171
172
173
174
    fqcr = expand("{sample}/pre_process/raw_fastqc/.done.txt",
                  sample=SAMPLES)
    fqcm = expand("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
                  sample=SAMPLES)
    fqcp = expand("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
                  sample=SAMPLES)
175
    if len(REFFLATS) >= 1:
Sander Bollen's avatar
Sander Bollen committed
176
177
        coverage_stats = expand("{sample}/coverage/{ref}.coverages.tsv",
                                sample=SAMPLES, ref=BASE_REFFLATS)
178
179
    else:
        coverage_stats = []
Sander Bollen's avatar
Sander Bollen committed
180
    stats = "stats.json"
Sander Bollen's avatar
Sander Bollen committed
181
    return  fqcr + fqcm + fqcp + coverage_stats + [stats]
Sander Bollen's avatar
Sander Bollen committed
182
183


Sander Bollen's avatar
Sander Bollen committed
184
185
186
localrules: gvcf_chunkfile, genotype_chunkfile


Sander Bollen's avatar
Sander Bollen committed
187
188
rule all:
    input:
Sander Bollen's avatar
Sander Bollen committed
189
190
        combined="multisample/genotyped.vcf.gz",
        report="multiqc_report/multiqc_report.html",
Sander Bollen's avatar
Sander Bollen committed
191
192
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai",sample=SAMPLES),
        vcfs=expand("{sample}/vcf/{sample}_single.vcf.gz",sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
193
        stats=metrics()
Sander Bollen's avatar
Sander Bollen committed
194

Sander Bollen's avatar
Sander Bollen committed
195
196
197

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
198
    output: directory("tmp")
199
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
200
201
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
202
rule genome:
Sander Bollen's avatar
Sander Bollen committed
203
    """Create genome file as used by bedtools"""
Sander Bollen's avatar
Sander Bollen committed
204
    input: REFERENCE
Sander Bollen's avatar
Sander Bollen committed
205
    output: "current.genome"
206
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
207
208
209
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
Sander Bollen's avatar
Sander Bollen committed
210
    """Merge all forward fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
211
    input: get_r1
Sander Bollen's avatar
Sander Bollen committed
212
    output: temp("{sample}/pre_process/{sample}.merged_R1.fastq.gz")
213
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
214
215
216
    shell: "cat {input} > {output}"

rule merge_r2:
Sander Bollen's avatar
Sander Bollen committed
217
    """Merge all reverse fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
218
    input: get_r2
Sander Bollen's avatar
Sander Bollen committed
219
    output: temp("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
220
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
221
222
    shell: "cat {input} > {output}"

Sander Bollen's avatar
Sander Bollen committed
223
224

rule seqtk_r1:
Sander Bollen's avatar
Sander Bollen committed
225
    """Either subsample or link forward fastq file"""
Sander Bollen's avatar
Sander Bollen committed
226
    input:
Sander Bollen's avatar
Sander Bollen committed
227
228
        stats="{sample}/pre_process/{sample}.preqc_count.json",
        fastq="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
229
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
230
    params:
Sander Bollen's avatar
Sander Bollen committed
231
        max_bases=str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
232
    output:
Sander Bollen's avatar
Sander Bollen committed
233
        fastq=temp("{sample}/pre_process/{sample}.sampled_R1.fastq.gz")
Sander Bollen's avatar
Sander Bollen committed
234
    conda: "envs/seqtk.yml"
Sander Bollen's avatar
Sander Bollen committed
235
    singularity: "docker://quay.io/biocontainers/mulled-v2-13686261ac0aa5682c680670ff8cda7b09637943:d143450dec169186731bb4df6f045a3c9ee08eb6-0"
Sander Bollen's avatar
Sander Bollen committed
236
237
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
238
239
240


rule seqtk_r2:
Sander Bollen's avatar
Sander Bollen committed
241
    """Either subsample or link reverse fastq file"""
Sander Bollen's avatar
Sander Bollen committed
242
    input:
Sander Bollen's avatar
Sander Bollen committed
243
244
        stats = "{sample}/pre_process/{sample}.preqc_count.json",
        fastq = "{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
245
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
246
    params:
Sander Bollen's avatar
Sander Bollen committed
247
        max_bases =str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
248
    output:
Sander Bollen's avatar
Sander Bollen committed
249
        fastq = temp("{sample}/pre_process/{sample}.sampled_R2.fastq.gz")
Sander Bollen's avatar
Sander Bollen committed
250
    conda: "envs/seqtk.yml"
Sander Bollen's avatar
Sander Bollen committed
251
    singularity: "docker://quay.io/biocontainers/mulled-v2-13686261ac0aa5682c680670ff8cda7b09637943:d143450dec169186731bb4df6f045a3c9ee08eb6-0"
Sander Bollen's avatar
Sander Bollen committed
252
253
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
254
255


256
# contains original merged fastq files as input to prevent them from being prematurely deleted
Sander Bollen's avatar
Sander Bollen committed
257
rule sickle:
Sander Bollen's avatar
Sander Bollen committed
258
    """Trim fastq files"""
Sander Bollen's avatar
Sander Bollen committed
259
    input:
Sander Bollen's avatar
Sander Bollen committed
260
261
262
263
        r1 = "{sample}/pre_process/{sample}.sampled_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}.sampled_R2.fastq.gz",
        rr1 = "{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        rr2 = "{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
264
    output:
Sander Bollen's avatar
Sander Bollen committed
265
266
        r1 = temp("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.trimmed_R2.fastq"),
Sander Bollen's avatar
Sander Bollen committed
267
        s = "{sample}/pre_process/{sample}.trimmed_singles.fastq"
Sander Bollen's avatar
Sander Bollen committed
268
    singularity: "docker://quay.io/biocontainers/sickle-trim:1.33--ha92aebf_4"
Sander Bollen's avatar
Sander Bollen committed
269
    conda: "envs/sickle.yml"
Sander Bollen's avatar
Sander Bollen committed
270
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
271
272
273
           "-p {output.r2} -s {output.s}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
274
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
275
    input:
Sander Bollen's avatar
Sander Bollen committed
276
277
        r1 = "{sample}/pre_process/{sample}.trimmed_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.trimmed_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
278
    output:
Sander Bollen's avatar
Sander Bollen committed
279
280
        r1 = temp("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
Sander Bollen's avatar
Sander Bollen committed
281
    singularity: "docker://quay.io/biocontainers/cutadapt:1.14--py36_0"
Sander Bollen's avatar
Sander Bollen committed
282
    conda: "envs/cutadapt.yml"
Sander Bollen's avatar
Sander Bollen committed
283
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
284
285
286
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
Sander Bollen's avatar
Sander Bollen committed
287
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
288
    input:
Sander Bollen's avatar
Sander Bollen committed
289
290
        r1 = "{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
291
292
        ref = REFERENCE,
        temp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
293
294
    params:
        rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
Sander Bollen's avatar
Sander Bollen committed
295
    output: temp("{sample}/bams/{sample}.sorted.bam")
296
    singularity: "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0"
Sander Bollen's avatar
Sander Bollen committed
297
    conda: "envs/bwa.yml"
Sander Bollen's avatar
Sander Bollen committed
298
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
299
           "| picard -Xmx4G SortSam CREATE_INDEX=TRUE TMP_DIR={input.temp} "
Sander Bollen's avatar
Sander Bollen committed
300
301
302
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

rule markdup:
Sander Bollen's avatar
Sander Bollen committed
303
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
304
    input:
Sander Bollen's avatar
Sander Bollen committed
305
        bam = "{sample}/bams/{sample}.sorted.bam",
Sander Bollen's avatar
Sander Bollen committed
306
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
307
    output:
Sander Bollen's avatar
Sander Bollen committed
308
309
310
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
Sander Bollen's avatar
Sander Bollen committed
311
    singularity: "docker://quay.io/biocontainers/picard:2.14--py36_0"
Sander Bollen's avatar
Sander Bollen committed
312
    conda: "envs/picard.yml"
313
    shell: "picard -Xmx4G MarkDuplicates CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
314
315
           "INPUT={input.bam} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
316
317
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
318
319
320
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
321
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
322
    output:
Sander Bollen's avatar
Sander Bollen committed
323
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
324
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
325
326
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
327
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
328
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
329
    input:
Sander Bollen's avatar
Sander Bollen committed
330
        bam = "{sample}/bams/{sample}.markdup.bam",
Sander Bollen's avatar
Sander Bollen committed
331
332
333
334
335
336
        gatk = GATK,
        ref = REFERENCE,
        dbsnp = DBSNP,
        one1kg = ONETHOUSAND,
        hapmap = HAPMAP
    output:
Sander Bollen's avatar
Sander Bollen committed
337
        grp = "{sample}/bams/{sample}.baserecal.grp"
Sander Bollen's avatar
Sander Bollen committed
338
    singularity: "docker://quay.io/biocontainers/gatk:3.7--py36_1"
Sander Bollen's avatar
Sander Bollen committed
339
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
340
    shell: "java -XX:ParallelGCThreads=1 -jar {input.gatk} -T "
Sander Bollen's avatar
Sander Bollen committed
341
342
           "BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
Sander Bollen's avatar
Sander Bollen committed
343
344
           "-cov CycleCovariate -cov ContextCovariate -knownSites "
           "{input.dbsnp} -knownSites {input.one1kg} "
Sander Bollen's avatar
Sander Bollen committed
345
346
           "-knownSites {input.hapmap}"

Sander Bollen's avatar
Sander Bollen committed
347
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
348
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
349
    input:
Sander Bollen's avatar
Sander Bollen committed
350
351
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
Sander Bollen's avatar
Sander Bollen committed
352
        dbsnp=DBSNP,
Sander Bollen's avatar
Sander Bollen committed
353
354
        ref=REFERENCE,
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
355
    params:
Sander Bollen's avatar
Sander Bollen committed
356
        chunk="{chunk}"
Sander Bollen's avatar
Sander Bollen committed
357
    output:
Sander Bollen's avatar
Sander Bollen committed
358
359
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.part.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.part.vcf.gz.tbi")
Sander Bollen's avatar
Sander Bollen committed
360
    singularity: "docker://quay.io/biocontainers/gatk:3.7--py36_1"
Sander Bollen's avatar
Sander Bollen committed
361
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
362
363
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 {input.gatk} "
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
364
365
366
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
           "-L '{params.chunk}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
Sander Bollen's avatar
Sander Bollen committed
367
           "-BQSR {input.bqsr}"
Sander Bollen's avatar
Sander Bollen committed
368
369


Sander Bollen's avatar
Sander Bollen committed
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
rule gvcf_chunkfile:
    """
    Create simple text file with paths to chunks for GVCF.
    
    This uses a run directive in stead of a shell directive because
    the amount of chunks may be so large the shell would error out with
    an "argument list too long" error. 
    See https://unix.stackexchange.com/a/120842 for more info
    
    This also means this rule lives outside of singularity/conda and is
    executed in snakemake's own environment. 
    """
    params:
        chunkfiles = expand("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz",
                            chunk=CHUNKS)
    output:
        file = "{sample}/vcf/chunkfile.txt"
    run:
388
        with open(output.file, "w") as ohandle:
Sander Bollen's avatar
Sander Bollen committed
389
            for filename in params.chunkfiles:
390
391
                corrected = filename.format(sample=wildcards.sample)
                ohandle.write(corrected + "\n")
Sander Bollen's avatar
Sander Bollen committed
392

Sander Bollen's avatar
Sander Bollen committed
393
rule gvcf_gather:
Sander Bollen's avatar
Sander Bollen committed
394
    """Gather all GVCF scatters"""
Sander Bollen's avatar
Sander Bollen committed
395
    input:
Sander Bollen's avatar
Sander Bollen committed
396
397
398
        gvcfs = expand("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz",
                       chunk=CHUNKS),
        chunkfile = "{sample}/vcf/chunkfile.txt"
Sander Bollen's avatar
Sander Bollen committed
399
    output:
Sander Bollen's avatar
Sander Bollen committed
400
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz"
Sander Bollen's avatar
Sander Bollen committed
401
    conda: "envs/bcftools.yml"
Sander Bollen's avatar
Sander Bollen committed
402
403
404
405
406
407
408
409
410
411
    singularity: "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4"
    shell: "bcftools concat -f {input.chunkfile} -n > {output.gvcf}"


rule gvcf_gather_tbi:
    """Index GVCF"""
    input:
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz"
    output:
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
412
    conda: "envs/tabix.yml"
Sander Bollen's avatar
Sander Bollen committed
413
414
    singularity: "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0"
    shell: "tabix -pvcf {input.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
415
416
417


rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
418
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
419
    input:
Sander Bollen's avatar
Sander Bollen committed
420
        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz", sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
421
        tbis = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi",
Sander Bollen's avatar
Sander Bollen committed
422
                      sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
423
        ref=REFERENCE,
Sander Bollen's avatar
Sander Bollen committed
424
425
        gatk=GATK
    params:
Sander Bollen's avatar
Sander Bollen committed
426
427
        li=" -V ".join(expand("{sample}/vcf/{sample}.g.vcf.gz",
                              sample=SAMPLES)),
Sander Bollen's avatar
Sander Bollen committed
428
429
        chunk="{chunk}"
    output:
Sander Bollen's avatar
Sander Bollen committed
430
431
        vcf=temp("multisample/genotype.{chunk}.part.vcf.gz"),
        vcf_tbi=temp("multisample/genotype.{chunk}.part.vcf.gz.tbi")
Sander Bollen's avatar
Sander Bollen committed
432
    singularity: "docker://quay.io/biocontainers/gatk:3.7--py36_1"
Sander Bollen's avatar
Sander Bollen committed
433
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
434
435
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 {input.gatk} -T "
           "GenotypeGVCFs -R {input.ref} "
Sander Bollen's avatar
Sander Bollen committed
436
           "-V {params.li} -L '{params.chunk}' -o '{output.vcf}'"
Sander Bollen's avatar
Sander Bollen committed
437
438


439
440
441
442
443
444
445
446
447
448
449
450
451
rule genotype_chunkfile:
    """
    Create simple text file with paths to chunks for genotyping
    
    This uses a run directive in stead of a shell directive because
    the amount of chunks may be so large the shell would error out with
    an "argument list too long" error. 
    See https://unix.stackexchange.com/a/120842 for more info
    
    This also means this rule lives outside of singularity/conda and is
    executed in snakemake's own environment. 
    """
    params:
Sander Bollen's avatar
Sander Bollen committed
452
453
        vcfs = expand("multisample/genotype.{chunk}.part.vcf.gz",
                      chunk=CHUNKS)
454
455
456
457
458
459
460
461
    output:
        file = "multisample/chunkfile.txt"
    run:
        with open(output.file, "w") as ohandle:
            for filename in params.vcfs:
                ohandle.write(filename + "\n")


Sander Bollen's avatar
Sander Bollen committed
462
rule genotype_gather:
463
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
464
    input:
465
466
467
        vcfs = expand("multisample/genotype.{chunk}.part.vcf.gz",
                      chunk=CHUNKS),
        chunkfile = "multisample/chunkfile.txt"
Sander Bollen's avatar
Sander Bollen committed
468
    output:
469
        vcf = "multisample/genotyped.vcf.gz"
Sander Bollen's avatar
Sander Bollen committed
470
    conda: "envs/bcftools.yml"
471
472
473
474
475
476
477
478
479
    singularity: "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4"
    shell: "bcftools concat -f {input.chunkfile} -n > {output.vcf}"


rule genotype_gather_tbi:
    """Index genotyped vcf file"""
    input:
        vcf = "multisample/genotyped.vcf.gz"
    output:
Sander Bollen's avatar
Sander Bollen committed
480
        tbi = "multisample/genotyped.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
481
    conda: "envs/tabix.yml"
482
483
    singularity: "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0"
    shell: "tabix -pvcf {input.vcf}"
Sander Bollen's avatar
Sander Bollen committed
484
485


486
487
488
rule split_vcf:
    """Split multisample VCF in single samples"""
    input:
Sander Bollen's avatar
Sander Bollen committed
489
        vcf="multisample/genotyped.vcf.gz",
490
        tbi = "multisample/genotyped.vcf.gz.tbi",
491
492
493
494
495
        gatk=GATK,
        ref=REFERENCE
    params:
        s="{sample}"
    output:
Sander Bollen's avatar
Sander Bollen committed
496
        splitted="{sample}/vcf/{sample}_single.vcf.gz"
Sander Bollen's avatar
Sander Bollen committed
497
    singularity: "docker://quay.io/biocontainers/gatk:3.7--py36_1"
498
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
499
500
501
    shell: "java -Xmx15G -XX:ParallelGCThreads=1 -jar {input.gatk} "
           "-T SelectVariants -sn {params.s} -env -R {input.ref} -V "
           "{input.vcf} -o {output.splitted}"
502
503


Sander Bollen's avatar
Sander Bollen committed
504
505
506
## bam metrics

rule mapped_num:
Sander Bollen's avatar
Sander Bollen committed
507
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
508
    input:
Sander Bollen's avatar
Sander Bollen committed
509
        bam="{sample}/bams/{sample}.sorted.bam"
Sander Bollen's avatar
Sander Bollen committed
510
    output:
Sander Bollen's avatar
Sander Bollen committed
511
        num="{sample}/bams/{sample}.mapped.num"
512
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
Sander Bollen's avatar
Sander Bollen committed
513
514
515
516
517
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 {input.bam} | wc -l > {output.num}"


rule mapped_basenum:
Sander Bollen's avatar
Sander Bollen committed
518
    """Calculate number of mapped bases"""
Sander Bollen's avatar
Sander Bollen committed
519
    input:
Sander Bollen's avatar
Sander Bollen committed
520
        bam="{sample}/bams/{sample}.sorted.bam"
Sander Bollen's avatar
Sander Bollen committed
521
    output:
Sander Bollen's avatar
Sander Bollen committed
522
        num="{sample}/bams/{sample}.mapped.basenum"
523
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
Sander Bollen's avatar
Sander Bollen committed
524
    conda: "envs/samtools.yml"
Sander Bollen's avatar
Sander Bollen committed
525
    shell: "samtools view -F 4 {input.bam} | cut -f10 | wc -c > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
526
527
528


rule unique_num:
Sander Bollen's avatar
Sander Bollen committed
529
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
530
    input:
Sander Bollen's avatar
Sander Bollen committed
531
        bam="{sample}/bams/{sample}.markdup.bam"
Sander Bollen's avatar
Sander Bollen committed
532
    output:
Sander Bollen's avatar
Sander Bollen committed
533
        num="{sample}/bams/{sample}.unique.num"
534
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
Sander Bollen's avatar
Sander Bollen committed
535
    conda: "envs/samtools.yml"
536
    shell: "samtools view -F 4 -F 1024 {input.bam} | wc -l > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
537
538
539


rule usable_basenum:
Sander Bollen's avatar
Sander Bollen committed
540
    """Calculate number of bases on unique reads"""
Sander Bollen's avatar
Sander Bollen committed
541
    input:
Sander Bollen's avatar
Sander Bollen committed
542
        bam="{sample}/bams/{sample}.markdup.bam"
Sander Bollen's avatar
Sander Bollen committed
543
    output:
Sander Bollen's avatar
Sander Bollen committed
544
        num="{sample}/bams/{sample}.usable.basenum"
545
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
Sander Bollen's avatar
Sander Bollen committed
546
    conda: "envs/samtools.yml"
Sander Bollen's avatar
Sander Bollen committed
547
548
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f10 | wc -c > "
           "{output.num}"
Sander Bollen's avatar
Sander Bollen committed
549
550
551
552


## fastqc

Sander Bollen's avatar
Sander Bollen committed
553
rule fastqc_raw:
554
555
556
557
558
    """
    Run fastqc on raw fastq files
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to 
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
559
    input:
Sander Bollen's avatar
Sander Bollen committed
560
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
561
562
        r2=get_r2
    params:
Sander Bollen's avatar
Sander Bollen committed
563
        odir="{sample}/pre_process/raw_fastqc"
Sander Bollen's avatar
Sander Bollen committed
564
    output:
Sander Bollen's avatar
Sander Bollen committed
565
        aux="{sample}/pre_process/raw_fastqc/.done.txt"
566
    singularity: "docker://quay.io/biocontainers/fastqc:0.11.7--4"
567
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
568
569
    shell: "fastqc --nogroup -o {params.odir} {input.r1} {input.r2} "
           "&& echo 'done' > {output.aux}"
Sander Bollen's avatar
Sander Bollen committed
570
571


Sander Bollen's avatar
Sander Bollen committed
572
rule fastqc_merged:
573
574
575
576
577
    """
    Run fastqc on merged fastq files    
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to 
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
578
    input:
Sander Bollen's avatar
Sander Bollen committed
579
580
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
581
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
582
    params:
Sander Bollen's avatar
Sander Bollen committed
583
        odir="{sample}/pre_process/merged_fastqc"
Sander Bollen's avatar
Sander Bollen committed
584
    output:
Sander Bollen's avatar
Sander Bollen committed
585
586
        r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip"
587
    singularity: "docker://quay.io/biocontainers/fastqc:0.11.7--4"
588
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
589
590
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
591
592


Sander Bollen's avatar
Sander Bollen committed
593
rule fastqc_postqc:
594
595
596
597
598
    """
    Run fastqc on fastq files post pre-processing
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to 
    perl missing in the container of 0.11.5     
    """
Sander Bollen's avatar
Sander Bollen committed
599
    input:
Sander Bollen's avatar
Sander Bollen committed
600
601
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
602
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
603
    params:
Sander Bollen's avatar
Sander Bollen committed
604
        odir="{sample}/pre_process/postqc_fastqc"
Sander Bollen's avatar
Sander Bollen committed
605
    output:
Sander Bollen's avatar
Sander Bollen committed
606
607
        r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip"
608
    singularity: "docker://quay.io/biocontainers/fastqc:0.11.7--4"
609
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
610
611
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
612
613
614
615
616


## fastq-count

rule fqcount_preqc:
Sander Bollen's avatar
Sander Bollen committed
617
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
618
    input:
Sander Bollen's avatar
Sander Bollen committed
619
620
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
621
    output:
Sander Bollen's avatar
Sander Bollen committed
622
        "{sample}/pre_process/{sample}.preqc_count.json"
Sander Bollen's avatar
Sander Bollen committed
623
    singularity: "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0"
Sander Bollen's avatar
Sander Bollen committed
624
625
    conda: "envs/fastq-count.yml"
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
626
627
628


rule fqcount_postqc:
Sander Bollen's avatar
Sander Bollen committed
629
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
630
    input:
Sander Bollen's avatar
Sander Bollen committed
631
632
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
633
    output:
Sander Bollen's avatar
Sander Bollen committed
634
        "{sample}/pre_process/{sample}.postqc_count.json"
Sander Bollen's avatar
Sander Bollen committed
635
    singularity: "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0"
Sander Bollen's avatar
Sander Bollen committed
636
637
    conda: "envs/fastq-count.yml"
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
638
639


Sander Bollen's avatar
Sander Bollen committed
640
641
642
643
# fastqc stats
rule fastqc_stats:
    """Collect fastq stats for a sample in json format"""
    input:
Sander Bollen's avatar
Sander Bollen committed
644
645
646
647
        preqc_r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        preqc_r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip",
        postqc_r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        postqc_r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip",
Sander Bollen's avatar
Sander Bollen committed
648
        sc=fqpy
Sander Bollen's avatar
Sander Bollen committed
649
    singularity: "docker://python:3.6-slim"
650
    conda: "envs/collectstats.yml"
Sander Bollen's avatar
Sander Bollen committed
651
    output:
Sander Bollen's avatar
Sander Bollen committed
652
        "{sample}/pre_process/fastq_stats.json"
653
654
655
    shell: "python {input.sc} --preqc-r1 {input.preqc_r1} "
           "--preqc-r2 {input.preqc_r2} "
           "--postqc-r1 {input.postqc_r1} "
Sander Bollen's avatar
Sander Bollen committed
656
           "--postqc-r2 {input.postqc_r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
657

Sander Bollen's avatar
Sander Bollen committed
658
659
660
## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
661
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
662
    input:
Sander Bollen's avatar
Sander Bollen committed
663
664
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
Sander Bollen's avatar
Sander Bollen committed
665
666
667
668
669
        covpy=covpy,
        bed=get_bedpath
    params:
        subt="Sample {sample}"
    output:
Sander Bollen's avatar
Sander Bollen committed
670
671
        covj="{sample}/coverage/{bed}.covstats.json",
        covp="{sample}/coverage/{bed}.covstats.png"
672
    singularity: "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0"
Sander Bollen's avatar
Sander Bollen committed
673
    conda: "envs/covstat.yml"
Sander Bollen's avatar
Sander Bollen committed
674
675
676
677
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
678
679


Sander Bollen's avatar
Sander Bollen committed
680
681
682
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
683
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
684
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
Sander Bollen's avatar
Sander Bollen committed
685
686
        ref=get_refflatpath
    output:
Sander Bollen's avatar
Sander Bollen committed
687
        tsv="{sample}/coverage/{ref}.coverages.tsv"
Sander Bollen's avatar
Sander Bollen committed
688
    singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
Sander Bollen's avatar
Sander Bollen committed
689
690
691
692
    conda: "envs/vcfstats.yml"
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
693
694
695
## vcfstats

rule vcfstats:
Sander Bollen's avatar
Sander Bollen committed
696
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
697
    input:
Sander Bollen's avatar
Sander Bollen committed
698
699
        vcf="multisample/genotyped.vcf.gz",
        tbi = "multisample/genotyped.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
700
    output:
Sander Bollen's avatar
Sander Bollen committed
701
        stats="multisample/vcfstats.json"
Sander Bollen's avatar
Sander Bollen committed
702
    singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
Sander Bollen's avatar
Sander Bollen committed
703
    conda: "envs/vcfstats.yml"
Sander Bollen's avatar
Sander Bollen committed
704
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
705
706


Sander Bollen's avatar
Sander Bollen committed
707
## collection
708
709
710
711
if len(BASE_BEDS) >= 1:
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
Sander Bollen's avatar
Sander Bollen committed
712
713
714
715
716
717
718
            preqc="{sample}/pre_process/{sample}.preqc_count.json",
            postq="{sample}/pre_process/{sample}.postqc_count.json",
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
719
720
            cov=expand("{{sample}}/coverage/{bed}.covstats.json",
                       bed=BASE_BEDS),
721
722
723
724
725
            colpy=colpy
        params:
            sample_name="{sample}",
            fthresh=FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
726
            "{sample}/{sample}.stats.json"
Sander Bollen's avatar
Sander Bollen committed
727
        singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
728
729
730
731
732
        conda: "envs/collectstats.yml"
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
733
734
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc} {input.cov} > {output}"
735
736
else:
    rule collectstats:
Sander Bollen's avatar
Sander Bollen committed
737
        """Collect all stats for a particular sample without beds"""
Sander Bollen's avatar
Sander Bollen committed
738
        input:
Sander Bollen's avatar
Sander Bollen committed
739
740
741
742
743
744
745
            preqc = "{sample}/pre_process/{sample}.preqc_count.json",
            postq = "{sample}/pre_process/{sample}.postqc_count.json",
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
746
747
748
749
750
            colpy = colpy
        params:
            sample_name = "{sample}",
            fthresh = FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
751
            "{sample}/{sample}.stats.json"
Sander Bollen's avatar
Sander Bollen committed
752
        singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
Sander Bollen's avatar
Sander Bollen committed
753
754
755
756
757
        conda: "envs/collectstats.yml"
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
758
759
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc}  > {output}"
Sander Bollen's avatar
Sander Bollen committed
760
761

rule merge_stats:
Sander Bollen's avatar
Sander Bollen committed
762
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
763
    input:
Sander Bollen's avatar
Sander Bollen committed
764
765
        cols=expand("{sample}/{sample}.stats.json", sample=SAMPLES),
        vstat="multisample/vcfstats.json",
Sander Bollen's avatar
Sander Bollen committed
766
767
        mpy=mpy
    output:
Sander Bollen's avatar
Sander Bollen committed
768
        stats="stats.json"
769
    singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
Sander Bollen's avatar
Sander Bollen committed
770
    conda: "envs/collectstats.yml"
Sander Bollen's avatar
Sander Bollen committed
771
772
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
           "> {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
773
774


Sander Bollen's avatar
Sander Bollen committed
775
776
777
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
Sander Bollen's avatar
Sander Bollen committed
778
        stats="stats.json",
Sander Bollen's avatar
Sander Bollen committed
779
780
        sc=tsvpy
    output:
Sander Bollen's avatar
Sander Bollen committed
781
        stats="stats.tsv"
782
    singularity: "docker://python:3.6-slim"
Sander Bollen's avatar
Sander Bollen committed
783
784
785
786
    conda: "envs/collectstats.yml"
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
787
788
789
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
790
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
791
792
    """
    input:
Sander Bollen's avatar
Sander Bollen committed
793
        stats="stats.tsv"
Sander Bollen's avatar
Sander Bollen committed
794
    params:
Sander Bollen's avatar
Sander Bollen committed
795
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
796
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
797
    output:
Sander Bollen's avatar
Sander Bollen committed
798
        report="multiqc_report/multiqc_report.html"
Sander Bollen's avatar
Sander Bollen committed
799
    singularity: "docker://quay.io/biocontainers/multiqc:1.5--py36_0"
Sander Bollen's avatar
Sander Bollen committed
800
    conda: "envs/multiqc.yml"
801
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output.report}"