Snakefile 27 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
Main Snakefile for the pipeline. 

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
Sander Bollen's avatar
Sander Bollen committed
25
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
26
from os.path import join, basename
27
from pathlib import Path
Sander Bollen's avatar
Sander Bollen committed
28
29

from pyfaidx import Fasta
Sander Bollen's avatar
Sander Bollen committed
30
31

REFERENCE = config.get("REFERENCE")
32
33
if REFERENCE is None:
    raise ValueError("You must set --config REFERENCE=<path>")
Sander Bollen's avatar
Sander Bollen committed
34
if not Path(REFERENCE).exists():
Sander Bollen's avatar
Sander Bollen committed
35
36
    raise FileNotFoundError("Reference file {0} "
                            "does not exist.".format(REFERENCE))
37

Sander Bollen's avatar
Sander Bollen committed
38
GATK = config.get("GATK")
39
40
if GATK is None:
    raise ValueError("You must set --config GATK=<path>")
Sander Bollen's avatar
Sander Bollen committed
41
42
if not Path(GATK).exists():
    raise FileNotFoundError("{0} does not exist.".format(GATK))
43

Sander Bollen's avatar
Sander Bollen committed
44
DBSNP = config.get("DBSNP")
45
46
if DBSNP is None:
    raise ValueError("You must set --config DBSNP=<path>")
Sander Bollen's avatar
Sander Bollen committed
47
48
if not Path(DBSNP).exists():
    raise FileNotFoundError("{0} does not exist".format(DBSNP))
49

Sander Bollen's avatar
Sander Bollen committed
50
ONETHOUSAND = config.get("ONETHOUSAND")
51
52
if ONETHOUSAND is None:
    raise ValueError("You must set --config ONETHOUSAND=<path>")
Sander Bollen's avatar
Sander Bollen committed
53
54
if not Path(ONETHOUSAND).exists():
    raise FileNotFoundError("{0} does not exist".format(ONETHOUSAND))
55

Sander Bollen's avatar
Sander Bollen committed
56
HAPMAP = config.get("HAPMAP")
57
if HAPMAP is None:
Sander Bollen's avatar
Sander Bollen committed
58
59
60
    raise ValueError("You must set --config HAPMAP=<path>")
if not Path(HAPMAP).exists():
    raise FileNotFoundError("{0} does not exist".format(HAPMAP))
61
62

# these are all optional
63
64
BED = config.get("BED", "")  # comma-separated list of BED files
REFFLAT = config.get("REFFLAT", "")  # comma-separated list of refFlat files
Sander Bollen's avatar
Sander Bollen committed
65
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
66
FASTQ_COUNT = config.get("FASTQ_COUNT")
Sander Bollen's avatar
Sander Bollen committed
67
MAX_BASES = config.get("MAX_BASES", "")
Sander Bollen's avatar
Sander Bollen committed
68

69
70
71
72
73
def fsrc_dir(*args):
    """Wrapper around snakemake's srcdir to work like os.path.join"""
    if len(args) == 1:
        return srcdir(args[0])
    return srcdir(join(*args))
Sander Bollen's avatar
Sander Bollen committed
74

75
76
77
covpy = fsrc_dir("src", "covstats.py")
colpy = fsrc_dir("src", "collect_stats.py")
mpy = fsrc_dir("src", "merge_stats.py")
Sander Bollen's avatar
Sander Bollen committed
78
seq = fsrc_dir("src", "seqtk.sh")
Sander Bollen's avatar
Sander Bollen committed
79
fqpy = fsrc_dir("src", "fastqc_stats.py")
Sander Bollen's avatar
Sander Bollen committed
80
tsvpy = fsrc_dir("src", "stats_to_tsv.py")
Sander Bollen's avatar
Sander Bollen committed
81
fqsc = fsrc_dir("src", "safe_fastqc.sh")
Sander Bollen's avatar
Sander Bollen committed
82

Sander Bollen's avatar
Sander Bollen committed
83
84
85
86
87
88
89
# sample config parsing
SCONFIG = config.get("SAMPLE_CONFIG")
if SCONFIG is None:
    raise ValueError("You must set --config SAMPLE_CONFIG=<path>")
if not Path(SCONFIG).exists():
    raise FileNotFoundError("{0} does not exist".format(SCONFIG))

Sander Bollen's avatar
Sander Bollen committed
90
91
92
93
with open(config.get("SAMPLE_CONFIG")) as handle:
    SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()

Sander Bollen's avatar
Sander Bollen committed
94
95
96
97
98
99
100
101
102
if BED != "":
    BEDS = BED.split(",")
else:
    BEDS = []

if REFFLAT != "":
    REFFLATS = REFFLAT.split(",")
else:
    REFFLATS = []
Sander Bollen's avatar
Sander Bollen committed
103
104

BASE_BEDS = [basename(x) for x in BEDS]
Sander Bollen's avatar
Sander Bollen committed
105
BASE_REFFLATS = [basename(x) for x in REFFLATS]
Sander Bollen's avatar
Sander Bollen committed
106

Sander Bollen's avatar
Sander Bollen committed
107
108

def split_genome(ref, approx_n_chunks=100):
Sander Bollen's avatar
Sander Bollen committed
109
110
111
112
113
114
115
116
    """
    Split genome in chunks.

    Chunks are strings in the format: `<ctg>:<start>-<end>`
    These follow the region string format as used by htslib,
    which uses _1_-based indexing.
    See: http://www.htslib.org/doc/tabix.html
    """
Sander Bollen's avatar
Sander Bollen committed
117
118
119
120
121
    fa = Fasta(ref)
    tot_size = sum([len(x) for x in fa.records.values()])
    chunk_size = tot_size//approx_n_chunks
    chunks = []
    for chrom_name, chrom_value in fa.records.items():
Sander Bollen's avatar
Sander Bollen committed
122
        pos = 1
Sander Bollen's avatar
Sander Bollen committed
123
124
125
126
127
128
129
130
131
132
133
134
135
        while pos <= len(chrom_value):
            end = pos+chunk_size
            if end <= len(chrom_value):
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, end)
            else:
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, len(chrom_value))
            chunks.append(chunk)
            pos = end
    return chunks

CHUNKS = split_genome(REFERENCE)


Sander Bollen's avatar
Sander Bollen committed
136

Sander Bollen's avatar
Sander Bollen committed
137
138
def get_r(strand, wildcards):
    """Get fastq files on a single strand for a sample"""
Sander Bollen's avatar
Sander Bollen committed
139
    s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
Sander Bollen's avatar
Sander Bollen committed
140
    rs = []
141
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
Sander Bollen committed
142
143
        rs.append(s['libraries'][l][strand])
    return rs
Sander Bollen's avatar
Sander Bollen committed
144

Sander Bollen's avatar
Sander Bollen committed
145
146
get_r1 = partial(get_r, "R1")
get_r2 = partial(get_r, "R2")
Sander Bollen's avatar
Sander Bollen committed
147
148


Sander Bollen's avatar
Sander Bollen committed
149
def get_bedpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
150
151
    """Get absolute path of a bed file"""
    return next(x for x in BEDS if basename(x) == wildcards.bed)
Sander Bollen's avatar
Sander Bollen committed
152
153
154


def get_refflatpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
155
    """Get absolute path of a refflat file"""
Sander Bollen's avatar
Sander Bollen committed
156
    return next(x for x in REFFLATS if basename(x) == wildcards.ref)
Sander Bollen's avatar
Sander Bollen committed
157
158


Sander Bollen's avatar
Sander Bollen committed
159
def sample_gender(wildcards):
Sander Bollen's avatar
Sander Bollen committed
160
    """Get sample gender"""
Sander Bollen's avatar
Sander Bollen committed
161
162
163
164
    sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
    return sam.get("gender", "null")


Sander Bollen's avatar
Sander Bollen committed
165
166
167
168
def metrics(do_metrics=True):
    if not do_metrics:
        return ""

Sander Bollen's avatar
Sander Bollen committed
169
170
171
172
173
174
    fqcr = expand("{sample}/pre_process/raw_fastqc/.done.txt",
                  sample=SAMPLES)
    fqcm = expand("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
                  sample=SAMPLES)
    fqcp = expand("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
                  sample=SAMPLES)
175
    if len(REFFLATS) >= 1:
Sander Bollen's avatar
Sander Bollen committed
176
177
        coverage_stats = expand("{sample}/coverage/{ref}.coverages.tsv",
                                sample=SAMPLES, ref=BASE_REFFLATS)
178
179
    else:
        coverage_stats = []
Sander Bollen's avatar
Sander Bollen committed
180
    stats = "stats.json"
Sander Bollen's avatar
Sander Bollen committed
181
    return  fqcr + fqcm + fqcp + coverage_stats + [stats]
Sander Bollen's avatar
Sander Bollen committed
182
183


Sander Bollen's avatar
Sander Bollen committed
184
185
rule all:
    input:
Sander Bollen's avatar
Sander Bollen committed
186
187
        combined="multisample/genotyped.vcf.gz",
        report="multiqc_report/multiqc_report.html",
Sander Bollen's avatar
Sander Bollen committed
188
189
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai",sample=SAMPLES),
        vcfs=expand("{sample}/vcf/{sample}_single.vcf.gz",sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
190
        stats=metrics()
Sander Bollen's avatar
Sander Bollen committed
191

Sander Bollen's avatar
Sander Bollen committed
192
193
194

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
195
    output: directory("tmp")
196
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
197
198
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
199
rule genome:
Sander Bollen's avatar
Sander Bollen committed
200
    """Create genome file as used by bedtools"""
Sander Bollen's avatar
Sander Bollen committed
201
    input: REFERENCE
Sander Bollen's avatar
Sander Bollen committed
202
    output: "current.genome"
203
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
204
205
206
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
Sander Bollen's avatar
Sander Bollen committed
207
    """Merge all forward fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
208
    input: get_r1
Sander Bollen's avatar
Sander Bollen committed
209
    output: temp("{sample}/pre_process/{sample}.merged_R1.fastq.gz")
210
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
211
212
213
    shell: "cat {input} > {output}"

rule merge_r2:
Sander Bollen's avatar
Sander Bollen committed
214
    """Merge all reverse fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
215
    input: get_r2
Sander Bollen's avatar
Sander Bollen committed
216
    output: temp("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
217
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
218
219
    shell: "cat {input} > {output}"

Sander Bollen's avatar
Sander Bollen committed
220
221

rule seqtk_r1:
Sander Bollen's avatar
Sander Bollen committed
222
    """Either subsample or link forward fastq file"""
Sander Bollen's avatar
Sander Bollen committed
223
    input:
Sander Bollen's avatar
Sander Bollen committed
224
225
        stats="{sample}/pre_process/{sample}.preqc_count.json",
        fastq="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
226
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
227
    params:
Sander Bollen's avatar
Sander Bollen committed
228
        max_bases=str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
229
    output:
Sander Bollen's avatar
Sander Bollen committed
230
        fastq=temp("{sample}/pre_process/{sample}.sampled_R1.fastq.gz")
Sander Bollen's avatar
Sander Bollen committed
231
    conda: "envs/seqtk.yml"
Sander Bollen's avatar
Sander Bollen committed
232
    singularity: "docker://quay.io/biocontainers/mulled-v2-13686261ac0aa5682c680670ff8cda7b09637943:d143450dec169186731bb4df6f045a3c9ee08eb6-0"
Sander Bollen's avatar
Sander Bollen committed
233
234
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
235
236
237


rule seqtk_r2:
Sander Bollen's avatar
Sander Bollen committed
238
    """Either subsample or link reverse fastq file"""
Sander Bollen's avatar
Sander Bollen committed
239
    input:
Sander Bollen's avatar
Sander Bollen committed
240
241
        stats = "{sample}/pre_process/{sample}.preqc_count.json",
        fastq = "{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
242
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
243
    params:
Sander Bollen's avatar
Sander Bollen committed
244
        max_bases =str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
245
    output:
Sander Bollen's avatar
Sander Bollen committed
246
        fastq = temp("{sample}/pre_process/{sample}.sampled_R2.fastq.gz")
Sander Bollen's avatar
Sander Bollen committed
247
    conda: "envs/seqtk.yml"
Sander Bollen's avatar
Sander Bollen committed
248
    singularity: "docker://quay.io/biocontainers/mulled-v2-13686261ac0aa5682c680670ff8cda7b09637943:d143450dec169186731bb4df6f045a3c9ee08eb6-0"
Sander Bollen's avatar
Sander Bollen committed
249
250
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
251
252


253
# contains original merged fastq files as input to prevent them from being prematurely deleted
Sander Bollen's avatar
Sander Bollen committed
254
rule sickle:
Sander Bollen's avatar
Sander Bollen committed
255
    """Trim fastq files"""
Sander Bollen's avatar
Sander Bollen committed
256
    input:
Sander Bollen's avatar
Sander Bollen committed
257
258
259
260
        r1 = "{sample}/pre_process/{sample}.sampled_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}.sampled_R2.fastq.gz",
        rr1 = "{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        rr2 = "{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
261
    output:
Sander Bollen's avatar
Sander Bollen committed
262
263
        r1 = temp("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.trimmed_R2.fastq"),
Sander Bollen's avatar
Sander Bollen committed
264
        s = "{sample}/pre_process/{sample}.trimmed_singles.fastq"
Sander Bollen's avatar
Sander Bollen committed
265
    singularity: "docker://quay.io/biocontainers/sickle-trim:1.33--ha92aebf_4"
Sander Bollen's avatar
Sander Bollen committed
266
    conda: "envs/sickle.yml"
Sander Bollen's avatar
Sander Bollen committed
267
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
268
269
270
           "-p {output.r2} -s {output.s}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
271
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
272
    input:
Sander Bollen's avatar
Sander Bollen committed
273
274
        r1 = "{sample}/pre_process/{sample}.trimmed_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.trimmed_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
275
    output:
Sander Bollen's avatar
Sander Bollen committed
276
277
        r1 = temp("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
Sander Bollen's avatar
Sander Bollen committed
278
    singularity: "docker://quay.io/biocontainers/cutadapt:1.14--py36_0"
Sander Bollen's avatar
Sander Bollen committed
279
    conda: "envs/cutadapt.yml"
Sander Bollen's avatar
Sander Bollen committed
280
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
281
282
283
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
Sander Bollen's avatar
Sander Bollen committed
284
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
285
    input:
Sander Bollen's avatar
Sander Bollen committed
286
287
        r1 = "{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
288
289
290
        ref = REFERENCE
    params:
        rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
Sander Bollen's avatar
Sander Bollen committed
291
    output: temp("{sample}/bams/{sample}.sorted.bam")
292
    singularity: "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0"
Sander Bollen's avatar
Sander Bollen committed
293
    conda: "envs/bwa.yml"
Sander Bollen's avatar
Sander Bollen committed
294
295
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
           "| picard SortSam CREATE_INDEX=TRUE TMP_DIR=null "
Sander Bollen's avatar
Sander Bollen committed
296
297
298
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

rule markdup:
Sander Bollen's avatar
Sander Bollen committed
299
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
300
    input:
Sander Bollen's avatar
Sander Bollen committed
301
        bam = "{sample}/bams/{sample}.sorted.bam",
Sander Bollen's avatar
Sander Bollen committed
302
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
303
    output:
Sander Bollen's avatar
Sander Bollen committed
304
305
306
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
Sander Bollen's avatar
Sander Bollen committed
307
    singularity: "docker://quay.io/biocontainers/picard:2.14--py36_0"
Sander Bollen's avatar
Sander Bollen committed
308
    conda: "envs/picard.yml"
309
    shell: "picard -Xmx4G MarkDuplicates CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
310
311
           "INPUT={input.bam} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
312
313
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
314
315
316
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
317
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
318
    output:
Sander Bollen's avatar
Sander Bollen committed
319
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
320
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
321
322
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
323
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
324
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
325
    input:
Sander Bollen's avatar
Sander Bollen committed
326
        bam = "{sample}/bams/{sample}.markdup.bam",
Sander Bollen's avatar
Sander Bollen committed
327
328
329
330
331
332
        gatk = GATK,
        ref = REFERENCE,
        dbsnp = DBSNP,
        one1kg = ONETHOUSAND,
        hapmap = HAPMAP
    output:
Sander Bollen's avatar
Sander Bollen committed
333
        grp = "{sample}/bams/{sample}.baserecal.grp"
Sander Bollen's avatar
Sander Bollen committed
334
    singularity: "docker://quay.io/biocontainers/gatk:3.7--py36_1"
Sander Bollen's avatar
Sander Bollen committed
335
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
336
    shell: "java -XX:ParallelGCThreads=1 -jar {input.gatk} -T "
Sander Bollen's avatar
Sander Bollen committed
337
338
           "BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
Sander Bollen's avatar
Sander Bollen committed
339
340
           "-cov CycleCovariate -cov ContextCovariate -knownSites "
           "{input.dbsnp} -knownSites {input.one1kg} "
Sander Bollen's avatar
Sander Bollen committed
341
342
           "-knownSites {input.hapmap}"

Sander Bollen's avatar
Sander Bollen committed
343
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
344
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
345
    input:
Sander Bollen's avatar
Sander Bollen committed
346
347
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
Sander Bollen's avatar
Sander Bollen committed
348
        dbsnp=DBSNP,
Sander Bollen's avatar
Sander Bollen committed
349
350
        ref=REFERENCE,
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
351
    params:
Sander Bollen's avatar
Sander Bollen committed
352
        chunk="{chunk}"
Sander Bollen's avatar
Sander Bollen committed
353
    output:
Sander Bollen's avatar
Sander Bollen committed
354
355
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.part.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.part.vcf.gz.tbi")
Sander Bollen's avatar
Sander Bollen committed
356
    singularity: "docker://quay.io/biocontainers/gatk:3.7--py36_1"
Sander Bollen's avatar
Sander Bollen committed
357
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
358
359
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 {input.gatk} "
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
360
361
362
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
           "-L '{params.chunk}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
Sander Bollen's avatar
Sander Bollen committed
363
           "-BQSR {input.bqsr}"
Sander Bollen's avatar
Sander Bollen committed
364
365


Sander Bollen's avatar
Sander Bollen committed
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
rule gvcf_chunkfile:
    """
    Create simple text file with paths to chunks for GVCF.
    
    This uses a run directive in stead of a shell directive because
    the amount of chunks may be so large the shell would error out with
    an "argument list too long" error. 
    See https://unix.stackexchange.com/a/120842 for more info
    
    This also means this rule lives outside of singularity/conda and is
    executed in snakemake's own environment. 
    """
    params:
        chunkfiles = expand("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz",
                            chunk=CHUNKS)
    output:
        file = "{sample}/vcf/chunkfile.txt"
    run:
        with open(output.file) as ohandle:
            for filename in params.chunkfiles:
                ohandle.write(filename + "\n")

Sander Bollen's avatar
Sander Bollen committed
388
rule gvcf_gather:
Sander Bollen's avatar
Sander Bollen committed
389
    """Gather all GVCF scatters"""
Sander Bollen's avatar
Sander Bollen committed
390
    input:
Sander Bollen's avatar
Sander Bollen committed
391
392
393
        gvcfs = expand("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz",
                       chunk=CHUNKS),
        chunkfile = "{sample}/vcf/chunkfile.txt"
Sander Bollen's avatar
Sander Bollen committed
394
    output:
Sander Bollen's avatar
Sander Bollen committed
395
396
397
398
399
400
401
402
403
404
405
406
407
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz"
    singularity: "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4"
    shell: "bcftools concat -f {input.chunkfile} -n > {output.gvcf}"


rule gvcf_gather_tbi:
    """Index GVCF"""
    input:
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz"
    output:
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
    singularity: "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0"
    shell: "tabix -pvcf {input.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
408
409
410


rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
411
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
412
    input:
Sander Bollen's avatar
Sander Bollen committed
413
        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz", sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
414
415
        tbis = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi",
                      samples=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
416
        ref=REFERENCE,
Sander Bollen's avatar
Sander Bollen committed
417
418
        gatk=GATK
    params:
Sander Bollen's avatar
Sander Bollen committed
419
420
        li=" -V ".join(expand("{sample}/vcf/{sample}.g.vcf.gz",
                              sample=SAMPLES)),
Sander Bollen's avatar
Sander Bollen committed
421
422
        chunk="{chunk}"
    output:
Sander Bollen's avatar
Sander Bollen committed
423
424
        vcf=temp("multisample/genotype.{chunk}.part.vcf.gz"),
        vcf_tbi=temp("multisample/genotype.{chunk}.part.vcf.gz.tbi")
Sander Bollen's avatar
Sander Bollen committed
425
    singularity: "docker://quay.io/biocontainers/gatk:3.7--py36_1"
Sander Bollen's avatar
Sander Bollen committed
426
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
427
428
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 {input.gatk} -T "
           "GenotypeGVCFs -R {input.ref} "
Sander Bollen's avatar
Sander Bollen committed
429
           "-V {params.li} -L '{params.chunk}' -o '{output.vcf}'"
Sander Bollen's avatar
Sander Bollen committed
430
431
432


rule genotype_gather:
Sander Bollen's avatar
Sander Bollen committed
433
    """Gather all genotyping scatters"""
Sander Bollen's avatar
Sander Bollen committed
434
    input:
Sander Bollen's avatar
Sander Bollen committed
435
        vcfs=expand("multisample/genotype.{chunk}.part.vcf.gz", chunk=CHUNKS),
Sander Bollen's avatar
Sander Bollen committed
436
437
        tbis=expand("multisample/genotype.{chunk}.part.vcf.gz.tbi",
                    chunk=CHUNKS),
Sander Bollen's avatar
Sander Bollen committed
438
439
        ref=REFERENCE,
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
440
    params:
Sander Bollen's avatar
Sander Bollen committed
441
442
        vcfs="' -V '".join(expand("multisample/genotype.{chunk}.part.vcf.gz",
                                  chunk=CHUNKS))
Sander Bollen's avatar
Sander Bollen committed
443
    output:
Sander Bollen's avatar
Sander Bollen committed
444
        combined="multisample/genotyped.vcf.gz"
Sander Bollen's avatar
Sander Bollen committed
445
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
446
    singularity: "docker://quay.io/biocontainers/gatk:3.7--py36_1"
Sander Bollen's avatar
Sander Bollen committed
447
448
    shell: "java -Xmx4G -XX:ParallelGCThreads=1 -cp {input.gatk} "
           "org.broadinstitute.gatk.tools.CatVariants "
Sander Bollen's avatar
Sander Bollen committed
449
           "-R {input.ref} -V '{params.vcfs}' -out {output.combined} "
Sander Bollen's avatar
Sander Bollen committed
450
           "-assumeSorted"
Sander Bollen's avatar
Sander Bollen committed
451
452


453
454
455
rule split_vcf:
    """Split multisample VCF in single samples"""
    input:
Sander Bollen's avatar
Sander Bollen committed
456
        vcf="multisample/genotyped.vcf.gz",
457
458
459
460
461
        gatk=GATK,
        ref=REFERENCE
    params:
        s="{sample}"
    output:
Sander Bollen's avatar
Sander Bollen committed
462
        splitted="{sample}/vcf/{sample}_single.vcf.gz"
Sander Bollen's avatar
Sander Bollen committed
463
    singularity: "docker://quay.io/biocontainers/gatk:3.7--py36_1"
464
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
465
466
467
    shell: "java -Xmx15G -XX:ParallelGCThreads=1 -jar {input.gatk} "
           "-T SelectVariants -sn {params.s} -env -R {input.ref} -V "
           "{input.vcf} -o {output.splitted}"
468
469


Sander Bollen's avatar
Sander Bollen committed
470
471
472
## bam metrics

rule mapped_num:
Sander Bollen's avatar
Sander Bollen committed
473
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
474
    input:
Sander Bollen's avatar
Sander Bollen committed
475
        bam="{sample}/bams/{sample}.sorted.bam"
Sander Bollen's avatar
Sander Bollen committed
476
    output:
Sander Bollen's avatar
Sander Bollen committed
477
        num="{sample}/bams/{sample}.mapped.num"
478
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
Sander Bollen's avatar
Sander Bollen committed
479
480
481
482
483
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 {input.bam} | wc -l > {output.num}"


rule mapped_basenum:
Sander Bollen's avatar
Sander Bollen committed
484
    """Calculate number of mapped bases"""
Sander Bollen's avatar
Sander Bollen committed
485
    input:
Sander Bollen's avatar
Sander Bollen committed
486
        bam="{sample}/bams/{sample}.sorted.bam"
Sander Bollen's avatar
Sander Bollen committed
487
    output:
Sander Bollen's avatar
Sander Bollen committed
488
        num="{sample}/bams/{sample}.mapped.basenum"
489
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
Sander Bollen's avatar
Sander Bollen committed
490
    conda: "envs/samtools.yml"
Sander Bollen's avatar
Sander Bollen committed
491
    shell: "samtools view -F 4 {input.bam} | cut -f10 | wc -c > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
492
493
494


rule unique_num:
Sander Bollen's avatar
Sander Bollen committed
495
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
496
    input:
Sander Bollen's avatar
Sander Bollen committed
497
        bam="{sample}/bams/{sample}.markdup.bam"
Sander Bollen's avatar
Sander Bollen committed
498
    output:
Sander Bollen's avatar
Sander Bollen committed
499
        num="{sample}/bams/{sample}.unique.num"
500
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
Sander Bollen's avatar
Sander Bollen committed
501
    conda: "envs/samtools.yml"
502
    shell: "samtools view -F 4 -F 1024 {input.bam} | wc -l > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
503
504
505


rule usable_basenum:
Sander Bollen's avatar
Sander Bollen committed
506
    """Calculate number of bases on unique reads"""
Sander Bollen's avatar
Sander Bollen committed
507
    input:
Sander Bollen's avatar
Sander Bollen committed
508
        bam="{sample}/bams/{sample}.markdup.bam"
Sander Bollen's avatar
Sander Bollen committed
509
    output:
Sander Bollen's avatar
Sander Bollen committed
510
        num="{sample}/bams/{sample}.usable.basenum"
511
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
Sander Bollen's avatar
Sander Bollen committed
512
    conda: "envs/samtools.yml"
Sander Bollen's avatar
Sander Bollen committed
513
514
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f10 | wc -c > "
           "{output.num}"
Sander Bollen's avatar
Sander Bollen committed
515
516
517
518


## fastqc

Sander Bollen's avatar
Sander Bollen committed
519
rule fastqc_raw:
520
521
522
523
524
    """
    Run fastqc on raw fastq files
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to 
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
525
    input:
Sander Bollen's avatar
Sander Bollen committed
526
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
527
528
        r2=get_r2
    params:
Sander Bollen's avatar
Sander Bollen committed
529
        odir="{sample}/pre_process/raw_fastqc"
Sander Bollen's avatar
Sander Bollen committed
530
    output:
Sander Bollen's avatar
Sander Bollen committed
531
        aux="{sample}/pre_process/raw_fastqc/.done.txt"
532
    singularity: "docker://quay.io/biocontainers/fastqc:0.11.7--4"
533
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
534
535
    shell: "fastqc --nogroup -o {params.odir} {input.r1} {input.r2} "
           "&& echo 'done' > {output.aux}"
Sander Bollen's avatar
Sander Bollen committed
536
537


Sander Bollen's avatar
Sander Bollen committed
538
rule fastqc_merged:
539
540
541
542
543
    """
    Run fastqc on merged fastq files    
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to 
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
544
    input:
Sander Bollen's avatar
Sander Bollen committed
545
546
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
547
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
548
    params:
Sander Bollen's avatar
Sander Bollen committed
549
        odir="{sample}/pre_process/merged_fastqc"
Sander Bollen's avatar
Sander Bollen committed
550
    output:
Sander Bollen's avatar
Sander Bollen committed
551
552
        r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip"
553
    singularity: "docker://quay.io/biocontainers/fastqc:0.11.7--4"
554
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
555
556
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
557
558


Sander Bollen's avatar
Sander Bollen committed
559
rule fastqc_postqc:
560
561
562
563
564
    """
    Run fastqc on fastq files post pre-processing
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to 
    perl missing in the container of 0.11.5     
    """
Sander Bollen's avatar
Sander Bollen committed
565
    input:
Sander Bollen's avatar
Sander Bollen committed
566
567
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
568
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
569
    params:
Sander Bollen's avatar
Sander Bollen committed
570
        odir="{sample}/pre_process/postqc_fastqc"
Sander Bollen's avatar
Sander Bollen committed
571
    output:
Sander Bollen's avatar
Sander Bollen committed
572
573
        r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip"
574
    singularity: "docker://quay.io/biocontainers/fastqc:0.11.7--4"
575
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
576
577
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
578
579
580
581
582


## fastq-count

rule fqcount_preqc:
Sander Bollen's avatar
Sander Bollen committed
583
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
584
    input:
Sander Bollen's avatar
Sander Bollen committed
585
586
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
587
    output:
Sander Bollen's avatar
Sander Bollen committed
588
        "{sample}/pre_process/{sample}.preqc_count.json"
Sander Bollen's avatar
Sander Bollen committed
589
    singularity: "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0"
Sander Bollen's avatar
Sander Bollen committed
590
591
    conda: "envs/fastq-count.yml"
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
592
593
594


rule fqcount_postqc:
Sander Bollen's avatar
Sander Bollen committed
595
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
596
    input:
Sander Bollen's avatar
Sander Bollen committed
597
598
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
599
    output:
Sander Bollen's avatar
Sander Bollen committed
600
        "{sample}/pre_process/{sample}.postqc_count.json"
Sander Bollen's avatar
Sander Bollen committed
601
    singularity: "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0"
Sander Bollen's avatar
Sander Bollen committed
602
603
    conda: "envs/fastq-count.yml"
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
604
605


Sander Bollen's avatar
Sander Bollen committed
606
607
608
609
# fastqc stats
rule fastqc_stats:
    """Collect fastq stats for a sample in json format"""
    input:
Sander Bollen's avatar
Sander Bollen committed
610
611
612
613
        preqc_r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        preqc_r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip",
        postqc_r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        postqc_r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip",
Sander Bollen's avatar
Sander Bollen committed
614
        sc=fqpy
Sander Bollen's avatar
Sander Bollen committed
615
    singularity: "docker://python:3.6-slim"
616
    conda: "envs/collectstats.yml"
Sander Bollen's avatar
Sander Bollen committed
617
    output:
Sander Bollen's avatar
Sander Bollen committed
618
        "{sample}/pre_process/fastq_stats.json"
619
620
621
    shell: "python {input.sc} --preqc-r1 {input.preqc_r1} "
           "--preqc-r2 {input.preqc_r2} "
           "--postqc-r1 {input.postqc_r1} "
Sander Bollen's avatar
Sander Bollen committed
622
           "--postqc-r2 {input.postqc_r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
623

Sander Bollen's avatar
Sander Bollen committed
624
625
626
## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
627
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
628
    input:
Sander Bollen's avatar
Sander Bollen committed
629
630
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
Sander Bollen's avatar
Sander Bollen committed
631
632
633
634
635
        covpy=covpy,
        bed=get_bedpath
    params:
        subt="Sample {sample}"
    output:
Sander Bollen's avatar
Sander Bollen committed
636
637
        covj="{sample}/coverage/{bed}.covstats.json",
        covp="{sample}/coverage/{bed}.covstats.png"
638
    singularity: "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0"
Sander Bollen's avatar
Sander Bollen committed
639
    conda: "envs/covstat.yml"
Sander Bollen's avatar
Sander Bollen committed
640
641
642
643
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
644
645


Sander Bollen's avatar
Sander Bollen committed
646
647
648
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
649
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
650
651
        ref=get_refflatpath
    output:
Sander Bollen's avatar
Sander Bollen committed
652
        tsv="{sample}/coverage/{ref}.coverages.tsv"
Sander Bollen's avatar
Sander Bollen committed
653
    singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
Sander Bollen's avatar
Sander Bollen committed
654
655
656
657
    conda: "envs/vcfstats.yml"
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
658
659
660
## vcfstats

rule vcfstats:
Sander Bollen's avatar
Sander Bollen committed
661
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
662
    input:
Sander Bollen's avatar
Sander Bollen committed
663
        vcf="multisample/genotyped.vcf.gz"
Sander Bollen's avatar
Sander Bollen committed
664
    output:
Sander Bollen's avatar
Sander Bollen committed
665
        stats="multisample/vcfstats.json"
Sander Bollen's avatar
Sander Bollen committed
666
    singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
Sander Bollen's avatar
Sander Bollen committed
667
    conda: "envs/vcfstats.yml"
Sander Bollen's avatar
Sander Bollen committed
668
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
669
670


Sander Bollen's avatar
Sander Bollen committed
671
## collection
672
673
674
675
if len(BASE_BEDS) >= 1:
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
Sander Bollen's avatar
Sander Bollen committed
676
677
678
679
680
681
682
            preqc="{sample}/pre_process/{sample}.preqc_count.json",
            postq="{sample}/pre_process/{sample}.postqc_count.json",
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
683
684
            cov=expand("{{sample}}/coverage/{bed}.covstats.json",
                       bed=BASE_BEDS),
685
686
687
688
689
            colpy=colpy
        params:
            sample_name="{sample}",
            fthresh=FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
690
            "{sample}/{sample}.stats.json"
Sander Bollen's avatar
Sander Bollen committed
691
        singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
692
693
694
695
696
        conda: "envs/collectstats.yml"
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
697
698
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc} {input.cov} > {output}"
699
700
else:
    rule collectstats:
Sander Bollen's avatar
Sander Bollen committed
701
        """Collect all stats for a particular sample without beds"""
Sander Bollen's avatar
Sander Bollen committed
702
        input:
Sander Bollen's avatar
Sander Bollen committed
703
704
705
706
707
708
709
            preqc = "{sample}/pre_process/{sample}.preqc_count.json",
            postq = "{sample}/pre_process/{sample}.postqc_count.json",
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
710
711
712
713
714
            colpy = colpy
        params:
            sample_name = "{sample}",
            fthresh = FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
715
            "{sample}/{sample}.stats.json"
Sander Bollen's avatar
Sander Bollen committed
716
        singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
Sander Bollen's avatar
Sander Bollen committed
717
718
719
720
721
        conda: "envs/collectstats.yml"
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
722
723
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc}  > {output}"
Sander Bollen's avatar
Sander Bollen committed
724
725

rule merge_stats:
Sander Bollen's avatar
Sander Bollen committed
726
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
727
    input:
Sander Bollen's avatar
Sander Bollen committed
728
729
        cols=expand("{sample}/{sample}.stats.json", sample=SAMPLES),
        vstat="multisample/vcfstats.json",
Sander Bollen's avatar
Sander Bollen committed
730
731
        mpy=mpy
    output:
Sander Bollen's avatar
Sander Bollen committed
732
        stats="stats.json"
733
    singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
Sander Bollen's avatar
Sander Bollen committed
734
    conda: "envs/collectstats.yml"
Sander Bollen's avatar
Sander Bollen committed
735
736
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
           "> {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
737
738


Sander Bollen's avatar
Sander Bollen committed
739
740
741
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
Sander Bollen's avatar
Sander Bollen committed
742
        stats="stats.json",
Sander Bollen's avatar
Sander Bollen committed
743
744
        sc=tsvpy
    output:
Sander Bollen's avatar
Sander Bollen committed
745
        stats="stats.tsv"
746
    singularity: "docker://python:3.6-slim"
Sander Bollen's avatar
Sander Bollen committed
747
748
749
750
    conda: "envs/collectstats.yml"
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
751
752
753
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
754
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
755
756
    """
    input:
Sander Bollen's avatar
Sander Bollen committed
757
        stats="stats.tsv"
Sander Bollen's avatar
Sander Bollen committed
758
    params:
Sander Bollen's avatar
Sander Bollen committed
759
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
760
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
761
    output:
Sander Bollen's avatar
Sander Bollen committed
762
        report="multiqc_report/multiqc_report.html"
Sander Bollen's avatar
Sander Bollen committed
763
    singularity: "docker://quay.io/biocontainers/multiqc:1.5--py36_0"
Sander Bollen's avatar
Sander Bollen committed
764
    conda: "envs/multiqc.yml"
765
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output.report}"