Snakefile 26.9 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
Sander Bollen's avatar
Sander Bollen committed
25
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
26
from os.path import join, basename
27
from pathlib import Path
28
import itertools
Sander Bollen's avatar
Sander Bollen committed
29
30

from pyfaidx import Fasta
Sander Bollen's avatar
Sander Bollen committed
31
32

REFERENCE = config.get("REFERENCE")
33
34
if REFERENCE is None:
    raise ValueError("You must set --config REFERENCE=<path>")
Sander Bollen's avatar
Sander Bollen committed
35
if not Path(REFERENCE).exists():
Sander Bollen's avatar
Sander Bollen committed
36
37
    raise FileNotFoundError("Reference file {0} "
                            "does not exist.".format(REFERENCE))
Sander Bollen's avatar
Sander Bollen committed
38
DBSNP = config.get("DBSNP")
39
40
if DBSNP is None:
    raise ValueError("You must set --config DBSNP=<path>")
Sander Bollen's avatar
Sander Bollen committed
41
42
if not Path(DBSNP).exists():
    raise FileNotFoundError("{0} does not exist".format(DBSNP))
43

44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
SCONFIG = config.get("SAMPLE_CONFIG")
if SCONFIG is None:
    raise ValueError("You must set --config SAMPLE_CONFIG=<path>")
if not Path(SCONFIG).exists():
    raise FileNotFoundError("{0} does not exist".format(SCONFIG))

KNOWN_SITES = config.get("KNOWN_SITES")
if KNOWN_SITES is None:
    raise ValueError("You must set --config KNOWN_SITES=<path>")

# The user can specify multiple known sites
KNOWN_SITES = KNOWN_SITES.split(',')
for filename in KNOWN_SITES:
    if not Path(filename).exists():
        raise FileNotFoundError("{0} does not exist".format(filename))

60
61

# these are all optional
62
63
BED = config.get("BED", "")  # comma-separated list of BED files
REFFLAT = config.get("REFFLAT", "")  # comma-separated list of refFlat files
Sander Bollen's avatar
Sander Bollen committed
64
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
65
FASTQ_COUNT = config.get("FASTQ_COUNT")
Sander Bollen's avatar
Sander Bollen committed
66
MAX_BASES = config.get("MAX_BASES", "")
Sander Bollen's avatar
Sander Bollen committed
67

68
# Generate the input string for basrecalibration
69
70
71
BSQR_known_sites = ''
for argument, filename in zip(itertools.repeat('-knownSites'), KNOWN_SITES):
    BSQR_known_sites +=' {} {}'.format(argument, filename)
72

73
74
75
76
77
def fsrc_dir(*args):
    """Wrapper around snakemake's srcdir to work like os.path.join"""
    if len(args) == 1:
        return srcdir(args[0])
    return srcdir(join(*args))
Sander Bollen's avatar
Sander Bollen committed
78

79
80
81
covpy = fsrc_dir("src", "covstats.py")
colpy = fsrc_dir("src", "collect_stats.py")
mpy = fsrc_dir("src", "merge_stats.py")
Sander Bollen's avatar
Sander Bollen committed
82
seq = fsrc_dir("src", "seqtk.sh")
Sander Bollen's avatar
Sander Bollen committed
83
fqpy = fsrc_dir("src", "fastqc_stats.py")
Sander Bollen's avatar
Sander Bollen committed
84
tsvpy = fsrc_dir("src", "stats_to_tsv.py")
Sander Bollen's avatar
Sander Bollen committed
85
fqsc = fsrc_dir("src", "safe_fastqc.sh")
Sander Bollen's avatar
Sander Bollen committed
86

Sander Bollen's avatar
Sander Bollen committed
87
# sample config parsing
Sander Bollen's avatar
Sander Bollen committed
88
89
90
91
with open(config.get("SAMPLE_CONFIG")) as handle:
    SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()

Sander Bollen's avatar
Sander Bollen committed
92
93
94
95
96
97
98
99
100
if BED != "":
    BEDS = BED.split(",")
else:
    BEDS = []

if REFFLAT != "":
    REFFLATS = REFFLAT.split(",")
else:
    REFFLATS = []
Sander Bollen's avatar
Sander Bollen committed
101
102

BASE_BEDS = [basename(x) for x in BEDS]
Sander Bollen's avatar
Sander Bollen committed
103
BASE_REFFLATS = [basename(x) for x in REFFLATS]
Sander Bollen's avatar
Sander Bollen committed
104

Sander Bollen's avatar
Sander Bollen committed
105

van den Berg's avatar
van den Berg committed
106
def split_genome(ref, approx_n_chunks=2):
Sander Bollen's avatar
Sander Bollen committed
107
108
109
110
111
112
113
114
    """
    Split genome in chunks.

    Chunks are strings in the format: `<ctg>:<start>-<end>`
    These follow the region string format as used by htslib,
    which uses _1_-based indexing.
    See: http://www.htslib.org/doc/tabix.html
    """
Sander Bollen's avatar
Sander Bollen committed
115
116
117
118
119
    fa = Fasta(ref)
    tot_size = sum([len(x) for x in fa.records.values()])
    chunk_size = tot_size//approx_n_chunks
    chunks = []
    for chrom_name, chrom_value in fa.records.items():
Sander Bollen's avatar
Sander Bollen committed
120
        pos = 1
Sander Bollen's avatar
Sander Bollen committed
121
122
123
124
125
126
127
128
129
130
131
132
133
        while pos <= len(chrom_value):
            end = pos+chunk_size
            if end <= len(chrom_value):
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, end)
            else:
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, len(chrom_value))
            chunks.append(chunk)
            pos = end
    return chunks

CHUNKS = split_genome(REFERENCE)


Sander Bollen's avatar
Sander Bollen committed
134

Sander Bollen's avatar
Sander Bollen committed
135
136
def get_r(strand, wildcards):
    """Get fastq files on a single strand for a sample"""
Sander Bollen's avatar
Sander Bollen committed
137
    s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
Sander Bollen's avatar
Sander Bollen committed
138
    rs = []
139
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
Sander Bollen committed
140
141
        rs.append(s['libraries'][l][strand])
    return rs
Sander Bollen's avatar
Sander Bollen committed
142

Sander Bollen's avatar
Sander Bollen committed
143
144
get_r1 = partial(get_r, "R1")
get_r2 = partial(get_r, "R2")
Sander Bollen's avatar
Sander Bollen committed
145
146


Sander Bollen's avatar
Sander Bollen committed
147
def get_bedpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
148
149
    """Get absolute path of a bed file"""
    return next(x for x in BEDS if basename(x) == wildcards.bed)
Sander Bollen's avatar
Sander Bollen committed
150
151
152


def get_refflatpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
153
    """Get absolute path of a refflat file"""
Sander Bollen's avatar
Sander Bollen committed
154
    return next(x for x in REFFLATS if basename(x) == wildcards.ref)
Sander Bollen's avatar
Sander Bollen committed
155
156


Sander Bollen's avatar
Sander Bollen committed
157
def sample_gender(wildcards):
Sander Bollen's avatar
Sander Bollen committed
158
    """Get sample gender"""
Sander Bollen's avatar
Sander Bollen committed
159
160
161
162
    sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
    return sam.get("gender", "null")


Sander Bollen's avatar
Sander Bollen committed
163
164
165
166
def metrics(do_metrics=True):
    if not do_metrics:
        return ""

Sander Bollen's avatar
Sander Bollen committed
167
168
169
170
171
172
    fqcr = expand("{sample}/pre_process/raw_fastqc/.done.txt",
                  sample=SAMPLES)
    fqcm = expand("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
                  sample=SAMPLES)
    fqcp = expand("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
                  sample=SAMPLES)
173
    if len(REFFLATS) >= 1:
Sander Bollen's avatar
Sander Bollen committed
174
175
        coverage_stats = expand("{sample}/coverage/{ref}.coverages.tsv",
                                sample=SAMPLES, ref=BASE_REFFLATS)
176
177
    else:
        coverage_stats = []
Sander Bollen's avatar
Sander Bollen committed
178
    stats = "stats.json"
Sander Bollen's avatar
Sander Bollen committed
179
    return  fqcr + fqcm + fqcp + coverage_stats + [stats]
Sander Bollen's avatar
Sander Bollen committed
180
181


Sander Bollen's avatar
Sander Bollen committed
182
183
184
localrules: gvcf_chunkfile, genotype_chunkfile


Sander Bollen's avatar
Sander Bollen committed
185
186
rule all:
    input:
Sander Bollen's avatar
Sander Bollen committed
187
188
        combined="multisample/genotyped.vcf.gz",
        report="multiqc_report/multiqc_report.html",
Sander Bollen's avatar
Sander Bollen committed
189
190
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai",sample=SAMPLES),
        vcfs=expand("{sample}/vcf/{sample}_single.vcf.gz",sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
191
        stats=metrics()
Sander Bollen's avatar
Sander Bollen committed
192

Sander Bollen's avatar
Sander Bollen committed
193
194
195

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
196
    output: directory("tmp")
197
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
198
199
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
200
rule genome:
Sander Bollen's avatar
Sander Bollen committed
201
    """Create genome file as used by bedtools"""
Sander Bollen's avatar
Sander Bollen committed
202
    input: REFERENCE
Sander Bollen's avatar
Sander Bollen committed
203
    output: "current.genome"
204
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
205
206
207
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
Sander Bollen's avatar
Sander Bollen committed
208
    """Merge all forward fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
209
    input: get_r1
Sander Bollen's avatar
Sander Bollen committed
210
    output: temp("{sample}/pre_process/{sample}.merged_R1.fastq.gz")
211
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
212
213
214
    shell: "cat {input} > {output}"

rule merge_r2:
Sander Bollen's avatar
Sander Bollen committed
215
    """Merge all reverse fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
216
    input: get_r2
Sander Bollen's avatar
Sander Bollen committed
217
    output: temp("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
218
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
219
220
    shell: "cat {input} > {output}"

Sander Bollen's avatar
Sander Bollen committed
221
222

rule seqtk_r1:
Sander Bollen's avatar
Sander Bollen committed
223
    """Either subsample or link forward fastq file"""
Sander Bollen's avatar
Sander Bollen committed
224
    input:
Sander Bollen's avatar
Sander Bollen committed
225
226
        stats="{sample}/pre_process/{sample}.preqc_count.json",
        fastq="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
227
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
228
    params:
Sander Bollen's avatar
Sander Bollen committed
229
        max_bases=str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
230
    output:
Sander Bollen's avatar
Sander Bollen committed
231
        fastq=temp("{sample}/pre_process/{sample}.sampled_R1.fastq.gz")
Sander Bollen's avatar
Sander Bollen committed
232
    singularity: "docker://quay.io/biocontainers/mulled-v2-13686261ac0aa5682c680670ff8cda7b09637943:d143450dec169186731bb4df6f045a3c9ee08eb6-0"
Sander Bollen's avatar
Sander Bollen committed
233
234
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
235
236
237


rule seqtk_r2:
Sander Bollen's avatar
Sander Bollen committed
238
    """Either subsample or link reverse fastq file"""
Sander Bollen's avatar
Sander Bollen committed
239
    input:
Sander Bollen's avatar
Sander Bollen committed
240
241
        stats = "{sample}/pre_process/{sample}.preqc_count.json",
        fastq = "{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
242
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
243
    params:
Sander Bollen's avatar
Sander Bollen committed
244
        max_bases =str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
245
    output:
Sander Bollen's avatar
Sander Bollen committed
246
        fastq = temp("{sample}/pre_process/{sample}.sampled_R2.fastq.gz")
Sander Bollen's avatar
Sander Bollen committed
247
    singularity: "docker://quay.io/biocontainers/mulled-v2-13686261ac0aa5682c680670ff8cda7b09637943:d143450dec169186731bb4df6f045a3c9ee08eb6-0"
Sander Bollen's avatar
Sander Bollen committed
248
249
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} "
           "{params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
250
251


252
# contains original merged fastq files as input to prevent them from being prematurely deleted
Sander Bollen's avatar
Sander Bollen committed
253
rule sickle:
Sander Bollen's avatar
Sander Bollen committed
254
    """Trim fastq files"""
Sander Bollen's avatar
Sander Bollen committed
255
    input:
Sander Bollen's avatar
Sander Bollen committed
256
257
258
259
        r1 = "{sample}/pre_process/{sample}.sampled_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}.sampled_R2.fastq.gz",
        rr1 = "{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        rr2 = "{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
260
    output:
Sander Bollen's avatar
Sander Bollen committed
261
262
        r1 = temp("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.trimmed_R2.fastq"),
Sander Bollen's avatar
Sander Bollen committed
263
        s = "{sample}/pre_process/{sample}.trimmed_singles.fastq"
Sander Bollen's avatar
Sander Bollen committed
264
    singularity: "docker://quay.io/biocontainers/sickle-trim:1.33--ha92aebf_4"
Sander Bollen's avatar
Sander Bollen committed
265
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
266
267
268
           "-p {output.r2} -s {output.s}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
269
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
270
    input:
Sander Bollen's avatar
Sander Bollen committed
271
272
        r1 = "{sample}/pre_process/{sample}.trimmed_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.trimmed_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
273
    output:
Sander Bollen's avatar
Sander Bollen committed
274
275
        r1 = temp("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
Sander Bollen's avatar
Sander Bollen committed
276
    singularity: "docker://quay.io/biocontainers/cutadapt:1.14--py36_0"
Sander Bollen's avatar
Sander Bollen committed
277
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
278
279
280
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
Sander Bollen's avatar
Sander Bollen committed
281
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
282
    input:
Sander Bollen's avatar
Sander Bollen committed
283
284
        r1 = "{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
285
286
        ref = REFERENCE,
        temp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
287
288
    params:
        rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
Sander Bollen's avatar
Sander Bollen committed
289
    output: temp("{sample}/bams/{sample}.sorted.bam")
290
    singularity: "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0"
Sander Bollen's avatar
Sander Bollen committed
291
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
292
           "| picard -Xmx4G SortSam CREATE_INDEX=TRUE TMP_DIR={input.temp} "
Sander Bollen's avatar
Sander Bollen committed
293
294
295
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

rule markdup:
Sander Bollen's avatar
Sander Bollen committed
296
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
297
    input:
Sander Bollen's avatar
Sander Bollen committed
298
        bam = "{sample}/bams/{sample}.sorted.bam",
Sander Bollen's avatar
Sander Bollen committed
299
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
300
    output:
Sander Bollen's avatar
Sander Bollen committed
301
302
303
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
Sander Bollen's avatar
Sander Bollen committed
304
    singularity: "docker://quay.io/biocontainers/picard:2.14--py36_0"
305
    shell: "picard -Xmx4G MarkDuplicates CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
306
307
           "INPUT={input.bam} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
308
309
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
310
311
312
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
313
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
314
    output:
Sander Bollen's avatar
Sander Bollen committed
315
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
316
    singularity: "docker://debian:buster-slim"
Sander Bollen's avatar
Sander Bollen committed
317
318
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
319
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
320
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
321
    input:
Sander Bollen's avatar
Sander Bollen committed
322
        bam = "{sample}/bams/{sample}.markdup.bam",
Sander Bollen's avatar
Sander Bollen committed
323
324
        ref = REFERENCE,
    output:
Sander Bollen's avatar
Sander Bollen committed
325
        grp = "{sample}/bams/{sample}.baserecal.grp"
326
    params:
327
        known_sites = BSQR_known_sites
328
329
    singularity: "docker://broadinstitute/gatk3:3.7-0"
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
330
331
           "BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
332
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites}"
Sander Bollen's avatar
Sander Bollen committed
333

Sander Bollen's avatar
Sander Bollen committed
334
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
335
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
336
    input:
Sander Bollen's avatar
Sander Bollen committed
337
338
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
Sander Bollen's avatar
Sander Bollen committed
339
        dbsnp=DBSNP,
Sander Bollen's avatar
Sander Bollen committed
340
        ref=REFERENCE,
Sander Bollen's avatar
Sander Bollen committed
341
    params:
Sander Bollen's avatar
Sander Bollen committed
342
        chunk="{chunk}"
Sander Bollen's avatar
Sander Bollen committed
343
    output:
Sander Bollen's avatar
Sander Bollen committed
344
345
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.part.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.part.vcf.gz.tbi")
346
347
    singularity: "docker://broadinstitute/gatk3:3.7-0"
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
348
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
349
350
351
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
           "-L '{params.chunk}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
Sander Bollen's avatar
Sander Bollen committed
352
           "-BQSR {input.bqsr}"
Sander Bollen's avatar
Sander Bollen committed
353
354


Sander Bollen's avatar
Sander Bollen committed
355
356
357
rule gvcf_chunkfile:
    """
    Create simple text file with paths to chunks for GVCF.
358

Sander Bollen's avatar
Sander Bollen committed
359
360
    This uses a run directive in stead of a shell directive because
    the amount of chunks may be so large the shell would error out with
361
    an "argument list too long" error.
Sander Bollen's avatar
Sander Bollen committed
362
    See https://unix.stackexchange.com/a/120842 for more info
363

van den Berg's avatar
van den Berg committed
364
    This also means this rule lives outside of singularity and is
365
    executed in snakemake's own environment.
Sander Bollen's avatar
Sander Bollen committed
366
367
368
369
370
371
372
    """
    params:
        chunkfiles = expand("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz",
                            chunk=CHUNKS)
    output:
        file = "{sample}/vcf/chunkfile.txt"
    run:
373
        with open(output.file, "w") as ohandle:
Sander Bollen's avatar
Sander Bollen committed
374
            for filename in params.chunkfiles:
375
376
                corrected = filename.format(sample=wildcards.sample)
                ohandle.write(corrected + "\n")
Sander Bollen's avatar
Sander Bollen committed
377

Sander Bollen's avatar
Sander Bollen committed
378
rule gvcf_gather:
Sander Bollen's avatar
Sander Bollen committed
379
    """Gather all GVCF scatters"""
Sander Bollen's avatar
Sander Bollen committed
380
    input:
Sander Bollen's avatar
Sander Bollen committed
381
382
383
        gvcfs = expand("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz",
                       chunk=CHUNKS),
        chunkfile = "{sample}/vcf/chunkfile.txt"
Sander Bollen's avatar
Sander Bollen committed
384
    output:
Sander Bollen's avatar
Sander Bollen committed
385
386
387
388
389
390
391
392
393
394
395
396
397
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz"
    singularity: "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4"
    shell: "bcftools concat -f {input.chunkfile} -n > {output.gvcf}"


rule gvcf_gather_tbi:
    """Index GVCF"""
    input:
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz"
    output:
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
    singularity: "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0"
    shell: "tabix -pvcf {input.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
398
399
400


rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
401
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
402
    input:
Sander Bollen's avatar
Sander Bollen committed
403
        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz", sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
404
        tbis = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi",
Sander Bollen's avatar
Sander Bollen committed
405
                      sample=SAMPLES),
406
        ref=REFERENCE
Sander Bollen's avatar
Sander Bollen committed
407
    params:
Sander Bollen's avatar
Sander Bollen committed
408
409
        li=" -V ".join(expand("{sample}/vcf/{sample}.g.vcf.gz",
                              sample=SAMPLES)),
Sander Bollen's avatar
Sander Bollen committed
410
411
        chunk="{chunk}"
    output:
Sander Bollen's avatar
Sander Bollen committed
412
413
        vcf=temp("multisample/genotype.{chunk}.part.vcf.gz"),
        vcf_tbi=temp("multisample/genotype.{chunk}.part.vcf.gz.tbi")
414
415
    singularity: "docker://broadinstitute/gatk3:3.7-0"
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
416
           "GenotypeGVCFs -R {input.ref} "
Sander Bollen's avatar
Sander Bollen committed
417
           "-V {params.li} -L '{params.chunk}' -o '{output.vcf}'"
Sander Bollen's avatar
Sander Bollen committed
418
419


420
421
422
rule genotype_chunkfile:
    """
    Create simple text file with paths to chunks for genotyping
423

424
425
    This uses a run directive in stead of a shell directive because
    the amount of chunks may be so large the shell would error out with
426
    an "argument list too long" error.
427
    See https://unix.stackexchange.com/a/120842 for more info
428

van den Berg's avatar
van den Berg committed
429
    This also means this rule lives outside of singularity and is
430
    executed in snakemake's own environment.
431
432
    """
    params:
Sander Bollen's avatar
Sander Bollen committed
433
434
        vcfs = expand("multisample/genotype.{chunk}.part.vcf.gz",
                      chunk=CHUNKS)
435
436
437
438
439
440
441
442
    output:
        file = "multisample/chunkfile.txt"
    run:
        with open(output.file, "w") as ohandle:
            for filename in params.vcfs:
                ohandle.write(filename + "\n")


Sander Bollen's avatar
Sander Bollen committed
443
rule genotype_gather:
444
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
445
    input:
446
447
448
        vcfs = expand("multisample/genotype.{chunk}.part.vcf.gz",
                      chunk=CHUNKS),
        chunkfile = "multisample/chunkfile.txt"
Sander Bollen's avatar
Sander Bollen committed
449
    output:
450
451
452
453
454
455
456
457
458
459
        vcf = "multisample/genotyped.vcf.gz"
    singularity: "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4"
    shell: "bcftools concat -f {input.chunkfile} -n > {output.vcf}"


rule genotype_gather_tbi:
    """Index genotyped vcf file"""
    input:
        vcf = "multisample/genotyped.vcf.gz"
    output:
Sander Bollen's avatar
Sander Bollen committed
460
        tbi = "multisample/genotyped.vcf.gz.tbi"
461
462
    singularity: "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0"
    shell: "tabix -pvcf {input.vcf}"
Sander Bollen's avatar
Sander Bollen committed
463
464


465
466
467
rule split_vcf:
    """Split multisample VCF in single samples"""
    input:
Sander Bollen's avatar
Sander Bollen committed
468
        vcf="multisample/genotyped.vcf.gz",
469
        tbi = "multisample/genotyped.vcf.gz.tbi",
470
471
472
473
        ref=REFERENCE
    params:
        s="{sample}"
    output:
Sander Bollen's avatar
Sander Bollen committed
474
        splitted="{sample}/vcf/{sample}_single.vcf.gz"
475
476
    singularity: "docker://broadinstitute/gatk3:3.7-0"
    shell: "java -Xmx15G -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
477
478
           "-T SelectVariants -sn {params.s} -env -R {input.ref} -V "
           "{input.vcf} -o {output.splitted}"
479
480


Sander Bollen's avatar
Sander Bollen committed
481
482
483
## bam metrics

rule mapped_num:
Sander Bollen's avatar
Sander Bollen committed
484
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
485
    input:
Sander Bollen's avatar
Sander Bollen committed
486
        bam="{sample}/bams/{sample}.sorted.bam"
Sander Bollen's avatar
Sander Bollen committed
487
    output:
Sander Bollen's avatar
Sander Bollen committed
488
        num="{sample}/bams/{sample}.mapped.num"
489
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
Sander Bollen's avatar
Sander Bollen committed
490
491
492
493
    shell: "samtools view -F 4 {input.bam} | wc -l > {output.num}"


rule mapped_basenum:
Sander Bollen's avatar
Sander Bollen committed
494
    """Calculate number of mapped bases"""
Sander Bollen's avatar
Sander Bollen committed
495
    input:
Sander Bollen's avatar
Sander Bollen committed
496
        bam="{sample}/bams/{sample}.sorted.bam"
Sander Bollen's avatar
Sander Bollen committed
497
    output:
Sander Bollen's avatar
Sander Bollen committed
498
        num="{sample}/bams/{sample}.mapped.basenum"
499
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
Sander Bollen's avatar
Sander Bollen committed
500
    shell: "samtools view -F 4 {input.bam} | cut -f10 | wc -c > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
501
502
503


rule unique_num:
Sander Bollen's avatar
Sander Bollen committed
504
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
505
    input:
Sander Bollen's avatar
Sander Bollen committed
506
        bam="{sample}/bams/{sample}.markdup.bam"
Sander Bollen's avatar
Sander Bollen committed
507
    output:
Sander Bollen's avatar
Sander Bollen committed
508
        num="{sample}/bams/{sample}.unique.num"
509
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
510
    shell: "samtools view -F 4 -F 1024 {input.bam} | wc -l > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
511
512
513


rule usable_basenum:
Sander Bollen's avatar
Sander Bollen committed
514
    """Calculate number of bases on unique reads"""
Sander Bollen's avatar
Sander Bollen committed
515
    input:
Sander Bollen's avatar
Sander Bollen committed
516
        bam="{sample}/bams/{sample}.markdup.bam"
Sander Bollen's avatar
Sander Bollen committed
517
    output:
Sander Bollen's avatar
Sander Bollen committed
518
        num="{sample}/bams/{sample}.usable.basenum"
519
    singularity: "docker://quay.io/biocontainers/samtools:1.6--he673b24_3"
Sander Bollen's avatar
Sander Bollen committed
520
521
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f10 | wc -c > "
           "{output.num}"
Sander Bollen's avatar
Sander Bollen committed
522
523
524
525


## fastqc

Sander Bollen's avatar
Sander Bollen committed
526
rule fastqc_raw:
527
528
    """
    Run fastqc on raw fastq files
529
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
530
531
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
532
    input:
Sander Bollen's avatar
Sander Bollen committed
533
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
534
535
        r2=get_r2
    params:
Sander Bollen's avatar
Sander Bollen committed
536
        odir="{sample}/pre_process/raw_fastqc"
Sander Bollen's avatar
Sander Bollen committed
537
    output:
Sander Bollen's avatar
Sander Bollen committed
538
        aux="{sample}/pre_process/raw_fastqc/.done.txt"
539
    singularity: "docker://quay.io/biocontainers/fastqc:0.11.7--4"
Sander Bollen's avatar
Sander Bollen committed
540
541
    shell: "fastqc --nogroup -o {params.odir} {input.r1} {input.r2} "
           "&& echo 'done' > {output.aux}"
Sander Bollen's avatar
Sander Bollen committed
542
543


Sander Bollen's avatar
Sander Bollen committed
544
rule fastqc_merged:
545
    """
546
547
    Run fastqc on merged fastq files
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
548
549
    perl missing in the container of 0.11.5
    """
Sander Bollen's avatar
Sander Bollen committed
550
    input:
Sander Bollen's avatar
Sander Bollen committed
551
552
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
553
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
554
    params:
Sander Bollen's avatar
Sander Bollen committed
555
        odir="{sample}/pre_process/merged_fastqc"
Sander Bollen's avatar
Sander Bollen committed
556
    output:
Sander Bollen's avatar
Sander Bollen committed
557
558
        r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip"
559
    singularity: "docker://quay.io/biocontainers/fastqc:0.11.7--4"
Sander Bollen's avatar
Sander Bollen committed
560
561
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
562
563


Sander Bollen's avatar
Sander Bollen committed
564
rule fastqc_postqc:
565
566
    """
    Run fastqc on fastq files post pre-processing
567
568
    NOTE: singularity version uses 0.11.7 in stead of 0.11.5 due to
    perl missing in the container of 0.11.5
569
    """
Sander Bollen's avatar
Sander Bollen committed
570
    input:
Sander Bollen's avatar
Sander Bollen committed
571
572
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
573
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
574
    params:
Sander Bollen's avatar
Sander Bollen committed
575
        odir="{sample}/pre_process/postqc_fastqc"
Sander Bollen's avatar
Sander Bollen committed
576
    output:
Sander Bollen's avatar
Sander Bollen committed
577
578
        r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip"
579
    singularity: "docker://quay.io/biocontainers/fastqc:0.11.7--4"
Sander Bollen's avatar
Sander Bollen committed
580
581
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
582
583
584
585
586


## fastq-count

rule fqcount_preqc:
Sander Bollen's avatar
Sander Bollen committed
587
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
588
    input:
Sander Bollen's avatar
Sander Bollen committed
589
590
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
591
    output:
Sander Bollen's avatar
Sander Bollen committed
592
        "{sample}/pre_process/{sample}.preqc_count.json"
Sander Bollen's avatar
Sander Bollen committed
593
    singularity: "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0"
Sander Bollen's avatar
Sander Bollen committed
594
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
595
596
597


rule fqcount_postqc:
Sander Bollen's avatar
Sander Bollen committed
598
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
599
    input:
Sander Bollen's avatar
Sander Bollen committed
600
601
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
602
    output:
Sander Bollen's avatar
Sander Bollen committed
603
        "{sample}/pre_process/{sample}.postqc_count.json"
Sander Bollen's avatar
Sander Bollen committed
604
    singularity: "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0"
Sander Bollen's avatar
Sander Bollen committed
605
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
606
607


Sander Bollen's avatar
Sander Bollen committed
608
609
610
611
# fastqc stats
rule fastqc_stats:
    """Collect fastq stats for a sample in json format"""
    input:
Sander Bollen's avatar
Sander Bollen committed
612
613
614
615
        preqc_r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        preqc_r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip",
        postqc_r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        postqc_r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip",
Sander Bollen's avatar
Sander Bollen committed
616
        sc=fqpy
Sander Bollen's avatar
Sander Bollen committed
617
    singularity: "docker://python:3.6-slim"
Sander Bollen's avatar
Sander Bollen committed
618
    output:
Sander Bollen's avatar
Sander Bollen committed
619
        "{sample}/pre_process/fastq_stats.json"
620
621
622
    shell: "python {input.sc} --preqc-r1 {input.preqc_r1} "
           "--preqc-r2 {input.preqc_r2} "
           "--postqc-r1 {input.postqc_r1} "
Sander Bollen's avatar
Sander Bollen committed
623
           "--postqc-r2 {input.postqc_r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
624

Sander Bollen's avatar
Sander Bollen committed
625
626
627
## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
628
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
629
    input:
Sander Bollen's avatar
Sander Bollen committed
630
631
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
Sander Bollen's avatar
Sander Bollen committed
632
633
634
635
636
        covpy=covpy,
        bed=get_bedpath
    params:
        subt="Sample {sample}"
    output:
Sander Bollen's avatar
Sander Bollen committed
637
638
        covj="{sample}/coverage/{bed}.covstats.json",
        covp="{sample}/coverage/{bed}.covstats.png"
639
    singularity: "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0"
Sander Bollen's avatar
Sander Bollen committed
640
641
642
643
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
644
645


Sander Bollen's avatar
Sander Bollen committed
646
647
648
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
649
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
650
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
Sander Bollen's avatar
Sander Bollen committed
651
652
        ref=get_refflatpath
    output:
Sander Bollen's avatar
Sander Bollen committed
653
        tsv="{sample}/coverage/{ref}.coverages.tsv"
Sander Bollen's avatar
Sander Bollen committed
654
    singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
Sander Bollen's avatar
Sander Bollen committed
655
656
657
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
658
659
660
## vcfstats

rule vcfstats:
Sander Bollen's avatar
Sander Bollen committed
661
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
662
    input:
Sander Bollen's avatar
Sander Bollen committed
663
664
        vcf="multisample/genotyped.vcf.gz",
        tbi = "multisample/genotyped.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
665
    output:
Sander Bollen's avatar
Sander Bollen committed
666
        stats="multisample/vcfstats.json"
Sander Bollen's avatar
Sander Bollen committed
667
    singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
Sander Bollen's avatar
Sander Bollen committed
668
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
669
670


Sander Bollen's avatar
Sander Bollen committed
671
## collection
672
673
674
675
if len(BASE_BEDS) >= 1:
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
Sander Bollen's avatar
Sander Bollen committed
676
677
678
679
680
681
682
            preqc="{sample}/pre_process/{sample}.preqc_count.json",
            postq="{sample}/pre_process/{sample}.postqc_count.json",
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
683
684
            cov=expand("{{sample}}/coverage/{bed}.covstats.json",
                       bed=BASE_BEDS),
685
686
687
688
689
            colpy=colpy
        params:
            sample_name="{sample}",
            fthresh=FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
690
            "{sample}/{sample}.stats.json"
Sander Bollen's avatar
Sander Bollen committed
691
        singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
692
693
694
695
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
696
697
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc} {input.cov} > {output}"
698
699
else:
    rule collectstats:
Sander Bollen's avatar
Sander Bollen committed
700
        """Collect all stats for a particular sample without beds"""
Sander Bollen's avatar
Sander Bollen committed
701
        input:
Sander Bollen's avatar
Sander Bollen committed
702
703
704
705
706
707
708
            preqc = "{sample}/pre_process/{sample}.preqc_count.json",
            postq = "{sample}/pre_process/{sample}.postqc_count.json",
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
709
710
711
712
713
            colpy = colpy
        params:
            sample_name = "{sample}",
            fthresh = FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
714
            "{sample}/{sample}.stats.json"
Sander Bollen's avatar
Sander Bollen committed
715
        singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
Sander Bollen's avatar
Sander Bollen committed
716
717
718
719
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
720
721
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc}  > {output}"
Sander Bollen's avatar
Sander Bollen committed
722
723

rule merge_stats:
Sander Bollen's avatar
Sander Bollen committed
724
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
725
    input:
Sander Bollen's avatar
Sander Bollen committed
726
727
        cols=expand("{sample}/{sample}.stats.json", sample=SAMPLES),
        vstat="multisample/vcfstats.json",
Sander Bollen's avatar
Sander Bollen committed
728
729
        mpy=mpy
    output:
Sander Bollen's avatar
Sander Bollen committed
730
        stats="stats.json"
731
    singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
Sander Bollen's avatar
Sander Bollen committed
732
733
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
           "> {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
734
735


Sander Bollen's avatar
Sander Bollen committed
736
737
738
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
Sander Bollen's avatar
Sander Bollen committed
739
        stats="stats.json",
Sander Bollen's avatar
Sander Bollen committed
740
741
        sc=tsvpy
    output:
Sander Bollen's avatar
Sander Bollen committed
742
        stats="stats.tsv"
743
    singularity: "docker://python:3.6-slim"
Sander Bollen's avatar
Sander Bollen committed
744
745
746
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
747
748
749
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
750
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
751
752
    """
    input:
Sander Bollen's avatar
Sander Bollen committed
753
        stats="stats.tsv"
Sander Bollen's avatar
Sander Bollen committed
754
    params:
Sander Bollen's avatar
Sander Bollen committed
755
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
756
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
757
    output:
Sander Bollen's avatar
Sander Bollen committed
758
        report="multiqc_report/multiqc_report.html"
Sander Bollen's avatar
Sander Bollen committed
759
    singularity: "docker://quay.io/biocontainers/multiqc:1.5--py36_0"
760
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output.report}"