Snakefile 25 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
Main Snakefile for the pipeline. 

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
Sander Bollen's avatar
Sander Bollen committed
25
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
26
from os.path import join, basename
27
from pathlib import Path
Sander Bollen's avatar
Sander Bollen committed
28
29

from pyfaidx import Fasta
Sander Bollen's avatar
Sander Bollen committed
30

Sander Bollen's avatar
Sander Bollen committed
31
OUT_DIR = config.get("OUTPUT_DIR")  # TODO: use regular snakemake option?
32
33
34
if OUT_DIR is None:
    raise ValueError("You must set --config OUT_DIR=<path>")

Sander Bollen's avatar
Sander Bollen committed
35
REFERENCE = config.get("REFERENCE")
36
37
if REFERENCE is None:
    raise ValueError("You must set --config REFERENCE=<path>")
Sander Bollen's avatar
Sander Bollen committed
38
if not Path(REFERENCE).exists():
Sander Bollen's avatar
Sander Bollen committed
39
40
    raise FileNotFoundError("Reference file {0} "
                            "does not exist.".format(REFERENCE))
41
42
43
44

JAVA = config.get("JAVA")  # TODO: should be handled by conda?!
if JAVA is None:
    raise ValueError("You must set --config JAVA=<path>")
Sander Bollen's avatar
Sander Bollen committed
45
46
if not Path(JAVA).exists():
    raise FileNotFoundError("{0} does not exist".format(JAVA))
47

Sander Bollen's avatar
Sander Bollen committed
48
GATK = config.get("GATK")
49
50
if GATK is None:
    raise ValueError("You must set --config GATK=<path>")
Sander Bollen's avatar
Sander Bollen committed
51
52
if not Path(GATK).exists():
    raise FileNotFoundError("{0} does not exist.".format(GATK))
53

Sander Bollen's avatar
Sander Bollen committed
54
DBSNP = config.get("DBSNP")
55
56
if DBSNP is None:
    raise ValueError("You must set --config DBSNP=<path>")
Sander Bollen's avatar
Sander Bollen committed
57
58
if not Path(DBSNP).exists():
    raise FileNotFoundError("{0} does not exist".format(DBSNP))
59

Sander Bollen's avatar
Sander Bollen committed
60
ONETHOUSAND = config.get("ONETHOUSAND")
61
62
if ONETHOUSAND is None:
    raise ValueError("You must set --config ONETHOUSAND=<path>")
Sander Bollen's avatar
Sander Bollen committed
63
64
if not Path(ONETHOUSAND).exists():
    raise FileNotFoundError("{0} does not exist".format(ONETHOUSAND))
65

Sander Bollen's avatar
Sander Bollen committed
66
HAPMAP = config.get("HAPMAP")
67
if HAPMAP is None:
Sander Bollen's avatar
Sander Bollen committed
68
69
70
    raise ValueError("You must set --config HAPMAP=<path>")
if not Path(HAPMAP).exists():
    raise FileNotFoundError("{0} does not exist".format(HAPMAP))
71
72

# these are all optional
73
74
BED = config.get("BED", "")  # comma-separated list of BED files
REFFLAT = config.get("REFFLAT", "")  # comma-separated list of refFlat files
Sander Bollen's avatar
Sander Bollen committed
75
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
76
FASTQ_COUNT = config.get("FASTQ_COUNT")
Sander Bollen's avatar
Sander Bollen committed
77
MAX_BASES = config.get("MAX_BASES", "")
Sander Bollen's avatar
Sander Bollen committed
78

79
80
81
82
83
def fsrc_dir(*args):
    """Wrapper around snakemake's srcdir to work like os.path.join"""
    if len(args) == 1:
        return srcdir(args[0])
    return srcdir(join(*args))
Sander Bollen's avatar
Sander Bollen committed
84

85
86
87
covpy = fsrc_dir("src", "covstats.py")
colpy = fsrc_dir("src", "collect_stats.py")
mpy = fsrc_dir("src", "merge_stats.py")
Sander Bollen's avatar
Sander Bollen committed
88
seq = fsrc_dir("src", "seqtk.sh")
Sander Bollen's avatar
Sander Bollen committed
89
fqpy = fsrc_dir("src", "fastqc_stats.py")
Sander Bollen's avatar
Sander Bollen committed
90
tsvpy = fsrc_dir("src", "stats_to_tsv.py")
Sander Bollen's avatar
Sander Bollen committed
91
fqsc = fsrc_dir("src", "safe_fastqc.sh")
Sander Bollen's avatar
Sander Bollen committed
92

93
if FASTQ_COUNT is None:
94
    fqc = "python {0}".format(fsrc_dir("src", "fastq-count.py"))
95
96
97
else:
    fqc = FASTQ_COUNT

Sander Bollen's avatar
Sander Bollen committed
98
99
100
101
102
103
104
# sample config parsing
SCONFIG = config.get("SAMPLE_CONFIG")
if SCONFIG is None:
    raise ValueError("You must set --config SAMPLE_CONFIG=<path>")
if not Path(SCONFIG).exists():
    raise FileNotFoundError("{0} does not exist".format(SCONFIG))

Sander Bollen's avatar
Sander Bollen committed
105
106
107
108
with open(config.get("SAMPLE_CONFIG")) as handle:
    SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()

Sander Bollen's avatar
Sander Bollen committed
109
110
111
112
113
114
115
116
117
if BED != "":
    BEDS = BED.split(",")
else:
    BEDS = []

if REFFLAT != "":
    REFFLATS = REFFLAT.split(",")
else:
    REFFLATS = []
Sander Bollen's avatar
Sander Bollen committed
118
119

BASE_BEDS = [basename(x) for x in BEDS]
Sander Bollen's avatar
Sander Bollen committed
120
BASE_REFFLATS = [basename(x) for x in REFFLATS]
Sander Bollen's avatar
Sander Bollen committed
121

Sander Bollen's avatar
Sander Bollen committed
122
123

def split_genome(ref, approx_n_chunks=100):
Sander Bollen's avatar
Sander Bollen committed
124
125
126
127
128
129
130
131
    """
    Split genome in chunks.

    Chunks are strings in the format: `<ctg>:<start>-<end>`
    These follow the region string format as used by htslib,
    which uses _1_-based indexing.
    See: http://www.htslib.org/doc/tabix.html
    """
Sander Bollen's avatar
Sander Bollen committed
132
133
134
135
136
    fa = Fasta(ref)
    tot_size = sum([len(x) for x in fa.records.values()])
    chunk_size = tot_size//approx_n_chunks
    chunks = []
    for chrom_name, chrom_value in fa.records.items():
Sander Bollen's avatar
Sander Bollen committed
137
        pos = 1
Sander Bollen's avatar
Sander Bollen committed
138
139
140
141
142
143
144
145
146
147
148
149
150
        while pos <= len(chrom_value):
            end = pos+chunk_size
            if end <= len(chrom_value):
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, end)
            else:
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, len(chrom_value))
            chunks.append(chunk)
            pos = end
    return chunks

CHUNKS = split_genome(REFERENCE)


Sander Bollen's avatar
Sander Bollen committed
151
152
153
def out_path(path):
    return join(OUT_DIR, path)

Sander Bollen's avatar
Sander Bollen committed
154
155
def get_r(strand, wildcards):
    """Get fastq files on a single strand for a sample"""
Sander Bollen's avatar
Sander Bollen committed
156
    s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
Sander Bollen's avatar
Sander Bollen committed
157
    rs = []
158
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
Sander Bollen committed
159
160
        rs.append(s['libraries'][l][strand])
    return rs
Sander Bollen's avatar
Sander Bollen committed
161

Sander Bollen's avatar
Sander Bollen committed
162
163
get_r1 = partial(get_r, "R1")
get_r2 = partial(get_r, "R2")
Sander Bollen's avatar
Sander Bollen committed
164
165


Sander Bollen's avatar
Sander Bollen committed
166
def get_bedpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
167
168
    """Get absolute path of a bed file"""
    return next(x for x in BEDS if basename(x) == wildcards.bed)
Sander Bollen's avatar
Sander Bollen committed
169
170
171


def get_refflatpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
172
    """Get absolute path of a refflat file"""
Sander Bollen's avatar
Sander Bollen committed
173
    return next(x for x in REFFLATS if basename(x) == wildcards.ref)
Sander Bollen's avatar
Sander Bollen committed
174
175


Sander Bollen's avatar
Sander Bollen committed
176
def sample_gender(wildcards):
Sander Bollen's avatar
Sander Bollen committed
177
    """Get sample gender"""
Sander Bollen's avatar
Sander Bollen committed
178
179
180
181
    sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
    return sam.get("gender", "null")


Sander Bollen's avatar
Sander Bollen committed
182
183
184
185
186
187
def metrics(do_metrics=True):
    if not do_metrics:
        return ""

    fqcr = expand(out_path("{sample}/pre_process/raw_fastqc/.done.txt"),
                  sample=SAMPLES)
Sander Bollen's avatar
Sander Bollen committed
188
    fqcm = expand(out_path("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip"),
Sander Bollen's avatar
Sander Bollen committed
189
                  sample=SAMPLES)
Sander Bollen's avatar
Sander Bollen committed
190
    fqcp = expand(out_path("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip"),
Sander Bollen's avatar
Sander Bollen committed
191
                  sample=SAMPLES)
192
193
194
195
196
    if len(REFFLATS) >= 1:
        coverage_stats = expand(out_path("{sample}/coverage/{ref}.coverages.tsv"),
                                sample=SAMPLES, ref=BASE_REFFLATS)
    else:
        coverage_stats = []
Sander Bollen's avatar
Sander Bollen committed
197
    stats = out_path("stats.json")
Sander Bollen's avatar
Sander Bollen committed
198
    return  fqcr + fqcm + fqcp + coverage_stats + [stats]
Sander Bollen's avatar
Sander Bollen committed
199
200


Sander Bollen's avatar
Sander Bollen committed
201
202
rule all:
    input:
Sander Bollen's avatar
Sander Bollen committed
203
        combined=out_path("multisample/genotyped.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
204
        report=out_path("multiqc_report/multiqc_report.html"),
Sander Bollen's avatar
Sander Bollen committed
205
206
        bais=expand(out_path("{sample}/bams/{sample}.markdup.bam.bai"),
                    sample=SAMPLES),
207
208
        vcfs=expand(out_path("{sample}/vcf/{sample}_single.vcf.gz"),
                    sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
209
        stats=metrics()
Sander Bollen's avatar
Sander Bollen committed
210

Sander Bollen's avatar
Sander Bollen committed
211
212
213
214
215
216

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
    output: ancient(out_path("tmp"))
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
217
rule genome:
Sander Bollen's avatar
Sander Bollen committed
218
    """Create genome file as used by bedtools"""
Sander Bollen's avatar
Sander Bollen committed
219
220
221
222
223
    input: REFERENCE
    output: out_path("current.genome")
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
Sander Bollen's avatar
Sander Bollen committed
224
    """Merge all forward fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
225
226
227
228
229
    input: get_r1
    output: temp(out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"))
    shell: "cat {input} > {output}"

rule merge_r2:
Sander Bollen's avatar
Sander Bollen committed
230
    """Merge all reverse fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
231
232
233
234
    input: get_r2
    output: temp(out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz"))
    shell: "cat {input} > {output}"

Sander Bollen's avatar
Sander Bollen committed
235
236

rule seqtk_r1:
Sander Bollen's avatar
Sander Bollen committed
237
    """Either subsample or link forward fastq file"""
Sander Bollen's avatar
Sander Bollen committed
238
239
    input:
        stats=out_path("{sample}/pre_process/{sample}.preqc_count.json"),
Sander Bollen's avatar
Sander Bollen committed
240
241
        fastq=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
242
    params:
Sander Bollen's avatar
Sander Bollen committed
243
        max_bases=str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
244
245
    output:
        fastq=temp(out_path("{sample}/pre_process/{sample}.sampled_R1.fastq.gz"))
Sander Bollen's avatar
Sander Bollen committed
246
    conda: "envs/seqtk.yml"
Sander Bollen's avatar
Sander Bollen committed
247
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} {params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
248
249
250


rule seqtk_r2:
Sander Bollen's avatar
Sander Bollen committed
251
    """Either subsample or link reverse fastq file"""
Sander Bollen's avatar
Sander Bollen committed
252
253
    input:
        stats = out_path("{sample}/pre_process/{sample}.preqc_count.json"),
Sander Bollen's avatar
Sander Bollen committed
254
255
        fastq = out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz"),
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
256
    params:
Sander Bollen's avatar
Sander Bollen committed
257
        max_bases =str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
258
259
    output:
        fastq = temp(out_path("{sample}/pre_process/{sample}.sampled_R2.fastq.gz"))
Sander Bollen's avatar
Sander Bollen committed
260
    conda: "envs/seqtk.yml"
Sander Bollen's avatar
Sander Bollen committed
261
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} {params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
262
263


264
# contains original merged fastq files as input to prevent them from being prematurely deleted
Sander Bollen's avatar
Sander Bollen committed
265
rule sickle:
Sander Bollen's avatar
Sander Bollen committed
266
    """Trim fastq files"""
Sander Bollen's avatar
Sander Bollen committed
267
    input:
Sander Bollen's avatar
Sander Bollen committed
268
        r1 = out_path("{sample}/pre_process/{sample}.sampled_R1.fastq.gz"),
269
270
271
        r2 = out_path("{sample}/pre_process/{sample}.sampled_R2.fastq.gz"),
        rr1 = out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
        rr2 = out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
Sander Bollen's avatar
Sander Bollen committed
272
273
274
275
276
    output:
        r1 = temp(out_path("{sample}/pre_process/{sample}.trimmed_R1.fastq")),
        r2 = temp(out_path("{sample}/pre_process/{sample}.trimmed_R2.fastq")),
        s = out_path("{sample}/pre_process/{sample}.trimmed_singles.fastq"),
    conda: "envs/sickle.yml"
Sander Bollen's avatar
Sander Bollen committed
277
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
278
279
280
           "-p {output.r2} -s {output.s}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
281
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
282
283
284
285
286
287
288
    input:
        r1 = out_path("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
        r2 = out_path("{sample}/pre_process/{sample}.trimmed_R2.fastq")
    output:
        r1 = temp(out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq")),
        r2 = temp(out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq"))
    conda: "envs/cutadapt.yml"
Sander Bollen's avatar
Sander Bollen committed
289
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
290
291
292
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
Sander Bollen's avatar
Sander Bollen committed
293
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
294
295
296
297
298
299
300
301
    input:
        r1 = out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2 = out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq"),
        ref = REFERENCE
    params:
        rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
    output: temp(out_path("{sample}/bams/{sample}.sorted.bam"))
    conda: "envs/bwa.yml"
Sander Bollen's avatar
Sander Bollen committed
302
303
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
           "| picard SortSam CREATE_INDEX=TRUE TMP_DIR=null "
Sander Bollen's avatar
Sander Bollen committed
304
305
306
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

rule markdup:
Sander Bollen's avatar
Sander Bollen committed
307
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
308
309
    input:
        bam = out_path("{sample}/bams/{sample}.sorted.bam"),
310
        tmp = ancient(out_path("tmp"))
Sander Bollen's avatar
Sander Bollen committed
311
    output:
Sander Bollen's avatar
Sander Bollen committed
312
        bam = out_path("{sample}/bams/{sample}.markdup.bam"),
Sander Bollen's avatar
Sander Bollen committed
313
        bai = out_path("{sample}/bams/{sample}.markdup.bai"),
Sander Bollen's avatar
Sander Bollen committed
314
315
        metrics = out_path("{sample}/bams/{sample}.markdup.metrics")
    conda: "envs/picard.yml"
Sander Bollen's avatar
Sander Bollen committed
316
317
318
    shell: "picard MarkDuplicates CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "INPUT={input.bam} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
319
320
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
321
322
323
324
325
326
327
328
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
        bai = out_path("{sample}/bams/{sample}.markdup.bai")
    output:
        bai = out_path("{sample}/bams/{sample}.markdup.bam.bai")
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
329
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
330
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
331
332
333
334
335
336
337
338
339
340
341
    input:
        bam = out_path("{sample}/bams/{sample}.markdup.bam"),
        java = JAVA,
        gatk = GATK,
        ref = REFERENCE,
        dbsnp = DBSNP,
        one1kg = ONETHOUSAND,
        hapmap = HAPMAP
    output:
        grp = out_path("{sample}/bams/{sample}.baserecal.grp")
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
342
    shell: "{input.java} -XX:ParallelGCThreads=1 -jar {input.gatk} -T BaseRecalibrator "
Sander Bollen's avatar
Sander Bollen committed
343
344
345
346
           "-I {input.bam} -o {output.grp} -nct 8 -R {input.ref} "
           "-cov ReadGroupCovariate -cov QualityScoreCovariate "
           "-cov CycleCovariate -cov ContextCovariate -knownSites "
           "{input.dbsnp} -knownSites {input.one1kg} "
Sander Bollen's avatar
Sander Bollen committed
347
348
           "-knownSites {input.hapmap}"

Sander Bollen's avatar
Sander Bollen committed
349
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
350
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
351
    input:
Sander Bollen's avatar
Sander Bollen committed
352
353
        bam=out_path("{sample}/bams/{sample}.markdup.bam"),
        bqsr=out_path("{sample}/bams/{sample}.baserecal.grp"),
Sander Bollen's avatar
Sander Bollen committed
354
        dbsnp=DBSNP,
Sander Bollen's avatar
Sander Bollen committed
355
356
        ref=REFERENCE,
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
357
    params:
Sander Bollen's avatar
Sander Bollen committed
358
        chunk="{chunk}"
Sander Bollen's avatar
Sander Bollen committed
359
    output:
360
361
        gvcf=temp(out_path("{sample}/vcf/{sample}.{chunk}.part.vcf.gz")),
        gvcf_tbi=temp(out_path("{sample}/vcf/{sample}.{chunk}.part.vcf.gz.tbi"))
Sander Bollen's avatar
Sander Bollen committed
362
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
363
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 {input.gatk} -T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
364
365
366
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
           "-L '{params.chunk}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
Sander Bollen's avatar
Sander Bollen committed
367
           "-BQSR {input.bqsr}"
Sander Bollen's avatar
Sander Bollen committed
368
369


Sander Bollen's avatar
Sander Bollen committed
370
rule gvcf_gather:
Sander Bollen's avatar
Sander Bollen committed
371
    """Gather all gvcf scatters"""
Sander Bollen's avatar
Sander Bollen committed
372
    input:
Sander Bollen's avatar
Sander Bollen committed
373
374
        gvcfs=expand(out_path("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz"),
                     chunk=CHUNKS),
375
376
        tbis=expand(out_path("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz.tbi"),
                     chunk=CHUNKS),
Sander Bollen's avatar
Sander Bollen committed
377
        ref=REFERENCE,
Sander Bollen's avatar
Sander Bollen committed
378
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
379
    params:
Sander Bollen's avatar
Sander Bollen committed
380
        gvcfs="' -V '".join(expand(out_path("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
381
                                 chunk=CHUNKS))
Sander Bollen's avatar
Sander Bollen committed
382
383
    output:
        gvcf=out_path("{sample}/vcf/{sample}.g.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
384
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
385
    shell: "java -Xmx4G -XX:ParallelGCThreads=1 -cp {input.gatk} org.broadinstitute.gatk.tools.CatVariants "
Sander Bollen's avatar
Sander Bollen committed
386
           "-R {input.ref} -V '{params.gvcfs}' -out {output.gvcf} "
Sander Bollen's avatar
Sander Bollen committed
387
388
389
390
           "-assumeSorted"


rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
391
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
392
    input:
Sander Bollen's avatar
Sander Bollen committed
393
394
        gvcfs = expand(out_path("{sample}/vcf/{sample}.g.vcf.gz"),
                       sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
395
        ref=REFERENCE,
Sander Bollen's avatar
Sander Bollen committed
396
397
398
        gatk=GATK
    params:
        li=" -V ".join(expand(out_path("{sample}/vcf/{sample}.g.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
399
                              sample=SAMPLES)),
Sander Bollen's avatar
Sander Bollen committed
400
401
        chunk="{chunk}"
    output:
402
403
        vcf=temp(out_path("multisample/genotype.{chunk}.part.vcf.gz")),
        vcf_tbi=temp(out_path("multisample/genotype.{chunk}.part.vcf.gz.tbi"))
Sander Bollen's avatar
Sander Bollen committed
404
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
405
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 {input.gatk} -T GenotypeGVCFs -R {input.ref} "
Sander Bollen's avatar
Sander Bollen committed
406
           "-V {params.li} -L '{params.chunk}' -o '{output.vcf}'"
Sander Bollen's avatar
Sander Bollen committed
407
408
409


rule genotype_gather:
Sander Bollen's avatar
Sander Bollen committed
410
    """Gather all genotyping scatters"""
Sander Bollen's avatar
Sander Bollen committed
411
412
413
    input:
        vcfs=expand(out_path("multisample/genotype.{chunk}.part.vcf.gz"),
                    chunk=CHUNKS),
414
415
        tbis=expand(out_path("multisample/genotype.{chunk}.part.vcf.gz.tbi"),
                    chunk=CHUNKS),
Sander Bollen's avatar
Sander Bollen committed
416
417
        ref=REFERENCE,
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
418
    params:
Sander Bollen's avatar
Sander Bollen committed
419
        vcfs="' -V '".join(expand(out_path("multisample/genotype.{chunk}.part.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
420
                                chunk=CHUNKS))
Sander Bollen's avatar
Sander Bollen committed
421
422
423
    output:
        combined=out_path("multisample/genotyped.vcf.gz")
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
424
    shell: "java -Xmx4G -XX:ParallelGCThreads=1 -cp {input.gatk} org.broadinstitute.gatk.tools.CatVariants "
Sander Bollen's avatar
Sander Bollen committed
425
           "-R {input.ref} -V '{params.vcfs}' -out {output.combined} "
Sander Bollen's avatar
Sander Bollen committed
426
           "-assumeSorted"
Sander Bollen's avatar
Sander Bollen committed
427
428


429
430
431
432
433
434
435
436
437
438
439
rule split_vcf:
    """Split multisample VCF in single samples"""
    input:
        vcf=out_path("multisample/genotyped.vcf.gz"),
        gatk=GATK,
        ref=REFERENCE
    params:
        s="{sample}"
    output:
        splitted=out_path("{sample}/vcf/{sample}_single.vcf.gz")
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
440
    shell: "java -Xmx15G -XX:ParallelGCThreads=1 -jar {input.gatk} -T SelectVariants -sn "
441
           "{params.s} -env -R {input.ref} -V {input.vcf} -o {output.splitted}"
442
443


Sander Bollen's avatar
Sander Bollen committed
444
445
446
## bam metrics

rule mapped_num:
Sander Bollen's avatar
Sander Bollen committed
447
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
448
449
450
451
452
453
454
455
456
    input:
        bam=out_path("{sample}/bams/{sample}.sorted.bam")
    output:
        num=out_path("{sample}/bams/{sample}.mapped.num")
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 {input.bam} | wc -l > {output.num}"


rule mapped_basenum:
Sander Bollen's avatar
Sander Bollen committed
457
    """Calculate number of mapped bases"""
Sander Bollen's avatar
Sander Bollen committed
458
459
460
461
462
    input:
        bam=out_path("{sample}/bams/{sample}.sorted.bam")
    output:
        num=out_path("{sample}/bams/{sample}.mapped.basenum")
    conda: "envs/samtools.yml"
Sander Bollen's avatar
Sander Bollen committed
463
    shell: "samtools view -F 4 {input.bam} | cut -f10 | wc -c > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
464
465
466


rule unique_num:
Sander Bollen's avatar
Sander Bollen committed
467
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
468
469
470
471
472
    input:
        bam=out_path("{sample}/bams/{sample}.markdup.bam")
    output:
        num=out_path("{sample}/bams/{sample}.unique.num")
    conda: "envs/samtools.yml"
473
    shell: "samtools view -F 4 -F 1024 {input.bam} | wc -l > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
474
475
476


rule usable_basenum:
Sander Bollen's avatar
Sander Bollen committed
477
    """Calculate number of bases on unique reads"""
Sander Bollen's avatar
Sander Bollen committed
478
479
480
481
482
483
    input:
        bam=out_path("{sample}/bams/{sample}.markdup.bam")
    output:
        num=out_path("{sample}/bams/{sample}.usable.basenum")
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f10 | wc -c > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
484
485
486
487


## fastqc

Sander Bollen's avatar
Sander Bollen committed
488
rule fastqc_raw:
Sander Bollen's avatar
Sander Bollen committed
489
    """Run fastqc on raw fastq files"""
Sander Bollen's avatar
Sander Bollen committed
490
    input:
Sander Bollen's avatar
Sander Bollen committed
491
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
492
493
494
495
496
        r2=get_r2
    params:
        odir=out_path("{sample}/pre_process/raw_fastqc")
    output:
        aux=out_path("{sample}/pre_process/raw_fastqc/.done.txt")
497
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
498
    shell: "fastqc --nogroup -o {params.odir} {input.r1} {input.r2} && echo 'done' > {output.aux}"
Sander Bollen's avatar
Sander Bollen committed
499
500


Sander Bollen's avatar
Sander Bollen committed
501
rule fastqc_merged:
Sander Bollen's avatar
Sander Bollen committed
502
    """Run fastqc on merged fastq files"""
Sander Bollen's avatar
Sander Bollen committed
503
    input:
Sander Bollen's avatar
Sander Bollen committed
504
        r1=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
Sander Bollen's avatar
Sander Bollen committed
505
506
        r2=out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz"),
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
507
508
509
    params:
        odir=out_path("{sample}/pre_process/merged_fastqc")
    output:
Sander Bollen's avatar
Sander Bollen committed
510
511
        r1=out_path("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip"),
        r2=out_path("{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip")
512
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
513
514
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
515
516


Sander Bollen's avatar
Sander Bollen committed
517
rule fastqc_postqc:
Sander Bollen's avatar
Sander Bollen committed
518
    """Run fastqc on fastq files post pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
519
    input:
Sander Bollen's avatar
Sander Bollen committed
520
        r1=out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
Sander Bollen's avatar
Sander Bollen committed
521
522
        r2=out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq"),
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
523
524
525
    params:
        odir=out_path("{sample}/pre_process/postqc_fastqc")
    output:
Sander Bollen's avatar
Sander Bollen committed
526
        r1=out_path("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip"),
Sander Bollen's avatar
Sander Bollen committed
527
        r2=out_path("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip")
528
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
529
530
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
531
532
533
534
535


## fastq-count

rule fqcount_preqc:
Sander Bollen's avatar
Sander Bollen committed
536
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
537
538
539
    input:
        r1=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
        r2=out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
540
    params:
541
        fastqcount=fqc
Sander Bollen's avatar
Sander Bollen committed
542
543
    output:
        out_path("{sample}/pre_process/{sample}.preqc_count.json")
544
    shell: "{params.fastqcount} {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
545
546
547


rule fqcount_postqc:
Sander Bollen's avatar
Sander Bollen committed
548
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
549
550
551
    input:
        r1=out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2=out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
552
    params:
553
        fastqcount=fqc
Sander Bollen's avatar
Sander Bollen committed
554
555
    output:
        out_path("{sample}/pre_process/{sample}.postqc_count.json")
Sander Bollen's avatar
Sander Bollen committed
556
557
558
    shell: "{params.fastqcount} {input.r1} {input.r2} > {output}"


Sander Bollen's avatar
Sander Bollen committed
559
560
561
562
563
564
565
566
567
# fastqc stats
rule fastqc_stats:
    """Collect fastq stats for a sample in json format"""
    input:
        preqc_r1=out_path("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip"),
        preqc_r2=out_path("{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip"),
        postqc_r1=out_path("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip"),
        postqc_r2=out_path("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip"),
        sc=fqpy
568
    conda: "envs/collectstats.yml"
Sander Bollen's avatar
Sander Bollen committed
569
570
    output:
        out_path("{sample}/pre_process/fastq_stats.json")
571
572
573
    shell: "python {input.sc} --preqc-r1 {input.preqc_r1} "
           "--preqc-r2 {input.preqc_r2} "
           "--postqc-r1 {input.postqc_r1} "
Sander Bollen's avatar
Sander Bollen committed
574
           "--postqc-r2 {input.postqc_r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
575

Sander Bollen's avatar
Sander Bollen committed
576
577
578
## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
579
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
580
581
582
583
584
585
586
587
588
    input:
        bam=out_path("{sample}/bams/{sample}.markdup.bam"),
        genome=out_path("current.genome"),
        covpy=covpy,
        bed=get_bedpath
    params:
        subt="Sample {sample}"
    output:
        covj=out_path("{sample}/coverage/{bed}.covstats.json"),
Sander Bollen's avatar
Sander Bollen committed
589
        covp=out_path("{sample}/coverage/{bed}.covstats.png")
Sander Bollen's avatar
Sander Bollen committed
590
    conda: "envs/covstat.yml"
Sander Bollen's avatar
Sander Bollen committed
591
592
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} -b {input.bam} "
           "-d  | python {input.covpy} - --plot {output.covp} "
Sander Bollen's avatar
Sander Bollen committed
593
           "--title 'Targets coverage' --subtitle '{params.subt}' > {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
594
595


Sander Bollen's avatar
Sander Bollen committed
596
597
598
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
599
        gvcf=out_path("{sample}/vcf/{sample}.g.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
600
601
602
603
604
605
606
        ref=get_refflatpath
    output:
        tsv=out_path("{sample}/coverage/{ref}.coverages.tsv")
    conda: "envs/vcfstats.yml"
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
607
608
609
## vcfstats

rule vcfstats:
Sander Bollen's avatar
Sander Bollen committed
610
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
611
    input:
Sander Bollen's avatar
Sander Bollen committed
612
        vcf=out_path("multisample/genotyped.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
613
614
615
    output:
        stats=out_path("multisample/vcfstats.json")
    conda: "envs/vcfstats.yml"
Sander Bollen's avatar
Sander Bollen committed
616
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
617
618


Sander Bollen's avatar
Sander Bollen committed
619
## collection
620
621
622
623
624
625
626
627
628
629
if len(BASE_BEDS) >= 1:
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
            preqc=out_path("{sample}/pre_process/{sample}.preqc_count.json"),
            postq=out_path("{sample}/pre_process/{sample}.postqc_count.json"),
            mnum=out_path("{sample}/bams/{sample}.mapped.num"),
            mbnum=out_path("{sample}/bams/{sample}.mapped.basenum"),
            unum=out_path("{sample}/bams/{sample}.unique.num"),
            ubnum=out_path("{sample}/bams/{sample}.usable.basenum"),
630
            fastqc=out_path("{sample}/pre_process/fastq_stats.json"),
631
632
633
634
635
636
637
638
639
640
641
642
643
            cov=expand(out_path("{{sample}}/coverage/{bed}.covstats.json"),
                       bed=BASE_BEDS),
            colpy=colpy
        params:
            sample_name="{sample}",
            fthresh=FEMALE_THRESHOLD
        output:
            out_path("{sample}/{sample}.stats.json")
        conda: "envs/collectstats.yml"
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
644
645
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc} {input.cov} > {output}"
646
647
else:
    rule collectstats:
Sander Bollen's avatar
Sander Bollen committed
648
        """Collect all stats for a particular sample without beds"""
Sander Bollen's avatar
Sander Bollen committed
649
650
651
652
653
654
655
        input:
            preqc = out_path("{sample}/pre_process/{sample}.preqc_count.json"),
            postq = out_path("{sample}/pre_process/{sample}.postqc_count.json"),
            mnum = out_path("{sample}/bams/{sample}.mapped.num"),
            mbnum = out_path("{sample}/bams/{sample}.mapped.basenum"),
            unum = out_path("{sample}/bams/{sample}.unique.num"),
            ubnum = out_path("{sample}/bams/{sample}.usable.basenum"),
656
            fastqc=out_path("{sample}/pre_process/fastq_stats.json"),
Sander Bollen's avatar
Sander Bollen committed
657
658
659
660
661
662
663
664
665
666
667
            colpy = colpy
        params:
            sample_name = "{sample}",
            fthresh = FEMALE_THRESHOLD
        output:
            out_path("{sample}/{sample}.stats.json")
        conda: "envs/collectstats.yml"
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
668
669
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc}  > {output}"
Sander Bollen's avatar
Sander Bollen committed
670
671

rule merge_stats:
Sander Bollen's avatar
Sander Bollen committed
672
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
673
674
675
676
677
678
679
680
    input:
        cols=expand(out_path("{sample}/{sample}.stats.json"), sample=SAMPLES),
        vstat=out_path("multisample/vcfstats.json"),
        mpy=mpy
    output:
        stats=out_path("stats.json")
    conda: "envs/collectstats.yml"
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
681
682


Sander Bollen's avatar
Sander Bollen committed
683
684
685
686
687
688
689
690
691
692
693
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
        stats=out_path("stats.json"),
        sc=tsvpy
    output:
        stats=out_path("stats.tsv")
    conda: "envs/collectstats.yml"
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
694
695
696
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
697
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
698
699
    """
    input:
Sander Bollen's avatar
Sander Bollen committed
700
        stats=out_path("stats.tsv")
Sander Bollen's avatar
Sander Bollen committed
701
    params:
702
703
        odir=out_path("."),
        rdir=out_path("multiqc_report")
Sander Bollen's avatar
Sander Bollen committed
704
705
    output:
        report=out_path("multiqc_report/multiqc_report.html")
Sander Bollen's avatar
Sander Bollen committed
706
    conda: "envs/multiqc.yml"
707
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output.report}"