Snakefile 23.2 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
Main Snakefile for the pipeline. 

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
Sander Bollen's avatar
Sander Bollen committed
25
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
26
from os.path import join, basename
27
from pathlib import Path
Sander Bollen's avatar
Sander Bollen committed
28
29

from pyfaidx import Fasta
Sander Bollen's avatar
Sander Bollen committed
30
31

REFERENCE = config.get("REFERENCE")
32
33
if REFERENCE is None:
    raise ValueError("You must set --config REFERENCE=<path>")
Sander Bollen's avatar
Sander Bollen committed
34
if not Path(REFERENCE).exists():
Sander Bollen's avatar
Sander Bollen committed
35
36
    raise FileNotFoundError("Reference file {0} "
                            "does not exist.".format(REFERENCE))
37
38
39
40

JAVA = config.get("JAVA")  # TODO: should be handled by conda?!
if JAVA is None:
    raise ValueError("You must set --config JAVA=<path>")
Sander Bollen's avatar
Sander Bollen committed
41
42
if not Path(JAVA).exists():
    raise FileNotFoundError("{0} does not exist".format(JAVA))
43

Sander Bollen's avatar
Sander Bollen committed
44
GATK = config.get("GATK")
45
46
if GATK is None:
    raise ValueError("You must set --config GATK=<path>")
Sander Bollen's avatar
Sander Bollen committed
47
48
if not Path(GATK).exists():
    raise FileNotFoundError("{0} does not exist.".format(GATK))
49

Sander Bollen's avatar
Sander Bollen committed
50
DBSNP = config.get("DBSNP")
51
52
if DBSNP is None:
    raise ValueError("You must set --config DBSNP=<path>")
Sander Bollen's avatar
Sander Bollen committed
53
54
if not Path(DBSNP).exists():
    raise FileNotFoundError("{0} does not exist".format(DBSNP))
55

Sander Bollen's avatar
Sander Bollen committed
56
ONETHOUSAND = config.get("ONETHOUSAND")
57
58
if ONETHOUSAND is None:
    raise ValueError("You must set --config ONETHOUSAND=<path>")
Sander Bollen's avatar
Sander Bollen committed
59
60
if not Path(ONETHOUSAND).exists():
    raise FileNotFoundError("{0} does not exist".format(ONETHOUSAND))
61

Sander Bollen's avatar
Sander Bollen committed
62
HAPMAP = config.get("HAPMAP")
63
if HAPMAP is None:
Sander Bollen's avatar
Sander Bollen committed
64
65
66
    raise ValueError("You must set --config HAPMAP=<path>")
if not Path(HAPMAP).exists():
    raise FileNotFoundError("{0} does not exist".format(HAPMAP))
67
68

# these are all optional
69
70
BED = config.get("BED", "")  # comma-separated list of BED files
REFFLAT = config.get("REFFLAT", "")  # comma-separated list of refFlat files
Sander Bollen's avatar
Sander Bollen committed
71
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
72
FASTQ_COUNT = config.get("FASTQ_COUNT")
Sander Bollen's avatar
Sander Bollen committed
73
MAX_BASES = config.get("MAX_BASES", "")
Sander Bollen's avatar
Sander Bollen committed
74

75
76
77
78
79
def fsrc_dir(*args):
    """Wrapper around snakemake's srcdir to work like os.path.join"""
    if len(args) == 1:
        return srcdir(args[0])
    return srcdir(join(*args))
Sander Bollen's avatar
Sander Bollen committed
80

81
82
83
covpy = fsrc_dir("src", "covstats.py")
colpy = fsrc_dir("src", "collect_stats.py")
mpy = fsrc_dir("src", "merge_stats.py")
Sander Bollen's avatar
Sander Bollen committed
84
seq = fsrc_dir("src", "seqtk.sh")
Sander Bollen's avatar
Sander Bollen committed
85
fqpy = fsrc_dir("src", "fastqc_stats.py")
Sander Bollen's avatar
Sander Bollen committed
86
tsvpy = fsrc_dir("src", "stats_to_tsv.py")
Sander Bollen's avatar
Sander Bollen committed
87
fqsc = fsrc_dir("src", "safe_fastqc.sh")
Sander Bollen's avatar
Sander Bollen committed
88

89
if FASTQ_COUNT is None:
90
    fqc = "python {0}".format(fsrc_dir("src", "fastq-count.py"))
91
92
93
else:
    fqc = FASTQ_COUNT

Sander Bollen's avatar
Sander Bollen committed
94
95
96
97
98
99
100
# sample config parsing
SCONFIG = config.get("SAMPLE_CONFIG")
if SCONFIG is None:
    raise ValueError("You must set --config SAMPLE_CONFIG=<path>")
if not Path(SCONFIG).exists():
    raise FileNotFoundError("{0} does not exist".format(SCONFIG))

Sander Bollen's avatar
Sander Bollen committed
101
102
103
104
with open(config.get("SAMPLE_CONFIG")) as handle:
    SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()

Sander Bollen's avatar
Sander Bollen committed
105
106
107
108
109
110
111
112
113
if BED != "":
    BEDS = BED.split(",")
else:
    BEDS = []

if REFFLAT != "":
    REFFLATS = REFFLAT.split(",")
else:
    REFFLATS = []
Sander Bollen's avatar
Sander Bollen committed
114
115

BASE_BEDS = [basename(x) for x in BEDS]
Sander Bollen's avatar
Sander Bollen committed
116
BASE_REFFLATS = [basename(x) for x in REFFLATS]
Sander Bollen's avatar
Sander Bollen committed
117

Sander Bollen's avatar
Sander Bollen committed
118
119

def split_genome(ref, approx_n_chunks=100):
Sander Bollen's avatar
Sander Bollen committed
120
121
122
123
124
125
126
127
    """
    Split genome in chunks.

    Chunks are strings in the format: `<ctg>:<start>-<end>`
    These follow the region string format as used by htslib,
    which uses _1_-based indexing.
    See: http://www.htslib.org/doc/tabix.html
    """
Sander Bollen's avatar
Sander Bollen committed
128
129
130
131
132
    fa = Fasta(ref)
    tot_size = sum([len(x) for x in fa.records.values()])
    chunk_size = tot_size//approx_n_chunks
    chunks = []
    for chrom_name, chrom_value in fa.records.items():
Sander Bollen's avatar
Sander Bollen committed
133
        pos = 1
Sander Bollen's avatar
Sander Bollen committed
134
135
136
137
138
139
140
141
142
143
144
145
146
        while pos <= len(chrom_value):
            end = pos+chunk_size
            if end <= len(chrom_value):
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, end)
            else:
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, len(chrom_value))
            chunks.append(chunk)
            pos = end
    return chunks

CHUNKS = split_genome(REFERENCE)


Sander Bollen's avatar
Sander Bollen committed
147

Sander Bollen's avatar
Sander Bollen committed
148
149
def get_r(strand, wildcards):
    """Get fastq files on a single strand for a sample"""
Sander Bollen's avatar
Sander Bollen committed
150
    s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
Sander Bollen's avatar
Sander Bollen committed
151
    rs = []
152
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
Sander Bollen committed
153
154
        rs.append(s['libraries'][l][strand])
    return rs
Sander Bollen's avatar
Sander Bollen committed
155

Sander Bollen's avatar
Sander Bollen committed
156
157
get_r1 = partial(get_r, "R1")
get_r2 = partial(get_r, "R2")
Sander Bollen's avatar
Sander Bollen committed
158
159


Sander Bollen's avatar
Sander Bollen committed
160
def get_bedpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
161
162
    """Get absolute path of a bed file"""
    return next(x for x in BEDS if basename(x) == wildcards.bed)
Sander Bollen's avatar
Sander Bollen committed
163
164
165


def get_refflatpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
166
    """Get absolute path of a refflat file"""
Sander Bollen's avatar
Sander Bollen committed
167
    return next(x for x in REFFLATS if basename(x) == wildcards.ref)
Sander Bollen's avatar
Sander Bollen committed
168
169


Sander Bollen's avatar
Sander Bollen committed
170
def sample_gender(wildcards):
Sander Bollen's avatar
Sander Bollen committed
171
    """Get sample gender"""
Sander Bollen's avatar
Sander Bollen committed
172
173
174
175
    sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
    return sam.get("gender", "null")


Sander Bollen's avatar
Sander Bollen committed
176
177
178
179
def metrics(do_metrics=True):
    if not do_metrics:
        return ""

Sander Bollen's avatar
Sander Bollen committed
180
181
182
    fqcr = expand("{sample}/pre_process/raw_fastqc/.done.txt", sample=SAMPLES)
    fqcm = expand("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip", sample=SAMPLES)
    fqcp = expand("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip", sample=SAMPLES)
183
    if len(REFFLATS) >= 1:
Sander Bollen's avatar
Sander Bollen committed
184
        coverage_stats = expand("{sample}/coverage/{ref}.coverages.tsv", sample=SAMPLES, ref=BASE_REFFLATS)
185
186
    else:
        coverage_stats = []
Sander Bollen's avatar
Sander Bollen committed
187
    stats = "stats.json"
Sander Bollen's avatar
Sander Bollen committed
188
    return  fqcr + fqcm + fqcp + coverage_stats + [stats]
Sander Bollen's avatar
Sander Bollen committed
189
190


Sander Bollen's avatar
Sander Bollen committed
191
192
rule all:
    input:
Sander Bollen's avatar
Sander Bollen committed
193
194
        combined="multisample/genotyped.vcf.gz",
        report="multiqc_report/multiqc_report.html",
Sander Bollen's avatar
Sander Bollen committed
195
196
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai",sample=SAMPLES),
        vcfs=expand("{sample}/vcf/{sample}_single.vcf.gz",sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
197
        stats=metrics()
Sander Bollen's avatar
Sander Bollen committed
198

Sander Bollen's avatar
Sander Bollen committed
199
200
201

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
Sander Bollen's avatar
Sander Bollen committed
202
    output: ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
203
204
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
205
rule genome:
Sander Bollen's avatar
Sander Bollen committed
206
    """Create genome file as used by bedtools"""
Sander Bollen's avatar
Sander Bollen committed
207
    input: REFERENCE
Sander Bollen's avatar
Sander Bollen committed
208
    output: "current.genome"
Sander Bollen's avatar
Sander Bollen committed
209
210
211
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
Sander Bollen's avatar
Sander Bollen committed
212
    """Merge all forward fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
213
    input: get_r1
Sander Bollen's avatar
Sander Bollen committed
214
    output: temp("{sample}/pre_process/{sample}.merged_R1.fastq.gz")
Sander Bollen's avatar
Sander Bollen committed
215
216
217
    shell: "cat {input} > {output}"

rule merge_r2:
Sander Bollen's avatar
Sander Bollen committed
218
    """Merge all reverse fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
219
    input: get_r2
Sander Bollen's avatar
Sander Bollen committed
220
    output: temp("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
Sander Bollen's avatar
Sander Bollen committed
221
222
    shell: "cat {input} > {output}"

Sander Bollen's avatar
Sander Bollen committed
223
224

rule seqtk_r1:
Sander Bollen's avatar
Sander Bollen committed
225
    """Either subsample or link forward fastq file"""
Sander Bollen's avatar
Sander Bollen committed
226
    input:
Sander Bollen's avatar
Sander Bollen committed
227
228
        stats="{sample}/pre_process/{sample}.preqc_count.json",
        fastq="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
229
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
230
    params:
Sander Bollen's avatar
Sander Bollen committed
231
        max_bases=str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
232
    output:
Sander Bollen's avatar
Sander Bollen committed
233
        fastq=temp("{sample}/pre_process/{sample}.sampled_R1.fastq.gz")
Sander Bollen's avatar
Sander Bollen committed
234
    conda: "envs/seqtk.yml"
Sander Bollen's avatar
Sander Bollen committed
235
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} {params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
236
237
238


rule seqtk_r2:
Sander Bollen's avatar
Sander Bollen committed
239
    """Either subsample or link reverse fastq file"""
Sander Bollen's avatar
Sander Bollen committed
240
    input:
Sander Bollen's avatar
Sander Bollen committed
241
242
        stats = "{sample}/pre_process/{sample}.preqc_count.json",
        fastq = "{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
243
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
244
    params:
Sander Bollen's avatar
Sander Bollen committed
245
        max_bases =str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
246
    output:
Sander Bollen's avatar
Sander Bollen committed
247
        fastq = temp("{sample}/pre_process/{sample}.sampled_R2.fastq.gz")
Sander Bollen's avatar
Sander Bollen committed
248
    conda: "envs/seqtk.yml"
Sander Bollen's avatar
Sander Bollen committed
249
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} {params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
250
251


252
# contains original merged fastq files as input to prevent them from being prematurely deleted
Sander Bollen's avatar
Sander Bollen committed
253
rule sickle:
Sander Bollen's avatar
Sander Bollen committed
254
    """Trim fastq files"""
Sander Bollen's avatar
Sander Bollen committed
255
    input:
Sander Bollen's avatar
Sander Bollen committed
256
257
258
259
        r1 = "{sample}/pre_process/{sample}.sampled_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}.sampled_R2.fastq.gz",
        rr1 = "{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        rr2 = "{sample}/pre_process/{sample}.merged_R2.fastq.gz"
Sander Bollen's avatar
Sander Bollen committed
260
    output:
Sander Bollen's avatar
Sander Bollen committed
261
262
        r1 = temp("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.trimmed_R2.fastq"),
Sander Bollen's avatar
Sander Bollen committed
263
        s = "{sample}/pre_process/{sample}.trimmed_singles.fastq"
Sander Bollen's avatar
Sander Bollen committed
264
    conda: "envs/sickle.yml"
Sander Bollen's avatar
Sander Bollen committed
265
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
266
267
268
           "-p {output.r2} -s {output.s}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
269
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
270
    input:
Sander Bollen's avatar
Sander Bollen committed
271
272
        r1 = "{sample}/pre_process/{sample}.trimmed_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.trimmed_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
273
    output:
Sander Bollen's avatar
Sander Bollen committed
274
275
        r1 = temp("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2 = temp("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
Sander Bollen's avatar
Sander Bollen committed
276
    conda: "envs/cutadapt.yml"
Sander Bollen's avatar
Sander Bollen committed
277
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
278
279
280
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
Sander Bollen's avatar
Sander Bollen committed
281
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
282
    input:
Sander Bollen's avatar
Sander Bollen committed
283
284
        r1 = "{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2 = "{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
285
286
287
        ref = REFERENCE
    params:
        rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
Sander Bollen's avatar
Sander Bollen committed
288
    output: temp("{sample}/bams/{sample}.sorted.bam")
Sander Bollen's avatar
Sander Bollen committed
289
    conda: "envs/bwa.yml"
Sander Bollen's avatar
Sander Bollen committed
290
291
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
           "| picard SortSam CREATE_INDEX=TRUE TMP_DIR=null "
Sander Bollen's avatar
Sander Bollen committed
292
293
294
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

rule markdup:
Sander Bollen's avatar
Sander Bollen committed
295
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
296
    input:
Sander Bollen's avatar
Sander Bollen committed
297
        bam = "{sample}/bams/{sample}.sorted.bam",
Sander Bollen's avatar
Sander Bollen committed
298
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
299
    output:
Sander Bollen's avatar
Sander Bollen committed
300
301
302
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
Sander Bollen's avatar
Sander Bollen committed
303
    conda: "envs/picard.yml"
Sander Bollen's avatar
Sander Bollen committed
304
305
306
    shell: "picard MarkDuplicates CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "INPUT={input.bam} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
307
308
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
309
310
311
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
312
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
313
    output:
Sander Bollen's avatar
Sander Bollen committed
314
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
Sander Bollen's avatar
Sander Bollen committed
315
316
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
317
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
318
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
319
    input:
Sander Bollen's avatar
Sander Bollen committed
320
        bam = "{sample}/bams/{sample}.markdup.bam",
Sander Bollen's avatar
Sander Bollen committed
321
322
323
324
325
326
327
        java = JAVA,
        gatk = GATK,
        ref = REFERENCE,
        dbsnp = DBSNP,
        one1kg = ONETHOUSAND,
        hapmap = HAPMAP
    output:
Sander Bollen's avatar
Sander Bollen committed
328
        grp = "{sample}/bams/{sample}.baserecal.grp"
Sander Bollen's avatar
Sander Bollen committed
329
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
330
    shell: "{input.java} -XX:ParallelGCThreads=1 -jar {input.gatk} -T BaseRecalibrator "
Sander Bollen's avatar
Sander Bollen committed
331
332
333
334
           "-I {input.bam} -o {output.grp} -nct 8 -R {input.ref} "
           "-cov ReadGroupCovariate -cov QualityScoreCovariate "
           "-cov CycleCovariate -cov ContextCovariate -knownSites "
           "{input.dbsnp} -knownSites {input.one1kg} "
Sander Bollen's avatar
Sander Bollen committed
335
336
           "-knownSites {input.hapmap}"

Sander Bollen's avatar
Sander Bollen committed
337
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
338
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
339
    input:
Sander Bollen's avatar
Sander Bollen committed
340
341
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
Sander Bollen's avatar
Sander Bollen committed
342
        dbsnp=DBSNP,
Sander Bollen's avatar
Sander Bollen committed
343
344
        ref=REFERENCE,
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
345
    params:
Sander Bollen's avatar
Sander Bollen committed
346
        chunk="{chunk}"
Sander Bollen's avatar
Sander Bollen committed
347
    output:
Sander Bollen's avatar
Sander Bollen committed
348
349
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.part.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.part.vcf.gz.tbi")
Sander Bollen's avatar
Sander Bollen committed
350
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
351
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 {input.gatk} -T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
352
353
354
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
           "-L '{params.chunk}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
Sander Bollen's avatar
Sander Bollen committed
355
           "-BQSR {input.bqsr}"
Sander Bollen's avatar
Sander Bollen committed
356
357


Sander Bollen's avatar
Sander Bollen committed
358
rule gvcf_gather:
Sander Bollen's avatar
Sander Bollen committed
359
    """Gather all gvcf scatters"""
Sander Bollen's avatar
Sander Bollen committed
360
    input:
Sander Bollen's avatar
Sander Bollen committed
361
362
        gvcfs=expand("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz", chunk=CHUNKS),
        tbis=expand("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz.tbi", chunk=CHUNKS),
Sander Bollen's avatar
Sander Bollen committed
363
        ref=REFERENCE,
Sander Bollen's avatar
Sander Bollen committed
364
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
365
    params:
Sander Bollen's avatar
Sander Bollen committed
366
        gvcfs="' -V '".join(expand("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz", chunk=CHUNKS))
Sander Bollen's avatar
Sander Bollen committed
367
    output:
Sander Bollen's avatar
Sander Bollen committed
368
        gvcf="{sample}/vcf/{sample}.g.vcf.gz"
Sander Bollen's avatar
Sander Bollen committed
369
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
370
    shell: "java -Xmx4G -XX:ParallelGCThreads=1 -cp {input.gatk} org.broadinstitute.gatk.tools.CatVariants "
Sander Bollen's avatar
Sander Bollen committed
371
           "-R {input.ref} -V '{params.gvcfs}' -out {output.gvcf} "
Sander Bollen's avatar
Sander Bollen committed
372
373
374
375
           "-assumeSorted"


rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
376
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
377
    input:
Sander Bollen's avatar
Sander Bollen committed
378
        gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz", sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
379
        ref=REFERENCE,
Sander Bollen's avatar
Sander Bollen committed
380
381
        gatk=GATK
    params:
Sander Bollen's avatar
Sander Bollen committed
382
        li=" -V ".join(expand("{sample}/vcf/{sample}.g.vcf.gz", sample=SAMPLES)),
Sander Bollen's avatar
Sander Bollen committed
383
384
        chunk="{chunk}"
    output:
Sander Bollen's avatar
Sander Bollen committed
385
386
        vcf=temp("multisample/genotype.{chunk}.part.vcf.gz"),
        vcf_tbi=temp("multisample/genotype.{chunk}.part.vcf.gz.tbi")
Sander Bollen's avatar
Sander Bollen committed
387
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
388
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 {input.gatk} -T GenotypeGVCFs -R {input.ref} "
Sander Bollen's avatar
Sander Bollen committed
389
           "-V {params.li} -L '{params.chunk}' -o '{output.vcf}'"
Sander Bollen's avatar
Sander Bollen committed
390
391
392


rule genotype_gather:
Sander Bollen's avatar
Sander Bollen committed
393
    """Gather all genotyping scatters"""
Sander Bollen's avatar
Sander Bollen committed
394
    input:
Sander Bollen's avatar
Sander Bollen committed
395
396
        vcfs=expand("multisample/genotype.{chunk}.part.vcf.gz", chunk=CHUNKS),
        tbis=expand("multisample/genotype.{chunk}.part.vcf.gz.tbi", chunk=CHUNKS),
Sander Bollen's avatar
Sander Bollen committed
397
398
        ref=REFERENCE,
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
399
    params:
Sander Bollen's avatar
Sander Bollen committed
400
        vcfs="' -V '".join(expand("multisample/genotype.{chunk}.part.vcf.gz", chunk=CHUNKS))
Sander Bollen's avatar
Sander Bollen committed
401
    output:
Sander Bollen's avatar
Sander Bollen committed
402
        combined="multisample/genotyped.vcf.gz"
Sander Bollen's avatar
Sander Bollen committed
403
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
404
    shell: "java -Xmx4G -XX:ParallelGCThreads=1 -cp {input.gatk} org.broadinstitute.gatk.tools.CatVariants "
Sander Bollen's avatar
Sander Bollen committed
405
           "-R {input.ref} -V '{params.vcfs}' -out {output.combined} "
Sander Bollen's avatar
Sander Bollen committed
406
           "-assumeSorted"
Sander Bollen's avatar
Sander Bollen committed
407
408


409
410
411
rule split_vcf:
    """Split multisample VCF in single samples"""
    input:
Sander Bollen's avatar
Sander Bollen committed
412
        vcf="multisample/genotyped.vcf.gz",
413
414
415
416
417
        gatk=GATK,
        ref=REFERENCE
    params:
        s="{sample}"
    output:
Sander Bollen's avatar
Sander Bollen committed
418
        splitted="{sample}/vcf/{sample}_single.vcf.gz"
419
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
420
    shell: "java -Xmx15G -XX:ParallelGCThreads=1 -jar {input.gatk} -T SelectVariants -sn "
421
           "{params.s} -env -R {input.ref} -V {input.vcf} -o {output.splitted}"
422
423


Sander Bollen's avatar
Sander Bollen committed
424
425
426
## bam metrics

rule mapped_num:
Sander Bollen's avatar
Sander Bollen committed
427
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
428
    input:
Sander Bollen's avatar
Sander Bollen committed
429
        bam="{sample}/bams/{sample}.sorted.bam"
Sander Bollen's avatar
Sander Bollen committed
430
    output:
Sander Bollen's avatar
Sander Bollen committed
431
        num="{sample}/bams/{sample}.mapped.num"
Sander Bollen's avatar
Sander Bollen committed
432
433
434
435
436
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 {input.bam} | wc -l > {output.num}"


rule mapped_basenum:
Sander Bollen's avatar
Sander Bollen committed
437
    """Calculate number of mapped bases"""
Sander Bollen's avatar
Sander Bollen committed
438
    input:
Sander Bollen's avatar
Sander Bollen committed
439
        bam="{sample}/bams/{sample}.sorted.bam"
Sander Bollen's avatar
Sander Bollen committed
440
    output:
Sander Bollen's avatar
Sander Bollen committed
441
        num="{sample}/bams/{sample}.mapped.basenum"
Sander Bollen's avatar
Sander Bollen committed
442
    conda: "envs/samtools.yml"
Sander Bollen's avatar
Sander Bollen committed
443
    shell: "samtools view -F 4 {input.bam} | cut -f10 | wc -c > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
444
445
446


rule unique_num:
Sander Bollen's avatar
Sander Bollen committed
447
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
448
    input:
Sander Bollen's avatar
Sander Bollen committed
449
        bam="{sample}/bams/{sample}.markdup.bam"
Sander Bollen's avatar
Sander Bollen committed
450
    output:
Sander Bollen's avatar
Sander Bollen committed
451
        num="{sample}/bams/{sample}.unique.num"
Sander Bollen's avatar
Sander Bollen committed
452
    conda: "envs/samtools.yml"
453
    shell: "samtools view -F 4 -F 1024 {input.bam} | wc -l > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
454
455
456


rule usable_basenum:
Sander Bollen's avatar
Sander Bollen committed
457
    """Calculate number of bases on unique reads"""
Sander Bollen's avatar
Sander Bollen committed
458
    input:
Sander Bollen's avatar
Sander Bollen committed
459
        bam="{sample}/bams/{sample}.markdup.bam"
Sander Bollen's avatar
Sander Bollen committed
460
    output:
Sander Bollen's avatar
Sander Bollen committed
461
        num="{sample}/bams/{sample}.usable.basenum"
Sander Bollen's avatar
Sander Bollen committed
462
463
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f10 | wc -c > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
464
465
466
467


## fastqc

Sander Bollen's avatar
Sander Bollen committed
468
rule fastqc_raw:
Sander Bollen's avatar
Sander Bollen committed
469
    """Run fastqc on raw fastq files"""
Sander Bollen's avatar
Sander Bollen committed
470
    input:
Sander Bollen's avatar
Sander Bollen committed
471
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
472
473
        r2=get_r2
    params:
Sander Bollen's avatar
Sander Bollen committed
474
        odir="{sample}/pre_process/raw_fastqc"
Sander Bollen's avatar
Sander Bollen committed
475
    output:
Sander Bollen's avatar
Sander Bollen committed
476
        aux="{sample}/pre_process/raw_fastqc/.done.txt"
477
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
478
    shell: "fastqc --nogroup -o {params.odir} {input.r1} {input.r2} && echo 'done' > {output.aux}"
Sander Bollen's avatar
Sander Bollen committed
479
480


Sander Bollen's avatar
Sander Bollen committed
481
rule fastqc_merged:
Sander Bollen's avatar
Sander Bollen committed
482
    """Run fastqc on merged fastq files"""
Sander Bollen's avatar
Sander Bollen committed
483
    input:
Sander Bollen's avatar
Sander Bollen committed
484
485
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
486
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
487
    params:
Sander Bollen's avatar
Sander Bollen committed
488
        odir="{sample}/pre_process/merged_fastqc"
Sander Bollen's avatar
Sander Bollen committed
489
    output:
Sander Bollen's avatar
Sander Bollen committed
490
491
        r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip"
492
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
493
494
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
495
496


Sander Bollen's avatar
Sander Bollen committed
497
rule fastqc_postqc:
Sander Bollen's avatar
Sander Bollen committed
498
    """Run fastqc on fastq files post pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
499
    input:
Sander Bollen's avatar
Sander Bollen committed
500
501
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq",
Sander Bollen's avatar
Sander Bollen committed
502
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
503
    params:
Sander Bollen's avatar
Sander Bollen committed
504
        odir="{sample}/pre_process/postqc_fastqc"
Sander Bollen's avatar
Sander Bollen committed
505
    output:
Sander Bollen's avatar
Sander Bollen committed
506
507
        r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip"
508
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
509
510
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
511
512
513
514
515


## fastq-count

rule fqcount_preqc:
Sander Bollen's avatar
Sander Bollen committed
516
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
517
    input:
Sander Bollen's avatar
Sander Bollen committed
518
519
        r1="{sample}/pre_process/{sample}.merged_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}.merged_R2.fastq.gz"
520
    params:
521
        fastqcount=fqc
Sander Bollen's avatar
Sander Bollen committed
522
    output:
Sander Bollen's avatar
Sander Bollen committed
523
        "{sample}/pre_process/{sample}.preqc_count.json"
524
    shell: "{params.fastqcount} {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
525
526
527


rule fqcount_postqc:
Sander Bollen's avatar
Sander Bollen committed
528
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
529
    input:
Sander Bollen's avatar
Sander Bollen committed
530
531
        r1="{sample}/pre_process/{sample}.cutadapt_R1.fastq",
        r2="{sample}/pre_process/{sample}.cutadapt_R2.fastq"
532
    params:
533
        fastqcount=fqc
Sander Bollen's avatar
Sander Bollen committed
534
    output:
Sander Bollen's avatar
Sander Bollen committed
535
        "{sample}/pre_process/{sample}.postqc_count.json"
Sander Bollen's avatar
Sander Bollen committed
536
537
538
    shell: "{params.fastqcount} {input.r1} {input.r2} > {output}"


Sander Bollen's avatar
Sander Bollen committed
539
540
541
542
# fastqc stats
rule fastqc_stats:
    """Collect fastq stats for a sample in json format"""
    input:
Sander Bollen's avatar
Sander Bollen committed
543
544
545
546
        preqc_r1="{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip",
        preqc_r2="{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip",
        postqc_r1="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip",
        postqc_r2="{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip",
Sander Bollen's avatar
Sander Bollen committed
547
        sc=fqpy
548
    conda: "envs/collectstats.yml"
Sander Bollen's avatar
Sander Bollen committed
549
    output:
Sander Bollen's avatar
Sander Bollen committed
550
        "{sample}/pre_process/fastq_stats.json"
551
552
553
    shell: "python {input.sc} --preqc-r1 {input.preqc_r1} "
           "--preqc-r2 {input.preqc_r2} "
           "--postqc-r1 {input.postqc_r1} "
Sander Bollen's avatar
Sander Bollen committed
554
           "--postqc-r2 {input.postqc_r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
555

Sander Bollen's avatar
Sander Bollen committed
556
557
558
## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
559
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
560
    input:
Sander Bollen's avatar
Sander Bollen committed
561
562
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
Sander Bollen's avatar
Sander Bollen committed
563
564
565
566
567
        covpy=covpy,
        bed=get_bedpath
    params:
        subt="Sample {sample}"
    output:
Sander Bollen's avatar
Sander Bollen committed
568
569
        covj="{sample}/coverage/{bed}.covstats.json",
        covp="{sample}/coverage/{bed}.covstats.png"
Sander Bollen's avatar
Sander Bollen committed
570
    conda: "envs/covstat.yml"
Sander Bollen's avatar
Sander Bollen committed
571
572
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} -b {input.bam} "
           "-d  | python {input.covpy} - --plot {output.covp} "
Sander Bollen's avatar
Sander Bollen committed
573
           "--title 'Targets coverage' --subtitle '{params.subt}' > {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
574
575


Sander Bollen's avatar
Sander Bollen committed
576
577
578
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
579
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
580
581
        ref=get_refflatpath
    output:
Sander Bollen's avatar
Sander Bollen committed
582
        tsv="{sample}/coverage/{ref}.coverages.tsv"
Sander Bollen's avatar
Sander Bollen committed
583
584
585
586
    conda: "envs/vcfstats.yml"
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
587
588
589
## vcfstats

rule vcfstats:
Sander Bollen's avatar
Sander Bollen committed
590
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
591
    input:
Sander Bollen's avatar
Sander Bollen committed
592
        vcf="multisample/genotyped.vcf.gz"
Sander Bollen's avatar
Sander Bollen committed
593
    output:
Sander Bollen's avatar
Sander Bollen committed
594
        stats="multisample/vcfstats.json"
Sander Bollen's avatar
Sander Bollen committed
595
    conda: "envs/vcfstats.yml"
Sander Bollen's avatar
Sander Bollen committed
596
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
597
598


Sander Bollen's avatar
Sander Bollen committed
599
## collection
600
601
602
603
if len(BASE_BEDS) >= 1:
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
Sander Bollen's avatar
Sander Bollen committed
604
605
606
607
608
609
610
            preqc="{sample}/pre_process/{sample}.preqc_count.json",
            postq="{sample}/pre_process/{sample}.postqc_count.json",
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
611
            cov=expand("{{sample}}/coverage/{bed}.covstats.json", bed=BASE_BEDS),
612
613
614
615
616
            colpy=colpy
        params:
            sample_name="{sample}",
            fthresh=FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
617
            "{sample}/{sample}.stats.json"
618
619
620
621
622
        conda: "envs/collectstats.yml"
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
623
624
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc} {input.cov} > {output}"
625
626
else:
    rule collectstats:
Sander Bollen's avatar
Sander Bollen committed
627
        """Collect all stats for a particular sample without beds"""
Sander Bollen's avatar
Sander Bollen committed
628
        input:
Sander Bollen's avatar
Sander Bollen committed
629
630
631
632
633
634
635
            preqc = "{sample}/pre_process/{sample}.preqc_count.json",
            postq = "{sample}/pre_process/{sample}.postqc_count.json",
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
            fastqc="{sample}/pre_process/fastq_stats.json",
Sander Bollen's avatar
Sander Bollen committed
636
637
638
639
640
            colpy = colpy
        params:
            sample_name = "{sample}",
            fthresh = FEMALE_THRESHOLD
        output:
Sander Bollen's avatar
Sander Bollen committed
641
            "{sample}/{sample}.stats.json"
Sander Bollen's avatar
Sander Bollen committed
642
643
644
645
646
        conda: "envs/collectstats.yml"
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
647
648
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc}  > {output}"
Sander Bollen's avatar
Sander Bollen committed
649
650

rule merge_stats:
Sander Bollen's avatar
Sander Bollen committed
651
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
652
    input:
Sander Bollen's avatar
Sander Bollen committed
653
654
        cols=expand("{sample}/{sample}.stats.json", sample=SAMPLES),
        vstat="multisample/vcfstats.json",
Sander Bollen's avatar
Sander Bollen committed
655
656
        mpy=mpy
    output:
Sander Bollen's avatar
Sander Bollen committed
657
        stats="stats.json"
Sander Bollen's avatar
Sander Bollen committed
658
659
    conda: "envs/collectstats.yml"
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
660
661


Sander Bollen's avatar
Sander Bollen committed
662
663
664
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
Sander Bollen's avatar
Sander Bollen committed
665
        stats="stats.json",
Sander Bollen's avatar
Sander Bollen committed
666
667
        sc=tsvpy
    output:
Sander Bollen's avatar
Sander Bollen committed
668
        stats="stats.tsv"
Sander Bollen's avatar
Sander Bollen committed
669
670
671
672
    conda: "envs/collectstats.yml"
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
673
674
675
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
676
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
677
678
    """
    input:
Sander Bollen's avatar
Sander Bollen committed
679
        stats="stats.tsv"
Sander Bollen's avatar
Sander Bollen committed
680
    params:
Sander Bollen's avatar
Sander Bollen committed
681
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
682
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
683
    output:
Sander Bollen's avatar
Sander Bollen committed
684
        report="multiqc_report/multiqc_report.html"
Sander Bollen's avatar
Sander Bollen committed
685
    conda: "envs/multiqc.yml"
686
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output.report}"