Snakefile 24.1 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
import json
Sander Bollen's avatar
Sander Bollen committed
2
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
3
from os.path import join, basename
4
from pathlib import Path
Sander Bollen's avatar
Sander Bollen committed
5
6

from pyfaidx import Fasta
Sander Bollen's avatar
Sander Bollen committed
7

Sander Bollen's avatar
Sander Bollen committed
8
OUT_DIR = config.get("OUTPUT_DIR")  # TODO: use regular snakemake option?
9
10
11
if OUT_DIR is None:
    raise ValueError("You must set --config OUT_DIR=<path>")

Sander Bollen's avatar
Sander Bollen committed
12
REFERENCE = config.get("REFERENCE")
13
14
if REFERENCE is None:
    raise ValueError("You must set --config REFERENCE=<path>")
Sander Bollen's avatar
Sander Bollen committed
15
if not Path(REFERENCE).exists():
Sander Bollen's avatar
Sander Bollen committed
16
17
    raise FileNotFoundError("Reference file {0} "
                            "does not exist.".format(REFERENCE))
18
19
20
21

JAVA = config.get("JAVA")  # TODO: should be handled by conda?!
if JAVA is None:
    raise ValueError("You must set --config JAVA=<path>")
Sander Bollen's avatar
Sander Bollen committed
22
23
if not Path(JAVA).exists():
    raise FileNotFoundError("{0} does not exist".format(JAVA))
24

Sander Bollen's avatar
Sander Bollen committed
25
GATK = config.get("GATK")
26
27
if GATK is None:
    raise ValueError("You must set --config GATK=<path>")
Sander Bollen's avatar
Sander Bollen committed
28
29
if not Path(GATK).exists():
    raise FileNotFoundError("{0} does not exist.".format(GATK))
30

Sander Bollen's avatar
Sander Bollen committed
31
DBSNP = config.get("DBSNP")
32
33
if DBSNP is None:
    raise ValueError("You must set --config DBSNP=<path>")
Sander Bollen's avatar
Sander Bollen committed
34
35
if not Path(DBSNP).exists():
    raise FileNotFoundError("{0} does not exist".format(DBSNP))
36

Sander Bollen's avatar
Sander Bollen committed
37
ONETHOUSAND = config.get("ONETHOUSAND")
38
39
if ONETHOUSAND is None:
    raise ValueError("You must set --config ONETHOUSAND=<path>")
Sander Bollen's avatar
Sander Bollen committed
40
41
if not Path(ONETHOUSAND).exists():
    raise FileNotFoundError("{0} does not exist".format(ONETHOUSAND))
42

Sander Bollen's avatar
Sander Bollen committed
43
HAPMAP = config.get("HAPMAP")
44
if HAPMAP is None:
Sander Bollen's avatar
Sander Bollen committed
45
46
47
    raise ValueError("You must set --config HAPMAP=<path>")
if not Path(HAPMAP).exists():
    raise FileNotFoundError("{0} does not exist".format(HAPMAP))
48
49

# these are all optional
50
51
BED = config.get("BED", "")  # comma-separated list of BED files
REFFLAT = config.get("REFFLAT", "")  # comma-separated list of refFlat files
Sander Bollen's avatar
Sander Bollen committed
52
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
53
FASTQ_COUNT = config.get("FASTQ_COUNT")
Sander Bollen's avatar
Sander Bollen committed
54
MAX_BASES = config.get("MAX_BASES", "")
Sander Bollen's avatar
Sander Bollen committed
55

56
57
58
59
60
def fsrc_dir(*args):
    """Wrapper around snakemake's srcdir to work like os.path.join"""
    if len(args) == 1:
        return srcdir(args[0])
    return srcdir(join(*args))
Sander Bollen's avatar
Sander Bollen committed
61

62
63
64
covpy = fsrc_dir("src", "covstats.py")
colpy = fsrc_dir("src", "collect_stats.py")
mpy = fsrc_dir("src", "merge_stats.py")
Sander Bollen's avatar
Sander Bollen committed
65
seq = fsrc_dir("src", "seqtk.sh")
Sander Bollen's avatar
Sander Bollen committed
66
fqpy = fsrc_dir("src", "fastqc_stats.py")
Sander Bollen's avatar
Sander Bollen committed
67
tsvpy = fsrc_dir("src", "stats_to_tsv.py")
Sander Bollen's avatar
Sander Bollen committed
68
fqsc = fsrc_dir("src", "safe_fastqc.sh")
Sander Bollen's avatar
Sander Bollen committed
69

70
if FASTQ_COUNT is None:
71
    fqc = "python {0}".format(fsrc_dir("src", "fastq-count.py"))
72
73
74
else:
    fqc = FASTQ_COUNT

Sander Bollen's avatar
Sander Bollen committed
75
76
77
78
79
80
81
# sample config parsing
SCONFIG = config.get("SAMPLE_CONFIG")
if SCONFIG is None:
    raise ValueError("You must set --config SAMPLE_CONFIG=<path>")
if not Path(SCONFIG).exists():
    raise FileNotFoundError("{0} does not exist".format(SCONFIG))

Sander Bollen's avatar
Sander Bollen committed
82
83
84
85
with open(config.get("SAMPLE_CONFIG")) as handle:
    SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()

Sander Bollen's avatar
Sander Bollen committed
86
87
88
89
90
91
92
93
94
if BED != "":
    BEDS = BED.split(",")
else:
    BEDS = []

if REFFLAT != "":
    REFFLATS = REFFLAT.split(",")
else:
    REFFLATS = []
Sander Bollen's avatar
Sander Bollen committed
95
96

BASE_BEDS = [basename(x) for x in BEDS]
Sander Bollen's avatar
Sander Bollen committed
97
BASE_REFFLATS = [basename(x) for x in REFFLATS]
Sander Bollen's avatar
Sander Bollen committed
98

Sander Bollen's avatar
Sander Bollen committed
99
100

def split_genome(ref, approx_n_chunks=100):
Sander Bollen's avatar
Sander Bollen committed
101
102
103
104
105
106
107
108
    """
    Split genome in chunks.

    Chunks are strings in the format: `<ctg>:<start>-<end>`
    These follow the region string format as used by htslib,
    which uses _1_-based indexing.
    See: http://www.htslib.org/doc/tabix.html
    """
Sander Bollen's avatar
Sander Bollen committed
109
110
111
112
113
    fa = Fasta(ref)
    tot_size = sum([len(x) for x in fa.records.values()])
    chunk_size = tot_size//approx_n_chunks
    chunks = []
    for chrom_name, chrom_value in fa.records.items():
Sander Bollen's avatar
Sander Bollen committed
114
        pos = 1
Sander Bollen's avatar
Sander Bollen committed
115
116
117
118
119
120
121
122
123
124
125
126
127
        while pos <= len(chrom_value):
            end = pos+chunk_size
            if end <= len(chrom_value):
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, end)
            else:
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, len(chrom_value))
            chunks.append(chunk)
            pos = end
    return chunks

CHUNKS = split_genome(REFERENCE)


Sander Bollen's avatar
Sander Bollen committed
128
129
130
def out_path(path):
    return join(OUT_DIR, path)

Sander Bollen's avatar
Sander Bollen committed
131
132
def get_r(strand, wildcards):
    """Get fastq files on a single strand for a sample"""
Sander Bollen's avatar
Sander Bollen committed
133
    s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
Sander Bollen's avatar
Sander Bollen committed
134
    rs = []
135
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
Sander Bollen committed
136
137
        rs.append(s['libraries'][l][strand])
    return rs
Sander Bollen's avatar
Sander Bollen committed
138

Sander Bollen's avatar
Sander Bollen committed
139
140
get_r1 = partial(get_r, "R1")
get_r2 = partial(get_r, "R2")
Sander Bollen's avatar
Sander Bollen committed
141
142


Sander Bollen's avatar
Sander Bollen committed
143
def get_bedpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
144
145
    """Get absolute path of a bed file"""
    return next(x for x in BEDS if basename(x) == wildcards.bed)
Sander Bollen's avatar
Sander Bollen committed
146
147
148


def get_refflatpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
149
    """Get absolute path of a refflat file"""
Sander Bollen's avatar
Sander Bollen committed
150
    return next(x for x in REFFLATS if basename(x) == wildcards.ref)
Sander Bollen's avatar
Sander Bollen committed
151
152


Sander Bollen's avatar
Sander Bollen committed
153
def sample_gender(wildcards):
Sander Bollen's avatar
Sander Bollen committed
154
    """Get sample gender"""
Sander Bollen's avatar
Sander Bollen committed
155
156
157
158
    sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
    return sam.get("gender", "null")


Sander Bollen's avatar
Sander Bollen committed
159
160
161
162
163
164
def metrics(do_metrics=True):
    if not do_metrics:
        return ""

    fqcr = expand(out_path("{sample}/pre_process/raw_fastqc/.done.txt"),
                  sample=SAMPLES)
Sander Bollen's avatar
Sander Bollen committed
165
    fqcm = expand(out_path("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip"),
Sander Bollen's avatar
Sander Bollen committed
166
                  sample=SAMPLES)
Sander Bollen's avatar
Sander Bollen committed
167
    fqcp = expand(out_path("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip"),
Sander Bollen's avatar
Sander Bollen committed
168
                  sample=SAMPLES)
169
170
171
172
173
    if len(REFFLATS) >= 1:
        coverage_stats = expand(out_path("{sample}/coverage/{ref}.coverages.tsv"),
                                sample=SAMPLES, ref=BASE_REFFLATS)
    else:
        coverage_stats = []
Sander Bollen's avatar
Sander Bollen committed
174
    stats = out_path("stats.json")
Sander Bollen's avatar
Sander Bollen committed
175
    return  fqcr + fqcm + fqcp + coverage_stats + [stats]
Sander Bollen's avatar
Sander Bollen committed
176
177


Sander Bollen's avatar
Sander Bollen committed
178
179
rule all:
    input:
Sander Bollen's avatar
Sander Bollen committed
180
        combined=out_path("multisample/genotyped.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
181
        report=out_path("multiqc_report/multiqc_report.html"),
Sander Bollen's avatar
Sander Bollen committed
182
183
        bais=expand(out_path("{sample}/bams/{sample}.markdup.bam.bai"),
                    sample=SAMPLES),
184
185
        vcfs=expand(out_path("{sample}/vcf/{sample}_single.vcf.gz"),
                    sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
186
        stats=metrics()
Sander Bollen's avatar
Sander Bollen committed
187

Sander Bollen's avatar
Sander Bollen committed
188
189
190
191
192
193

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
    output: ancient(out_path("tmp"))
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
194
rule genome:
Sander Bollen's avatar
Sander Bollen committed
195
    """Create genome file as used by bedtools"""
Sander Bollen's avatar
Sander Bollen committed
196
197
198
199
200
    input: REFERENCE
    output: out_path("current.genome")
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
Sander Bollen's avatar
Sander Bollen committed
201
    """Merge all forward fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
202
203
204
205
206
    input: get_r1
    output: temp(out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"))
    shell: "cat {input} > {output}"

rule merge_r2:
Sander Bollen's avatar
Sander Bollen committed
207
    """Merge all reverse fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
208
209
210
211
    input: get_r2
    output: temp(out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz"))
    shell: "cat {input} > {output}"

Sander Bollen's avatar
Sander Bollen committed
212
213

rule seqtk_r1:
Sander Bollen's avatar
Sander Bollen committed
214
    """Either subsample or link forward fastq file"""
Sander Bollen's avatar
Sander Bollen committed
215
216
    input:
        stats=out_path("{sample}/pre_process/{sample}.preqc_count.json"),
Sander Bollen's avatar
Sander Bollen committed
217
218
        fastq=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
219
    params:
Sander Bollen's avatar
Sander Bollen committed
220
        max_bases=str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
221
222
    output:
        fastq=temp(out_path("{sample}/pre_process/{sample}.sampled_R1.fastq.gz"))
Sander Bollen's avatar
Sander Bollen committed
223
    conda: "envs/seqtk.yml"
Sander Bollen's avatar
Sander Bollen committed
224
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} {params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
225
226
227


rule seqtk_r2:
Sander Bollen's avatar
Sander Bollen committed
228
    """Either subsample or link reverse fastq file"""
Sander Bollen's avatar
Sander Bollen committed
229
230
    input:
        stats = out_path("{sample}/pre_process/{sample}.preqc_count.json"),
Sander Bollen's avatar
Sander Bollen committed
231
232
        fastq = out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz"),
        seqtk=seq
Sander Bollen's avatar
Sander Bollen committed
233
    params:
Sander Bollen's avatar
Sander Bollen committed
234
        max_bases =str(MAX_BASES)
Sander Bollen's avatar
Sander Bollen committed
235
236
    output:
        fastq = temp(out_path("{sample}/pre_process/{sample}.sampled_R2.fastq.gz"))
Sander Bollen's avatar
Sander Bollen committed
237
    conda: "envs/seqtk.yml"
Sander Bollen's avatar
Sander Bollen committed
238
    shell: "bash {input.seqtk} {input.stats} {input.fastq} {output.fastq} {params.max_bases}"
Sander Bollen's avatar
Sander Bollen committed
239
240


241
# contains original merged fastq files as input to prevent them from being prematurely deleted
Sander Bollen's avatar
Sander Bollen committed
242
rule sickle:
Sander Bollen's avatar
Sander Bollen committed
243
    """Trim fastq files"""
Sander Bollen's avatar
Sander Bollen committed
244
    input:
Sander Bollen's avatar
Sander Bollen committed
245
        r1 = out_path("{sample}/pre_process/{sample}.sampled_R1.fastq.gz"),
246
247
248
        r2 = out_path("{sample}/pre_process/{sample}.sampled_R2.fastq.gz"),
        rr1 = out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
        rr2 = out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
Sander Bollen's avatar
Sander Bollen committed
249
250
251
252
253
    output:
        r1 = temp(out_path("{sample}/pre_process/{sample}.trimmed_R1.fastq")),
        r2 = temp(out_path("{sample}/pre_process/{sample}.trimmed_R2.fastq")),
        s = out_path("{sample}/pre_process/{sample}.trimmed_singles.fastq"),
    conda: "envs/sickle.yml"
Sander Bollen's avatar
Sander Bollen committed
254
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
255
256
257
           "-p {output.r2} -s {output.s}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
258
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
259
260
261
262
263
264
265
    input:
        r1 = out_path("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
        r2 = out_path("{sample}/pre_process/{sample}.trimmed_R2.fastq")
    output:
        r1 = temp(out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq")),
        r2 = temp(out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq"))
    conda: "envs/cutadapt.yml"
Sander Bollen's avatar
Sander Bollen committed
266
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} "
Sander Bollen's avatar
Sander Bollen committed
267
268
269
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
Sander Bollen's avatar
Sander Bollen committed
270
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
271
272
273
274
275
276
277
278
    input:
        r1 = out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2 = out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq"),
        ref = REFERENCE
    params:
        rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
    output: temp(out_path("{sample}/bams/{sample}.sorted.bam"))
    conda: "envs/bwa.yml"
Sander Bollen's avatar
Sander Bollen committed
279
280
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
           "| picard SortSam CREATE_INDEX=TRUE TMP_DIR=null "
Sander Bollen's avatar
Sander Bollen committed
281
282
283
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

rule markdup:
Sander Bollen's avatar
Sander Bollen committed
284
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
285
286
    input:
        bam = out_path("{sample}/bams/{sample}.sorted.bam"),
287
        tmp = ancient(out_path("tmp"))
Sander Bollen's avatar
Sander Bollen committed
288
    output:
Sander Bollen's avatar
Sander Bollen committed
289
        bam = out_path("{sample}/bams/{sample}.markdup.bam"),
Sander Bollen's avatar
Sander Bollen committed
290
        bai = out_path("{sample}/bams/{sample}.markdup.bai"),
Sander Bollen's avatar
Sander Bollen committed
291
292
        metrics = out_path("{sample}/bams/{sample}.markdup.metrics")
    conda: "envs/picard.yml"
Sander Bollen's avatar
Sander Bollen committed
293
294
295
    shell: "picard MarkDuplicates CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "INPUT={input.bam} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
296
297
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
298
299
300
301
302
303
304
305
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
        bai = out_path("{sample}/bams/{sample}.markdup.bai")
    output:
        bai = out_path("{sample}/bams/{sample}.markdup.bam.bai")
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
306
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
307
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
308
309
310
311
312
313
314
315
316
317
318
    input:
        bam = out_path("{sample}/bams/{sample}.markdup.bam"),
        java = JAVA,
        gatk = GATK,
        ref = REFERENCE,
        dbsnp = DBSNP,
        one1kg = ONETHOUSAND,
        hapmap = HAPMAP
    output:
        grp = out_path("{sample}/bams/{sample}.baserecal.grp")
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
319
    shell: "{input.java} -XX:ParallelGCThreads=1 -jar {input.gatk} -T BaseRecalibrator "
Sander Bollen's avatar
Sander Bollen committed
320
321
322
323
           "-I {input.bam} -o {output.grp} -nct 8 -R {input.ref} "
           "-cov ReadGroupCovariate -cov QualityScoreCovariate "
           "-cov CycleCovariate -cov ContextCovariate -knownSites "
           "{input.dbsnp} -knownSites {input.one1kg} "
Sander Bollen's avatar
Sander Bollen committed
324
325
           "-knownSites {input.hapmap}"

Sander Bollen's avatar
Sander Bollen committed
326
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
327
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
328
    input:
Sander Bollen's avatar
Sander Bollen committed
329
330
        bam=out_path("{sample}/bams/{sample}.markdup.bam"),
        bqsr=out_path("{sample}/bams/{sample}.baserecal.grp"),
Sander Bollen's avatar
Sander Bollen committed
331
        dbsnp=DBSNP,
Sander Bollen's avatar
Sander Bollen committed
332
333
        ref=REFERENCE,
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
334
    params:
Sander Bollen's avatar
Sander Bollen committed
335
        chunk="{chunk}"
Sander Bollen's avatar
Sander Bollen committed
336
    output:
337
338
        gvcf=temp(out_path("{sample}/vcf/{sample}.{chunk}.part.vcf.gz")),
        gvcf_tbi=temp(out_path("{sample}/vcf/{sample}.{chunk}.part.vcf.gz.tbi"))
Sander Bollen's avatar
Sander Bollen committed
339
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
340
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 {input.gatk} -T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
341
342
343
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
           "-L '{params.chunk}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
Sander Bollen's avatar
Sander Bollen committed
344
           "-BQSR {input.bqsr}"
Sander Bollen's avatar
Sander Bollen committed
345
346


Sander Bollen's avatar
Sander Bollen committed
347
rule gvcf_gather:
Sander Bollen's avatar
Sander Bollen committed
348
    """Gather all gvcf scatters"""
Sander Bollen's avatar
Sander Bollen committed
349
    input:
Sander Bollen's avatar
Sander Bollen committed
350
351
        gvcfs=expand(out_path("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz"),
                     chunk=CHUNKS),
352
353
        tbis=expand(out_path("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz.tbi"),
                     chunk=CHUNKS),
Sander Bollen's avatar
Sander Bollen committed
354
        ref=REFERENCE,
Sander Bollen's avatar
Sander Bollen committed
355
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
356
    params:
Sander Bollen's avatar
Sander Bollen committed
357
        gvcfs="' -V '".join(expand(out_path("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
358
                                 chunk=CHUNKS))
Sander Bollen's avatar
Sander Bollen committed
359
360
    output:
        gvcf=out_path("{sample}/vcf/{sample}.g.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
361
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
362
    shell: "java -Xmx4G -XX:ParallelGCThreads=1 -cp {input.gatk} org.broadinstitute.gatk.tools.CatVariants "
Sander Bollen's avatar
Sander Bollen committed
363
           "-R {input.ref} -V '{params.gvcfs}' -out {output.gvcf} "
Sander Bollen's avatar
Sander Bollen committed
364
365
366
367
           "-assumeSorted"


rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
368
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
369
    input:
Sander Bollen's avatar
Sander Bollen committed
370
371
        gvcfs = expand(out_path("{sample}/vcf/{sample}.g.vcf.gz"),
                       sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
372
        ref=REFERENCE,
Sander Bollen's avatar
Sander Bollen committed
373
374
375
        gatk=GATK
    params:
        li=" -V ".join(expand(out_path("{sample}/vcf/{sample}.g.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
376
                              sample=SAMPLES)),
Sander Bollen's avatar
Sander Bollen committed
377
378
        chunk="{chunk}"
    output:
379
380
        vcf=temp(out_path("multisample/genotype.{chunk}.part.vcf.gz")),
        vcf_tbi=temp(out_path("multisample/genotype.{chunk}.part.vcf.gz.tbi"))
Sander Bollen's avatar
Sander Bollen committed
381
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
382
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 {input.gatk} -T GenotypeGVCFs -R {input.ref} "
Sander Bollen's avatar
Sander Bollen committed
383
           "-V {params.li} -L '{params.chunk}' -o '{output.vcf}'"
Sander Bollen's avatar
Sander Bollen committed
384
385
386


rule genotype_gather:
Sander Bollen's avatar
Sander Bollen committed
387
    """Gather all genotyping scatters"""
Sander Bollen's avatar
Sander Bollen committed
388
389
390
    input:
        vcfs=expand(out_path("multisample/genotype.{chunk}.part.vcf.gz"),
                    chunk=CHUNKS),
391
392
        tbis=expand(out_path("multisample/genotype.{chunk}.part.vcf.gz.tbi"),
                    chunk=CHUNKS),
Sander Bollen's avatar
Sander Bollen committed
393
394
        ref=REFERENCE,
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
395
    params:
Sander Bollen's avatar
Sander Bollen committed
396
        vcfs="' -V '".join(expand(out_path("multisample/genotype.{chunk}.part.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
397
                                chunk=CHUNKS))
Sander Bollen's avatar
Sander Bollen committed
398
399
400
    output:
        combined=out_path("multisample/genotyped.vcf.gz")
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
401
    shell: "java -Xmx4G -XX:ParallelGCThreads=1 -cp {input.gatk} org.broadinstitute.gatk.tools.CatVariants "
Sander Bollen's avatar
Sander Bollen committed
402
           "-R {input.ref} -V '{params.vcfs}' -out {output.combined} "
Sander Bollen's avatar
Sander Bollen committed
403
           "-assumeSorted"
Sander Bollen's avatar
Sander Bollen committed
404
405


406
407
408
409
410
411
412
413
414
415
416
rule split_vcf:
    """Split multisample VCF in single samples"""
    input:
        vcf=out_path("multisample/genotyped.vcf.gz"),
        gatk=GATK,
        ref=REFERENCE
    params:
        s="{sample}"
    output:
        splitted=out_path("{sample}/vcf/{sample}_single.vcf.gz")
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
417
    shell: "java -Xmx15G -XX:ParallelGCThreads=1 -jar {input.gatk} -T SelectVariants -sn "
418
           "{params.s} -env -R {input.ref} -V {input.vcf} -o {output.splitted}"
419
420


Sander Bollen's avatar
Sander Bollen committed
421
422
423
## bam metrics

rule mapped_num:
Sander Bollen's avatar
Sander Bollen committed
424
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
425
426
427
428
429
430
431
432
433
    input:
        bam=out_path("{sample}/bams/{sample}.sorted.bam")
    output:
        num=out_path("{sample}/bams/{sample}.mapped.num")
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 {input.bam} | wc -l > {output.num}"


rule mapped_basenum:
Sander Bollen's avatar
Sander Bollen committed
434
    """Calculate number of mapped bases"""
Sander Bollen's avatar
Sander Bollen committed
435
436
437
438
439
    input:
        bam=out_path("{sample}/bams/{sample}.sorted.bam")
    output:
        num=out_path("{sample}/bams/{sample}.mapped.basenum")
    conda: "envs/samtools.yml"
Sander Bollen's avatar
Sander Bollen committed
440
    shell: "samtools view -F 4 {input.bam} | cut -f10 | wc -c > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
441
442
443


rule unique_num:
Sander Bollen's avatar
Sander Bollen committed
444
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
445
446
447
448
449
    input:
        bam=out_path("{sample}/bams/{sample}.markdup.bam")
    output:
        num=out_path("{sample}/bams/{sample}.unique.num")
    conda: "envs/samtools.yml"
450
    shell: "samtools view -F 4 -F 1024 {input.bam} | wc -l > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
451
452
453


rule usable_basenum:
Sander Bollen's avatar
Sander Bollen committed
454
    """Calculate number of bases on unique reads"""
Sander Bollen's avatar
Sander Bollen committed
455
456
457
458
459
460
    input:
        bam=out_path("{sample}/bams/{sample}.markdup.bam")
    output:
        num=out_path("{sample}/bams/{sample}.usable.basenum")
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f10 | wc -c > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
461
462
463
464


## fastqc

Sander Bollen's avatar
Sander Bollen committed
465
rule fastqc_raw:
Sander Bollen's avatar
Sander Bollen committed
466
    """Run fastqc on raw fastq files"""
Sander Bollen's avatar
Sander Bollen committed
467
    input:
Sander Bollen's avatar
Sander Bollen committed
468
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
469
470
471
472
473
        r2=get_r2
    params:
        odir=out_path("{sample}/pre_process/raw_fastqc")
    output:
        aux=out_path("{sample}/pre_process/raw_fastqc/.done.txt")
474
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
475
    shell: "fastqc --nogroup -o {params.odir} {input.r1} {input.r2} && echo 'done' > {output.aux}"
Sander Bollen's avatar
Sander Bollen committed
476
477


Sander Bollen's avatar
Sander Bollen committed
478
rule fastqc_merged:
Sander Bollen's avatar
Sander Bollen committed
479
    """Run fastqc on merged fastq files"""
Sander Bollen's avatar
Sander Bollen committed
480
    input:
Sander Bollen's avatar
Sander Bollen committed
481
        r1=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
Sander Bollen's avatar
Sander Bollen committed
482
483
        r2=out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz"),
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
484
485
486
    params:
        odir=out_path("{sample}/pre_process/merged_fastqc")
    output:
Sander Bollen's avatar
Sander Bollen committed
487
488
        r1=out_path("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip"),
        r2=out_path("{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip")
489
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
490
491
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
492
493


Sander Bollen's avatar
Sander Bollen committed
494
rule fastqc_postqc:
Sander Bollen's avatar
Sander Bollen committed
495
    """Run fastqc on fastq files post pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
496
    input:
Sander Bollen's avatar
Sander Bollen committed
497
        r1=out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
Sander Bollen's avatar
Sander Bollen committed
498
499
        r2=out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq"),
        fq=fqsc
Sander Bollen's avatar
Sander Bollen committed
500
501
502
    params:
        odir=out_path("{sample}/pre_process/postqc_fastqc")
    output:
Sander Bollen's avatar
Sander Bollen committed
503
        r1=out_path("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip"),
Sander Bollen's avatar
Sander Bollen committed
504
        r2=out_path("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip")
505
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
506
507
    shell: "bash {input.fq} {input.r1} {input.r2} "
           "{output.r1} {output.r2} {params.odir}"
Sander Bollen's avatar
Sander Bollen committed
508
509
510
511
512


## fastq-count

rule fqcount_preqc:
Sander Bollen's avatar
Sander Bollen committed
513
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
514
515
516
    input:
        r1=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
        r2=out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
517
    params:
518
        fastqcount=fqc
Sander Bollen's avatar
Sander Bollen committed
519
520
    output:
        out_path("{sample}/pre_process/{sample}.preqc_count.json")
521
    shell: "{params.fastqcount} {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
522
523
524


rule fqcount_postqc:
Sander Bollen's avatar
Sander Bollen committed
525
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
526
527
528
    input:
        r1=out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2=out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
529
    params:
530
        fastqcount=fqc
Sander Bollen's avatar
Sander Bollen committed
531
532
    output:
        out_path("{sample}/pre_process/{sample}.postqc_count.json")
Sander Bollen's avatar
Sander Bollen committed
533
534
535
    shell: "{params.fastqcount} {input.r1} {input.r2} > {output}"


Sander Bollen's avatar
Sander Bollen committed
536
537
538
539
540
541
542
543
544
# fastqc stats
rule fastqc_stats:
    """Collect fastq stats for a sample in json format"""
    input:
        preqc_r1=out_path("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip"),
        preqc_r2=out_path("{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip"),
        postqc_r1=out_path("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip"),
        postqc_r2=out_path("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R2_fastqc.zip"),
        sc=fqpy
545
    conda: "envs/collectstats.yml"
Sander Bollen's avatar
Sander Bollen committed
546
547
    output:
        out_path("{sample}/pre_process/fastq_stats.json")
548
549
550
    shell: "python {input.sc} --preqc-r1 {input.preqc_r1} "
           "--preqc-r2 {input.preqc_r2} "
           "--postqc-r1 {input.postqc_r1} "
Sander Bollen's avatar
Sander Bollen committed
551
           "--postqc-r2 {input.postqc_r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
552

Sander Bollen's avatar
Sander Bollen committed
553
554
555
## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
556
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
557
558
559
560
561
562
563
564
565
    input:
        bam=out_path("{sample}/bams/{sample}.markdup.bam"),
        genome=out_path("current.genome"),
        covpy=covpy,
        bed=get_bedpath
    params:
        subt="Sample {sample}"
    output:
        covj=out_path("{sample}/coverage/{bed}.covstats.json"),
Sander Bollen's avatar
Sander Bollen committed
566
        covp=out_path("{sample}/coverage/{bed}.covstats.png")
Sander Bollen's avatar
Sander Bollen committed
567
    conda: "envs/covstat.yml"
Sander Bollen's avatar
Sander Bollen committed
568
569
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} -b {input.bam} "
           "-d  | python {input.covpy} - --plot {output.covp} "
Sander Bollen's avatar
Sander Bollen committed
570
           "--title 'Targets coverage' --subtitle '{params.subt}' > {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
571
572


Sander Bollen's avatar
Sander Bollen committed
573
574
575
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
576
        gvcf=out_path("{sample}/vcf/{sample}.g.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
577
578
579
580
581
582
583
        ref=get_refflatpath
    output:
        tsv=out_path("{sample}/coverage/{ref}.coverages.tsv")
    conda: "envs/vcfstats.yml"
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
584
585
586
## vcfstats

rule vcfstats:
Sander Bollen's avatar
Sander Bollen committed
587
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
588
    input:
Sander Bollen's avatar
Sander Bollen committed
589
        vcf=out_path("multisample/genotyped.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
590
591
592
    output:
        stats=out_path("multisample/vcfstats.json")
    conda: "envs/vcfstats.yml"
Sander Bollen's avatar
Sander Bollen committed
593
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
594
595


Sander Bollen's avatar
Sander Bollen committed
596
## collection
597
598
599
600
601
602
603
604
605
606
if len(BASE_BEDS) >= 1:
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
            preqc=out_path("{sample}/pre_process/{sample}.preqc_count.json"),
            postq=out_path("{sample}/pre_process/{sample}.postqc_count.json"),
            mnum=out_path("{sample}/bams/{sample}.mapped.num"),
            mbnum=out_path("{sample}/bams/{sample}.mapped.basenum"),
            unum=out_path("{sample}/bams/{sample}.unique.num"),
            ubnum=out_path("{sample}/bams/{sample}.usable.basenum"),
607
            fastqc=out_path("{sample}/pre_process/fastq_stats.json"),
608
609
610
611
612
613
614
615
616
617
618
619
620
            cov=expand(out_path("{{sample}}/coverage/{bed}.covstats.json"),
                       bed=BASE_BEDS),
            colpy=colpy
        params:
            sample_name="{sample}",
            fthresh=FEMALE_THRESHOLD
        output:
            out_path("{sample}/{sample}.stats.json")
        conda: "envs/collectstats.yml"
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
621
622
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc} {input.cov} > {output}"
623
624
else:
    rule collectstats:
Sander Bollen's avatar
Sander Bollen committed
625
        """Collect all stats for a particular sample without beds"""
Sander Bollen's avatar
Sander Bollen committed
626
627
628
629
630
631
632
        input:
            preqc = out_path("{sample}/pre_process/{sample}.preqc_count.json"),
            postq = out_path("{sample}/pre_process/{sample}.postqc_count.json"),
            mnum = out_path("{sample}/bams/{sample}.mapped.num"),
            mbnum = out_path("{sample}/bams/{sample}.mapped.basenum"),
            unum = out_path("{sample}/bams/{sample}.unique.num"),
            ubnum = out_path("{sample}/bams/{sample}.usable.basenum"),
633
            fastqc=out_path("{sample}/pre_process/fastq_stats.json"),
Sander Bollen's avatar
Sander Bollen committed
634
635
636
637
638
639
640
641
642
643
644
            colpy = colpy
        params:
            sample_name = "{sample}",
            fthresh = FEMALE_THRESHOLD
        output:
            out_path("{sample}/{sample}.stats.json")
        conda: "envs/collectstats.yml"
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
645
646
               "--female-threshold {params.fthresh} "
               "--fastqc-stats {input.fastqc}  > {output}"
Sander Bollen's avatar
Sander Bollen committed
647
648

rule merge_stats:
Sander Bollen's avatar
Sander Bollen committed
649
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
650
651
652
653
654
655
656
657
    input:
        cols=expand(out_path("{sample}/{sample}.stats.json"), sample=SAMPLES),
        vstat=out_path("multisample/vcfstats.json"),
        mpy=mpy
    output:
        stats=out_path("stats.json")
    conda: "envs/collectstats.yml"
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
658
659


Sander Bollen's avatar
Sander Bollen committed
660
661
662
663
664
665
666
667
668
669
670
rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
        stats=out_path("stats.json"),
        sc=tsvpy
    output:
        stats=out_path("stats.tsv")
    conda: "envs/collectstats.yml"
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
671
672
673
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
674
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
675
676
    """
    input:
Sander Bollen's avatar
Sander Bollen committed
677
        stats=out_path("stats.tsv")
Sander Bollen's avatar
Sander Bollen committed
678
    params:
679
680
        odir=out_path("."),
        rdir=out_path("multiqc_report")
Sander Bollen's avatar
Sander Bollen committed
681
682
    output:
        report=out_path("multiqc_report/multiqc_report.html")
Sander Bollen's avatar
Sander Bollen committed
683
    conda: "envs/multiqc.yml"
684
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output.report}"