Snakefile 13.5 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
import json
Sander Bollen's avatar
Sander Bollen committed
2
from os.path import join, basename
Sander Bollen's avatar
Sander Bollen committed
3
from os import mkdir
Sander Bollen's avatar
Sander Bollen committed
4
5

from pyfaidx import Fasta
Sander Bollen's avatar
Sander Bollen committed
6
7
8
9
10
11
12
13
14

OUT_DIR = config.get("OUTPUT_DIR")
REFERENCE = config.get("REFERENCE")
JAVA = config.get("JAVA")
GATK = config.get("GATK")
DBSNP = config.get("DBSNP")
ONETHOUSAND = config.get("ONETHOUSAND")
HAPMAP = config.get("HAPMAP")
QUEUE = config.get("QUEUE")
15
16
BED = config.get("BED", "")  # comma-separated list of BED files
REFFLAT = config.get("REFFLAT", "")  # comma-separated list of refFlat files
Sander Bollen's avatar
Sander Bollen committed
17
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
18
FASTQ_COUNT = config.get("FASTQ_COUNT")
Sander Bollen's avatar
Sander Bollen committed
19
20
21
22
23
24
25
26

_this_dir = workflow.current_basedir


env_dir = join(_this_dir, "envs")
main_env = join(_this_dir, "environment.yml")

settings_template = join(join(_this_dir, "templates"), "pipeline_settings.md.j2")
Sander Bollen's avatar
Sander Bollen committed
27
covpy = join(join(_this_dir, "src"), "covstats.py")
Sander Bollen's avatar
Sander Bollen committed
28
29
30
31
32

with open(config.get("SAMPLE_CONFIG")) as handle:
    SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()

Sander Bollen's avatar
Sander Bollen committed
33
34
35
36
37
38
BEDS = BED.split(",")
REFFLATS = REFFLAT.split(",")

BASE_BEDS = [basename(x) for x in BEDS]
BASE_REFFLATS = [basename(x) for x in BEDS]

Sander Bollen's avatar
Sander Bollen committed
39
40
41
42
43
44
45

def split_genome(ref, approx_n_chunks=100):
    fa = Fasta(ref)
    tot_size = sum([len(x) for x in fa.records.values()])
    chunk_size = tot_size//approx_n_chunks
    chunks = []
    for chrom_name, chrom_value in fa.records.items():
Sander Bollen's avatar
Sander Bollen committed
46
        pos = 1
Sander Bollen's avatar
Sander Bollen committed
47
48
49
50
51
52
53
54
55
56
57
58
59
        while pos <= len(chrom_value):
            end = pos+chunk_size
            if end <= len(chrom_value):
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, end)
            else:
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, len(chrom_value))
            chunks.append(chunk)
            pos = end
    return chunks

CHUNKS = split_genome(REFERENCE)


Sander Bollen's avatar
Sander Bollen committed
60
61
62
63
64
65
66
67
68
69
70
71
def out_path(path):
    return join(OUT_DIR, path)


try:
    mkdir(out_path("tmp"))
except OSError:
    pass


def get_r1(wildcards):
    s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
72
73
74
    r1 = []
    for l in sorted(s['libraries'].keys()):
        r1.append(s['libraries'][l]['R1'])
Sander Bollen's avatar
Sander Bollen committed
75
76
77
78
79
    return r1


def get_r2(wildcards):
    s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
80
81
    r2 = []
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
fix    
Sander Bollen committed
82
        r2.append(s['libraries'][l]['R2'])
Sander Bollen's avatar
Sander Bollen committed
83
84
85
    return r2


Sander Bollen's avatar
Sander Bollen committed
86
87
88
89
90
91
92
93
def get_bedpath(wildcards):
    return [x for x in BEDS if basename(x) == wildcards.bed][0]


def get_refflatpath(wildcards):
    return [x for x in REFFLATS if basename(x) == wildcards.refflat][0]


Sander Bollen's avatar
Sander Bollen committed
94
95
96
97
98
def sample_gender(wildcards):
    sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
    return sam.get("gender", "null")


Sander Bollen's avatar
Sander Bollen committed
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
def metrics(do_metrics=True):
    if not do_metrics:
        return ""

    mnum = expand(out_path("{sample}/bams/{sample}.mapped.num"),
                  sample=SAMPLES)
    mbnum = expand(out_path("{sample}/bams/{sample}.mapped.basenum"),
                   sample=SAMPLES)
    unum = expand(out_path("{sample}/bams/{sample}.unique.num"),
                  sample=SAMPLES)
    ubnum = expand(out_path("{sample}/bams/{sample}.usable.basenum"),
                   sample=SAMPLES)
    fqcr = expand(out_path("{sample}/pre_process/raw_fastqc/.done.txt"),
                  sample=SAMPLES)
    fqcm = expand(out_path("{sample}/pre_process/merged_fastqc/.done.txt"),
                  sample=SAMPLES)
    fqcp = expand(out_path("{sample}/pre_process/postqc_fastqc/.done.txt"),
                  sample=SAMPLES)
    fnumpre = expand(out_path("{sample}/pre_process/{sample}.preqc_count.json"),
                     sample=SAMPLES)
Sander Bollen's avatar
Sander Bollen committed
119
    fnumpos = expand(out_path("{sample}/pre_process/{sample}.postqc_count.json"),
Sander Bollen's avatar
Sander Bollen committed
120
121
                     sample=SAMPLES)

Sander Bollen's avatar
Sander Bollen committed
122
123
124
125
    covs = expand(out_path("{sample}/coverage/{bed}.covstats.json"),
                  sample=SAMPLES, bed=BASE_BEDS)

    return mnum + mbnum + unum + ubnum + fqcr + fqcm + fqcp + fnumpre + fnumpos + covs
Sander Bollen's avatar
Sander Bollen committed
126
127


Sander Bollen's avatar
Sander Bollen committed
128
129
rule all:
    input:
Sander Bollen's avatar
Sander Bollen committed
130
131
        combined=out_path("multisample/genotyped.vcf.gz"),
        stats=metrics()
Sander Bollen's avatar
Sander Bollen committed
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231

rule genome:
    input: REFERENCE
    output: out_path("current.genome")
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
    input: get_r1
    output: temp(out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"))
    shell: "cat {input} > {output}"

rule merge_r2:
    input: get_r2
    output: temp(out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz"))
    shell: "cat {input} > {output}"

rule sickle:
    input:
        r1 = out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
        r2 = out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
    output:
        r1 = temp(out_path("{sample}/pre_process/{sample}.trimmed_R1.fastq")),
        r2 = temp(out_path("{sample}/pre_process/{sample}.trimmed_R2.fastq")),
        s = out_path("{sample}/pre_process/{sample}.trimmed_singles.fastq"),
    conda: "envs/sickle.yml"
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} " \
           "-p {output.r2} -s {output.s}"

rule cutadapt:
    input:
        r1 = out_path("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
        r2 = out_path("{sample}/pre_process/{sample}.trimmed_R2.fastq")
    output:
        r1 = temp(out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq")),
        r2 = temp(out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq"))
    conda: "envs/cutadapt.yml"
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} " \
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
    input:
        r1 = out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2 = out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq"),
        ref = REFERENCE
    params:
        rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
    output: temp(out_path("{sample}/bams/{sample}.sorted.bam"))
    conda: "envs/bwa.yml"
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} " \
           "| picard SortSam CREATE_INDEX=TRUE TMP_DIR=null " \
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

rule markdup:
    input:
        bam = out_path("{sample}/bams/{sample}.sorted.bam"),
    params:
        tmp = out_path("tmp")
    output:
        bam = temp(out_path("{sample}/bams/{sample}.markdup.bam")),
        metrics = out_path("{sample}/bams/{sample}.markdup.metrics")
    conda: "envs/picard.yml"
    shell: "picard MarkDuplicates CREATE_INDEX=TRUE TMP_DIR={params.tmp} " \
           "INPUT={input.bam} OUTPUT={output.bam} " \
           "METRICS_FILE={output.metrics} " \
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

rule baserecal:
    input:
        bam = out_path("{sample}/bams/{sample}.markdup.bam"),
        java = JAVA,
        gatk = GATK,
        ref = REFERENCE,
        dbsnp = DBSNP,
        one1kg = ONETHOUSAND,
        hapmap = HAPMAP
    output:
        grp = out_path("{sample}/bams/{sample}.baserecal.grp")
    conda: "envs/gatk.yml"
    shell: "{input.java} -jar {input.gatk} -T BaseRecalibrator " \
           "-I {input.bam} -o {output.grp} -nct 8 -R {input.ref} " \
           "-cov ReadGroupCovariate -cov QualityScoreCovariate " \
           "-cov CycleCovariate -cov ContextCovariate -knownSites " \
           "{input.dbsnp} -knownSites {input.one1kg} " \
           "-knownSites {input.hapmap}"

rule printreads:
    input:
        grp=out_path("{sample}/bams/{sample}.baserecal.grp"),
        bam=out_path("{sample}/bams/{sample}.markdup.bam"),
        java=JAVA,
        gatk=GATK,
        ref=REFERENCE
    output:
        bam=out_path("{sample}/bams/{sample}.baserecal.bam"),
        bai=out_path("{sample}/bams/{sample}.baserecal.bai")
    conda: "envs/gatk.yml"
    shell: "{input.java} -jar {input.gatk} -T PrintReads -I {input.bam} "\
           "-o {output.bam} -R {input.ref} -BQSR {input.grp}"


Sander Bollen's avatar
Sander Bollen committed
232
233
rule gvcf_scatter:
    input:
Sander Bollen's avatar
Sander Bollen committed
234
        bam=out_path("{sample}/bams/{sample}.baserecal.bam"),
Sander Bollen's avatar
Sander Bollen committed
235
        dbsnp=DBSNP,
Sander Bollen's avatar
Sander Bollen committed
236
237
        ref=REFERENCE,
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
238
    params:
Sander Bollen's avatar
Sander Bollen committed
239
        chunk="{chunk}"
Sander Bollen's avatar
Sander Bollen committed
240
    output:
Sander Bollen's avatar
Sander Bollen committed
241
        gvcf=out_path("{sample}/vcf/{sample}.{chunk}.part.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
242
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
243
    shell: "java -jar -Xmx4G {input.gatk} -T HaplotypeCaller -ERC GVCF -I "\
Sander Bollen's avatar
Sander Bollen committed
244
           "{input.bam} -R {input.ref} -D {input.dbsnp} "\
Sander Bollen's avatar
Sander Bollen committed
245
246
           "-L {params.chunk} -o {output.gvcf} "\
           "-variant_index_type LINEAR -variant_index_parameter 128000"
Sander Bollen's avatar
Sander Bollen committed
247
248


Sander Bollen's avatar
Sander Bollen committed
249
rule gvcf_gather:
Sander Bollen's avatar
Sander Bollen committed
250
    input:
Sander Bollen's avatar
Sander Bollen committed
251
252
        gvcfs=expand(out_path("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz"),
                     chunk=CHUNKS),
Sander Bollen's avatar
Sander Bollen committed
253
        ref=REFERENCE,
Sander Bollen's avatar
Sander Bollen committed
254
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
255
    params:
Sander Bollen's avatar
Sander Bollen committed
256
257
        gvcfs=" -V ".join(expand(out_path("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz"),
                                 chunk=CHUNKS))
Sander Bollen's avatar
Sander Bollen committed
258
259
    output:
        gvcf=out_path("{sample}/vcf/{sample}.g.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
260
    conda: "envs/gatk.yml"
261
262
    shell: "java -Xmx4G -cp {input.gatk} org.broadinstitute.gatk.tools.CatVariants "\
           "-R {input.ref} -V {params.gvcfs} -out {output.gvcf} "\
Sander Bollen's avatar
Sander Bollen committed
263
264
265
266
           "-assumeSorted"


rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
267
    input:
Sander Bollen's avatar
Sander Bollen committed
268
269
        gvcfs = expand(out_path("{sample}/vcf/{sample}.g.vcf.gz"),
                       sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
270
        ref=REFERENCE,
Sander Bollen's avatar
Sander Bollen committed
271
272
273
        gatk=GATK
    params:
        li=" -V ".join(expand(out_path("{sample}/vcf/{sample}.g.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
274
                              sample=SAMPLES)),
Sander Bollen's avatar
Sander Bollen committed
275
276
277
        chunk="{chunk}"
    output:
        vcf=out_path("multisample/genotype.{chunk}.part.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
278
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
279
    shell: "java -jar -Xmx4G {input.gatk} -T GenotypeGVCFs -R {input.ref} "\
Sander Bollen's avatar
Sander Bollen committed
280
281
282
283
284
285
286
287
288
           "-V {params.li} -L {params.chunk} -o {output.vcf}"


rule genotype_gather:
    input:
        vcfs=expand(out_path("multisample/genotype.{chunk}.part.vcf.gz"),
                    chunk=CHUNKS),
        ref=REFERENCE,
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
289
    params:
Sander Bollen's avatar
Sander Bollen committed
290
291
        vcfs=" -V ".join(expand(out_path("multisample/genotype.{chunk}.part.vcf.gz"),
                                chunk=CHUNKS))
Sander Bollen's avatar
Sander Bollen committed
292
293
294
    output:
        combined=out_path("multisample/genotyped.vcf.gz")
    conda: "envs/gatk.yml"
Sander Bollen's avatar
tinyfix    
Sander Bollen committed
295
    shell: "java -Xmx4G -cp {input.gatk} org.broadinstitute.gatk.tools.CatVariants "\
296
           "-R {input.ref} -V {params.vcfs} -out {output.combined} "\
Sander Bollen's avatar
Sander Bollen committed
297
           "-assumeSorted"
Sander Bollen's avatar
Sander Bollen committed
298
299


Sander Bollen's avatar
Sander Bollen committed
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
## bam metrics

rule mapped_num:
    input:
        bam=out_path("{sample}/bams/{sample}.sorted.bam")
    output:
        num=out_path("{sample}/bams/{sample}.mapped.num")
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 {input.bam} | wc -l > {output.num}"


rule mapped_basenum:
    input:
        bam=out_path("{sample}/bams/{sample}.sorted.bam")
    output:
        num=out_path("{sample}/bams/{sample}.mapped.basenum")
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 {input.bam} | wc -c > {output.num}"


rule unique_num:
    input:
        bam=out_path("{sample}/bams/{sample}.markdup.bam")
    output:
        num=out_path("{sample}/bams/{sample}.unique.num")
    conda: "envs/samtools.yml"
326
    shell: "samtools view -F 4 -F 1024 {input.bam} | wc -l > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
327
328
329
330
331
332
333
334
335


rule usable_basenum:
    input:
        bam=out_path("{sample}/bams/{sample}.markdup.bam")
    output:
        num=out_path("{sample}/bams/{sample}.usable.basenum")
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f10 | wc -c > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
336
337
338
339


## fastqc

Sander Bollen's avatar
Sander Bollen committed
340
rule fastqc_raw:
Sander Bollen's avatar
Sander Bollen committed
341
    input:
Sander Bollen's avatar
Sander Bollen committed
342
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
343
344
345
346
347
        r2=get_r2
    params:
        odir=out_path("{sample}/pre_process/raw_fastqc")
    output:
        aux=out_path("{sample}/pre_process/raw_fastqc/.done.txt")
348
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
349
350
351
    shell: "fastqc -o {params.odir} {input.r1} {input.r2} && echo 'done' > {output.aux}"


Sander Bollen's avatar
Sander Bollen committed
352
rule fastqc_merged:
Sander Bollen's avatar
Sander Bollen committed
353
    input:
Sander Bollen's avatar
Sander Bollen committed
354
        r1=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
Sander Bollen's avatar
Sander Bollen committed
355
356
357
358
359
        r2=out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
    params:
        odir=out_path("{sample}/pre_process/merged_fastqc")
    output:
        aux=out_path("{sample}/pre_process/merged_fastqc/.done.txt")
360
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
361
362
363
    shell: "fastqc -o {params.odir} {input.r1} {input.r2} && echo 'done' > {output.aux}"


Sander Bollen's avatar
Sander Bollen committed
364
rule fastqc_postqc:
Sander Bollen's avatar
Sander Bollen committed
365
    input:
Sander Bollen's avatar
Sander Bollen committed
366
        r1=out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
Sander Bollen's avatar
Sander Bollen committed
367
368
369
370
371
        r2=out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
    params:
        odir=out_path("{sample}/pre_process/postqc_fastqc")
    output:
        aux=out_path("{sample}/pre_process/postqc_fastqc/.done.txt")
372
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
373
    shell: "fastqc -o {params.odir} {input.r1} {input.r2} && echo 'done' > {output.aux}"
Sander Bollen's avatar
Sander Bollen committed
374
375
376
377
378
379
380
381


## fastq-count

rule fqcount_preqc:
    input:
        r1=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
        r2=out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
382
383
    params:
        fastqcount=FASTQ_COUNT
Sander Bollen's avatar
Sander Bollen committed
384
385
    output:
        out_path("{sample}/pre_process/{sample}.preqc_count.json")
386
    shell: "{params.fastqcount} {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
387
388
389
390
391
392


rule fqcount_postqc:
    input:
        r1=out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2=out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
393
394
    params:
        fastqcount=FASTQ_COUNT
Sander Bollen's avatar
Sander Bollen committed
395
396
    output:
        out_path("{sample}/pre_process/{sample}.postqc_count.json")
Sander Bollen's avatar
Sander Bollen committed
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
    shell: "{params.fastqcount} {input.r1} {input.r2} > {output}"


## coverages

rule covstats:
    input:
        bam=out_path("{sample}/bams/{sample}.markdup.bam"),
        genome=out_path("current.genome"),
        covpy=covpy,
        bed=get_bedpath
    params:
        subt="Sample {sample}"
    output:
        covj=out_path("{sample}/coverage/{bed}.covstats.json"),
Sander Bollen's avatar
Sander Bollen committed
412
        covp=out_path("{sample}/coverage/{bed}.covstats.png")
Sander Bollen's avatar
Sander Bollen committed
413
    conda: "envs/covstat.yml"
Sander Bollen's avatar
Sander Bollen committed
414
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} -b {input.bam} " \
Sander Bollen's avatar
Sander Bollen committed
415
416
           "-d  | python {input.covpy} - --plot {output.covp} " \
           "--title 'Targets coverage' --subtitle '{params.subt}' > {output.covj}"