Snakefile 12.3 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
import json
from os.path import join
from os import mkdir
Sander Bollen's avatar
Sander Bollen committed
4
5

from pyfaidx import Fasta
Sander Bollen's avatar
Sander Bollen committed
6
7
8
9
10
11
12
13
14
15
16
17

OUT_DIR = config.get("OUTPUT_DIR")
REFERENCE = config.get("REFERENCE")
JAVA = config.get("JAVA")
GATK = config.get("GATK")
DBSNP = config.get("DBSNP")
ONETHOUSAND = config.get("ONETHOUSAND")
HAPMAP = config.get("HAPMAP")
QUEUE = config.get("QUEUE")
BED = config.get("BED")
REFFLAT = config.get("REFFLAT")
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
18
FASTQ_COUNT = config.get("FASTQ_COUNT")
Sander Bollen's avatar
Sander Bollen committed
19
20
21
22
23
24
25
26
27
28
29
30
31

_this_dir = workflow.current_basedir


env_dir = join(_this_dir, "envs")
main_env = join(_this_dir, "environment.yml")

settings_template = join(join(_this_dir, "templates"), "pipeline_settings.md.j2")

with open(config.get("SAMPLE_CONFIG")) as handle:
    SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()

Sander Bollen's avatar
Sander Bollen committed
32
33
34
35
36
37
38

def split_genome(ref, approx_n_chunks=100):
    fa = Fasta(ref)
    tot_size = sum([len(x) for x in fa.records.values()])
    chunk_size = tot_size//approx_n_chunks
    chunks = []
    for chrom_name, chrom_value in fa.records.items():
Sander Bollen's avatar
Sander Bollen committed
39
        pos = 1
Sander Bollen's avatar
Sander Bollen committed
40
41
42
43
44
45
46
47
48
49
50
51
52
        while pos <= len(chrom_value):
            end = pos+chunk_size
            if end <= len(chrom_value):
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, end)
            else:
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, len(chrom_value))
            chunks.append(chunk)
            pos = end
    return chunks

CHUNKS = split_genome(REFERENCE)


Sander Bollen's avatar
Sander Bollen committed
53
54
55
56
57
58
59
60
61
62
63
64
def out_path(path):
    return join(OUT_DIR, path)


try:
    mkdir(out_path("tmp"))
except OSError:
    pass


def get_r1(wildcards):
    s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
65
66
67
    r1 = []
    for l in sorted(s['libraries'].keys()):
        r1.append(s['libraries'][l]['R1'])
Sander Bollen's avatar
Sander Bollen committed
68
69
70
71
72
    return r1


def get_r2(wildcards):
    s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
73
74
    r2 = []
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
fix    
Sander Bollen committed
75
        r2.append(s['libraries'][l]['R2'])
Sander Bollen's avatar
Sander Bollen committed
76
77
78
79
80
81
82
83
    return r2


def sample_gender(wildcards):
    sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
    return sam.get("gender", "null")


Sander Bollen's avatar
Sander Bollen committed
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
def metrics(do_metrics=True):
    if not do_metrics:
        return ""

    mnum = expand(out_path("{sample}/bams/{sample}.mapped.num"),
                  sample=SAMPLES)
    mbnum = expand(out_path("{sample}/bams/{sample}.mapped.basenum"),
                   sample=SAMPLES)
    unum = expand(out_path("{sample}/bams/{sample}.unique.num"),
                  sample=SAMPLES)
    ubnum = expand(out_path("{sample}/bams/{sample}.usable.basenum"),
                   sample=SAMPLES)
    fqcr = expand(out_path("{sample}/pre_process/raw_fastqc/.done.txt"),
                  sample=SAMPLES)
    fqcm = expand(out_path("{sample}/pre_process/merged_fastqc/.done.txt"),
                  sample=SAMPLES)
    fqcp = expand(out_path("{sample}/pre_process/postqc_fastqc/.done.txt"),
                  sample=SAMPLES)
    fnumpre = expand(out_path("{sample}/pre_process/{sample}.preqc_count.json"),
                     sample=SAMPLES)
    fnumpos = expand(out_path("{sample}/pre_process/{sample}.preqc_count.json"),
                     sample=SAMPLES)

    return mnum + mbnum + unum + ubnum + fqcr + fqcm + fqcp + fnumpre + fnumpos


Sander Bollen's avatar
Sander Bollen committed
110
111
rule all:
    input:
Sander Bollen's avatar
Sander Bollen committed
112
113
        combined=out_path("multisample/genotyped.vcf.gz"),
        stats=metrics()
Sander Bollen's avatar
Sander Bollen committed
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213

rule genome:
    input: REFERENCE
    output: out_path("current.genome")
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
    input: get_r1
    output: temp(out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"))
    shell: "cat {input} > {output}"

rule merge_r2:
    input: get_r2
    output: temp(out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz"))
    shell: "cat {input} > {output}"

rule sickle:
    input:
        r1 = out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
        r2 = out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
    output:
        r1 = temp(out_path("{sample}/pre_process/{sample}.trimmed_R1.fastq")),
        r2 = temp(out_path("{sample}/pre_process/{sample}.trimmed_R2.fastq")),
        s = out_path("{sample}/pre_process/{sample}.trimmed_singles.fastq"),
    conda: "envs/sickle.yml"
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} " \
           "-p {output.r2} -s {output.s}"

rule cutadapt:
    input:
        r1 = out_path("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
        r2 = out_path("{sample}/pre_process/{sample}.trimmed_R2.fastq")
    output:
        r1 = temp(out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq")),
        r2 = temp(out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq"))
    conda: "envs/cutadapt.yml"
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} " \
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
    input:
        r1 = out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2 = out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq"),
        ref = REFERENCE
    params:
        rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
    output: temp(out_path("{sample}/bams/{sample}.sorted.bam"))
    conda: "envs/bwa.yml"
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} " \
           "| picard SortSam CREATE_INDEX=TRUE TMP_DIR=null " \
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

rule markdup:
    input:
        bam = out_path("{sample}/bams/{sample}.sorted.bam"),
    params:
        tmp = out_path("tmp")
    output:
        bam = temp(out_path("{sample}/bams/{sample}.markdup.bam")),
        metrics = out_path("{sample}/bams/{sample}.markdup.metrics")
    conda: "envs/picard.yml"
    shell: "picard MarkDuplicates CREATE_INDEX=TRUE TMP_DIR={params.tmp} " \
           "INPUT={input.bam} OUTPUT={output.bam} " \
           "METRICS_FILE={output.metrics} " \
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

rule baserecal:
    input:
        bam = out_path("{sample}/bams/{sample}.markdup.bam"),
        java = JAVA,
        gatk = GATK,
        ref = REFERENCE,
        dbsnp = DBSNP,
        one1kg = ONETHOUSAND,
        hapmap = HAPMAP
    output:
        grp = out_path("{sample}/bams/{sample}.baserecal.grp")
    conda: "envs/gatk.yml"
    shell: "{input.java} -jar {input.gatk} -T BaseRecalibrator " \
           "-I {input.bam} -o {output.grp} -nct 8 -R {input.ref} " \
           "-cov ReadGroupCovariate -cov QualityScoreCovariate " \
           "-cov CycleCovariate -cov ContextCovariate -knownSites " \
           "{input.dbsnp} -knownSites {input.one1kg} " \
           "-knownSites {input.hapmap}"

rule printreads:
    input:
        grp=out_path("{sample}/bams/{sample}.baserecal.grp"),
        bam=out_path("{sample}/bams/{sample}.markdup.bam"),
        java=JAVA,
        gatk=GATK,
        ref=REFERENCE
    output:
        bam=out_path("{sample}/bams/{sample}.baserecal.bam"),
        bai=out_path("{sample}/bams/{sample}.baserecal.bai")
    conda: "envs/gatk.yml"
    shell: "{input.java} -jar {input.gatk} -T PrintReads -I {input.bam} "\
           "-o {output.bam} -R {input.ref} -BQSR {input.grp}"


Sander Bollen's avatar
Sander Bollen committed
214
215
rule gvcf_scatter:
    input:
Sander Bollen's avatar
Sander Bollen committed
216
        bam=out_path("{sample}/bams/{sample}.baserecal.bam"),
Sander Bollen's avatar
Sander Bollen committed
217
        dbsnp=DBSNP,
Sander Bollen's avatar
Sander Bollen committed
218
219
        ref=REFERENCE,
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
220
    params:
Sander Bollen's avatar
Sander Bollen committed
221
        chunk="{chunk}"
Sander Bollen's avatar
Sander Bollen committed
222
    output:
Sander Bollen's avatar
Sander Bollen committed
223
        gvcf=out_path("{sample}/vcf/{sample}.{chunk}.part.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
224
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
225
    shell: "java -jar -Xmx4G {input.gatk} -T HaplotypeCaller -ERC GVCF -I "\
Sander Bollen's avatar
Sander Bollen committed
226
           "{input.bam} -R {input.ref} -D {input.dbsnp} "\
Sander Bollen's avatar
Sander Bollen committed
227
228
           "-L {params.chunk} -o {output.gvcf} "\
           "-variant_index_type LINEAR -variant_index_parameter 128000"
Sander Bollen's avatar
Sander Bollen committed
229
230


Sander Bollen's avatar
Sander Bollen committed
231
rule gvcf_gather:
Sander Bollen's avatar
Sander Bollen committed
232
    input:
Sander Bollen's avatar
Sander Bollen committed
233
234
        gvcfs=expand(out_path("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz"),
                     chunk=CHUNKS),
Sander Bollen's avatar
Sander Bollen committed
235
        ref=REFERENCE,
Sander Bollen's avatar
Sander Bollen committed
236
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
237
    params:
Sander Bollen's avatar
Sander Bollen committed
238
239
        gvcfs=" -V ".join(expand(out_path("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz"),
                                 chunk=CHUNKS))
Sander Bollen's avatar
Sander Bollen committed
240
241
    output:
        gvcf=out_path("{sample}/vcf/{sample}.g.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
242
    conda: "envs/gatk.yml"
243
244
    shell: "java -Xmx4G -cp {input.gatk} org.broadinstitute.gatk.tools.CatVariants "\
           "-R {input.ref} -V {params.gvcfs} -out {output.gvcf} "\
Sander Bollen's avatar
Sander Bollen committed
245
246
247
248
           "-assumeSorted"


rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
249
    input:
Sander Bollen's avatar
Sander Bollen committed
250
251
        gvcfs = expand(out_path("{sample}/vcf/{sample}.g.vcf.gz"),
                       sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
252
        ref=REFERENCE,
Sander Bollen's avatar
Sander Bollen committed
253
254
255
        gatk=GATK
    params:
        li=" -V ".join(expand(out_path("{sample}/vcf/{sample}.g.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
256
                              sample=SAMPLES)),
Sander Bollen's avatar
Sander Bollen committed
257
258
259
        chunk="{chunk}"
    output:
        vcf=out_path("multisample/genotype.{chunk}.part.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
260
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
261
    shell: "java -jar -Xmx4G {input.gatk} -T GenotypeGVCFs -R {input.ref} "\
Sander Bollen's avatar
Sander Bollen committed
262
263
264
265
266
267
268
269
270
           "-V {params.li} -L {params.chunk} -o {output.vcf}"


rule genotype_gather:
    input:
        vcfs=expand(out_path("multisample/genotype.{chunk}.part.vcf.gz"),
                    chunk=CHUNKS),
        ref=REFERENCE,
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
271
    params:
Sander Bollen's avatar
Sander Bollen committed
272
273
        vcfs=" -V ".join(expand(out_path("multisample/genotype.{chunk}.part.vcf.gz"),
                                chunk=CHUNKS))
Sander Bollen's avatar
Sander Bollen committed
274
275
276
    output:
        combined=out_path("multisample/genotyped.vcf.gz")
    conda: "envs/gatk.yml"
Sander Bollen's avatar
tinyfix    
Sander Bollen committed
277
    shell: "java -Xmx4G -cp {input.gatk} org.broadinstitute.gatk.tools.CatVariants "\
278
           "-R {input.ref} -V {params.vcfs} -out {output.combined} "\
Sander Bollen's avatar
Sander Bollen committed
279
           "-assumeSorted"
Sander Bollen's avatar
Sander Bollen committed
280
281


Sander Bollen's avatar
Sander Bollen committed
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
## bam metrics

rule mapped_num:
    input:
        bam=out_path("{sample}/bams/{sample}.sorted.bam")
    output:
        num=out_path("{sample}/bams/{sample}.mapped.num")
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 {input.bam} | wc -l > {output.num}"


rule mapped_basenum:
    input:
        bam=out_path("{sample}/bams/{sample}.sorted.bam")
    output:
        num=out_path("{sample}/bams/{sample}.mapped.basenum")
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 {input.bam} | wc -c > {output.num}"


rule unique_num:
    input:
        bam=out_path("{sample}/bams/{sample}.markdup.bam")
    output:
        num=out_path("{sample}/bams/{sample}.unique.num")
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 -F 1024 {input.bam} | wc -l {output.num}"


rule usable_basenum:
    input:
        bam=out_path("{sample}/bams/{sample}.markdup.bam")
    output:
        num=out_path("{sample}/bams/{sample}.usable.basenum")
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f10 | wc -c > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
318
319
320
321
322
323


## fastqc

rule fastqc_raw
    input:
Sander Bollen's avatar
Sander Bollen committed
324
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
325
326
327
328
329
        r2=get_r2
    params:
        odir=out_path("{sample}/pre_process/raw_fastqc")
    output:
        aux=out_path("{sample}/pre_process/raw_fastqc/.done.txt")
330
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
331
332
333
334
335
    shell: "fastqc -o {params.odir} {input.r1} {input.r2} && echo 'done' > {output.aux}"


rule fastqc_merged
    input:
Sander Bollen's avatar
Sander Bollen committed
336
        r1=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
Sander Bollen's avatar
Sander Bollen committed
337
338
339
340
341
        r2=out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
    params:
        odir=out_path("{sample}/pre_process/merged_fastqc")
    output:
        aux=out_path("{sample}/pre_process/merged_fastqc/.done.txt")
342
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
343
344
345
346
347
    shell: "fastqc -o {params.odir} {input.r1} {input.r2} && echo 'done' > {output.aux}"


rule fastqc_postqc
    input:
Sander Bollen's avatar
Sander Bollen committed
348
        r1=out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
Sander Bollen's avatar
Sander Bollen committed
349
350
351
352
353
        r2=out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
    params:
        odir=out_path("{sample}/pre_process/postqc_fastqc")
    output:
        aux=out_path("{sample}/pre_process/postqc_fastqc/.done.txt")
354
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
355
    shell: "fastqc -o {params.odir} {input.r1} {input.r2} && echo 'done' > {output.aux}"
Sander Bollen's avatar
Sander Bollen committed
356
357
358
359
360
361
362
363


## fastq-count

rule fqcount_preqc:
    input:
        r1=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
        r2=out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
364
365
    params:
        fastqcount=FASTQ_COUNT
Sander Bollen's avatar
Sander Bollen committed
366
367
    output:
        out_path("{sample}/pre_process/{sample}.preqc_count.json")
368
    shell: "{params.fastqcount} {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
369
370
371
372
373
374


rule fqcount_postqc:
    input:
        r1=out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2=out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
375
376
    params:
        fastqcount=FASTQ_COUNT
Sander Bollen's avatar
Sander Bollen committed
377
378
    output:
        out_path("{sample}/pre_process/{sample}.postqc_count.json")
379
    shell: "{params.fastqcount} {input.r1} {input.r2} > {output}"