Snakefile 16.8 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
import json
Sander Bollen's avatar
Sander Bollen committed
2
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
3
from os.path import join, basename
Sander Bollen's avatar
Sander Bollen committed
4
5

from pyfaidx import Fasta
Sander Bollen's avatar
Sander Bollen committed
6
7
8
9
10
11
12
13

OUT_DIR = config.get("OUTPUT_DIR")
REFERENCE = config.get("REFERENCE")
JAVA = config.get("JAVA")
GATK = config.get("GATK")
DBSNP = config.get("DBSNP")
ONETHOUSAND = config.get("ONETHOUSAND")
HAPMAP = config.get("HAPMAP")
14
15
BED = config.get("BED", "")  # comma-separated list of BED files
REFFLAT = config.get("REFFLAT", "")  # comma-separated list of refFlat files
Sander Bollen's avatar
Sander Bollen committed
16
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
17
FASTQ_COUNT = config.get("FASTQ_COUNT")
Sander Bollen's avatar
Sander Bollen committed
18
MAX_BASES = config.get("MAX_BASES", "")
Sander Bollen's avatar
Sander Bollen committed
19

20
21
22
23
24
def fsrc_dir(*args):
    """Wrapper around snakemake's srcdir to work like os.path.join"""
    if len(args) == 1:
        return srcdir(args[0])
    return srcdir(join(*args))
Sander Bollen's avatar
Sander Bollen committed
25

26
27
28
29
covpy = fsrc_dir("src", "covstats.py")
colpy = fsrc_dir("src", "collect_stats.py")
vs_py = fsrc_dir("src", "vcfstats.py")
mpy = fsrc_dir("src", "merge_stats.py")
Sander Bollen's avatar
Sander Bollen committed
30

31
if FASTQ_COUNT is None:
32
    fqc = "python {0}".format(fsrc_dir("src", "fastq-count.py"))
33
34
35
else:
    fqc = FASTQ_COUNT

Sander Bollen's avatar
Sander Bollen committed
36
37
38
39
with open(config.get("SAMPLE_CONFIG")) as handle:
    SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()

Sander Bollen's avatar
Sander Bollen committed
40
41
42
43
44
45
BEDS = BED.split(",")
REFFLATS = REFFLAT.split(",")

BASE_BEDS = [basename(x) for x in BEDS]
BASE_REFFLATS = [basename(x) for x in BEDS]

Sander Bollen's avatar
Sander Bollen committed
46
47

def split_genome(ref, approx_n_chunks=100):
Sander Bollen's avatar
Sander Bollen committed
48
49
50
51
52
53
54
55
    """
    Split genome in chunks.

    Chunks are strings in the format: `<ctg>:<start>-<end>`
    These follow the region string format as used by htslib,
    which uses _1_-based indexing.
    See: http://www.htslib.org/doc/tabix.html
    """
Sander Bollen's avatar
Sander Bollen committed
56
57
58
59
60
    fa = Fasta(ref)
    tot_size = sum([len(x) for x in fa.records.values()])
    chunk_size = tot_size//approx_n_chunks
    chunks = []
    for chrom_name, chrom_value in fa.records.items():
Sander Bollen's avatar
Sander Bollen committed
61
        pos = 1
Sander Bollen's avatar
Sander Bollen committed
62
63
64
65
66
67
68
69
70
71
72
73
74
        while pos <= len(chrom_value):
            end = pos+chunk_size
            if end <= len(chrom_value):
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, end)
            else:
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, len(chrom_value))
            chunks.append(chunk)
            pos = end
    return chunks

CHUNKS = split_genome(REFERENCE)


Sander Bollen's avatar
Sander Bollen committed
75
76
77
def out_path(path):
    return join(OUT_DIR, path)

Sander Bollen's avatar
Sander Bollen committed
78
79
def get_r(strand, wildcards):
    """Get fastq files on a single strand for a sample"""
Sander Bollen's avatar
Sander Bollen committed
80
    s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
Sander Bollen's avatar
Sander Bollen committed
81
    rs = []
82
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
Sander Bollen committed
83
84
        rs.append(s['libraries'][l][strand])
    return rs
Sander Bollen's avatar
Sander Bollen committed
85

Sander Bollen's avatar
Sander Bollen committed
86
87
get_r1 = partial(get_r, "R1")
get_r2 = partial(get_r, "R2")
Sander Bollen's avatar
Sander Bollen committed
88
89


Sander Bollen's avatar
Sander Bollen committed
90
def get_bedpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
91
92
    """Get absolute path of a bed file"""
    return next(x for x in BEDS if basename(x) == wildcards.bed)
Sander Bollen's avatar
Sander Bollen committed
93
94
95


def get_refflatpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
96
97
    """Get absolute path of a refflat file"""
    return next(x for x in REFFLATS if basename(x) == wildcards.refflat)
Sander Bollen's avatar
Sander Bollen committed
98
99


Sander Bollen's avatar
Sander Bollen committed
100
def sample_gender(wildcards):
Sander Bollen's avatar
Sander Bollen committed
101
    """Get sample gender"""
Sander Bollen's avatar
Sander Bollen committed
102
103
104
105
    sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
    return sam.get("gender", "null")


Sander Bollen's avatar
Sander Bollen committed
106
107
108
109
110
111
def metrics(do_metrics=True):
    if not do_metrics:
        return ""

    fqcr = expand(out_path("{sample}/pre_process/raw_fastqc/.done.txt"),
                  sample=SAMPLES)
Sander Bollen's avatar
Sander Bollen committed
112
    fqcm = expand(out_path("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip"),
Sander Bollen's avatar
Sander Bollen committed
113
                  sample=SAMPLES)
Sander Bollen's avatar
Sander Bollen committed
114
    fqcp = expand(out_path("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip"),
Sander Bollen's avatar
Sander Bollen committed
115
                  sample=SAMPLES)
Sander Bollen's avatar
Sander Bollen committed
116
    stats = out_path("stats.json")
Sander Bollen's avatar
Sander Bollen committed
117
    return  fqcr + fqcm + fqcp + [stats]
Sander Bollen's avatar
Sander Bollen committed
118
119


Sander Bollen's avatar
Sander Bollen committed
120
121
rule all:
    input:
Sander Bollen's avatar
Sander Bollen committed
122
123
        combined=out_path("multisample/genotyped.vcf.gz"),
        stats=metrics()
Sander Bollen's avatar
Sander Bollen committed
124
125

rule genome:
Sander Bollen's avatar
Sander Bollen committed
126
    """Create genome file as used by bedtools"""
Sander Bollen's avatar
Sander Bollen committed
127
128
129
130
131
    input: REFERENCE
    output: out_path("current.genome")
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
Sander Bollen's avatar
Sander Bollen committed
132
    """Merge all forward fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
133
134
135
136
137
    input: get_r1
    output: temp(out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"))
    shell: "cat {input} > {output}"

rule merge_r2:
Sander Bollen's avatar
Sander Bollen committed
138
    """Merge all reverse fastq files into one"""
Sander Bollen's avatar
Sander Bollen committed
139
140
141
142
    input: get_r2
    output: temp(out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz"))
    shell: "cat {input} > {output}"

Sander Bollen's avatar
Sander Bollen committed
143
144

rule seqtk_r1:
Sander Bollen's avatar
Sander Bollen committed
145
    """Either subsample or link forward fastq file"""
Sander Bollen's avatar
Sander Bollen committed
146
147
148
149
150
151
152
    input:
        stats=out_path("{sample}/pre_process/{sample}.preqc_count.json"),
        fastq=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz")
    params:
        max_bases=MAX_BASES
    output:
        fastq=temp(out_path("{sample}/pre_process/{sample}.sampled_R1.fastq.gz"))
Sander Bollen's avatar
Sander Bollen committed
153
    conda: "envs/seqtk.yml"
Sander Bollen's avatar
Sander Bollen committed
154
    script: "src/seqtk.py"
Sander Bollen's avatar
Sander Bollen committed
155
156
157


rule seqtk_r2:
Sander Bollen's avatar
Sander Bollen committed
158
    """Either subsample or link reverse fastq file"""
Sander Bollen's avatar
Sander Bollen committed
159
160
161
162
163
164
165
    input:
        stats = out_path("{sample}/pre_process/{sample}.preqc_count.json"),
        fastq = out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
    params:
        max_bases = MAX_BASES
    output:
        fastq = temp(out_path("{sample}/pre_process/{sample}.sampled_R2.fastq.gz"))
Sander Bollen's avatar
Sander Bollen committed
166
    conda: "envs/seqtk.yml"
Sander Bollen's avatar
Sander Bollen committed
167
    script: "src/seqtk.py"
Sander Bollen's avatar
Sander Bollen committed
168
169


170
# contains original merged fastq files as input to prevent them from being prematurely deleted
Sander Bollen's avatar
Sander Bollen committed
171
rule sickle:
Sander Bollen's avatar
Sander Bollen committed
172
    """Trim fastq files"""
Sander Bollen's avatar
Sander Bollen committed
173
    input:
Sander Bollen's avatar
Sander Bollen committed
174
        r1 = out_path("{sample}/pre_process/{sample}.sampled_R1.fastq.gz"),
175
176
177
        r2 = out_path("{sample}/pre_process/{sample}.sampled_R2.fastq.gz"),
        rr1 = out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
        rr2 = out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
Sander Bollen's avatar
Sander Bollen committed
178
179
180
181
182
183
184
185
186
    output:
        r1 = temp(out_path("{sample}/pre_process/{sample}.trimmed_R1.fastq")),
        r2 = temp(out_path("{sample}/pre_process/{sample}.trimmed_R2.fastq")),
        s = out_path("{sample}/pre_process/{sample}.trimmed_singles.fastq"),
    conda: "envs/sickle.yml"
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} " \
           "-p {output.r2} -s {output.s}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
187
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
188
189
190
191
192
193
194
195
196
197
198
    input:
        r1 = out_path("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
        r2 = out_path("{sample}/pre_process/{sample}.trimmed_R2.fastq")
    output:
        r1 = temp(out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq")),
        r2 = temp(out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq"))
    conda: "envs/cutadapt.yml"
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} " \
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
Sander Bollen's avatar
Sander Bollen committed
199
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
200
201
202
203
204
205
206
207
208
209
210
211
212
    input:
        r1 = out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2 = out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq"),
        ref = REFERENCE
    params:
        rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
    output: temp(out_path("{sample}/bams/{sample}.sorted.bam"))
    conda: "envs/bwa.yml"
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} " \
           "| picard SortSam CREATE_INDEX=TRUE TMP_DIR=null " \
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

rule markdup:
Sander Bollen's avatar
Sander Bollen committed
213
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
214
215
    input:
        bam = out_path("{sample}/bams/{sample}.sorted.bam"),
216
        tmp = ancient(out_path("tmp"))
Sander Bollen's avatar
Sander Bollen committed
217
    output:
Sander Bollen's avatar
Sander Bollen committed
218
        bam = out_path("{sample}/bams/{sample}.markdup.bam"),
Sander Bollen's avatar
Sander Bollen committed
219
220
        metrics = out_path("{sample}/bams/{sample}.markdup.metrics")
    conda: "envs/picard.yml"
221
    shell: "picard MarkDuplicates CREATE_INDEX=TRUE TMP_DIR={input.tmp} " \
Sander Bollen's avatar
Sander Bollen committed
222
223
224
225
226
           "INPUT={input.bam} OUTPUT={output.bam} " \
           "METRICS_FILE={output.metrics} " \
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
227
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
    input:
        bam = out_path("{sample}/bams/{sample}.markdup.bam"),
        java = JAVA,
        gatk = GATK,
        ref = REFERENCE,
        dbsnp = DBSNP,
        one1kg = ONETHOUSAND,
        hapmap = HAPMAP
    output:
        grp = out_path("{sample}/bams/{sample}.baserecal.grp")
    conda: "envs/gatk.yml"
    shell: "{input.java} -jar {input.gatk} -T BaseRecalibrator " \
           "-I {input.bam} -o {output.grp} -nct 8 -R {input.ref} " \
           "-cov ReadGroupCovariate -cov QualityScoreCovariate " \
           "-cov CycleCovariate -cov ContextCovariate -knownSites " \
           "{input.dbsnp} -knownSites {input.one1kg} " \
           "-knownSites {input.hapmap}"

Sander Bollen's avatar
Sander Bollen committed
246
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
247
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
248
    input:
Sander Bollen's avatar
Sander Bollen committed
249
250
        bam=out_path("{sample}/bams/{sample}.markdup.bam"),
        bqsr=out_path("{sample}/bams/{sample}.baserecal.grp"),
Sander Bollen's avatar
Sander Bollen committed
251
        dbsnp=DBSNP,
Sander Bollen's avatar
Sander Bollen committed
252
253
        ref=REFERENCE,
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
254
    params:
Sander Bollen's avatar
Sander Bollen committed
255
        chunk="{chunk}"
Sander Bollen's avatar
Sander Bollen committed
256
    output:
Sander Bollen's avatar
Sander Bollen committed
257
        gvcf=out_path("{sample}/vcf/{sample}.{chunk}.part.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
258
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
259
    shell: "java -jar -Xmx4G {input.gatk} -T HaplotypeCaller -ERC GVCF -I "\
Sander Bollen's avatar
Sander Bollen committed
260
           "{input.bam} -R {input.ref} -D {input.dbsnp} "\
Sander Bollen's avatar
Sander Bollen committed
261
           "-L '{params.chunk}' -o '{output.gvcf}' "\
Sander Bollen's avatar
Sander Bollen committed
262
263
           "-variant_index_type LINEAR -variant_index_parameter 128000 " \
           "-BQSR {input.bqsr}"
Sander Bollen's avatar
Sander Bollen committed
264
265


Sander Bollen's avatar
Sander Bollen committed
266
rule gvcf_gather:
Sander Bollen's avatar
Sander Bollen committed
267
    """Gather all gvcf scatters"""
Sander Bollen's avatar
Sander Bollen committed
268
    input:
Sander Bollen's avatar
Sander Bollen committed
269
270
        gvcfs=expand(out_path("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz"),
                     chunk=CHUNKS),
Sander Bollen's avatar
Sander Bollen committed
271
        ref=REFERENCE,
Sander Bollen's avatar
Sander Bollen committed
272
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
273
    params:
Sander Bollen's avatar
Sander Bollen committed
274
        gvcfs="' -V '".join(expand(out_path("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
275
                                 chunk=CHUNKS))
Sander Bollen's avatar
Sander Bollen committed
276
277
    output:
        gvcf=out_path("{sample}/vcf/{sample}.g.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
278
    conda: "envs/gatk.yml"
279
    shell: "java -Xmx4G -cp {input.gatk} org.broadinstitute.gatk.tools.CatVariants "\
Sander Bollen's avatar
Sander Bollen committed
280
           "-R {input.ref} -V '{params.gvcfs}' -out {output.gvcf} "\
Sander Bollen's avatar
Sander Bollen committed
281
282
283
284
           "-assumeSorted"


rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
285
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
286
    input:
Sander Bollen's avatar
Sander Bollen committed
287
288
        gvcfs = expand(out_path("{sample}/vcf/{sample}.g.vcf.gz"),
                       sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
289
        ref=REFERENCE,
Sander Bollen's avatar
Sander Bollen committed
290
291
292
        gatk=GATK
    params:
        li=" -V ".join(expand(out_path("{sample}/vcf/{sample}.g.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
293
                              sample=SAMPLES)),
Sander Bollen's avatar
Sander Bollen committed
294
295
296
        chunk="{chunk}"
    output:
        vcf=out_path("multisample/genotype.{chunk}.part.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
297
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
298
    shell: "java -jar -Xmx4G {input.gatk} -T GenotypeGVCFs -R {input.ref} "\
Sander Bollen's avatar
Sander Bollen committed
299
           "-V {params.li} -L '{params.chunk}' -o '{output.vcf}'"
Sander Bollen's avatar
Sander Bollen committed
300
301
302


rule genotype_gather:
Sander Bollen's avatar
Sander Bollen committed
303
    """Gather all genotyping scatters"""
Sander Bollen's avatar
Sander Bollen committed
304
305
306
307
308
    input:
        vcfs=expand(out_path("multisample/genotype.{chunk}.part.vcf.gz"),
                    chunk=CHUNKS),
        ref=REFERENCE,
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
309
    params:
Sander Bollen's avatar
Sander Bollen committed
310
        vcfs="' -V '".join(expand(out_path("multisample/genotype.{chunk}.part.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
311
                                chunk=CHUNKS))
Sander Bollen's avatar
Sander Bollen committed
312
313
314
    output:
        combined=out_path("multisample/genotyped.vcf.gz")
    conda: "envs/gatk.yml"
Sander Bollen's avatar
tinyfix    
Sander Bollen committed
315
    shell: "java -Xmx4G -cp {input.gatk} org.broadinstitute.gatk.tools.CatVariants "\
Sander Bollen's avatar
Sander Bollen committed
316
           "-R {input.ref} -V '{params.vcfs}' -out {output.combined} "\
Sander Bollen's avatar
Sander Bollen committed
317
           "-assumeSorted"
Sander Bollen's avatar
Sander Bollen committed
318
319


Sander Bollen's avatar
Sander Bollen committed
320
321
322
## bam metrics

rule mapped_num:
Sander Bollen's avatar
Sander Bollen committed
323
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
324
325
326
327
328
329
330
331
332
    input:
        bam=out_path("{sample}/bams/{sample}.sorted.bam")
    output:
        num=out_path("{sample}/bams/{sample}.mapped.num")
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 {input.bam} | wc -l > {output.num}"


rule mapped_basenum:
Sander Bollen's avatar
Sander Bollen committed
333
    """Calculate number of mapped bases"""
Sander Bollen's avatar
Sander Bollen committed
334
335
336
337
338
    input:
        bam=out_path("{sample}/bams/{sample}.sorted.bam")
    output:
        num=out_path("{sample}/bams/{sample}.mapped.basenum")
    conda: "envs/samtools.yml"
Sander Bollen's avatar
Sander Bollen committed
339
    shell: "samtools view -F 4 {input.bam} | cut -f10 | wc -c > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
340
341
342


rule unique_num:
Sander Bollen's avatar
Sander Bollen committed
343
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
344
345
346
347
348
    input:
        bam=out_path("{sample}/bams/{sample}.markdup.bam")
    output:
        num=out_path("{sample}/bams/{sample}.unique.num")
    conda: "envs/samtools.yml"
349
    shell: "samtools view -F 4 -F 1024 {input.bam} | wc -l > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
350
351
352


rule usable_basenum:
Sander Bollen's avatar
Sander Bollen committed
353
    """Calculate number of bases on unique reads"""
Sander Bollen's avatar
Sander Bollen committed
354
355
356
357
358
359
    input:
        bam=out_path("{sample}/bams/{sample}.markdup.bam")
    output:
        num=out_path("{sample}/bams/{sample}.usable.basenum")
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f10 | wc -c > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
360
361
362
363


## fastqc

Sander Bollen's avatar
Sander Bollen committed
364
rule fastqc_raw:
Sander Bollen's avatar
Sander Bollen committed
365
    """Run fastqc on raw fastq files"""
Sander Bollen's avatar
Sander Bollen committed
366
    input:
Sander Bollen's avatar
Sander Bollen committed
367
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
368
369
370
371
372
        r2=get_r2
    params:
        odir=out_path("{sample}/pre_process/raw_fastqc")
    output:
        aux=out_path("{sample}/pre_process/raw_fastqc/.done.txt")
373
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
374
375
376
    shell: "fastqc -o {params.odir} {input.r1} {input.r2} && echo 'done' > {output.aux}"


Sander Bollen's avatar
Sander Bollen committed
377
rule fastqc_merged:
Sander Bollen's avatar
Sander Bollen committed
378
    """Run fastqc on merged fastq files"""
Sander Bollen's avatar
Sander Bollen committed
379
    input:
Sander Bollen's avatar
Sander Bollen committed
380
        r1=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
Sander Bollen's avatar
Sander Bollen committed
381
382
383
384
        r2=out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
    params:
        odir=out_path("{sample}/pre_process/merged_fastqc")
    output:
Sander Bollen's avatar
Sander Bollen committed
385
386
        r1=out_path("{sample}/pre_process/merged_fastqc/{sample}.merged_R1_fastqc.zip"),
        r2=out_path("{sample}/pre_process/merged_fastqc/{sample}.merged_R2_fastqc.zip")
387
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
388
    shell: "fastqc -o {params.odir} {input.r1} {input.r2}"
Sander Bollen's avatar
Sander Bollen committed
389
390


Sander Bollen's avatar
Sander Bollen committed
391
rule fastqc_postqc:
Sander Bollen's avatar
Sander Bollen committed
392
    """Run fastqc on fastq files post pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
393
    input:
Sander Bollen's avatar
Sander Bollen committed
394
        r1=out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
Sander Bollen's avatar
Sander Bollen committed
395
396
397
398
        r2=out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
    params:
        odir=out_path("{sample}/pre_process/postqc_fastqc")
    output:
Sander Bollen's avatar
Sander Bollen committed
399
400
        r1=out_path("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip"),
        r2=out_path("{sample}/pre_process/postqc_fastqc/{sample}.cutadapt_R1_fastqc.zip")
401
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
402
    shell: "fastqc -o {params.odir} {input.r1} {input.r2}"
Sander Bollen's avatar
Sander Bollen committed
403
404
405
406
407


## fastq-count

rule fqcount_preqc:
Sander Bollen's avatar
Sander Bollen committed
408
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
409
410
411
    input:
        r1=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
        r2=out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
412
    params:
413
        fastqcount=fqc
Sander Bollen's avatar
Sander Bollen committed
414
415
    output:
        out_path("{sample}/pre_process/{sample}.preqc_count.json")
416
    shell: "{params.fastqcount} {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
417
418
419


rule fqcount_postqc:
Sander Bollen's avatar
Sander Bollen committed
420
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
421
422
423
    input:
        r1=out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2=out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
424
    params:
425
        fastqcount=fqc
Sander Bollen's avatar
Sander Bollen committed
426
427
    output:
        out_path("{sample}/pre_process/{sample}.postqc_count.json")
Sander Bollen's avatar
Sander Bollen committed
428
429
430
431
432
433
    shell: "{params.fastqcount} {input.r1} {input.r2} > {output}"


## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
434
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
435
436
437
438
439
440
441
442
443
    input:
        bam=out_path("{sample}/bams/{sample}.markdup.bam"),
        genome=out_path("current.genome"),
        covpy=covpy,
        bed=get_bedpath
    params:
        subt="Sample {sample}"
    output:
        covj=out_path("{sample}/coverage/{bed}.covstats.json"),
Sander Bollen's avatar
Sander Bollen committed
444
        covp=out_path("{sample}/coverage/{bed}.covstats.png")
Sander Bollen's avatar
Sander Bollen committed
445
    conda: "envs/covstat.yml"
Sander Bollen's avatar
Sander Bollen committed
446
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} -b {input.bam} " \
Sander Bollen's avatar
Sander Bollen committed
447
448
           "-d  | python {input.covpy} - --plot {output.covp} " \
           "--title 'Targets coverage' --subtitle '{params.subt}' > {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
449
450


Sander Bollen's avatar
Sander Bollen committed
451
452
453
## vcfstats

rule vcfstats:
Sander Bollen's avatar
Sander Bollen committed
454
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
455
    input:
Sander Bollen's avatar
Sander Bollen committed
456
        vcf=out_path("multisample/genotyped.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
457
458
459
460
461
462
463
        vs_py=vs_py
    output:
        stats=out_path("multisample/vcfstats.json")
    conda: "envs/vcfstats.yml"
    shell: "python {input.vs_py} -i {input.vcf} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
464
465
## collection
rule collectstats:
Sander Bollen's avatar
Sander Bollen committed
466
    """Collect all stats for a particular sample"""
Sander Bollen's avatar
Sander Bollen committed
467
468
469
470
471
472
473
474
475
476
    input:
        preqc=out_path("{sample}/pre_process/{sample}.preqc_count.json"),
        postq=out_path("{sample}/pre_process/{sample}.postqc_count.json"),
        mnum=out_path("{sample}/bams/{sample}.mapped.num"),
        mbnum=out_path("{sample}/bams/{sample}.mapped.basenum"),
        unum=out_path("{sample}/bams/{sample}.unique.num"),
        ubnum=out_path("{sample}/bams/{sample}.usable.basenum"),
        cov=expand(out_path("{{sample}}/coverage/{bed}.covstats.json"), bed=BASE_BEDS),
        colpy=colpy
    params:
Sander Bollen's avatar
Sander Bollen committed
477
        sample_name="{sample}",
Sander Bollen's avatar
Sander Bollen committed
478
        fthresh=FEMALE_THRESHOLD
Sander Bollen's avatar
Sander Bollen committed
479
480
    output:
        out_path("{sample}/{sample}.stats.json")
481
    conda: "envs/collectstats.yml"
Sander Bollen's avatar
Sander Bollen committed
482
    shell: "python {input.colpy} --sample-name {params.sample_name} " \
Sander Bollen's avatar
Sander Bollen committed
483
           "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} " \
484
           "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} " \
Sander Bollen's avatar
Sander Bollen committed
485
           "--unique-num {input.unum} --usable-basenum {input.ubnum} " \
486
           "--female-threshold {params.fthresh} {input.cov} > {output}"
Sander Bollen's avatar
Sander Bollen committed
487
488

rule merge_stats:
Sander Bollen's avatar
Sander Bollen committed
489
    """Merge all stats of all samples"""
Sander Bollen's avatar
Sander Bollen committed
490
491
492
493
494
495
496
497
    input:
        cols=expand(out_path("{sample}/{sample}.stats.json"), sample=SAMPLES),
        vstat=out_path("multisample/vcfstats.json"),
        mpy=mpy
    output:
        stats=out_path("stats.json")
    conda: "envs/collectstats.yml"
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} > {output.stats}"