#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
Main Snakefile for the pipeline.

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

import json
import jsonschema
from functools import partial
from os.path import join, basename
from pathlib import Path
import itertools

# Read the config json
with open(config['CONFIG_JSON'], 'rt') as fin:
    settings = json.load(fin)

# Read the json schema
with open('config/schema.json', 'rt') as fin:
    schema = json.load(fin)

# Validate the settings against the schema
try:
    jsonschema.validate(settings, schema)
except jsonschema.ValidationError as e:
    raise jsonschema.ValidationError(f'Invalid CONFIG_JSON: {e}')

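# The config JSON is validated against config/schema.json above. The sketch
# below is illustrative only (placeholder paths); the schema is authoritative:
#
#   {
#     "samples": {
#       "sample1": {
#         "libraries": {
#           "lib1": {"R1": "s1_lib1_R1.fastq.gz", "R2": "s1_lib1_R2.fastq.gz"}
#         }
#       }
#     },
#     "reference": "reference.fasta",
#     "dbsnp": "dbsnp.vcf.gz",
#     "known_sites": ["dbsnp.vcf.gz"],
#     "bedfile": "targets.bed",   # optional, enables per-target covstats
#     "refflat": "refFlat.txt"    # optional, enables refFlat coverage
#   }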
# Set default values
def set_default(key, value):
    """ Set default config values """
    if key not in settings:
        settings[key] = value

# Set the default settings
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Set the script paths
set_default("covstats", "src/covstats.py")
set_default("collect_stats", "src/collect_stats.py")
set_default("merge_stats", "src/merge_stats.py")
set_default("stats_to_tsv", "src/stats_to_tsv.py")
set_default("py_wordcount", "src/pywc.py")
set_default("cutadapt_summary", "src/cutadapt_summary.py")

containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
    "debian": "docker://debian:buster-slim",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.8--py_2",
    "picard-2.14": "docker://quay.io/biocontainers/picard:2.14--py36_0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}

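# Helper functions to look up fastq files and readgroups per sample from the
# validated config.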
def get_forward(wildcards):
    """ Get the forward fastq file from the config """
    return (
        settings["samples"][wildcards.sample]["libraries"]
                [wildcards.read_group]["R1"]
    )

def get_reverse(wildcards):
    """ Get the reverse fastq file from the config """
    return (
        settings["samples"][wildcards.sample]["libraries"]
            [wildcards.read_group]["R2"]
    )

def get_readgroup(wildcards):
    """ Get the readgroups (libraries) for a sample """
    return settings["samples"][wildcards.sample]["libraries"]

def get_readgroup_per_sample():
    """ Yield (readgroup, sample) pairs for all samples """
    for sample in settings["samples"]:
        for rg in settings["samples"][sample]["libraries"]:
            yield rg, sample


def coverage_stats(wildcards):
    """ Return the per-sample refFlat coverage files, if a refflat is configured """
    files = expand("{sample}/coverage/refFlat_coverage.tsv", sample=settings['samples'])
    return files if "refflat" in settings else []

rule all:
    input:
        multiqc="multiqc_report/multiqc_report.html",
        stats = "stats.json",
        stats_tsv = "stats.tsv",
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai", sample=settings['samples']),
        vcfs=expand("{sample}/vcf/{sample}.vcf.gz", sample=settings['samples']),
        vcf_tbi=expand("{sample}/vcf/{sample}.vcf.gz.tbi", sample=settings['samples']),
        gvcfs=expand("{sample}/vcf/{sample}.g.vcf.gz", sample=settings['samples']),
        gvcf_tbi=expand("{sample}/vcf/{sample}.g.vcf.gz.tbi", sample=settings['samples']),
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
        fastqc_trimmed = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
        cutadapt = (f"{sample}/pre_process/{sample}-{read_group}.txt" for read_group, sample in get_readgroup_per_sample()),
        coverage_stats = coverage_stats,


rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
    output: directory("tmp")
    singularity: containers["debian"]
    shell: "mkdir -p {output}"

rule genome:
    """Create genome file as used by bedtools"""
    input: settings["reference"]
    output: "current.genome"
    singularity: containers["debian"]
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule cutadapt:
    """Clip fastq files"""
    input:
        r1=get_forward,
        r2=get_reverse
    output:
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
    log:
        "{sample}/pre_process/{sample}-{read_group}.txt"
    singularity: containers["cutadapt"]
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--output {output.r1} --paired-output {output.r2} -Z "
           "{input.r1} {input.r2} "
           "--report=minimal > {log}"

rule align:
    """Align fastq files"""
    input:
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
        ref = settings["reference"],
        tmp = ancient("tmp")
    params:
        rg = "@RG\\tID:{read_group}\\tSM:{sample}\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
    singularity: containers["bwa-0.7.17-picard-2.18.7"]
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

def markdup_bam_input(wildcards):
    """ Generate the MarkDuplicates INPUT= arguments, one per readgroup bam """
    return ["INPUT={sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
            read_group=rg) for rg in get_readgroup(wildcards)]

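# Duplicates are marked per sample: all readgroup BAMs of a sample are passed
# to a single MarkDuplicates call (see markdup_bam_input above).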
rule markdup:
    """Mark duplicates in BAM file"""
    input:
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
        read_group=rg) for rg in get_readgroup(wildcards)),
        tmp = ancient("tmp")
    output:
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
    params:
        bams=markdup_bam_input
    singularity: containers["picard-2.14"]
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
           "{params.bams} OUTPUT={output.bam} "
           "METRICS_FILE={output.metrics} "
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
        bai = "{sample}/bams/{sample}.markdup.bai"
    output:
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
    singularity: containers["debian"]
    shell: "cp {input.bai} {output.bai}"


def bqsr_bam_input(wildcards):
    """ Generate the bam input string for each read group for BQSR"""
    return " ".join(["-I {sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
            read_group=rg) for rg in get_readgroup(wildcards)])

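# The recalibration table produced here is not applied to the BAM itself; it
# is passed to HaplotypeCaller via -BQSR in the gvcf_scatter rule below.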
rule baserecal:
    """Base recalibrated BAM files"""
    input:
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
        read_group=rg) for rg in get_readgroup(wildcards)),
        ref = settings["reference"],
        vcfs = settings["known_sites"]
    output:
        grp = "{sample}/bams/{sample}.baserecal.grp"
    params:
        known_sites = " ".join(expand("-knownSites {vcf}", vcf=settings["known_sites"])),
        bams = bqsr_bam_input
    singularity: containers["gatk"]
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
           "BaseRecalibrator {params.bams} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
           "-cov CycleCovariate -cov ContextCovariate {params.known_sites}"

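# Scatter-gather: the reference is split into chunks of at most 'scatter_size'
# bases. HaplotypeCaller and GenotypeGVCFs run per chunk (gvcf_scatter,
# genotype_scatter) and the per-chunk results are merged again with bcftools
# concat (gvcf_gather, genotype_gather).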
checkpoint scatterregions:
    """Scatter the reference genome"""
    input:
        ref = settings["reference"],
    params:
        size = settings['scatter_size']
    output:
        directory("scatter")
    singularity: containers["biopet-scatterregions"]
    shell: "mkdir -p {output} && "
           "biopet-scatterregions "
           "--referenceFasta {input.ref} --scatterSize {params.size} "
           "--outputDir scatter"


rule gvcf_scatter:
    """Run HaplotypeCaller in GVCF mode by chunk"""
    input:
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr = "{sample}/bams/{sample}.baserecal.grp",
        dbsnp=settings["dbsnp"],
        ref=settings["reference"],
        region="scatter/scatter-{chunk}.bed"
    output:
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
    wildcard_constraints:
        chunk="[0-9]+"
    singularity: containers["gatk"]
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
           "-T HaplotypeCaller -ERC GVCF -I "
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
           "-L '{input.region}' -o '{output.gvcf}' "
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 "

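# Input functions for the gather rules: once the scatterregions checkpoint has
# run, glob_wildcards lists the generated scatter-{i}.bed files and the
# matching per-chunk (g)VCFs are requested.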
def aggregate_gvcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_gvcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule gvcf_gather:
    """ Join the gvcf files together """
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
    singularity: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps --output {output.gvcf} "
           "--output-type z && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"

rule genotype_scatter:
    """Run GATK's GenotypeGVCFs by chunk"""
    input:
        gvcf = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz",
        tbi = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi",
        ref= settings["reference"]
    output:
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
    wildcard_constraints:
        chunk="[0-9]+"
    singularity: containers["gatk"]
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
           "GenotypeGVCFs -R {input.ref} "
           "-V {input.gvcf} -o '{output.vcf}'"


def aggregate_vcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


def aggregate_vcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


rule genotype_gather:
    """Gather all genotyping VCFs"""
    input:
        vcfs = aggregate_vcf,
        vcfs_tbi = aggregate_vcf_tbi
    output:
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
    singularity: containers["bcftools"]
    shell: "bcftools concat {input.vcfs} --allow-overlaps --output {output.vcf} "
           "--output-type z && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"


## bam metrics
rule mapped_reads_bases:
    """Calculate number of mapped reads"""
    input:
        bam="{sample}/bams/{sample}.markdup.bam",
        pywc=settings["py_wordcount"]
    output:
        reads="{sample}/bams/{sample}.mapped.num",
        bases="{sample}/bams/{sample}.mapped.basenum"
    singularity: containers["samtools-1.7-python-3.6"]
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"


rule unique_reads_bases:
    """Calculate number of unique reads"""
    input:
        bam="{sample}/bams/{sample}.markdup.bam",
        pywc=settings["py_wordcount"]
    output:
        reads="{sample}/bams/{sample}.unique.num",
        bases="{sample}/bams/{sample}.usable.basenum"
    singularity: containers["samtools-1.7-python-3.6"]
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"


## fastqc
rule fastqc_raw:
    """
    Run fastqc on raw fastq files
    """
    input:
        r1=get_forward,
        r2=get_reverse
    output:
        directory("{sample}/pre_process/raw-{sample}-{read_group}/")
    singularity: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "


rule fastqc_postqc:
    """
    Run fastqc on fastq files post pre-processing
    """
    input:
        r1="{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
    output:
        directory("{sample}/pre_process/trimmed-{sample}-{read_group}/")
    singularity: containers["fastqc"]
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "


## coverage
rule covstats:
    """Calculate coverage statistics on bam file"""
    input:
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
        covpy=settings["covstats"],
        bed=settings.get("bedfile","")
    params:
        subt="Sample {sample}"
    output:
        covj="{sample}/coverage/covstats.json",
        covp="{sample}/coverage/covstats.png"
    singularity: containers["bedtools-2.26-python-2.7"]
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"


rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
        ref = settings.get('refflat', "")
    output:
        tsv="{sample}/coverage/refFlat_coverage.tsv"
    singularity: containers["vtools"]
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


## collection
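# Two flavours of collectstats: when a bedfile is configured, the per-target
# coverage statistics (covstats.json) are included; otherwise they are left out.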
if "bedfile" in settings:
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
            cov="{sample}/coverage/covstats.json",
            cutadapt = "{sample}/cutadapt.json",
            colpy=settings["collect_stats"]
        params:
            sample_name="{sample}",
            fthresh=settings["female_threshold"]
        output:
            "{sample}/{sample}.stats.json"
        singularity: containers["vtools"]
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
               "--female-threshold {params.fthresh} "
               "--cutadapt {input.cutadapt} "
               "{input.cov} > {output}"
else:
    rule collectstats:
        """Collect all stats for a particular sample without beds"""
        input:
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
            cutadapt = "{sample}/cutadapt.json",
            colpy = settings["collect_stats"]
        params:
            sample_name = "{sample}",
            fthresh = settings["female_threshold"]
        output:
            "{sample}/{sample}.stats.json"
        singularity: containers["vtools"]
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
               "--female-threshold {params.fthresh} "
               "--cutadapt {input.cutadapt} "
               "> {output}"

rule collect_cutadapt_summary:
    """ Colect cutadapt summary from each readgroup per sample """
    input:
        cutadapt = lambda wildcards:
        ("{sample}/pre_process/{sample}-{read_group}.txt".format(sample=wildcards.sample,
        read_group=read_group) for read_group in get_readgroup(wildcards)),
        cutadapt_summary= settings["cutadapt_summary"]
    output: "{sample}/cutadapt.json"
    singularity: containers["python3"]
    shell: "python {input.cutadapt_summary} --sample {wildcards.sample} "
           "--cutadapt-summary {input.cutadapt} > {output}"


rule merge_stats:
    """Merge all stats of all samples"""
    input:
        cols=expand("{sample}/{sample}.stats.json", sample=settings['samples']),
        mpy=settings["merge_stats"]
    output:
        stats="stats.json"
    singularity: containers["vtools"]
    shell: "python {input.mpy} --collectstats {input.cols} "
           "> {output.stats}"


rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
        stats="stats.json",
        sc=settings["stats_to_tsv"]
    output:
        stats="stats.tsv"
    singularity: containers["python3"]
    shell: "python {input.sc} -i {input.stats} > {output.stats}"


rule multiqc:
    """
    Create multiQC report
    Depends on stats.tsv to force it to run at the end of the pipeline
    """
    input:
        stats="stats.tsv"
    params:
        odir=".",
        rdir="multiqc_report"
    output:
        "multiqc_report/multiqc_report.html"
    singularity: containers["multiqc"]
    shell: "multiqc -f -o {params.rdir} {params.odir} || touch {output}"