Snakefile 19.8 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

van den Berg's avatar
van den Berg committed
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# Read the config json
with open(config['CONFIG_JSON'], 'rt') as fin:
    settings = json.load(fin)

# Read the json schema
with open('config/schema.json', 'rt') as fin:
    schema = json.load(fin)

# Validate the settings against the schema
try:
    jsonschema.validate(settings, schema)
except jsonschema.ValidationError as e:
    raise jsonschema.ValidationError(f'Invalid CONFIG_JSON: {e}')

45

46
47
48
# Set default values
def set_default(key, value):
    """ Set default config values """
van den Berg's avatar
van den Berg committed
49
50
51
52
53
54
55
56
57
58
59
60
61
    if key not in settings:
        settings[key] = value

# Set the default settings
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Set the script paths
set_default("covstats", "src/covstats.py")
set_default("collect_stats", "src/collect_stats.py")
set_default("merge_stats", "src/merge_stats.py")
set_default("stats_to_tsv", "src/stats_to_tsv.py")
set_default("py_wordcount", "src/pywc.py")
van den Berg's avatar
van den Berg committed
62
set_default("collect_metrics", "src/collect_metrics.py")
van den Berg's avatar
van den Berg committed
63
set_default("merge_metrics", "src/merge_metrics.py")
Sander Bollen's avatar
Sander Bollen committed
64

65
66
67
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
van den Berg's avatar
van den Berg committed
68
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
69
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
van den Berg's avatar
van den Berg committed
70
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
71
72
73
    "debian": "docker://debian:buster-slim",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
74
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.8--py_2",
75
76
77
78
79
80
81
    "picard-2.14": "docker://quay.io/biocontainers/picard:2.14--py36_0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "sickle": "docker://quay.io/biocontainers/sickle-trim:1.33--ha92aebf_4",
    "tabix": "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
82

83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def get_forward(wildcards):
    """ Get the forward fastq file from the config """
    return (
        settings["samples"][wildcards.sample]["libraries"]
                [wildcards.read_group]["R1"]
    )

def get_reverse(wildcards):
    """ Get the reverse fastq file from the config """
    return (
        settings["samples"][wildcards.sample]["libraries"]
            [wildcards.read_group]["R2"]
    )

def get_readgroup(wildcards):
    return settings["samples"][wildcards.sample]["libraries"]
Sander Bollen's avatar
Sander Bollen committed
99

van den Berg's avatar
van den Berg committed
100
101
102
103
104
105
def get_readgroup_per_sample():
    for sample in settings["samples"]:
        for rg in settings["samples"][sample]["libraries"]:
            yield rg, sample


106
107
108
def coverage_stats(wildcards):
    files = expand("{sample}/coverage/refFlat_coverage.tsv", sample=settings['samples'])
    return files if "refflat" in settings else []
Sander Bollen's avatar
Sander Bollen committed
109

Sander Bollen's avatar
Sander Bollen committed
110
111
rule all:
    input:
van den Berg's avatar
van den Berg committed
112
        multiqc="multiqc_report/multiqc_report.html",
113
114
        stats = "stats.json",
        stats_tsv = "stats.tsv",
van den Berg's avatar
van den Berg committed
115
116
117
118
119
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai", sample=settings['samples']),
        vcfs=expand("{sample}/vcf/{sample}.vcf.gz", sample=settings['samples']),
        vcf_tbi=expand("{sample}/vcf/{sample}.vcf.gz.tbi", sample=settings['samples']),
        gvcfs=expand("{sample}/vcf/{sample}.g.vcf.gz", sample=settings['samples']),
        gvcf_tbi=expand("{sample}/vcf/{sample}.g.vcf.gz.tbi", sample=settings['samples']),
van den Berg's avatar
van den Berg committed
120
121
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
        fastqc_trimmed = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
van den Berg's avatar
van den Berg committed
122
        cutadapt = (f"{sample}/pre_process/{sample}-{read_group}.txt" for read_group, sample in get_readgroup_per_sample()),
van den Berg's avatar
van den Berg committed
123
124
125
        sample_metrics = expand("{sample}/metrics.json", sample=settings['samples']),
        metrics_json = "metrics.json",
        metrics_tsv = "metrics.tsv",
van den Berg's avatar
van den Berg committed
126
        coverage_stats = coverage_stats,
Sander Bollen's avatar
Sander Bollen committed
127

Sander Bollen's avatar
Sander Bollen committed
128
129
130

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
131
    output: directory("tmp")
132
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
133
134
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
135
rule genome:
Sander Bollen's avatar
Sander Bollen committed
136
    """Create genome file as used by bedtools"""
van den Berg's avatar
van den Berg committed
137
    input: settings["reference"]
Sander Bollen's avatar
Sander Bollen committed
138
    output: "current.genome"
139
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
140
141
142
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
143
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
144
    input:
van den Berg's avatar
van den Berg committed
145
146
        r1=get_forward,
        r2=get_reverse
Sander Bollen's avatar
Sander Bollen committed
147
    output:
van den Berg's avatar
van den Berg committed
148
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
van den Berg's avatar
van den Berg committed
149
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
150
151
    log:
        "{sample}/pre_process/{sample}-{read_group}.txt"
152
    singularity: containers["cutadapt"]
van den Berg's avatar
van den Berg committed
153
154
155
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--output {output.r1} --paired-output {output.r2} -Z "
van den Berg's avatar
van den Berg committed
156
           "{input.r1} {input.r2} "
157
           "--report=minimal > {log}"
Sander Bollen's avatar
Sander Bollen committed
158
159

rule align:
Sander Bollen's avatar
Sander Bollen committed
160
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
161
    input:
van den Berg's avatar
van den Berg committed
162
163
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
van den Berg's avatar
van den Berg committed
164
        ref = settings["reference"],
165
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
166
    params:
167
168
        rg = "@RG\\tID:{read_group}\\tSM:{sample}\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
169
    singularity: containers["bwa-0.7.17-picard-2.18.7"]
Sander Bollen's avatar
Sander Bollen committed
170
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
171
172
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
173
174
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

175
176
177
178
179
def markdup_bam_input(wildcards):
    """ Generate the INPUT for each bam file """
    return ["INPUT={sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
            read_group=rg) for rg in get_readgroup(wildcards)]

Sander Bollen's avatar
Sander Bollen committed
180
rule markdup:
Sander Bollen's avatar
Sander Bollen committed
181
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
182
    input:
183
184
185
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
        read_group=rg) for rg in get_readgroup(wildcards)),
Sander Bollen's avatar
Sander Bollen committed
186
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
187
    output:
Sander Bollen's avatar
Sander Bollen committed
188
189
190
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
191
192
    params:
        bams=markdup_bam_input
193
    singularity: containers["picard-2.14"]
194
195
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
196
           "{params.bams} OUTPUT={output.bam} "
Sander Bollen's avatar
Sander Bollen committed
197
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
198
199
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
200
201
202
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
203
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
204
    output:
Sander Bollen's avatar
Sander Bollen committed
205
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
206
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
207
208
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
209
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
210
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
211
    input:
Sander Bollen's avatar
Sander Bollen committed
212
        bam = "{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
213
        ref = settings["reference"],
214
        vcfs = settings["known_sites"]
Sander Bollen's avatar
Sander Bollen committed
215
    output:
Sander Bollen's avatar
Sander Bollen committed
216
        grp = "{sample}/bams/{sample}.baserecal.grp"
217
    params: " ".join(expand("-knownSites {vcf}", vcf=settings["known_sites"]))
218
    singularity: containers["gatk"]
219
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
220
221
           "BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
222
           "-cov CycleCovariate -cov ContextCovariate {params}"
Sander Bollen's avatar
Sander Bollen committed
223

224
checkpoint scatterregions:
van den Berg's avatar
van den Berg committed
225
226
    """Scatter the reference genome"""
    input:
van den Berg's avatar
van den Berg committed
227
        ref = settings["reference"],
228
    params:
van den Berg's avatar
van den Berg committed
229
        size = settings['scatter_size']
van den Berg's avatar
van den Berg committed
230
    output:
231
        directory("scatter")
van den Berg's avatar
van den Berg committed
232
    singularity: containers["biopet-scatterregions"]
233
    shell: "mkdir -p {output} && "
van den Berg's avatar
van den Berg committed
234
           "biopet-scatterregions "
235
           "--referenceFasta {input.ref} --scatterSize {params.size} "
van den Berg's avatar
van den Berg committed
236
237
           "--outputDir scatter"

Sander Bollen's avatar
Sander Bollen committed
238
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
239
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
240
    input:
Sander Bollen's avatar
Sander Bollen committed
241
242
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
van den Berg's avatar
van den Berg committed
243
244
        dbsnp=settings["dbsnp"],
        ref=settings["reference"],
van den Berg's avatar
van den Berg committed
245
        region="scatter/scatter-{chunk}.bed"
Sander Bollen's avatar
Sander Bollen committed
246
    output:
247
248
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
249
250
    wildcard_constraints:
        chunk="[0-9]+"
251
    singularity: containers["gatk"]
252
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
253
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
254
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
van den Berg's avatar
van den Berg committed
255
           "-L '{input.region}' -o '{output.gvcf}' "
Sander Bollen's avatar
Sander Bollen committed
256
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
257
258
259
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 "
Sander Bollen's avatar
Sander Bollen committed
260

261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
def aggregate_gvcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_gvcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule gvcf_gather:
    """ Join the gvcf files together """
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
277
278
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
279
280
    singularity: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps --output {output.gvcf} "
281
282
           "--output-type z && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
283
284

rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
285
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
286
    input:
287
288
        gvcf = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz",
        tbi = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
289
        ref= settings["reference"]
Sander Bollen's avatar
Sander Bollen committed
290
    output:
291
292
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
293
294
    wildcard_constraints:
        chunk="[0-9]+"
295
    singularity: containers["gatk"]
296
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
297
           "GenotypeGVCFs -R {input.ref} "
298
           "-V {input.gvcf} -o '{output.vcf}'"
299
300


301
def aggregate_vcf(wildcards):
302
303
304
305
306
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


307
308
309
310
311
312
def aggregate_vcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


Sander Bollen's avatar
Sander Bollen committed
313
rule genotype_gather:
314
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
315
    input:
316
        vcfs = aggregate_vcf,
317
        vcfs_tbi = aggregate_vcf_tbi
Sander Bollen's avatar
Sander Bollen committed
318
    output:
319
320
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
321
    singularity: containers["bcftools"]
322
    shell: "bcftools concat {input.vcfs} --allow-overlaps --output {output.vcf} "
323
324
           "--output-type z && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
325
326


Sander Bollen's avatar
Sander Bollen committed
327
## bam metrics
328
rule mapped_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
329
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
330
    input:
331
        bam="{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
332
        pywc=settings["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
333
    output:
334
335
        reads="{sample}/bams/{sample}.mapped.num",
        bases="{sample}/bams/{sample}.mapped.basenum"
336
    singularity: containers["samtools-1.7-python-3.6"]
337
338
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
339
340


341
rule unique_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
342
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
343
    input:
344
        bam="{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
345
        pywc=settings["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
346
    output:
347
348
        reads="{sample}/bams/{sample}.unique.num",
        bases="{sample}/bams/{sample}.usable.basenum"
349
    singularity: containers["samtools-1.7-python-3.6"]
350
351
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
352
353
354


## fastqc
Sander Bollen's avatar
Sander Bollen committed
355
rule fastqc_raw:
356
357
358
    """
    Run fastqc on raw fastq files
    """
Sander Bollen's avatar
Sander Bollen committed
359
    input:
van den Berg's avatar
van den Berg committed
360
361
        r1=get_forward,
        r2=get_reverse
Sander Bollen's avatar
Sander Bollen committed
362
    output:
van den Berg's avatar
van den Berg committed
363
        directory("{sample}/pre_process/raw-{sample}-{read_group}/")
364
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
365
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
366
367


Sander Bollen's avatar
Sander Bollen committed
368
rule fastqc_postqc:
369
370
371
    """
    Run fastqc on fastq files post pre-processing
    """
Sander Bollen's avatar
Sander Bollen committed
372
    input:
van den Berg's avatar
van den Berg committed
373
374
        r1="{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
375
    output:
van den Berg's avatar
van den Berg committed
376
        directory("{sample}/pre_process/trimmed-{sample}-{read_group}/")
377
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
378
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
379
380


Sander Bollen's avatar
Sander Bollen committed
381
382
383
## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
384
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
385
    input:
Sander Bollen's avatar
Sander Bollen committed
386
387
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
van den Berg's avatar
van den Berg committed
388
        covpy=settings["covstats"],
van den Berg's avatar
van den Berg committed
389
        bed=settings.get("bedfile","")
Sander Bollen's avatar
Sander Bollen committed
390
391
392
    params:
        subt="Sample {sample}"
    output:
van den Berg's avatar
van den Berg committed
393
394
        covj="{sample}/coverage/covstats.json",
        covp="{sample}/coverage/covstats.png"
395
    singularity: containers["bedtools-2.26-python-2.7"]
Sander Bollen's avatar
Sander Bollen committed
396
397
398
399
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
400
401


Sander Bollen's avatar
Sander Bollen committed
402
403
404
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
405
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
406
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
407
        ref = settings.get('refflat', "")
Sander Bollen's avatar
Sander Bollen committed
408
    output:
409
        tsv="{sample}/coverage/refFlat_coverage.tsv"
410
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
411
412
413
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
# collection
if "bedfile" in settings:
    rule collectstats:
        """Collect all stats for a particular sample with beds"""
        input:
            mnum="{sample}/bams/{sample}.mapped.num",
            mbnum="{sample}/bams/{sample}.mapped.basenum",
            unum="{sample}/bams/{sample}.unique.num",
            ubnum="{sample}/bams/{sample}.usable.basenum",
            cov="{sample}/coverage/covstats.json",
            cutadapt = "{sample}/metrics.json",
            colpy=settings["collect_stats"]
        params:
            sample_name="{sample}",
            fthresh=settings["female_threshold"]
        output:
            "{sample}/{sample}.stats.json"
        singularity: containers["vtools"]
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
               "--female-threshold {params.fthresh} "
               "--cutadapt {input.cutadapt} "
               "{input.cov} > {output}"
else:
    rule collectstats:
        """Collect all stats for a particular sample without beds"""
        input:
            mnum = "{sample}/bams/{sample}.mapped.num",
            mbnum = "{sample}/bams/{sample}.mapped.basenum",
            unum = "{sample}/bams/{sample}.unique.num",
            ubnum = "{sample}/bams/{sample}.usable.basenum",
            cutadapt = "{sample}/metrics.json",
            colpy = settings["collect_stats"]
        params:
            sample_name = "{sample}",
            fthresh = settings["female_threshold"]
        output:
            "{sample}/{sample}.stats.json"
        singularity: containers["vtools"]
        shell: "python {input.colpy} --sample-name {params.sample_name} "
               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
               "--female-threshold {params.fthresh} "
               "--cutadapt {input.cutadapt} "
               "> {output}"

van den Berg's avatar
van den Berg committed
461
462
rule collect_metrics:
    """ Colect metrics for each sample """
Sander Bollen's avatar
Sander Bollen committed
463
    input:
van den Berg's avatar
van den Berg committed
464
465
466
467
468
        cutadapt = lambda wildcards:
        ("{sample}/pre_process/{sample}-{read_group}.txt".format(sample=wildcards.sample,
        read_group=read_group) for read_group in get_readgroup(wildcards)),
        collect_metrics = settings["collect_metrics"]
    output: "{sample}/metrics.json"
469
    singularity: containers["python3"]
van den Berg's avatar
van den Berg committed
470
471
472
    shell: "python {input.collect_metrics} --sample {wildcards.sample} "
           "--cutadapt-summary {input.cutadapt} > {output}"

van den Berg's avatar
van den Berg committed
473
474
475
476
477
478
479
480
481
482
483
484
rule merge_metrics:
   """ Merge per sample metrics files """
   input:
        metrics = expand("{sample}/metrics.json", sample=settings["samples"]),
        merge_metrics = settings["merge_metrics"]
   output:
        json = "metrics.json",
        tsv = "metrics.tsv"
   singularity: containers["python3"]
   shell: "python {input.merge_metrics} --metrics {input.metrics} "
          "--json {output.json} --tsv {output.tsv}"

485
486
487
488
489
490
491
492
rule merge_stats:
    """Merge all stats of all samples"""
    input:
        cols=expand("{sample}/{sample}.stats.json", sample=settings['samples']),
        mpy=settings["merge_stats"]
    output:
        stats="stats.json"
    singularity: containers["vtools"]
493
    shell: "python {input.mpy} --collectstats {input.cols} "
494
495
496
497
498
499
500
501
502
503
504
505
           "> {output.stats}"


rule stats_tsv:
    """Convert stats.json to tsv"""
    input:
        stats="stats.json",
        sc=settings["stats_to_tsv"]
    output:
        stats="stats.tsv"
    singularity: containers["python3"]
    shell: "python {input.sc} -i {input.stats} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
506
507


Sander Bollen's avatar
Sander Bollen committed
508
509
510
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
511
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
512
513
    """
    input:
van den Berg's avatar
van den Berg committed
514
        stats=expand("{sample}/metrics.json", sample=settings["samples"])
Sander Bollen's avatar
Sander Bollen committed
515
    params:
Sander Bollen's avatar
Sander Bollen committed
516
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
517
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
518
    output:
van den Berg's avatar
van den Berg committed
519
        "multiqc_report/multiqc_report.html"
520
    singularity: containers["multiqc"]
van den Berg's avatar
van den Berg committed
521
    shell: "multiqc -f -o {params.rdir} {params.odir} --ignore metrics.tsv || touch {output}"