Snakefile 21.1 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
#   hutspot - a DNAseq variant calling pipeline
#   Copyright (C) 2017-2019, Sander Bollen, Leiden University Medical Center
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU Affero General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU Affero General Public License for more details.
#
#   You should have received a copy of the GNU Affero General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""
17
Main Snakefile for the pipeline.
Sander Bollen's avatar
Sander Bollen committed
18
19
20
21
22
23

:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
:license: AGPL-3.0
"""

Sander Bollen's avatar
Sander Bollen committed
24
import json
van den Berg's avatar
van den Berg committed
25
import jsonschema
Sander Bollen's avatar
Sander Bollen committed
26
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
27
from os.path import join, basename
28
from pathlib import Path
29
import itertools
Sander Bollen's avatar
Sander Bollen committed
30

van den Berg's avatar
van den Berg committed
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# Read the config json
with open(config['CONFIG_JSON'], 'rt') as fin:
    settings = json.load(fin)

# Read the json schema
with open('config/schema.json', 'rt') as fin:
    schema = json.load(fin)

# Validate the settings against the schema
try:
    jsonschema.validate(settings, schema)
except jsonschema.ValidationError as e:
    raise jsonschema.ValidationError(f'Invalid CONFIG_JSON: {e}')

45

46
47
48
# Set default values
def set_default(key, value):
    """ Set default config values """
van den Berg's avatar
van den Berg committed
49
50
51
52
53
54
55
56
57
58
59
60
61
62
    if key not in settings:
        settings[key] = value

# Set the default settings
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)

# Set the script paths
set_default("covstats", "src/covstats.py")
set_default("collect_stats", "src/collect_stats.py")
set_default("merge_stats", "src/merge_stats.py")
set_default("fastq_stats", "src/fastqc_stats.py")
set_default("stats_to_tsv", "src/stats_to_tsv.py")
set_default("py_wordcount", "src/pywc.py")
van den Berg's avatar
van den Berg committed
63
set_default("collect_metrics", "src/collect_metrics.py")
van den Berg's avatar
van den Berg committed
64
set_default("merge_metrics", "src/merge_metrics.py")
Sander Bollen's avatar
Sander Bollen committed
65

66
67
68
containers = {
    "bcftools": "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4",
    "bedtools-2.26-python-2.7": "docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0",
van den Berg's avatar
van den Berg committed
69
    "biopet-scatterregions": "docker://quay.io/biocontainers/biopet-scatterregions:0.2--0",
70
    "bwa-0.7.17-picard-2.18.7": "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0",
van den Berg's avatar
van den Berg committed
71
    "cutadapt": "docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0",
72
73
74
75
    "debian": "docker://debian:buster-slim",
    "fastq-count": "docker://quay.io/biocontainers/fastq-count:0.1.0--h14c3975_0",
    "fastqc": "docker://quay.io/biocontainers/fastqc:0.11.7--4",
    "gatk": "docker://broadinstitute/gatk3:3.7-0",
76
    "multiqc": "docker://quay.io/biocontainers/multiqc:1.8--py_2",
77
78
79
80
81
82
83
    "picard-2.14": "docker://quay.io/biocontainers/picard:2.14--py36_0",
    "python3": "docker://python:3.6-slim",
    "samtools-1.7-python-3.6": "docker://quay.io/biocontainers/mulled-v2-eb9e7907c7a753917c1e4d7a64384c047429618a:1abf1824431ec057c7d41be6f0c40e24843acde4-0",
    "sickle": "docker://quay.io/biocontainers/sickle-trim:1.33--ha92aebf_4",
    "tabix": "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0",
    "vtools": "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
}
Sander Bollen's avatar
Sander Bollen committed
84

85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
def get_forward(wildcards):
    """ Get the forward fastq file from the config """
    return (
        settings["samples"][wildcards.sample]["libraries"]
                [wildcards.read_group]["R1"]
    )

def get_reverse(wildcards):
    """ Get the reverse fastq file from the config """
    return (
        settings["samples"][wildcards.sample]["libraries"]
            [wildcards.read_group]["R2"]
    )

def get_readgroup(wildcards):
    return settings["samples"][wildcards.sample]["libraries"]
Sander Bollen's avatar
Sander Bollen committed
101

van den Berg's avatar
van den Berg committed
102
103
104
105
106
107
def get_readgroup_per_sample():
    for sample in settings["samples"]:
        for rg in settings["samples"][sample]["libraries"]:
            yield rg, sample


108
109
110
def coverage_stats(wildcards):
    files = expand("{sample}/coverage/refFlat_coverage.tsv", sample=settings['samples'])
    return files if "refflat" in settings else []
Sander Bollen's avatar
Sander Bollen committed
111

Sander Bollen's avatar
Sander Bollen committed
112
113
rule all:
    input:
van den Berg's avatar
van den Berg committed
114
        multiqc="multiqc_report/multiqc_report.html",
van den Berg's avatar
van den Berg committed
115
        #stats = "stats.json",
van den Berg's avatar
van den Berg committed
116
117
118
119
120
        bais=expand("{sample}/bams/{sample}.markdup.bam.bai", sample=settings['samples']),
        vcfs=expand("{sample}/vcf/{sample}.vcf.gz", sample=settings['samples']),
        vcf_tbi=expand("{sample}/vcf/{sample}.vcf.gz.tbi", sample=settings['samples']),
        gvcfs=expand("{sample}/vcf/{sample}.g.vcf.gz", sample=settings['samples']),
        gvcf_tbi=expand("{sample}/vcf/{sample}.g.vcf.gz.tbi", sample=settings['samples']),
van den Berg's avatar
van den Berg committed
121
122
        fastqc_raw = (f"{sample}/pre_process/raw-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
        fastqc_trimmed = (f"{sample}/pre_process/trimmed-{sample}-{read_group}/" for read_group, sample in get_readgroup_per_sample()),
van den Berg's avatar
van den Berg committed
123
        cutadapt = (f"{sample}/pre_process/{sample}-{read_group}.txt" for read_group, sample in get_readgroup_per_sample()),
van den Berg's avatar
van den Berg committed
124
125
126
        sample_metrics = expand("{sample}/metrics.json", sample=settings['samples']),
        metrics_json = "metrics.json",
        metrics_tsv = "metrics.tsv",
van den Berg's avatar
van den Berg committed
127
        coverage_stats = coverage_stats,
Sander Bollen's avatar
Sander Bollen committed
128

Sander Bollen's avatar
Sander Bollen committed
129
130
131

rule create_markdup_tmp:
    """Create tmp directory for mark duplicates"""
132
    output: directory("tmp")
133
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
134
135
    shell: "mkdir -p {output}"

Sander Bollen's avatar
Sander Bollen committed
136
rule genome:
Sander Bollen's avatar
Sander Bollen committed
137
    """Create genome file as used by bedtools"""
van den Berg's avatar
van den Berg committed
138
    input: settings["reference"]
Sander Bollen's avatar
Sander Bollen committed
139
    output: "current.genome"
140
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
141
142
143
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule cutadapt:
Sander Bollen's avatar
Sander Bollen committed
144
    """Clip fastq files"""
Sander Bollen's avatar
Sander Bollen committed
145
    input:
van den Berg's avatar
van den Berg committed
146
147
        r1=get_forward,
        r2=get_reverse
Sander Bollen's avatar
Sander Bollen committed
148
    output:
van den Berg's avatar
van den Berg committed
149
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
van den Berg's avatar
van den Berg committed
150
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
151
152
    log:
        "{sample}/pre_process/{sample}-{read_group}.txt"
153
    singularity: containers["cutadapt"]
van den Berg's avatar
van den Berg committed
154
155
156
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG "
           "--minimum-length 1 --quality-cutoff=20,20 "
           "--output {output.r1} --paired-output {output.r2} -Z "
van den Berg's avatar
van den Berg committed
157
           "{input.r1} {input.r2} "
158
           "--report=minimal > {log}"
Sander Bollen's avatar
Sander Bollen committed
159
160

rule align:
Sander Bollen's avatar
Sander Bollen committed
161
    """Align fastq files"""
Sander Bollen's avatar
Sander Bollen committed
162
    input:
van den Berg's avatar
van den Berg committed
163
164
        r1 = "{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2 = "{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
van den Berg's avatar
van den Berg committed
165
        ref = settings["reference"],
166
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
167
    params:
168
169
        rg = "@RG\\tID:{read_group}\\tSM:{sample}\\tPL:ILLUMINA"
    output: "{sample}/bams/{sample}-{read_group}.sorted.bam"
170
    singularity: containers["bwa-0.7.17-picard-2.18.7"]
Sander Bollen's avatar
Sander Bollen committed
171
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
172
173
           "| picard -Xmx4G -Djava.io.tmpdir={input.tmp} SortSam "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
Sander Bollen's avatar
Sander Bollen committed
174
175
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

176
177
178
179
180
def markdup_bam_input(wildcards):
    """ Generate the INPUT for each bam file """
    return ["INPUT={sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
            read_group=rg) for rg in get_readgroup(wildcards)]

Sander Bollen's avatar
Sander Bollen committed
181
rule markdup:
Sander Bollen's avatar
Sander Bollen committed
182
    """Mark duplicates in BAM file"""
Sander Bollen's avatar
Sander Bollen committed
183
    input:
184
185
186
        bam = lambda wildcards:
        ("{sample}/bams/{sample}-{read_group}.sorted.bam".format(sample=wildcards.sample,
        read_group=rg) for rg in get_readgroup(wildcards)),
Sander Bollen's avatar
Sander Bollen committed
187
        tmp = ancient("tmp")
Sander Bollen's avatar
Sander Bollen committed
188
    output:
Sander Bollen's avatar
Sander Bollen committed
189
190
191
        bam = "{sample}/bams/{sample}.markdup.bam",
        bai = "{sample}/bams/{sample}.markdup.bai",
        metrics = "{sample}/bams/{sample}.markdup.metrics"
192
193
    params:
        bams=markdup_bam_input
194
    singularity: containers["picard-2.14"]
195
196
    shell: "picard -Xmx4G -Djava.io.tmpdir={input.tmp} MarkDuplicates "
           "CREATE_INDEX=TRUE TMP_DIR={input.tmp} "
197
           "{params.bams} OUTPUT={output.bam} "
Sander Bollen's avatar
Sander Bollen committed
198
           "METRICS_FILE={output.metrics} "
Sander Bollen's avatar
Sander Bollen committed
199
200
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

Sander Bollen's avatar
Sander Bollen committed
201
202
203
rule bai:
    """Copy bai files as some genome browsers can only .bam.bai files"""
    input:
Sander Bollen's avatar
Sander Bollen committed
204
        bai = "{sample}/bams/{sample}.markdup.bai"
Sander Bollen's avatar
Sander Bollen committed
205
    output:
Sander Bollen's avatar
Sander Bollen committed
206
        bai = "{sample}/bams/{sample}.markdup.bam.bai"
207
    singularity: containers["debian"]
Sander Bollen's avatar
Sander Bollen committed
208
209
    shell: "cp {input.bai} {output.bai}"

Sander Bollen's avatar
Sander Bollen committed
210
rule baserecal:
Sander Bollen's avatar
Sander Bollen committed
211
    """Base recalibrated BAM files"""
Sander Bollen's avatar
Sander Bollen committed
212
    input:
Sander Bollen's avatar
Sander Bollen committed
213
        bam = "{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
214
        ref = settings["reference"],
215
        vcfs = settings["known_sites"]
Sander Bollen's avatar
Sander Bollen committed
216
    output:
Sander Bollen's avatar
Sander Bollen committed
217
        grp = "{sample}/bams/{sample}.baserecal.grp"
218
    params: " ".join(expand("-knownSites {vcf}", vcf=settings["known_sites"]))
219
    singularity: containers["gatk"]
220
    shell: "java -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
221
222
           "BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
           "-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
223
           "-cov CycleCovariate -cov ContextCovariate {params}"
Sander Bollen's avatar
Sander Bollen committed
224

225
checkpoint scatterregions:
van den Berg's avatar
van den Berg committed
226
227
    """Scatter the reference genome"""
    input:
van den Berg's avatar
van den Berg committed
228
        ref = settings["reference"],
229
    params:
van den Berg's avatar
van den Berg committed
230
        size = settings['scatter_size']
van den Berg's avatar
van den Berg committed
231
    output:
232
        directory("scatter")
van den Berg's avatar
van den Berg committed
233
    singularity: containers["biopet-scatterregions"]
234
    shell: "mkdir -p {output} && "
van den Berg's avatar
van den Berg committed
235
           "biopet-scatterregions "
236
           "--referenceFasta {input.ref} --scatterSize {params.size} "
van den Berg's avatar
van den Berg committed
237
238
           "--outputDir scatter"

Sander Bollen's avatar
Sander Bollen committed
239
rule gvcf_scatter:
Sander Bollen's avatar
Sander Bollen committed
240
    """Run HaplotypeCaller in GVCF mode by chunk"""
Sander Bollen's avatar
Sander Bollen committed
241
    input:
Sander Bollen's avatar
Sander Bollen committed
242
243
        bam="{sample}/bams/{sample}.markdup.bam",
        bqsr="{sample}/bams/{sample}.baserecal.grp",
van den Berg's avatar
van den Berg committed
244
245
        dbsnp=settings["dbsnp"],
        ref=settings["reference"],
van den Berg's avatar
van den Berg committed
246
        region="scatter/scatter-{chunk}.bed"
Sander Bollen's avatar
Sander Bollen committed
247
    output:
248
249
        gvcf=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz"),
        gvcf_tbi=temp("{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
250
251
    wildcard_constraints:
        chunk="[0-9]+"
252
    singularity: containers["gatk"]
253
    shell: "java -jar -Xmx4G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar "
Sander Bollen's avatar
Sander Bollen committed
254
           "-T HaplotypeCaller -ERC GVCF -I "
Sander Bollen's avatar
Sander Bollen committed
255
           "{input.bam} -R {input.ref} -D {input.dbsnp} "
van den Berg's avatar
van den Berg committed
256
           "-L '{input.region}' -o '{output.gvcf}' "
Sander Bollen's avatar
Sander Bollen committed
257
           "-variant_index_type LINEAR -variant_index_parameter 128000 "
258
259
260
           "-BQSR {input.bqsr} "
           "--GVCFGQBands 20 --GVCFGQBands 40 --GVCFGQBands 60 "
           "--GVCFGQBands 80 --GVCFGQBands 100 "
Sander Bollen's avatar
Sander Bollen committed
261

262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
def aggregate_gvcf(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

def aggregate_gvcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)

rule gvcf_gather:
    """ Join the gvcf files together """
    input:
        gvcfs = aggregate_gvcf,
        tbis = aggregate_gvcf_tbi,
    output:
278
279
        gvcf = "{sample}/vcf/{sample}.g.vcf.gz",
        gvcf_tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
280
281
    singularity: containers["bcftools"]
    shell: "bcftools concat {input.gvcfs} --allow-overlaps --output {output.gvcf} "
282
283
           "--output-type z && "
           "bcftools index --tbi --output-file {output.gvcf_tbi} {output.gvcf}"
Sander Bollen's avatar
Sander Bollen committed
284
285

rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
286
    """Run GATK's GenotypeGVCFs by chunk"""
Sander Bollen's avatar
Sander Bollen committed
287
    input:
288
289
        gvcf = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz",
        tbi = "{sample}/vcf/{sample}.{chunk}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
290
        ref= settings["reference"]
Sander Bollen's avatar
Sander Bollen committed
291
    output:
292
293
        vcf = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz"),
        vcf_tbi = temp("{sample}/vcf/{sample}.{chunk}.vcf.gz.tbi")
van den Berg's avatar
van den Berg committed
294
295
    wildcard_constraints:
        chunk="[0-9]+"
296
    singularity: containers["gatk"]
297
    shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
Sander Bollen's avatar
Sander Bollen committed
298
           "GenotypeGVCFs -R {input.ref} "
299
           "-V {input.gvcf} -o '{output.vcf}'"
300
301


302
def aggregate_vcf(wildcards):
303
304
305
306
307
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


308
309
310
311
312
313
def aggregate_vcf_tbi(wildcards):
    checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
    return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
           i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)


Sander Bollen's avatar
Sander Bollen committed
314
rule genotype_gather:
315
    """Gather all genotyping VCFs"""
Sander Bollen's avatar
Sander Bollen committed
316
    input:
317
        vcfs = aggregate_vcf,
318
        vcfs_tbi = aggregate_vcf_tbi
Sander Bollen's avatar
Sander Bollen committed
319
    output:
320
321
        vcf = "{sample}/vcf/{sample}.vcf.gz",
        vcf_tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
322
    singularity: containers["bcftools"]
323
    shell: "bcftools concat {input.vcfs} --allow-overlaps --output {output.vcf} "
324
325
           "--output-type z && "
           "bcftools index --tbi --output-file {output.vcf_tbi} {output.vcf}"
Sander Bollen's avatar
Sander Bollen committed
326
327


Sander Bollen's avatar
Sander Bollen committed
328
## bam metrics
329
rule mapped_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
330
    """Calculate number of mapped reads"""
Sander Bollen's avatar
Sander Bollen committed
331
    input:
332
        bam="{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
333
        pywc=settings["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
334
    output:
335
336
        reads="{sample}/bams/{sample}.mapped.num",
        bases="{sample}/bams/{sample}.mapped.basenum"
337
    singularity: containers["samtools-1.7-python-3.6"]
338
339
    shell: "samtools view -F 4 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
340
341


342
rule unique_reads_bases:
Sander Bollen's avatar
Sander Bollen committed
343
    """Calculate number of unique reads"""
Sander Bollen's avatar
Sander Bollen committed
344
    input:
345
        bam="{sample}/bams/{sample}.markdup.bam",
van den Berg's avatar
van den Berg committed
346
        pywc=settings["py_wordcount"]
Sander Bollen's avatar
Sander Bollen committed
347
    output:
348
349
        reads="{sample}/bams/{sample}.unique.num",
        bases="{sample}/bams/{sample}.usable.basenum"
350
    singularity: containers["samtools-1.7-python-3.6"]
351
352
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f 10 | python {input.pywc} "
           "--reads {output.reads} --bases {output.bases}"
Sander Bollen's avatar
Sander Bollen committed
353
354
355


## fastqc
Sander Bollen's avatar
Sander Bollen committed
356
rule fastqc_raw:
357
358
359
    """
    Run fastqc on raw fastq files
    """
Sander Bollen's avatar
Sander Bollen committed
360
    input:
van den Berg's avatar
van den Berg committed
361
362
        r1=get_forward,
        r2=get_reverse
Sander Bollen's avatar
Sander Bollen committed
363
    output:
van den Berg's avatar
van den Berg committed
364
        directory("{sample}/pre_process/raw-{sample}-{read_group}/")
365
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
366
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
367
368


Sander Bollen's avatar
Sander Bollen committed
369
rule fastqc_postqc:
370
371
372
    """
    Run fastqc on fastq files post pre-processing
    """
Sander Bollen's avatar
Sander Bollen committed
373
    input:
van den Berg's avatar
van den Berg committed
374
375
        r1="{sample}/pre_process/{sample}-{read_group}_R1.fastq.gz",
        r2="{sample}/pre_process/{sample}-{read_group}_R2.fastq.gz",
Sander Bollen's avatar
Sander Bollen committed
376
    output:
van den Berg's avatar
van den Berg committed
377
        directory("{sample}/pre_process/trimmed-{sample}-{read_group}/")
378
    singularity: containers["fastqc"]
van den Berg's avatar
van den Berg committed
379
    shell: "fastqc --threads 4 --nogroup -o {output} {input.r1} {input.r2} "
Sander Bollen's avatar
Sander Bollen committed
380
381
382
383
384


## fastq-count

rule fqcount_preqc:
Sander Bollen's avatar
Sander Bollen committed
385
    """Calculate number of reads and bases before pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
386
    input:
387
388
        r1=get_forward,
        r2=get_reverse
Sander Bollen's avatar
Sander Bollen committed
389
    output:
390
        "{sample}/pre_process/{sample}-{read_group}.preqc_count.json"
391
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
392
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
393
394
395


rule fqcount_postqc:
Sander Bollen's avatar
Sander Bollen committed
396
    """Calculate number of reads and bases after pre-processing"""
Sander Bollen's avatar
Sander Bollen committed
397
    input:
van den Berg's avatar
van den Berg committed
398
399
        r1="{sample}/pre_process/{sample}-{read_group}_R1.fastq",
        r2="{sample}/pre_process/{sample}-{read_group}_R2.fastq"
Sander Bollen's avatar
Sander Bollen committed
400
    output:
401
        "{sample}/pre_process/{sample}-{read_group}.postqc_count.json"
402
    singularity: containers["fastq-count"]
Sander Bollen's avatar
Sander Bollen committed
403
    shell: "fastq-count {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
404
405
406
407
408


## coverages

rule covstats:
Sander Bollen's avatar
Sander Bollen committed
409
    """Calculate coverage statistics on bam file"""
Sander Bollen's avatar
Sander Bollen committed
410
    input:
Sander Bollen's avatar
Sander Bollen committed
411
412
        bam="{sample}/bams/{sample}.markdup.bam",
        genome="current.genome",
van den Berg's avatar
van den Berg committed
413
        covpy=settings["covstats"],
van den Berg's avatar
van den Berg committed
414
        bed=settings.get("bedfile","")
Sander Bollen's avatar
Sander Bollen committed
415
416
417
    params:
        subt="Sample {sample}"
    output:
van den Berg's avatar
van den Berg committed
418
419
        covj="{sample}/coverage/covstats.json",
        covp="{sample}/coverage/covstats.png"
420
    singularity: containers["bedtools-2.26-python-2.7"]
Sander Bollen's avatar
Sander Bollen committed
421
422
423
424
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} "
           "-b {input.bam} -d  | python {input.covpy} - --plot {output.covp} "
           "--title 'Targets coverage' --subtitle '{params.subt}' "
           "> {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
425
426


Sander Bollen's avatar
Sander Bollen committed
427
428
429
rule vtools_coverage:
    """Calculate coverage statistics per transcript"""
    input:
Sander Bollen's avatar
Sander Bollen committed
430
        gvcf="{sample}/vcf/{sample}.g.vcf.gz",
Sander Bollen's avatar
Sander Bollen committed
431
        tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
van den Berg's avatar
van den Berg committed
432
        ref = settings.get('refflat', "")
Sander Bollen's avatar
Sander Bollen committed
433
    output:
434
        tsv="{sample}/coverage/refFlat_coverage.tsv"
435
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
436
437
438
    shell: "vtools-gcoverage -I {input.gvcf} -R {input.ref} > {output.tsv}"


Sander Bollen's avatar
Sander Bollen committed
439
440
441
## vcfstats

rule vcfstats:
Sander Bollen's avatar
Sander Bollen committed
442
    """Calculate vcf statistics"""
Sander Bollen's avatar
Sander Bollen committed
443
    input:
444
        vcf="{sample}/vcf/{sample}.vcf.gz",
445
        tbi = "{sample}/vcf/{sample}.vcf.gz.tbi"
Sander Bollen's avatar
Sander Bollen committed
446
    output:
447
        stats="{sampel}/vcf/{sample}.vcfstats.json"
448
    singularity: containers["vtools"]
Sander Bollen's avatar
Sander Bollen committed
449
    shell: "vtools-stats -i {input.vcf} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
450
451


Sander Bollen's avatar
Sander Bollen committed
452
## collection
van den Berg's avatar
van den Berg committed
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
#if "bedfile" in settings:
#    rule collectstats:
#        """Collect all stats for a particular sample with beds"""
#        input:
#            mnum="{sample}/bams/{sample}.mapped.num",
#            mbnum="{sample}/bams/{sample}.mapped.basenum",
#            unum="{sample}/bams/{sample}.unique.num",
#            ubnum="{sample}/bams/{sample}.usable.basenum",
#            cov="{sample}/coverage/covstats.json",
#            colpy=settings["collect_stats"]
#        params:
#            sample_name="{sample}",
#            fthresh=settings["female_threshold"]
#        output:
#            "{sample}/{sample}.stats.json"
#        singularity: containers["vtools"]
#        shell: "python {input.colpy} --sample-name {params.sample_name} "
#               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
#               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
#               "--female-threshold {params.fthresh} "
#               "{input.cov} > {output}"
#else:
#    rule collectstats:
#        """Collect all stats for a particular sample without beds"""
#        input:
#            preqc = "{sample}/pre_process/{sample}.preqc_count.json",
#            postq = "{sample}/pre_process/{sample}.postqc_count.json",
#            mnum = "{sample}/bams/{sample}.mapped.num",
#            mbnum = "{sample}/bams/{sample}.mapped.basenum",
#            unum = "{sample}/bams/{sample}.unique.num",
#            ubnum = "{sample}/bams/{sample}.usable.basenum",
#            colpy = settings["collect_stats"]
#        params:
#            sample_name = "{sample}",
#            fthresh = settings["female_threshold"]
#        output:
#            "{sample}/{sample}.stats.json"
#        singularity: containers["vtools"]
#        shell: "python {input.colpy} --sample-name {params.sample_name} "
#               "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} "
#               "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} "
#               "--unique-num {input.unum} --usable-basenum {input.ubnum} "
#               "--female-threshold {params.fthresh} "
#               "> {output}"
#
rule collect_metrics:
    """ Colect metrics for each sample """
Sander Bollen's avatar
Sander Bollen committed
500
    input:
van den Berg's avatar
van den Berg committed
501
502
503
504
505
        cutadapt = lambda wildcards:
        ("{sample}/pre_process/{sample}-{read_group}.txt".format(sample=wildcards.sample,
        read_group=read_group) for read_group in get_readgroup(wildcards)),
        collect_metrics = settings["collect_metrics"]
    output: "{sample}/metrics.json"
506
    singularity: containers["python3"]
van den Berg's avatar
van den Berg committed
507
508
509
    shell: "python {input.collect_metrics} --sample {wildcards.sample} "
           "--cutadapt-summary {input.cutadapt} > {output}"

van den Berg's avatar
van den Berg committed
510
511
512
513
514
515
516
517
518
519
520
521
rule merge_metrics:
   """ Merge per sample metrics files """
   input:
        metrics = expand("{sample}/metrics.json", sample=settings["samples"]),
        merge_metrics = settings["merge_metrics"]
   output:
        json = "metrics.json",
        tsv = "metrics.tsv"
   singularity: containers["python3"]
   shell: "python {input.merge_metrics} --metrics {input.metrics} "
          "--json {output.json} --tsv {output.tsv}"

van den Berg's avatar
van den Berg committed
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
#rule merge_stats:
#    """Merge all stats of all samples"""
#    input:
#        cols=expand("{sample}/{sample}.stats.json", sample=settings['samples']),
#        vstat=expand("{sample}/vcf/{sample}.vcfstats.json", sample=settings['samples']),
#        mpy=settings["merge_stats"]
#    output:
#        stats="stats.json"
#    singularity: containers["vtools"]
#    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} "
#           "> {output.stats}"
#
#
#rule stats_tsv:
#    """Convert stats.json to tsv"""
#    input:
#        stats="stats.json",
#        sc=settings["stats_to_tsv"]
#    output:
#        stats="stats.tsv"
#    singularity: containers["python3"]
#    shell: "python {input.sc} -i {input.stats} > {output.stats}"
Sander Bollen's avatar
Sander Bollen committed
544
545


Sander Bollen's avatar
Sander Bollen committed
546
547
548
rule multiqc:
    """
    Create multiQC report
Sander Bollen's avatar
Sander Bollen committed
549
    Depends on stats.tsv to forcefully run at end of pipeline
Sander Bollen's avatar
Sander Bollen committed
550
551
    """
    input:
van den Berg's avatar
van den Berg committed
552
        stats=expand("{sample}/metrics.json", sample=settings["samples"])
Sander Bollen's avatar
Sander Bollen committed
553
    params:
Sander Bollen's avatar
Sander Bollen committed
554
        odir=".",
Sander Bollen's avatar
Sander Bollen committed
555
        rdir="multiqc_report"
Sander Bollen's avatar
Sander Bollen committed
556
    output:
van den Berg's avatar
van den Berg committed
557
        "multiqc_report/multiqc_report.html"
558
    singularity: containers["multiqc"]
van den Berg's avatar
van den Berg committed
559
    shell: "multiqc -f -o {params.rdir} {params.odir} --ignore metrics.tsv || touch {output}"