Snakefile 15.5 KB
Newer Older
Sander Bollen's avatar
Sander Bollen committed
1
import json
Sander Bollen's avatar
Sander Bollen committed
2
from functools import partial
Sander Bollen's avatar
Sander Bollen committed
3
from os.path import join, basename
Sander Bollen's avatar
Sander Bollen committed
4
from os import mkdir
Sander Bollen's avatar
Sander Bollen committed
5
6

from pyfaidx import Fasta
Sander Bollen's avatar
Sander Bollen committed
7
8
9
10
11
12
13
14

OUT_DIR = config.get("OUTPUT_DIR")
REFERENCE = config.get("REFERENCE")
JAVA = config.get("JAVA")
GATK = config.get("GATK")
DBSNP = config.get("DBSNP")
ONETHOUSAND = config.get("ONETHOUSAND")
HAPMAP = config.get("HAPMAP")
15
16
BED = config.get("BED", "")  # comma-separated list of BED files
REFFLAT = config.get("REFFLAT", "")  # comma-separated list of refFlat files
Sander Bollen's avatar
Sander Bollen committed
17
FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
18
FASTQ_COUNT = config.get("FASTQ_COUNT")
Sander Bollen's avatar
Sander Bollen committed
19
MAX_BASES = config.get("MAX_BASES", "")
Sander Bollen's avatar
Sander Bollen committed
20

21
22
23
24
25
def fsrc_dir(*args):
    """Wrapper around snakemake's srcdir to work like os.path.join"""
    if len(args) == 1:
        return srcdir(args[0])
    return srcdir(join(*args))
Sander Bollen's avatar
Sander Bollen committed
26

27
28
29
30
covpy = fsrc_dir("src", "covstats.py")
colpy = fsrc_dir("src", "collect_stats.py")
vs_py = fsrc_dir("src", "vcfstats.py")
mpy = fsrc_dir("src", "merge_stats.py")
Sander Bollen's avatar
Sander Bollen committed
31

32
if FASTQ_COUNT is None:
33
    fqc = "python {0}".format(fsrc_dir("src", "fastq-count.py"))
34
35
36
else:
    fqc = FASTQ_COUNT

Sander Bollen's avatar
Sander Bollen committed
37
38
39
40
with open(config.get("SAMPLE_CONFIG")) as handle:
    SAMPLE_CONFIG = json.load(handle)
SAMPLES = SAMPLE_CONFIG['samples'].keys()

Sander Bollen's avatar
Sander Bollen committed
41
42
43
44
45
46
BEDS = BED.split(",")
REFFLATS = REFFLAT.split(",")

BASE_BEDS = [basename(x) for x in BEDS]
BASE_REFFLATS = [basename(x) for x in BEDS]

Sander Bollen's avatar
Sander Bollen committed
47
48

def split_genome(ref, approx_n_chunks=100):
Sander Bollen's avatar
Sander Bollen committed
49
50
51
52
53
54
55
56
    """
    Split genome in chunks.

    Chunks are strings in the format: `<ctg>:<start>-<end>`
    These follow the region string format as used by htslib,
    which uses _1_-based indexing.
    See: http://www.htslib.org/doc/tabix.html
    """
Sander Bollen's avatar
Sander Bollen committed
57
58
59
60
61
    fa = Fasta(ref)
    tot_size = sum([len(x) for x in fa.records.values()])
    chunk_size = tot_size//approx_n_chunks
    chunks = []
    for chrom_name, chrom_value in fa.records.items():
Sander Bollen's avatar
Sander Bollen committed
62
        pos = 1
Sander Bollen's avatar
Sander Bollen committed
63
64
65
66
67
68
69
70
71
72
73
74
75
        while pos <= len(chrom_value):
            end = pos+chunk_size
            if end <= len(chrom_value):
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, end)
            else:
                chunk = "{0}:{1}-{2}".format(chrom_name, pos, len(chrom_value))
            chunks.append(chunk)
            pos = end
    return chunks

CHUNKS = split_genome(REFERENCE)


Sander Bollen's avatar
Sander Bollen committed
76
77
78
79
80
81
82
83
84
85
def out_path(path):
    return join(OUT_DIR, path)


try:
    mkdir(out_path("tmp"))
except OSError:
    pass


Sander Bollen's avatar
Sander Bollen committed
86
87
def get_r(strand, wildcards):
    """Get fastq files on a single strand for a sample"""
Sander Bollen's avatar
Sander Bollen committed
88
    s = SAMPLE_CONFIG['samples'].get(wildcards.sample)
Sander Bollen's avatar
Sander Bollen committed
89
    rs = []
90
    for l in sorted(s['libraries'].keys()):
Sander Bollen's avatar
Sander Bollen committed
91
92
        rs.append(s['libraries'][l][strand])
    return rs
Sander Bollen's avatar
Sander Bollen committed
93

Sander Bollen's avatar
Sander Bollen committed
94
95
get_r1 = partial(get_r, "R1")
get_r2 = partial(get_r, "R2")
Sander Bollen's avatar
Sander Bollen committed
96
97


Sander Bollen's avatar
Sander Bollen committed
98
def get_bedpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
99
100
    """Get absolute path of a bed file"""
    return next(x for x in BEDS if basename(x) == wildcards.bed)
Sander Bollen's avatar
Sander Bollen committed
101
102
103


def get_refflatpath(wildcards):
Sander Bollen's avatar
Sander Bollen committed
104
105
    """Get absolute path of a refflat file"""
    return next(x for x in REFFLATS if basename(x) == wildcards.refflat)
Sander Bollen's avatar
Sander Bollen committed
106
107


Sander Bollen's avatar
Sander Bollen committed
108
def sample_gender(wildcards):
Sander Bollen's avatar
Sander Bollen committed
109
    """Get sample gender"""
Sander Bollen's avatar
Sander Bollen committed
110
111
112
113
    sam = SAMPLE_CONFIG['samples'].get(wildcards.sample)
    return sam.get("gender", "null")


Sander Bollen's avatar
Sander Bollen committed
114
115
116
117
118
119
120
121
122
123
def metrics(do_metrics=True):
    if not do_metrics:
        return ""

    fqcr = expand(out_path("{sample}/pre_process/raw_fastqc/.done.txt"),
                  sample=SAMPLES)
    fqcm = expand(out_path("{sample}/pre_process/merged_fastqc/.done.txt"),
                  sample=SAMPLES)
    fqcp = expand(out_path("{sample}/pre_process/postqc_fastqc/.done.txt"),
                  sample=SAMPLES)
Sander Bollen's avatar
Sander Bollen committed
124
    stats = out_path("stats.json")
Sander Bollen's avatar
Sander Bollen committed
125
    return  fqcr + fqcm + fqcp + [stats]
Sander Bollen's avatar
Sander Bollen committed
126
127


Sander Bollen's avatar
Sander Bollen committed
128
129
rule all:
    input:
Sander Bollen's avatar
Sander Bollen committed
130
131
        combined=out_path("multisample/genotyped.vcf.gz"),
        stats=metrics()
Sander Bollen's avatar
Sander Bollen committed
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147

rule genome:
    input: REFERENCE
    output: out_path("current.genome")
    shell: "awk -v OFS='\t' {{'print $1,$2'}} {input}.fai > {output}"

rule merge_r1:
    input: get_r1
    output: temp(out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"))
    shell: "cat {input} > {output}"

rule merge_r2:
    input: get_r2
    output: temp(out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz"))
    shell: "cat {input} > {output}"

Sander Bollen's avatar
Sander Bollen committed
148
149
150
151
152
153
154
155
156

rule seqtk_r1:
    input:
        stats=out_path("{sample}/pre_process/{sample}.preqc_count.json"),
        fastq=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz")
    params:
        max_bases=MAX_BASES
    output:
        fastq=temp(out_path("{sample}/pre_process/{sample}.sampled_R1.fastq.gz"))
Sander Bollen's avatar
Sander Bollen committed
157
    conda: "envs/seqtk.yml"
Sander Bollen's avatar
Sander Bollen committed
158
    script: "src/seqtk.py"
Sander Bollen's avatar
Sander Bollen committed
159
160
161
162
163
164
165
166
167
168


rule seqtk_r2:
    input:
        stats = out_path("{sample}/pre_process/{sample}.preqc_count.json"),
        fastq = out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
    params:
        max_bases = MAX_BASES
    output:
        fastq = temp(out_path("{sample}/pre_process/{sample}.sampled_R2.fastq.gz"))
Sander Bollen's avatar
Sander Bollen committed
169
    conda: "envs/seqtk.yml"
Sander Bollen's avatar
Sander Bollen committed
170
    script: "src/seqtk.py"
Sander Bollen's avatar
Sander Bollen committed
171
172


173
# contains original merged fastq files as input to prevent them from being prematurely deleted
Sander Bollen's avatar
Sander Bollen committed
174
175
rule sickle:
    input:
Sander Bollen's avatar
Sander Bollen committed
176
        r1 = out_path("{sample}/pre_process/{sample}.sampled_R1.fastq.gz"),
177
178
179
        r2 = out_path("{sample}/pre_process/{sample}.sampled_R2.fastq.gz"),
        rr1 = out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
        rr2 = out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
Sander Bollen's avatar
Sander Bollen committed
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
    output:
        r1 = temp(out_path("{sample}/pre_process/{sample}.trimmed_R1.fastq")),
        r2 = temp(out_path("{sample}/pre_process/{sample}.trimmed_R2.fastq")),
        s = out_path("{sample}/pre_process/{sample}.trimmed_singles.fastq"),
    conda: "envs/sickle.yml"
    shell: "sickle pe -f {input.r1} -r {input.r2} -t sanger -o {output.r1} " \
           "-p {output.r2} -s {output.s}"

rule cutadapt:
    input:
        r1 = out_path("{sample}/pre_process/{sample}.trimmed_R1.fastq"),
        r2 = out_path("{sample}/pre_process/{sample}.trimmed_R2.fastq")
    output:
        r1 = temp(out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq")),
        r2 = temp(out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq"))
    conda: "envs/cutadapt.yml"
    shell: "cutadapt -a AGATCGGAAGAG -A AGATCGGAAGAG -m 1 -o {output.r1} " \
           "{input.r1} -p {output.r2} {input.r2}"

rule align:
    input:
        r1 = out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2 = out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq"),
        ref = REFERENCE
    params:
        rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
    output: temp(out_path("{sample}/bams/{sample}.sorted.bam"))
    conda: "envs/bwa.yml"
    shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} " \
           "| picard SortSam CREATE_INDEX=TRUE TMP_DIR=null " \
           "INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"

rule markdup:
    input:
        bam = out_path("{sample}/bams/{sample}.sorted.bam"),
    params:
        tmp = out_path("tmp")
    output:
Sander Bollen's avatar
Sander Bollen committed
218
        bam = out_path("{sample}/bams/{sample}.markdup.bam"),
Sander Bollen's avatar
Sander Bollen committed
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
        metrics = out_path("{sample}/bams/{sample}.markdup.metrics")
    conda: "envs/picard.yml"
    shell: "picard MarkDuplicates CREATE_INDEX=TRUE TMP_DIR={params.tmp} " \
           "INPUT={input.bam} OUTPUT={output.bam} " \
           "METRICS_FILE={output.metrics} " \
           "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=500"

rule baserecal:
    input:
        bam = out_path("{sample}/bams/{sample}.markdup.bam"),
        java = JAVA,
        gatk = GATK,
        ref = REFERENCE,
        dbsnp = DBSNP,
        one1kg = ONETHOUSAND,
        hapmap = HAPMAP
    output:
        grp = out_path("{sample}/bams/{sample}.baserecal.grp")
    conda: "envs/gatk.yml"
    shell: "{input.java} -jar {input.gatk} -T BaseRecalibrator " \
           "-I {input.bam} -o {output.grp} -nct 8 -R {input.ref} " \
           "-cov ReadGroupCovariate -cov QualityScoreCovariate " \
           "-cov CycleCovariate -cov ContextCovariate -knownSites " \
           "{input.dbsnp} -knownSites {input.one1kg} " \
           "-knownSites {input.hapmap}"

Sander Bollen's avatar
Sander Bollen committed
245
246
rule gvcf_scatter:
    input:
Sander Bollen's avatar
Sander Bollen committed
247
248
        bam=out_path("{sample}/bams/{sample}.markdup.bam"),
        bqsr=out_path("{sample}/bams/{sample}.baserecal.grp"),
Sander Bollen's avatar
Sander Bollen committed
249
        dbsnp=DBSNP,
Sander Bollen's avatar
Sander Bollen committed
250
251
        ref=REFERENCE,
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
252
    params:
Sander Bollen's avatar
Sander Bollen committed
253
        chunk="{chunk}"
Sander Bollen's avatar
Sander Bollen committed
254
    output:
Sander Bollen's avatar
Sander Bollen committed
255
        gvcf=out_path("{sample}/vcf/{sample}.{chunk}.part.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
256
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
257
    shell: "java -jar -Xmx4G {input.gatk} -T HaplotypeCaller -ERC GVCF -I "\
Sander Bollen's avatar
Sander Bollen committed
258
           "{input.bam} -R {input.ref} -D {input.dbsnp} "\
Sander Bollen's avatar
Sander Bollen committed
259
           "-L '{params.chunk}' -o '{output.gvcf}' "\
Sander Bollen's avatar
Sander Bollen committed
260
261
           "-variant_index_type LINEAR -variant_index_parameter 128000 " \
           "-BQSR {input.bqsr}"
Sander Bollen's avatar
Sander Bollen committed
262
263


Sander Bollen's avatar
Sander Bollen committed
264
rule gvcf_gather:
Sander Bollen's avatar
Sander Bollen committed
265
    input:
Sander Bollen's avatar
Sander Bollen committed
266
267
        gvcfs=expand(out_path("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz"),
                     chunk=CHUNKS),
Sander Bollen's avatar
Sander Bollen committed
268
        ref=REFERENCE,
Sander Bollen's avatar
Sander Bollen committed
269
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
270
    params:
Sander Bollen's avatar
Sander Bollen committed
271
        gvcfs="' -V '".join(expand(out_path("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
272
                                 chunk=CHUNKS))
Sander Bollen's avatar
Sander Bollen committed
273
274
    output:
        gvcf=out_path("{sample}/vcf/{sample}.g.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
275
    conda: "envs/gatk.yml"
276
    shell: "java -Xmx4G -cp {input.gatk} org.broadinstitute.gatk.tools.CatVariants "\
Sander Bollen's avatar
Sander Bollen committed
277
           "-R {input.ref} -V '{params.gvcfs}' -out {output.gvcf} "\
Sander Bollen's avatar
Sander Bollen committed
278
279
280
281
           "-assumeSorted"


rule genotype_scatter:
Sander Bollen's avatar
Sander Bollen committed
282
    input:
Sander Bollen's avatar
Sander Bollen committed
283
284
        gvcfs = expand(out_path("{sample}/vcf/{sample}.g.vcf.gz"),
                       sample=SAMPLES),
Sander Bollen's avatar
Sander Bollen committed
285
        ref=REFERENCE,
Sander Bollen's avatar
Sander Bollen committed
286
287
288
        gatk=GATK
    params:
        li=" -V ".join(expand(out_path("{sample}/vcf/{sample}.g.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
289
                              sample=SAMPLES)),
Sander Bollen's avatar
Sander Bollen committed
290
291
292
        chunk="{chunk}"
    output:
        vcf=out_path("multisample/genotype.{chunk}.part.vcf.gz")
Sander Bollen's avatar
Sander Bollen committed
293
    conda: "envs/gatk.yml"
Sander Bollen's avatar
Sander Bollen committed
294
    shell: "java -jar -Xmx4G {input.gatk} -T GenotypeGVCFs -R {input.ref} "\
Sander Bollen's avatar
Sander Bollen committed
295
           "-V {params.li} -L '{params.chunk}' -o '{output.vcf}'"
Sander Bollen's avatar
Sander Bollen committed
296
297
298
299
300
301
302
303


rule genotype_gather:
    input:
        vcfs=expand(out_path("multisample/genotype.{chunk}.part.vcf.gz"),
                    chunk=CHUNKS),
        ref=REFERENCE,
        gatk=GATK
Sander Bollen's avatar
Sander Bollen committed
304
    params:
Sander Bollen's avatar
Sander Bollen committed
305
        vcfs="' -V '".join(expand(out_path("multisample/genotype.{chunk}.part.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
306
                                chunk=CHUNKS))
Sander Bollen's avatar
Sander Bollen committed
307
308
309
    output:
        combined=out_path("multisample/genotyped.vcf.gz")
    conda: "envs/gatk.yml"
Sander Bollen's avatar
tinyfix    
Sander Bollen committed
310
    shell: "java -Xmx4G -cp {input.gatk} org.broadinstitute.gatk.tools.CatVariants "\
Sander Bollen's avatar
Sander Bollen committed
311
           "-R {input.ref} -V '{params.vcfs}' -out {output.combined} "\
Sander Bollen's avatar
Sander Bollen committed
312
           "-assumeSorted"
Sander Bollen's avatar
Sander Bollen committed
313
314


Sander Bollen's avatar
Sander Bollen committed
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
## bam metrics

rule mapped_num:
    input:
        bam=out_path("{sample}/bams/{sample}.sorted.bam")
    output:
        num=out_path("{sample}/bams/{sample}.mapped.num")
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 {input.bam} | wc -l > {output.num}"


rule mapped_basenum:
    input:
        bam=out_path("{sample}/bams/{sample}.sorted.bam")
    output:
        num=out_path("{sample}/bams/{sample}.mapped.basenum")
    conda: "envs/samtools.yml"
Sander Bollen's avatar
Sander Bollen committed
332
    shell: "samtools view -F 4 {input.bam} | cut -f10 | wc -c > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
333
334
335
336
337
338
339
340


rule unique_num:
    input:
        bam=out_path("{sample}/bams/{sample}.markdup.bam")
    output:
        num=out_path("{sample}/bams/{sample}.unique.num")
    conda: "envs/samtools.yml"
341
    shell: "samtools view -F 4 -F 1024 {input.bam} | wc -l > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
342
343
344
345
346
347
348
349
350


rule usable_basenum:
    input:
        bam=out_path("{sample}/bams/{sample}.markdup.bam")
    output:
        num=out_path("{sample}/bams/{sample}.usable.basenum")
    conda: "envs/samtools.yml"
    shell: "samtools view -F 4 -F 1024 {input.bam} | cut -f10 | wc -c > {output.num}"
Sander Bollen's avatar
Sander Bollen committed
351
352
353
354


## fastqc

Sander Bollen's avatar
Sander Bollen committed
355
rule fastqc_raw:
Sander Bollen's avatar
Sander Bollen committed
356
    input:
Sander Bollen's avatar
Sander Bollen committed
357
        r1=get_r1,
Sander Bollen's avatar
Sander Bollen committed
358
359
360
361
362
        r2=get_r2
    params:
        odir=out_path("{sample}/pre_process/raw_fastqc")
    output:
        aux=out_path("{sample}/pre_process/raw_fastqc/.done.txt")
363
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
364
365
366
    shell: "fastqc -o {params.odir} {input.r1} {input.r2} && echo 'done' > {output.aux}"


Sander Bollen's avatar
Sander Bollen committed
367
rule fastqc_merged:
Sander Bollen's avatar
Sander Bollen committed
368
    input:
Sander Bollen's avatar
Sander Bollen committed
369
        r1=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
Sander Bollen's avatar
Sander Bollen committed
370
371
372
373
374
        r2=out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
    params:
        odir=out_path("{sample}/pre_process/merged_fastqc")
    output:
        aux=out_path("{sample}/pre_process/merged_fastqc/.done.txt")
375
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
376
377
378
    shell: "fastqc -o {params.odir} {input.r1} {input.r2} && echo 'done' > {output.aux}"


Sander Bollen's avatar
Sander Bollen committed
379
rule fastqc_postqc:
Sander Bollen's avatar
Sander Bollen committed
380
    input:
Sander Bollen's avatar
Sander Bollen committed
381
        r1=out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
Sander Bollen's avatar
Sander Bollen committed
382
383
384
385
386
        r2=out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
    params:
        odir=out_path("{sample}/pre_process/postqc_fastqc")
    output:
        aux=out_path("{sample}/pre_process/postqc_fastqc/.done.txt")
387
    conda: "envs/fastqc.yml"
Sander Bollen's avatar
Sander Bollen committed
388
    shell: "fastqc -o {params.odir} {input.r1} {input.r2} && echo 'done' > {output.aux}"
Sander Bollen's avatar
Sander Bollen committed
389
390
391
392
393
394
395
396


## fastq-count

rule fqcount_preqc:
    input:
        r1=out_path("{sample}/pre_process/{sample}.merged_R1.fastq.gz"),
        r2=out_path("{sample}/pre_process/{sample}.merged_R2.fastq.gz")
397
    params:
398
        fastqcount=fqc
Sander Bollen's avatar
Sander Bollen committed
399
400
    output:
        out_path("{sample}/pre_process/{sample}.preqc_count.json")
401
    shell: "{params.fastqcount} {input.r1} {input.r2} > {output}"
Sander Bollen's avatar
Sander Bollen committed
402
403
404
405
406
407


rule fqcount_postqc:
    input:
        r1=out_path("{sample}/pre_process/{sample}.cutadapt_R1.fastq"),
        r2=out_path("{sample}/pre_process/{sample}.cutadapt_R2.fastq")
408
    params:
409
        fastqcount=fqc
Sander Bollen's avatar
Sander Bollen committed
410
411
    output:
        out_path("{sample}/pre_process/{sample}.postqc_count.json")
Sander Bollen's avatar
Sander Bollen committed
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
    shell: "{params.fastqcount} {input.r1} {input.r2} > {output}"


## coverages

rule covstats:
    input:
        bam=out_path("{sample}/bams/{sample}.markdup.bam"),
        genome=out_path("current.genome"),
        covpy=covpy,
        bed=get_bedpath
    params:
        subt="Sample {sample}"
    output:
        covj=out_path("{sample}/coverage/{bed}.covstats.json"),
Sander Bollen's avatar
Sander Bollen committed
427
        covp=out_path("{sample}/coverage/{bed}.covstats.png")
Sander Bollen's avatar
Sander Bollen committed
428
    conda: "envs/covstat.yml"
Sander Bollen's avatar
Sander Bollen committed
429
    shell: "bedtools coverage -sorted -g {input.genome} -a {input.bed} -b {input.bam} " \
Sander Bollen's avatar
Sander Bollen committed
430
431
           "-d  | python {input.covpy} - --plot {output.covp} " \
           "--title 'Targets coverage' --subtitle '{params.subt}' > {output.covj}"
Sander Bollen's avatar
Sander Bollen committed
432
433


Sander Bollen's avatar
Sander Bollen committed
434
435
436
437
## vcfstats

rule vcfstats:
    input:
Sander Bollen's avatar
Sander Bollen committed
438
        vcf=out_path("multisample/genotyped.vcf.gz"),
Sander Bollen's avatar
Sander Bollen committed
439
440
441
442
443
444
445
        vs_py=vs_py
    output:
        stats=out_path("multisample/vcfstats.json")
    conda: "envs/vcfstats.yml"
    shell: "python {input.vs_py} -i {input.vcf} > {output.stats}"


Sander Bollen's avatar
Sander Bollen committed
446
447
448
449
450
451
452
453
454
455
456
457
## collection
rule collectstats:
    input:
        preqc=out_path("{sample}/pre_process/{sample}.preqc_count.json"),
        postq=out_path("{sample}/pre_process/{sample}.postqc_count.json"),
        mnum=out_path("{sample}/bams/{sample}.mapped.num"),
        mbnum=out_path("{sample}/bams/{sample}.mapped.basenum"),
        unum=out_path("{sample}/bams/{sample}.unique.num"),
        ubnum=out_path("{sample}/bams/{sample}.usable.basenum"),
        cov=expand(out_path("{{sample}}/coverage/{bed}.covstats.json"), bed=BASE_BEDS),
        colpy=colpy
    params:
Sander Bollen's avatar
Sander Bollen committed
458
        sample_name="{sample}",
Sander Bollen's avatar
Sander Bollen committed
459
        fthresh=FEMALE_THRESHOLD
Sander Bollen's avatar
Sander Bollen committed
460
461
    output:
        out_path("{sample}/{sample}.stats.json")
462
    conda: "envs/collectstats.yml"
Sander Bollen's avatar
Sander Bollen committed
463
    shell: "python {input.colpy} --sample-name {params.sample_name} " \
Sander Bollen's avatar
Sander Bollen committed
464
           "--pre-qc-fastq {input.preqc} --post-qc-fastq {input.postq} " \
465
           "--mapped-num {input.mnum} --mapped-basenum {input.mbnum} " \
Sander Bollen's avatar
Sander Bollen committed
466
           "--unique-num {input.unum} --usable-basenum {input.ubnum} " \
467
           "--female-threshold {params.fthresh} {input.cov} > {output}"
Sander Bollen's avatar
Sander Bollen committed
468
469
470
471
472
473
474
475
476
477

rule merge_stats:
    input:
        cols=expand(out_path("{sample}/{sample}.stats.json"), sample=SAMPLES),
        vstat=out_path("multisample/vcfstats.json"),
        mpy=mpy
    output:
        stats=out_path("stats.json")
    conda: "envs/collectstats.yml"
    shell: "python {input.mpy} --vcfstats {input.vstat} {input.cols} > {output.stats}"