Unverified Commit 7f896f9b authored by Sander Bollen's avatar Sander Bollen Committed by GitHub
Browse files

Merge pull request #6 from LUMC/bcftools-concat

Bcftools concat in stead of GATK catvariants
parents f424c2f9 a61c0c64
Pipeline #2635 failed with stages
in 1 minute and 55 seconds
......@@ -271,79 +271,109 @@ digraph snakemake_dag {
rankdir=LR;
node[shape=box, style=rounded, fontname=sans, fontsize=10, penwidth=2];
edge[penwidth=2, color=grey];
0[label = "all", color = "0.32 0.6 0.85", style="rounded"];
1[label = "fastqc_postqc", color = "0.48 0.6 0.85", style="rounded"];
2[label = "fastqc_raw", color = "0.05 0.6 0.85", style="rounded"];
3[label = "fastqc_merged", color = "0.57 0.6 0.85", style="rounded"];
4[label = "merge_stats", color = "0.21 0.6 0.85", style="rounded"];
5[label = "genotype_gather", color = "0.37 0.6 0.85", style="rounded"];
6[label = "cutadapt", color = "0.30 0.6 0.85", style="rounded"];
7[label = "merge_r2", color = "0.60 0.6 0.85", style="rounded"];
8[label = "merge_r1", color = "0.23 0.6 0.85", style="rounded"];
9[label = "collectstats", color = "0.02 0.6 0.85", style="rounded"];
10[label = "vcfstats", color = "0.07 0.6 0.85", style="rounded"];
11[label = "genotype_scatter", color = "0.16 0.6 0.85", style="rounded"];
12[label = "sickle", color = "0.46 0.6 0.85", style="rounded"];
13[label = "mapped_num", color = "0.64 0.6 0.85", style="rounded"];
14[label = "fqcount_postqc", color = "0.00 0.6 0.85", style="rounded"];
15[label = "unique_num", color = "0.11 0.6 0.85", style="rounded"];
16[label = "covstats", color = "0.41 0.6 0.85", style="rounded"];
17[label = "fqcount_preqc", color = "0.62 0.6 0.85", style="rounded"];
18[label = "usable_basenum", color = "0.14 0.6 0.85", style="rounded"];
19[label = "mapped_basenum", color = "0.09 0.6 0.85", style="rounded"];
20[label = "gvcf_gather", color = "0.55 0.6 0.85", style="rounded"];
21[label = "seqtk_r2", color = "0.51 0.6 0.85", style="rounded"];
22[label = "seqtk_r1", color = "0.28 0.6 0.85", style="rounded"];
23[label = "align", color = "0.25 0.6 0.85", style="rounded"];
24[label = "markdup", color = "0.18 0.6 0.85", style="rounded"];
25[label = "genome", color = "0.53 0.6 0.85", style="rounded"];
26[label = "gvcf_scatter", color = "0.34 0.6 0.85", style="rounded"];
27[label = "printreads", color = "0.39 0.6 0.85", style="rounded"];
28[label = "baserecal", color = "0.44 0.6 0.85", style="rounded"];
4 -> 0
2 -> 0
1 -> 0
5 -> 0 [color="red"]
0[label = "all", color = "0.62 0.6 0.85", style="rounded"];
1[label = "genotype_gather", color = "0.31 0.6 0.85", style="rounded"];
2[label = "multiqc", color = "0.14 0.6 0.85", style="rounded"];
3[label = "bai", color = "0.41 0.6 0.85", style="rounded"];
4[label = "split_vcf", color = "0.53 0.6 0.85", style="rounded"];
5[label = "fastqc_raw", color = "0.63 0.6 0.85", style="rounded"];
6[label = "fastqc_merged", color = "0.24 0.6 0.85", style="rounded"];
7[label = "fastqc_postqc", color = "0.26 0.6 0.85", style="rounded"];
8[label = "vtools_coverage", color = "0.58 0.6 0.85", style="rounded"];
9[label = "merge_stats", color = "0.36 0.6 0.85", style="rounded"];
10[label = "genotype_scatter", color = "0.09 0.6 0.85", style="rounded"];
11[label = "genotype_chunkfile", color = "0.29 0.6 0.85", style="rounded"];
12[label = "stats_tsv", color = "0.51 0.6 0.85", style="rounded"];
13[label = "markdup", color = "0.55 0.6 0.85", style="rounded"];
14[label = "genotype_gather_tbi", color = "0.19 0.6 0.85", style="rounded"];
15[label = "merge_r1", color = "0.60 0.6 0.85", style="rounded"];
16[label = "merge_r2", color = "0.10 0.6 0.85", style="rounded"];
17[label = "cutadapt", color = "0.17 0.6 0.85", style="rounded"];
18[label = "gvcf_gather", color = "0.32 0.6 0.85", style="rounded"];
19[label = "gvcf_gather_tbi", color = "0.27 0.6 0.85", style="rounded"];
20[label = "collectstats", color = "0.03 0.6 0.85", style="rounded"];
21[label = "vcfstats", color = "0.00 0.6 0.85", style="rounded"];
22[label = "align", color = "0.05 0.6 0.85", style="rounded"];
23[label = "create_markdup_tmp", color = "0.44 0.6 0.85", style="rounded"];
24[label = "sickle", color = "0.39 0.6 0.85", style="rounded"];
25[label = "gvcf_scatter", color = "0.02 0.6 0.85", style="rounded"];
26[label = "gvcf_chunkfile", color = "0.56 0.6 0.85", style="rounded"];
27[label = "fqcount_preqc", color = "0.38 0.6 0.85", style="rounded"];
28[label = "fqcount_postqc", color = "0.12 0.6 0.85", style="rounded"];
29[label = "mapped_num", color = "0.50 0.6 0.85", style="rounded"];
30[label = "mapped_basenum", color = "0.43 0.6 0.85", style="rounded"];
31[label = "unique_num", color = "0.65 0.6 0.85", style="rounded"];
32[label = "usable_basenum", color = "0.22 0.6 0.85", style="rounded"];
33[label = "fastqc_stats", color = "0.46 0.6 0.85", style="rounded"];
34[label = "covstats", color = "0.07 0.6 0.85", style="rounded"];
35[label = "seqtk_r1", color = "0.34 0.6 0.85", style="rounded"];
36[label = "seqtk_r2", color = "0.21 0.6 0.85", style="rounded"];
37[label = "baserecal", color = "0.48 0.6 0.85", style="rounded"];
38[label = "genome", color = "0.15 0.6 0.85", style="rounded"];
9 -> 0
4 -> 0 [color = "red"]
3 -> 0
6 -> 1
8 -> 3
7 -> 3
10 -> 4
9 -> 4
11 -> 5 [color="red"]
12 -> 6 [color="red"]
14 -> 9
19 -> 9
18 -> 9
16 -> 9
15 -> 9
17 -> 9
13 -> 9
5 -> 10
20 -> 11 [color="red"]
22 -> 12 [color="red"]
21 -> 12 [color="red"]
23 -> 13
6 -> 14
24 -> 15
24 -> 16
25 -> 16
8 -> 17 [color="red"]
7 -> 17 [color="red"]
24 -> 18
23 -> 19
26 -> 20 [color="red"]
17 -> 21 [color="red"]
7 -> 21 [color="red"]
17 -> 22 [color="red"]
8 -> 22 [color="red"]
6 -> 23 [color="red"]
23 -> 24 [color="red"]
27 -> 26 [color="red"]
24 -> 27 [color="red"]
28 -> 27 [color="red"]
24 -> 28 [color="red"]
}
6 -> 0
7 -> 0
1 -> 0
8 -> 0
2 -> 0
5 -> 0
11 -> 1 [color = "red"]
10 -> 1 [color = "red"]
12 -> 2
13 -> 3
1 -> 4 [color = "red"]
14 -> 4 [color = "red"]
16 -> 6
15 -> 6
17 -> 7
19 -> 8
18 -> 8
20 -> 9
21 -> 9
19 -> 10 [color = "red"]
18 -> 10 [color = "red"]
9 -> 12
23 -> 13 [color = "red"]
22 -> 13 [color = "red"]
1 -> 14 [color = "red"]
24 -> 17 [color = "red"]
25 -> 18 [color = "red"]
26 -> 18 [color = "red"]
18 -> 19 [color = "red"]
28 -> 20
27 -> 20
32 -> 20
30 -> 20
33 -> 20
34 -> 20
29 -> 20
31 -> 20
1 -> 21
14 -> 21
17 -> 22 [color = "red"]
36 -> 24 [color = "red"]
35 -> 24 [color = "red"]
37 -> 25 [color = "red"]
13 -> 25 [color = "red"]
16 -> 27 [color = "red"]
15 -> 27 [color = "red"]
17 -> 28
22 -> 29
22 -> 30
13 -> 31
13 -> 32
7 -> 33
6 -> 33
38 -> 34
13 -> 34
27 -> 35 [color = "red"]
15 -> 35 [color = "red"]
27 -> 36 [color = "red"]
16 -> 36 [color = "red"]
13 -> 37 [color = "red"]
}
```
LICENSE
......
......@@ -181,6 +181,9 @@ def metrics(do_metrics=True):
return fqcr + fqcm + fqcp + coverage_stats + [stats]
localrules: gvcf_chunkfile, genotype_chunkfile
rule all:
input:
combined="multisample/genotyped.vcf.gz",
......@@ -285,14 +288,15 @@ rule align:
input:
r1 = "{sample}/pre_process/{sample}.cutadapt_R1.fastq",
r2 = "{sample}/pre_process/{sample}.cutadapt_R2.fastq",
ref = REFERENCE
ref = REFERENCE,
temp = ancient("tmp")
params:
rg = "@RG\\tID:{sample}_lib1\\tSM:{sample}\\tPL:ILLUMINA"
output: temp("{sample}/bams/{sample}.sorted.bam")
singularity: "docker://quay.io/biocontainers/mulled-v2-002f51ea92721407ef440b921fb5940f424be842:43ec6124f9f4f875515f9548733b8b4e5fed9aa6-0"
conda: "envs/bwa.yml"
shell: "bwa mem -t 8 -R '{params.rg}' {input.ref} {input.r1} {input.r2} "
"| picard SortSam CREATE_INDEX=TRUE TMP_DIR=null "
"| picard SortSam CREATE_INDEX=TRUE TMP_DIR={input.temp} "
"INPUT=/dev/stdin OUTPUT={output} SORT_ORDER=coordinate"
rule markdup:
......@@ -363,32 +367,59 @@ rule gvcf_scatter:
"-BQSR {input.bqsr}"
rule gvcf_chunkfile:
"""
Create simple text file with paths to chunks for GVCF.
This uses a run directive in stead of a shell directive because
the amount of chunks may be so large the shell would error out with
an "argument list too long" error.
See https://unix.stackexchange.com/a/120842 for more info
This also means this rule lives outside of singularity/conda and is
executed in snakemake's own environment.
"""
params:
chunkfiles = expand("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz",
chunk=CHUNKS)
output:
file = "{sample}/vcf/chunkfile.txt"
run:
with open(output.file, "w") as ohandle:
for filename in params.chunkfiles:
corrected = filename.format(sample=wildcards.sample)
ohandle.write(corrected + "\n")
rule gvcf_gather:
"""Gather all gvcf scatters"""
"""Gather all GVCF scatters"""
input:
gvcfs=expand("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz",
chunk=CHUNKS),
tbis=expand("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz.tbi",
chunk=CHUNKS),
ref=REFERENCE,
gatk=GATK
params:
gvcfs="' -V '".join(expand("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz",
chunk=CHUNKS))
gvcfs = expand("{{sample}}/vcf/{{sample}}.{chunk}.part.vcf.gz",
chunk=CHUNKS),
chunkfile = "{sample}/vcf/chunkfile.txt"
output:
gvcf="{sample}/vcf/{sample}.g.vcf.gz"
singularity: "docker://quay.io/biocontainers/gatk:3.7--py36_1"
conda: "envs/gatk.yml"
shell: "java -Xmx4G -XX:ParallelGCThreads=1 -cp {input.gatk} "
"org.broadinstitute.gatk.tools.CatVariants "
"-R {input.ref} -V '{params.gvcfs}' -out {output.gvcf} "
"-assumeSorted"
gvcf = "{sample}/vcf/{sample}.g.vcf.gz"
conda: "envs/bcftools.yml"
singularity: "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4"
shell: "bcftools concat -f {input.chunkfile} -n > {output.gvcf}"
rule gvcf_gather_tbi:
"""Index GVCF"""
input:
gvcf = "{sample}/vcf/{sample}.g.vcf.gz"
output:
tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi"
conda: "envs/tabix.yml"
singularity: "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0"
shell: "tabix -pvcf {input.gvcf}"
rule genotype_scatter:
"""Run GATK's GenotypeGVCFs by chunk"""
input:
gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz", sample=SAMPLES),
tbis = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi",
sample=SAMPLES),
ref=REFERENCE,
gatk=GATK
params:
......@@ -405,31 +436,58 @@ rule genotype_scatter:
"-V {params.li} -L '{params.chunk}' -o '{output.vcf}'"
rule genotype_chunkfile:
"""
Create simple text file with paths to chunks for genotyping
This uses a run directive in stead of a shell directive because
the amount of chunks may be so large the shell would error out with
an "argument list too long" error.
See https://unix.stackexchange.com/a/120842 for more info
This also means this rule lives outside of singularity/conda and is
executed in snakemake's own environment.
"""
params:
vcfs = expand("multisample/genotype.{chunk}.part.vcf.gz",
chunk=CHUNKS)
output:
file = "multisample/chunkfile.txt"
run:
with open(output.file, "w") as ohandle:
for filename in params.vcfs:
ohandle.write(filename + "\n")
rule genotype_gather:
"""Gather all genotyping scatters"""
"""Gather all genotyping VCFs"""
input:
vcfs=expand("multisample/genotype.{chunk}.part.vcf.gz", chunk=CHUNKS),
tbis=expand("multisample/genotype.{chunk}.part.vcf.gz.tbi",
chunk=CHUNKS),
ref=REFERENCE,
gatk=GATK
params:
vcfs="' -V '".join(expand("multisample/genotype.{chunk}.part.vcf.gz",
chunk=CHUNKS))
vcfs = expand("multisample/genotype.{chunk}.part.vcf.gz",
chunk=CHUNKS),
chunkfile = "multisample/chunkfile.txt"
output:
combined="multisample/genotyped.vcf.gz"
conda: "envs/gatk.yml"
singularity: "docker://quay.io/biocontainers/gatk:3.7--py36_1"
shell: "java -Xmx4G -XX:ParallelGCThreads=1 -cp {input.gatk} "
"org.broadinstitute.gatk.tools.CatVariants "
"-R {input.ref} -V '{params.vcfs}' -out {output.combined} "
"-assumeSorted"
vcf = "multisample/genotyped.vcf.gz"
conda: "envs/bcftools.yml"
singularity: "docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4"
shell: "bcftools concat -f {input.chunkfile} -n > {output.vcf}"
rule genotype_gather_tbi:
"""Index genotyped vcf file"""
input:
vcf = "multisample/genotyped.vcf.gz"
output:
tbi = "multisample/genotyped.vcf.gz.tbi"
conda: "envs/tabix.yml"
singularity: "docker://quay.io/biocontainers/tabix:0.2.6--ha92aebf_0"
shell: "tabix -pvcf {input.vcf}"
rule split_vcf:
"""Split multisample VCF in single samples"""
input:
vcf="multisample/genotyped.vcf.gz",
tbi = "multisample/genotyped.vcf.gz.tbi",
gatk=GATK,
ref=REFERENCE
params:
......@@ -623,6 +681,7 @@ rule vtools_coverage:
"""Calculate coverage statistics per transcript"""
input:
gvcf="{sample}/vcf/{sample}.g.vcf.gz",
tbi = "{sample}/vcf/{sample}.g.vcf.gz.tbi",
ref=get_refflatpath
output:
tsv="{sample}/coverage/{ref}.coverages.tsv"
......@@ -636,7 +695,8 @@ rule vtools_coverage:
rule vcfstats:
"""Calculate vcf statistics"""
input:
vcf="multisample/genotyped.vcf.gz"
vcf="multisample/genotyped.vcf.gz",
tbi = "multisample/genotyped.vcf.gz.tbi"
output:
stats="multisample/vcfstats.json"
singularity: "docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0"
......
name: hutspot-bcftools
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- bcftools=1.9
- bzip2=1.0.6
- ca-certificates=2019.3.9
- curl=7.64.1
- krb5=1.16.3
- libcurl=7.64.1
- libdeflate=1.0
- libedit=3.1.20170329
- libgcc-ng=8.2.0
- libssh2=1.8.2
- libstdcxx-ng=8.2.0
- ncurses=6.1
- openssl=1.1.1b
- tk=8.6.9
- xz=5.2.4
- zlib=1.2.11
\ No newline at end of file
name: hutspot-tabix
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- libgcc-ng=8.2.0
- tabix=0.2.6
- zlib=1.2.11
\ No newline at end of file
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment