Commit d5f5d0a1 authored by van den Berg's avatar van den Berg
Browse files

Add option to generate a true multisample VCF file

parent 7bc45284
......@@ -157,7 +157,7 @@ The following configuration options are **optional**:
| `scatter_size` | The size of chunks to divide the reference into for parallel execution. Default = 1000000000 |
| `coverage_threshold` | One or more threshold coverage values. For each value, a sample specific bed file will be created that contains the regions where the coverage is above the threshold |
| `restrict_BQSR` | Restrict GATK BaseRecalibration to a single chromosome. This is faster, but the recalibration is possibly less reliable |
| `merge_vcf` | Merge the VCF files for each sample into a single multisample VCF file |
| `multisample_vcf` | Create a true multisample VCF file, in addition to the regular per-sample VCF files |
## Cluster configuration
......
......@@ -44,7 +44,7 @@ rule all:
gvcf_tbi = expand("{s}/vcf/{s}.g.vcf.gz.tbi", s=config["samples"]),
coverage_stats = coverage_stats,
coverage_files = coverage_files,
merged_vcf = "merged_multisample.vcf.gz" if config["merge_vcf"] else []
multisample_vcf = "multisample.vcf.gz" if config["multisample_vcf"] else []
rule create_tmp:
"""
......@@ -522,22 +522,22 @@ rule gvcf2coverage:
shell:
"gvcf2coverage -t {wildcards.threshold} < {input} 2> {log} | cut -f 1,2,3 > {output}"
rule merge_vcf:
""" Merge all vcf files into a single multisample vcf """
rule multisample_vcf:
""" Generate a true multisample VCF file with all samples """
input:
vcfs = expand("{sample}/vcf/{sample}.vcf.gz", sample=config["samples"])
gvcfs = expand("{sample}/vcf/{sample}.g.vcf.gz", sample=config["samples"]),
tbis = expand("{sample}/vcf/{sample}.g.vcf.gz.tbi", sample=config["samples"]),
ref = config["reference"]
params:
gvcf_files = lambda wc: expand("-V {sample}/vcf/{sample}.g.vcf.gz", sample=config["samples"]),
output:
"merged_multisample.vcf.gz"
"multisample.vcf.gz"
log:
"log/merged_multisample.log"
"log/multisample.log"
container:
containers["bcftools"]
containers["gatk"]
threads:
8
shell:
"bcftools merge --merge both "
"--output-type z "
"--output {output} "
"--threads 8 "
"{input} 2> {log} && "
"bcftools index --tbi --thread 8 {output}"
shell: "java -jar -Xmx15G -XX:ParallelGCThreads=1 /usr/GenomeAnalysisTK.jar -T "
"GenotypeGVCFs -R {input.ref} "
"{params.gvcf_files} -o '{output}'"
......@@ -59,7 +59,7 @@ def process_config():
# Set the default config values
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)
set_default('merge_vcf', False)
set_default('multisample_vcf', False)
# Hide the absolute path so the snakemake linter doesn't cry about it
set_default('gatk_jar', os.path.join(os.path.sep,'usr','GenomeAnalysisTK.jar'))
......
......@@ -16,7 +16,7 @@
"coverage_threshold",
"restrict_BQSR",
"gatk_jar",
"merge_vcf",
"multisample_vcf",
"baitsfile"
],
"properties": {
......@@ -80,8 +80,8 @@
"description": "Restrict BQSR to the listed chromosome",
"type": "string"
},
"merge_vcf": {
"description": "Create a merged output vcf file containing all samples",
"multisample_vcf": {
"description": "Create a true multisample VCF file, in addition to the regular per-sample VCF files",
"type": "boolean"
},
"refflat": {
......
......@@ -22,5 +22,5 @@
"known_sites": ["tests/data/reference/database.vcf.gz"],
"targetsfile": "tests/data/reference/full_chrM.bed",
"baitsfile": "tests/data/reference/target_baits.bed",
"merge_vcf": true
"multisample_vcf": true
}
......@@ -17,7 +17,7 @@
- micro/bams/micro.insert_size_metrics
must_not_contain:
- rror
- rule merge_vcf
- rule multisample_vcf
stderr:
must_not_contain:
- rror
......@@ -54,14 +54,14 @@
- 'output: micro/vcf/micro_120.bed'
- 'output: micro/vcf/micro_196.bed'
- name: dry-run-merge
- name: dry-run-multisample
tags:
- dry-run
command: >
snakemake -s Snakefile -n --configfile
tests/data/config/sample_config_merge_samples.json
tests/data/config/sample_config_multisample.json
exit_code: 0
stdout:
contains:
- Job counts
- rule merge_vcf
- rule multisample_vcf
......@@ -307,13 +307,13 @@
- path: 'micro/bams/micro.hs_metrics.txt'
- path: 'multiqc_report/multiqc_data/multiqc_picard_HsMetrics.json'
- name: integration-merge
- name: integration-multisample
tags:
- integration
command: >
snakemake --use-singularity --singularity-args ' --containall --bind /tmp '
--jobs 1 -w 120 -r -p
--configfile tests/data/config/sample_config_merge_samples.json
--configfile tests/data/config/sample_config_multisample.json
files:
- path: 'merged_multisample.vcf.gz'
- path: 'merged_multisample.vcf.gz.tbi'
- path: 'multisample.vcf.gz'
- path: 'multisample.vcf.gz.tbi'
......@@ -68,12 +68,12 @@
contains:
- 'Invalid --configfile: sample names should not overlap ("micro1" is contained in "micro12")'
- name: sanity-merge
- name: sanity-multisample
tags:
- sanity
command: >
snakemake -s Snakefile -n --configfile
tests/data/config/sample_config_merge_samples.json
tests/data/config/sample_config_multisample.json
- name: sanity-snakemake-lint
tags:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment