Commit 6e9e769d authored by van den Berg's avatar van den Berg
Browse files

Make known variant sites optional

parent 45387234
Pipeline #2747 failed with stages
in 46 seconds
......@@ -144,9 +144,6 @@ The following configuration values are **required**:
| `REFERENCE` | Absolute path to fasta file |
| `SAMPLE_CONFIG` | Path to config file as described above |
| `GATK` | Path to GATK jar. **Must** be version 3.7 |
| `DBSNP` | Path to dbSNP VCF |
| `ONETHOUSAND` | Path to 1000Genomes VCF |
| `HAPMAP` | Path to HapMap VCF |
The following configuration options are **optional**:
......@@ -156,6 +153,7 @@ The following configuration options are **optional**:
| `FEMALE_THRESHOLD` | Float between 0 and 1 that signifies the threshold of the ratio between coverage on X/overall coverage that 'calls' a sample as female. Default = 0.6 |
| `FASTQ_COUNT` | Path to `fastq-count` executable |
| `MAX_BASES` | Maximum allowed number of bases per sample before subsampling. Default = None (no subsampling) |
| `KNOWN_SITES` | Path to one or more VCF files of known variants, to be used with baserecalibration |
## Cluster configuration
......@@ -218,9 +216,7 @@ snakemake -s Snakefile \
--config SAMPLE_CONFIG=samples.json \
REFERENCE=/path/to/genome.fasta \
GATK=/path/to/GenomeAnalysisTK.jar \
DBSNP=/path/to/dbsnp.vcf.gz \
ONETHOUSAND=/path/to/onekg.vcf \
HAPMAP=/path/to/hapmap.vcf \
KNOWN_SITES=/path/to/dbsnp.vcf.gz,/path/to/onekg.vcf,/path/to/hapmap.vcf \
FASTQ_COUNT=/path/to/fastq-count \
BED=/path/to/interesting_region.bed
```
......
......@@ -14,7 +14,7 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
Main Snakefile for the pipeline.
Main Snakefile for the pipeline.
:copyright: (c) 2017-2019 Sander Bollen
:copyright: (c) 2017-2019 Leiden University Medical Center
......@@ -41,24 +41,6 @@ if GATK is None:
if not Path(GATK).exists():
raise FileNotFoundError("{0} does not exist.".format(GATK))
DBSNP = config.get("DBSNP")
if DBSNP is None:
raise ValueError("You must set --config DBSNP=<path>")
if not Path(DBSNP).exists():
raise FileNotFoundError("{0} does not exist".format(DBSNP))
ONETHOUSAND = config.get("ONETHOUSAND")
if ONETHOUSAND is None:
raise ValueError("You must set --config ONETHOUSAND=<path>")
if not Path(ONETHOUSAND).exists():
raise FileNotFoundError("{0} does not exist".format(ONETHOUSAND))
HAPMAP = config.get("HAPMAP")
if HAPMAP is None:
raise ValueError("You must set --config HAPMAP=<path>")
if not Path(HAPMAP).exists():
raise FileNotFoundError("{0} does not exist".format(HAPMAP))
# these are all optional
BED = config.get("BED", "") # comma-separated list of BED files
REFFLAT = config.get("REFFLAT", "") # comma-separated list of refFlat files
......@@ -66,6 +48,17 @@ FEMALE_THRESHOLD = config.get("FEMALE_THRESHOLD", 0.6)
FASTQ_COUNT = config.get("FASTQ_COUNT")
MAX_BASES = config.get("MAX_BASES", "")
# Make sure all files with known sites exist
known_sites = config.get("KNOWN_SITES").split(',')
for filename in known_sites:
if not Path(filename).exists():
raise FileNotFoundError("{0} does not exist".format(DBSNP))
# Generate the input string for basrecalibration
known_sites_argument = ''
for argument, filename in zip(repeat('-knownSites'), known_sites):
known_sites_argument +=' {argument} {filename}'.format(argument, filename)
def fsrc_dir(*args):
"""Wrapper around snakemake's srcdir to work like os.path.join"""
if len(args) == 1:
......@@ -335,14 +328,14 @@ rule baserecal:
hapmap = HAPMAP
output:
grp = "{sample}/bams/{sample}.baserecal.grp"
params:
knownsites = known_sites_argument
singularity: "docker://quay.io/biocontainers/gatk:3.7--py36_1"
conda: "envs/gatk.yml"
shell: "java -XX:ParallelGCThreads=1 -jar {input.gatk} -T "
"BaseRecalibrator -I {input.bam} -o {output.grp} -nct 8 "
"-R {input.ref} -cov ReadGroupCovariate -cov QualityScoreCovariate "
"-cov CycleCovariate -cov ContextCovariate -knownSites "
"{input.dbsnp} -knownSites {input.one1kg} "
"-knownSites {input.hapmap}"
"-cov CycleCovariate -cov ContextCovariate {params.knownsites}"
rule gvcf_scatter:
"""Run HaplotypeCaller in GVCF mode by chunk"""
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment