Unverified commit 51781bb6, authored by van den Berg, committed by GitHub

Merge pull request #10 from LUMC/develop

Full rewrite of the pipeline
parents 39111a1f 0056c068
Pipeline #5310 failed with stages in 26 seconds
### Checklist
- [ ] Pull request details were added to CHANGELOG.md.
- [ ] New tests have been added to the matrix section of the
.github/workflows/ci.yml file.
.github/workflows/ci.yml:

name: Continuous Integration
on: [push, pull_request]

defaults:
  run:
    # This is needed for miniconda, see:
    # https://github.com/marketplace/actions/setup-miniconda#important.
    shell: bash -l {0}

jobs:
  tests:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        test:
          - sanity-snakemake
          - sanity-snakemake-lint
          - sanity-singularity
          - sanity-no-reference
          - sanity-reference-does-not-exist
          - sanity-baits-only
          - sanity-targets-only
          - sanity-samples-overlapping-name
          - sanity-multisample
          - dry-run-vanilla
          - dry-run-target-baits
          - dry-run-bed-coverage
          - dry-run-multisample
          - integration-vanilla
          - integration-small-scatter
          - integration-refflat
          - integration-all-on-target
          - integration-gene-bedfile
          - integration-two-known-sites
          - integration-two-readgroups
          - integration-two-samples
          - integration-target-baits
          - integration-bed-coverage
          - integration-restrict-BQSR
          - integration-targets-only
          - integration-multisample
    steps:
      - uses: actions/checkout@v2
      - name: Install singularity
        uses: eWaterCycle/setup-singularity@v6
        with:
          singularity-version: 3.6.4
      - name: Cache conda environment
        uses: actions/cache@v2
        env:
          cache-name: cache-conda-environment
          # Increase this value to reset the cache without changing
          # environment.yml.
          cache-number: 0
        with:
          path: ~/conda_pkgs_dir
          key: build-${{ env.cache-name }}-${{ env.cache-number }}-${{ hashFiles('environment.yml') }}
      - name: Install miniconda
        uses: conda-incubator/setup-miniconda@v2.0.1
        # https://github.com/conda-incubator/setup-miniconda
        # https://github.com/marketplace/actions/setup-miniconda
        with:
          activate-environment: hutspot
          environment-file: environment.yml
          auto-activate-base: false
          use-only-tar-bz2: true
      - name: Run test in conda environment
        # Use --symlink to limit disk usage.
        run: >-
          pytest --keep-workflow-wd-on-fail --tag ${{ matrix.test }} tests/
      - name: Check pipeline stderr messages in case of failure
        if: ${{ failure() }}
        run: >-
          bash -c '
          for file in $(find /tmp/pytest_workflow_* -name log.err); do
          echo $file; cat $file
          done
          '
      - name: Check pipeline stdout messages in case of failure
        if: ${{ failure() }}
        run: >-
          bash -c '
          for file in $(find /tmp/pytest_workflow_* -name log.out); do
          echo $file; cat $file
          done
          '
      - name: Check all job log files in case of failure
        if: ${{ failure() }}
        run: >-
          bash -c '
          for file in $(find /tmp/pytest_workflow_*/${{ matrix.test }}/log/ -type f); do
          echo $file; cat $file
          done
          '
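Any single matrix entry can be reproduced outside CI. A minimal sketch, assuming the conda environment from `environment.yml` has been created and activated as in the workflow above:

```bash
# Run one test tag locally; --keep-workflow-wd-on-fail preserves the
# working directory of a failing workflow for inspection.
pytest --keep-workflow-wd-on-fail --tag integration-vanilla tests/
```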
.gitlab-ci.yml:

@@ -3,20 +3,32 @@ variables:
 .docker_before_script_anchor: &docker_before_script_anchor
   before_script:
-    - pip install -r requirements.txt
-    - pip install -r requirements-dev.txt
+    - pip3 install -r requirements.txt
+    - pip3 install -r requirements-dev.txt
+
+.singularity_before_script_anchor: &singularity_before_script_anchor
+  before_script:
+    - export BASETEMP=$RUN_BASE_DIR/$CI_COMMIT_REF_NAME/$CI_JOB_ID
+    - source ${CONDA_SH}
+    - conda activate hutspot-pipeline || conda create -n hutspot-pipeline --file requirements.txt --file requirements-dev.txt -y && conda activate hutspot-pipeline
+    - export PATH=${PATH}:${SINGULARITY_PATH}
+    - echo "#!/usr/bin/env bash" > snakemake
+    - echo "$(which snakemake) --profile slurm-test \"\$@\"" >> snakemake
+    - chmod +x snakemake
+    - export PATH=$(pwd):${PATH}
+    - hash -r
 
 stages:
   - sanity
   - dry-run
   - integration
-  - functional
 
 test_sanities:
   <<: *docker_before_script_anchor
   script:
-    - py.test --tag sanity
-  image: python:3.6-stretch
+    - pytest --tag sanity --workflow-threads 8
+  image: lumc/singularity-snakemake:3.5.2-5.15.0
   tags:
     - docker
   stage: sanity

@@ -24,49 +36,17 @@ test_sanities:
 test_dry_run:
   <<: *docker_before_script_anchor
   script:
-    - py.test --tag dry-run
-  image: python:3.6-stretch
+    - pytest --tag dry-run --workflow-threads 8
+  image: lumc/singularity-snakemake:3.5.2-5.15.0
   tags:
     - docker
   stage: dry-run
 
-# this requires a priviliged docker container.
-# most docker runners will not do this
-test_integration_singularity:
-  before_script:
-    - apt-get update && apt-get install -y python3-pip
-    - pip3 install pyfaidx
-    - pip3 install -r requirements-dev.txt
-  script:
-    - py.test --tag singularity-integration
-  image: lumc/singularity-snakemake:3.0.3-5.4.0
-  tags:
-    - docker
-  stage: integration
-
 test_integration:
-  before_script:
-    - export BASETEMP=$(mktemp -p ${RUN_BASE_DIR} -d)
+  <<: *singularity_before_script_anchor
   script:
-    - source ${CONDA_SH}
-    - conda activate hutspot-pipeline
-    - export PATH=${PATH}:${CONDA_EXTRA_PATH}
-    - py.test --tag integration --basetemp ${BASETEMP} --keep-workflow-wd
+    - pytest --tag integration --basetemp ${BASETEMP} --keep-workflow-wd --workflow-threads 8
   tags:
     - slurm
   stage: integration
-
-test_functional:
-  before_script:
-    - export BASETEMP=$(mktemp -p ${RUN_BASE_DIR} -d)
-  script:
-    - source ${CONDA_SH}
-    - conda activate hutspot-pipeline
-    - export PATH=${PATH}:${CONDA_EXTRA_PATH}
-    - py.test --tag functional --basetemp ${BASETEMP} --keep-workflow-wd
-  tags:
-    - slurm
-  stage: functional
-  only:
-    - schedules
\ No newline at end of file
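The new `singularity_before_script_anchor` relies on a PATH-shadowing trick: it writes a wrapper script named `snakemake` that always injects the `slurm-test` profile, so test invocations transparently submit to the cluster. The same pattern in isolation:

```bash
# Shadow the real snakemake with a shim that always adds a cluster profile.
real_snakemake=$(which snakemake)
echo '#!/usr/bin/env bash' > snakemake
echo "${real_snakemake} --profile slurm-test \"\$@\"" >> snakemake
chmod +x snakemake
export PATH=$(pwd):${PATH}  # the shim now shadows the real binary
hash -r                     # make bash forget its cached lookup of snakemake
```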
CHANGELOG.md:

Changelog
==========
<!--
Newest changes should be on top.
This document is user facing. Please word the changes in such a way
that users understand how the changes affect the new version.
-->
v2.0.1
---------------------------
+ Switch to using chunked-scatter
v2.0.0
---------------------------
+ Add an environment.yml file for conda.
+ Greatly simplified the snakemake workflow.
+ All statistics are now calculated using existing tools.
+ Add option `multisample_vcf` to enable joint variant calling.
README.md:

@@ -3,57 +3,46 @@
 # Hutspot
 
-This is a multisample DNA variant calling pipeline based on Snakemake, bwa and the
-GATK HaplotypeCaller.
+This is a multisample DNA variant calling pipeline based on Snakemake, bwa and
+the GATK HaplotypeCaller.
 
 ## Features
 
 * Any number of samples is supported
 * Whole-genome calling, regardless of wet-lab library preparation.
 * Follows modern best practices
   * Each sample is individually called as a GVCF.
-  * A multisample VCF is then produced by genotyping the collection of GVCFs.
+  * A VCF is then produced by genotyping the individual GVCFs separately
+    for each sample.
 * Data parallelization for calling and genotyping steps.
-  * Using ~100 chunks, we call an entire exome in ~15 minutes!
+  * Using the `scatter_size` setting in the configuration file, the reference
+    genome is split into chunks, and each chunk can be processed
+    independently. The default value of 1 billion will scatter the human
+    reference genome into 6 chunks.
 * Reasonably fast.
   * 96 exomes in < 24 hours.
 * No unnecessary jobs
-* Coverage metrics for any number of bed files.
+* Calculate coverage metrics if a `bedfile` is specified.
 * Fully containerized rules through singularity and biocontainers. Legacy
-  conda environments are available as well.
+  conda environments are no longer available.
-* Optionally sub-sample inputs when number of bases exceeds a user-defined
-  threshold.
 
 # Installation
 
-To run this pipeline you will need the following at minimum:
-
-* python 3.6
-* snakemake 5.2.0 or newer
-* pyfaidx
-
 This repository contains a [conda](https://conda.io/docs/)
-environment file that you can use to install all minimum dependencies in a
+environment file that you can use to install all dependencies in a
 conda environment:
 
 ```bash
 conda env create -f environment.yml
 ```
-
-Alternatively, you can set up a python virtualenv and run
-
-```bash
-pip install -r requirements.txt
-```
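Once created, the environment has to be activated before the pipeline can be run. A minimal sketch; the environment name `hutspot` is taken from the CI configuration and may differ if `environment.yml` changes:

```bash
conda activate hutspot
snakemake --version  # quick check that the environment is active
```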
 ## Singularity
 
 We highly recommend the use of the containerized rules through
 [singularity](https://www.sylabs.io/singularity/).
 
-This option does, however,
-require you to install singularity on your system. As this usually requires
-administrative privileges, singularity is not contained within our provided
-conda environment file.
+This option does require you to install singularity on your system. As this
+usually requires administrative privileges, singularity is not contained
+within our provided conda environment file.
 
 If you want to use singularity, make sure you install version 3 or higher.

@@ -82,45 +71,42 @@ Please see the installation instructions
 to do that.
 
-## GATK
-
-For license reasons, conda and singularity cannot fully install the GATK. The JAR
-must be registered by running `gatk-register` after the environment is
-created, which conflicts with the automated environment/container creation.
-
-For this reason, hutspot **requires** you to manually specify the path to
-the GATK executable JAR via `--config GATK=/path/to/gatk.jar`.
-
 ## Operating system
 
 Hutspot was tested on Ubuntu 16.04 only.
 It should reasonably work on most modern Linux distributions.
 
 # Requirements
 
 For every sample you wish to analyze, we require one or more paired-end
 readgroups in fastq format. They must be compressed with either `gzip` or
 `bgzip`.
 
-Samples must be passed to the pipeline through a config file. This is a
-simple json file listing the samples and their associated readgroups/libraries.
+The configuration must be passed to the pipeline through a configuration file.
+This is a json file listing the samples and their associated readgroups,
+as well as the other settings to be used.
 
 An example config json can be found [here](config/example.json), and a
 json schema describing the configuration file can be found [here](config/schema.json).
 This json schema can also be used to validate your configuration file.
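For example, validation could be scripted with the `jsonschema` Python package. A minimal sketch, assuming your configuration lives in `config.json` (both the file paths and the package being installed are assumptions):

```bash
python3 - <<'EOF'
import json
from jsonschema import validate  # assumes the jsonschema package is installed

with open("config.json") as f:         # hypothetical path to your config
    config = json.load(f)
with open("config/schema.json") as f:  # schema shipped with the repository
    schema = json.load(f)

validate(instance=config, schema=schema)  # raises ValidationError on problems
print("config.json is valid")
EOF
```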
 ## Reference files
 
-The following reference files **must** be provided:
+The following reference files **must** be provided in the configuration:
 
-1. A reference genome, in fasta format. Must be indexed with `samtools faidx`.
-2. A dbSNP VCF file
-3. A VCF file from 1000Genomes
-4. A VCF file from the HapMap project.
+1. `reference`: A reference genome, in fasta format. Must be indexed with
+   `samtools faidx`.
+2. `dbsnp`: A dbSNP VCF file
+3. `known_sites`: One or more VCF files with known sites for base
+   recalibration
 
 The following reference files **may** be provided:
 
-1. Any number of BED files to calculate coverage on.
+1. `targetsfile`: Bed file of the targets of the capture kit. Used to calculate coverage.
+2. `baitsfile`: Bed file of the baits of the capture kit. Used to calculate picard HsMetrics.
+3. `refflat`: A refFlat file to calculate coverage over transcripts.
+4. `scatter_size`: Size of the chunks to split the variant calling into.
+5. `female_threshold`: Fraction of reads between X and the autosomes to call a
+   sample as female.
 
 # How to run
 
@@ -131,7 +117,7 @@ the pipeline can be started with:
 
 ```bash
 snakemake -s Snakefile \
   --use-singularity \
-  --config <CONFIGURATION VALUES>
+  --configfile tests/data/config/sample_config.json
 ```
 
 This would start all jobs locally. Obviously this is not what one would
@@ -139,26 +125,31 @@ regularly do for a normal pipeline run. How to submit jobs on a cluster is
 described later. Let's first move on to the necessary configuration values.
 
 ## Configuration values
 
+The required and optional configuration values are specified in the json
+schema located in `config/schema.json`. Before running, the content of the
+`--configfile` is validated against this schema.
+
 The following configuration values are **required**:
 
 | configuration | description |
 | ------------- | ----------- |
-| `REFERENCE` | Absolute path to fasta file |
-| `SAMPLE_CONFIG` | Path to config file as described above |
-| `GATK` | Path to GATK jar. **Must** be version 3.7 |
-| `DBSNP` | Path to dbSNP VCF |
-| `ONETHOUSAND` | Path to 1000Genomes VCF |
-| `HAPMAP` | Path to HapMap VCF |
+| `reference` | Absolute path to fasta file |
+| `samples` | One or more samples, with associated fastq files |
+| `dbsnp` | Path to dbSNP VCF file |
+| `known_sites` | Path to one or more VCF files with known sites. Can be the same as the `dbsnp` file |
 
 The following configuration options are **optional**:
 
 | configuration | description |
 | ------------- | ----------- |
-| `BED` | Comma-separate list of paths to BED files of interest |
-| `FEMALE_THRESHOLD` | Float between 0 and 1 that signifies the threshold of the ratio between coverage on X/overall coverage that 'calls' a sample as female. Default = 0.6 |
-| `FASTQ_COUNT` | Path to `fastq-count` executable |
-| `MAX_BASES` | Maximum allowed number of bases per sample before subsampling. Default = None (no subsampling) |
+| `targetsfile` | Bed file of the targets of the capture kit. Used to calculate coverage |
+| `baitsfile` | Bed file of the baits of the capture kit. Used to calculate picard HsMetrics |
+| `female_threshold` | Float between 0 and 1 that signifies the threshold of the ratio between coverage on X/overall coverage that 'calls' a sample as female. Default = 0.6 |
+| `scatter_size` | The size of chunks to divide the reference into for parallel execution. Default = 1000000000 |
+| `coverage_threshold` | One or more threshold coverage values. For each value, a sample-specific bed file will be created that contains the regions where the coverage is above the threshold |
+| `restrict_BQSR` | Restrict GATK BaseRecalibration to a single chromosome. This is faster, but the recalibration is possibly less reliable |
+| `multisample_vcf` | Create a true multisample VCF file, in addition to the regular per-sample VCF files |
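Putting the required values together, a minimal configuration might look like the sketch below. The exact nesting of `samples` and its readgroup keys is an assumption for illustration; consult `config/example.json` and `config/schema.json` for the authoritative layout.

```json
{
    "samples": {
        "sample1": {
            "read_groups": {
                "lib01": {
                    "R1": "/path/to/sample1_lib01_R1.fastq.gz",
                    "R2": "/path/to/sample1_lib01_R2.fastq.gz"
                }
            }
        }
    },
    "reference": "/path/to/reference.fasta",
    "dbsnp": "/path/to/dbsnp.vcf.gz",
    "known_sites": ["/path/to/dbsnp.vcf.gz"],
    "scatter_size": 1000000000
}
```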
 ## Cluster configuration
 
@@ -183,6 +174,12 @@ snakemake -s Snakefile \
 --drmaa ' -pe <PE_NAME> {cluster.threads} -q all.q -l h_vmem={cluster.vmem} -cwd -V -N hutspot' \
 ```
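The `{cluster.threads}` and `{cluster.vmem}` placeholders in the `--drmaa` string are filled in from a cluster configuration file, typically passed with snakemake's `--cluster-config` option. A hypothetical sketch; the rule names and resource values are illustrative only:

```json
{
    "__default__": {
        "threads": 1,
        "vmem": "4G"
    },
    "align": {
        "threads": 8,
        "vmem": "32G"
    }
}
```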
+## Limitations
+
+Sample names should be unique, and not overlap (such as `sample1` and
+`sample10`). This is due to the way output files are parsed by MultiQC:
+when sample names overlap, the json output for picard DuplicationMetrics
+cannot be parsed unambiguously.
+
 ## Binding additional directories under singularity
 
 In singularity mode, snakemake binds the location of itself in the container.
 
@@ -218,34 +215,12 @@ snakemake -s Snakefile \
 -w 120 \
 --max-jobs-per-second 30 \
 --restart-times 2 \
---config SAMPLE_CONFIG=samples.json \
-REFERENCE=/path/to/genome.fasta \
-GATK=/path/to/GenomeAnalysisTK.jar \
-DBSNP=/path/to/dbsnp.vcf.gz \
-ONETHOUSAND=/path/to/onekg.vcf \
-HAPMAP=/path/to/hapmap.vcf \
-FASTQ_COUNT=/path/to/fastq-count \
-BED=/path/to/interesting_region.bed
+--configfile config.json
 ```
 
-## Using conda instead of singularity
-
-Legacy conda environments are also available for each and every rule.
-Simply use `--use-conda` instead of `--use-singularity` to enable conda
-environments.
-
-As dependency conflicts can and do arise with conda, it is recommended to
-combine this flag with `--conda-prefix`, such that you only have to
-build the environments once.
-
-The conda environments use the same versions of tools as the singularity
-containers, bar one:
-
-* `fastqc` uses version 0.11.5 on conda, but 0.11.7 on singularity.
-
 # Graph
 
-Below you can see the rulegraph of the pipeline. The main variant calling flow
+Below you can see the rule graph of the pipeline. The main variant calling flow
 is highlighted in red. This only shows dependencies
 between rules, and not between jobs. The actual job graph is considerably
 more complex, as nearly all rules are duplicated by sample and some
 
@@ -271,111 +246,76 @@ Having trouble viewing the graph? See [this](img/rulegraph.svg) static SVG instead.
 ```plantuml
 digraph snakemake_dag {
     graph[bgcolor=white, margin=0];
-    rankdir=LR;
     node[shape=box, style=rounded, fontname=sans, fontsize=10, penwidth=2];
     edge[penwidth=2, color=grey];
-    0[label = "all", color = "0.62 0.6 0.85", style="rounded"];
-    1[label = "genotype_gather", color = "0.31 0.6 0.85", style="rounded"];
-    2[label = "multiqc", color = "0.14 0.6 0.85", style="rounded"];
-    3[label = "bai", color = "0.41 0.6 0.85", style="rounded"];
-    4[label = "split_vcf", color = "0.53 0.6 0.85", style="rounded"];
-    5[label = "fastqc_raw", color = "0.63 0.6 0.85", style="rounded"];
-    6[label = "fastqc_merged", color = "0.24 0.6 0.85", style="rounded"];
-    7[label = "fastqc_postqc", color = "0.26 0.6 0.85", style="rounded"];
-    8[label = "vtools_coverage", color = "0.58 0.6 0.85", style="rounded"];
-    9[label = "merge_stats", color = "0.36 0.6 0.85", style="rounded"];
-    10[label = "genotype_scatter", color = "0.09 0.6 0.85", style="rounded"];
-    11[label = "genotype_chunkfile", color = "0.29 0.6 0.85", style="rounded"];
-    12[label = "stats_tsv", color = "0.51 0.6 0.85", style="rounded"];
-    13[label = "markdup", color = "0.55 0.6 0.85", style="rounded"];
-    14[label = "genotype_gather_tbi", color = "0.19 0.6 0.85", style="rounded"];
-    15[label = "merge_r1", color = "0.60 0.6 0.85", style="rounded"];
-    16[label = "merge_r2", color = "0.10 0.6 0.85", style="rounded"];
-    17[label = "cutadapt", color = "0.17 0.6 0.85", style="rounded"];
-    18[label = "gvcf_gather", color = "0.32 0.6 0.85", style="rounded"];
-    19[label = "gvcf_gather_tbi", color = "0.27 0.6 0.85", style="rounded"];
-    20[label = "collectstats", color = "0.03 0.6 0.85", style="rounded"];
-    21[label = "vcfstats", color = "0.00 0.6 0.85", style="rounded"];
-    22[label = "align", color = "0.05 0.6 0.85", style="rounded"];
-    23[label = "create_markdup_tmp", color = "0.44 0.6 0.85", style="rounded"];
-    24[label = "sickle", color = "0.39 0.6 0.85", style="rounded"];
-    25[label = "gvcf_scatter", color = "0.02 0.6 0.85", style="rounded"];
-    26[label = "gvcf_chunkfile", color = "0.56 0.6 0.85", style="rounded"];
-    27[label = "fqcount_preqc", color = "0.38 0.6 0.85", style="rounded"];
-    28[label = "fqcount_postqc", color = "0.12 0.6 0.85", style="rounded"];
-    29[label = "mapped_num", color = "0.50 0.6 0.85", style="rounded"];
-    30[label = "mapped_basenum", color = "0.43 0.6 0.85", style="rounded"];
-    31[label = "unique_num", color = "0.65 0.6 0.85", style="rounded"];
-    32[label = "usable_basenum", color = "0.22 0.6 0.85", style="rounded"];
-    33[label = "fastqc_stats", color = "0.46 0.6 0.85", style="rounded"];
-    34[label = "covstats", color = "0.07 0.6 0.85", style="rounded"];
-    35[label = "seqtk_r1", color = "0.34 0.6 0.85", style="rounded"];
-    36[label = "seqtk_r2", color = "0.21 0.6 0.85", style="rounded"];
-    37[label = "baserecal", color = "0.48 0.6 0.85", style="rounded"];
-    38[label = "genome", color = "0.15 0.6 0.85", style="rounded"];
-    9 -> 0
-    4 -> 0 [color = "red"]
+    0[label = "all", color = "0.30 0.6 0.85", style="rounded"];
+    1[label = "multiqc", color = "0.60 0.6 0.85", style="rounded"];
+    2[label = "merge_stats", color = "0.17 0.6 0.85", style="rounded"];
+    3[label = "bai", color = "0.09 0.6 0.85", style="rounded"];
+    4[label = "genotype_gather\nsample: micro", color = "0.06 0.6 0.85", style="rounded"];
+    5[label = "gvcf_gather\nsample: micro", color = "0.32 0.6 0.85", style="rounded"];
+    6[label = "fastqc_raw\nsample: micro", color = "0.00 0.6 0.85", style="rounded"];
+    7[label = "fastqc_merged", color = "0.11 0.6 0.85", style="rounded"];
+    8[label = "fastqc_postqc", color = "0.02 0.6 0.85", style="rounded"];
+    9[label = "stats_tsv", color = "0.45 0.6 0.85", style="rounded"];
+    10[label = "collectstats", color = "0.24 0.6 0.85", style="rounded"];
+    11[label = "vcfstats\nsample: micro", color = "0.52 0.6 0.85", style="rounded"];
+    12[label = "markdup", color = "0.47 0.6 0.85", style="rounded"];
+    13[label = "scatterregions", color = "0.56 0.6 0.85", style="rounded"];
+    14[label = "merge_r1\nsample: micro", color = "0.65 0.6 0.85", style="rounded"];
+    15[label = "merge_r2\nsample: micro", color = "0.26 0.6 0.85", style="rounded"];
+    16[label = "cutadapt", color = "0.22 0.6 0.85", style="rounded"];
+    17[label = "fqcount_preqc", color = "0.37 0.6 0.85", style="rounded"];
+    18[label = "fqcount_postqc", color = "0.58 0.6 0.85", style="rounded"];
+    19[label = "mapped_reads_bases", color = "0.43 0.6 0.85", style="rounded"];
+    20[label = "unique_reads_bases", color = "0.34 0.6 0.85", style="rounded"];
+    21[label = "fastqc_stats", color = "0.13 0.6 0.85", style="rounded"];
+    22[label = "covstats", color = "0.39 0.6 0.85", style="rounded"];
+    23[label = "align", color = "0.49 0.6 0.85", style="rounded"];
+    24[label = "create_markdup_tmp", color = "0.41 0.6 0.85", style="rounded,dashed"];
+    25[label = "sickle", color = "0.19 0.6 0.85", style="rounded"];
+    26[label = "genome", color = "0.62 0.6 0.85", style="rounded"];
+    1 -> 0
+    2 -> 0
     3 -> 0
+    4 -> 0
+    5 -> 0
     6 -> 0
     7 -> 0
-    1 -> 0
     8 -> 0
-    2 -> 0
-    5 -> 0
-    11 -> 1 [color = "red"]
-    10 -> 1 [color = "red"]
-    12 -> 2
-    13 -> 3
+    9 -> 1
+    10 -> 2
+    11 -> 2
+    12 -> 3
+    13 -> 4
+    13 -> 5