Unverified commit 51781bb6 authored by van den Berg, committed by GitHub

Merge pull request #10 from LUMC/develop

Full rewrite of the pipeline
parents 39111a1f 0056c068
Showing with 1073 additions and 1281 deletions
### Checklist
- [ ] Pull request details were added to CHANGELOG.md.
- [ ] New tests have been added to the matrix section of the
.github/workflows/ci.yml file.
name: Continuous Integration
on: [push, pull_request]
defaults:
run:
# This is needed for miniconda, see:
# https://github.com/marketplace/actions/setup-miniconda#important.
shell: bash -l {0}
jobs:
tests:
runs-on: ubuntu-latest
strategy:
matrix:
test:
- sanity-snakemake
- sanity-snakemake-lint
- sanity-singularity
- sanity-no-reference
- sanity-reference-does-not-exist
- sanity-baits-only
- sanity-targets-only
- sanity-samples-overlapping-name
- sanity-multisample
- dry-run-vanilla
- dry-run-target-baits
- dry-run-bed-coverage
- dry-run-multisample
- integration-vanilla
- integration-small-scatter
- integration-refflat
- integration-all-on-target
- integration-gene-bedfile
- integration-two-known-sites
- integration-two-readgroups
- integration-two-samples
- integration-target-baits
- integration-bed-coverage
- integration-restrict-BQSR
- integration-targets-only
- integration-multisample
steps:
- uses: actions/checkout@v2
- name: Install singularity
uses: eWaterCycle/setup-singularity@v6
with:
singularity-version: 3.6.4
- name: Cache conda environment
uses: actions/cache@v2
env:
cache-name: cache-conda-environment
# Increase this value to reset the cache without changing
# environment.yml
cache-number: 0
with:
path: ~/conda_pkgs_dir
key: build-${{ env.cache-name }}-${{ env.cache-number }}-${{ hashFiles('environment.yml') }}
- name: Install miniconda
uses: conda-incubator/setup-miniconda@v2.0.1
# https://github.com/conda-incubator/setup-miniconda.
# https://github.com/marketplace/actions/setup-miniconda
with:
activate-environment: hutspot
environment-file: environment.yml
auto-activate-base: false
use-only-tar-bz2: true
- name: Run test in conda environment
# Use --symlink to limit disk usage.
run: >-
pytest --keep-workflow-wd-on-fail --tag ${{ matrix.test }} tests/
- name: Check pipeline stderr messages in case of failure
if: ${{ failure() }}
run: >-
bash -c '
for file in $(find /tmp/pytest_workflow_* -name log.err); do
echo $file; cat $file
done
'
- name: Check pipeline stdout messages in case of failure
if: ${{ failure() }}
run: >-
bash -c '
for file in $(find /tmp/pytest_workflow_* -name log.out); do
echo $file; cat $file
done
'
- name: Check all job log files in case of failure
if: ${{ failure() }}
run: >-
bash -c '
for file in $(find /tmp/pytest_workflow_*/${{ matrix.test }}/log/ -type f); do
echo $file; cat $file
done
'
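Each entry in the test matrix above corresponds to a pytest-workflow tag. To run a single test locally (a sketch, assuming the `hutspot` conda environment from `environment.yml` is active and, for the integration tests, singularity is installed):

```bash
# Run one tagged workflow test; --keep-workflow-wd-on-fail preserves the
# working directories of failed workflows for inspection.
pytest --keep-workflow-wd-on-fail --tag dry-run-vanilla tests/
```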
@@ -3,20 +3,32 @@ variables:
.docker_before_script_anchor: &docker_before_script_anchor
before_script:
- pip install -r requirements.txt
- pip install -r requirements-dev.txt
- pip3 install -r requirements.txt
- pip3 install -r requirements-dev.txt
.singularity_before_script_anchor: &singularity_before_script_anchor
before_script:
- export BASETEMP=$RUN_BASE_DIR/$CI_COMMIT_REF_NAME/$CI_JOB_ID
- source ${CONDA_SH}
- conda activate hutspot-pipeline || conda create -n hutspot-pipeline --file requirements.txt --file requirements-dev.txt -y && conda activate hutspot-pipeline
- export PATH=${PATH}:${SINGULARITY_PATH}
- echo "#!/usr/bin/env bash" > snakemake
- echo "$(which snakemake) --profile slurm-test \"\$@\"" >> snakemake
- chmod +x snakemake
- export PATH=$(pwd):${PATH}
- hash -r
stages:
- sanity
- dry-run
- integration
- functional
test_sanities:
<<: *docker_before_script_anchor
script:
- py.test --tag sanity
image: python:3.6-stretch
- pytest --tag sanity --workflow-threads 8
image: lumc/singularity-snakemake:3.5.2-5.15.0
tags:
- docker
stage: sanity
@@ -24,49 +36,17 @@ test_sanities:
test_dry_run:
<<: *docker_before_script_anchor
script:
- py.test --tag dry-run
image: python:3.6-stretch
- pytest --tag dry-run --workflow-threads 8
image: lumc/singularity-snakemake:3.5.2-5.15.0
tags:
- docker
stage: dry-run
# This requires a privileged docker container.
# Most docker runners will not allow this.
test_integration_singularity:
before_script:
- apt-get update && apt-get install -y python3-pip
- pip3 install pyfaidx
- pip3 install -r requirements-dev.txt
script:
- py.test --tag singularity-integration
image: lumc/singularity-snakemake:3.0.3-5.4.0
tags:
- docker
stage: integration
test_integration:
before_script:
- export BASETEMP=$(mktemp -p ${RUN_BASE_DIR} -d)
<<: *singularity_before_script_anchor
script:
- source ${CONDA_SH}
- conda activate hutspot-pipeline
- export PATH=${PATH}:${CONDA_EXTRA_PATH}
- py.test --tag integration --basetemp ${BASETEMP} --keep-workflow-wd
- pytest --tag integration --basetemp ${BASETEMP} --keep-workflow-wd --workflow-threads 8
tags:
- slurm
stage: integration
test_functional:
before_script:
- export BASETEMP=$(mktemp -p ${RUN_BASE_DIR} -d)
script:
- source ${CONDA_SH}
- conda activate hutspot-pipeline
- export PATH=${PATH}:${CONDA_EXTRA_PATH}
- py.test --tag functional --basetemp ${BASETEMP} --keep-workflow-wd
tags:
- slurm
stage: functional
only:
- schedules
\ No newline at end of file
Changelog
==========
<!--
Newest changes should be on top.
This document is user facing. Please word the changes in such a way
that users understand how the changes affect the new version.
-->
v2.0.1
---------------------------
+ Switch to using chunked-scatter
v2.0.0
---------------------------
+ Add an environment.yml file for conda.
+ Greatly simplified the snakemake workflow.
+ All statistics are now calculated using existing tools.
+ Add option `multisample_vcf` to enable joint variant calling.
@@ -3,99 +3,78 @@
# Hutspot
This is a multisample DNA variant calling pipeline based on Snakemake, bwa and
the GATK HaplotypeCaller.
## Features
* Any number of samples is supported
* Whole-genome calling, regardless of wet-lab library preparation.
* Follows modern best practices
* Each sample is individually called as a GVCF.
* A VCF is then produced by genotyping the individual GVCFs separately
for each sample.
* Data parallelization for calling and genotyping steps.
* Using ~100 chunks, we call an entire exome in ~15 minutes!
* Using the `scatter_size` setting in the configuration file, the reference
genome is split into chunks, and each chunk can be processed
independently. The default value of 1 billion will scatter the human
reference genome into 6 chunks.
* Reasonably fast.
* 96 exomes in < 24 hours.
* No unnecessary jobs
* Calculate coverage metrics if a `bedfile` is specified.
* Fully containerized rules through singularity and biocontainers. Legacy
conda environments are no longer available.
# Installation
This repository contains a [conda](https://conda.io/docs/)
environment file that you can use to install all dependencies in a
conda environment:
```bash
conda env create -f environment.yml
```
Alternatively, you can set up a python virtualenv and run
```bash
pip install -r requirements.txt
```
## Singularity
We highly recommend the use of the containerized rules through
[singularity](https://www.sylabs.io/singularity/).
This option does require you to install singularity on your system. As this
usually requires administrative privileges, singularity is not contained
within our provided conda environment file.
If you want to use singularity, make sure you install version 3 or higher.
### Debian
If you happen to use Debian buster, singularity 3.0.3 comes straight out
of the box with a simple:
```bash
sudo apt install singularity-container
```
### Docker
You can run singularity within a docker container. Please note that
the container **MUST** run in privileged mode for this to work.
We have provided our own container that includes singularity and snakemake
[here](https://hub.docker.com/r/lumc/singularity-snakemake).
### Manual install
If you don't use Debian buster and cannot run a privileged docker container,
you - unfortunately :-( - will have to install singularity manually.
Please see the installation instructions
[here](https://github.com/sylabs/singularity/blob/master/INSTALL.md) on how
to do that.
## GATK
For license reasons, conda and singularity cannot fully install the GATK. The JAR
must be registered by running `gatk-register` after the environment is
created, which conflicts with the automated environment/container creation.
For this reason, hutspot **requires** you to manually specify the path to
the GATK executable JAR via `--config GATK=/path/to/gatk.jar`.
## Operating system
Hutspot was tested on Ubuntu 16.04 only.
It should reasonably work on most modern Linux distributions.
# Requirements
@@ -103,24 +82,31 @@ For every sample you wish to analyze, we require one or more paired end
readgroups in fastq format. They must be compressed with either `gzip` or
`bgzip`.
The configuration must be passed to the pipeline through a configuration file.
This is a json file listing the samples and their associated readgroups
as well as the other settings to be used.
An example config json can be found [here](config/example.json), and a
json schema describing the configuration file can be found [here](config/schema.json).
This json schema can also be used to validate your configuration file.
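For example, a configuration file can be checked against the schema before starting a run. This is a minimal sketch using the `jsonschema` package that the pipeline itself uses for validation; `config.json` is a placeholder for your own file:

```bash
python - <<'EOF'
# Validate a pipeline configuration against the bundled JSON schema.
import json
import jsonschema

with open('config/schema.json') as fin:
    schema = json.load(fin)
with open('config.json') as fin:  # placeholder: your own configuration
    config = json.load(fin)

jsonschema.validate(config, schema)
print('config.json passes schema validation')
EOF
```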
## Reference files
The following reference files **must** be provided in the configuration:
1. `reference`: A reference genome, in fasta format. Must be indexed with
`samtools faidx`.
2. `dbsnp`: A dbSNP VCF file.
3. `known_sites`: One or more VCF files with known sites for base
recalibration.
The following reference files **may** be provided:
1. `targetsfile`: Bed file of the targets of the capture kit. Used to calculate coverage.
2. `baitsfile`: Bed file of the baits of the capture kit. Used to calculate picard HsMetrics.
3. `refflat`: A refFlat file to calculate coverage over transcripts.
4. `scatter_size`: Size of the chunks to split the variant calling into.
5. `female_threshold`: Fraction of reads between X and the autosomes to call as
female.
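Put together, a minimal configuration providing the required reference files could look as follows. This is a sketch only; every path is a placeholder:

```bash
cat > config.json <<'EOF'
{
  "samples": {
    "sample_01": {
      "read_groups": {
        "lib_01": {
          "R1": "/path/to/sample_01_R1.fq.gz",
          "R2": "/path/to/sample_01_R2.fq.gz"
        }
      }
    }
  },
  "reference": "/path/to/reference.fasta",
  "dbsnp": "/path/to/dbsnp.vcf.gz",
  "known_sites": ["/path/to/dbsnp.vcf.gz"],
  "targetsfile": "/path/to/targets.bed"
}
EOF
```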
# How to run
@@ -131,7 +117,7 @@ the pipeline can be started with:
```bash
snakemake -s Snakefile \
--use-singularity \
--configfile tests/data/config/sample_config.json
```
This would start all jobs locally. Obviously this is not what one would
@@ -139,26 +125,31 @@ regularly do for a normal pipeline run. How to submit jobs on a cluster is
described later. Let's first move on to the necessary configuration values.
## Configuration values
The required and optional configuration values are specified in the json
schema located in `config/schema.json`. Before running, the content of the
`--configfile` is validated against this schema.
The following configuration values are **required**:
| configuration | description |
| ------------- | ----------- |
| `reference` | Absolute path to fasta file |
| `samples` | One or more samples, with associated fastq files |
| `dbsnp` | Path to dbSNP VCF file |
| `known_sites` | Path to one or more VCF files with known sites. Can be the same as the `dbsnp` file |
The following configuration options are **optional**:
| configuration | description |
| ------------- | ----------- |
| `targetsfile` | Bed file of the targets of the capture kit. Used to calculate coverage |
| `baitsfile` | Bed file of the baits of the capture kit. Used to calculate picard HsMetrics |
| `female_threshold` | Float between 0 and 1 that signifies the threshold of the ratio between coverage on X/overall coverage that 'calls' a sample as female. Default = 0.6 |
| `scatter_size` | The size of chunks to divide the reference into for parallel execution. Default = 1000000000 |
| `coverage_threshold` | One or more threshold coverage values. For each value, a sample specific bed file will be created that contains the regions where the coverage is above the threshold |
| `restrict_BQSR` | Restrict GATK BaseRecalibration to a single chromosome. This is faster, but the recalibration is possibly less reliable |
| `multisample_vcf` | Create a true multisample VCF file, in addition to the regular per-sample VCF files |
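Optional values can be set in the configuration file, or overridden on the command line with snakemake's `--config` flag. A sketch, with an illustrative value:

```bash
snakemake -s Snakefile \
    --use-singularity \
    --configfile config.json \
    --config scatter_size=100000000
```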
## Cluster configuration
@@ -166,12 +157,12 @@ The following configuration options are **optional**:
To run on a cluster, snakemake needs to be called with some extra arguments.
Additionally, it needs a cluster yaml file describing resources per job.
If you run on a cluster with drmaa support, an environment variable named
`DRMAA_LIBRARY_PATH` must be in the executing shell environment. This variable
points to the `.so` file of the DRMAA library.
An sge-cluster.yml is bundled with this pipeline in the cluster directory.
It is optimized for SGE clusters, where the default vmem limit is 4G.
If you run SLURM, or any other cluster system, you will have to write your own
cluster yaml file. Please see the [snakemake documentation](http://snakemake.readthedocs.io/en/stable/snakefiles/configuration.html#cluster-configuration)
for details on how to do so. Given the provided sge-cluster.yml, activating the
@@ -183,23 +174,29 @@ snakemake -s Snakefile \
--drmaa ' -pe <PE_NAME> {cluster.threads} -q all.q -l h_vmem={cluster.vmem} -cwd -V -N hutspot' \
```
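For the DRMAA support mentioned above, export the library path before starting snakemake. A sketch; the actual `.so` location depends on your cluster installation:

```bash
# Point snakemake's --drmaa support at the cluster's DRMAA library.
export DRMAA_LIBRARY_PATH=/usr/lib/libdrmaa.so
```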
## Limitations
Sample names should be unique, and not overlap (such as `sample1` and
`sample10`). This is due to the way output files are parsed by multiQC:
when sample names overlap, the json output for picard DuplicationMetrics cannot
be parsed unambiguously.
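A quick pre-flight check for overlapping names, mirroring the substring rule the pipeline's own validation uses (`config.json` is a placeholder for your own configuration file):

```bash
python - <<'EOF'
# Report sample names that are substrings of other sample names.
import itertools
import json

with open('config.json') as fin:  # placeholder: your own configuration
    samples = json.load(fin)['samples']

for s1, s2 in itertools.permutations(samples, 2):
    if s1 in s2:
        print(f'"{s1}" is contained in "{s2}"')
EOF
```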
## Binding additional directories under singularity
In singularity mode, snakemake binds the location of itself in the container.
The current working directory is also visible directly in the container.
In many cases, this is not enough, and will result in `FileNotFoundError`s.
E.g., suppose you run your pipeline in `/runs`, but your fastq files live in
`/fastq` and your reference genome lives in `/genomes`. We would have to bind
`/fastq` and `/genomes` in the container.
This can be accomplished with `--singularity-args`, which accepts a simple
string of arguments passed to singularity. E.g. in the above example,
we could do:
```bash
snakemake -s Snakefile \
--use-singularity \
--singularity-args ' --bind /fastq:/fastq --bind /genomes:/genomes '
```
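Alternatively, the bind paths can be supplied through singularity's own environment. A sketch; `SINGULARITY_BIND` is read by singularity itself, not by snakemake:

```bash
export SINGULARITY_BIND="/fastq,/genomes"
snakemake -s Snakefile --use-singularity
```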
@@ -218,38 +215,16 @@ snakemake -s Snakefile \
-w 120 \
--max-jobs-per-second 30 \
--restart-times 2 \
--configfile config.json
```
## Using conda instead of singularity
Legacy conda environments are also available for each and every rule.
Simply use `--use-conda` instead of `--use-singularity` to enable conda
environments.
As dependency conflicts can and do arise with conda, it is recommended to
combine this flag with `--conda-prefix`, such that you only have to
build the environments once.
The conda environments use the same versions of tools as the singularity
containers, bar one:
* `fastqc` uses version 0.11.5 on conda, but 0.11.7 on singularity.
# Graph
Below you can see the rule graph of the pipeline. The main variant calling flow
is highlighted in red. This only shows dependencies
between rules, and not between jobs. The actual job graph is considerably
more complex, as nearly all rules are duplicated by sample and some
(the scatter jobs) additionally by chunk.
As a rough estimate of the total number of jobs in the pipeline you can use
the following formula:
@@ -271,111 +246,76 @@ Having trouble viewing the graph? See [this](img/rulegraph.svg) static SVG instead.
```plantuml
digraph snakemake_dag {
graph[bgcolor=white, margin=0];
rankdir=LR;
node[shape=box, style=rounded, fontname=sans, fontsize=10, penwidth=2];
edge[penwidth=2, color=grey];
0[label = "all", color = "0.62 0.6 0.85", style="rounded"];
1[label = "genotype_gather", color = "0.31 0.6 0.85", style="rounded"];
2[label = "multiqc", color = "0.14 0.6 0.85", style="rounded"];
3[label = "bai", color = "0.41 0.6 0.85", style="rounded"];
4[label = "split_vcf", color = "0.53 0.6 0.85", style="rounded"];
5[label = "fastqc_raw", color = "0.63 0.6 0.85", style="rounded"];
6[label = "fastqc_merged", color = "0.24 0.6 0.85", style="rounded"];
7[label = "fastqc_postqc", color = "0.26 0.6 0.85", style="rounded"];
8[label = "vtools_coverage", color = "0.58 0.6 0.85", style="rounded"];
9[label = "merge_stats", color = "0.36 0.6 0.85", style="rounded"];
10[label = "genotype_scatter", color = "0.09 0.6 0.85", style="rounded"];
11[label = "genotype_chunkfile", color = "0.29 0.6 0.85", style="rounded"];
12[label = "stats_tsv", color = "0.51 0.6 0.85", style="rounded"];
13[label = "markdup", color = "0.55 0.6 0.85", style="rounded"];
14[label = "genotype_gather_tbi", color = "0.19 0.6 0.85", style="rounded"];
15[label = "merge_r1", color = "0.60 0.6 0.85", style="rounded"];
16[label = "merge_r2", color = "0.10 0.6 0.85", style="rounded"];
17[label = "cutadapt", color = "0.17 0.6 0.85", style="rounded"];
18[label = "gvcf_gather", color = "0.32 0.6 0.85", style="rounded"];
19[label = "gvcf_gather_tbi", color = "0.27 0.6 0.85", style="rounded"];
20[label = "collectstats", color = "0.03 0.6 0.85", style="rounded"];
21[label = "vcfstats", color = "0.00 0.6 0.85", style="rounded"];
22[label = "align", color = "0.05 0.6 0.85", style="rounded"];
23[label = "create_markdup_tmp", color = "0.44 0.6 0.85", style="rounded"];
24[label = "sickle", color = "0.39 0.6 0.85", style="rounded"];
25[label = "gvcf_scatter", color = "0.02 0.6 0.85", style="rounded"];
26[label = "gvcf_chunkfile", color = "0.56 0.6 0.85", style="rounded"];
27[label = "fqcount_preqc", color = "0.38 0.6 0.85", style="rounded"];
28[label = "fqcount_postqc", color = "0.12 0.6 0.85", style="rounded"];
29[label = "mapped_num", color = "0.50 0.6 0.85", style="rounded"];
30[label = "mapped_basenum", color = "0.43 0.6 0.85", style="rounded"];
31[label = "unique_num", color = "0.65 0.6 0.85", style="rounded"];
32[label = "usable_basenum", color = "0.22 0.6 0.85", style="rounded"];
33[label = "fastqc_stats", color = "0.46 0.6 0.85", style="rounded"];
34[label = "covstats", color = "0.07 0.6 0.85", style="rounded"];
35[label = "seqtk_r1", color = "0.34 0.6 0.85", style="rounded"];
36[label = "seqtk_r2", color = "0.21 0.6 0.85", style="rounded"];
37[label = "baserecal", color = "0.48 0.6 0.85", style="rounded"];
38[label = "genome", color = "0.15 0.6 0.85", style="rounded"];
9 -> 0
4 -> 0 [color = "red"]
0[label = "all", color = "0.30 0.6 0.85", style="rounded"];
1[label = "multiqc", color = "0.60 0.6 0.85", style="rounded"];
2[label = "merge_stats", color = "0.17 0.6 0.85", style="rounded"];
3[label = "bai", color = "0.09 0.6 0.85", style="rounded"];
4[label = "genotype_gather\nsample: micro", color = "0.06 0.6 0.85", style="rounded"];
5[label = "gvcf_gather\nsample: micro", color = "0.32 0.6 0.85", style="rounded"];
6[label = "fastqc_raw\nsample: micro", color = "0.00 0.6 0.85", style="rounded"];
7[label = "fastqc_merged", color = "0.11 0.6 0.85", style="rounded"];
8[label = "fastqc_postqc", color = "0.02 0.6 0.85", style="rounded"];
9[label = "stats_tsv", color = "0.45 0.6 0.85", style="rounded"];
10[label = "collectstats", color = "0.24 0.6 0.85", style="rounded"];
11[label = "vcfstats\nsampel: micro", color = "0.52 0.6 0.85", style="rounded"];
12[label = "markdup", color = "0.47 0.6 0.85", style="rounded"];
13[label = "scatterregions", color = "0.56 0.6 0.85", style="rounded"];
14[label = "merge_r1\nsample: micro", color = "0.65 0.6 0.85", style="rounded"];
15[label = "merge_r2\nsample: micro", color = "0.26 0.6 0.85", style="rounded"];
16[label = "cutadapt", color = "0.22 0.6 0.85", style="rounded"];
17[label = "fqcount_preqc", color = "0.37 0.6 0.85", style="rounded"];
18[label = "fqcount_postqc", color = "0.58 0.6 0.85", style="rounded"];
19[label = "mapped_reads_bases", color = "0.43 0.6 0.85", style="rounded"];
20[label = "unique_reads_bases", color = "0.34 0.6 0.85", style="rounded"];
21[label = "fastqc_stats", color = "0.13 0.6 0.85", style="rounded"];
22[label = "covstats", color = "0.39 0.6 0.85", style="rounded"];
23[label = "align", color = "0.49 0.6 0.85", style="rounded"];
24[label = "create_markdup_tmp", color = "0.41 0.6 0.85", style="rounded,dashed"];
25[label = "sickle", color = "0.19 0.6 0.85", style="rounded"];
26[label = "genome", color = "0.62 0.6 0.85", style="rounded"];
1 -> 0
2 -> 0
3 -> 0
4 -> 0
5 -> 0
6 -> 0
7 -> 0
1 -> 0
8 -> 0
2 -> 0
5 -> 0
11 -> 1 [color = "red"]
10 -> 1 [color = "red"]
12 -> 2
13 -> 3
1 -> 4 [color = "red"]
14 -> 4 [color = "red"]
16 -> 6
15 -> 6
17 -> 7
19 -> 8
18 -> 8
20 -> 9
21 -> 9
19 -> 10 [color = "red"]
18 -> 10 [color = "red"]
9 -> 12
23 -> 13 [color = "red"]
22 -> 13 [color = "red"]
1 -> 14 [color = "red"]
24 -> 17 [color = "red"]
25 -> 18 [color = "red"]
26 -> 18 [color = "red"]
18 -> 19 [color = "red"]
28 -> 20
27 -> 20
32 -> 20
30 -> 20
33 -> 20
34 -> 20
29 -> 20
31 -> 20
1 -> 21
14 -> 21
17 -> 22 [color = "red"]
36 -> 24 [color = "red"]
35 -> 24 [color = "red"]
37 -> 25 [color = "red"]
13 -> 25 [color = "red"]
16 -> 27 [color = "red"]
15 -> 27 [color = "red"]
17 -> 28
22 -> 29
22 -> 30
13 -> 31
13 -> 32
7 -> 33
6 -> 33
38 -> 34
13 -> 34
27 -> 35 [color = "red"]
15 -> 35 [color = "red"]
27 -> 36 [color = "red"]
16 -> 36 [color = "red"]
13 -> 37 [color = "red"]
9 -> 1
10 -> 2
11 -> 2
12 -> 3
13 -> 4
13 -> 5
14 -> 7
15 -> 7
16 -> 8
2 -> 9
17 -> 10
18 -> 10
19 -> 10
20 -> 10
21 -> 10
22 -> 10
4 -> 11
23 -> 12
24 -> 12
25 -> 16
14 -> 17
15 -> 17
16 -> 18
23 -> 19
12 -> 20
7 -> 21
8 -> 21
12 -> 22
26 -> 22
16 -> 23
24 -> 23
14 -> 25
15 -> 25
}
```
This diff is collapsed.
@@ -6,8 +6,17 @@ __default__:
align:
threads: 8
vmem: 4G
bed_to_interval:
threads: 1
vmem: 16G
hs_metrics:
threads: 1
vmem: 20G
markdup:
vmem: 10G
vmem: 20G
multiple_metrics:
threads: 1
vmem: 20G
baserecal:
threads: 8
vmem: 6G
@@ -22,8 +31,16 @@ genotype_scatter:
genotype_gather:
vmem: 10G
covstats:
vmem: 6G
multiqc:
vmem: 20G
multiqc:
vmem: 30G
split_vcf:
vmem: 20G
fastqc:
threads: 4
vmem: 8G
scatterregions:
vmem: 30G
merge_vcf:
threads: 8
vmem: 10G
__default__:
job_name: hutspot
threads: 1
vmem: 4G
queue: all
time: 00:30:00
align:
threads: 8
vmem: 4G
time: 0-2
baserecal:
threads: 8
vmem: 6G
time: 0-2
covstats:
vmem: 6G
cutadapt:
threads: 8
time: 0-2
fastqc_raw:
threads: 4
time: 0-1
fastqc_merged:
threads: 4
time: 0-1
fastqc_postqc:
threads: 4
time: 0-1
fqcount_postqc:
time: 0-1
gvcf_scatter:
vmem: 20G
time: 0-1
gvcf_gather:
vmem: 10G
genotype_scatter:
vmem: 20G
time: 0-1
genotype_gather:
vmem: 10G
markdup:
vmem: 20G
time: 0-1
multiqc:
vmem: 30G
time: 0-1
sickle:
time: 0-1
split_vcf:
vmem: 20G
vcfstats:
time: 0-1
import itertools
import json
import jsonschema
import os
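# Biocontainer and Docker Hub images used by the rules; singularity pulls
# these when the pipeline is run with --use-singularity.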
containers = {
'bcftools': 'docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4',
'bedtools-2.26-python-2.7': 'docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0',
'bwa-0.7.17-samtools-1.10': 'docker://quay.io/biocontainers/mulled-v2-ad317f19f5881324e963f6a6d464d696a2825ab6:c59b7a73c87a9fe81737d5d628e10a3b5807f453-0',
'chunked-scatter': 'docker://quay.io/biocontainers/chunked-scatter:1.0.0--py_0',
'cutadapt': 'docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0',
'debian': 'docker://debian:buster-slim',
'fastqc': 'docker://quay.io/biocontainers/fastqc:0.11.7--4',
'gatk': 'docker://broadinstitute/gatk3:3.7-0',
'gvcf2coverage': 'docker://lumc/gvcf2coverage:0.1-dirty-2',
'multiqc': 'docker://quay.io/biocontainers/multiqc:1.8--py_2',
'picard': 'docker://quay.io/biocontainers/picard:2.22.8--0',
'python3': 'docker://python:3.6-slim',
'vtools': 'docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0'
}
def process_config():
""" Process the config file and set the default values """
def set_default(key, value):
"""Set default config values"""
if key not in config:
config[key] = value
# Read the json schema
with open(srcdir('config/schema.json'), 'rt') as fin:
schema = json.load(fin)
# Validate the config against the schema
try:
jsonschema.validate(config, schema)
except jsonschema.ValidationError as e:
raise jsonschema.ValidationError(f'Invalid --configfile: {e.message}')
# If you specify a baitsfile, you also have to specify a targets file for
# picard
if 'baitsfile' in config and 'targetsfile' not in config:
msg = 'Invalid --configfile: "baitsfile" specified without "targetsfile"'
raise jsonschema.ValidationError(msg)
# If you specify a target file but no baitsfile, we use the targets as
# baits. This is needed because picard HsMetrics needs both a baitfile and
# targets file as input
if 'targetsfile' in config and 'baitsfile' not in config:
set_default('baitsfile', config['targetsfile'])
# A sample name cannot be a substring of another sample, since that breaks picard
# metrics parsing by multiqc
msg = 'Invalid --configfile: sample names should not overlap ("{s1}" is contained in "{s2}")'
for s1, s2 in itertools.permutations(config['samples'], 2):
if s1 in s2:
raise jsonschema.ValidationError(msg.format(s1=s1, s2=s2))
# Set the default config values
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)
set_default('multisample_vcf', False)
# Hide the absolute path so the snakemake linter doesn't cry about it
set_default('gatk_jar', os.path.join(os.path.sep, 'usr', 'GenomeAnalysisTK.jar'))
def coverage_stats(wildcards):
files = expand("{sample}/coverage/refFlat_coverage.tsv",
sample=config["samples"])
return files if "refflat" in config else []
def coverage_files(wildcards):
""" Return a list of all coverage files
The coverage is calculated for each sample, for each specified threshold
"""
# We only calculate the coverage when this is specified in the
# configuration
if 'coverage_threshold' not in config:
return list()
# Fetch the values we need from the configuration
samples = config['samples']
thresholds = config['coverage_threshold']
files = list()
for sample, threshold in itertools.product(samples, thresholds):
files.append(f'{sample}/vcf/{sample}_{threshold}.bed')
return files
def sample_bamfiles(wildcards):
""" Determine the bam files for a sample (one for each readgroup)
"""
files = list()
sample = config['samples'][wildcards.sample]
sample_name = wildcards.sample
for read_group in sample['read_groups']:
files.append(f'{sample_name}/bams/{sample_name}-{read_group}.sorted.bam')
return files
def gather_gvcf(wildcards):
""" Gather the gvcf files based on the scatterregions checkpoint
This depends on the 'scatter_size' parameter and the reference genome
used.
"""
checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
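    # The checkpoint writes one scatter-{i}.bed file per chunk; glob_wildcards
    # recovers the chunk indices so the matching per-chunk GVCFs can be listed.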
return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
def gather_gvcf_tbi(wildcards):
""" Gather the gvcf index files based on the scatterregions checkpoint
This depends on the 'scatter_size' parameter and the reference genome
used.
"""
checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
def gather_vcf(wildcards):
""" Gather the vcf files based on the scatterregions checkpoint
This depends on the 'scatter_size' parameter and the reference genome
used.
"""
checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
def gather_vcf_tbi(wildcards):
""" Gather the vcf index files based on the scatterregions checkpoint
This depends on the 'scatter_size' parameter and the reference genome
used.
"""
checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
def sample_cutadapt_files(wildcards):
""" Determine the cutadapt log files files for a sample (one for each
readgroup).
"""
files = list()
sample = config['samples'][wildcards.sample]
sample_name = wildcards.sample
for read_group in sample['read_groups']:
files.append(f'{sample_name}/pre_process/{sample_name}-{read_group}.txt')
return files
def all_trimmed_fastqc(wildcards):
""" Determine the trimmed fastq files for each sample """
fastq_files = list()
for sample in config['samples']:
for read_group in config['samples'][sample]['read_groups']:
fastq_files.append(f"{sample}/pre_process/trimmed-{sample}-{read_group}/.done")
return fastq_files
{
"samples": {
"sample_01": {
"libraries": {
"read_groups": {
"lib_l1": {
"R1": "1.fq.gz",
"R2": "2.fq.gz"
@@ -13,12 +13,19 @@
}
},
"sample_02": {
"libraries": {
"read_groups": {
"lib_l1": {
"R1": "3.1.fq.gz",
"R2": "3.2.fq.gz"
}
}
}
}
},
"reference": "/path/to/ref",
"dbsnp": "/path/to/vcf1",
"known_sites": ["/path/to/vcf1", "/path/to/vcf2"],
"scatter_size": 1000000000,
"female_threshold": 0.6,
"bedfile": "/path/to/bed",
"refflat": "/path/to/refflat"
}
@@ -2,16 +2,32 @@
"$schema": "http://json-schema.org/draft-04/schema#",
"description": "JSON schema for samples config for the hutspot pipeline",
"type": "object",
"required": ["samples"],
"additionalProperties": false,
"required": [
"samples",
"reference",
"dbsnp",
"known_sites"
],
"optional": [
"scatter_size",
"female_threshold",
"bedfile",
"coverage_threshold",
"restrict_BQSR",
"gatk_jar",
"multisample_vcf",
"baitsfile"
],
"properties": {
"samples": {
"type": "object",
"additionalProperties": {
"description": "sample object",
"type": "object",
"required": ["libraries"],
"required": ["read_groups"],
"properties": {
"libraries": {
"read_groups": {
"type": "object",
"additionalProperties": {
"description": "library",
@@ -25,6 +41,52 @@
}
}
}
}
},
"reference": {
"description": "Reference fasta file to map against",
"type": "string"
},
"dbsnp": {
"description": "VCF file to be used to annotate variants",
"type": "string"
},
"known_sites": {
"description": "VCF files of known sites, to be used to recalibrate the quality scores",
"type": "array",
"minItems": 1
},
"scatter_size": {
"description": "Size of the chunks to split the variant calling into",
"type": "integer"
},
"female_threshold": {
"description": "Fraction of reads between X and the autosomes to call as female",
"type": "number"
},
"targetsfile": {
"description": "Bed file of the targets of the capture kit. Used to calculate coverage",
"type": "string"
},
"baitsfile": {
"description": "Bed file of the baits of the capture kit. Used to calculate picard HsMetrics",
"type": "string"
},
"coverage_threshold": {
"description": "One or more thresholds to calculate coverage for, one bedfile per value per sample",
"type": "array",
"minItems": 1
},
"restrict_BQSR": {
"description": "Restrict BQSR to the listed chromosome",
"type": "string"
},
"multisample_vcf": {
"description": "Create a true multisample VCF file, in addition to the regular per-sample VCF files",
"type": "boolean"
},
"refflat": {
"description": "RefFlat file with transcripts",
"type": "string"
}
}
}
# This file may be used to create an environment using:
# $ conda env create --file environment.yml
# platform: linux-64
name: hutspot
channels:
- conda-forge
- bioconda
- defaults
- conda-forge
dependencies:
- aioeasywebdav=2.2.0
- aiohttp=3.5.4
- appdirs=1.4.3
- asn1crypto=0.24.0
- async-timeout=3.0.1
- attrs=19.1.0
- bcrypt=3.1.4
- boto3=1.9.138
- botocore=1.12.138
- ca-certificates=2019.3.9
- cachetools=2.1.0
- cairo=1.16.0
- certifi=2019.3.9
- cffi=1.12.3
- chardet=3.0.4
- click=7.0
- configargparse=0.13.0
- cryptography=2.6.1
- datrie=0.7.1
- decorator=4.4.0
- docutils=0.14
- dropbox=9.2.0
- expat=2.2.5
- filechunkio=1.8
- fontconfig=2.13.1
- freetype=2.10.0
- ftputil=3.4
- gettext=0.19.8.1
- gitdb2=2.0.5
- gitpython=2.1.11
- glib=2.58.3
- google-api-core=1.10.0
- google-auth=1.6.3
- google-cloud-core=0.29.1
- google-cloud-storage=1.15.0
- google-resumable-media=0.3.2
- googleapis-common-protos=1.5.9
- graphite2=1.3.13
- graphviz=2.38.0
- harfbuzz=2.4.0
- icu=58.2
- idna=2.8
- idna_ssl=1.1.0
- jinja2=2.10.1
- jmespath=0.9.4
- jpeg=9c
- jsonschema=3.0.1
- libblas=3.8.0
- libcblas=3.8.0
- libffi=3.2.1
- libgcc-ng=8.2.0
- libgfortran=3.0.0
- libiconv=1.15
- liblapack=3.8.0
- libpng=1.6.37
- libprotobuf=3.7.1
- libstdcxx-ng=8.2.0
- libtiff=4.0.10
- libtool=2.4.6
- libuuid=2.32.1
- libxcb=1.13
- libxml2=2.9.9
- markupsafe=1.1.1
- multidict=4.5.2
- ncurses=6.1
- networkx=2.3
- numpy=1.16.3
- openblas=0.3.5
- openssl=1.1.1b
- pandas=0.24.2
- pango=1.40.14
- paramiko=2.4.2
- pcre=8.41
- pip=19.1
- pixman=0.34.0
- prettytable=0.7.2
- protobuf=3.7.1
- psutil=5.6.2
- pthread-stubs=0.4
- pyasn1=0.4.4
- pyasn1-modules=0.2.4
- pycparser=2.19
- pyfaidx=0.5.0
- pygraphviz=1.5
- pynacl=1.3.0
- pyopenssl=19.0.0
- pyrsistent=0.15.1
- pysftp=0.2.9
- pysocks=1.6.8
- python=3.6.7
- python-dateutil=2.8.0
- python-irodsclient=0.7.0
- pytz=2019.1
- pyyaml=5.1
- ratelimiter=1.2.0
- readline=7.0
- requests=2.21.0
- rsa=3.4.2
- s3transfer=0.2.0
- setuptools=41.0.1
- six=1.12.0
- smmap2=2.0.5
- snakemake=5.4.5
- snakemake-minimal=5.4.5
- sqlite=3.26.0
- tk=8.6.9
- typing_extensions=3.7.2
- urllib3=1.24.2
- wheel=0.33.1
- wrapt=1.11.1
- xmlrunner=1.7.7
- xorg-kbproto=1.0.7
- xorg-libice=1.0.9
- xorg-libsm=1.2.3
- xorg-libx11=1.6.7
- xorg-libxau=1.0.9
- xorg-libxdmcp=1.1.3
- xorg-libxext=1.3.4
- xorg-libxpm=3.5.12
- xorg-libxrender=0.9.10
- xorg-libxt=1.1.5
- xorg-renderproto=0.11.1
- xorg-xextproto=7.3.0
- xorg-xproto=7.0.31
- xz=5.2.4
- yaml=0.1.7
- yarl=1.3.0
- zlib=1.2.11
- pytest-workflow>=1.4.0
- snakemake-minimal
- boto3
- smart_open
name: hutspot-bcftools
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- bcftools=1.9
- bzip2=1.0.6
- ca-certificates=2019.3.9
- curl=7.64.1
- krb5=1.16.3
- libcurl=7.64.1
- libdeflate=1.0
- libedit=3.1.20170329
- libgcc-ng=8.2.0
- libssh2=1.8.2
- libstdcxx-ng=8.2.0
- ncurses=6.1
- openssl=1.1.1b
- tk=8.6.9
- xz=5.2.4
- zlib=1.2.11
\ No newline at end of file
name: hutspot-bwa
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- bwa=0.7.16
- ca-certificates=2017.7.27.1
- certifi=2017.7.27.1
- libgcc=5.2.0
- ncurses=5.9
- openjdk=8.0.121
- openssl=1.0.2l
- perl=5.22.0.1
- picard=2.14
- pip=9.0.1
- python=3.6.3
- readline=6.2
- setuptools=36.6.0
- sqlite=3.13.0
- tk=8.5.19
- wheel=0.30.0
- xz=5.2.3
- zlib=1.2.8
name: hutspot-collectstats
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- ca-certificates=2017.11.5
- certifi=2017.11.5
- click=6.7
- ncurses=5.9
- openssl=1.0.2l
- pip=9.0.1
- python=3.6.3
- readline=6.2
- setuptools=36.7.2
- sqlite=3.13.0
- tk=8.5.19
- wheel=0.30.0
- xz=5.2.3
- zlib=1.2.11
name: hutspot-covstat
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- backports=1.0
- backports.functools_lru_cache=1.4
- backports_abc=0.5
- bedtools=2.26.0
- blas=1.1
- ca-certificates=2017.11.5
- certifi=2017.11.5
- cycler=0.10.0
- dbus=1.10.22
- expat=2.2.1
- fontconfig=2.12.6
- freetype=2.8.1
- functools32=3.2.3.2
- gettext=0.19.7
- glib=2.53.5
- gst-plugins-base=1.8.0
- gstreamer=1.8.0
- icu=58.2
- jpeg=9b
- libffi=3.2.1
- libgcc=5.2.0
- libgcc-ng=8.2.0
- libgfortran=3.0.0
- libiconv=1.15
- libpng=1.6.34
- libstdcxx-ng=8.2.0
- libxcb=1.12
- libxml2=2.9.5
- matplotlib=2.1.0
- ncurses=5.9
- numpy=1.13.3
- openblas=0.2.20
- openssl=1.0.2l
- pcre=8.39
- pip=9.0.1
- pyparsing=2.2.0
- pyqt=5.6.0
- python=2.7.14
- python-dateutil=2.8.0
- pytz=2017.3
- qt=5.6.2
- readline=7.0
- setuptools=36.7.2
- singledispatch=3.4.0.3
- sip=4.18
- six=1.11.0
- sqlite=3.20.1
- ssl_match_hostname=3.5.0.1
- subprocess32=3.2.7
- tk=8.6.9
- tornado=4.5.2
- wheel=0.30.0
- xorg-libxau=1.0.8
- xorg-libxdmcp=1.1.2
- xz=5.2.3
- zlib=1.2.11
name: hutspot-cutadapt
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- ca-certificates=2017.7.27.1
- certifi=2017.7.27.1
- cutadapt=1.14
- ncurses=5.9
- openssl=1.0.2l
- pip=9.0.1
- python=3.6.3
- readline=6.2
- setuptools=36.6.0
- sqlite=3.13.0
- tk=8.5.19
- wheel=0.30.0
- xopen=0.1.1
- xz=5.2.3
- zlib=1.2.11
name: hutspot-fastq-count
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- fastq-count=0.1.0
- libgcc-ng=8.2.0
name: hutspot-fastqc
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- fastqc=0.11.5
- openjdk=8.0.121
- perl=5.22.0.1
name: hutspot-gatk
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- bzip2=1.0.6
- ca-certificates=2017.7.27.1
- certifi=2017.7.27.1
- gatk=3.7
- ncurses=5.9
- openjdk=8.0.121
- openssl=1.0.2l
- pip=9.0.1
- python=3.6.3
- readline=6.2
- setuptools=36.6.0
- sqlite=3.13.0
- tk=8.5.19
- wheel=0.30.0
- xz=5.2.3
- zlib=1.2.11