Unverified commit 51781bb6 authored by van den Berg, committed by GitHub

Merge pull request #10 from LUMC/develop

Full rewrite of the pipeline
parents 39111a1f 0056c068
Showing with 1073 additions and 1281 deletions
### Checklist
- [ ] Pull request details were added to CHANGELOG.md.
- [ ] New tests have been added to the matrix section of the
.github/workflows/ci.yml file.
name: Continuous Integration
on: [push, pull_request]
defaults:
run:
# This is needed for miniconda, see:
# https://github.com/marketplace/actions/setup-miniconda#important.
shell: bash -l {0}
jobs:
tests:
runs-on: ubuntu-latest
strategy:
matrix:
test:
- sanity-snakemake
- sanity-snakemake-lint
- sanity-singularity
- sanity-no-reference
- sanity-reference-does-not-exist
- sanity-baits-only
- sanity-targets-only
- sanity-samples-overlapping-name
- sanity-multisample
- dry-run-vanilla
- dry-run-target-baits
- dry-run-bed-coverage
- dry-run-multisample
- integration-vanilla
- integration-small-scatter
- integration-refflat
- integration-all-on-target
- integration-gene-bedfile
- integration-two-known-sites
- integration-two-readgroups
- integration-two-samples
- integration-target-baits
- integration-bed-coverage
- integration-restrict-BQSR
- integration-targets-only
- integration-multisample
steps:
- uses: actions/checkout@v2
- name: Install singularity
uses: eWaterCycle/setup-singularity@v6
with:
singularity-version: 3.6.4
- name: Cache conda environment
uses: actions/cache@v2
env:
cache-name: cache-conda-environment
# Increase this value to reset the cache without changing
# environment.yml
cache-number: 0
with:
path: ~/conda_pkgs_dir
key: build-${{ env.cache-name }}-${{ env.cache-number }}-${{ hashFiles('environment.yml') }}
- name: Install miniconda
uses: conda-incubator/setup-miniconda@v2.0.1
# https://github.com/conda-incubator/setup-miniconda.
# https://github.com/marketplace/actions/setup-miniconda
with:
activate-environment: hutspot
environment-file: environment.yml
auto-activate-base: false
use-only-tar-bz2: true
- name: Run test in conda environment
# Use --symlink to limit disk usage.
run: >-
pytest --keep-workflow-wd-on-fail --tag ${{ matrix.test }} tests/
- name: Check pipeline stderr messages in case of failure
if: ${{ failure() }}
run: >-
bash -c '
for file in $(find /tmp/pytest_workflow_* -name log.err); do
echo $file; cat $file
done
'
- name: Check pipeline stdout messages in case of failure
if: ${{ failure() }}
run: >-
bash -c '
for file in $(find /tmp/pytest_workflow_* -name log.out); do
echo $file; cat $file
done
'
- name: Check all job log files in case of failure
if: ${{ failure() }}
run: >-
bash -c '
for file in $(find /tmp/pytest_workflow_*/${{ matrix.test }}/log/ -type f); do
echo $file; cat $file
done
'
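Each entry in the test matrix above corresponds to a pytest-workflow tag. To run a single test locally (a sketch, assuming the `hutspot` conda environment from `environment.yml` is active and, for the integration tests, singularity is installed):

```bash
# Run one tagged workflow test; --keep-workflow-wd-on-fail preserves the
# working directories of failed workflows for inspection.
pytest --keep-workflow-wd-on-fail --tag dry-run-vanilla tests/
```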
@@ -3,20 +3,32 @@ variables:
.docker_before_script_anchor: &docker_before_script_anchor
before_script:
- pip install -r requirements.txt
- pip install -r requirements-dev.txt
- pip3 install -r requirements.txt
- pip3 install -r requirements-dev.txt
.singularity_before_script_anchor: &singularity_before_script_anchor
before_script:
- export BASETEMP=$RUN_BASE_DIR/$CI_COMMIT_REF_NAME/$CI_JOB_ID
- source ${CONDA_SH}
- conda activate hutspot-pipeline || conda create -n hutspot-pipeline --file requirements.txt --file requirements-dev.txt -y && conda activate hutspot-pipeline
- export PATH=${PATH}:${SINGULARITY_PATH}
- echo "#!/usr/bin/env bash" > snakemake
- echo "$(which snakemake) --profile slurm-test \"\$@\"" >> snakemake
- chmod +x snakemake
- export PATH=$(pwd):${PATH}
- hash -r
stages:
- sanity
- dry-run
- integration
- functional
test_sanities:
<<: *docker_before_script_anchor
script:
- py.test --tag sanity
image: python:3.6-stretch
- pytest --tag sanity --workflow-threads 8
image: lumc/singularity-snakemake:3.5.2-5.15.0
tags:
- docker
stage: sanity
@@ -24,49 +36,17 @@ test_sanities:
test_dry_run:
<<: *docker_before_script_anchor
script:
- py.test --tag dry-run
image: python:3.6-stretch
- pytest --tag dry-run --workflow-threads 8
image: lumc/singularity-snakemake:3.5.2-5.15.0
tags:
- docker
stage: dry-run
# This requires a privileged docker container.
# Most docker runners will not allow this.
test_integration_singularity:
before_script:
- apt-get update && apt-get install -y python3-pip
- pip3 install pyfaidx
- pip3 install -r requirements-dev.txt
script:
- py.test --tag singularity-integration
image: lumc/singularity-snakemake:3.0.3-5.4.0
tags:
- docker
stage: integration
test_integration:
before_script:
- export BASETEMP=$(mktemp -p ${RUN_BASE_DIR} -d)
<<: *singularity_before_script_anchor
script:
- source ${CONDA_SH}
- conda activate hutspot-pipeline
- export PATH=${PATH}:${CONDA_EXTRA_PATH}
- py.test --tag integration --basetemp ${BASETEMP} --keep-workflow-wd
- pytest --tag integration --basetemp ${BASETEMP} --keep-workflow-wd --workflow-threads 8
tags:
- slurm
stage: integration
test_functional:
before_script:
- export BASETEMP=$(mktemp -p ${RUN_BASE_DIR} -d)
script:
- source ${CONDA_SH}
- conda activate hutspot-pipeline
- export PATH=${PATH}:${CONDA_EXTRA_PATH}
- py.test --tag functional --basetemp ${BASETEMP} --keep-workflow-wd
tags:
- slurm
stage: functional
only:
- schedules
\ No newline at end of file
Changelog
==========
<!--
Newest changes should be on top.
This document is user facing. Please word the changes in such a way
that users understand how the changes affect the new version.
-->
v2.0.1
---------------------------
+ Switch to using chunked-scatter
v2.0.0
---------------------------
+ Add an environment.yml file for conda.
+ Greatly simplified the snakemake workflow.
+ All statistics are now calculated using existing tools.
+ Add option `multisample_vcf` to enable joint variant calling.
@@ -3,99 +3,78 @@
# Hutspot
This is a multisample DNA variant calling pipeline based on Snakemake, bwa and
the GATK HaplotypeCaller.
## Features
* Any number of samples is supported
* Whole-genome calling, regardless of wet-lab library preparation.
* Follows modern best practices
* Each sample is individually called as a GVCF.
* A VCF is then produced by genotyping the individual GVCFs separately
for each sample.
* Data parallelization for calling and genotyping steps.
* Using ~100 chunks, we call an entire exome in ~15 minutes!
* Using the `scatter_size` setting in the configuration file, the reference
genome is split into chunks, and each chunk can be processed
independently. The default value of 1 billion will scatter the human
reference genome into 6 chunks.
* Reasonably fast.
* 96 exomes in < 24 hours.
* No unnecessary jobs
* Calculate coverage metrics if a `bedfile` is specified.
* Fully containerized rules through singularity and biocontainers. Legacy
conda environments are no longer available.
# Installation
This repository contains a [conda](https://conda.io/docs/)
environment file that you can use to install all dependencies in a
conda environment:
```bash
conda env create -f environment.yml
```
Alternatively, you can set up a python virtualenv and run
```bash
pip install -r requirements.txt
```
## Singularity
We highly recommend the use of the containerized rules through
[singularity](https://www.sylabs.io/singularity/).
This option does require you to install singularity on your system. As this
usually requires administrative privileges, singularity is not contained
within our provided conda environment file.
If you want to use singularity, make sure you install version 3 or higher.
### Debian
If you happen to use Debian buster, singularity 3.0.3 comes straight out
of the box with a simple:
```bash
sudo apt install singularity-container
```
### Docker
You can run singularity within a docker container. Please note that
the container **MUST** run in privileged mode for this to work.
We have provided our own container that includes singularity and snakemake
[here](https://hub.docker.com/r/lumc/singularity-snakemake).
### Manual install
If you don't use Debian buster and cannot run a privileged docker container,
you - unfortunately :-( - will have to install singularity manually.
Please see the installation instructions
[here](https://github.com/sylabs/singularity/blob/master/INSTALL.md) on how
to do that.
## GATK
For license reasons, conda and singularity cannot fully install the GATK. The JAR
must be registered by running `gatk-register` after the environment is
created, which conflicts with the automated environment/container creation.
For this reason, hutspot **requires** you to manually specify the path to
the GATK executable JAR via `--config GATK=/path/to/gatk.jar`.
## Operating system
Hutspot was tested on Ubuntu 16.04 only.
It should reasonably work on most modern Linux distributions.
# Requirements
@@ -103,24 +82,31 @@ For every sample you wish to analyze, we require one or more paired end
readgroups in fastq format. They must be compressed with either `gzip` or
`bgzip`.
The configuration must be passed to the pipeline through a configuration file.
This is a json file listing the samples and their associated readgroups
as well as the other settings to be used.
An example config json can be found [here](config/example.json), and a
json schema describing the configuration file can be found [here](config/schema.json).
This json schema can also be used to validate your configuration file.
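For example, a configuration file can be checked against the schema before starting a run. This is a minimal sketch using the `jsonschema` package that the pipeline itself uses for validation; `config.json` is a placeholder for your own file:

```bash
python - <<'EOF'
# Validate a pipeline configuration against the bundled JSON schema.
import json
import jsonschema

with open('config/schema.json') as fin:
    schema = json.load(fin)
with open('config.json') as fin:  # placeholder: your own configuration
    config = json.load(fin)

jsonschema.validate(config, schema)
print('config.json passes schema validation')
EOF
```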
## Reference files
The following reference files **must** be provided in the configuration:
1. `reference`: A reference genome, in fasta format. Must be indexed with
`samtools faidx`.
2. `dbsnp`: A dbSNP VCF file.
3. `known_sites`: One or more VCF files with known sites for base
recalibration.
The following reference files **may** be provided:
1. `targetsfile`: Bed file of the targets of the capture kit. Used to calculate coverage.
2. `baitsfile`: Bed file of the baits of the capture kit. Used to calculate picard HsMetrics.
3. `refflat`: A refFlat file to calculate coverage over transcripts.
4. `scatter_size`: Size of the chunks to split the variant calling into.
5. `female_threshold`: Fraction of reads between X and the autosomes to call as
female.
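Put together, a minimal configuration providing the required reference files could look as follows. This is a sketch only; every path is a placeholder:

```bash
cat > config.json <<'EOF'
{
  "samples": {
    "sample_01": {
      "read_groups": {
        "lib_01": {
          "R1": "/path/to/sample_01_R1.fq.gz",
          "R2": "/path/to/sample_01_R2.fq.gz"
        }
      }
    }
  },
  "reference": "/path/to/reference.fasta",
  "dbsnp": "/path/to/dbsnp.vcf.gz",
  "known_sites": ["/path/to/dbsnp.vcf.gz"],
  "targetsfile": "/path/to/targets.bed"
}
EOF
```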
# How to run
@@ -131,7 +117,7 @@ the pipeline can be started with:
```bash
snakemake -s Snakefile \
--use-singularity \
--configfile tests/data/config/sample_config.json
```
This would start all jobs locally. Obviously this is not what one would
@@ -139,26 +125,31 @@ regularly do for a normal pipeline run. How to submit jobs on a cluster is
described later. Let's first move on to the necessary configuration values.
## Configuration values
The required and optional configuration values are specified in the json
schema located in `config/schema.json`. Before running, the content of the
`--configfile` is validated against this schema.
The following configuration values are **required**:
| configuration | description |
| ------------- | ----------- |
| `reference` | Absolute path to fasta file |
| `samples` | One or more samples, with associated fastq files |
| `dbsnp` | Path to dbSNP VCF file |
| `known_sites` | Path to one or more VCF files with known sites. Can be the same as the `dbsnp` file |
The following configuration options are **optional**:
| configuration | description |
| ------------- | ----------- |
| `targetsfile` | Bed file of the targets of the capture kit. Used to calculate coverage |
| `baitsfile` | Bed file of the baits of the capture kit. Used to calculate picard HsMetrics |
| `female_threshold` | Float between 0 and 1 that signifies the threshold of the ratio between coverage on X/overall coverage that 'calls' a sample as female. Default = 0.6 |
| `scatter_size` | The size of chunks to divide the reference into for parallel execution. Default = 1000000000 |
| `coverage_threshold` | One or more threshold coverage values. For each value, a sample specific bed file will be created that contains the regions where the coverage is above the threshold |
| `restrict_BQSR` | Restrict GATK BaseRecalibration to a single chromosome. This is faster, but the recalibration is possibly less reliable |
| `multisample_vcf` | Create a true multisample VCF file, in addition to the regular per-sample VCF files |
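Optional values can be set in the configuration file, or overridden on the command line with snakemake's `--config` flag. A sketch, with an illustrative value:

```bash
snakemake -s Snakefile \
    --use-singularity \
    --configfile config.json \
    --config scatter_size=100000000
```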
## Cluster configuration
@@ -166,12 +157,12 @@ The following configuration options are **optional**:
To run on a cluster, snakemake needs to be called with some extra arguments.
Additionally, it needs a cluster yaml file describing resources per job.
If you run on a cluster with drmaa support, an environment variable named
`DRMAA_LIBRARY_PATH` must be in the executing shell environment. This variable
points to the `.so` file of the DRMAA library.
An sge-cluster.yml is bundled with this pipeline in the cluster directory.
It is optimized for SGE clusters, where the default vmem limit is 4G.
If you run SLURM, or any other cluster system, you will have to write your own
cluster yaml file. Please see the [snakemake documentation](http://snakemake.readthedocs.io/en/stable/snakefiles/configuration.html#cluster-configuration)
for details on how to do so. Given the provided sge-cluster.yml, activating the
@@ -183,23 +174,29 @@ snakemake -s Snakefile \
--drmaa ' -pe <PE_NAME> {cluster.threads} -q all.q -l h_vmem={cluster.vmem} -cwd -V -N hutspot' \
```
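For the DRMAA support mentioned above, export the library path before starting snakemake. A sketch; the actual `.so` location depends on your cluster installation:

```bash
# Point snakemake's --drmaa support at the cluster's DRMAA library.
export DRMAA_LIBRARY_PATH=/usr/lib/libdrmaa.so
```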
## Limitations
Sample names should be unique, and not overlap (such as `sample1` and
`sample10`). This is due to the way output files are parsed by multiQC:
when sample names overlap, the json output for picard DuplicationMetrics cannot
be parsed unambiguously.
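A quick pre-flight check for overlapping names, mirroring the substring rule the pipeline's own validation uses (`config.json` is a placeholder for your own configuration file):

```bash
python - <<'EOF'
# Report sample names that are substrings of other sample names.
import itertools
import json

with open('config.json') as fin:  # placeholder: your own configuration
    samples = json.load(fin)['samples']

for s1, s2 in itertools.permutations(samples, 2):
    if s1 in s2:
        print(f'"{s1}" is contained in "{s2}"')
EOF
```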
## Binding additional directories under singularity
In singularity mode, snakemake binds the location of itself in the container.
The current working directory is also visible directly in the container.
In many cases, this is not enough, and will result in `FileNotFoundError`s.
E.g., suppose you run your pipeline in `/runs`, but your fastq files live in
`/fastq` and your reference genome lives in `/genomes`. We would have to bind
`/fastq` and `/genomes` in the container.
This can be accomplished with `--singularity-args`, which accepts a simple
string of arguments passed to singularity. E.g. in the above example,
we could do:
```bash
snakemake -s Snakefile \
--use-singularity \
--singularity-args ' --bind /fastq:/fastq --bind /genomes:/genomes '
```
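Alternatively, the bind paths can be supplied through singularity's own environment. A sketch; `SINGULARITY_BIND` is read by singularity itself, not by snakemake:

```bash
export SINGULARITY_BIND="/fastq,/genomes"
snakemake -s Snakefile --use-singularity
```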
@@ -218,38 +215,16 @@ snakemake -s Snakefile \
-w 120 \
--max-jobs-per-second 30 \
--restart-times 2 \
--configfile config.json
```
## Using conda instead of singularity
Legacy conda environments are also available for each and every rule.
Simply use `--use-conda` instead of `--use-singularity` to enable conda
environments.
As dependency conflicts can and do arise with conda, it is recommended to
combine this flag with `--conda-prefix`, such that you only have to
build the environments once.
The conda environments use the same versions of tools as the singularity
containers, bar one:
* `fastqc` uses version 0.11.5 on conda, but 0.11.7 on singularity.
# Graph
Below you can see the rule graph of the pipeline. The main variant calling flow
is highlighted in red. This only shows dependencies
between rules, and not between jobs. The actual job graph is considerably
more complex, as nearly all rules are duplicated by sample and some
(the scatter jobs) additionally by chunk.
As a rough estimate of the total number of jobs in the pipeline you can use
the following formula:
@@ -271,111 +246,76 @@ Having trouble viewing the graph? See [this](img/rulegraph.svg) static SVG instead.
```plantuml
digraph snakemake_dag {
graph[bgcolor=white, margin=0];
rankdir=LR;
node[shape=box, style=rounded, fontname=sans, fontsize=10, penwidth=2];
edge[penwidth=2, color=grey];
0[label = "all", color = "0.62 0.6 0.85", style="rounded"];
1[label = "genotype_gather", color = "0.31 0.6 0.85", style="rounded"];
2[label = "multiqc", color = "0.14 0.6 0.85", style="rounded"];
3[label = "bai", color = "0.41 0.6 0.85", style="rounded"];
4[label = "split_vcf", color = "0.53 0.6 0.85", style="rounded"];
5[label = "fastqc_raw", color = "0.63 0.6 0.85", style="rounded"];
6[label = "fastqc_merged", color = "0.24 0.6 0.85", style="rounded"];
7[label = "fastqc_postqc", color = "0.26 0.6 0.85", style="rounded"];
8[label = "vtools_coverage", color = "0.58 0.6 0.85", style="rounded"];
9[label = "merge_stats", color = "0.36 0.6 0.85", style="rounded"];
10[label = "genotype_scatter", color = "0.09 0.6 0.85", style="rounded"];
11[label = "genotype_chunkfile", color = "0.29 0.6 0.85", style="rounded"];
12[label = "stats_tsv", color = "0.51 0.6 0.85", style="rounded"];
13[label = "markdup", color = "0.55 0.6 0.85", style="rounded"];
14[label = "genotype_gather_tbi", color = "0.19 0.6 0.85", style="rounded"];
15[label = "merge_r1", color = "0.60 0.6 0.85", style="rounded"];
16[label = "merge_r2", color = "0.10 0.6 0.85", style="rounded"];
17[label = "cutadapt", color = "0.17 0.6 0.85", style="rounded"];
18[label = "gvcf_gather", color = "0.32 0.6 0.85", style="rounded"];
19[label = "gvcf_gather_tbi", color = "0.27 0.6 0.85", style="rounded"];
20[label = "collectstats", color = "0.03 0.6 0.85", style="rounded"];
21[label = "vcfstats", color = "0.00 0.6 0.85", style="rounded"];
22[label = "align", color = "0.05 0.6 0.85", style="rounded"];
23[label = "create_markdup_tmp", color = "0.44 0.6 0.85", style="rounded"];
24[label = "sickle", color = "0.39 0.6 0.85", style="rounded"];
25[label = "gvcf_scatter", color = "0.02 0.6 0.85", style="rounded"];
26[label = "gvcf_chunkfile", color = "0.56 0.6 0.85", style="rounded"];
27[label = "fqcount_preqc", color = "0.38 0.6 0.85", style="rounded"];
28[label = "fqcount_postqc", color = "0.12 0.6 0.85", style="rounded"];
29[label = "mapped_num", color = "0.50 0.6 0.85", style="rounded"];
30[label = "mapped_basenum", color = "0.43 0.6 0.85", style="rounded"];
31[label = "unique_num", color = "0.65 0.6 0.85", style="rounded"];
32[label = "usable_basenum", color = "0.22 0.6 0.85", style="rounded"];
33[label = "fastqc_stats", color = "0.46 0.6 0.85", style="rounded"];
34[label = "covstats", color = "0.07 0.6 0.85", style="rounded"];
35[label = "seqtk_r1", color = "0.34 0.6 0.85", style="rounded"];
36[label = "seqtk_r2", color = "0.21 0.6 0.85", style="rounded"];
37[label = "baserecal", color = "0.48 0.6 0.85", style="rounded"];
38[label = "genome", color = "0.15 0.6 0.85", style="rounded"];
9 -> 0
4 -> 0 [color = "red"]
0[label = "all", color = "0.30 0.6 0.85", style="rounded"];
1[label = "multiqc", color = "0.60 0.6 0.85", style="rounded"];
2[label = "merge_stats", color = "0.17 0.6 0.85", style="rounded"];
3[label = "bai", color = "0.09 0.6 0.85", style="rounded"];
4[label = "genotype_gather\nsample: micro", color = "0.06 0.6 0.85", style="rounded"];
5[label = "gvcf_gather\nsample: micro", color = "0.32 0.6 0.85", style="rounded"];
6[label = "fastqc_raw\nsample: micro", color = "0.00 0.6 0.85", style="rounded"];
7[label = "fastqc_merged", color = "0.11 0.6 0.85", style="rounded"];
8[label = "fastqc_postqc", color = "0.02 0.6 0.85", style="rounded"];
9[label = "stats_tsv", color = "0.45 0.6 0.85", style="rounded"];
10[label = "collectstats", color = "0.24 0.6 0.85", style="rounded"];
11[label = "vcfstats\nsampel: micro", color = "0.52 0.6 0.85", style="rounded"];
12[label = "markdup", color = "0.47 0.6 0.85", style="rounded"];
13[label = "scatterregions", color = "0.56 0.6 0.85", style="rounded"];
14[label = "merge_r1\nsample: micro", color = "0.65 0.6 0.85", style="rounded"];
15[label = "merge_r2\nsample: micro", color = "0.26 0.6 0.85", style="rounded"];
16[label = "cutadapt", color = "0.22 0.6 0.85", style="rounded"];
17[label = "fqcount_preqc", color = "0.37 0.6 0.85", style="rounded"];
18[label = "fqcount_postqc", color = "0.58 0.6 0.85", style="rounded"];
19[label = "mapped_reads_bases", color = "0.43 0.6 0.85", style="rounded"];
20[label = "unique_reads_bases", color = "0.34 0.6 0.85", style="rounded"];
21[label = "fastqc_stats", color = "0.13 0.6 0.85", style="rounded"];
22[label = "covstats", color = "0.39 0.6 0.85", style="rounded"];
23[label = "align", color = "0.49 0.6 0.85", style="rounded"];
24[label = "create_markdup_tmp", color = "0.41 0.6 0.85", style="rounded,dashed"];
25[label = "sickle", color = "0.19 0.6 0.85", style="rounded"];
26[label = "genome", color = "0.62 0.6 0.85", style="rounded"];
1 -> 0
2 -> 0
3 -> 0
4 -> 0
5 -> 0
6 -> 0
7 -> 0
1 -> 0
8 -> 0
2 -> 0
5 -> 0
11 -> 1 [color = "red"]
10 -> 1 [color = "red"]
12 -> 2
13 -> 3
1 -> 4 [color = "red"]
14 -> 4 [color = "red"]
16 -> 6
15 -> 6
17 -> 7
19 -> 8
18 -> 8
20 -> 9
21 -> 9
19 -> 10 [color = "red"]
18 -> 10 [color = "red"]
9 -> 12
23 -> 13 [color = "red"]
22 -> 13 [color = "red"]
1 -> 14 [color = "red"]
24 -> 17 [color = "red"]
25 -> 18 [color = "red"]
26 -> 18 [color = "red"]
18 -> 19 [color = "red"]
28 -> 20
27 -> 20
32 -> 20
30 -> 20
33 -> 20
34 -> 20
29 -> 20
31 -> 20
1 -> 21
14 -> 21
17 -> 22 [color = "red"]
36 -> 24 [color = "red"]
35 -> 24 [color = "red"]
37 -> 25 [color = "red"]
13 -> 25 [color = "red"]
16 -> 27 [color = "red"]
15 -> 27 [color = "red"]
17 -> 28
22 -> 29
22 -> 30
13 -> 31
13 -> 32
7 -> 33
6 -> 33
38 -> 34
13 -> 34
27 -> 35 [color = "red"]
15 -> 35 [color = "red"]
27 -> 36 [color = "red"]
16 -> 36 [color = "red"]
13 -> 37 [color = "red"]
9 -> 1
10 -> 2
11 -> 2
12 -> 3
13 -> 4
13 -> 5
14 -> 7
15 -> 7
16 -> 8
2 -> 9
17 -> 10
18 -> 10
19 -> 10
20 -> 10
21 -> 10
22 -> 10
4 -> 11
23 -> 12
24 -> 12
25 -> 16
14 -> 17
15 -> 17
16 -> 18
23 -> 19
12 -> 20
7 -> 21
8 -> 21
12 -> 22
26 -> 22
16 -> 23
24 -> 23
14 -> 25
15 -> 25
}
```
This diff is collapsed.
@@ -6,8 +6,17 @@ __default__:
align:
threads: 8
vmem: 4G
bed_to_interval:
threads: 1
vmem: 16G
hs_metrics:
threads: 1
vmem: 20G
markdup:
vmem: 10G
vmem: 20G
multiple_metrics:
threads: 1
vmem: 20G
baserecal:
threads: 8
vmem: 6G
@@ -22,8 +31,16 @@ genotype_scatter:
genotype_gather:
vmem: 10G
covstats:
vmem: 6G
multiqc:
vmem: 20G
multiqc:
vmem: 30G
split_vcf:
vmem: 20G
fastqc:
threads: 4
vmem: 8G
scatterregions:
vmem: 30G
merge_vcf:
threads: 8
vmem: 10G
__default__:
job_name: hutspot
threads: 1
vmem: 4G
queue: all
time: 00:30:00
align:
threads: 8
vmem: 4G
time: 0-2
baserecal:
threads: 8
vmem: 6G
time: 0-2
covstats:
vmem: 6G
cutadapt:
threads: 8
time: 0-2
fastqc_raw:
threads: 4
time: 0-1
fastqc_merged:
threads: 4
time: 0-1
fastqc_postqc:
threads: 4
time: 0-1
fqcount_postqc:
time: 0-1
gvcf_scatter:
vmem: 20G
time: 0-1
gvcf_gather:
vmem: 10G
genotype_scatter:
vmem: 20G
time: 0-1
genotype_gather:
vmem: 10G
markdup:
vmem: 20G
time: 0-1
multiqc:
vmem: 30G
time: 0-1
sickle:
time: 0-1
split_vcf:
vmem: 20G
vcfstats:
time: 0-1
import itertools
import json
import jsonschema
import os
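# Biocontainer and Docker Hub images used by the rules; singularity pulls
# these when the pipeline is run with --use-singularity.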
containers = {
'bcftools': 'docker://quay.io/biocontainers/bcftools:1.9--ha228f0b_4',
'bedtools-2.26-python-2.7': 'docker://quay.io/biocontainers/mulled-v2-3251e6c49d800268f0bc575f28045ab4e69475a6:4ce073b219b6dabb79d154762a9b67728c357edb-0',
'bwa-0.7.17-samtools-1.10': 'docker://quay.io/biocontainers/mulled-v2-ad317f19f5881324e963f6a6d464d696a2825ab6:c59b7a73c87a9fe81737d5d628e10a3b5807f453-0',
'chunked-scatter': 'docker://quay.io/biocontainers/chunked-scatter:1.0.0--py_0',
'cutadapt': 'docker://quay.io/biocontainers/cutadapt:2.9--py37h516909a_0',
'debian': 'docker://debian:buster-slim',
'fastqc': 'docker://quay.io/biocontainers/fastqc:0.11.7--4',
'gatk': 'docker://broadinstitute/gatk3:3.7-0',
'gvcf2coverage': 'docker://lumc/gvcf2coverage:0.1-dirty-2',
'multiqc': 'docker://quay.io/biocontainers/multiqc:1.8--py_2',
'picard': 'docker://quay.io/biocontainers/picard:2.22.8--0',
'python3': 'docker://python:3.6-slim',
'vtools': 'docker://quay.io/biocontainers/vtools:1.0.0--py37h3010b51_0'
}
def process_config():
""" Process the config file and set the default values """
def set_default(key, value):
"""Set default config values"""
if key not in config:
config[key] = value
# Read the json schema
with open(srcdir('config/schema.json'), 'rt') as fin:
schema = json.load(fin)
# Validate the config against the schema
try:
jsonschema.validate(config, schema)
except jsonschema.ValidationError as e:
raise jsonschema.ValidationError(f'Invalid --configfile: {e.message}')
# If you specify a baitsfile, you also have to specify a targets file for
# picard
if 'baitsfile' in config and 'targetsfile' not in config:
msg = 'Invalid --configfile: "baitsfile" specified without "targetsfile"'
raise jsonschema.ValidationError(msg)
# If you specify a target file but no baitsfile, we use the targets as
# baits. This is needed because picard HsMetrics needs both a baitfile and
# targets file as input
if 'targetsfile' in config and 'baitsfile' not in config:
set_default('baitsfile', config['targetsfile'])
# A sample name cannot be a substring of another sample, since that breaks picard
# metrics parsing by multiqc
msg = 'Invalid --configfile: sample names should not overlap ("{s1}" is contained in "{s2}")'
for s1, s2 in itertools.permutations(config['samples'], 2):
if s1 in s2:
raise jsonschema.ValidationError(msg.format(s1=s1, s2=s2))
# Set the default config values
set_default('scatter_size', 1000000000)
set_default('female_threshold', 0.6)
set_default('multisample_vcf', False)
# Hide the absolute path so the snakemake linter doesn't cry about it
set_default('gatk_jar', os.path.join(os.path.sep, 'usr', 'GenomeAnalysisTK.jar'))
def coverage_stats(wildcards):
files = expand("{sample}/coverage/refFlat_coverage.tsv",
sample=config["samples"])
return files if "refflat" in config else []
def coverage_files(wildcards):
""" Return a list of all coverage files
The coverage is calculated for each sample, for each specified threshold
"""
# We only calculate the coverage when this is specified in the
# configuration
if 'coverage_threshold' not in config:
return list()
# Fetch the values we need from the configuration
samples = config['samples']
thresholds = config['coverage_threshold']
files = list()
for sample, threshold in itertools.product(samples, thresholds):
files.append(f'{sample}/vcf/{sample}_{threshold}.bed')
return files
def sample_bamfiles(wildcards):
""" Determine the bam files for a sample (one for each readgroup)
"""
files = list()
sample = config['samples'][wildcards.sample]
sample_name = wildcards.sample
for read_group in sample['read_groups']:
files.append(f'{sample_name}/bams/{sample_name}-{read_group}.sorted.bam')
return files
def gather_gvcf(wildcards):
""" Gather the gvcf files based on the scatterregions checkpoint
This depends on the 'scatter_size' parameter and the reference genome
used.
"""
checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
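    # The checkpoint writes one scatter-{i}.bed file per chunk; glob_wildcards
    # recovers the chunk indices so the matching per-chunk GVCFs can be listed.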
return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz",
i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
def gather_gvcf_tbi(wildcards):
""" Gather the gvcf index files based on the scatterregions checkpoint
This depends on the 'scatter_size' parameter and the reference genome
used.
"""
checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
return expand("{{sample}}/vcf/{{sample}}.{i}.g.vcf.gz.tbi",
i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
def gather_vcf(wildcards):
""" Gather the vcf files based on the scatterregions checkpoint
This depends on the 'scatter_size' parameter and the reference genome
used.
"""
checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz",
i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
def gather_vcf_tbi(wildcards):
""" Gather the vcf index files based on the scatterregions checkpoint
This depends on the 'scatter_size' parameter and the reference genome
used.
"""
checkpoint_output = checkpoints.scatterregions.get(**wildcards).output[0]
return expand("{{sample}}/vcf/{{sample}}.{i}.vcf.gz.tbi",
i=glob_wildcards(os.path.join(checkpoint_output, 'scatter-{i}.bed')).i)
def sample_cutadapt_files(wildcards):
""" Determine the cutadapt log files files for a sample (one for each
readgroup).
"""
files = list()
sample = config['samples'][wildcards.sample]
sample_name = wildcards.sample
for read_group in sample['read_groups']:
files.append(f'{sample_name}/pre_process/{sample_name}-{read_group}.txt')
return files
def all_trimmed_fastqc(wildcards):
""" Determine the trimmed fastq files for each sample """
fastq_files = list()
for sample in config['samples']:
for read_group in config['samples'][sample]['read_groups']:
fastq_files.append(f"{sample}/pre_process/trimmed-{sample}-{read_group}/.done")
return fastq_files
{
"samples": {
"sample_01": {
"libraries": {
"read_groups": {
"lib_l1": {
"R1": "1.fq.gz",
"R2": "2.fq.gz"
@@ -13,12 +13,19 @@
}
},
"sample_02": {
"libraries": {
"read_groups": {
"lib_l1": {
"R1": "3.1.fq.gz",
"R2": "3.2.fq.gz"
}
}
}
}
},
"reference": "/path/to/ref",
"dbsnp": "/path/to/vcf1",
"known_sites": ["/path/to/vcf1", "/path/to/vcf2"],
"scatter_size": 1000000000,
"female_threshold": 0.6,
"bedfile": "/path/to/bed",
"refflat": "/path/to/refflat"
}
@@ -2,16 +2,32 @@
"$schema": "http://json-schema.org/draft-04/schema#",
"description": "JSON schema for samples config for the hutspot pipeline",
"type": "object",
"required": ["samples"],
"additionalProperties": false,
"required": [
"samples",
"reference",
"dbsnp",
"known_sites"
],
"optional": [
"scatter_size",
"female_threshold",
"bedfile",
"coverage_threshold",
"restrict_BQSR",
"gatk_jar",
"multisample_vcf",
"baitsfile"
],
"properties": {
"samples": {
"type": "object",
"additionalProperties": {
"description": "sample object",
"type": "object",
"required": ["libraries"],
"required": ["read_groups"],
"properties": {
"libraries": {
"read_groups": {
"type": "object",
"additionalProperties": {
"description": "library",
@@ -25,6 +41,52 @@
}
}
}
}
},
"reference": {
"description": "Reference fasta file to map against",
"type": "string"
},
"dbsnp": {
"description": "VCF file to be used to annotate variants",
"type": "string"
},
"known_sites": {
"description": "VCF files of known sites, to be used to recalibrate the quality scores",
"type": "array",
"minItems": 1
},
"scatter_size": {
"description": "Size of the chunks to split the variant calling into",
"type": "integer"
},
"female_threshold": {
"description": "Fraction of reads between X and the autosomes to call as female",
"type": "number"
},
"targetsfile": {
"description": "Bed file of the targets of the capture kit. Used to calculate coverage",
"type": "string"
},
"baitsfile": {
"description": "Bed file of the baits of the capture kit. Used to calculate picard HsMetrics",
"type": "string"
},
"coverage_threshold": {
"description": "One or more thresholds to calculate coverage for, one bedfile per value per sample",
"type": "array",
"minItems": 1
},
"restrict_BQSR": {
"description": "Restrict BQSR to the listed chromosome",
"type": "string"
},
"multisample_vcf": {
"description": "Create a true multisample VCF file, in addition to the regular per-sample VCF files",
"type": "boolean"
},
"refflat": {
"description": "RefFlat file with transcripts",
"type": "string"
}
}
}
# This file may be used to create an environment using:
# $ conda env create --file environment.yml
# platform: linux-64
name: hutspot
channels:
- conda-forge
- bioconda
- defaults
- conda-forge
dependencies:
- aioeasywebdav=2.2.0
- aiohttp=3.5.4
- appdirs=1.4.3
- asn1crypto=0.24.0
- async-timeout=3.0.1
- attrs=19.1.0
- bcrypt=3.1.4
- boto3=1.9.138
- botocore=1.12.138
- ca-certificates=2019.3.9
- cachetools=2.1.0
- cairo=1.16.0
- certifi=2019.3.9
- cffi=1.12.3
- chardet=3.0.4
- click=7.0
- configargparse=0.13.0
- cryptography=2.6.1
- datrie=0.7.1
- decorator=4.4.0
- docutils=0.14
- dropbox=9.2.0
- expat=2.2.5
- filechunkio=1.8
- fontconfig=2.13.1
- freetype=2.10.0
- ftputil=3.4
- gettext=0.19.8.1
- gitdb2=2.0.5
- gitpython=2.1.11
- glib=2.58.3
- google-api-core=1.10.0
- google-auth=1.6.3
- google-cloud-core=0.29.1
- google-cloud-storage=1.15.0
- google-resumable-media=0.3.2
- googleapis-common-protos=1.5.9
- graphite2=1.3.13
- graphviz=2.38.0
- harfbuzz=2.4.0
- icu=58.2
- idna=2.8
- idna_ssl=1.1.0
- jinja2=2.10.1
- jmespath=0.9.4
- jpeg=9c
- jsonschema=3.0.1
- libblas=3.8.0
- libcblas=3.8.0
- libffi=3.2.1
- libgcc-ng=8.2.0
- libgfortran=3.0.0
- libiconv=1.15
- liblapack=3.8.0
- libpng=1.6.37
- libprotobuf=3.7.1
- libstdcxx-ng=8.2.0
- libtiff=4.0.10
- libtool=2.4.6
- libuuid=2.32.1
- libxcb=1.13
- libxml2=2.9.9
- markupsafe=1.1.1
- multidict=4.5.2
- ncurses=6.1
- networkx=2.3
- numpy=1.16.3
- openblas=0.3.5
- openssl=1.1.1b
- pandas=0.24.2
- pango=1.40.14
- paramiko=2.4.2
- pcre=8.41
- pip=19.1
- pixman=0.34.0
- prettytable=0.7.2
- protobuf=3.7.1
- psutil=5.6.2
- pthread-stubs=0.4
- pyasn1=0.4.4
- pyasn1-modules=0.2.4
- pycparser=2.19
- pyfaidx=0.5.0
- pygraphviz=1.5
- pynacl=1.3.0
- pyopenssl=19.0.0
- pyrsistent=0.15.1
- pysftp=0.2.9
- pysocks=1.6.8
- python=3.6.7
- python-dateutil=2.8.0
- python-irodsclient=0.7.0
- pytz=2019.1
- pyyaml=5.1
- ratelimiter=1.2.0
- readline=7.0
- requests=2.21.0
- rsa=3.4.2
- s3transfer=0.2.0
- setuptools=41.0.1
- six=1.12.0
- smmap2=2.0.5
- snakemake=5.4.5
- snakemake-minimal=5.4.5
- sqlite=3.26.0
- tk=8.6.9
- typing_extensions=3.7.2
- urllib3=1.24.2
- wheel=0.33.1
- wrapt=1.11.1
- xmlrunner=1.7.7
- xorg-kbproto=1.0.7
- xorg-libice=1.0.9
- xorg-libsm=1.2.3
- xorg-libx11=1.6.7
- xorg-libxau=1.0.9
- xorg-libxdmcp=1.1.3
- xorg-libxext=1.3.4
- xorg-libxpm=3.5.12
- xorg-libxrender=0.9.10
- xorg-libxt=1.1.5
- xorg-renderproto=0.11.1
- xorg-xextproto=7.3.0
- xorg-xproto=7.0.31
- xz=5.2.4
- yaml=0.1.7
- yarl=1.3.0
- zlib=1.2.11
- pytest-workflow>=1.4.0
- snakemake-minimal
- boto3
- smart_open
name: hutspot-bcftools
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- bcftools=1.9
- bzip2=1.0.6
- ca-certificates=2019.3.9
- curl=7.64.1
- krb5=1.16.3
- libcurl=7.64.1
- libdeflate=1.0
- libedit=3.1.20170329
- libgcc-ng=8.2.0
- libssh2=1.8.2
- libstdcxx-ng=8.2.0
- ncurses=6.1
- openssl=1.1.1b
- tk=8.6.9
- xz=5.2.4
- zlib=1.2.11
\ No newline at end of file
name: hutspot-bwa
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- bwa=0.7.16
- ca-certificates=2017.7.27.1
- certifi=2017.7.27.1
- libgcc=5.2.0
- ncurses=5.9
- openjdk=8.0.121
- openssl=1.0.2l
- perl=5.22.0.1
- picard=2.14
- pip=9.0.1
- python=3.6.3
- readline=6.2
- setuptools=36.6.0
- sqlite=3.13.0
- tk=8.5.19
- wheel=0.30.0
- xz=5.2.3
- zlib=1.2.8
name: hutspot-collectstats
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- ca-certificates=2017.11.5
- certifi=2017.11.5
- click=6.7
- ncurses=5.9
- openssl=1.0.2l
- pip=9.0.1
- python=3.6.3
- readline=6.2
- setuptools=36.7.2
- sqlite=3.13.0
- tk=8.5.19
- wheel=0.30.0
- xz=5.2.3
- zlib=1.2.11
name: hutspot-covstat
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- backports=1.0
- backports.functools_lru_cache=1.4
- backports_abc=0.5
- bedtools=2.26.0
- blas=1.1
- ca-certificates=2017.11.5
- certifi=2017.11.5
- cycler=0.10.0
- dbus=1.10.22
- expat=2.2.1
- fontconfig=2.12.6
- freetype=2.8.1
- functools32=3.2.3.2
- gettext=0.19.7
- glib=2.53.5
- gst-plugins-base=1.8.0
- gstreamer=1.8.0
- icu=58.2
- jpeg=9b
- libffi=3.2.1
- libgcc=5.2.0
- libgcc-ng=8.2.0
- libgfortran=3.0.0
- libiconv=1.15
- libpng=1.6.34
- libstdcxx-ng=8.2.0
- libxcb=1.12
- libxml2=2.9.5
- matplotlib=2.1.0
- ncurses=5.9
- numpy=1.13.3
- openblas=0.2.20
- openssl=1.0.2l
- pcre=8.39
- pip=9.0.1
- pyparsing=2.2.0
- pyqt=5.6.0
- python=2.7.14
- python-dateutil=2.8.0
- pytz=2017.3
- qt=5.6.2
- readline=7.0
- setuptools=36.7.2
- singledispatch=3.4.0.3
- sip=4.18
- six=1.11.0
- sqlite=3.20.1
- ssl_match_hostname=3.5.0.1
- subprocess32=3.2.7
- tk=8.6.9
- tornado=4.5.2
- wheel=0.30.0
- xorg-libxau=1.0.8
- xorg-libxdmcp=1.1.2
- xz=5.2.3
- zlib=1.2.11
name: hutspot-cutadapt
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- ca-certificates=2017.7.27.1
- certifi=2017.7.27.1
- cutadapt=1.14
- ncurses=5.9
- openssl=1.0.2l
- pip=9.0.1
- python=3.6.3
- readline=6.2
- setuptools=36.6.0
- sqlite=3.13.0
- tk=8.5.19
- wheel=0.30.0
- xopen=0.1.1
- xz=5.2.3
- zlib=1.2.11
name: hutspot-fastq-count
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- fastq-count=0.1.0
- libgcc-ng=8.2.0
name: hutspot-fastqc
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- fastqc=0.11.5
- openjdk=8.0.121
- perl=5.22.0.1
name: hutspot-gatk
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- bzip2=1.0.6
- ca-certificates=2017.7.27.1
- certifi=2017.7.27.1
- gatk=3.7
- ncurses=5.9
- openjdk=8.0.121
- openssl=1.0.2l
- pip=9.0.1
- python=3.6.3
- readline=6.2
- setuptools=36.6.0
- sqlite=3.13.0
- tk=8.5.19
- wheel=0.30.0
- xz=5.2.3
- zlib=1.2.11