Commit 37b7f677 authored by Peter van 't Hof

Merge branch 'feature-gentrap_docs' into 'develop'

Added extra documentation

fixes #363 

See merge request !433
parents ea4e0944 b4688da4
......@@ -24,7 +24,7 @@ import org.broadinstitute.gatk.queue.QScript
/** This trait creates a structured way of using multisample pipelines */
trait MultiSampleQScript extends SummaryQScript { qscript: QScript =>
@Argument(doc = "Only Sample", shortName = "s", required = false, fullName = "sample")
@Argument(doc = "Only Process This Sample", shortName = "s", required = false, fullName = "sample")
private[core] val onlySamples: List[String] = Nil
if (!globalConfig.map.contains("samples")) Logging.addError("No Samples found in config")
......
......@@ -52,22 +52,22 @@ object AnnotateVcfWithBed extends ToolCommand {
class OptParser extends AbstractOptParser {
opt[File]('I', "inputFile") required () unbounded () valueName "<vcf file>" action { (x, c) =>
c.copy(inputFile = x)
} text "Input is a required file property"
} text "Input VCF file. Mandatory field"
opt[File]('B', "bedFile") required () unbounded () valueName "<bed file>" action { (x, c) =>
c.copy(bedFile = x)
} text "Bedfile is a required file property"
} text "Input Bed file. Mandatory field"
opt[File]('o', "output") required () unbounded () valueName "<vcf file>" action { (x, c) =>
c.copy(outputFile = x)
} text "out is a required file property"
} text "Output VCF file. Mandatory field"
opt[String]('f', "fieldName") required () unbounded () valueName "<name of field in vcf file>" action { (x, c) =>
c.copy(fieldName = x)
} text "Name of info field in new vcf file"
opt[String]('d', "fieldDescription") unbounded () valueName "<name of field in vcf file>" action { (x, c) =>
opt[String]('d', "fieldDescription") unbounded () valueName "<description of field in vcf file>" action { (x, c) =>
c.copy(fieldDescription = x)
} text "Description of field in new vcf file"
opt[String]('t', "fieldType") unbounded () valueName "<name of field in vcf file>" action { (x, c) =>
opt[String]('t', "fieldType") unbounded () valueName "<type of field in vcf file>" action { (x, c) =>
c.copy(fieldType = x)
} text "Description of field in new vcf file"
} text "Type of field in new vcf file. Can be 'Integer', 'Flag', 'Character', 'Float'"
}
/**
......
......@@ -38,13 +38,13 @@ object BaseCounter extends ToolCommand {
class OptParser extends AbstractOptParser {
opt[File]('r', "refFlat") required () valueName "<file>" action { (x, c) =>
c.copy(refFlat = x)
}
} text "refFlat file. Mandatory"
opt[File]('o', "outputDir") required () valueName "<directory>" action { (x, c) =>
c.copy(outputDir = x)
}
} text "Output directory. Mandatory"
opt[File]('b', "bam") required () valueName "<file>" action { (x, c) =>
c.copy(bamFile = x)
}
} text "Bam file. Mandatory"
opt[String]('p', "prefix") valueName "<prefix>" action { (x, c) =>
c.copy(prefix = x)
}
......
......@@ -27,10 +27,10 @@ object BedtoolsCoverageToCounts extends ToolCommand {
class OptParser extends AbstractOptParser {
opt[File]('I', "input") required () valueName "<file>" action { (x, c) =>
c.copy(input = x)
}
} text "Coverage file produced with bedtools"
opt[File]('o', "output") required () unbounded () valueName "<file>" action { (x, c) =>
c.copy(output = x)
}
} text "Output file name"
}
/**
......
......@@ -41,7 +41,7 @@ object BiopetFlagstat extends ToolCommand {
} text "summary output file"
opt[String]('r', "region") valueName "<chr:start-stop>" action { (x, c) =>
c.copy(region = Some(x))
}
} text "Region to calculate the statistics on"
}
/**
......
......@@ -47,19 +47,19 @@ object CheckAllelesVcfInBam extends ToolCommand {
class OptParser extends AbstractOptParser {
opt[File]('I', "inputFile") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(inputFile = x)
}
} text "VCF file"
opt[File]('o', "outputFile") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(outputFile = x)
}
} text "output VCF file name"
opt[String]('s', "sample") unbounded () minOccurs 1 action { (x, c) =>
c.copy(samples = x :: c.samples)
}
} text "sample name"
opt[File]('b', "bam") unbounded () minOccurs 1 action { (x, c) =>
c.copy(bamFiles = x :: c.bamFiles)
}
} text "bam file, from which the variants (VCF files) were called"
opt[Int]('m', "min_mapping_quality") maxOccurs 1 action { (x, c) =>
c.copy(minMapQual = x)
}
} text "minimum mapping quality score for a read to be taken into account"
}
private class CountReport(
......
......@@ -35,7 +35,7 @@ trait ToolCommand extends MainCommand with Logging {
case "error" => logger.setLevel(org.apache.log4j.Level.ERROR)
case _ =>
}
} text "Log level" validate {
} text "Level of log information printed. Possible levels: 'debug', 'info', 'warn', 'error'" validate {
case "debug" | "info" | "warn" | "error" => success
case _ => failure("Log level must be <debug/info/warn/error>")
}
......
......@@ -138,4 +138,4 @@ During execution, biopet framework will resolve the value for each ConfigNamespa
### JSON validation
To check if the created JSON file is correct there are several possibilities: the simplest way is using [this](http://jsonformatter.curiousconcept.com/)
website. It is also possible to use Python, Scala or any other programming languages for validating JSON files but this requires some more knowledge.
\ No newline at end of file
website. It is also possible to use Python, Scala or any other programming languages for validating JSON files but this requires some more knowledge.
......@@ -32,48 +32,34 @@ To get help creating the appropriate [configs](../general/config.md) please refe
Samples are single experimental units whose expression you want to measure. They usually consist of a single sequencing library, but in some cases (for example when the experiment demands each sample have a minimum library depth) a single sample may contain multiple sequencing libraries as well. All this can be configured using the correct JSON nesting, with the following pattern:
~~~ json
{
"samples": {
"sample_A": {
"libraries": {
"lib_01": {
"R1": "/absolute/path/to/first/read/pair.fq",
"R2": "/absolute/path/to/second/read/pair.fq"
}
}
}
}
}
~~~ yaml
---
samples:
sample_A:
libraries:
lib_01:
R1: "/absolute/path/to/first/read/pair.fq"
R2: "/absolute/path/to/second/read/pair.fq"
~~~
In the example above, there is one sample (named `sample_A`) which contains one sequencing library (named `lib_01`). The library itself is paired end, with both `R1` and `R2` pointing to the location of the files in the file system. A more complicated example is the following:
~~~ json
{
"samples": {
"sample_X": {
"libraries": {
"lib_one": {
"R1": "/absolute/path/to/first/read/pair.fq",
"R2": "/absolute/path/to/second/read/pair.fq"
}
}
},
"sample_Y": {
"libraries": {
"lib_one": {
"R1": "/absolute/path/to/first/read/pair.fq",
"R2": "/absolute/path/to/second/read/pair.fq"
},
"lib_two": {
"R1": "/absolute/path/to/first/read/pair.fq",
"R2": "/absolute/path/to/second/read/pair.fq"
}
}
}
}
}
~~~ yaml
---
samples:
sample_X:
libraries:
lib_one:
R1: "/absolute/path/to/first/read/pair.fq"
R2: "/absolute/path/to/second/read/pair.fq"
sample_Y:
libraries:
lib_one:
R1: "/absolute/path/to/first/read/pair.fq"
R2: "/absolute/path/to/second/read/pair.fq"
lib_two:
R1: "/absolute/path/to/first/read/pair.fq"
R2: "/absolute/path/to/second/read/pair.fq"
~~~
In this case, we have two samples (`sample_X` and `sample_Y`) and `sample_Y` has two different libraries (`lib_one` and `lib_two`). Notice that the names of the samples and libraries may change, but several keys such as `samples`, `libraries`, `R1`, and `R2` remain the same.
......@@ -84,8 +70,8 @@ In this case, we have two samples (`sample_X` and `sample_Y`) and `sample_Y` has
For the pipeline settings, there are some values that you need to specify while some are optional. Required settings are:
1. `output_dir`: path to output directory (if it does not exist, Gentrap will create it for you).
2. `aligner`: which aligner to use (`gsnap`, `tophat`, `hisat2`, `star` or `star-2pass`)
3. `reference_fasta`: this must point to a reference FASTA file and in the same directory, there must be a `.dict` file of the FASTA file.
2. `aligner`: which aligner to use (`gsnap`, `tophat`, `hisat2`, `star` or `star-2pass`). `star-2pass` enables the 2-pass mapping option of STAR, for the most sensitive novel junction discovery. For more information, please refer to the [STAR User Manual](https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf).
3. `reference_fasta`: this must point to a reference FASTA file and in the same directory, there must be a `.dict` file of the FASTA file. If the `.dict` file does not exist, you can create it using: ```` java -jar <picard jar> CreateSequenceDictionary R=<reference.fasta> O=<outputDict> ````
4. `expression_measures`: this entry determines which expression measurement modes Gentrap will do. You can choose zero or more from the following: `fragments_per_gene`, `base_counts`, `cufflinks_strict`, `cufflinks_guided` and/or `cufflinks_blind`. If you only wish to align, you can set the value as an empty list (`[]`).
5. `strand_protocol`: this determines whether your library is prepared with a specific stranded protocol or not. Two protocols are currently supported: `dutp` for dUTP-based protocols and `non_specific` for non-strand-specific protocols.
6. `annotation_refflat`: contains the path to an annotation refFlat file of the entire genome.
......@@ -99,16 +85,29 @@ While optional settings are:
5. `call_variants`: whether to call variants on the RNA-seq data or not, defaults to `false`.
Thus, an example settings configuration is as follows:
~~~ yaml
---
output_dir: "/path/to/output/dir"
expression_measures:
- "fragments_per_gene"
- "bases_per_gene"
strand_protocol: "dutp"
reference_fasta: "/path/to/reference/fastafile"
annotation_gtf: "/path/to/gtf"
annotation_refflat: "/path/to/refflat"
~~~
~~~ json
{
"output_dir": "/path/to/output/dir",
"expression_measures": ["fragments_per_gene", "bases_per_gene"],
"strand_protocol": "dutp",
"reference_fasta": "/path/to/reference/fastafile",
"annotation_gtf": "/path/to/gtf",
"annotation_refflat": "/path/to/refflat",
}
#### Best practice example
If you are unsure of how to use the numerous options of Gentrap, please refer to the following best practice configuration file example.
~~~ yaml
---
output_dir: "/path/to/output/dir"
aligner: "gsnap"
reference_fasta: "/path/to/reference/fastafile"
expression_measures:
- "fragments_per_gene"
strand_protocol: "dutp"
annotation_refflat: "/path/to/refflat"
~~~
#### Example configurations
......@@ -126,7 +125,7 @@ biopet pipeline gentrap -config </path/to/config.json> -qsub -jobParaEnv BWA -ru
You can also use the `biopet` environment module (recommended) when you are running the pipeline in SHARK:
~~~ bash
$ module load biopet/v0.5.0
$ module load biopet/v0.7.0
$ biopet pipeline gentrap -config </path/to/config.json> -qsub -jobParaEnv BWA -run
~~~
......
# AnnotateVcfWithBed
## Introduction
This tool annotates a VCF file using the input from a BED file.
## Example
To get the help menu:
~~~
Usage: AnnotateVcfWithBed [options]
-l <value> | --log_level <value>
Log level
-h | --help
Print usage
-v | --version
Print version
-I <vcf file> | --inputFile <vcf file>
Input VCF file. Mandatory field
-B <bed file> | --bedFile <bed file>
Input Bed file. Mandatory field
-o <vcf file> | --output <vcf file>
Output VCF file. Mandatory field
-f <name of field in vcf file> | --fieldName <name of field in vcf file>
Name of info field in new vcf file
-d <description of field in vcf file> | --fieldDescription <description of field in vcf file>
Description of field in new vcf file
-t <type of field in vcf file> | --fieldType <type of field in vcf file>
Type of field in new vcf file. Can be 'Integer', 'Flag', 'Character', 'Float'
~~~
To run the tool use:
~~~
biopet tool AnnotateVcfWithBed -I myVcf.vcf -B myBed.bed -o myannotatedVcf.vcf
~~~
## Results
The result of this tool will be a VCF file with an extra INFO field containing the annotation.
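For illustration, assuming the tool was run with `--fieldName GENE` (the record below is hypothetical), the annotated output could contain a line such as:
~~~
#CHROM  POS     ID  REF  ALT  QUAL  FILTER  INFO
chr1    123456  .   A    T    50    PASS    DP=20;GENE=BRCA1
~~~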
# BaseCounter
## Introduction
This tool generates base counts based on a BAM file and a refFlat file.
## Example
Help menu:
~~~~
Usage: BaseCounter [options]
-l <value> | --log_level <value>
Log level
-h | --help
Print usage
-v | --version
Print version
-r <file> | --refFlat <file>
refFlat file. Mandatory
-o <directory> | --outputDir <directory>
Output directory. Mandatory
-b <file> | --bam <file>
Bam file. Mandatory
-p <prefix> | --prefix <prefix>
~~~~
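To run the tool use (file names here are placeholders):
~~~ bash
biopet tool BaseCounter -r annotation.refFlat -o counts_output -b mySample.bam -p mySample
~~~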
# CheckAllelesVcfInBam
## Introduction
This tool has been written to check the allele frequency in BAM files.
This tool has been written to check the allele frequency in BAM files. This is meant for comparison with the allele frequency reported in the VCF file.
## Example
To get the help menu:
......@@ -16,14 +16,15 @@ Usage: CheckAllelesVcfInBam [options]
-v | --version
Print version
-I <file> | --inputFile <file>
VCF file
-o <file> | --outputFile <file>
output VCF file name
-s <value> | --sample <value>
sample name
-b <value> | --bam <value>
bam file, from which the variants (VCF files) were called
-m <value> | --min_mapping_quality <value>
minimum mapping quality score for a read to be taken into account
~~~
To run the tool:
......
# FastqSplitter
## Introduction
This tool splits a fastq files based on the number of output files specified. So if one specifies 5 output files it will split the fastq
into 5 files. This can be very usefull if one wants to use chunking option in one of our pipelines, we can generate the exact amount of fastqs
needed for the number of chunks specified. Note that this will be automatically done inside the pipelines.
This tool divides a fastq file into smaller fastq files, based on the number of output files specified. For example, if one specifies 5 output files it will split the fastq
into 5 files of equal size. This can be very useful if one wants to use the chunking option in one of our pipelines: FastqSplitter can generate the exact number of fastq files (chunks) as needed. This tool is used internally in our pipelines as required.
## Example
To get the help menu:
......@@ -29,7 +27,7 @@ biopet tool FastqSplitter --inputFile myFastq.fastq \
--output mySplittedFastq_1.fastq --output mySplittedFastq_2.fastq \
--output mySplittedFastq_3.fastq
~~~
The above invocation will split the input in 3 equally divided fastq files.
The above invocation will split the input file into 3 fastq files of equal size.
## Output
Multiple fastq files based on the number of outputFiles specified.
\ No newline at end of file
# FindRepeatsPacBio
## Introduction
This tool looks and annotates repeat regions inside a BAM file. It extracts the regions of interest from a bed file and then intersects
those regions with the BAM file. On those extracted regions the tool will perform a
Mpileup and counts all insertions/deletions etc. etc. for that specific location on a per read basis.
This tool searches for and annotates repeat regions inside a BAM file.
It intersects the regions provided in the bed file with the BAM file and extracts them.
On the extracted regions *samtools mpileup* will be run and all insertions, deletions or substitutions will be counted on a per read basis.
## Example
......
# MpileupToVcf
## Introduction
This tool enables a user to extract a VCF file out a mpileup file generated from the BAM file.
The tool can also stream through STDin and STDout so that the mpileup file is not stored on disk.
Mpileup files tend to be very large since they describe each covered base position in the genome on a per read basis,
so usually one does not want to safe these files.
This tool enables a user to extract a VCF file out of an mpileup file generated from a BAM file using, for instance, *samtools mpileup*.
The tool can also stream through STDIN and STDOUT so that it is not necessary to store the mpileup file on disk.
Mpileup files tend to be very large because they describe each covered base position in the genome on a per read basis,
so it is usually not desirable to store them.
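For instance, a streaming invocation could look like the sketch below (the `-o` and `--sample` options are assumptions; check `biopet tool MpileupToVcf -h` for the exact options of your version):
~~~ bash
samtools mpileup -f reference.fasta mySample.bam | \
biopet tool MpileupToVcf -o mySample.vcf --sample mySample
~~~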
----
......
# SamplesTsvToJson
This tool enables a user to create a full sample sheet in JSON format suitable for all our Queue pipelines.
The tool can be started as follows:
This tool enables a user to create a full sample sheet in JSON format, suitable for all our Queue pipelines, from TSV file(s).
The tool can be called as follows:
~~~ bash
biopet tool SamplesTsvToJson
......@@ -27,11 +27,11 @@ Usage: SamplesTsvToJson [options]
~~~
The tool is designed in such a way that a user can provide a TAB seperated file (TSV) with sample specific properties and even those will be parsed by the tool.
For example: a user wants to have certain properties e.g. which treatment a sample got than the user should provide a extra columns called treatment and then the
JSON file is parsed with those properties inside it as well. The order of columns does not matter.
A user provides a TAB separated file (TSV) with sample specific properties which are parsed into JSON format by the tool.
For example, a user wants to add certain properties to the description of a sample, such as the treatment a sample received. Then a TSV file with an extra column called treatment is provided.
The resulting JSON file will have the 'treatment' property in it as well. The order of the columns is not relevant to the end result.
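For illustration, such a TSV file could look like this (the `sample`, `library`, `R1` and `R2` columns follow the convention used by our pipelines; `treatment` is the user-added property):
~~~
sample    library   R1              R2              treatment
sample_A  lib_01    /path/to/R1.fq  /path/to/R2.fq  control
~~~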
The tag files works the same only the value are prefixed in the key `tags`.
Tag files work the same way, except that the values are nested under the key `tags`.
#### Example
......
# VcfFilter
## Introduction
This tool filters VCF files on a number values. For example, it can filter on sample depth and/or total depth.
It can also filter out the reference calls and/or minimum number of sample passes.
For more on filtering options and how to set them, please refer to the help menu.
This tool enables a user to filter VCF files, for example on sample depth and/or total depth.
It can also be used to filter out reference calls and/or to require a minimum number of samples to pass.
There is a wide set of options which one can use to change the filter settings.
......@@ -32,19 +37,19 @@ Usage: VcfFilter [options]
Min number of samples to pass --minAlternateDepth, --minBamAlternateDepth and --minSampleDepth
--minBamAlternateDepth <int>
--denovoInSample <sample>
Only show variants that contain unique alleles in compete set for given sample
Only keep variants that contain unique alleles in the complete set for the given sample
--mustHaveVariant <sample>
Given sample must have 1 alternative allele
Only keep variants for which the given sample has an alternative allele
--diffGenotype <sample:sample>
Given samples must have a different genotype
--filterHetVarToHomVar <sample:sample>
If variants in sample 1 are heterogeneous and alternative alleles are homogeneous in sample 2 variants are filtered
Only keep variants for which the given samples have a different genotype
--filterHetVarToHomVar <sample1:sample2>
Filter out variants that are heterozygous in sample1 and homozygous in sample2
--filterRefCalls
Filter when there are only ref calls
Filter out ref calls
--filterNoCalls
Filter when there are only no calls
Filter out no calls
--minQualScore <value>
Min qual score
Filter out variants with a quality score below the given threshold
~~~
To run the tool:
......@@ -54,4 +59,4 @@ biopet tool VcfFilter --inputVcf myInput.vcf \
~~~
## Output
The output is a vcf file containing the filters specified values.
\ No newline at end of file
The output is a vcf file containing the values that pass the user-defined filtering options.
# VcfToTsv
## Introduction
This tool enables a user to convert a vcf file to a tab delimited file (TSV).
This can be very usefull since some programs only accept a TSV for downstream analysis.
It gets rid of the vcf header and parses all data columns in a nice TSV file.
There is also a possibility to only select some specific fields from you vcf and only parse those fields to a TSV.
This tool converts a VCF file to a tab-separated values (TSV) file. For every key in the INFO column of the VCF file, a separate column will be created with the corresponding values.
The user can select the keys that will be parsed into the output TSV file.
This can be useful when a program only accepts a TSV file for downstream analysis.
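As an illustration (the record is hypothetical), a VCF line whose INFO column reads `DP=20;AF=0.5` would produce separate `DP` and `AF` columns in the TSV:
~~~
chrom  pos     ref  alt  DP  AF
chr1   123456  A    T    20  0.5
~~~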
## Example
To open the help menu:
......
......@@ -3,11 +3,12 @@ VepNormalizer
Introduction
------------
This tool normalizes a VCF file annotated with the Variant Effect Predictor (VEP).
This tool modifies a VCF file annotated with the Variant Effect Predictor (VEP).
Since the VEP does not use INFO fields to annotate, but rather puts all its annotations in one big string inside a "CSQ" INFO tag, it is necessary to normalize it.
This normalizer will use the information in the CSQ header to create INFO fields for each annotation field.
It has two modes: `standard` and `explode`. The `standard` mode will produce a VCF according to the VCF specification.
The tool parses the information in the CSQ header to create INFO fields for each annotation field. It has two modes: `standard` and `explode`.
The `standard` mode will produce a VCF according to the VCF specification.
This means that every VEP INFO tag will consist of the comma-separated list of values for each transcript.
In case the value is empty, the VEP INFO tag will not be shown for that specific record.
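As a sketch of `standard` mode (the record and resulting field names below are hypothetical; the actual INFO tag names are derived from the CSQ header of the input file):
~~~
## input: CSQ annotations for two transcripts
CSQ=T|missense_variant|ENST01,T|intron_variant|ENST02
## after normalization: one INFO field per CSQ sub-field
Consequence=missense_variant,intron_variant
Feature=ENST01,ENST02
~~~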
......@@ -20,6 +21,7 @@ The CSQ tag is by default removed from the output VCF file. If one wishes to ret
Example
---------
Help menu:
~~~ bash
biopet tool VepNormalizer -h
......
# WipeReads
## Introduction
WipeReads is a tool for removing reads from indexed BAM files.
It respects pairing information and can be set to remove reads whose duplicate
maps outside of the target region. The main use case is to remove reads mapping
to known ribosomal RNA regions (using a supplied BED file containing intervals for these regions).
WipeReads is a tool for removing reads that fall inside a user-defined region from indexed BAM files.
It takes pairing information into account and can be set to remove a read when its mate maps outside of the target region.
An application example is to remove reads mapping to known ribosomal RNA regions (using a supplied BED file containing intervals for these regions).
## Example
To open the help menu:
......@@ -62,5 +61,5 @@ biopet tool WipeReads --input_file myBam.bam \
~~~
## Output
This tool outputs a bam file containing all the reads not inside a ribosomal region.
And optionally a bam file with only the ribosomal reads
This tool outputs a bam file containing all the reads not inside the ribosomal region.
It can optionally output a bam file with only the reads inside the ribosomal region.