diff --git a/docs/general/config.md b/docs/general/config.md new file mode 100644 index 0000000000000000000000000000000000000000..10d69a2699d3a2a811d0beaa943c71a3aabdc62a --- /dev/null +++ b/docs/general/config.md @@ -0,0 +1,98 @@ +# How to create configs + +### The sample config + +The sample config should be in [__JSON__](http://www.json.org/) format + +- First field should have the key __"samples"__ +- Second field should contain the __"libraries"__ +- Third field contains __"R1" or "R2"__ or __"bam"__ +- The fastq input files can be provided zipped and unzipped + +#### Example sample config +~~~ + { + "samples":{ + "Sample_ID1":{ + "libraries":{ + "MySeries_1":{ + "R1":"Your_R1.fastq.gz", + "R2":"Your_R2.fastq.gz" + } + } + } + } + } +~~~ + +- For BAM files as input one should use a config like this: + +~~~ + { + "samples":{ + "Sample_ID_1":{ + "libraries":{ + "Lib_ID_1":{ + "bam":"MyFirst.bam" + }, + "Lib_ID_2":{ + "bam":"MySecond.bam" + } + } + } + } + } +~~~ + + +Note that there is a tool called [SamplesTsvToJson](../tools/SamplesTsvToJson.md) which enables a user to get the sample config without any chance of creating a wrongly formatted JSON file. + + +### The settings config +The settings config enables a user to alter the settings for almost all settings available in the tools used for a given pipeline. +This config file should be written in JSON format. It can contain setup settings like references for the tools used, +if the pipeline should use chunking or setting memory limits for certain programs; almost everything can be adjusted through this config file. +One could set global variables containing settings for all tools used in the pipeline or set tool specific options one layer deeper into the JSON file. +E.g. in the example below the settings for Picard tools are altered only for Picard and not global. 
+ +~~~ +"picard": { "validationstringency": "LENIENT" } +~~~ + +Global setting examples are: +~~~ +"java_gc_timelimit": 98, +"numberchunks": 25, +"chunking": true +~~~ + + +---- + +#### Example settings config +~~~ +{ + "reference": "/data/LGTC/projects/vandoorn-melanoma/data/references/hg19_nohap/ucsc.hg19_nohap.fasta", + "dbsnp": "/data/LGTC/projects/vandoorn-melanoma/data/references/hg19_nohap/dbsnp_137.hg19_nohap.vcf", + "joint_variantcalling": false, + "haplotypecaller": { "scattercount": 100 }, + "multisample": { "haplotypecaller": { "scattercount": 1000 } }, + "picard": { "validationstringency": "LENIENT" }, + "library_variantcalling_temp": true, + "target_bed_temp": "/data/LGTC/projects/vandoorn-melanoma/analysis/target.bed", + "min_dp": 5, + "bedtools": {"exe":"/share/isilon/system/local/BEDtools/bedtools-2.17.0/bin/bedtools"}, + "bam_to_fastq": true, + "baserecalibrator": { "memory_limit": 8, "vmem":"16G" }, + "samtofastq": {"memory_limit": 8, "vmem": "16G"}, + "java_gc_timelimit": 98, + "numberchunks": 25, + "chunking": true, + "haplotypecaller": { "scattercount": 1000 } +} +~~~ + +### JSON validation + +To check if the JSON file created is correct we can use multiple options the simplest way is using [this](http://jsonformatter.curiousconcept.com/) +website. It is also possible to use Python or Scala for validating but this requires some more knowledge. \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 89da0c00d8aa49cc90099ad9f0467c3c54348b65..c45a003aaf4cb073d849c5859b237f47b437d6a0 100644 --- a/docs/index.md +++ b/docs/index.md @@ -52,7 +52,7 @@ java -jar Biopet(version).jar (pipeline of interest) (pipeline options) -qsub* - ~~~ java -jar Biopet(version).jar (pipeline of interest) (pipeline options) ~~~ - +If one performs a dry run the config report will be generated. From this config report you can identify all configurable options. 
### Shark Compute Cluster specific @@ -85,18 +85,14 @@ Using this option, the `java -jar Biopet-<version>.jar` can be ommited and `biop - [Sage](pipelines/sage) - Yamsvp (Under development) -__Note that each pipeline needs a config file written in JSON format see [config](config.md) & [How To! Config](https://git.lumc.nl/biopet/biopet/wikis/Config) __ +__Note that each pipeline needs a config file written in JSON format see [config](general/config.md) & [How To! Config](https://git.lumc.nl/biopet/biopet/wikis/Config) __ There are multiple configs that can be passed to a pipeline, for example the sample, settings and executables wherefrom sample and settings are mandatory. -- [Here](config) one can find how to create a sample and settings config +- [Here](general/config.md) one can find how to create a sample and settings config - More info can be found here: [How To! Config](https://git.lumc.nl/biopet/biopet/wikis/Config) - - - - ### Running a tool $ biopet tool <tool_name> diff --git a/docs/license.md b/docs/license.md index ba2726bf6f76faa51cb592764103c357e103e6b2..a7d72a6ddfd3638fef021e34ffc4730f03384d5d 100644 --- a/docs/license.md +++ b/docs/license.md @@ -1 +1,25 @@ +Public release: +~~~bash +Biopet is built on top of GATK Queue for building bioinformatic +pipelines. It is mainly intended to support LUMC SHARK cluster which is running +SGE. But other types of HPC that are supported by GATK Queue (such as PBS) +should also be able to execute Biopet tools and pipelines. + +Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + +Contact us at: sasc@lumc.nl + +A dual licensing mode is applied. The source code within this project that are +not part of GATK Queue is freely available for non-commercial use under an AGPL +license; For commercial users or users who do not want to follow the AGPL +license, please contact us to obtain a separate license. 
+~~~ + +Private release: +~~~bash +Due to the license issue with GATK, this part of Biopet can only be used inside the +LUMC. Please refer to https://git.lumc.nl/biopet/biopet/wikis/home for instructions +on how to use this protected part of biopet or contact us at sasc@lumc.nl +~~~ + Copyright [2013-2014] [Sequence Analysis Support Core](https://sasc.lumc.nl/) diff --git a/docs/pipelines/GATK-pipeline.md b/docs/pipelines/GATK-pipeline.md index 9d4e4bd2ae10d428b21637e443c218eb85078d52..30984d0478d313c1b49b61ecc35eee883494dfbf 100644 --- a/docs/pipelines/GATK-pipeline.md +++ b/docs/pipelines/GATK-pipeline.md @@ -28,7 +28,7 @@ The pipeline accepts ```.fastq & .bam``` files as input. ## Example -Note that one should first create the appropriate [configs](../config.md). +Note that one should first create the appropriate [configs](../general/config.md). To get the help menu: ~~~ diff --git a/docs/pipelines/basty.md b/docs/pipelines/basty.md index c235995115641d2a55668d06005c1fead4f19e08..3080349f0e622b43b1dc6f0bf173eddb0620467c 100644 --- a/docs/pipelines/basty.md +++ b/docs/pipelines/basty.md @@ -30,7 +30,7 @@ java -jar Biopet.0.2.0.jar pipeline basty -h ~~~ #### Run the pipeline: -Note that one should first create the appropriate [configs](../config.md). +Note that one should first create the appropriate [configs](../general/config.md). ~~~ java -jar Biopet.0.2.0.jar pipeline basty -run -config MySamples.json -config MySettings.json -outDir myOutDir diff --git a/docs/pipelines/flexiprep.md b/docs/pipelines/flexiprep.md index 98e1e274b05a65fb0a813fe75cc477da91f163a3..d62011f81421ad34b339d5af64742455c10e0c7d 100644 --- a/docs/pipelines/flexiprep.md +++ b/docs/pipelines/flexiprep.md @@ -1,75 +1,126 @@ -# Introduction +# Flexiprep + +## Introduction +Flexiprep is our quality control pipeline. 
This pipeline checks for possible barcode contamination, clips reads, trims reads and runs +the tool <a href="http://www.bioinformatics.babraham.ac.uk/projects/fastqc/" target="_blank">Fastqc</a>. +The adapter clipping is performed by <a href="https://github.com/marcelm/cutadapt" target="_blank">Cutadapt</a>. +For the quality trimming we use: <a href="https://github.com/najoshi/sickle" target="_blank">Sickle</a>. Flexiprep works on `.fastq` files. + + +## Example + +To get the help menu: +~~~ +java -jar Biopet-0.2.0-DEV.jar pipeline Flexiprep -h +Arguments for Flexiprep: + -R1,--input_r1 <input_r1> R1 fastq file (gzipped allowed) + -sample,--samplename <samplename> Sample name + -library,--libraryname <libraryname> Library name + -outDir,--output_directory <output_directory> Output directory + -R2,--input_r2 <input_r2> R2 fastq file (gzipped allowed) + -skiptrim,--skiptrim Skip Trim fastq files + -skipclip,--skipclip Skip Clip fastq files + -config,--config_file <config_file> JSON config file(s) + -DSC,--disablescatterdefault Disable all scatters +~~~ + +As we can see in the above example we provide the options to skip trimming or clipping +since sometimes you want to have the possibility to not perform these tasks e.g. +if there are no adapters present in your .fastq. Note that the pipeline also works on unpaired reads where one should only provide R1. 
+ + +To start the pipeline (remove `-run` for a dry run): +~~~bash +java -jar Biopet-0.2.0.jar pipeline Flexiprep -run -outDir myDir \ +-R1 myFirstReadPair -R2 mySecondReadPair -sample mySampleName \ +-library myLibname -config mySettings.json +~~~ -# [Flexiprep](https://git.lumc.nl/biopet/biopet/tree/develop/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep) - -QC pipeline for fastq files - -### Commandline options - - -| Argument | Explain | -| -------- | ------- | -| -R1,--input_r1 <input_r1> | R1 fastq file (gzipped allowed) | -| -outputDir,--outputdir <outputdir> | Output directory | -| -config,--configfiles <configfiles> | Config Json file | -| -R2,--input_r2 <input_r2> | R2 fastq file (gzipped allowed) | -| -skiptrim,--skiptrim | Skip Trim fastq files | -| -skipclip,--skipclip | Skip Clip fastq files | - ---- - -### Config options - - -| Config Name | Name | Type | Default | Function | -| ----------- | ---- | ----- | ------- | -------- | -| flexiprep | skip_native_link | Boolean | false | Do not make a link to the final file with name: <sample>.qc.<fastq extension> | -| flexiprep | skiptrim | Boolean | false | | -| flexiprep | skiptrim | Boolean | false | | - ---- - -### sub Module options - - -This can be used in the root of the config or within the flexiprep, within flexiprep got prio over the root value - -| Config Name | Name | Type | Default | Function | -| ----------- | ---- | ---- | ------- | -------- | -| cutadapt | exe | String | cutadapt | Excuteble for cutadapt | -| cutadapt | default_clip_mode | String | 3 | Do not make a link with name: <sample>.qc.<fastq extension> | -| cutadapt | adapter | Array[String] | | | -| cutadapt | anywhere | Array[String] | | | -| cutadapt | front | Array[String] | | | -| cutadapt | discard | Boolean | false | | -| cutadapt | opt_minimum_length | Int | 1 | | -| cutadapt | opt_maximum_length | Int | | | -| fastqc | exe | String | fastqc | Excuteble for fastqc | -| fastqc->java | kmers | String | 
java | Excuteble for java for fastqc | -| fastqc | kmers | Int | 5 | | -| fastqc | quiet | Boolean | false | | -| fastqc | noextract | Boolean | false | | -| fastqc | nogroup | Boolean | false | | -| sickle | exe | String | sickle | Excuteble for sickle | -| sickle | qualitytype | String | | | -| sickle | defaultqualitytype | String | sanger | use this when quality type can't be found at fastqc | - ---- - -### License - -A dual licensing model is applied. The source code within this project is freely available for non-commercial use under an AGPL license; For commercial users or users who do not want to follow the AGPL license, please contact sasc@lumc.nl to purchase a separate license. - -# Example -Note that one should first create the appropriate [configs](../config.md). - -# Testcase A - -# Testcase B +## Result files +The results from this pipeline will be a fastq file which is depending on the options either clipped and trimmed, only clipped, + only trimmed or no quality control at all. The pipeline also outputs 2 Fastqc runs one before and one after quality control. + +### Example output + +~~~ +. 
+├── mySample_01.qc.summary.json +├── mySample_01.qc.summary.json.out +├── mySample_01.R1.contams.txt +├── mySample_01.R1.fastqc +│  ├── mySample_01.R1_fastqc +│  │  ├── fastqc_data.txt +│  │  ├── fastqc_report.html +│  │  ├── Icons +│  │  │  ├── error.png +│  │  │  ├── fastqc_icon.png +│  │  │  ├── tick.png +│  │  │  └── warning.png +│  │  ├── Images +│  │  │  └── warning.png +│  │  ├── Images +│  │  │  ├── duplication_levels.png +│  │  │  ├── kmer_profiles.png +│  │  │  ├── per_base_gc_content.png +│  │  │  ├── per_base_n_content.png +│  │  │  ├── per_base_quality.png +│  │  │  ├── per_base_sequence_content.png +│  │  │  ├── per_sequence_gc_content.png +│  │  │  ├── per_sequence_quality.png +│  │  │  └── sequence_length_distribution.png +│  │  └── summary.txt +│  └── mySample_01.R1.qc_fastqc.zip +├── mySample_01.R1.qc.fastq.gz +├── mySample_01.R1.qc.fastq.gz.md5 +├── mySample_01.R2.contams.txt +├── mySample_01.R2.fastqc +│  ├── mySample_01.R2_fastqc +│  │  ├── fastqc_data.txt +│  │  ├── fastqc_report.html +│  │  ├── Icons +│  │  │  ├── error.png +│  │  │  ├── fastqc_icon.png +│  │  │  ├── tick.png +│  │  │  └── warning.png +│  │  ├── Images +│  │  │  ├── duplication_levels.png +│  │  │  ├── kmer_profiles.png +│  │  │  ├── per_base_gc_content.png +│  │  │  ├── per_base_n_content.png +│  │  │  ├── per_base_quality.png +│  │  │  ├── per_base_sequence_content.png +│  │  │  ├── per_sequence_gc_content.png +│  │  │  ├── per_sequence_quality.png +│  │  │  └── sequence_length_distribution.png +│  │  └── summary.txt +│  └── mySample_01.R2_fastqc.zip +├── mySample_01.R2.fastq.md5 +├── mySample_01.R2.qc.fastqc +│  ├── mySample_01.R2.qc_fastqc +│  │  ├── fastqc_data.txt +│  │  ├── fastqc_report.html +│  │  ├── Icons +│  │  │  ├── error.png +│  │  │  ├── fastqc_icon.png +│  │  │  ├── tick.png +│  │  │  └── warning.png +│  │  ├── Images +│  │  │  ├── duplication_levels.png +│  │  │  ├── kmer_profiles.png +│  │  │  ├── per_base_gc_content.png +│  │  │  ├── 
per_base_n_content.png +│  │  │  ├── per_base_quality.png +│  │  │  ├── per_base_sequence_content.png +│  │  │  ├── per_sequence_gc_content.png +│  │  │  ├── per_sequence_quality.png +│  │  │  └── sequence_length_distribution.png +│  │  └── summary.txt +│  └── mySample_01.R2.qc_fastqc.zip +├── mySample_01.R2.qc.fastq.gz +└── mySample_01.R2.qc.fastq.gz.md5 +~~~ -# Examine results -## Result files ## Best practice diff --git a/docs/pipelines/gentrap.md b/docs/pipelines/gentrap.md index 4f1b63cd390cae40b98856a0fc7a849d691e16e3..05e8fa0f358f64ff2422bff2cb71eeef73471e4a 100644 --- a/docs/pipelines/gentrap.md +++ b/docs/pipelines/gentrap.md @@ -3,7 +3,7 @@ # Invocation # Example -Note that one should first create the appropriate [configs](../config.md). +Note that one should first create the appropriate [configs](../general/config.md). # Testcase A diff --git a/docs/pipelines/mapping.md b/docs/pipelines/mapping.md index 49b220deb0477822578296c62fe931bc8e635b24..05e86ff81b34a15cf3574d0ae77390aabc170ae2 100644 --- a/docs/pipelines/mapping.md +++ b/docs/pipelines/mapping.md @@ -19,7 +19,7 @@ After the QC, the pipeline simply maps the reads with the chosen aligner. The re ---- ## Example -Note that one should first create the appropriate [configs](../config.md). +Note that one should first create the appropriate [configs](../general/config.md). For the help menu: ~~~ @@ -52,9 +52,11 @@ Arguments for Mapping: To run the pipeline: ~~~ -java -jar Biopet.0.2.0.jar pipeline mapping -run --config mySamples.json --config mySettings.json +java -jar Biopet.0.2.0.jar pipeline mapping -run --config mySettings.json \ +-R1 myReads1.fastq -R2 myReads2.fastq -outDir myOutDir -OutputName myReadsOutput \ +-R hg19.fasta -RGSM mySampleName -RGLB myLib1 ~~~ -__Note that the pipeline also accepts sample specification through command line but we encourage you to use the sample config__ +Note that removing -R2 causes the pipeline to be able of handlind single end `.fastq` files. 
To perform a dry run simply remove `-run` from the commandline call. diff --git a/docs/pipelines/sage.md b/docs/pipelines/sage.md index 1b48faa81943d2c4de0ae623834c5310e8c33324..d6cbf06343203b660971b1d14563b7fe9ff9e090 100644 --- a/docs/pipelines/sage.md +++ b/docs/pipelines/sage.md @@ -6,13 +6,15 @@ The Sage pipeline has been created to process SAGE data, which requires a differ * [Flexiprep](flexiprep.md) * [Mapping](mapping.md) -* [SageCountFastq](sagetools.md) -* [SageCreateLibrary](sagetools.md) -* [SageCreateTagCounts](sagetools.md) +* [SageCountFastq](../tools/sagetools.md) +* [SageCreateLibrary](../tools/sagetools.md) +* [SageCreateTagCounts](../tools/sagetools.md) # Example -Note that one should first create the appropriate [configs](../config.md). +Note that one should first create the appropriate [configs](../general/config.md). + +To get the help menu: ~~~ java -jar Biopet-0.2.0.jar pipeline Sage -h Arguments for Sage: @@ -25,6 +27,11 @@ Arguments for Sage: -DSC,--disablescatterdefault Disable all scatters ~~~ +To run the pipeline: +~~~ + java -jar Biopet-0.2.0-DEV-801b72ed.jar pipeline Sage -run --config MySamples.json --config MySettings.json +~~~ + # Examine results diff --git a/docs/pipelines/yamsvp.md b/docs/pipelines/yamsvp.md index 4f1b63cd390cae40b98856a0fc7a849d691e16e3..05e8fa0f358f64ff2422bff2cb71eeef73471e4a 100644 --- a/docs/pipelines/yamsvp.md +++ b/docs/pipelines/yamsvp.md @@ -3,7 +3,7 @@ # Invocation # Example -Note that one should first create the appropriate [configs](../config.md). +Note that one should first create the appropriate [configs](../general/config.md). 
# Testcase A diff --git a/docs/tools/BiopetFlagstat.md b/docs/tools/BiopetFlagstat.md new file mode 100644 index 0000000000000000000000000000000000000000..26e14c240aafa4fc4fe745acf9b5feadc604576d --- /dev/null +++ b/docs/tools/BiopetFlagstat.md @@ -0,0 +1,61 @@ +# BiopetFlagstat + +## Introduction +This tool has been created to extract all the metrics from a required bam file. +It captures for example the # of mapped reads, # of duplicates, # of mates unmapped, # of reads with a certain mapping quality etc. etc. + + +## Example +To get the help menu: +~~~ +java -jar Biopet-0.2.0.jar tool BiopetFlagstat -h +Usage: BiopetFlagstat [options] + + -l <value> | --log_level <value> + Log level + -h | --help + Print usage + -v | --version + Print version + -I <file> | --inputFile <file> + out is a required file property + -r <chr:start-stop> | --region <chr:start-stop> + out is a required file property +~~~ + +To run the tool: +~~~ +java -jar Biopet-0.2.0.jar tool BiopetFlagstat -I myBAM.bam +~~~ + +### Output + +|Number |Total Flags| Fraction| Name| +|------ | -------- | --------- | ------| +|1 |862623034| 100.0000%| All| +|2 |861096240| 99.8230%| Mapped| +|3 |26506366| 3.0728%| Duplicates| +|4 |431233321| 49.9909%| FirstOfPair| +|5 |431389713| 50.0091%| SecondOfPair| +|6 |430909871| 49.9534%| ReadNegativeStrand| +|7 |0| 0.0000%| NotPrimaryAlignment| +|8 |862623034| 100.0000%| ReadPaired| +|9 |803603283| 93.1581%| ProperPair| +|10 |430922821| 49.9549%| MateNegativeStrand| +|11 |1584255| 0.1837%| MateUnmapped| +|12 |0| 0.0000%| ReadFailsVendorQualityCheck| +|13 |1380318| 0.1600%| SupplementaryAlignment| +|14 |1380318| 0.1600%| SecondaryOrSupplementary| +|15 |821996241| 95.2903%| MAPQ>0| +|16 |810652212| 93.9753%| MAPQ>10| +|17 |802852105| 93.0710%| MAPQ>20| +|18 |789252132| 91.4944%| MAPQ>30| +|19 |770426224| 89.3120%| MAPQ>40| +|20 |758373888| 87.9149%| MAPQ>50| +|21 |0| 0.0000%| MAPQ>60| +|22 |835092541| 96.8085%| First normal, second read inverted (paired end 
orientation)| +|23 |765156| 0.0887%| First normal, second read normal| +|24 |624090| 0.0723%| First inverted, second read inverted| +|25 |11537740| 1.3375%| First inverted, second read normal| +|26 |1462857| 0.1696%| Mate in same strand| +|27 |11751691| 1.3623%| Mate on other chr| \ No newline at end of file diff --git a/docs/tools/CheckAllelesVcfInBam.md b/docs/tools/CheckAllelesVcfInBam.md new file mode 100644 index 0000000000000000000000000000000000000000..b21791d9dad3ff056de03ff362cda839a39b354b --- /dev/null +++ b/docs/tools/CheckAllelesVcfInBam.md @@ -0,0 +1,46 @@ +# CheckAllelesVcfInBam + +## Introduction +This tool has been written to check the allele frequency in BAM files. + +## Example +To get the help menu: +~~~ +java -jar Biopet-0.2.0.jar tool CheckAllelesVcfInBam -h +Usage: CheckAllelesVcfInBam [options] + + -l <value> | --log_level <value> + Log level + -h | --help + Print usage + -v | --version + Print version + -I <file> | --inputFile <file> + + -o <file> | --outputFile <file> + + -s <value> | --sample <value> + + -b <value> | --bam <value> + + -m <value> | --min_mapping_quality <value> +~~~ + +To run the tool: +~~~ +java -jar Biopet-0.2.0.jar tool CheckAllelesVcfInBam --inputFile myVCF.vcf \ +--bam myBam1.bam --sample bam_sample1 --outputFile myAlleles.vcf + +~~~ +Note that the tool can run multiple BAM files at once. +The only thing one needs to make sure off is matching the `--bam` and `--sample` in that same order. + +For multiple bam files: +~~~ +java -jar Biopet-0.2.0.jar tool CheckAllelesVcfInBam --inputFile myVCF.vcf \ +--bam myBam1.bam --sample bam_sample1 --bam myBam2.bam --sample bam_sample2 \ +--bam myBam3.bam --sample bam_sample3 --outputFile myAlleles.vcf +~~~ + +## Output +outputFile = VCF file which contains an extra field with the allele frequencies per sample given to the tool. 
diff --git a/docs/tools/ExtractAlignedFastq.md b/docs/tools/ExtractAlignedFastq.md new file mode 100644 index 0000000000000000000000000000000000000000..eb765142228ec9e0bded57e7b234f70c40130ca0 --- /dev/null +++ b/docs/tools/ExtractAlignedFastq.md @@ -0,0 +1,54 @@ +# ExtractAlignedFastq + +## Introduction +This tool extracts reads from a BAM file based on alignment intervals. +E.g if one is interested in a specific location this tool extracts the full reads from the location. +The tool is also very usefull to create test data sets. + + +## Example +To get the help menu: +~~~ +java -jar Biopet-0.2.0.jar tool ExtractAlignedFastq -h +ExtractAlignedFastq - Select aligned FASTQ records + +Usage: ExtractAlignedFastq [options] + + -l <value> | --log_level <value> + Log level + -h | --help + Print usage + -v | --version + Print version + -I <bam> | --input_file <bam> + Input BAM file + -r <interval> | --interval <interval> + Interval strings + -i <fastq> | --in1 <fastq> + Input FASTQ file 1 + -j <fastq> | --in2 <fastq> + Input FASTQ file 2 (default: none) + -o <fastq> | --out1 <fastq> + Output FASTQ file 1 + -p <fastq> | --out2 <fastq> + Output FASTQ file 2 (default: none) + -Q <value> | --min_mapq <value> + Minimum MAPQ of reads in target region to remove (default: 0) + -s <value> | --read_suffix_length <value> + Length of common suffix from each read pair (default: 0) + +This tool creates FASTQ file(s) containing reads mapped to the given alignment intervals. +~~~ + +To run the tool: +~~~ +java -jar Biopet-0.2.0.jar tool ExtractAlignedFastq \ +--input_file myBam.bam --in1 myFastq_R1.fastq --out1 myOutFastq_R1.fastq --interval myTarget.bed +~~~ +* Note that this tool works for single end and paired end data. The above example can be easily extended for paired end data. +The only thing one should add is: `--in2 myFastq_R2.fastq --out2 myOutFastq_R2.fastq` +* The interval is just a genomic position or multiple genomic positions wherefrom one wants to extract the reads. 
+ + +## Output +The output of this tool will be fastq files containing only mapped reads with the given alignment intervals extracted from the bam file. \ No newline at end of file diff --git a/docs/tools/FastqSplitter.md b/docs/tools/FastqSplitter.md new file mode 100644 index 0000000000000000000000000000000000000000..742a376b19d2948beeab4c6340792ccf2dbfde53 --- /dev/null +++ b/docs/tools/FastqSplitter.md @@ -0,0 +1,35 @@ +# FastqSplitter + +## Introduction +This tool splits a fastq files based on the number of output files specified. So if one specifies 5 output files it will split the fastq +into 5 files. This can be very usefull if one wants to use chunking option in one of our pipelines, we can generate the exact amount of fastqs +needed for the number of chunks specified. Note that this will be automatically done inside the pipelines. + + +## Example +To get the help menu: +~~~ +java -jar Biopet-0.2.0.jar tool FastqSplitter -h +Usage: FastqSplitter [options] + + -l <value> | --log_level <value> + Log level + -h | --help + Print usage + -v | --version + Print version + -I <file> | --inputFile <file> + out is a required file property + -o <file> | --output <file> + out is a required file property +~~~ +To run the tool: +~~~ +java -jar Biopet-0.2.0.jar tool FastqSplitter --inputFile myFastq.fastq \ +--output mySplittedFastq_1.fastq --output mySplittedFastq_2.fastq \ +--output mySplittedFastq_3.fastq +~~~ +The above invocation will split the input in 3 equally divided fastq files. + +## Output +Multiple fastq files based on the number of outputFiles specified. \ No newline at end of file diff --git a/docs/tools/FindRepeatsPacBio.md b/docs/tools/FindRepeatsPacBio.md new file mode 100644 index 0000000000000000000000000000000000000000..e02daeeab6eb9a5d8d3eeadf125364dbef9e97c4 --- /dev/null +++ b/docs/tools/FindRepeatsPacBio.md @@ -0,0 +1,58 @@ +# FindRepeatsPacBio + +## Introduction +This tool looks and annotates repeat regions inside a BAM file. 
It extracts the regions of interest from a bed file and then intersects +those regions with the BAM file. On those extracted regions the tool will perform a + Mpileup and counts all insertions/deletions etc. etc. for that specific location on a per read basis. + + +## Example +To get the help menu: +~~~ +java -jar Biopet-0.2.0.jar tool FindRepeatsPacBio -h +Usage: FindRepeatsPacBio [options] + + -l <value> | --log_level <value> + Log level + -h | --help + Print usage + -v | --version + Print version + -I <file> | --inputBam <file> + + -b <file> | --inputBed <file> + output file, default to stdout +~~~ + +To run the tool: +~~~ +java -jar Biopet-0.2.0.jar tool FindRepeatsPacBio --inputBam myInputbam.bam \ +--inputBed myRepeatRegions.bed > mySummary.txt +~~~ +Since the default output of the program is printed in stdout we can use > to write the output to a text file. + + +## Output +The Output is a tab delimited text file which looks like this: + +|chr |startPos|stopPos |Repeat_seq|repeatLength|original_Repeat_readLength| +|-----|--------|--------|----------|------------|--------------------------| +|chr4 |3076603 |3076667 |CAG |3 |65 | +|chr4 |3076665 |3076667 |GCC |3 |3 | +|chrX |66765158|66765261|GCA |3 |104 | + +table continues below: + +|Calculated_repeat_readLength|minLength|maxLength|inserts | +|----------------------------|---------|---------|-------------------------------------| +|61,73,68 |61 |73 |GAC,G,T/A,C,G,G,A,G,A,G/C,C,C,A,C,A,G| +|3,3,3 |3 |3 |// | +|98 |98 |98 |A,G,G | + +table continues below: + +|deletions |notSpan| +|--------------------|-------| +|1,1,2,1,1,1,2//2,1,1|0 | +|// |0 | +|1,1,1,1,1,1,2,1 |0 | \ No newline at end of file diff --git a/docs/tools/MergeAlleles.md b/docs/tools/MergeAlleles.md new file mode 100644 index 0000000000000000000000000000000000000000..f1d891ca085f55085399c1a424fe841ff11d77cf --- /dev/null +++ b/docs/tools/MergeAlleles.md @@ -0,0 +1,34 @@ +# MergeAlleles + +## Introduction +This tool is used to merge overlapping 
alleles. + + +## Example +To get the help menu: +~~~ +java -jar Biopet-0.2.0.jar tool MergeAlleles -h +Usage: MergeAlleles [options] + + -l <value> | --log_level <value> + Log level + -h | --help + Print usage + -v | --version + Print version + -I <file> | --inputVcf <file> + + -o <file> | --outputVcf <file> + + -R <file> | --reference <file> +~~~ + +To run the tool: +~~~ +java -jar Biopet-0.2.0-DEV-801b72ed.jar tool MergeAlleles \ +--inputVcf myInput.vcf --outputVcf myOutput.vcf \ +--reference /H.Sapiens/hg19/reference.fa +~~~ + +## Output +The output of this tool is a VCF file like format containing the merged Alleles only. \ No newline at end of file diff --git a/docs/tools/VcfFilter.md b/docs/tools/VcfFilter.md new file mode 100644 index 0000000000000000000000000000000000000000..6967840356d74e7fcbdc64e3900088218187cedb --- /dev/null +++ b/docs/tools/VcfFilter.md @@ -0,0 +1,56 @@ +# VcfFilter + +## Introduction +This tool enables a user to filter VCF files. For example on sample depth and/or total depth. +It can also be used to filter out the reference calls and/or minimum number of sample passes. +There is a wide set of options which one can use to change the filter settings. 
+ +## Example +To open the help menu: +~~~ +java -jar Biopet-0.2.0.jar tool VcfFilter -h +Usage: VcfFilter [options] + + -l <value> | --log_level <value> + Log level + -h | --help + Print usage + -v | --version + Print version + -I <file> | --inputVcf <file> + Input vcf file + -o <file> | --outputVcf <file> + Output vcf file + --minSampleDepth <int> + Min value for DP in genotype fields + --minTotalDepth <int> + Min value of DP field in INFO fields + --minAlternateDepth <int> + Min value of AD field in genotype fields + --minSamplesPass <int> + Min number of samples to pass --minAlternateDepth, --minBamAlternateDepth and --minSampleDepth + --minBamAlternateDepth <int> + --denovoInSample <sample> + Only show variants that contain unique alleles in compete set for given sample + --mustHaveVariant <sample> + Given sample must have 1 alternative allele + --diffGenotype <sample:sample> + Given samples must have a different genotype + --filterHetVarToHomVar <sample:sample> + If variants in sample 1 are heterogeneous and alternative alleles are homogeneous in sample 2 variants are filtered + --filterRefCalls + Filter when there are only ref calls + --filterNoCalls + Filter when there are only no calls + --minQualscore <value> + Min qual score +~~~ + +To run the tool: +~~~ +java -jar Biopet-0.2.0.jar tool VcfFilter --inputVcf myInput.vcf \ +--outputVcf myOutput.vcf --filterRefCalls --minSampleDepth +~~~ + +## Output +The output is a vcf file containing the filters specified values. \ No newline at end of file diff --git a/docs/tools/VcfToTsv.md b/docs/tools/VcfToTsv.md new file mode 100644 index 0000000000000000000000000000000000000000..4f1e294f976a97e564dbf7ec3b516271d1c153d3 --- /dev/null +++ b/docs/tools/VcfToTsv.md @@ -0,0 +1,46 @@ +# VcfToTsv + +## Introduction +This tool enables a user to convert a vcf file to a tab delimited file (TSV). +This can be very usefull since some programs only accept a TSV for downstream analysis. 
+It gets rid of the vcf header and parses all data columns in a nice TSV file. +There is also a possibility to only select some specific fields from you vcf and only parse those fields to a TSV. + +## Example +To open the help menu: +~~~ +java -jar Biopet-0.2.0.jar tool VcfToTsv -h +Usage: VcfToTsv [options] + + -l <value> | --log_level <value> + Log level + -h | --help + Print usage + -v | --version + Print version + -I <file> | --inputFile <file> + + -o <file> | --outputFile <file> + output file, default to stdout + -f <value> | --field <value> + + -i <value> | --info_field <value> + + --all_info + + --all_format + + -s <value> | --sample_field <value> + + -d | --disable_defaults +~~~ + +To run the tool: +~~~ +java -jar Biopet-0.2.0.jar tool VcfToTsv --inputFile myVCF.vcf \ +--outputFile my_tabDelimited_VCF.tsv --all_info +~~~ + +## Output +The output of this tool is a TSV file produced from the input vcf file. +Depending on which options are enabled their could be some fields discarded. \ No newline at end of file diff --git a/docs/tools/WipeReads.md b/docs/tools/WipeReads.md new file mode 100644 index 0000000000000000000000000000000000000000..96377a03122808519293fb8204dd4144de974b1c --- /dev/null +++ b/docs/tools/WipeReads.md @@ -0,0 +1,64 @@ +# WipeReads + +## Introduction +WipeReads is a tool for removing reads from indexed BAM files. +It respects pairing information and can be set to remove reads whose duplicate +maps outside of the target region. The main use case is to remove reads mapping +to known ribosomal RNA regions (using a supplied BED file containing intervals for these regions). 
+ +## Example +To open the help menu: +~~~ +java -jar Biopet-0.2.0.jar tool WipeReads -h + +WipeReads - Region-based reads removal from an indexed BAM file + +Usage: WipeReads [options] + + -l <value> | --log_level <value> + Log level + -h | --help + Print usage + -v | --version + Print version + -I <bam> | --input_file <bam> + Input BAM file + -r <bed/gtf/refflat> | --interval_file <bed/gtf/refflat> + Interval BED file + -o <bam> | --output_file <bam> + Output BAM file + -f <bam> | --discarded_file <bam> + Discarded reads BAM file (default: none) + -Q <value> | --min_mapq <value> + Minimum MAPQ of reads in target region to remove (default: 0) + -G <rgid> | --read_group <rgid> + Read group IDs to be removed (default: remove reads from all read groups) + --limit_removal + Whether to remove multiple-mapped reads outside the target regions (default: yes) + --no_make_index + Whether to index output BAM file or not (default: yes) + +GTF-only options: + -t <gtf_feature_type> | --feature_type <gtf_feature_type> + GTF feature containing intervals (default: exon) + +Advanced options: + --bloom_size <value> + Expected maximum number of reads in target regions (default: 7e7) + --false_positive <value> + False positive rate (default: 4e-7) + +This tool will remove BAM records that overlaps a set of given regions. +By default, if the removed reads are also mapped to other regions outside +the given ones, they will also be removed. +~~~ + +To run the tool: +~~~ +java -jar Biopet-0.2.0.jar tool WipeReads --input_file myBam.bam \ +--interval_file myRibosomal_regions.bed --output_file myFilteredBam.bam +~~~ + +## Output +This tool outputs a bam file containing all the reads not inside a ribosomal region. 
+Optionally, it also outputs a bam file containing only the ribosomal reads.
diff --git a/docs/tools/bedtointerval.md b/docs/tools/bedtointerval.md
new file mode 100644
index 0000000000000000000000000000000000000000..2c7093f5c4b28d0f5c269e8a1ed76219afa8e3bf
--- /dev/null
+++ b/docs/tools/bedtointerval.md
@@ -0,0 +1,32 @@
+# BedToInterval
+
+## Introduction
+BedToInterval has been written to ensure a proper input for the tools from Picard.
+Since the latest release of Picard tools (v 1.124), there is already a tool available called: BedToIntervalList.
+
+## Example
+To get the help menu:
+~~~
+java -jar Biopet-0.2.0.jar tool BedToInterval -h
+Usage: BedToInterval [options]
+
+  -l <value> | --log_level <value>
+        Log level
+  -h | --help
+        Print usage
+  -v | --version
+        Print version
+  -I <file> | --inputFile <file>
+
+  -o <file> | --output <file>
+
+  -b <file> | --bam <file>
+~~~
+
+To run the tool:
+~~~
+java -jar Biopet-0.2.0.jar tool BedToInterval -I myBed.bed -o myIntervals.txt -b myBam.bam
+~~~
+
+## Results
+The results of this tool will be a tab delimited text file called an interval list.
\ No newline at end of file
diff --git a/docs/tools/bedtoolscoveragetocounts.md b/docs/tools/bedtoolscoveragetocounts.md
new file mode 100644
index 0000000000000000000000000000000000000000..441fcc71da38db096cc446ef388d597ebd992890
--- /dev/null
+++ b/docs/tools/bedtoolscoveragetocounts.md
@@ -0,0 +1,31 @@
+# BedtoolsCoverageToCounts
+
+## Introduction
+This tool enables a user to generate a count file out of a coverage file.
+
+
+## Example
+To get the help menu:
+~~~bash
+java -jar Biopet-0.2.0.jar tool BedtoolsCoverageToCounts -h
+Usage: BedtoolsCoverageToCounts [options]
+
+  -l <value> | --log_level <value>
+        Log level
+  -h | --help
+        Print usage
+  -v | --version
+        Print version
+  -I <file> | --input <file>
+
+  -o <file> | --output <file>
+~~~
+
+input: coverage file produced with bedtools
+output: a count file with the counts from the values inside the coverage file.
Where values could be almost anything, e.g.
+genes, Ensembl IDs, etc.
+
+To run the tool:
+~~~bash
+java -jar Biopet-0.2.0.jar tool BedtoolsCoverageToCounts
+~~~
\ No newline at end of file
diff --git a/docs/tools/sagetools.md b/docs/tools/sagetools.md
index 7e90fba4fab137faa655f285159a3dbe449073ac..62451c70cc2d787c679b0ee23d3ee88296f8d7dd
--- a/docs/tools/sagetools.md
+++ b/docs/tools/sagetools.md
@@ -1,8 +1,11 @@
 # SAGE tools
-
+These tools are written to create the appropriate files for the SAGE pipeline.
+Note that these tools are already implemented in the pipeline.
 ## SageCountFastq
+To open the help menu:
 ~~~
+java -jar Biopet-0.2.0.jar tool SageCountFastq -h
 Usage: SageCountFastq [options]
 
   -l <value> | --log_level <value>
@@ -17,7 +20,9 @@ Usage: SageCountFastq [options]
 ~~~
 
 ## SageCreateLibrary
+To open the help menu:
 ~~~
+java -jar Biopet-0.2.0.jar tool SageCreateLibrary -h
 Usage: SageCreateLibrary [options]
 
   -l <value> | --log_level <value>
@@ -39,11 +44,12 @@ Usage: SageCreateLibrary [options]
     --noAntiTagsOutput <file>
 
     --allGenesOutput <file>
-
 ~~~
 
 ## SageCreateTagCounts
+To open the help menu:
 ~~~
+java -jar Biopet-0.2.0.jar tool SageCreateTagCounts -h
 Usage: SageCreateTagCounts [options]
 
   -l <value> | --log_level <value>
diff --git a/mkdocs.yml b/mkdocs.yml
index 168e182e0bd05d43bea16b4f932a8bb56fbe7103..f1530e5cd46c562a3c9d3b32acd476b1db22a9f6
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -1,18 +1,27 @@
-site_name: Biopet user manual
+site_name: Biopet User Manual
 pages:
 - ['index.md', 'Home']
-- ['config.md', 'Config']
+- ['general/config.md', 'General', 'Config']
 - ['pipelines/basty.md', 'Pipelines', 'Basty']
 - ['pipelines/GATK-pipeline.md', 'Pipelines', 'GATK-pipeline']
 - ['pipelines/flexiprep.md', 'Pipelines', 'Flexiprep']
 - ['pipelines/mapping.md', 'Pipelines', 'Mapping']
 - ['pipelines/sage.md', 'Pipelines', 'Sage']
-- ['tools/SamplesTsvToJson.md','tools','SamplesTsvToJson']
-- 
['tools/BastyGenerateFasta.md','tools','BastyGenerateFasta'] -- ['tools/MpileupToVcf.md', 'tools', 'MpileupToVcf'] -- ['tools/sagetools.md', 'tools', 'Sagetools'] -- ['cluster/oge.md', 'OpenGridEngine'] +- ['tools/SamplesTsvToJson.md','Tools','SamplesTsvToJson'] +- ['tools/BastyGenerateFasta.md','Tools','BastyGenerateFasta'] +- ['tools/bedtointerval.md','Tools','BedToInterval'] +- ['tools/bedtoolscoveragetocounts.md','Tools','BedtoolsCoverageToCounts'] +- ['tools/BiopetFlagstat.md','Tools','BiopetFlagstat'] +- ['tools/CheckAllelesVcfInBam.md','Tools','CheckAllelesVcfInBam'] +- ['tools/ExtractAlignedFastq.md','Tools','ExtractAlignedFastq'] +- ['tools/FastqSplitter.md', 'Tools','FastqSplitter'] +- ['tools/FindRepeatsPacBio.md','Tools','FindRepeatsPacBio'] +- ['tools/VcfFilter.md','Tools','VcfFilter'] +- ['tools/MpileupToVcf.md', 'Tools', 'MpileupToVcf'] +- ['tools/sagetools.md', 'Tools', 'Sagetools'] +- ['tools/WipeReads.md', 'Tools', 'WipeReads'] +#- ['developing/Setup.md', 'Developing', 'Setting up your local development environment'] - ['about.md', 'About'] - ['license.md', 'License'] -theme: readthedocs -repo_url: https://git.lumc.nl/biopet/biopet +#theme: readthedocs +repo_url: https://git.lumc.nl/biopet/biopet \ No newline at end of file