From 787d0c99a2293ed04fe2b5820f636f0c63f85f3f Mon Sep 17 00:00:00 2001 From: Wai Yi Leung <w.y.leung@lumc.nl> Date: Tue, 27 Oct 2015 11:33:20 +0100 Subject: [PATCH] An update on example pipeline --- docs/developer/example-pipeline.md | 112 ++++++++++++++++++ docs/developer/example-tool.md | 28 +++-- .../group/pipelines/BiopetPipeline.scala | 1 - .../group/pipelines/HelloPipeline.scala | 42 +++++++ 4 files changed, 175 insertions(+), 8 deletions(-) create mode 100644 external-example/src/main/scala/org/example/group/pipelines/HelloPipeline.scala diff --git a/docs/developer/example-pipeline.md b/docs/developer/example-pipeline.md index 659baea82..ad075e071 100644 --- a/docs/developer/example-pipeline.md +++ b/docs/developer/example-pipeline.md @@ -1,10 +1,122 @@ # Developer - Example pipeline +This document/tutorial will show you how to add a new pipeline to biopet. The minimum requirement is having: + + - A clean biopet checkout from git + - Texteditor or IntelliJ IDEA + +### Adding pipeline folder + +Via commandline: + +``` +cd biopet/public/ +mkdir -p mypipeline/src/main/scala/nl/lumc/sasc/biopet/pipelines/mypipeline +``` ### Adding maven project +Adding a `pom.xml` to `biopet/public/mypipeline` folder. 
The example below is the minimum required POM definition + +```xml +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <artifactId>Biopet</artifactId> + <groupId>nl.lumc.sasc</groupId> + <version>0.5.0-SNAPSHOT</version> + <relativePath>../</relativePath> + </parent> + <modelVersion>4.0.0</modelVersion> + + <inceptionYear>2015</inceptionYear> + <artifactId>MyPipeline</artifactId> + <name>MyPipeline</name> + <packaging>jar</packaging> + + <dependencies> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>BiopetCore</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>nl.lumc.sasc</groupId> + <artifactId>BiopetToolsExtensions</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>org.testng</groupId> + <artifactId>testng</artifactId> + <version>6.8</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.scalatest</groupId> + <artifactId>scalatest_2.10</artifactId> + <version>2.2.1</version> + <scope>test</scope> + </dependency> + </dependencies> + +</project> +``` + ### Initial pipeline code +In `biopet/public/mypipeline/src/main/scala/nl/lumc/sasc/biopet/pipelines/mypipeline` create a file named `HelloPipeline.scala` with the following contents: + +```scala +package nl.lumc.sasc.biopet/pipelines.mypipeline + +import nl.lumc.sasc.biopet.core.PipelineCommand +import nl.lumc.sasc.biopet.utils.config.Configurable +import nl.lumc.sasc.biopet.core.summary.SummaryQScript +import org.broadinstitute.gatk.queue.QScript + +class HelloPipeline(val root: Configurable) extends QScript with SummaryQScript { + def this() = this(null) + + /** Only required when using [[SummaryQScript]] */ + def summaryFile = new File(outputDir, "hello.summary.json") + + /** Only 
required when using [[SummaryQScript]] */ + def summaryFiles: Map[String, File] = Map() + + /** Only required when using [[SummaryQScript]] */ + def summarySettings = Map() + + // This method can be used to initialize some classes where needed + def init(): Unit = { + } + + // This method is the actual pipeline + def biopetScript: Unit = { + + // Executing a tool like FastQC + val shiva = new Shiva(this) + shiva.init() + shiva.biopetScript() + addAll(shiva.functions) + + /* Only required when using [[SummaryQScript]] */ + addSummaryQScript(shiva) + + // From here you can use the output files of shiva as input file of other jobs + } +} + +//TODO: Replace object Name, must be the same as the class of the pipeline +object HelloPipeline extends PipelineCommand + +``` + + + + + ### Config setup ### Test pipeline diff --git a/docs/developer/example-tool.md b/docs/developer/example-tool.md index a6477b3b3..2c02efdbb 100644 --- a/docs/developer/example-tool.md +++ b/docs/developer/example-tool.md @@ -27,7 +27,9 @@ object SimpleTool extends ToolCommand { } ``` -This is the minimum setup for having a working tool. (not functional yet) +This is the minimum setup for having a working tool. We will place some code for line counting in ``main``. As in other +high-level programming languages such as Java, C++ and .NET, one needs to specify an entry point for the program to run. ``def main`` +is the entry point from the command line into your tool. ### Program arguments and environment variables @@ -40,13 +42,13 @@ In biopet we facilitate an ``AbstractArgs`` case-class which stores the argument case class Args(inputFile: File = Nil, outputFile: Option[File] = None) extends AbstractArgs ``` -The arguments are stored in ``Args`` +The arguments are stored in ``Args``. This is a `case class`, which acts much like a Java `HashMap`, storing the arguments in an +object-like fashion. -Then add code that fills the Args. 
+Consuming and placing values in `Args` works as follows: ```scala class OptParser extends AbstractOptParser { - head( s""" |$commandName - Count lines in a textfile @@ -65,7 +67,11 @@ Then add code that fills the Args. } ``` -In the end your tool would look like the following: +One has to implement class `OptParser` in order to fill `Args`. In `OptParser` one defines the command-line arguments and how they should be processed. + In our example, we just copy the values passed on the command line. Further reading: [scala scopt](https://github.com/scopt/scopt) + +Let's put the code together in one file and test it with real functional code: + ```scala @@ -134,15 +140,22 @@ object SimpleTool extends ToolCommand { ### Running your new tool +#!TODO: write how to run the tool from a compiled state + + ### Debugging the tool with IDEA ### Setting up unit tests ### Adding tool-extension for usage in pipeline -When this tool is used in a pipeline in biopet, one has to add a tool wrapper for the tool created. +In order to use this tool within biopet, one should write an `extension` for the tool (as we also do for normal executables like `bwa-mem`). -The wrapper would look like: +The wrapper would look like this, basically exposing the same command-line arguments to biopet in an OOP format. +Note: we also add some functionality for getting summary data and passing it on to biopet. + +The concept of having (extension)-wrappers is to create a black-box service model. One only needs to know how to interact with the tool, without necessarily knowing the internals. + ```scala package nl.lumc.sasc.biopet.extensions.tools @@ -169,6 +182,7 @@ class SimpleTool(val root: Configurable) extends ToolCommandFunction with Summar @Output(doc = "Output JSON", shortName = "output", required = true) var output: File = _ + // setting the memory for this tool where it starts from. 
override def defaultCoreMemory = 1.0 override def cmdLine = super.cmdLine + diff --git a/external-example/src/main/scala/org/example/group/pipelines/BiopetPipeline.scala b/external-example/src/main/scala/org/example/group/pipelines/BiopetPipeline.scala index 6099047a6..f28b4d52f 100644 --- a/external-example/src/main/scala/org/example/group/pipelines/BiopetPipeline.scala +++ b/external-example/src/main/scala/org/example/group/pipelines/BiopetPipeline.scala @@ -1,7 +1,6 @@ package org.example.group.pipelines import nl.lumc.sasc.biopet.core.PipelineCommand -import nl.lumc.sasc.biopet.utils.config.Configurable import nl.lumc.sasc.biopet.core.summary.SummaryQScript import nl.lumc.sasc.biopet.pipelines.shiva.Shiva import nl.lumc.sasc.biopet.utils.config.Configurable diff --git a/external-example/src/main/scala/org/example/group/pipelines/HelloPipeline.scala b/external-example/src/main/scala/org/example/group/pipelines/HelloPipeline.scala new file mode 100644 index 000000000..cd4a158c1 --- /dev/null +++ b/external-example/src/main/scala/org/example/group/pipelines/HelloPipeline.scala @@ -0,0 +1,42 @@ +package nl.lumc.sasc.biopet/pipelines.mypipeline + +import nl.lumc.sasc.biopet.core.PipelineCommand +import nl.lumc.sasc.biopet.core.summary.SummaryQScript +import nl.lumc.sasc.biopet.extensions.Fastqc +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.queue.QScript + +class HelloPipeline(val root: Configurable) extends QScript with SummaryQScript { + def this() = this(null) + + /** Only required when using [[SummaryQScript]] */ + def summaryFile = new File(outputDir, "hello.summary.json") + + /** Only required when using [[SummaryQScript]] */ + def summaryFiles: Map[String, File] = Map() + + /** Only required when using [[SummaryQScript]] */ + def summarySettings = Map() + + // This method can be used to initialize some classes where needed + def init(): Unit = { + } + + // This method is the actual pipeline + def biopetScript: Unit = { + 
+ // Executing a tool like FastQC, calling the extension in `nl.lumc.sasc.biopet.extensions.Fastqc` + + val fastqc = new Fastqc(this) + fastqc.fastqfile = config("fastqc_input") + fastqc.output = new File(outputDir, + + /* Only required when using [[SummaryQScript]] */ + addSummaryQScript(shiva) + + // From here you can use the output files of shiva as input file of other jobs + } +} + +//TODO: Replace object Name, must be the same as the class of the pipeline +object HelloPipeline extends PipelineCommand \ No newline at end of file -- GitLab