getting-started.md 6.76 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1
2
3
4
# Developer - Getting started

### Requirements
- Maven 3.3
Peter van 't Hof's avatar
Peter van 't Hof committed
5
6
- GATK installed in your local Maven repository (see below)
- Biopet installed in your local Maven repository (see below)
7
8
9
10
11
12
13
14
15
- Some knowledge of the programming language [Scala](http://www.scala-lang.org/) (The pipelines are scripted using Scala)
- We encourage users to use an IDE for scripting the pipeline. One that works pretty well for us is: [IntelliJ IDEA](https://www.jetbrains.com/idea/)

To start the development of a biopet pipeline you should have the following tools installed: 

* Gatk 
* Biopet

Make sure both tools are installed in your local maven repository. To do this one should use the commands below.
Peter van 't Hof's avatar
Peter van 't Hof committed
16
17

```bash
18
# Replace 'mvn' with the location of your Maven executable, or add it to your PATH with the export command.
Peter van 't Hof's avatar
Peter van 't Hof committed
19
20
21
git clone https://github.com/broadgsa/gatk-protected
cd gatk-protected
git checkout 3.4
22
# Each Biopet release is bound to a specific GATK version. Biopet 0.5.0 uses GATK 3.4
Peter van 't Hof's avatar
Peter van 't Hof committed
23
24
25
26
27
28
29
30
31
32
mvn clean install

cd ..

git clone https://github.com/biopet/biopet.git
cd biopet
git checkout 0.5.0
mvn -DskipTests=true clean install
```

Peter van 't Hof's avatar
Peter van 't Hof committed
33
34
### Basic components

35
36
### Qscript (pipeline)
A basic pipeline would look like this. [Extended example](example-pipeline.md)
Peter van 't Hof's avatar
Peter van 't Hof committed
37

38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
```scala
package org.example.group.pipelines

import nl.lumc.sasc.biopet.core.{ BiopetQScript, PipelineCommand }
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.extensions.{ Gzip, Cat }
import org.broadinstitute.gatk.queue.QScript

//TODO: Replace class name, must be the same as the class of the pipeline
class SimplePipeline(val root: Configurable) extends QScript with BiopetQScript {
  // Root pipelines are created without arguments, so a no-arg constructor must exist
  def this() = this(null)

  @Input(required = true)
  var inputFile: File = null

  /** Hook for initializing classes before the pipeline is built; this example needs none */
  def init(): Unit = ()

  /** Builds the pipeline: the input is passed through cat and the result is compressed with gzip */
  def biopetScript: Unit = {
    // Step 1: cat the input file into <outputDir>/file.out
    val concatenated = new File(outputDir, "file.out")
    val catJob = new Cat(this)
    catJob.input :+= inputFile
    catJob.output = concatenated
    add(catJob)

    // Step 2: gzip the output of step 1 into <outputDir>/file.out.gz
    val compressed = new File(outputDir, "file.out.gz")
    val gzipJob = new Gzip(this)
    gzipJob.input :+= concatenated
    gzipJob.output = compressed
    add(gzipJob)
  }
}

//TODO: Replace object name, must be the same as the class of the pipeline
// Companion object of the pipeline class; presumably PipelineCommand supplies the
// command line entry point for running the pipeline (verify against PipelineCommand).
object SimplePipeline extends PipelineCommand
```

76
### Extensions (wrappers)
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
A wrapper has to be written for each tool that is used inside a pipeline. A basic wrapper (this example wraps the Linux `cat` command) would look like this:
```scala
package nl.lumc.sasc.biopet.extensions

import java.io.File

import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }

/**
 * Extension for GNU cat
 *
 * Writes all input files, in order, to a single output file.
 */
class Cat(val root: Configurable) extends BiopetCommandLineFunction {
  @Input(doc = "Input file", required = true)
  var input: List[File] = Nil

  // cat does not decompress anything, so the output is described as a plain output file
  @Output(doc = "Output file", required = true)
  var output: File = _

  // The executable can be overridden in the config with the "exe" key; defaults to "cat"
  executable = config("exe", default = "cat")

  /** return commandline to execute: `<executable> <input...> > <output>` */
  def cmdLine: String = required(executable) + repeat(input) + " > " + required(output)
}
```

104
105
106
107
108
109
### Tools (Scala programs)
Within the Biopet framework it is also possible to write your own tools in Scala. 
When a certain functionality or script is not incorporated within the framework one can write a tool that does the job. 
Below you can see an example tool which is written for automatically building sample configs.

[Extended example](example-tool.md)
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171

```scala
package nl.lumc.sasc.biopet.tools

import java.io.{ PrintWriter, File }

import nl.lumc.sasc.biopet.utils.ConfigUtils._
import nl.lumc.sasc.biopet.utils.ToolCommand
import scala.collection.mutable

import scala.io.Source

/**
 * This tool can convert one or more tsv files to a json sample config.
 *
 * The json is written to the output file when one is given, otherwise it is printed to stdout.
 */
object SamplesTsvToJson extends ToolCommand {
  /** Command line arguments: the tsv files to convert and the optional json file to write */
  case class Args(inputFiles: List[File] = Nil, outputFile: Option[File] = None) extends AbstractArgs

  /** Parser that fills [[Args]] from the command line */
  class OptParser extends AbstractOptParser {
    opt[File]('i', "inputFiles") required () unbounded () valueName "<file>" action { (x, c) =>
      c.copy(inputFiles = x :: c.inputFiles)
    } text "Input must be a tsv file, first line is seen as header and must at least have a 'sample' column, 'library' column is optional, multiple files allowed"
    opt[File]('o', "outputFile") unbounded () valueName "<file>" action { (x, c) =>
      c.copy(outputFile = Some(x))
    }
  }

  /**
   * Executes SamplesTsvToJson
   *
   * Exits with status 1 when the arguments can not be parsed.
   */
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)

    val jsonString = stringFromInputs(commandArgs.inputFiles)
    commandArgs.outputFile match {
      case Some(file) =>
        val writer = new PrintWriter(file)
        // Always release the file handle, also when writing fails
        try writer.println(jsonString) finally writer.close()
      case _ => println(jsonString)
    }
  }

  /**
   * Reads a single tsv file into a nested map.
   *
   * Empty lines are skipped and the first remaining line is used as header. Each other line
   * becomes "samples" -> sample -> values, or "samples" -> sample -> "libraries" -> library -> values
   * when the header has a 'library' column; empty cells are left out. The maps of all lines are
   * combined with mergeMaps.
   *
   * @param inputFile tab separated file with a header line
   * @return combined map of all lines in the file
   * @throws IllegalStateException when the file has no header line, the 'sample' column is missing,
   *                               a sample or library starts with a digit or a sample/library
   *                               combination is found multiple times
   * @throws IllegalArgumentException when a line has a different number of columns than the header
   */
  def mapFromFile(inputFile: File): Map[String, Any] = {
    val reader = Source.fromFile(inputFile)
    // Read all lines eagerly so the file can be closed before the content is processed
    val lines = try reader.getLines().toList.filter(!_.isEmpty) finally reader.close()

    val (headerLine, tsvLines) = lines match {
      case head :: tail => (head, tail)
      case Nil          => throw new IllegalStateException("Header line does not exist in: " + inputFile)
    }
    val header = headerLine.split("\t")
    val sampleColumn = header.indexOf("sample")
    val libraryColumn = header.indexOf("library")
    if (sampleColumn == -1) throw new IllegalStateException("Sample column does not exist in: " + inputFile)

    // Sample/library combinations that are already seen, used to detect duplicate lines
    val sampleLibCache: mutable.Set[(String, Option[String])] = mutable.Set()

    val librariesValues: List[Map[String, Any]] = for (tsvLine <- tsvLines) yield {
      val values = tsvLine.split("\t")
      require(header.length == values.length, "Number of columns is not the same as the header")
      val sample = values(sampleColumn)
      val library = if (libraryColumn != -1) Some(values(libraryColumn)) else None

      //FIXME: this is a workaround, should be removed after fixing #180
      // 'exists' instead of 'forall': a file without library column (None) must not be rejected
      // NOTE(review): '.head' still fails on an empty sample or library cell - confirm whether
      // empty cells should be rejected with an explicit message
      if (sample.head.isDigit || library.exists(_.head.isDigit))
        throw new IllegalStateException("Sample or library may not start with a number")

      if (sampleLibCache.contains((sample, library)))
        throw new IllegalStateException(s"Combination of $sample ${library.map("and " + _).getOrElse("")} is found multiple times")
      else sampleLibCache.add((sample, library))

      // All non-empty cells except the sample and library columns, keyed by their header name
      val valuesMap = (for (
        t <- 0 until values.size if !values(t).isEmpty && t != sampleColumn && t != libraryColumn
      ) yield header(t) -> values(t)).toMap
      library match {
        case Some(lib) => Map("samples" -> Map(sample -> Map("libraries" -> Map(lib -> valuesMap))))
        case _         => Map("samples" -> Map(sample -> valuesMap))
      }
    }
    librariesValues.foldLeft(Map[String, Any]())((acc, kv) => mergeMaps(acc, kv))
  }

  /**
   * Converts all given tsv files to a single json string.
   *
   * @param inputs tsv files, each is read with [[mapFromFile]] and the results are combined with mergeMaps
   * @return json representation of the combined map, rendered with spaces2
   */
  def stringFromInputs(inputs: List[File]): String = {
    val map = inputs.map(f => mapFromFile(f)).foldLeft(Map[String, Any]())((acc, kv) => mergeMaps(acc, kv))
    mapToJson(map).spaces2
  }
}
```