From 9cb010e94e60504548d66f7d7e75661e3533f5ab Mon Sep 17 00:00:00 2001
From: Peter van 't Hof <p.j.van_t_hof@lumc.nl>
Date: Tue, 17 May 2016 13:27:23 +0200
Subject: [PATCH] Add gtf to refflat

---
 .../nl/lumc/sasc/biopet/extensions/Awk.scala  | 41 +++++++++++++++++++
 .../biopet/extensions/GtfToGenePred.scala     | 39 ++++++++++++++++++
 .../generateindexes/GenerateIndexes.scala     |  7 +++-
 3 files changed, 85 insertions(+), 2 deletions(-)
 create mode 100644 biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Awk.scala
 create mode 100644 biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/GtfToGenePred.scala

diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Awk.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Awk.scala
new file mode 100644
index 000000000..594e6f564
--- /dev/null
+++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Awk.scala
@@ -0,0 +1,41 @@
+package nl.lumc.sasc.biopet.extensions
+
+import java.io.File
+
+import nl.lumc.sasc.biopet.core.{BiopetCommandLineFunction, Version}
+import nl.lumc.sasc.biopet.utils.config.Configurable
+import org.broadinstitute.gatk.utils.commandline.{Input, Output}
+
+import scala.util.matching.Regex
+
+/**
+  * Wrapper to run an awk program as a pipeline job. Created by pjvan_thof on 17-5-16.
+  */
+class Awk(val root: Configurable) extends BiopetCommandLineFunction with Version {
+  executable = config("exe", default = "awk", freeVar = false)
+
+  def versionCommand: String = executable + " --version"
+
+  def versionRegex: Regex = """(GNU Awk \d+\.\d+\.\d+)""".r
+
+  @Input(required = false)
+  var input: File = _
+
+  @Output
+  var output: File = _
+  /** The awk program text; placed directly after the executable on the command line. */
+  var command: String = _
+  /** Input file and output redirect are left out when that side of the job is piped. */
+  def cmdLine: String = executable +
+    required(command) +
+    (if (inputAsStdin) "" else optional(input)) + // input is @Input(required = false): emit nothing when unset
+    (if (outputAsStsout) "" else " > " + required(output))
+}
+
+object Awk {
+  def apply(root: Configurable, command: String): Awk = { // factory: job under `root` with its program preset
+    val job = new Awk(root)
+    job.command = command
+    job
+  }
+}
\ No newline at end of file
diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/GtfToGenePred.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/GtfToGenePred.scala
new file mode 100644
index 000000000..3593ef84b
--- /dev/null
+++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/GtfToGenePred.scala
@@ -0,0 +1,39 @@
+package nl.lumc.sasc.biopet.extensions
+
+import java.io.File
+
+import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction
+import nl.lumc.sasc.biopet.utils.config.Configurable
+import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
+
+/**
+  * Wrapper for the gtfToGenePred executable (GTF to genePred). Created by pjvan_thof on 17-5-16.
+  */
+class GtfToGenePred(val root: Configurable) extends BiopetCommandLineFunction {
+  executable = config("exe", default = "gtfToGenePred", freeVar = false)
+
+  @Input
+  var inputGtfs: List[File] = Nil
+
+  @Output
+  var outputGenePred: File = _
+
+  @Output
+  var infoOut: Option[File] = None
+  // Each switch below adds the same-named flag to the command line when true.
+  var genePredExt: Boolean = config("gene_pred_ext", default = false)
+  var allErrors: Boolean = config("all_errors", default = false)
+  var impliedStopAfterCds: Boolean = config("implied_stop_after_cds", default = false)
+  var simple: Boolean = config("simple", default = false)
+  var geneNameAsName2: Boolean = config("gene_name_as_name2", default = false)
+  /** Flags first, then the input GTF(s) and the output path as positional arguments. */
+  def cmdLine: String = executable +
+    conditional(genePredExt, "-genePredExt") +
+    conditional(allErrors, "-allErrors") +
+    optional("-infoOut=", infoOut, spaceSeparated = false) + // the tool expects -infoOut=file
+    conditional(impliedStopAfterCds, "-impliedStopAfterCds") +
+    conditional(simple, "-simple") +
+    conditional(geneNameAsName2, "-geneNameAsName2") +
+    repeat(inputGtfs) +
+    (if (outputAsStsout) required("/dev/stdout") else required(outputGenePred))
+}
diff --git a/generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/GenerateIndexes.scala b/generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/GenerateIndexes.scala
index f4a0e0406..38eb6178c 100644
--- a/generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/GenerateIndexes.scala
+++ b/generate-indexes/src/main/scala/nl/lumc/sasc/biopet/pipelines/generateindexes/GenerateIndexes.scala
@@ -208,9 +208,12 @@ class GenerateIndexes(val root: Configurable) extends QScript with BiopetQScript
 
         val refFlatFile: Option[File] = gtfFile.map { gtf =>
           val refFlat = new File(gtf + ".refFlat")
-          //TODO: gtf to refFlat conversion
+          val gtfToGenePred = new GtfToGenePred(this)
+          gtfToGenePred.inputGtfs :+= gtf
 
-          outputConfig += "ribosome_refflat" -> refFlat
+          gtfToGenePred.genePredExt = true // awk below reads $12 (name2), present only in extended genePred
+          add(gtfToGenePred | Awk(this, """{ print $12"\t"$1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10 }""") > refFlat)
+          outputConfig += "annotation_refflat" -> refFlat
           refFlat
         }
 
-- 
GitLab