From abca64c223180c6427901a0d8df318787bbf2d65 Mon Sep 17 00:00:00 2001
From: DavyCats <davycats.dc@gmail.com>
Date: Thu, 14 Feb 2019 15:40:32 +0100
Subject: [PATCH] allow for additional annotations to be added into mergecounts
 output

---
 mergecounts.wdl | 41 +++++++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 10 deletions(-)

diff --git a/mergecounts.wdl b/mergecounts.wdl
index 5de98e7..5059ea9 100644
--- a/mergecounts.wdl
+++ b/mergecounts.wdl
@@ -9,6 +9,9 @@ task MergeCounts {
         Int featureColumn
         Int valueColumn
         Boolean inputHasHeader
+        String featureAttribute = "gene_id"
+        File referenceGtf
+        Array[String]+? additionalAttributes
     }
 
     # Based on a script by Szymon Kielbasa/Ioannis Moustakas
@@ -19,26 +22,43 @@ task MergeCounts {
         R --no-save <<CODE
             library(dplyr)
             library(reshape2)
+            library(refGenome)
 
-            listOfFiles <- c("~{sep='", "' inputFiles}")
+            list.of.files <- c("~{sep='", "' inputFiles}")
 
-            valueI <- ~{valueColumn}
-            featureI <- ~{featureColumn}
+            value.i <- ~{valueColumn}
+            feature.i <- ~{featureColumn}
             header <- ~{true="TRUE" false="FALSE" inputHasHeader}
+            feature.attribute <- "~{featureAttribute}"
+            additional.attributes <- c(~{true='"' false="" defined(additionalAttributes)}~{sep='", "' additionalAttributes}~{true='"' false="" defined(additionalAttributes)})
+            reference.gtf <- "~{referenceGtf}"
+            output.path <- "~{outputFile}"
 
-            d <- do.call(rbind, lapply(listOfFiles, function(file){
+            d <- do.call(rbind, lapply(list.of.files, function(file){
                 d <- read.table(file, sep="\t", header=header, comment.char="#")
 
-                splitPath <- strsplit(file, "/")[[1]]
-                colnames(d)[valueI] <- sub("\\\.[^\\\.]*$", "",
-                    splitPath[length(splitPath)])
-                colnames(d)[featureI] <- "feature"
+                filename <- basename(file)
+                colnames(d)[value.i] <- sub("\\\.[^\\\.]*$", "", filename)
+                colnames(d)[feature.i] <- "feature"
 
-                d <- d %>% melt(id.vars=featureI, variable.name="sample", value.name="count")
+                d <- d %>% melt(id.vars=feature.i, variable.name="sample",
+                    value.name="count")
             }))
 
             d <- d %>% dcast(feature ~ sample, value.var="count")
-            write.table(d, file="~{outputFile}", sep="\t", quote=FALSE, row.names=FALSE)
+
+            gtf <- ensemblGenome(dirname(reference.gtf))
+            read.gtf(gtf, basename(reference.gtf))
+
+            gtf.table <- gtf@ev$gtf
+            gtf.table <- gtf.table[order(gtf.table[,feature.attribute]),]
+            gtf.table <- gtf.table[!duplicated(gtf.table[,feature.attribute]),]
+            id.table <- gtf.table[, c(feature.attribute, additional.attributes), drop=F]
+            output.table <- merge(id.table, d, all.y = T, by.y="feature",
+                by.x=feature.attribute)
+
+            write.table(output.table, file=output.path, sep="\t", quote=FALSE,
+                row.names=FALSE, na="")
         CODE
     >>>
 
@@ -48,5 +68,6 @@ task MergeCounts {
 
     runtime {
         memory: 4 + (2*length(inputFiles))
+        docker: "biowdl/mergecounts:latest"
     }
 }
\ No newline at end of file
-- 
GitLab