Commit 7fb6a028 authored by Wai Yi Leung's avatar Wai Yi Leung
Browse files

Merge branch 'feature-gentrap_docs' into 'develop'

Feature gentrap docs

As we discussed earlier, this is the docs addition to Gentrap and other tools that it uses. I think this should make all the docs for ScalaDoc complete (for the tools I've written).

Please let me know if that is not the case :) ~ so I can add them.

See merge request !141
parents aa405ebc 172b0345
......@@ -21,28 +21,48 @@ import scala.collection.mutable.{ Set => MutSet }
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.core.config.{ Configurable, ConfigValue }
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/**
* Biopet wrapper for the [[MergeTables]] command line tool.
*
* @param root [[Configurable]] object
*/
class MergeTables(val root: Configurable) extends BiopetJavaCommandLineFunction {
javaMainClass = getClass.getName
override val defaultVmem = "5G"
/** List of input tabular files */
@Input(doc = "Input table files", required = true)
var inputTables: List[File] = List.empty[File]
/** Output file */
@Output(doc = "Output merged table", required = true)
var output: File = null
// TODO: should be List[Int] really
/** List of column indices to combine to make a unique identifier per row */
var idColumnIndices: List[String] = config("id_column_indices", default = List("1"))
/** Index of column from each tabular file containing the values to be put in the final merged table */
var valueColumnIndex: Int = config("value_column_index", default = 2)
/** Name of the identifier column in the output file */
var idColumnName: Option[String] = config("id_column_name")
/** Common file extension of all input files */
var fileExtension: Option[String] = config("file_extension")
/** Number of header lines from each input file to ignore */
var numHeaderLines: Option[Int] = config("num_header_lines")
/** String to use when a value is missing from an input file */
var fallbackString: Option[String] = config("fallback_string")
/** Column delimiter of each input file (used for splitting into columns */
var delimiter: Option[String] = config("delimiter")
// executed command line
......@@ -197,7 +217,7 @@ object MergeTables extends ToolCommand {
opt[Char]('d', "delimiter") optional () action { (x, c) =>
c.copy(delimiter = x)
} text "The string to use when a value for a feature is missing in one or more sample(s) (default: '-')"
} text "The character used for separating columns in the input files (default: '\\t')"
arg[File]("<input_tables> ...") unbounded () optional () action { (x, c) =>
c.copy(inputTables = c.inputTables :+ x)
......@@ -235,6 +255,7 @@ object MergeTables extends ToolCommand {
case otherwise => new BufferedWriter(new FileWriter(otherwise))
}
/** Main entry point */
def main(args: Array[String]): Unit = {
val commandArgs: Args = parseArgs(args)
......
......@@ -41,6 +41,8 @@ import nl.lumc.sasc.biopet.tools.{ MergeTables, WipeReads }
/**
* Gentrap pipeline
* Generic transcriptome analysis pipeline
*
* @author Wibowo Arindrarto <w.arindrarto@lumc.nl>
*/
class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript with SummaryQScript { qscript =>
......@@ -118,9 +120,11 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
}
}
/** Expression measures which are subject to TMM normalization during correlation calculation */
protected lazy val forTmmNormalization: Set[ExpMeasures.Value] =
Set(FragmentsPerGene, FragmentsPerExon, BasesPerGene, BasesPerExon)
/** Returns a QFunction to generate heatmaps */
private def makeHeatmapJob(mergeJob: Option[MergeTables], outName: String,
expMeasure: ExpMeasures.Value, isCuffIsoform: Boolean = false): Option[PlotHeatmap] =
(mergeJob.isDefined && samples.size > 2)
......@@ -142,6 +146,7 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
private lazy val geneFragmentsCountJob =
makeMergeTableJob((s: Sample) => s.geneFragmentsCount, ".fragments_per_gene", List(1), 2, fallback = "0")
/** Heatmap job for gene fragment count */
private lazy val geneFragmentsCountHeatmapJob =
makeHeatmapJob(geneFragmentsCountJob, "fragments_per_gene", FragmentsPerGene)
......@@ -149,6 +154,7 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
private lazy val exonFragmentsCountJob =
makeMergeTableJob((s: Sample) => s.exonFragmentsCount, ".fragments_per_exon", List(1), 2, fallback = "0")
/** Heatmap job for exon fragment count */
private lazy val exonFragmentsCountHeatmapJob =
makeHeatmapJob(exonFragmentsCountJob, "fragments_per_exon", FragmentsPerExon)
......@@ -156,6 +162,7 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
private lazy val geneBasesCountJob =
makeMergeTableJob((s: Sample) => s.geneBasesCount, ".bases_per_gene", List(1), 2, fallback = "0")
/** Heatmap job for gene base count */
private lazy val geneBasesCountHeatmapJob =
makeHeatmapJob(geneBasesCountJob, "bases_per_gene", BasesPerGene)
......@@ -163,6 +170,7 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
private lazy val exonBasesCountJob =
makeMergeTableJob((s: Sample) => s.exonBasesCount, ".bases_per_exon", List(1), 2, fallback = "0")
/** Heatmap job for exon base count */
private lazy val exonBasesCountHeatmapJob =
makeHeatmapJob(exonBasesCountJob, "bases_per_exon", BasesPerExon)
......@@ -170,6 +178,7 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
private lazy val geneFpkmCufflinksStrictJob =
makeMergeTableJob((s: Sample) => s.geneFpkmCufflinksStrict, ".genes_fpkm_cufflinks_strict", List(1, 7), 10, fallback = "0.0")
/** Heatmap job for gene FPKM Cufflinks, strict mode */
private lazy val geneFpkmCufflinksStrictHeatmapJob =
makeHeatmapJob(geneFpkmCufflinksStrictJob, "genes_fpkm_cufflinks_strict", CufflinksStrict)
......@@ -177,6 +186,7 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
private lazy val isoFpkmCufflinksStrictJob =
makeMergeTableJob((s: Sample) => s.isoformFpkmCufflinksStrict, ".isoforms_fpkm_cufflinks_strict", List(1, 7), 10, fallback = "0.0")
/** Heatmap job for isoform FPKM Cufflinks, strict mode */
private lazy val isoFpkmCufflinksStrictHeatmapJob =
makeHeatmapJob(isoFpkmCufflinksStrictJob, "isoforms_fpkm_cufflinks_strict", CufflinksStrict, true)
......@@ -184,6 +194,7 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
private lazy val geneFpkmCufflinksGuidedJob =
makeMergeTableJob((s: Sample) => s.geneFpkmCufflinksGuided, ".genes_fpkm_cufflinks_guided", List(1, 7), 10, fallback = "0.0")
/** Heatmap job for gene FPKM Cufflinks, guided mode */
private lazy val geneFpkmCufflinksGuidedHeatmapJob =
makeHeatmapJob(geneFpkmCufflinksGuidedJob, "genes_fpkm_cufflinks_guided", CufflinksGuided)
......@@ -191,6 +202,7 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
private lazy val isoFpkmCufflinksGuidedJob =
makeMergeTableJob((s: Sample) => s.isoformFpkmCufflinksGuided, ".isoforms_fpkm_cufflinks_guided", List(1, 7), 10, fallback = "0.0")
/** Heatmap job for isoform FPKM Cufflinks, guided mode */
private lazy val isoFpkmCufflinksGuidedHeatmapJob =
makeHeatmapJob(isoFpkmCufflinksGuidedJob, "isoforms_fpkm_cufflinks_guided", CufflinksGuided, true)
......@@ -198,6 +210,7 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
private lazy val geneFpkmCufflinksBlindJob =
makeMergeTableJob((s: Sample) => s.geneFpkmCufflinksBlind, ".genes_fpkm_cufflinks_blind", List(1, 7), 10, fallback = "0.0")
/** Heatmap job for gene FPKM Cufflinks, blind mode */
private lazy val geneFpkmCufflinksBlindHeatmapJob =
makeHeatmapJob(geneFpkmCufflinksBlindJob, "genes_fpkm_cufflinks_blind", CufflinksBlind)
......@@ -205,9 +218,11 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
private lazy val isoFpkmCufflinksBlindJob =
makeMergeTableJob((s: Sample) => s.isoformFpkmCufflinksBlind, ".isoforms_fpkm_cufflinks_blind", List(1, 7), 10, fallback = "0.0")
/** Heatmap job for isoform FPKM Cufflinks, blind mode */
private lazy val isoFpkmCufflinksBlindHeatmapJob =
makeHeatmapJob(isoFpkmCufflinksBlindJob, "isoforms_fpkm_cufflinks_blind", CufflinksBlind, true)
/** Container for merge table jobs */
private lazy val mergeTableJobs: Map[String, Option[MergeTables]] = Map(
"gene_fragments_count" -> geneFragmentsCountJob,
"exon_fragments_count" -> exonFragmentsCountJob,
......@@ -221,6 +236,7 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
"isoform_fpkm_cufflinks_blind" -> isoFpkmCufflinksBlindJob
)
/** Container for heatmap jobs */
private lazy val heatmapJobs: Map[String, Option[PlotHeatmap]] = Map(
"gene_fragments_count_heatmap" -> geneFragmentsCountHeatmapJob,
"exon_fragments_count_heatmap" -> exonFragmentsCountHeatmapJob,
......@@ -333,10 +349,12 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
require(ribosomalRefFlat.isDefined, "rRNA intervals must be supplied if removeRibosomalReads is set")
}
/** Pipeline run for each sample */
def biopetScript(): Unit = {
addSamplesJobs()
}
/** Pipeline run for multiple samples */
def addMultiSampleJobs(): Unit = {
// merge expression tables
mergeTableJobs.values.foreach { case maybeJob => maybeJob.foreach(add(_)) }
......@@ -352,8 +370,14 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
add(pdfReportJob)
}
/** Returns a [[Sample]] object */
def makeSample(sampleId: String): Sample = new Sample(sampleId)
/**
* Gentrap sample
*
* @param sampleId Unique identifier of the sample
*/
class Sample(sampleId: String) extends AbstractSample(sampleId) with CufflinksProducer {
/** Shortcut to qscript object */
......@@ -473,6 +497,8 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
job.inputAlignment = idSortingJob.get.output
job.output = createFile(".fragments_per_gene")
job.format = Option("bam")
// We are forcing the sort order to be ID-sorted, since HTSeq-count often chokes when using position-sorting due
// to its buffer not being large enough.
job.order = Option("name")
job.stranded = strandProtocol match {
case NonSpecific => Option("no")
......@@ -511,9 +537,11 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
}
}
/** Alignment file of reads from the plus strand, only defined when run is strand-specific */
def alnFilePlusStrand: Option[File] = alnPlusStrandJobs
.collect { case jobSet => jobSet.combineJob.output }
/** Jobs for generating reads from the plus strand, only defined when run is strand-specific */
private def alnPlusStrandJobs: Option[StrandSeparationJobSet] = strandProtocol match {
case Dutp =>
val r2Job = this.allPaired
......@@ -551,9 +579,11 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
case _ => throw new IllegalStateException
}
/** Alignment file of reads from the minus strand, only defined when run is strand-specific */
def alnFileMinusStrand: Option[File] = alnMinusStrandJobs
.collect { case jobSet => jobSet.combineJob.output }
/** Jobs for generating reads from the minus, only defined when run is strand-specific */
private def alnMinusStrandJobs: Option[StrandSeparationJobSet] = strandProtocol match {
case Dutp =>
val r1Job = this.allPaired
......@@ -668,6 +698,7 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
makeCollectRnaSeqMetricsJob(alnFileDirty, createFile(".rna_metrics"), Option(createFile(".coverage_bias.pdf")))
}
/** Job for removing ribosomal reads */
private def wipeJob: Option[WipeReads] = removeRibosomalReads
.option {
require(ribosomalRefFlat.isDefined)
......@@ -711,6 +742,7 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
def allSingle: Boolean = libraries.values.forall(!_.paired)
// TODO: add warnings or other messages for config values that are hard-coded by the pipeline
/** Adds all jobs for the sample */
def addJobs(): Unit = {
// TODO: this is our requirement since it's easier to calculate base counts when all libraries are either paired or single
require(allPaired || allSingle, s"Sample $sampleId contains only single-end or paired-end libraries")
......@@ -756,12 +788,14 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
varCallJob.foreach(add(_))
}
/** Add jobs for fragments per gene counting using HTSeq */
// We are forcing the sort order to be ID-sorted, since HTSeq-count often chokes when using position-sorting due
// to its buffer not being large enough.
/** Returns a [[Library]] object */
def makeLibrary(libId: String): Library = new Library(libId)
/**
* Gentrap library
*
* @param libId Unique identifier of the library
*/
class Library(libId: String) extends AbstractLibrary(libId) {
/** Summary stats of the library */
......@@ -798,6 +832,7 @@ class Gentrap(val root: Configurable) extends QScript with MultiSampleQScript wi
job
}
/** Adds all jobs for the library */
def addJobs(): Unit = {
// create per-library alignment file
addAll(mappingJob.functions)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment