GentrapTest.scala 8.14 KB
Newer Older
1
/**
bow's avatar
bow committed
2 3 4 5 6 7 8 9 10 11 12 13 14
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
15 16 17
 */
package nl.lumc.sasc.biopet.pipelines.gentrap

18
import java.io.{ FileOutputStream, File }
bow's avatar
bow committed
19 20

import com.google.common.io.Files
bow's avatar
bow committed
21
import nl.lumc.sasc.biopet.pipelines.gentrap.scripts.AggrBaseCount
bow's avatar
bow committed
22 23
import org.apache.commons.io.FileUtils
import org.broadinstitute.gatk.queue.QSettings
24 25
import org.scalatest.Matchers
import org.scalatest.testng.TestNGSuite
bow's avatar
bow committed
26
import org.testng.annotations.{ AfterClass, DataProvider, Test }
27 28

import nl.lumc.sasc.biopet.core.config.Config
bow's avatar
bow committed
29 30
import nl.lumc.sasc.biopet.extensions._
import nl.lumc.sasc.biopet.utils.ConfigUtils
31 32 33

class GentrapTest extends TestNGSuite with Matchers {

34 35 36 37
  import Gentrap._
  import Gentrap.ExpMeasures._
  import Gentrap.StrandProtocol._

bow's avatar
bow committed
38 39 40 41
  def initPipeline(map: Map[String, Any]): Gentrap = {
    new Gentrap() {
      override def configName = "gentrap"
      override def globalConfig = new Config(map)
42 43
      // disable dict file check since it is based on the reference file name (which we can't modify here since
      // we use the mock /usr/bin/test file
bow's avatar
bow committed
44 45 46
      qSettings = new QSettings
      qSettings.runName = "test"
    }
47 48
  }

49 50 51 52 53 54 55
  /** Convenience method for making library config */
  private def makeLibConfig(idx: Int, paired: Boolean = true) = {
    val files = Map("R1" -> "test_R1.fq")
    if (paired) (s"lib_$idx", files ++ Map("R2" -> "test_R2.fq"))
    else (s"lib_$idx", files)
  }

56 57 58 59 60 61 62
  /** Convenience type for sample config */
  private type SamplesConfig = Map[String, Map[String, Map[String, Map[String, Map[String, String]]]]]

  /** Convenience method for making a single sample config */
  private def makeSampleConfig(sampleIdx: Int, numLibs: Int, paired: Boolean) =
    (s"sample_$sampleIdx",
      Map("libraries" ->
bow's avatar
bow committed
63
        (1 to numLibs)
bow's avatar
bow committed
64 65
        .map(n => makeLibConfig(n, paired))
        .toMap
66 67 68
      )
    )

69 70 71
  /** Convenience method for making all samples config */
  private def makeSamplesConfig(numSamples: Int, numLibsEachSample: Int, pairMode: String): SamplesConfig =
    Map("samples" ->
bow's avatar
bow committed
72
      (1 to numSamples)
bow's avatar
bow committed
73 74 75
      // if paired == "mixed", alternate paired/not paired between each number
      .map(n => makeSampleConfig(n, numLibsEachSample, if (pairMode == "mixed") n % 2 == 0 else pairMode == "paired"))
      .toMap
76
    )
77

bow's avatar
bow committed
78 79 80 81
  private lazy val validExpressionMeasures = Set(
    "fragments_per_gene", "fragments_per_exon", "bases_per_gene", "bases_per_exon",
    "cufflinks_strict", "cufflinks_guided", "cufflinks_blind")

82 83 84 85 86
  @DataProvider(name = "expMeasuresstrandProtocol")
  def expMeasuresStrandProtocolProvider = {

    //val sampleConfigs = Array(pairedOneSampleOneLib, pairedOneSampleTwoLib, pairedOneSampleThreeLib)
    val sampleConfigs = for {
bow's avatar
bow committed
87 88 89 90 91
      (sampleNum, libNum) <- Seq(
        // check multiple libs for single run only ~ to trim down less-informative tests
        // need to check 2 and 3 samples since multi-sample plotting differs when sample is 1 or 2 and 3
        (1, 1), (1, 2), (2, 1), (3, 1)
      )
92 93
      libType <- Seq("paired", "single", "mixed")
    } yield makeSamplesConfig(sampleNum, libNum, libType)
bow's avatar
bow committed
94

95 96
    val strandProtocols = Array("non_specific", "dutp")
    // get all possible combinations of expression measures
bow's avatar
bow committed
97
    val expressionMeasures = validExpressionMeasures
98 99
      //.subsets
      //.map(_.toList)
bow's avatar
bow committed
100 101
      .toArray

102
    for {
103
      sampleConfig <- sampleConfigs.toArray
104 105
      expressionMeasure <- expressionMeasures
      strandProtocol <- strandProtocols
106
    } yield Array(sampleConfig, List(expressionMeasure), strandProtocol)
bow's avatar
bow committed
107 108
  }

109 110
  @Test(dataProvider = "expMeasuresstrandProtocol")
  def testGentrap(sampleConfig: SamplesConfig, expMeasures: List[String], strandProtocol: String) = {
111 112 113 114 115 116 117 118

    val settings = Map(
      "output_dir" -> GentrapTest.outputDir,
      "gsnap" -> Map("db" -> "test", "dir" -> "test"),
      "aligner" -> "gsnap",
      "expression_measures" -> expMeasures,
      "strand_protocol" -> strandProtocol
    )
119
    val config = ConfigUtils.mergeMaps(settings ++ sampleConfig, Map(GentrapTest.executables.toSeq: _*))
120
    val gentrap: Gentrap = initPipeline(config)
bow's avatar
bow committed
121 122

    gentrap.script()
123
    val functions = gentrap.functions.groupBy(_.getClass)
bow's avatar
bow committed
124
    val numSamples = sampleConfig("samples").size
bow's avatar
bow committed
125

126
    functions(classOf[Gsnap]).size should be >= 1
127

bow's avatar
bow committed
128 129 130
    if (expMeasures.contains("fragments_per_gene")) {
      gentrap.functions
        .collect { case x: HtseqCount => x.output.toString.endsWith(".fragments_per_gene") }.size shouldBe numSamples
131
    }
bow's avatar
bow committed
132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160

    if (expMeasures.contains("fragments_per_exon")) {
      gentrap.functions
        .collect { case x: HtseqCount => x.output.toString.endsWith(".fragments_per_exon") }.size shouldBe numSamples
    }

    if (expMeasures.contains("bases_per_gene")) {
      gentrap.functions
        .collect { case x: AggrBaseCount => x.output.toString.endsWith(".bases_per_gene") }.size shouldBe numSamples
    }

    if (expMeasures.contains("bases_per_exon")) {
      gentrap.functions
        .collect { case x: AggrBaseCount => x.output.toString.endsWith(".bases_per_exon") }.size shouldBe numSamples
    }

    if (expMeasures.contains("cufflinks_strict")) {
      gentrap.functions
        .collect {
          case x: Cufflinks => x.outputGenesFpkm.getParentFile.toString.endsWith("cufflinks_strict")
          case x: Ln => x.output.toString.endsWith(".genes_fpkm_cufflinks_strict") ||
            x.output.toString.endsWith(".isoforms_fpkm_cufflinks_strict")
        }
        .count(identity) shouldBe numSamples * 3 // three types of jobs per sample
    }

    if (expMeasures.contains("cufflinks_guided")) {
      gentrap.functions
        .collect {
161 162 163 164
          case x: Cufflinks => x.outputGenesFpkm.getParentFile.toString.endsWith("cufflinks_guided")
          case x: Ln => x.output.toString.endsWith(".genes_fpkm_cufflinks_guided") ||
            x.output.toString.endsWith(".isoforms_fpkm_cufflinks_guided")
        }
bow's avatar
bow committed
165 166 167 168 169 170
        .count(identity) shouldBe numSamples * 3 // three types of jobs per sample
    }

    if (expMeasures.contains("cufflinks_blind")) {
      gentrap.functions
        .collect {
171 172 173 174
          case x: Cufflinks => x.outputGenesFpkm.getParentFile.toString.endsWith("cufflinks_blind")
          case x: Ln => x.output.toString.endsWith(".genes_fpkm_cufflinks_blind") ||
            x.output.toString.endsWith(".isoforms_fpkm_cufflinks_blind")
        }
bow's avatar
bow committed
175 176
        .count(identity) shouldBe numSamples * 3 // three types of jobs per sample
    }
bow's avatar
bow committed
177 178 179 180 181
  }

  // remove temporary run directory all tests in the class have been run
  @AfterClass def removeTempOutputDir() = {
    FileUtils.deleteDirectory(GentrapTest.outputDir)
182 183
  }
}
bow's avatar
bow committed
184 185 186 187

object GentrapTest {
  val outputDir = Files.createTempDir()

188 189 190 191 192 193 194 195 196 197 198
  private def copyFile(name: String): Unit = {
    val is = getClass.getResourceAsStream("/" + name)
    val os = new FileOutputStream(new File(outputDir, name))
    org.apache.commons.io.IOUtils.copy(is, os)
    os.close()
  }

  copyFile("ref.fa")
  copyFile("ref.dict")
  copyFile("ref.fa.fai")

bow's avatar
bow committed
199
  val executables = Map(
200
    "reference_fasta" -> (outputDir + File.separator + "ref.fa"),
201
    "dict" -> "test",
Peter van 't Hof's avatar
Peter van 't Hof committed
202
    "refFlat" -> "test",
bow's avatar
bow committed
203 204 205
    "annotation_gtf" -> "test",
    "annotation_bed" -> "test",
    "annotation_refflat" -> "test",
206
    "varscan_jar" -> "test"
bow's avatar
bow committed
207 208 209 210
  ) ++ Seq(
      // fastqc executables
      "fastqc", "seqtk", "sickle", "cutadapt",
      // mapping executables
bow's avatar
bow committed
211
      "star", "bowtie", "samtools", "gsnap", "tophat",
bow's avatar
bow committed
212
      // gentrap executables
213
      "cufflinks", "htseqcount", "grep", "pdflatex", "rscript", "tabix", "bgzip", "bedtoolscoverage", "md5sum",
214 215
      // bam2wig executables
      "igvtools", "wigtobigwig"
bow's avatar
bow committed
216 217
    ).map { case exe => exe -> Map("exe" -> "test") }.toMap
}