GearsQiimeClosed.scala 5.46 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1 2 3 4 5 6 7 8 9 10
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
11
 * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
Peter van 't Hof's avatar
Peter van 't Hof committed
12 13 14
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
15 16
package nl.lumc.sasc.biopet.pipelines.gears

Peter van 't Hof's avatar
Peter van 't Hof committed
17
import java.io.{ File, PrintWriter }
Peter van 't Hof's avatar
Peter van 't Hof committed
18

19 20
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
import nl.lumc.sasc.biopet.core.SampleLibraryTag
21
import nl.lumc.sasc.biopet.extensions.qiime._
22
import nl.lumc.sasc.biopet.extensions.seqtk.SeqtkSample
Peter van 't Hof's avatar
Peter van 't Hof committed
23
import nl.lumc.sasc.biopet.utils.ConfigUtils
24 25 26
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.queue.QScript

Peter van 't Hof's avatar
Peter van 't Hof committed
27 28
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
Peter van 't Hof's avatar
Peter van 't Hof committed
29
import scala.xml.{ Elem, PrettyPrinter }
Peter van 't Hof's avatar
Peter van 't Hof committed
30

31 32 33
/**
 * Created by pjvan_thof on 12/4/15.
 */
34
class GearsQiimeClosed(val root: Configurable) extends QScript with SummaryQScript with SampleLibraryTag {
35

Peter van 't Hof's avatar
Peter van 't Hof committed
36
  var fastqInput: File = _
37 38 39 40 41 42 43 44

  override def defaults = Map(
    "splitlibrariesfastq" -> Map(
      "barcode_type" -> "not-barcoded"
    )
  )

  def init() = {
Peter van 't Hof's avatar
Peter van 't Hof committed
45
    require(fastqInput != null)
Peter van 't Hof's avatar
Peter van 't Hof committed
46
    require(sampleId.isDefined)
47 48
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
49 50 51 52 53 54
  private var _otuMap: File = _
  def otuMap = _otuMap

  private var _otuTable: File = _
  def otuTable = _otuTable

55 56 57
  def biopetScript() = {

    val splitLib = new SplitLibrariesFastq(this)
58
    splitLib.input :+= fastqInput
59
    splitLib.outputDir = new File(outputDir, "split_libraries_fastq")
Peter van 't Hof's avatar
Peter van 't Hof committed
60
    sampleId.foreach(splitLib.sampleIds :+= _.replaceAll("_", "-"))
Peter van 't Hof's avatar
Peter van 't Hof committed
61
    splitLib.isIntermediate = true
62 63 64
    add(splitLib)

    val closedReference = new PickClosedReferenceOtus(this)
Peter van 't Hof's avatar
Peter van 't Hof committed
65
    closedReference.inputFasta = addDownsample(splitLib.outputSeqs, new File(splitLib.outputDir, s"${sampleId.get}.downsample.fna"))
66 67
    closedReference.outputDir = new File(outputDir, "pick_closed_reference_otus")
    add(closedReference)
Peter van 't Hof's avatar
Peter van 't Hof committed
68 69
    _otuMap = closedReference.otuMap
    _otuTable = closedReference.otuTable
Peter van 't Hof's avatar
Peter van 't Hof committed
70 71

    addSummaryJobs()
72
  }
73 74 75 76 77

  /** Must return a map with used settings for this pipeline */
  def summarySettings: Map[String, Any] = Map()

  /** File to put in the summary for thie pipeline */
Peter van 't Hof's avatar
Peter van 't Hof committed
78
  def summaryFiles: Map[String, File] = Map("otu_table" -> otuTable, "otu_map" -> otuMap)
79 80 81

  /** Name of summary output file */
  def summaryFile: File = new File(outputDir, "summary.closed_reference.json")
82 83 84

  val downSample: Option[Double] = config("downsample")

85
  def addDownsample(input: File, output: File): File = {
86 87 88 89 90
    downSample match {
      case Some(x) =>
        val seqtk = new SeqtkSample(this)
        seqtk.input = input
        seqtk.sample = x
91 92
        seqtk.output = output
        add(seqtk)
93 94 95 96
        output
      case _ => input
    }
  }
97
}
Peter van 't Hof's avatar
Peter van 't Hof committed
98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130

object GearsQiimeClosed {
  def qiimeBiomToKrona(inputFile: File, outputFile: File): Unit = {
    val biom = ConfigUtils.fileToConfigMap(inputFile)

    val samples = biom("columns").asInstanceOf[List[Map[String, Any]]].toArray.map(_("id"))

    val sortedSamples = samples.toList.map(_.toString).sorted

    case class TaxNode(name: String, level: String) {
      val childs: ListBuffer[TaxNode] = ListBuffer()

      val counts: mutable.Map[String, Long] = mutable.Map()
      def totalCount(sample: String): Long = counts.getOrElse(sample, 0L) + childs.map(_.totalCount(sample)).sum

      def node: Elem = {
        val sizes = sortedSamples.map { sample => <val>{ totalCount(sample) }</val> }
        val size = <size>{ sizes }</size>

        val node = <node name={ name }>{ size }</node>

        node.copy(child = node.child ++ childs.map(_.node))
      }
    }

    val root = TaxNode("root", "-")

    val taxs = biom("rows").asInstanceOf[List[Map[String, Any]]].toArray.map { row =>
      val taxonomy = row("metadata").asInstanceOf[Map[String, Any]]("taxonomy")
        .asInstanceOf[List[String]].filter(!_.endsWith("__"))
      taxonomy.foldLeft(root) { (a, b) =>
        val n = b.split("__", 2)
        val level = n(0)
Peter van 't Hof's avatar
Peter van 't Hof committed
131
        val name = if (level == "Unassigned") "Unassigned" else n(1)
132
        a.childs.find(_ == TaxNode(name, level)) match {
Peter van 't Hof's avatar
Peter van 't Hof committed
133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
          case Some(node) => node
          case _ =>
            val node = TaxNode(name, level)
            a.childs += node
            node
        }
      }
    }

    biom("data").asInstanceOf[List[List[Any]]].map { data =>
      val row = data(0).asInstanceOf[Long]
      val column = data(1).asInstanceOf[Long]
      val value = data(2).asInstanceOf[Long]
      val sample = samples(column.toInt).toString
      taxs(row.toInt).counts += sample -> (value + taxs(row.toInt).counts.getOrElse(sample, 0L))
      value
    }.sum

    val xml = <krona>
                <attributes magnitude="size">
                  <attribute display="size">size</attribute>
                </attributes>
                <datasets>
                  { sortedSamples.map { sample => <dataset>{ sample }</dataset> } }
                </datasets>
              </krona>

    val writer = new PrintWriter(outputFile)
    val prettyXml = new PrettyPrinter(80, 2)
    writer.println(prettyXml.format(xml.copy(child = xml.child :+ root.node)))
    writer.close()
  }
}