Reference.scala 8.43 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1
2
3
4
5
6
7
8
9
10
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
11
 * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
Peter van 't Hof's avatar
Peter van 't Hof committed
12
13
14
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
15
16
17
18
package nl.lumc.sasc.biopet.core

import java.io.File

Peter van 't Hof's avatar
Peter van 't Hof committed
19
import htsjdk.samtools.reference.IndexedFastaSequenceFile
Peter van 't Hof's avatar
Peter van 't Hof committed
20
import nl.lumc.sasc.biopet.core.summary.{ Summarizable, SummaryQScript }
21
import nl.lumc.sasc.biopet.utils._
Peter van 't Hof's avatar
Peter van 't Hof committed
22
import nl.lumc.sasc.biopet.utils.config.{ Config, Configurable }
Peter van 't Hof's avatar
Peter van 't Hof committed
23

24
25
26
import scala.collection.JavaConversions._

/**
27
28
29
 * This trait is used for pipelines and extension that use a reference based on one fasta file.
 * The fasta file can contain multiple contigs.
 *
30
31
32
33
 * Created by pjvan_thof on 4/6/15.
 */
trait Reference extends Configurable {

34
  /** Returns species, default to unknown_species */
Peter van 't Hof's avatar
Peter van 't Hof committed
35
  def referenceSpecies: String = {
36
    root match {
37
38
      case r: Reference if r.referenceSpecies != "unknown_species" => r.referenceSpecies
      case _ => config("species", default = "unknown_species", path = super.configPath)
39
40
41
    }
  }

42
  /** Return referencename, default to unknown_ref */
Peter van 't Hof's avatar
Peter van 't Hof committed
43
  def referenceName: String = {
44
    root match {
45
      case r: Reference if r.referenceName != "unknown_ref" => r.referenceName
46
      case _ =>
47
48
        val default: String = config("default", default = "unknown_ref", path = List("references", referenceSpecies))
        config("reference_name", default = default, path = super.configPath)
49
50
51
    }
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
52
53
  def referenceDict = FastaUtils.getCachedDict(referenceFasta())

54
  /** All config values will get a prefix */
Peter van 't Hof's avatar
Peter van 't Hof committed
55
56
57
  override def subPath = {
    referenceConfigPath ::: super.subPath
  }
58

Peter van 't Hof's avatar
Peter van 't Hof committed
59
  lazy val geneAnnotationVersion: Option[String] = config("gene_annotation_name")
Peter van 't Hof's avatar
Peter van 't Hof committed
60
61
62
  lazy val geneAnnotationSubPath = geneAnnotationVersion.map(List("gene_annotations", _)).getOrElse(Nil)
  lazy val dbsnpVersion: Option[String] = config("dbsnp_version")
  lazy val dbsnpSubPath: List[String] = dbsnpVersion.map(List("dbsnp_annotations", _)).getOrElse(Nil)
Peter van 't Hof's avatar
Peter van 't Hof committed
63
64
  def dbsnpVcfFile: Option[File] = config("dbsnp_vcf", extraSubPath = dbsnpSubPath)

65
  /** Returns the reference config path */
Peter van 't Hof's avatar
Peter van 't Hof committed
66
67
68
  def referenceConfigPath = {
    List("references", referenceSpecies, referenceName)
  }
69

70
  /** When set override this on true the pipeline with raise an exception when fai index is not found */
71
  def faiRequired = false
72

73
  /** When set override this on true the pipeline with raise an exception when dict index is not found */
74
75
76
  def dictRequired = this.isInstanceOf[Summarizable] || this.isInstanceOf[SummaryQScript]

  /** Returns the dict file belonging to the fasta file */
Peter van 't Hof's avatar
Peter van 't Hof committed
77
  def referenceDictFile = new File(referenceFasta().getAbsolutePath
78
79
80
81
82
83
    .stripSuffix(".fa")
    .stripSuffix(".fasta")
    .stripSuffix(".fna") + ".dict")

  /** Returns the fai file belonging to the fasta file */
  def referenceFai = new File(referenceFasta().getAbsolutePath + ".fai")
84

85
  /** Returns the fasta file */
86
  def referenceFasta(): File = {
87
    val file: File = config("reference_fasta")
88
89
    if (config.contains("reference_fasta")) checkFasta(file)
    else {
90
      val defaults = ConfigUtils.mergeMaps(this.defaults, this.internalDefaults)
Peter van 't Hof's avatar
Peter van 't Hof committed
91

92
      def getReferences(map: Map[String, Any]): Set[(String, String)] = (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
93
94
        (species, species_content) <- map.getOrElse("references", Map[String, Any]()).asInstanceOf[Map[String, Any]].toList;
        (reference_name, _) <- species_content.asInstanceOf[Map[String, Any]].toList
95
      ) yield (species, reference_name)).toSet
Peter van 't Hof's avatar
Peter van 't Hof committed
96

97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
      val references = getReferences(defaults) ++ getReferences(Config.global.map)
      if (!references.contains((referenceSpecies, referenceName))) {
        val buffer = new StringBuilder()
        if (references.exists(_._1 == referenceSpecies)) {
          buffer.append(s"Reference: '$referenceName' does not exist in config for species: '$referenceSpecies'")
          buffer.append(s"\nRefrences found for species '$referenceSpecies':")
          references.filter(_._1 == referenceSpecies).foreach(x => buffer.append("\n - " + x._2))
        } else {
          buffer.append(s"Species: '$referenceSpecies' does not exist in config")
          if (references.nonEmpty) buffer.append("\n    References available in config (species -> reference_name):")
          else buffer.append("\n    No references found in user or global config")
          references.toList.sorted.foreach(x => buffer.append(s"\n     - ${x._1} -> ${x._2}"))
        }
        Logging.addError(buffer.toString)
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
112
    }
113
114
115
    file
  }

116
  /** Create summary part for reference */
117
  def referenceSummary: Map[String, Any] = {
118
    Reference.requireDict(referenceFasta())
Peter van 't Hof's avatar
Peter van 't Hof committed
119
    Map("contigs" ->
Peter van 't Hof's avatar
Peter van 't Hof committed
120
      (for (seq <- referenceDict.getSequences) yield seq.getSequenceName -> {
Peter van 't Hof's avatar
Peter van 't Hof committed
121
122
123
124
125
        val md5 = Option(seq.getAttribute("M5"))
        Map("md5" -> md5, "length" -> seq.getSequenceLength)
      }).toMap,
      "species" -> referenceSpecies,
      "name" -> referenceName
126
127
128
    )
  }

129
  //TODO: this become obsolete when index get auto generated
130
131
132

  /** Check fasta file if file exist and index file are there */
  def checkFasta(file: File): Unit = {
133
    if (!Reference.checked.contains(file)) {
134
      if (!file.exists()) Logging.addError(s"Reference not found: $file, species: $referenceSpecies, name: $referenceName, configValue: " + config("reference_fasta"))
135
      Reference.checked += file
136
    }
137
138
139

    if (dictRequired) Reference.requireDict(file)
    if (faiRequired) Reference.requireFai(file)
140
141
  }
}
142
143
144
145
146

object Reference {

  /** Used as cache to avoid double checking */
  private var checked: Set[File] = Set()
147
148
149

  /**
   * Raise an exception when given fasta file has no fai file
Peter van 't Hof's avatar
Peter van 't Hof committed
150
151
   *
   * @param fastaFile Fasta file
152
153
154
   */
  def requireFai(fastaFile: File): Unit = {
    val fai = new File(fastaFile.getAbsolutePath + ".fai")
155
156
157
158
159
160
161
    if (!checked.contains(fai)) {
      checked += fai
      if (fai.exists()) {
        if (!IndexedFastaSequenceFile.canCreateIndexedFastaReader(fastaFile))
          Logging.addError(s"Index of reference cannot be loaded, reference: $fastaFile")
      } else Logging.addError("Reference is missing a fai file")
    }
162
163
164
165
  }

  /**
   * Raise an exception when given fasta file has no dict file
Peter van 't Hof's avatar
Peter van 't Hof committed
166
167
   *
   * @param fastaFile Fasta file
168
169
   */
  def requireDict(fastaFile: File): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
170
171
172
173
    val dict = new File(fastaFile.getAbsolutePath
      .stripSuffix(".fna")
      .stripSuffix(".fa")
      .stripSuffix(".fasta") + ".dict")
174
175
176
177
    if (!checked.contains(dict)) {
      checked += dict
      if (!dict.exists()) Logging.addError("Reference is missing a dict file")
    }
178
  }
179
180

  def askReference: Map[String, Any] = {
Peter van 't Hof's avatar
Peter van 't Hof committed
181
    val warn = "If you use a non-standard reference, please make sure that you have generated all required indexes for this reference"
Peter van 't Hof's avatar
Peter van 't Hof committed
182
    val globalSpecies = Config.global.defaults.getOrElse("references", Map()).asInstanceOf[Map[String, Any]]
Peter van 't Hof's avatar
Peter van 't Hof committed
183
    val species = Question.string("species",
184
185
      description = Some(if (globalSpecies.nonEmpty)
        s"""Species found in general config:
Peter van 't Hof's avatar
Peter van 't Hof committed
186
           |- ${globalSpecies.keys.toList.sorted.mkString("\n- ")}
Peter van 't Hof's avatar
Peter van 't Hof committed
187
           |$warn
Peter van 't Hof's avatar
Peter van 't Hof committed
188
189
           |""".stripMargin
      else s"No references found in global config. $warn"))
190

Peter van 't Hof's avatar
Peter van 't Hof committed
191
    val globalReferences = globalSpecies.getOrElse(species, Map()).asInstanceOf[Map[String, Any]]
Peter van 't Hof's avatar
Peter van 't Hof committed
192
    val referenceName = Question.string("reference_name",
193
194
      description = Some(if (globalReferences.nonEmpty)
        s"""Reference for $species found in general config:
Peter van 't Hof's avatar
Peter van 't Hof committed
195
            |- ${globalReferences.keys.toList.sorted.mkString("\n- ")}
Peter van 't Hof's avatar
Peter van 't Hof committed
196
            |$warn
Peter van 't Hof's avatar
Peter van 't Hof committed
197
198
            |""".stripMargin
      else s"No references found in global config. $warn"))
199

Peter van 't Hof's avatar
Peter van 't Hof committed
200
    val reference = globalReferences.getOrElse(referenceName, Map()).asInstanceOf[Map[String, Any]]
201
    val referenceFasta: Option[String] = if (reference.contains("reference_fasta")) None else {
Peter van 't Hof's avatar
Peter van 't Hof committed
202
      Some(Question.string("Reference Fasta", validation = List(TemplateTool.isAbsolutePath, TemplateTool.mustExist),
203
204
205
206
207
        description = Some(s"No fasta file found for $species -> $referenceName")))
    }

    Map("species" -> species, "reference_name" -> referenceName) ++ referenceFasta.map("reference_fasta" -> _)
  }
208
}