Reference.scala 6.5 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1
2
3
4
5
6
7
8
9
10
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
11
 * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
Peter van 't Hof's avatar
Peter van 't Hof committed
12
13
14
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
15
16
17
18
package nl.lumc.sasc.biopet.core

import java.io.File

Peter van 't Hof's avatar
Peter van 't Hof committed
19
import htsjdk.samtools.reference.IndexedFastaSequenceFile
Peter van 't Hof's avatar
Peter van 't Hof committed
20
import nl.lumc.sasc.biopet.core.summary.{ Summarizable, SummaryQScript }
21
import nl.lumc.sasc.biopet.utils.config.{ Config, Configurable }
Peter van 't Hof's avatar
Peter van 't Hof committed
22
import nl.lumc.sasc.biopet.utils.{ ConfigUtils, FastaUtils, Logging }
Peter van 't Hof's avatar
Peter van 't Hof committed
23

24
25
26
import scala.collection.JavaConversions._

/**
27
28
29
 * This trait is used for pipelines and extension that use a reference based on one fasta file.
 * The fasta file can contain multiple contigs.
 *
30
31
32
33
 * Created by pjvan_thof on 4/6/15.
 */
trait Reference extends Configurable {

34
  /** Returns species, default to unknown_species */
Peter van 't Hof's avatar
Peter van 't Hof committed
35
  def referenceSpecies: String = {
36
    root match {
37
38
      case r: Reference if r.referenceSpecies != "unknown_species" => r.referenceSpecies
      case _ => config("species", default = "unknown_species", path = super.configPath)
39
40
41
    }
  }

42
  /** Return referencename, default to unknown_ref */
Peter van 't Hof's avatar
Peter van 't Hof committed
43
  def referenceName: String = {
44
    root match {
45
      case r: Reference if r.referenceName != "unknown_ref" => r.referenceName
46
      case _ =>
47
48
        val default: String = config("default", default = "unknown_ref", path = List("references", referenceSpecies))
        config("reference_name", default = default, path = super.configPath)
49
50
51
    }
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
52
53
  def referenceDict = FastaUtils.getCachedDict(referenceFasta())

54
  /** All config values will get a prefix */
Peter van 't Hof's avatar
Peter van 't Hof committed
55
56
57
  override def subPath = {
    referenceConfigPath ::: super.subPath
  }
58
59

  /** Returns the reference config path */
Peter van 't Hof's avatar
Peter van 't Hof committed
60
61
62
  def referenceConfigPath = {
    List("references", referenceSpecies, referenceName)
  }
63

64
  /** When set override this on true the pipeline with raise an exception when fai index is not found */
65
  def faiRequired = false
66

67
  /** When set override this on true the pipeline with raise an exception when dict index is not found */
68
69
70
  def dictRequired = this.isInstanceOf[Summarizable] || this.isInstanceOf[SummaryQScript]

  /** Returns the dict file belonging to the fasta file */
Peter van 't Hof's avatar
Peter van 't Hof committed
71
  def referenceDictFile = new File(referenceFasta().getAbsolutePath
72
73
74
75
76
77
    .stripSuffix(".fa")
    .stripSuffix(".fasta")
    .stripSuffix(".fna") + ".dict")

  /** Returns the fai file belonging to the fasta file */
  def referenceFai = new File(referenceFasta().getAbsolutePath + ".fai")
78

79
  /** Returns the fasta file */
80
  def referenceFasta(): File = {
81
    val file: File = config("reference_fasta")
82
83
    if (config.contains("reference_fasta")) checkFasta(file)
    else {
84
      val defaults = ConfigUtils.mergeMaps(this.defaults, this.internalDefaults)
Peter van 't Hof's avatar
Peter van 't Hof committed
85

86
      def getReferences(map: Map[String, Any]): Set[(String, String)] = (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
87
88
        (species, species_content) <- map.getOrElse("references", Map[String, Any]()).asInstanceOf[Map[String, Any]].toList;
        (reference_name, _) <- species_content.asInstanceOf[Map[String, Any]].toList
89
      ) yield (species, reference_name)).toSet
Peter van 't Hof's avatar
Peter van 't Hof committed
90

91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
      val references = getReferences(defaults) ++ getReferences(Config.global.map)
      if (!references.contains((referenceSpecies, referenceName))) {
        val buffer = new StringBuilder()
        if (references.exists(_._1 == referenceSpecies)) {
          buffer.append(s"Reference: '$referenceName' does not exist in config for species: '$referenceSpecies'")
          buffer.append(s"\nRefrences found for species '$referenceSpecies':")
          references.filter(_._1 == referenceSpecies).foreach(x => buffer.append("\n - " + x._2))
        } else {
          buffer.append(s"Species: '$referenceSpecies' does not exist in config")
          if (references.nonEmpty) buffer.append("\n    References available in config (species -> reference_name):")
          else buffer.append("\n    No references found in user or global config")
          references.toList.sorted.foreach(x => buffer.append(s"\n     - ${x._1} -> ${x._2}"))
        }
        Logging.addError(buffer.toString)
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
106
    }
107
108
109
    file
  }

110
  /** Create summary part for reference */
111
  def referenceSummary: Map[String, Any] = {
112
    Reference.requireDict(referenceFasta())
Peter van 't Hof's avatar
Peter van 't Hof committed
113
    Map("contigs" ->
Peter van 't Hof's avatar
Peter van 't Hof committed
114
      (for (seq <- referenceDict.getSequences) yield seq.getSequenceName -> {
Peter van 't Hof's avatar
Peter van 't Hof committed
115
116
117
118
119
        val md5 = Option(seq.getAttribute("M5"))
        Map("md5" -> md5, "length" -> seq.getSequenceLength)
      }).toMap,
      "species" -> referenceSpecies,
      "name" -> referenceName
120
121
122
    )
  }

123
  //TODO: this become obsolete when index get auto generated
124
125
126

  /** Check fasta file if file exist and index file are there */
  def checkFasta(file: File): Unit = {
127
    if (!Reference.checked.contains(file)) {
128
      if (!file.exists()) Logging.addError(s"Reference not found: $file, species: $referenceSpecies, name: $referenceName, configValue: " + config("reference_fasta"))
129
      Reference.checked += file
130
    }
131
132
133

    if (dictRequired) Reference.requireDict(file)
    if (faiRequired) Reference.requireFai(file)
134
135
  }
}
136
137
138
139
140

object Reference {

  /** Used as cache to avoid double checking */
  private var checked: Set[File] = Set()
141
142
143

  /**
   * Raise an exception when given fasta file has no fai file
Peter van 't Hof's avatar
Peter van 't Hof committed
144
145
   *
   * @param fastaFile Fasta file
146
147
148
   */
  def requireFai(fastaFile: File): Unit = {
    val fai = new File(fastaFile.getAbsolutePath + ".fai")
149
150
151
152
153
154
155
    if (!checked.contains(fai)) {
      checked += fai
      if (fai.exists()) {
        if (!IndexedFastaSequenceFile.canCreateIndexedFastaReader(fastaFile))
          Logging.addError(s"Index of reference cannot be loaded, reference: $fastaFile")
      } else Logging.addError("Reference is missing a fai file")
    }
156
157
158
159
  }

  /**
   * Raise an exception when given fasta file has no dict file
Peter van 't Hof's avatar
Peter van 't Hof committed
160
161
   *
   * @param fastaFile Fasta file
162
163
   */
  def requireDict(fastaFile: File): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
164
165
166
167
    val dict = new File(fastaFile.getAbsolutePath
      .stripSuffix(".fna")
      .stripSuffix(".fa")
      .stripSuffix(".fasta") + ".dict")
168
169
170
171
    if (!checked.contains(dict)) {
      checked += dict
      if (!dict.exists()) Logging.addError("Reference is missing a dict file")
    }
172
  }
173
}