Reference.scala 4.61 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
16
17
18
19
20
21
package nl.lumc.sasc.biopet.core

import java.io.File

import htsjdk.samtools.reference.IndexedFastaSequenceFile
import nl.lumc.sasc.biopet.core.config.Configurable
Peter van 't Hof's avatar
Peter van 't Hof committed
22

23
24
25
import scala.collection.JavaConversions._

/**
26
27
28
 * This trait is used for pipelines and extension that use a reference based on one fasta file.
 * The fasta file can contain multiple contigs.
 *
29
30
31
32
 * Created by pjvan_thof on 4/6/15.
 */
trait Reference extends Configurable {

33
  /** Returns species, default to unknown_species */
Peter van 't Hof's avatar
Peter van 't Hof committed
34
  def referenceSpecies: String = {
35
    root match {
36
37
      case r: Reference if r.referenceSpecies != "unknown_species" => r.referenceSpecies
      case _ => config("species", default = "unknown_species", path = super.configPath)
38
39
40
    }
  }

41
  /** Return referencename, default to unknown_ref */
Peter van 't Hof's avatar
Peter van 't Hof committed
42
  def referenceName: String = {
43
    root match {
44
      case r: Reference if r.referenceName != "unknown_ref" => r.referenceName
45
      case _ =>
46
47
        val default: String = config("default", default = "unknown_ref", path = List("references", referenceSpecies))
        config("reference_name", default = default, path = super.configPath)
48
49
50
    }
  }

51
  /** All config values will get a prefix */
Peter van 't Hof's avatar
Peter van 't Hof committed
52
53
54
  override def subPath = {
    referenceConfigPath ::: super.subPath
  }
55
56

  /** Returns the reference config path */
Peter van 't Hof's avatar
Peter van 't Hof committed
57
58
59
  def referenceConfigPath = {
    List("references", referenceSpecies, referenceName)
  }
60

61
  /** When set override this on true the pipeline with raise an exception when fai index is not found */
62
63
  protected def faiRequired = false

64
  /** When set override this on true the pipeline with raise an exception when dict index is not found */
65
66
  protected def dictRequired = false

67
  /** Returns the fasta file */
68
  def referenceFasta(): File = {
69
    val file: File = config("reference_fasta")
70
    checkFasta(file)
Peter van 't Hof's avatar
Peter van 't Hof committed
71
72
73
74
75
76
77
78
79

    val dict = new File(file.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta") + ".dict")
    val fai = new File(file.getAbsolutePath + ".fai")

    this match {
      case c: BiopetCommandLineFunctionTrait => c.deps :::= dict :: fai :: Nil
      case _                                 =>
    }

80
81
82
    file
  }

83
  /** Create summary part for reference */
84
  def referenceSummary: Map[String, Any] = {
85
    Reference.requireDict(referenceFasta())
86
    val file = new IndexedFastaSequenceFile(referenceFasta())
Peter van 't Hof's avatar
Peter van 't Hof committed
87
88
89
90
91
92
93
    Map("contigs" ->
      (for (seq <- file.getSequenceDictionary.getSequences) yield seq.getSequenceName -> {
        val md5 = Option(seq.getAttribute("M5"))
        Map("md5" -> md5, "length" -> seq.getSequenceLength)
      }).toMap,
      "species" -> referenceSpecies,
      "name" -> referenceName
94
95
96
97
98
99
100
    )
  }

  //TODO: this become obsolete when index get autogenerated

  /** Check fasta file if file exist and index file are there */
  def checkFasta(file: File): Unit = {
101
    if (!Reference.checked.contains(file)) {
102
103
      require(file.exists(), "Reference not found: " + file)

104
105
      if (dictRequired) Reference.requireDict(file)
      if (faiRequired) Reference.requireFai(file)
106

107
      Reference.checked += file
108
109
110
    }
  }
}
111
112
113
114
115

object Reference {

  /** Used as cache to avoid double checking */
  private var checked: Set[File] = Set()
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137

  /**
   * Raise an exception when given fasta file has no fai file
   * @param fastaFile Fasta file
   * @throws IllegalArgumentException
   */
  def requireFai(fastaFile: File): Unit = {
    val fai = new File(fastaFile.getAbsolutePath + ".fai")
    require(fai.exists(), "Reference is missing a fai file")
    require(IndexedFastaSequenceFile.canCreateIndexedFastaReader(fastaFile),
      "Index of reference cannot be loaded, reference: " + fastaFile)
  }

  /**
   * Raise an exception when given fasta file has no dict file
   * @param fastaFile Fasta file
   * @throws IllegalArgumentException
   */
  def requireDict(fastaFile: File): Unit = {
    val dict = new File(fastaFile.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta") + ".dict")
    require(dict.exists(), "Reference is missing a dict file")
  }
138
}