FastaUtils.scala 3.22 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1
/**
2 3 4 5 6 7 8 9 10 11 12 13 14
  * Biopet is built on top of GATK Queue for building bioinformatic
  * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
  * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
  * should also be able to execute Biopet tools and pipelines.
  *
  * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
  *
  * Contact us at: sasc@lumc.nl
  *
  * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
  * license; For commercial users or users who do not want to follow the AGPL
  * license, please contact us to obtain a separate license.
  */
Peter van 't Hof's avatar
Peter van 't Hof committed
15 16 17 18 19
package nl.lumc.sasc.biopet.utils

import java.io.File

import htsjdk.samtools.SAMSequenceDictionary
Peter van 't Hof's avatar
Peter van 't Hof committed
20
import htsjdk.samtools.reference.{FastaSequenceFile, IndexedFastaSequenceFile}
Peter van 't Hof's avatar
Peter van 't Hof committed
21

22 23
import scala.io.Source

Peter van 't Hof's avatar
Peter van 't Hof committed
24
/**
25 26
  * Created by pjvan_thof on 25-10-16.
  */
Peter van 't Hof's avatar
Peter van 't Hof committed
27
object FastaUtils {
28

Peter van 't Hof's avatar
Peter van 't Hof committed
29
  /**
30 31 32 33 34
    * This method will get a dict from the fasta file. This will not use the cache
    *
    * @param fastaFile Fasta file
    * @return sequence dict
    */
Peter van 't Hof's avatar
Peter van 't Hof committed
35 36 37 38 39 40 41 42 43 44 45
  def getDictFromFasta(fastaFile: File): SAMSequenceDictionary = {
    val referenceFile = new FastaSequenceFile(fastaFile, true)
    val dict = referenceFile.getSequenceDictionary
    referenceFile.close()
    dictCache += fastaFile -> dict
    dict
  }

  private var dictCache: Map[File, SAMSequenceDictionary] = Map()

  /** This will clear the dict cache */
46
  def clearCache(): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
47 48
    dictCache = Map()
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
49 50

  /**
51 52 53 54 55
    * This method will get a dict from the fasta file. If it's already in the cache file will not opened again.
    *
    * @param fastaFile Fasta file
    * @return sequence dict
    */
Peter van 't Hof's avatar
Peter van 't Hof committed
56
  def getCachedDict(fastaFile: File): SAMSequenceDictionary = {
Peter van 't Hof's avatar
Peter van 't Hof committed
57 58
    if (!dictCache.contains(fastaFile)) getDictFromFasta(fastaFile)
    else dictCache(fastaFile)
Peter van 't Hof's avatar
Peter van 't Hof committed
59
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
60 61 62 63 64 65 66

  /** This method returns the fraction of GC for a given region */
  def getSequenceGc(fastaFile: File, contig: String, start: Long, end: Long): Double = {
    getSequenceGc(new IndexedFastaSequenceFile(fastaFile), contig, start, end)
  }

  /** This method returns the fraction of GC for a given region */
Peter van 't Hof's avatar
WIP  
Peter van 't Hof committed
67 68 69 70
  def getSequenceGc(referenceFile: IndexedFastaSequenceFile,
                    contig: String,
                    start: Long,
                    end: Long): Double = {
Peter van 't Hof's avatar
Peter van 't Hof committed
71 72 73
    require(referenceFile.isIndexed)
    val sequence = referenceFile.getSubsequenceAt(contig, start, end)
    val gcCount = sequence.getBaseString.toLowerCase.count(c => c == 'c' || c == 'g')
74 75
    val atCount = sequence.getBaseString.toLowerCase.count(c => c == 'a' || c == 't')
    val gc = gcCount.toDouble / (gcCount + atCount)
Peter van 't Hof's avatar
Peter van 't Hof committed
76 77
    gc
  }
78 79 80 81 82 83 84 85 86

  def readContigMap(file: File): Map[String, Set[String]] = {
    val reader = Source.fromFile(file)
    val map = reader
      .getLines()
      .filter(!_.startsWith("#"))
      .map { line =>
        val columns = line.split("\t")
        val refContig = columns(0)
87 88
        val alternativeNames = columns(1).split(";").toSet
        refContig -> alternativeNames
89 90 91 92 93 94 95 96 97
      }
      .toMap
    reader.close()
    map
  }

  def readContigMapReverse(file: File): Map[String, String] = {
    readContigMap(file).flatMap(x => x._2.map(y => y -> x._1))
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
98
}