VcfUtils.scala 5.48 KB
Newer Older
1
/**
2 3 4 5 6 7 8 9 10 11 12 13 14
  * Biopet is built on top of GATK Queue for building bioinformatic
  * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
  * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
  * should also be able to execute Biopet tools and pipelines.
  *
  * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
  *
  * Contact us at: sasc@lumc.nl
  *
  * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
  * license; For commercial users or users who do not want to follow the AGPL
  * license, please contact us to obtain a separate license.
  */
Peter van 't Hof's avatar
Peter van 't Hof committed
15
package nl.lumc.sasc.biopet.utils
Peter van 't Hof's avatar
Peter van 't Hof committed
16

17
import java.io.File
Sander Bollen's avatar
Sander Bollen committed
18 19
import java.util

20 21
import htsjdk.variant.variantcontext.{Allele, Genotype, VariantContext}
import htsjdk.variant.vcf.{VCFFileReader, VCFFilterHeaderLine, VCFHeader}
Peter van 't Hof's avatar
Peter van 't Hof committed
22

Peter van 't Hof's avatar
Peter van 't Hof committed
23 24 25 26
import scala.collection.JavaConversions._

/** Utility object for general vcf file/records functions. */
object VcfUtils {
27

Peter van 't Hof's avatar
Peter van 't Hof committed
28
  /**
29 30 31 32 33
    * Return longest allele of VariantContext.
    *
    * @param vcfRecord record to check
    * @return allele with most nucleotides
    */
34
  def getLongestAllele(vcfRecord: VariantContext): Allele = {
Peter van 't Hof's avatar
Peter van 't Hof committed
35 36 37 38
    val alleles = vcfRecord.getAlleles
    val longestAlleleId = alleles.map(_.getBases.length).zipWithIndex.maxBy(_._1)._2
    alleles(longestAlleleId)
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
39

Peter van 't Hof's avatar
Peter van 't Hof committed
40
  /**
41 42 43 44 45 46
    * Method will extend a allele till a new length
    * @param bases Allele
    * @param newSize New size of allele
    * @param fillWith Char to fill gap
    * @return
    */
Peter van 't Hof's avatar
Peter van 't Hof committed
47
  def fillAllele(bases: String, newSize: Int, fillWith: Char = '-'): String = {
Peter van 't Hof's avatar
Peter van 't Hof committed
48
    bases + Array.fill[Char](newSize - bases.length)(fillWith).mkString
Peter van 't Hof's avatar
Peter van 't Hof committed
49
  }
Sander Bollen's avatar
Sander Bollen committed
50 51

  /**
52 53 54 55 56 57 58 59
    * Stands for scalaListToJavaObjectArrayList
    * Convert a scala List[Any] to a java ArrayList[Object]. This is necessary for BCF conversions
    * As scala ints and floats cannot be directly cast to java objects (they aren't objects),
    * we need to box them.
    * For items not Int, Float or Object, we assume them to be strings (TODO: sane assumption?)
    * @param array scala List[Any]
    * @return converted java ArrayList[Object]
    */
Sander Bollen's avatar
Sander Bollen committed
60 61 62 63
  def scalaListToJavaObjectArrayList(array: List[Any]): util.ArrayList[Object] = {
    val out = new util.ArrayList[Object]()

    array.foreach {
64 65 66 67 68 69
      case x: Long => out.add(Long.box(x))
      case x: Int => out.add(Int.box(x))
      case x: Char => out.add(Char.box(x))
      case x: Byte => out.add(Byte.box(x))
      case x: Double => out.add(Double.box(x))
      case x: Float => out.add(Float.box(x))
Peter van 't Hof's avatar
Peter van 't Hof committed
70
      case x: Boolean => out.add(Boolean.box(x))
71 72 73
      case x: String => out.add(x)
      case x: Object => out.add(x)
      case x => out.add(x.toString)
Sander Bollen's avatar
Sander Bollen committed
74 75 76 77
    }
    out
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
78
  //TODO: Add genotype comparing to this function
Sander Bollen's avatar
Sander Bollen committed
79
  def identicalVariantContext(var1: VariantContext, var2: VariantContext): Boolean = {
Peter van 't Hof's avatar
Peter van 't Hof committed
80
    var1.getContig == var2.getContig &&
81 82 83
    var1.getStart == var2.getStart &&
    var1.getEnd == var2.getEnd &&
    var1.getAttributes == var2.getAttributes
Sander Bollen's avatar
Sander Bollen committed
84
  }
Sander Bollen's avatar
Sander Bollen committed
85 86

  /**
87 88 89 90
    * Return true if header is a block-type GVCF file
    * @param header header of Vcf file
    * @return boolean
    */
Sander Bollen's avatar
Sander Bollen committed
91 92 93
  def isBlockGVcf(header: VCFHeader): Boolean = {
    header.getMetaDataLine("GVCFBlock") != null
  }
94 95

  /**
96 97 98 99
    * Get sample IDs from vcf File
    * @param vcf File object pointing to vcf
    * @return list of strings with sample IDs
    */
100 101 102 103 104 105
  def getSampleIds(vcf: File): List[String] = {
    val reader = new VCFFileReader(vcf, false)
    val samples = reader.getFileHeader.getSampleNamesInOrder.toList
    reader.close()
    samples
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
106 107

  /**
108 109 110 111 112 113
    * Check whether record has minimum genome Quality
    * @param record variant context
    * @param sample sample name
    * @param minGQ minimum genome quality value
    * @return
    */
Peter van 't Hof's avatar
Peter van 't Hof committed
114 115 116 117 118 119 120 121
  def hasMinGenomeQuality(record: VariantContext, sample: String, minGQ: Int): Boolean = {
    if (!record.getSampleNamesOrderedByName.contains(sample))
      throw new IllegalArgumentException("Sample does not exist")
    val gt = record.getGenotype(sample)
    hasMinGenomeQuality(gt, minGQ)
  }

  /**
122 123 124 125 126
    * Check whether genotype has minimum genome Quality
    * @param gt Genotype
    * @param minGQ minimum genome quality value
    * @return
    */
Peter van 't Hof's avatar
Peter van 't Hof committed
127 128 129
  def hasMinGenomeQuality(gt: Genotype, minGQ: Int): Boolean = {
    gt.hasGQ && gt.getGQ >= minGQ
  }
130 131 132 133 134 135 136

  def getVcfIndexFile(vcfFile: File): File = {
    val name = vcfFile.getAbsolutePath
    if (name.endsWith(".vcf")) new File(name + ".idx")
    else if (name.endsWith(".vcf.gz")) new File(name + ".tbi")
    else throw new IllegalArgumentException(s"File given is no vcf file: $vcfFile")
  }
137 138 139 140 141 142 143

  def vcfFileIsEmpty(file: File): Boolean = {
    val reader = new VCFFileReader(file, false)
    val hasNext = reader.iterator().hasNext
    reader.close()
    !hasNext
  }
Sander Bollen's avatar
Sander Bollen committed
144 145

  /**
146 147 148 149
    * Check whether genotype is of the form 0/.
    * @param genotype genotype
    * @return boolean
    */
Sander Bollen's avatar
Sander Bollen committed
150
  def isCompoundNoCall(genotype: Genotype): Boolean = {
151 152
    genotype.isCalled && genotype.getAlleles.exists(_.isNoCall) && genotype.getAlleles.exists(
      _.isReference)
Sander Bollen's avatar
Sander Bollen committed
153
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
154 155 156 157 158 159 160 161 162 163 164 165 166 167

  /** Give back the number of alleles that overlap */
  def alleleOverlap(g1: List[Allele], g2: List[Allele], start: Int = 0): Int = {
    if (g1.isEmpty) start
    else {
      val found = g2.contains(g1.head)
      val g2tail = if (found) {
        val index = g2.indexOf(g1.head)
        g2.drop(index + 1) ++ g2.take(index)
      } else g2

      alleleOverlap(g1.tail, g2tail, if (found) start + 1 else start)
    }
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
168
}