VcfToTsv.scala 8.1 KB
Newer Older
1 2 3 4 5
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
Peter van 't Hof's avatar
Peter van 't Hof committed
6
 *
7
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
Peter van 't Hof's avatar
Peter van 't Hof committed
8
 *
9
 * Contact us at: sasc@lumc.nl
Peter van 't Hof's avatar
Peter van 't Hof committed
10
 *
11 12 13 14
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
Peter van 't Hof's avatar
Peter van 't Hof committed
15 16 17
 */
package nl.lumc.sasc.biopet.tools

Peter van 't Hof's avatar
Peter van 't Hof committed
18
import java.io.{ File, PrintStream }
19 20
import java.text.DecimalFormat

Peter van 't Hof's avatar
Peter van 't Hof committed
21 22
import htsjdk.variant.vcf.VCFFileReader
import nl.lumc.sasc.biopet.core.ToolCommand
Peter van 't Hof's avatar
Peter van 't Hof committed
23

Peter van 't Hof's avatar
Peter van 't Hof committed
24
import scala.collection.JavaConversions._
Peter van 't Hof's avatar
Peter van 't Hof committed
25
import scala.collection.mutable
Peter van 't Hof's avatar
Peter van 't Hof committed
26
import scala.collection.mutable.{ ListBuffer, Map }
Peter van 't Hof's avatar
Peter van 't Hof committed
27 28 29 30 31 32

class VcfToTsv {
  // TODO: Queue wrapper
}

object VcfToTsv extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
33
  case class Args(inputFile: File = null, outputFile: File = null, fields: List[String] = Nil, infoFields: List[String] = Nil,
34
                  sampleFields: List[String] = Nil, disableDefaults: Boolean = false,
35
                  allInfo: Boolean = false, allFormat: Boolean = false,
36
                  separator: String = "\t", listSeparator: String = ",", maxDecimals: Int = 2) extends AbstractArgs
Peter van 't Hof's avatar
Peter van 't Hof committed
37 38

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
39
    opt[File]('I', "inputFile") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
40 41
      c.copy(inputFile = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
42
    opt[File]('o', "outputFile") maxOccurs 1 valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
43
      c.copy(outputFile = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
44
    } text "output file, default to stdout"
Peter van 't Hof's avatar
Peter van 't Hof committed
45 46 47 48 49 50 51 52 53 54 55 56 57
    opt[String]('f', "field") unbounded () action { (x, c) =>
      c.copy(fields = x :: c.fields)
    }
    opt[String]('i', "info_field") unbounded () action { (x, c) =>
      c.copy(infoFields = x :: c.infoFields)
    }
    opt[Unit]("all_info") unbounded () action { (x, c) =>
      c.copy(allInfo = true)
    }
    opt[Unit]("all_format") unbounded () action { (x, c) =>
      c.copy(allFormat = true)
    }
    opt[String]('s', "sample_field") unbounded () action { (x, c) =>
58
      c.copy(sampleFields = x :: c.sampleFields)
Peter van 't Hof's avatar
Peter van 't Hof committed
59 60 61 62
    }
    opt[Unit]('d', "disable_defaults") unbounded () action { (x, c) =>
      c.copy(disableDefaults = true)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
63
    opt[String]("separator") maxOccurs 1 action { (x, c) =>
64
      c.copy(separator = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
65 66
    } text "Optional separator. Default is tab-delimited"
    opt[String]("list_separator") maxOccurs 1 action { (x, c) =>
67
      c.copy(listSeparator = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
68 69
    } text "Optional list separator. By default, lists are separated by a comma"
    opt[Int]("max_decimals") maxOccurs 1 action { (x, c) =>
70
      c.copy(maxDecimals = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
71
    } text "Number of decimal places for numbers. Default is 2"
Peter van 't Hof's avatar
Peter van 't Hof committed
72
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
73

74
  val defaultFields = List("CHROM", "POS", "ID", "REF", "ALT", "QUAL")
Peter van 't Hof's avatar
Peter van 't Hof committed
75

Peter van 't Hof's avatar
Peter van 't Hof committed
76 77 78
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
Peter van 't Hof's avatar
Peter van 't Hof committed
79

80 81
    // Throw exception if separator and listSeparator are identical
    if (commandArgs.separator == commandArgs.listSeparator) throw new IllegalArgumentException(
Peter van 't Hof's avatar
Peter van 't Hof committed
82
      "Separator and list_separator should not be identical"
83 84 85 86
    )

    val formatter = createFormatter(commandArgs.maxDecimals)

Peter van 't Hof's avatar
Peter van 't Hof committed
87 88 89
    val reader = new VCFFileReader(commandArgs.inputFile, false)
    val header = reader.getFileHeader
    val samples = header.getSampleNamesInOrder
Peter van 't Hof's avatar
Peter van 't Hof committed
90

91 92
    val allInfoFields = header.getInfoHeaderLines.map(_.getID).toList
    val allFormatFields = header.getFormatHeaderLines.map(_.getID).toList
Peter van 't Hof's avatar
Peter van 't Hof committed
93 94 95 96 97

    val fields: Set[String] = (if (commandArgs.disableDefaults) Nil else defaultFields).toSet[String] ++
      commandArgs.fields.toSet[String] ++
      (if (commandArgs.allInfo) allInfoFields else commandArgs.infoFields).map("INFO-" + _) ++ {
        val buffer: ListBuffer[String] = ListBuffer()
Peter van 't Hof's avatar
Peter van 't Hof committed
98
        for (f <- if (commandArgs.allFormat) allFormatFields else commandArgs.sampleFields; sample <- samples) {
Peter van 't Hof's avatar
Peter van 't Hof committed
99 100 101
          buffer += sample + "-" + f
        }
        buffer.toSet[String]
Peter van 't Hof's avatar
Peter van 't Hof committed
102
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
103

104
    val sortedFields = sortFields(fields, samples.toList)
Peter van 't Hof's avatar
Peter van 't Hof committed
105

Sander Bollen's avatar
Sander Bollen committed
106
    val writer = if (commandArgs.outputFile != null) new PrintStream(commandArgs.outputFile)
107
    else sys.process.stdout
Peter van 't Hof's avatar
Peter van 't Hof committed
108

109
    writer.println(sortedFields.mkString("#", commandArgs.separator, ""))
Peter van 't Hof's avatar
Peter van 't Hof committed
110
    for (vcfRecord <- reader) {
Peter van 't Hof's avatar
Peter van 't Hof committed
111
      val values: mutable.Map[String, Any] = mutable.Map()
112
      values += "CHROM" -> vcfRecord.getContig
113 114 115 116
      values += "POS" -> vcfRecord.getStart
      values += "ID" -> vcfRecord.getID
      values += "REF" -> vcfRecord.getReference.getBaseString
      values += "ALT" -> {
Peter van 't Hof's avatar
Peter van 't Hof committed
117
        val t = for (a <- vcfRecord.getAlternateAlleles) yield a.getBaseString
118
        t.mkString(commandArgs.listSeparator)
Peter van 't Hof's avatar
Peter van 't Hof committed
119
      }
120
      values += "QUAL" -> (if (vcfRecord.getPhredScaledQual == -10) "." else formatter.format(vcfRecord.getPhredScaledQual))
121
      values += "INFO" -> vcfRecord.getFilters
Peter van 't Hof's avatar
Peter van 't Hof committed
122
      for ((field, content) <- vcfRecord.getAttributes) {
Peter van 't Hof's avatar
Peter van 't Hof committed
123
        values += "INFO-" + field -> {
Peter van 't Hof's avatar
Peter van 't Hof committed
124
          content match {
125 126 127
            case a: List[_]                => a.mkString(commandArgs.listSeparator)
            case a: Array[_]               => a.mkString(commandArgs.listSeparator)
            case a: java.util.ArrayList[_] => a.mkString(commandArgs.listSeparator)
Peter van 't Hof's avatar
Peter van 't Hof committed
128
            case _                         => content
Peter van 't Hof's avatar
Peter van 't Hof committed
129 130 131
          }
        }
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
132

Peter van 't Hof's avatar
Peter van 't Hof committed
133 134
      for (sample <- samples) {
        val genotype = vcfRecord.getGenotype(sample)
Peter van 't Hof's avatar
Peter van 't Hof committed
135
        values += sample + "-GT" -> {
Peter van 't Hof's avatar
Peter van 't Hof committed
136 137 138
          val l = for (g <- genotype.getAlleles) yield vcfRecord.getAlleleIndex(g)
          l.map(x => if (x < 0) "." else x).mkString("/")
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
139 140 141 142
        if (genotype.hasAD) values += sample + "-AD" -> List(genotype.getAD: _*).mkString(commandArgs.listSeparator)
        if (genotype.hasDP) values += sample + "-DP" -> genotype.getDP
        if (genotype.hasGQ) values += sample + "-GQ" -> genotype.getGQ
        if (genotype.hasPL) values += sample + "-PL" -> List(genotype.getPL: _*).mkString(commandArgs.listSeparator)
Peter van 't Hof's avatar
Peter van 't Hof committed
143
        for ((field, content) <- genotype.getExtendedAttributes) {
Peter van 't Hof's avatar
Peter van 't Hof committed
144
          values += sample + "-" + field -> content
Peter van 't Hof's avatar
Peter van 't Hof committed
145 146
        }
      }
147
      val line = for (f <- sortedFields) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
148 149
        if (values.contains(f)) {
          values(f)
Peter van 't Hof's avatar
Peter van 't Hof committed
150 151
        } else ""
      }
152
      writer.println(line.mkString(commandArgs.separator))
Peter van 't Hof's avatar
Peter van 't Hof committed
153 154
    }
  }
155 156 157 158 159 160 161

  /**
   *  This function creates a correct DecimalFormat for a specific length of decimals
   * @param len number of decimal places
   * @return DecimalFormat formatter
   */
  def createFormatter(len: Int): DecimalFormat = {
Peter van 't Hof's avatar
Peter van 't Hof committed
162
    val patternString = "###." + (for (x <- 1 to len) yield "#").mkString("")
163 164 165 166 167 168 169 170 171 172 173 174 175 176
    new DecimalFormat(patternString)
  }

  /**
   * This fields sorts fields, such that non-info and non-sample specific fields (e.g. general ones) are on front
   * followed by info fields
   * followed by sample-specific fields
   * @param fields fields
   * @param samples samples
   * @return sorted samples
   */
  def sortFields(fields: Set[String], samples: List[String]): List[String] = {
    def fieldType(x: String) = x match {
      case _ if x.startsWith("INFO-") => 'i'
Peter van 't Hof's avatar
Peter van 't Hof committed
177
      case _ if samples.exists(y => x.startsWith(y + "-")) => 'f'
178 179 180 181 182
      case _ => 'g'
    }

    fields.toList.sortWith((a, b) => {
      (fieldType(a), fieldType(b)) match {
Peter van 't Hof's avatar
Peter van 't Hof committed
183
        case ('g', 'g') =>
184 185 186
          val ai = defaultFields.indexOf(a)
          val bi = defaultFields.indexOf(b)
          if (bi < 0) true else ai <= bi
Peter van 't Hof's avatar
Peter van 't Hof committed
187
        case ('f', 'f') =>
188 189 190
          val sampleA = a.split("-").head
          val sampleB = b.split("-").head
          sampleA.compareTo(sampleB) match {
Peter van 't Hof's avatar
Peter van 't Hof committed
191 192 193
            case 0          => !(a.compareTo(b) > 0)
            case i if i > 0 => false
            case _          => true
194
          }
Peter van 't Hof's avatar
Peter van 't Hof committed
195 196 197 198 199
        case ('g', _)             => true
        case (_, 'g')             => false
        case (a2, b2) if a2 == b2 => !(a2.compareTo(b2) > 0)
        case ('i', _)             => true
        case _                    => false
200 201 202
      }
    })
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
203
}