VcfToTsv.scala 8.08 KB
Newer Older
1
2
3
4
5
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
Peter van 't Hof's avatar
Peter van 't Hof committed
6
 *
7
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
Peter van 't Hof's avatar
Peter van 't Hof committed
8
 *
9
 * Contact us at: sasc@lumc.nl
Peter van 't Hof's avatar
Peter van 't Hof committed
10
 *
11
12
13
14
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
Peter van 't Hof's avatar
Peter van 't Hof committed
15
16
17
 */
package nl.lumc.sasc.biopet.tools

Peter van 't Hof's avatar
Peter van 't Hof committed
18
import java.io.{ File, PrintStream }
19
20
import java.text.DecimalFormat

Peter van 't Hof's avatar
Peter van 't Hof committed
21
22
import htsjdk.variant.vcf.VCFFileReader
import nl.lumc.sasc.biopet.core.ToolCommand
Peter van 't Hof's avatar
Peter van 't Hof committed
23

Peter van 't Hof's avatar
Peter van 't Hof committed
24
import scala.collection.JavaConversions._
Peter van 't Hof's avatar
Peter van 't Hof committed
25
import scala.collection.mutable.{ ListBuffer, Map }
Peter van 't Hof's avatar
Peter van 't Hof committed
26
27
28
29
30
31

class VcfToTsv {
  // TODO: Queue wrapper
}

object VcfToTsv extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
32
  case class Args(inputFile: File = null, outputFile: File = null, fields: List[String] = Nil, infoFields: List[String] = Nil,
33
                  sampleFields: List[String] = Nil, disableDefaults: Boolean = false,
34
                  allInfo: Boolean = false, allFormat: Boolean = false,
35
                  separator: String = "\t", listSeparator: String = ",", maxDecimals: Int = 2) extends AbstractArgs
Peter van 't Hof's avatar
Peter van 't Hof committed
36
37

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
    opt[File]('I', "inputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(inputFile = x)
    }
    opt[File]('o', "outputFile") maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(outputFile = x)
    } text ("output file, default to stdout")
    opt[String]('f', "field") unbounded () action { (x, c) =>
      c.copy(fields = x :: c.fields)
    }
    opt[String]('i', "info_field") unbounded () action { (x, c) =>
      c.copy(infoFields = x :: c.infoFields)
    }
    opt[Unit]("all_info") unbounded () action { (x, c) =>
      c.copy(allInfo = true)
    }
    opt[Unit]("all_format") unbounded () action { (x, c) =>
      c.copy(allFormat = true)
    }
    opt[String]('s', "sample_field") unbounded () action { (x, c) =>
57
      c.copy(sampleFields = x :: c.sampleFields)
Peter van 't Hof's avatar
Peter van 't Hof committed
58
59
60
61
    }
    opt[Unit]('d', "disable_defaults") unbounded () action { (x, c) =>
      c.copy(disableDefaults = true)
    }
62
63
64
65
66
67
    opt[String]("separator") maxOccurs (1) action { (x, c) =>
      c.copy(separator = x)
    } text ("Optional separator. Default is tab-delimited")
    opt[String]("list_separator") maxOccurs (1) action { (x, c) =>
      c.copy(listSeparator = x)
    } text ("Optional list separator. By default, lists are separated by a comma")
Peter van 't Hof's avatar
Peter van 't Hof committed
68
    opt[Int]("max_decimals") maxOccurs (1) action { (x, c) =>
69
70
      c.copy(maxDecimals = x)
    } text ("Number of decimal places for numbers. Default is 2")
Peter van 't Hof's avatar
Peter van 't Hof committed
71
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
72

73
  val defaultFields = List("CHROM", "POS", "ID", "REF", "ALT", "QUAL")
Peter van 't Hof's avatar
Peter van 't Hof committed
74

Peter van 't Hof's avatar
Peter van 't Hof committed
75
76
77
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
Peter van 't Hof's avatar
Peter van 't Hof committed
78

79
80
    // Throw exception if separator and listSeparator are identical
    if (commandArgs.separator == commandArgs.listSeparator) throw new IllegalArgumentException(
Peter van 't Hof's avatar
Peter van 't Hof committed
81
      "Separator and list_separator should not be identical"
82
83
84
85
    )

    val formatter = createFormatter(commandArgs.maxDecimals)

Peter van 't Hof's avatar
Peter van 't Hof committed
86
87
88
    val reader = new VCFFileReader(commandArgs.inputFile, false)
    val header = reader.getFileHeader
    val samples = header.getSampleNamesInOrder
Peter van 't Hof's avatar
Peter van 't Hof committed
89

90
91
    val allInfoFields = header.getInfoHeaderLines.map(_.getID).toList
    val allFormatFields = header.getFormatHeaderLines.map(_.getID).toList
Peter van 't Hof's avatar
Peter van 't Hof committed
92
93
94
95
96

    val fields: Set[String] = (if (commandArgs.disableDefaults) Nil else defaultFields).toSet[String] ++
      commandArgs.fields.toSet[String] ++
      (if (commandArgs.allInfo) allInfoFields else commandArgs.infoFields).map("INFO-" + _) ++ {
        val buffer: ListBuffer[String] = ListBuffer()
97
        for (f <- (if (commandArgs.allFormat) allFormatFields else commandArgs.sampleFields); sample <- samples) {
Peter van 't Hof's avatar
Peter van 't Hof committed
98
99
100
          buffer += sample + "-" + f
        }
        buffer.toSet[String]
Peter van 't Hof's avatar
Peter van 't Hof committed
101
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
102

103
    val sortedFields = sortFields(fields, samples.toList)
Peter van 't Hof's avatar
Peter van 't Hof committed
104

Sander Bollen's avatar
Sander Bollen committed
105
    val writer = if (commandArgs.outputFile != null) new PrintStream(commandArgs.outputFile)
106
    else sys.process.stdout
Peter van 't Hof's avatar
Peter van 't Hof committed
107

108
    writer.println(sortedFields.mkString("#", commandArgs.separator, ""))
Peter van 't Hof's avatar
Peter van 't Hof committed
109
110
    for (vcfRecord <- reader) {
      val values: Map[String, Any] = Map()
111
112
113
114
115
      values += "CHROM" -> vcfRecord.getChr
      values += "POS" -> vcfRecord.getStart
      values += "ID" -> vcfRecord.getID
      values += "REF" -> vcfRecord.getReference.getBaseString
      values += "ALT" -> {
Peter van 't Hof's avatar
Peter van 't Hof committed
116
        val t = for (a <- vcfRecord.getAlternateAlleles) yield a.getBaseString
117
        t.mkString(commandArgs.listSeparator)
Peter van 't Hof's avatar
Peter van 't Hof committed
118
      }
119
      values += "QUAL" -> (if (vcfRecord.getPhredScaledQual == -10) "." else formatter.format(vcfRecord.getPhredScaledQual))
120
      values += "INFO" -> vcfRecord.getFilters
Peter van 't Hof's avatar
Peter van 't Hof committed
121
      for ((field, content) <- vcfRecord.getAttributes) {
Peter van 't Hof's avatar
Peter van 't Hof committed
122
        values += "INFO-" + field -> {
Peter van 't Hof's avatar
Peter van 't Hof committed
123
          content match {
124
125
126
            case a: List[_]                => a.mkString(commandArgs.listSeparator)
            case a: Array[_]               => a.mkString(commandArgs.listSeparator)
            case a: java.util.ArrayList[_] => a.mkString(commandArgs.listSeparator)
Peter van 't Hof's avatar
Peter van 't Hof committed
127
            case _                         => content
Peter van 't Hof's avatar
Peter van 't Hof committed
128
129
130
          }
        }
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
131

Peter van 't Hof's avatar
Peter van 't Hof committed
132
133
      for (sample <- samples) {
        val genotype = vcfRecord.getGenotype(sample)
Peter van 't Hof's avatar
Peter van 't Hof committed
134
        values += sample + "-GT" -> {
Peter van 't Hof's avatar
Peter van 't Hof committed
135
136
137
          val l = for (g <- genotype.getAlleles) yield vcfRecord.getAlleleIndex(g)
          l.map(x => if (x < 0) "." else x).mkString("/")
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
138
139
140
141
        if (genotype.hasAD) values += sample + "-AD" -> List(genotype.getAD: _*).mkString(commandArgs.listSeparator)
        if (genotype.hasDP) values += sample + "-DP" -> genotype.getDP
        if (genotype.hasGQ) values += sample + "-GQ" -> genotype.getGQ
        if (genotype.hasPL) values += sample + "-PL" -> List(genotype.getPL: _*).mkString(commandArgs.listSeparator)
Peter van 't Hof's avatar
Peter van 't Hof committed
142
        for ((field, content) <- genotype.getExtendedAttributes) {
Peter van 't Hof's avatar
Peter van 't Hof committed
143
          values += sample + "-" + field -> content
Peter van 't Hof's avatar
Peter van 't Hof committed
144
145
        }
      }
146
      val line = for (f <- sortedFields) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
147
148
        if (values.contains(f)) {
          values(f)
Peter van 't Hof's avatar
Peter van 't Hof committed
149
150
        } else ""
      }
151
      writer.println(line.mkString(commandArgs.separator))
Peter van 't Hof's avatar
Peter van 't Hof committed
152
153
    }
  }
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181

  /**
   *  This function creates a correct DecimalFormat for a specific length of decimals
   * @param len number of decimal places
   * @return DecimalFormat formatter
   */
  def createFormatter(len: Int): DecimalFormat = {
    val patternString = "###." + (for (x <- (1 to len)) yield "#").mkString("")
    new DecimalFormat(patternString)
  }

  /**
   * This fields sorts fields, such that non-info and non-sample specific fields (e.g. general ones) are on front
   * followed by info fields
   * followed by sample-specific fields
   * @param fields fields
   * @param samples samples
   * @return sorted samples
   */
  def sortFields(fields: Set[String], samples: List[String]): List[String] = {
    def fieldType(x: String) = x match {
      case _ if x.startsWith("INFO-") => 'i'
      case _ if (samples.exists(y => x.startsWith(y + "-"))) => 'f'
      case _ => 'g'
    }

    fields.toList.sortWith((a, b) => {
      (fieldType(a), fieldType(b)) match {
Peter van 't Hof's avatar
Peter van 't Hof committed
182
        case ('g', 'g') => {
183
184
185
186
187
188
189
190
          val ai = defaultFields.indexOf(a)
          val bi = defaultFields.indexOf(b)
          if (bi < 0) true else ai <= bi
        }
        case ('f', 'f') => {
          val sampleA = a.split("-").head
          val sampleB = b.split("-").head
          sampleA.compareTo(sampleB) match {
Peter van 't Hof's avatar
Peter van 't Hof committed
191
            case 0            => !(a.compareTo(b) > 0)
192
            case i if (i > 0) => false
Peter van 't Hof's avatar
Peter van 't Hof committed
193
            case _            => true
194
195
          }
        }
Peter van 't Hof's avatar
Peter van 't Hof committed
196
197
        case ('g', _)         => true
        case (_, 'g')         => false
198
        case (a, b) if a == b => !(a.compareTo(b) > 0)
Peter van 't Hof's avatar
Peter van 't Hof committed
199
200
        case ('i', _)         => true
        case _                => false
201
202
203
      }
    })
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
204
}