VcfToTsv.scala 6.54 KB
Newer Older
1
2
3
4
5
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
Peter van 't Hof's avatar
Peter van 't Hof committed
6
 *
7
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
Peter van 't Hof's avatar
Peter van 't Hof committed
8
 *
9
 * Contact us at: sasc@lumc.nl
Peter van 't Hof's avatar
Peter van 't Hof committed
10
 *
11
12
13
14
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
Peter van 't Hof's avatar
Peter van 't Hof committed
15
16
17
18
19
20
21
22
 */
package nl.lumc.sasc.biopet.tools

import htsjdk.variant.vcf.VCFFileReader
import java.io.File
import java.io.PrintStream
import nl.lumc.sasc.biopet.core.ToolCommand
import scala.collection.JavaConversions._
Peter van 't Hof's avatar
Peter van 't Hof committed
23
import scala.collection.mutable.{ Map, ListBuffer }
Peter van 't Hof's avatar
Peter van 't Hof committed
24
25
26
27
28
29

class VcfToTsv {
  // TODO: Queue wrapper
}

object VcfToTsv extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
30
  case class Args(inputFile: File = null, outputFile: File = null, fields: List[String] = Nil, infoFields: List[String] = Nil,
31
                  sampleFields: List[String] = Nil, disableDefaults: Boolean = false,
32
33
                  allInfo: Boolean = false, allFormat: Boolean = false,
                  separator: String = "\t", listSeparator: String = ",") extends AbstractArgs
Peter van 't Hof's avatar
Peter van 't Hof committed
34
35

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
    opt[File]('I', "inputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(inputFile = x)
    }
    opt[File]('o', "outputFile") maxOccurs (1) valueName ("<file>") action { (x, c) =>
      c.copy(outputFile = x)
    } text ("output file, default to stdout")
    opt[String]('f', "field") unbounded () action { (x, c) =>
      c.copy(fields = x :: c.fields)
    }
    opt[String]('i', "info_field") unbounded () action { (x, c) =>
      c.copy(infoFields = x :: c.infoFields)
    }
    opt[Unit]("all_info") unbounded () action { (x, c) =>
      c.copy(allInfo = true)
    }
    opt[Unit]("all_format") unbounded () action { (x, c) =>
      c.copy(allFormat = true)
    }
    opt[String]('s', "sample_field") unbounded () action { (x, c) =>
55
      c.copy(sampleFields = x :: c.sampleFields)
Peter van 't Hof's avatar
Peter van 't Hof committed
56
57
58
59
    }
    opt[Unit]('d', "disable_defaults") unbounded () action { (x, c) =>
      c.copy(disableDefaults = true)
    }
60
61
62
63
64
65
    opt[String]("separator") maxOccurs (1) action { (x, c) =>
      c.copy(separator = x)
    } text ("Optional separator. Default is tab-delimited")
    opt[String]("list_separator") maxOccurs (1) action { (x, c) =>
      c.copy(listSeparator = x)
    } text ("Optional list separator. By default, lists are separated by a comma")
Peter van 't Hof's avatar
Peter van 't Hof committed
66
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
67

68
  val defaultFields = List("CHROM", "POS", "ID", "REF", "ALT", "QUAL")
Peter van 't Hof's avatar
Peter van 't Hof committed
69

Peter van 't Hof's avatar
Peter van 't Hof committed
70
71
72
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
Peter van 't Hof's avatar
Peter van 't Hof committed
73

Peter van 't Hof's avatar
Peter van 't Hof committed
74
75
76
    val reader = new VCFFileReader(commandArgs.inputFile, false)
    val header = reader.getFileHeader
    val samples = header.getSampleNamesInOrder
Peter van 't Hof's avatar
Peter van 't Hof committed
77

78
79
    val allInfoFields = header.getInfoHeaderLines.map(_.getID).toList
    val allFormatFields = header.getFormatHeaderLines.map(_.getID).toList
Peter van 't Hof's avatar
Peter van 't Hof committed
80
81
82
83
84

    val fields: Set[String] = (if (commandArgs.disableDefaults) Nil else defaultFields).toSet[String] ++
      commandArgs.fields.toSet[String] ++
      (if (commandArgs.allInfo) allInfoFields else commandArgs.infoFields).map("INFO-" + _) ++ {
        val buffer: ListBuffer[String] = ListBuffer()
85
        for (f <- (if (commandArgs.allFormat) allFormatFields else commandArgs.sampleFields); sample <- samples) {
Peter van 't Hof's avatar
Peter van 't Hof committed
86
87
88
          buffer += sample + "-" + f
        }
        buffer.toSet[String]
Peter van 't Hof's avatar
Peter van 't Hof committed
89
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
90
91
92
93

    val sortedFields = fields.toList.sortWith((a, b) => {
      val aT = if (a.startsWith("INFO-")) 'i' else if (samples.exists(x => a.startsWith(x + "-"))) 'f' else 'g'
      val bT = if (b.startsWith("INFO-")) 'i' else if (samples.exists(x => b.startsWith(x + "-"))) 'f' else 'g'
94
95
96
97
98
      if (aT == 'g' && bT == 'g') {
        val ai = defaultFields.indexOf(a)
        val bi = defaultFields.indexOf(b)
        if (bi < 0) true
        else ai <= bi
Peter van 't Hof's avatar
Peter van 't Hof committed
99
      } else if (aT == 'g') true
100
101
102
103
104
      else if (bT == 'g') false
      else if (aT == bT) (if (a.compareTo(b) > 0) false else true)
      else if (aT == 'i') true
      else false
    })
Peter van 't Hof's avatar
Peter van 't Hof committed
105

Sander Bollen's avatar
Sander Bollen committed
106
    val writer = if (commandArgs.outputFile != null) new PrintStream(commandArgs.outputFile)
107
    else sys.process.stdout
Peter van 't Hof's avatar
Peter van 't Hof committed
108

109
    writer.println(sortedFields.mkString("#", commandArgs.separator, ""))
Peter van 't Hof's avatar
Peter van 't Hof committed
110
111
    for (vcfRecord <- reader) {
      val values: Map[String, Any] = Map()
112
113
114
115
116
      values += "CHROM" -> vcfRecord.getChr
      values += "POS" -> vcfRecord.getStart
      values += "ID" -> vcfRecord.getID
      values += "REF" -> vcfRecord.getReference.getBaseString
      values += "ALT" -> {
Peter van 't Hof's avatar
Peter van 't Hof committed
117
        val t = for (a <- vcfRecord.getAlternateAlleles) yield a.getBaseString
118
        t.mkString(commandArgs.listSeparator)
Peter van 't Hof's avatar
Peter van 't Hof committed
119
      }
120
121
      values += "QUAL" -> (if (vcfRecord.getPhredScaledQual == -10) "." else scala.math.round(vcfRecord.getPhredScaledQual * 100.0) / 100.0)
      values += "INFO" -> vcfRecord.getFilters
Peter van 't Hof's avatar
Peter van 't Hof committed
122
      for ((field, content) <- vcfRecord.getAttributes) {
Peter van 't Hof's avatar
Peter van 't Hof committed
123
        values += "INFO-" + field -> {
Peter van 't Hof's avatar
Peter van 't Hof committed
124
          content match {
125
126
127
            case a: List[_]                => a.mkString(commandArgs.listSeparator)
            case a: Array[_]               => a.mkString(commandArgs.listSeparator)
            case a: java.util.ArrayList[_] => a.mkString(commandArgs.listSeparator)
Peter van 't Hof's avatar
Peter van 't Hof committed
128
            case _                         => content
Peter van 't Hof's avatar
Peter van 't Hof committed
129
130
131
          }
        }
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
132

Peter van 't Hof's avatar
Peter van 't Hof committed
133
134
      for (sample <- samples) {
        val genotype = vcfRecord.getGenotype(sample)
Peter van 't Hof's avatar
Peter van 't Hof committed
135
        values += sample + "-GT" -> {
Peter van 't Hof's avatar
Peter van 't Hof committed
136
137
138
          val l = for (g <- genotype.getAlleles) yield vcfRecord.getAlleleIndex(g)
          l.map(x => if (x < 0) "." else x).mkString("/")
        }
139
        if (genotype.hasAD) values += "AD-" + sample -> List(genotype.getAD: _*).mkString(commandArgs.listSeparator)
140
141
        if (genotype.hasDP) values += "DP-" + sample -> genotype.getDP
        if (genotype.hasGQ) values += "GQ-" + sample -> genotype.getGQ
142
        if (genotype.hasPL) values += "PL-" + sample -> List(genotype.getPL: _*).mkString(commandArgs.listSeparator)
Peter van 't Hof's avatar
Peter van 't Hof committed
143
        for ((field, content) <- genotype.getExtendedAttributes) {
Peter van 't Hof's avatar
Peter van 't Hof committed
144
          values += sample + "-" + field -> content
Peter van 't Hof's avatar
Peter van 't Hof committed
145
146
        }
      }
147
      val line = for (f <- sortedFields) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
148
149
        if (values.contains(f)) {
          values(f)
Peter van 't Hof's avatar
Peter van 't Hof committed
150
151
        } else ""
      }
152
      writer.println(line.mkString(commandArgs.separator))
Peter van 't Hof's avatar
Peter van 't Hof committed
153
154
155
    }
  }
}