VcfWithVcf.scala 7.52 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Sander Bollen's avatar
Sander Bollen committed
16
17
18
19
package nl.lumc.sasc.biopet.tools

import java.io.File

Peter van 't Hof's avatar
Peter van 't Hof committed
20
import htsjdk.variant.variantcontext.VariantContextBuilder
21
import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder }
Sander Bollen's avatar
Sander Bollen committed
22
23
24
import htsjdk.variant.vcf._
import nl.lumc.sasc.biopet.core.ToolCommand

Peter van 't Hof's avatar
Peter van 't Hof committed
25
26
import scala.collection.JavaConversions._

Sander Bollen's avatar
Sander Bollen committed
27
/**
Peter van 't Hof's avatar
Peter van 't Hof committed
28
29
 * This is a tool to annotate a vcf file with info value from a other vcf file
 *
Sander Bollen's avatar
Sander Bollen committed
30
31
 * Created by ahbbollen on 11-2-15.
 */
32
object VcfWithVcf extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
33
  case class Fields(inputField: String, outputField: String, fieldMethod: FieldMethod.Value = FieldMethod.none)
34
35
36

  case class Args(inputFile: File = null,
                  outputFile: File = null,
37
                  secondaryVcf: File = null,
38
39
                  fields: List[Fields] = Nil,
                  matchAllele: Boolean = true) extends AbstractArgs
Sander Bollen's avatar
Sander Bollen committed
40

Peter van 't Hof's avatar
Peter van 't Hof committed
41
  object FieldMethod extends Enumeration {
Peter van 't Hof's avatar
Peter van 't Hof committed
42
    val none, max, min, unique = Value
Peter van 't Hof's avatar
Peter van 't Hof committed
43
44
  }

Sander Bollen's avatar
Sander Bollen committed
45
  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
46
    opt[File]('I', "inputFile") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
Sander Bollen's avatar
Sander Bollen committed
47
48
      c.copy(inputFile = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
49
    opt[File]('O', "outputFile") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
Sander Bollen's avatar
Sander Bollen committed
50
51
      c.copy(outputFile = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
52
    opt[File]('S', "secondaryVcf") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
53
      c.copy(secondaryVcf = x)
Sander Bollen's avatar
Sander Bollen committed
54
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
55
    opt[String]('f', "field") unbounded () valueName "<field> or <input_field:output_field> or <input_field:output_field:method>" action { (x, c) =>
56
      val values = x.split(":")
Peter van 't Hof's avatar
Peter van 't Hof committed
57
      if (values.size > 2) c.copy(fields = Fields(values(0), values(1), FieldMethod.withName(values(2))) :: c.fields)
Peter van 't Hof's avatar
Peter van 't Hof committed
58
      else if (values.size > 1) c.copy(fields = Fields(values(0), values(1)) :: c.fields)
59
      else c.copy(fields = Fields(x, x) :: c.fields)
Peter van 't Hof's avatar
Peter van 't Hof committed
60
61
62
63
64
65
66
    } text """| If only <field> is given, the field's identifier in the output VCF will be identical to <field>.
                                                                                                                                                      | By default we will return all values found for a given field.
                                                                                                                                                      | With <method> the values will processed after getting it from the secondary VCF file, posible methods are:
                                                                                                                                                      |   - max   : takes maximum of found value, only works for numeric (integer/float) fields
                                                                                                                                                      |   - min   : takes minemal of found value, only works for numeric (integer/float) fields
                                                                                                                                                      |   - unique: takes only unique values """.stripMargin
    opt[Boolean]("match") valueName "<Boolean>" maxOccurs 1 action { (x, c) =>
67
      c.copy(matchAllele = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
68
    } text "Match alternative alleles; default true"
Sander Bollen's avatar
Sander Bollen committed
69
70
71
  }

  def main(args: Array[String]): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
72
73
    logger.info("Init phase")

Sander Bollen's avatar
Sander Bollen committed
74
75
76
77
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)

    val reader = new VCFFileReader(commandArgs.inputFile)
78
    val secondaryReader = new VCFFileReader(commandArgs.secondaryVcf)
Sander Bollen's avatar
Sander Bollen committed
79
80

    val header = reader.getFileHeader
81
    val secondHeader = secondaryReader.getFileHeader
Sander Bollen's avatar
Sander Bollen committed
82
83
84
85
    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().
      setOutputFile(commandArgs.outputFile).
      setReferenceDictionary(header.getSequenceDictionary).
      build)
Sander Bollen's avatar
Sander Bollen committed
86
87

    for (x <- commandArgs.fields) {
88
89
90
91
92
      if (header.hasInfoLine(x.outputField))
        throw new IllegalArgumentException("Field '" + x.outputField + "' already exist in input vcf")
      if (!secondHeader.hasInfoLine(x.inputField))
        throw new IllegalArgumentException("Field '" + x.inputField + "' does not exist in secondary vcf")

93
94
95
96
97
      val oldHeaderLine = secondHeader.getInfoHeaderLine(x.inputField)

      val newHeaderLine = new VCFInfoHeaderLine(x.outputField, VCFHeaderLineCount.UNBOUNDED,
        oldHeaderLine.getType, oldHeaderLine.getDescription)
      header.addMetaDataLine(newHeaderLine)
Sander Bollen's avatar
Sander Bollen committed
98
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
99
100
    writer.writeHeader(header)

Peter van 't Hof's avatar
Peter van 't Hof committed
101
    logger.info("Start reading records")
Sander Bollen's avatar
Sander Bollen committed
102

Peter van 't Hof's avatar
Peter van 't Hof committed
103
    var counter = 0
104
    for (record <- reader) {
105
      val secondaryRecords = if (commandArgs.matchAllele) {
106
        secondaryReader.query(record.getContig, record.getStart, record.getEnd).toList.
Peter van 't Hof's avatar
Peter van 't Hof committed
107
          filter(x => record.getAlternateAlleles.exists(x.hasAlternateAllele))
108
      } else {
109
        secondaryReader.query(record.getContig, record.getStart, record.getEnd).toList
110
111
      }

112
      val fieldMap = (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
113
        f <- commandArgs.fields if secondaryRecords.exists(_.hasAttribute(f.inputField))
114
115
      ) yield {
        f.outputField -> (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
116
          secondRecord <- secondaryRecords if secondRecord.hasAttribute(f.inputField)
117
        ) yield {
118
          secondRecord.getAttribute(f.inputField) match {
119
120
            case l: List[_] => l
            case x          => List(x)
Sander Bollen's avatar
Sander Bollen committed
121
          }
122
123
124
125
        }).fold(Nil)(_ ::: _)
      }).toMap

      writer.add(fieldMap.foldLeft(new VariantContextBuilder(record))((builder, attribute) => {
Peter van 't Hof's avatar
Peter van 't Hof committed
126
        builder.attribute(attribute._1, commandArgs.fields.filter(_.outputField == attribute._1).head.fieldMethod match {
Peter van 't Hof's avatar
Peter van 't Hof committed
127
          case FieldMethod.max =>
128
129
130
131
132
            header.getInfoHeaderLine(attribute._1).getType match {
              case VCFHeaderLineType.Integer => Array(attribute._2.map(_.toString.toInt).max)
              case VCFHeaderLineType.Float   => Array(attribute._2.map(_.toString.toFloat).max)
              case _                         => throw new IllegalArgumentException("Type of field " + attribute._1 + " is not numeric")
            }
Peter van 't Hof's avatar
Peter van 't Hof committed
133
          case FieldMethod.min =>
Peter van 't Hof's avatar
Peter van 't Hof committed
134
135
136
137
138
            header.getInfoHeaderLine(attribute._1).getType match {
              case VCFHeaderLineType.Integer => Array(attribute._2.map(_.toString.toInt).min)
              case VCFHeaderLineType.Float   => Array(attribute._2.map(_.toString.toFloat).min)
              case _                         => throw new IllegalArgumentException("Type of field " + attribute._1 + " is not numeric")
            }
Peter van 't Hof's avatar
Peter van 't Hof committed
139
140
          case FieldMethod.unique => attribute._2.distinct.toArray
          case _                  => attribute._2.toArray
Peter van 't Hof's avatar
Peter van 't Hof committed
141
        })
142
      }).make())
143

Peter van 't Hof's avatar
Peter van 't Hof committed
144
145
146
      counter += 1
      if (counter % 100000 == 0) {
        logger.info(s"""Processed $counter records""")
147
      }
Sander Bollen's avatar
Sander Bollen committed
148
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
149
    logger.info(s"""Processed $counter records""")
Sander Bollen's avatar
Sander Bollen committed
150

Sander Bollen's avatar
Sander Bollen committed
151
    logger.debug("Closing readers")
Sander Bollen's avatar
Sander Bollen committed
152
153
154
    writer.close()
    reader.close()
    secondaryReader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
155
    logger.info("Done")
Sander Bollen's avatar
Sander Bollen committed
156
157
  }
}