VcfWithVcf.scala 7.52 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Sander Bollen's avatar
Sander Bollen committed
16 17 18 19
package nl.lumc.sasc.biopet.tools

import java.io.File

Peter van 't Hof's avatar
Peter van 't Hof committed
20
import htsjdk.variant.variantcontext.VariantContextBuilder
21
import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder }
Sander Bollen's avatar
Sander Bollen committed
22 23 24
import htsjdk.variant.vcf._
import nl.lumc.sasc.biopet.core.ToolCommand

Peter van 't Hof's avatar
Peter van 't Hof committed
25 26
import scala.collection.JavaConversions._

Sander Bollen's avatar
Sander Bollen committed
27
/**
Peter van 't Hof's avatar
Peter van 't Hof committed
28 29
 * This is a tool to annotate a vcf file with info value from a other vcf file
 *
Sander Bollen's avatar
Sander Bollen committed
30 31
 * Created by ahbbollen on 11-2-15.
 */
32
object VcfWithVcf extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
33
  case class Fields(inputField: String, outputField: String, fieldMethod: FieldMethod.Value = FieldMethod.none)
34 35 36

  case class Args(inputFile: File = null,
                  outputFile: File = null,
37
                  secondaryVcf: File = null,
38 39
                  fields: List[Fields] = Nil,
                  matchAllele: Boolean = true) extends AbstractArgs
Sander Bollen's avatar
Sander Bollen committed
40

Peter van 't Hof's avatar
Peter van 't Hof committed
41
  object FieldMethod extends Enumeration {
Peter van 't Hof's avatar
Peter van 't Hof committed
42
    val none, max, min, unique = Value
Peter van 't Hof's avatar
Peter van 't Hof committed
43 44
  }

Sander Bollen's avatar
Sander Bollen committed
45
  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
46
    opt[File]('I', "inputFile") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
Sander Bollen's avatar
Sander Bollen committed
47 48
      c.copy(inputFile = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
49
    opt[File]('O', "outputFile") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
Sander Bollen's avatar
Sander Bollen committed
50 51
      c.copy(outputFile = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
52
    opt[File]('S', "secondaryVcf") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
53
      c.copy(secondaryVcf = x)
Sander Bollen's avatar
Sander Bollen committed
54
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
55
    opt[String]('f', "field") unbounded () valueName "<field> or <input_field:output_field> or <input_field:output_field:method>" action { (x, c) =>
56
      val values = x.split(":")
Peter van 't Hof's avatar
Peter van 't Hof committed
57
      if (values.size > 2) c.copy(fields = Fields(values(0), values(1), FieldMethod.withName(values(2))) :: c.fields)
Peter van 't Hof's avatar
Peter van 't Hof committed
58
      else if (values.size > 1) c.copy(fields = Fields(values(0), values(1)) :: c.fields)
59
      else c.copy(fields = Fields(x, x) :: c.fields)
Peter van 't Hof's avatar
Peter van 't Hof committed
60 61 62 63 64 65 66
    } text """| If only <field> is given, the field's identifier in the output VCF will be identical to <field>.
                                                                                                                                                      | By default we will return all values found for a given field.
                                                                                                                                                      | With <method> the values will processed after getting it from the secondary VCF file, posible methods are:
                                                                                                                                                      |   - max   : takes maximum of found value, only works for numeric (integer/float) fields
                                                                                                                                                      |   - min   : takes minemal of found value, only works for numeric (integer/float) fields
                                                                                                                                                      |   - unique: takes only unique values """.stripMargin
    opt[Boolean]("match") valueName "<Boolean>" maxOccurs 1 action { (x, c) =>
67
      c.copy(matchAllele = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
68
    } text "Match alternative alleles; default true"
Sander Bollen's avatar
Sander Bollen committed
69 70 71
  }

  def main(args: Array[String]): Unit = {
Peter van 't Hof's avatar
Peter van 't Hof committed
72 73
    logger.info("Init phase")

Sander Bollen's avatar
Sander Bollen committed
74 75 76 77
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)

    val reader = new VCFFileReader(commandArgs.inputFile)
78
    val secondaryReader = new VCFFileReader(commandArgs.secondaryVcf)
Sander Bollen's avatar
Sander Bollen committed
79 80

    val header = reader.getFileHeader
81
    val secondHeader = secondaryReader.getFileHeader
Sander Bollen's avatar
Sander Bollen committed
82 83 84 85
    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().
      setOutputFile(commandArgs.outputFile).
      setReferenceDictionary(header.getSequenceDictionary).
      build)
Sander Bollen's avatar
Sander Bollen committed
86 87

    for (x <- commandArgs.fields) {
88 89 90 91 92
      if (header.hasInfoLine(x.outputField))
        throw new IllegalArgumentException("Field '" + x.outputField + "' already exist in input vcf")
      if (!secondHeader.hasInfoLine(x.inputField))
        throw new IllegalArgumentException("Field '" + x.inputField + "' does not exist in secondary vcf")

93 94 95 96 97
      val oldHeaderLine = secondHeader.getInfoHeaderLine(x.inputField)

      val newHeaderLine = new VCFInfoHeaderLine(x.outputField, VCFHeaderLineCount.UNBOUNDED,
        oldHeaderLine.getType, oldHeaderLine.getDescription)
      header.addMetaDataLine(newHeaderLine)
Sander Bollen's avatar
Sander Bollen committed
98
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
99 100
    writer.writeHeader(header)

Peter van 't Hof's avatar
Peter van 't Hof committed
101
    logger.info("Start reading records")
Sander Bollen's avatar
Sander Bollen committed
102

Peter van 't Hof's avatar
Peter van 't Hof committed
103
    var counter = 0
104
    for (record <- reader) {
105
      val secondaryRecords = if (commandArgs.matchAllele) {
106
        secondaryReader.query(record.getContig, record.getStart, record.getEnd).toList.
Peter van 't Hof's avatar
Peter van 't Hof committed
107
          filter(x => record.getAlternateAlleles.exists(x.hasAlternateAllele))
108
      } else {
109
        secondaryReader.query(record.getContig, record.getStart, record.getEnd).toList
110 111
      }

112
      val fieldMap = (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
113
        f <- commandArgs.fields if secondaryRecords.exists(_.hasAttribute(f.inputField))
114 115
      ) yield {
        f.outputField -> (for (
Peter van 't Hof's avatar
Peter van 't Hof committed
116
          secondRecord <- secondaryRecords if secondRecord.hasAttribute(f.inputField)
117
        ) yield {
118
          secondRecord.getAttribute(f.inputField) match {
119 120
            case l: List[_] => l
            case x          => List(x)
Sander Bollen's avatar
Sander Bollen committed
121
          }
122 123 124 125
        }).fold(Nil)(_ ::: _)
      }).toMap

      writer.add(fieldMap.foldLeft(new VariantContextBuilder(record))((builder, attribute) => {
Peter van 't Hof's avatar
Peter van 't Hof committed
126
        builder.attribute(attribute._1, commandArgs.fields.filter(_.outputField == attribute._1).head.fieldMethod match {
Peter van 't Hof's avatar
Peter van 't Hof committed
127
          case FieldMethod.max =>
128 129 130 131 132
            header.getInfoHeaderLine(attribute._1).getType match {
              case VCFHeaderLineType.Integer => Array(attribute._2.map(_.toString.toInt).max)
              case VCFHeaderLineType.Float   => Array(attribute._2.map(_.toString.toFloat).max)
              case _                         => throw new IllegalArgumentException("Type of field " + attribute._1 + " is not numeric")
            }
Peter van 't Hof's avatar
Peter van 't Hof committed
133
          case FieldMethod.min =>
Peter van 't Hof's avatar
Peter van 't Hof committed
134 135 136 137 138
            header.getInfoHeaderLine(attribute._1).getType match {
              case VCFHeaderLineType.Integer => Array(attribute._2.map(_.toString.toInt).min)
              case VCFHeaderLineType.Float   => Array(attribute._2.map(_.toString.toFloat).min)
              case _                         => throw new IllegalArgumentException("Type of field " + attribute._1 + " is not numeric")
            }
Peter van 't Hof's avatar
Peter van 't Hof committed
139 140
          case FieldMethod.unique => attribute._2.distinct.toArray
          case _                  => attribute._2.toArray
Peter van 't Hof's avatar
Peter van 't Hof committed
141
        })
142
      }).make())
143

Peter van 't Hof's avatar
Peter van 't Hof committed
144 145 146
      counter += 1
      if (counter % 100000 == 0) {
        logger.info(s"""Processed $counter records""")
147
      }
Sander Bollen's avatar
Sander Bollen committed
148
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
149
    logger.info(s"""Processed $counter records""")
Sander Bollen's avatar
Sander Bollen committed
150

Sander Bollen's avatar
Sander Bollen committed
151
    logger.debug("Closing readers")
Sander Bollen's avatar
Sander Bollen committed
152 153 154
    writer.close()
    reader.close()
    secondaryReader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
155
    logger.info("Done")
Sander Bollen's avatar
Sander Bollen committed
156 157
  }
}