AnnotateVcfWithBed.scala 6.05 KB
Newer Older
bow's avatar
bow committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
16 17 18 19 20
package nl.lumc.sasc.biopet.tools

import java.io.File

import htsjdk.variant.variantcontext.VariantContextBuilder
Peter van 't Hof's avatar
Peter van 't Hof committed
21 22 23 24
import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder }
import htsjdk.variant.vcf.{ VCFFileReader, VCFHeaderLineCount, VCFHeaderLineType, VCFInfoHeaderLine }
import nl.lumc.sasc.biopet.core.ToolCommand

Peter van 't Hof's avatar
Peter van 't Hof committed
25 26 27 28 29 30 31 32
import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.io.Source

class AnnotateVcfWithBed {
  // TODO: Queue wrapper
}

Peter van 't Hof's avatar
Peter van 't Hof committed
33 34 35 36 37
/**
 * This a tools to annotate a vcf file with values from a bed file
 *
 * Created by pjvan_thof on 1/10/15.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
38 39
object AnnotateVcfWithBed extends ToolCommand {

Peter van 't Hof's avatar
Peter van 't Hof committed
40 41 42 43 44 45 46 47 48
  /**
   * Args for the commandline tool
   * @param inputFile input vcf file
   * @param bedFile bed file to annotate to vcf file
   * @param outputFile output vcf file
   * @param fieldName Info field that should be used
   * @param fieldDescription Description at field if needed
   * @param fieldType Type of filed, can be: "Integer", "Flag", "Character", "Float"
   */
Peter van 't Hof's avatar
Peter van 't Hof committed
49 50 51 52 53 54 55 56
  case class Args(inputFile: File = null,
                  bedFile: File = null,
                  outputFile: File = null,
                  fieldName: String = null,
                  fieldDescription: String = "",
                  fieldType: String = "String") extends AbstractArgs

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
57
    opt[File]('I', "inputFile") required () unbounded () valueName "<vcf file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
58
      c.copy(inputFile = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
59 60
    } text "Input is a required file property"
    opt[File]('B', "bedFile") required () unbounded () valueName "<bed file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
61
      c.copy(bedFile = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
62 63
    } text "Bedfile is a required file property"
    opt[File]('o', "output") required () unbounded () valueName "<vcf file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
64
      c.copy(outputFile = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
65 66
    } text "out is a required file property"
    opt[String]('f', "fieldName") required () unbounded () valueName "<name of field in vcf file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
67
      c.copy(fieldName = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
68 69
    } text "Name of info field in new vcf file"
    opt[String]('d', "fieldDescription") unbounded () valueName "<name of field in vcf file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
70
      c.copy(fieldDescription = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
71 72
    } text "Description of field in new vcf file"
    opt[String]('t', "fieldType") unbounded () valueName "<name of field in vcf file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
73
      c.copy(fieldType = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
74
    } text "Description of field in new vcf file"
Peter van 't Hof's avatar
Peter van 't Hof committed
75 76
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
77
  /**
Peter van 't Hof's avatar
Peter van 't Hof committed
78 79
   * Program will Annotate a vcf file with the overlapping regions of a bed file,
   * 4e column of the bed file we in a info tag in the vcf file
Peter van 't Hof's avatar
Peter van 't Hof committed
80
   */
Peter van 't Hof's avatar
Peter van 't Hof committed
81
  def main(args: Array[String]): Unit = {
82 83 84

    logger.info("Start")

Peter van 't Hof's avatar
Peter van 't Hof committed
85 86 87 88 89 90 91 92 93 94 95 96
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)

    val bedRecords: mutable.Map[String, List[(Int, Int, String)]] = mutable.Map()
    // Read bed file
    /*
    // function bedRecord.getName will not compile, not clear why
    for (bedRecord <- asScalaIteratorConverter(getFeatureReader(commandArgs.bedFile.toPath.toString, new BEDCodec(), false).iterator()).asScala) {
      logger.debug(bedRecord)
      bedRecords(bedRecord.getChr) = (bedRecord.getStart, bedRecord.getEnd, bedRecord.getName) :: bedRecords.getOrElse(bedRecord.getChr, Nil)
    }
    */
97 98 99 100 101 102 103 104 105 106 107

    val fieldType = commandArgs.fieldType match {
      case "Integer"   => VCFHeaderLineType.Integer
      case "Flag"      => VCFHeaderLineType.Flag
      case "Character" => VCFHeaderLineType.Character
      case "Float"     => VCFHeaderLineType.Float
      case _           => VCFHeaderLineType.String
    }

    logger.info("Reading bed file")

Peter van 't Hof's avatar
Peter van 't Hof committed
108 109 110 111
    for (line <- Source.fromFile(commandArgs.bedFile).getLines()) {
      val values = line.split("\t")
      if (values.size >= 4)
        bedRecords(values(0)) = (values(1).toInt, values(2).toInt, values(3)) :: bedRecords.getOrElse(values(0), Nil)
Peter van 't Hof's avatar
Peter van 't Hof committed
112
      else values.size >= 3 && fieldType == VCFHeaderLineType.Flag
113
      bedRecords(values(0)) = (values(1).toInt, values(2).toInt, "") :: bedRecords.getOrElse(values(0), Nil)
Peter van 't Hof's avatar
Peter van 't Hof committed
114 115
    }

116 117
    logger.info("Sorting bed records")

Peter van 't Hof's avatar
Peter van 't Hof committed
118 119
    // Sort records when needed
    for ((chr, record) <- bedRecords) {
Peter van 't Hof's avatar
Peter van 't Hof committed
120
      bedRecords(chr) = record.sortBy(x => (x._1, x._2))
Peter van 't Hof's avatar
Peter van 't Hof committed
121 122
    }

123 124
    logger.info("Starting output file")

Peter van 't Hof's avatar
Peter van 't Hof committed
125 126 127
    val reader = new VCFFileReader(commandArgs.inputFile, false)
    val header = reader.getFileHeader

Sander Bollen's avatar
Sander Bollen committed
128 129 130 131 132
    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().
      setOutputFile(commandArgs.outputFile).
      setReferenceDictionary(header.getSequenceDictionary).
      build)

Peter van 't Hof's avatar
Peter van 't Hof committed
133 134 135 136
    header.addMetaDataLine(new VCFInfoHeaderLine(commandArgs.fieldName,
      VCFHeaderLineCount.UNBOUNDED, fieldType, commandArgs.fieldDescription))
    writer.writeHeader(header)

137 138
    logger.info("Start reading vcf records")

Peter van 't Hof's avatar
Peter van 't Hof committed
139 140 141 142 143 144 145 146
    for (record <- reader) {
      val overlaps = bedRecords.getOrElse(record.getChr, Nil).filter(x => {
        record.getStart <= x._2 && record.getEnd >= x._1
      })
      if (overlaps.isEmpty) {
        writer.add(record)
      } else {
        val builder = new VariantContextBuilder(record)
147 148
        if (fieldType == VCFHeaderLineType.Flag) builder.attribute(commandArgs.fieldName, true)
        else builder.attribute(commandArgs.fieldName, overlaps.map(_._3).mkString(","))
Peter van 't Hof's avatar
Peter van 't Hof committed
149 150 151
        writer.add(builder.make)
      }
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
152 153
    reader.close()
    writer.close()
154 155

    logger.info("Done")
Peter van 't Hof's avatar
Peter van 't Hof committed
156 157
  }
}