AnnotateVcfWithBed.scala 5.03 KB
Newer Older
bow's avatar
bow committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
16 17 18 19 20
package nl.lumc.sasc.biopet.tools

import java.io.File

import htsjdk.variant.variantcontext.VariantContextBuilder
Peter van 't Hof's avatar
Peter van 't Hof committed
21 22 23
import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder }
import htsjdk.variant.vcf.{ VCFFileReader, VCFHeaderLineCount, VCFHeaderLineType, VCFInfoHeaderLine }
import nl.lumc.sasc.biopet.core.ToolCommand
Peter van 't Hof's avatar
Peter van 't Hof committed
24
import nl.lumc.sasc.biopet.utils.intervals.{ BedRecord, BedRecordList }
Peter van 't Hof's avatar
Peter van 't Hof committed
25

Peter van 't Hof's avatar
Peter van 't Hof committed
26 27 28 29 30 31 32 33
import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.io.Source

class AnnotateVcfWithBed {
  // TODO: Queue wrapper
}

Peter van 't Hof's avatar
Peter van 't Hof committed
34 35 36 37 38
/**
 * This a tools to annotate a vcf file with values from a bed file
 *
 * Created by pjvan_thof on 1/10/15.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
39 40
object AnnotateVcfWithBed extends ToolCommand {

Peter van 't Hof's avatar
Peter van 't Hof committed
41 42 43 44 45 46 47 48 49
  /**
   * Args for the commandline tool
   * @param inputFile input vcf file
   * @param bedFile bed file to annotate to vcf file
   * @param outputFile output vcf file
   * @param fieldName Info field that should be used
   * @param fieldDescription Description at field if needed
   * @param fieldType Type of filed, can be: "Integer", "Flag", "Character", "Float"
   */
Peter van 't Hof's avatar
Peter van 't Hof committed
50 51 52 53 54 55 56 57
  case class Args(inputFile: File = null,
                  bedFile: File = null,
                  outputFile: File = null,
                  fieldName: String = null,
                  fieldDescription: String = "",
                  fieldType: String = "String") extends AbstractArgs

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
58
    opt[File]('I', "inputFile") required () unbounded () valueName "<vcf file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
59
      c.copy(inputFile = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
60 61
    } text "Input is a required file property"
    opt[File]('B', "bedFile") required () unbounded () valueName "<bed file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
62
      c.copy(bedFile = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
63 64
    } text "Bedfile is a required file property"
    opt[File]('o', "output") required () unbounded () valueName "<vcf file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
65
      c.copy(outputFile = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
66 67
    } text "out is a required file property"
    opt[String]('f', "fieldName") required () unbounded () valueName "<name of field in vcf file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
68
      c.copy(fieldName = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
69 70
    } text "Name of info field in new vcf file"
    opt[String]('d', "fieldDescription") unbounded () valueName "<name of field in vcf file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
71
      c.copy(fieldDescription = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
72 73
    } text "Description of field in new vcf file"
    opt[String]('t', "fieldType") unbounded () valueName "<name of field in vcf file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
74
      c.copy(fieldType = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
75
    } text "Description of field in new vcf file"
Peter van 't Hof's avatar
Peter van 't Hof committed
76 77
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
78
  /**
Peter van 't Hof's avatar
Peter van 't Hof committed
79 80
   * Program will Annotate a vcf file with the overlapping regions of a bed file,
   * 4e column of the bed file we in a info tag in the vcf file
Peter van 't Hof's avatar
Peter van 't Hof committed
81
   */
Peter van 't Hof's avatar
Peter van 't Hof committed
82
  def main(args: Array[String]): Unit = {
83 84 85

    logger.info("Start")

Peter van 't Hof's avatar
Peter van 't Hof committed
86
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
87
    val cmdArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
Peter van 't Hof's avatar
Peter van 't Hof committed
88

Peter van 't Hof's avatar
Peter van 't Hof committed
89
    val fieldType = cmdArgs.fieldType match {
90 91 92 93 94 95 96 97
      case "Integer"   => VCFHeaderLineType.Integer
      case "Flag"      => VCFHeaderLineType.Flag
      case "Character" => VCFHeaderLineType.Character
      case "Float"     => VCFHeaderLineType.Float
      case _           => VCFHeaderLineType.String
    }

    logger.info("Reading bed file")
98
    val bedRecords = BedRecordList.fromFile(cmdArgs.bedFile).sorted
Peter van 't Hof's avatar
Peter van 't Hof committed
99

100 101
    logger.info("Starting output file")

Peter van 't Hof's avatar
Peter van 't Hof committed
102
    val reader = new VCFFileReader(cmdArgs.inputFile, false)
Peter van 't Hof's avatar
Peter van 't Hof committed
103 104
    val header = reader.getFileHeader

Sander Bollen's avatar
Sander Bollen committed
105
    val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().
Peter van 't Hof's avatar
Peter van 't Hof committed
106
      setOutputFile(cmdArgs.outputFile).
Sander Bollen's avatar
Sander Bollen committed
107 108 109
      setReferenceDictionary(header.getSequenceDictionary).
      build)

Peter van 't Hof's avatar
Peter van 't Hof committed
110 111
    header.addMetaDataLine(new VCFInfoHeaderLine(cmdArgs.fieldName,
      VCFHeaderLineCount.UNBOUNDED, fieldType, cmdArgs.fieldDescription))
Peter van 't Hof's avatar
Peter van 't Hof committed
112 113
    writer.writeHeader(header)

114 115
    logger.info("Start reading vcf records")

Peter van 't Hof's avatar
Peter van 't Hof committed
116
    for (record <- reader) {
Peter van 't Hof's avatar
Peter van 't Hof committed
117
      val overlaps = bedRecords.overlapWith(BedRecord(record.getContig, record.getStart, record.getEnd))
Peter van 't Hof's avatar
Peter van 't Hof committed
118 119 120 121
      if (overlaps.isEmpty) {
        writer.add(record)
      } else {
        val builder = new VariantContextBuilder(record)
Peter van 't Hof's avatar
Peter van 't Hof committed
122
        if (fieldType == VCFHeaderLineType.Flag) builder.attribute(cmdArgs.fieldName, true)
Peter van 't Hof's avatar
Peter van 't Hof committed
123
        else builder.attribute(cmdArgs.fieldName, overlaps.map(_.name).mkString(","))
Peter van 't Hof's avatar
Peter van 't Hof committed
124 125 126
        writer.add(builder.make)
      }
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
127 128
    reader.close()
    writer.close()
129 130

    logger.info("Done")
Peter van 't Hof's avatar
Peter van 't Hof committed
131 132
  }
}