CleverFixVCF.scala 6.52 KB
Newer Older
1
2
3
4
5
6
package nl.lumc.sasc.biopet.extensions.clever

/**
 * Created by wyleung on 4-4-16.
 */

Wai Yi Leung's avatar
Style    
Wai Yi Leung committed
7
import java.io.{ File, PrintWriter }
8
9

import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
10
import nl.lumc.sasc.biopet.utils.ToolCommand
11
import nl.lumc.sasc.biopet.utils.config.Configurable
Wai Yi Leung's avatar
Style    
Wai Yi Leung committed
12
import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output }
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48

import scala.io.Source

class CleverFixVCF(val root: Configurable) extends BiopetJavaCommandLineFunction {
  javaMainClass = getClass.getName
  @Input(doc = "Input Clever VCF")
  var input: File = _

  @Output(doc = "Output fixed VCF")
  var output: File = _

  @Argument(doc = "Samplename")
  var sampleName: String = _

  override def cmdLine = super.cmdLine +
    required("-i", input) +
    required("-o", output) +
    required("-s", sampleName)
}

object CleverFixVCF extends ToolCommand {
  case class Args(inputVCF: File = null, sampleLabel: String = "",
                  outputVCF: File = null) extends AbstractArgs

  class OptParser extends AbstractOptParser {
    opt[File]('i', "inputvcf") required () valueName "<vcffile/path>" action { (x, c) =>
      c.copy(inputVCF = x)
    } text "Please specify the input Clever VCF file"
    opt[String]('s', "samplelabel") valueName "<sample label>" action { (x, c) =>
      c.copy(sampleLabel = x)
    } text "Sample label is missing"
    opt[File]('o', "outputvcf") valueName "<output>" action { (x, c) =>
      c.copy(outputVCF = x)
    } text "Output path is missing"
  }

49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
  def replaceHeaderLine(inHeaderLine: String, toCheckFor: String, replacement: String, extraHeader: String): String = {
    (inHeaderLine == toCheckFor) match {
      case true => {
        extraHeader + "\n" + replacement + "\n"
      }
      case _ => {
        // We have to deal with matching records
        // these don't start with #

        inHeaderLine.startsWith("#") match {
          case true =>
            inHeaderLine + "\n"
          case _ => {
            // this should be a record
            // Ensure the REF field is at least an N
            val cols = inHeaderLine.split("\t")
            cols(3) = "N"
            cols.mkString("\t") + "\n"
          }
        }
      }
    }
  }
72

73
  val extraHeader = """##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">
##INFO=<ID=NOVEL,Number=0,Type=Flag,Description="Indicates a novel structural variation">
##INFO=<ID=SVEND,Number=1,Type=Integer,Description="End position of the variant described in this record">
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">
##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">
##INFO=<ID=SVLEN,Number=.,Type=Integer,Description="Difference in length between REF and ALT alleles">
##INFO=<ID=CIPOS,Number=2,Type=Integer,Description="Confidence interval around POS for imprecise variants">
##INFO=<ID=CIEND,Number=2,Type=Integer,Description="Confidence interval around END for imprecise variants">
##INFO=<ID=HOMLEN,Number=.,Type=Integer,Description="Length of base pair identical micro-homology at event breakpoints">
##INFO=<ID=HOMSEQ,Number=.,Type=String,Description="Sequence of base pair identical micro-homology at event breakpoints">
##INFO=<ID=BKPTID,Number=.,Type=String,Description="ID of the assembled alternate allele in the assembly file">
##INFO=<ID=MEINFO,Number=4,Type=String,Description="Mobile element info of the form NAME,START,END,POLARITY">
##INFO=<ID=METRANS,Number=4,Type=String,Description="Mobile element transduction info of the form CHR,START,END,POLARITY">
##INFO=<ID=DGVID,Number=1,Type=String,Description="ID of this element in Database of Genomic Variation">
##INFO=<ID=DBVARID,Number=1,Type=String,Description="ID of this element in DBVAR">
##INFO=<ID=DBRIPID,Number=1,Type=String,Description="ID of this element in DBRIP">
##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends">
##INFO=<ID=PARID,Number=1,Type=String,Description="ID of partner breakend">
##INFO=<ID=EVENT,Number=1,Type=String,Description="ID of event associated to breakend">
95
##INFO=<ID=BPWINDOW,Number=2,Type=Integer,Description="Window of breakpoints">
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
##INFO=<ID=CILEN,Number=2,Type=Integer,Description="Confidence interval around the inserted material between breakends">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Read Depth of segment containing breakend">
##INFO=<ID=DPADJ,Number=.,Type=Integer,Description="Read Depth of adjacency">
##INFO=<ID=CN,Number=1,Type=Integer,Description="Copy number of segment containing breakend">
##INFO=<ID=CNADJ,Number=.,Type=Integer,Description="Copy number of adjacency">
##INFO=<ID=CICN,Number=2,Type=Integer,Description="Confidence interval around copy number for the segment">
##INFO=<ID=CICNADJ,Number=.,Type=Integer,Description="Confidence interval around copy number for the adjacency">
##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events">
##FORMAT=<ID=CNQ,Number=1,Type=Float,Description="Copy number genotype quality for imprecise events">
##FORMAT=<ID=CNL,Number=.,Type=Float,Description="Copy number genotype likelihood for imprecise events">
##FORMAT=<ID=NQ,Number=1,Type=Integer,Description="Phred style probability score that the variant is novel">
##FORMAT=<ID=HAP,Number=1,Type=Integer,Description="Unique haplotype identifier">
##FORMAT=<ID=AHAP,Number=1,Type=Integer,Description="Unique identifier of ancestral haplotype">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">"""

113
114
115
116
117
118
119
120
  val vcfColHeader = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tdefault"

  val vcfColReplacementHeader = s"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t"
  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
121
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
122
123
124
125

    val input: File = commandArgs.inputVCF
    val output: File = commandArgs.outputVCF

126
127
    val inputVCF = Source.fromFile(input)
    val writer = new PrintWriter(output)
Wai Yi Leung's avatar
Style    
Wai Yi Leung committed
128
129
130
    inputVCF.getLines().foreach(x =>
      writer.write(replaceHeaderLine(x, vcfColHeader, vcfColReplacementHeader + commandArgs.sampleLabel, extraHeader))
    )
131
132
133
134
135
    writer.close()
    inputVCF.close()
  }
}