Merge branch 'feature-breakdancer' into 'develop'

Feature breakdancer Breakdancer tool is now "feature complete", including the conversion to vcf. Optimalisation can be scheduled to the next milestone. See merge request !23

Merge branch 'feature-breakdancer' into 'develop'
063c377b · Peter van 't Hof · 25a8538c · b3957406 · 063c377b · 063c377b
Commit 063c377b authored 10 years ago by Peter van 't Hof
--- a/biopet-framework/pom.xml
+++ b/biopet-framework/pom.xml
@@ -105,7 +105,7 @@
                    <prefix>git</prefix>
                    <dateFormat>dd.MM.yyyy '@' HH:mm:ss z</dateFormat>
                    <verbose>false</verbose>
-                    <useNativeGit>false</useNativeGit>
+                    <useNativeGit>true</useNativeGit>
                    <dotGitDirectory>${project.basedir}/../.git</dotGitDirectory>
                    <skipPoms>false</skipPoms>
                    <generateGitPropertiesFile>true</generateGitPropertiesFile>

--- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/Breakdancer.scala
+++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/Breakdancer.scala
@@ -8,7 +8,6 @@ import nl.lumc.sasc.biopet.core.config.Configurable
 import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument }
 import java.io.File

-
 class BreakdancerConfig(val root: Configurable) extends BiopetCommandLineFunction {
  executable = config("exe", default = "bam2cfg.pl", freeVar = false)

@@ -23,22 +22,22 @@ class BreakdancerConfig(val root: Configurable) extends BiopetCommandLineFunctio
  var min_insertsize: Option[Int] = config("min_insertsize", default = 450)
  var solid_data: Boolean = config("solid", default = false)
  var sd_cutoff: Option[Int] = config("sd_cutoff", default = 4) // Cutoff in unit of standard deviation [4]
-  
+
  // we set this to a higher number to avoid biases in small numbers in sorted bams
  var min_observations: Option[Int] = config("min_observations", default = 10000) //  Number of observation required to estimate mean and s.d. insert size [10_000]
  var coefvar_cutoff: Option[Int] = config("coef_cutoff", default = 1) // Cutoff on coefficients of variation [1]
  var histogram_bins: Option[Int] = config("histogram_bins", default = 50) // Number of bins in the histogram [50]
-  
+
  def cmdLine = required(executable) +
-      optional("-q", min_mq) +
-      conditional(use_mq, "-m") +
-      optional("-s", min_insertsize) +
-      conditional(solid_data, "-s") +
-      optional("-c", sd_cutoff) +
-      optional("-n", min_observations) +
-      optional("-v", coefvar_cutoff) +
-      optional("-b", histogram_bins) +
-      required(input) + " 1> " + required(output)
+    optional("-q", min_mq) +
+    conditional(use_mq, "-m") +
+    optional("-s", min_insertsize) +
+    conditional(solid_data, "-s") +
+    optional("-c", sd_cutoff) +
+    optional("-n", min_observations) +
+    optional("-v", coefvar_cutoff) +
+    optional("-b", histogram_bins) +
+    required(input) + " 1> " + required(output)
 }

 object BreakdancerConfig {
@@ -62,35 +61,30 @@ object BreakdancerConfig {
  private def swapExtension(inputFile: String) = inputFile.substring(0, inputFile.lastIndexOf(".bam")) + ".breakdancer.cfg"
 }

-
-
-
 /*
 * The caller
 * 
 **/

-
-class BreakdancerCaller(val root: Configurable) extends BiopetCommandLineFunction  {
+class BreakdancerCaller(val root: Configurable) extends BiopetCommandLineFunction {
  executable = config("exe", default = "breakdancer-max", freeVar = false)
-  
-  override val defaultVmem = "4G"
+
+  override val defaultVmem = "6G"
  override val defaultThreads = 1 // breakdancer can only work on 1 single thread
-  
+
  override val versionRegex = """.*[Vv]ersion:? (.*)""".r
  override val versionExitcode = List(1)
  override def versionCommand = executable
-  
-  
+
  @Input(doc = "The breakdancer configuration file")
  var input: File = _
-  
-//  @Argument(doc = "Work directory")
-//  var workdir: String = _
-  
-  @Output(doc = "Breakdancer VCF output")
+
+  //  @Argument(doc = "Work directory")
+  //  var workdir: String = _
+
+  @Output(doc = "Breakdancer TSV output")
  var output: File = _
-  
+
  /*
   Options: 
       -o STRING       operate on a single chromosome [all chromosome]
@@ -116,36 +110,36 @@ class BreakdancerCaller(val root: Configurable) extends BiopetCommandLineFunctio
  var q: Option[Int] = config("qs", default = 35)
  var r: Option[Int] = config("r", default = 2)
  var x: Option[Int] = config("x", default = 1000)
-  var b: Option[Int] = config("b", default = 100)  
+  var b: Option[Int] = config("b", default = 100)
  var t: Boolean = config("t", default = false)
  var d: String = config("d")
  var g: String = config("g")
  var l: Boolean = config("l", default = false)
  var a: Boolean = config("a", default = false)
  var h: Boolean = config("h", default = false)
-  var y: Option[Int] = config("y", default = 30)  
-  
+  var y: Option[Int] = config("y", default = 30)
+
  override def beforeCmd {
  }

-  def cmdLine = required(executable) + 
-      optional("-s", s) +
-      optional("-c", c) +
-      optional("-m", m) +
-      optional("-q", q) +
-      optional("-r", r) +
-      optional("-x", x) +
-      optional("-b", b) +
-      conditional(t ,"-t") +
-      optional("-d", d) +
-      optional("-g", g) +
-      conditional(l ,"-l") +
-      conditional(a ,"-a") +
-      conditional(h ,"-h") +
-      optional("-y", y) +
-      required(input) + 
-      ">" + 
-      required(output)
+  def cmdLine = required(executable) +
+    optional("-s", s) +
+    optional("-c", c) +
+    optional("-m", m) +
+    optional("-q", q) +
+    optional("-r", r) +
+    optional("-x", x) +
+    optional("-b", b) +
+    conditional(t, "-t") +
+    optional("-d", d) +
+    optional("-g", g) +
+    conditional(l, "-l") +
+    conditional(a, "-a") +
+    conditional(h, "-h") +
+    optional("-y", y) +
+    required(input) +
+    ">" +
+    required(output)
 }

 object BreakdancerCaller {
@@ -160,21 +154,18 @@ object BreakdancerCaller {
 /// Breakdancer is actually a mini pipeline executing binaries from the breakdancer package
 class Breakdancer(val root: Configurable) extends QScript with BiopetQScript {
  def this() = this(null)
-  
+
  @Input(doc = "Input file (bam)")
  var input: File = _

  @Input(doc = "Reference Fasta file")
  var reference: File = _
-  
+
  @Argument(doc = "Work directory")
  var workdir: String = _
-  
-//  @Output(doc = "Breakdancer VCF output")
-//  lazy val outputvcf: File = {
-//    new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.vcf")
-//  }
-  
+
+  var deps: List[File] = Nil
+
  @Output(doc = "Breakdancer config")
  lazy val configfile: File = {
    new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.cfg")
@@ -183,32 +174,35 @@ class Breakdancer(val root: Configurable) extends QScript with BiopetQScript {
  lazy val outputraw: File = {
    new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.tsv")
  }
-  
+  @Output(doc = "Breakdancer VCF output")
+  lazy val outputvcf: File = {
+    new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.vcf")
+  }
+
  override def init() {
  }

  def biopetScript() {
    // read config and set all parameters for the pipeline
    logger.info("Starting Breakdancer configuration")
-    
+
    val bdcfg = BreakdancerConfig(this, input, this.configfile)
-    outputFiles += ("breakdancer_cfg" -> bdcfg.output )
-    add( bdcfg )
-    
-    val output_tsv: File = this.outputraw
-    val breakdancer = BreakdancerCaller( this, bdcfg.output, output_tsv )
-    add( breakdancer )
-    outputFiles += ("breakdancer_tsv" -> breakdancer.output )
-
-//    val output_vcf: File = this.outputvcf
-    // convert this tsv to vcf using the python script
-    
-    
+    bdcfg.deps = this.deps
+    outputFiles += ("cfg" -> bdcfg.output)
+    add(bdcfg)
+
+    val breakdancer = BreakdancerCaller(this, bdcfg.output, this.outputraw)
+    add(breakdancer)
+    outputFiles += ("tsv" -> breakdancer.output)
+
+    val bdvcf = BreakdancerVCF(this, breakdancer.output, this.outputvcf)
+    add(bdvcf)
+    outputFiles += ("vcf" -> bdvcf.output)
  }
 }

 object Breakdancer extends PipelineCommand {
-  def apply(root: Configurable, input: File, reference: File, runDir: String): Breakdancer = {    
+  def apply(root: Configurable, input: File, reference: File, runDir: String): Breakdancer = {
    val breakdancer = new Breakdancer(root)
    breakdancer.input = input
    breakdancer.reference = reference

--- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/BreakdancerVCF.scala
+++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/BreakdancerVCF.scala
+package nl.lumc.sasc.biopet.extensions.svcallers
+
+import java.io.File
+
+import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
+
+import argonaut._, Argonaut._
+import scalaz._, Scalaz._
+
+import nl.lumc.sasc.biopet.core.config.Configurable
+import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction
+
+class BreakdancerVCF(val root: Configurable) extends PythonCommandLineFunction {
+  setPythonScript("breakdancer2vcf.py")
+
+  @Input(doc = "Breakdancer TSV")
+  var input: File = _
+
+  @Output(doc = "Output VCF to PATH")
+  var output: File = _
+
+  def cmdLine = {
+    getPythonCommand +
+      "-i " + required(input) +
+      "-o " + required(output)
+  }
+}
+
+object BreakdancerVCF {
+  def apply(root: Configurable, input: File, output: File): BreakdancerVCF = {
+    val bd = new BreakdancerVCF(root)
+    bd.input = input
+    bd.output = output
+    return bd
+  }
+}
\ No newline at end of file
--- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/yamsvp/Yamsvp.scala
+++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/yamsvp/Yamsvp.scala
@@ -8,7 +8,6 @@ import nl.lumc.sasc.biopet.core.config.Configurable
 import nl.lumc.sasc.biopet.core.MultiSampleQScript
 import nl.lumc.sasc.biopet.core.PipelineCommand

-import nl.lumc.sasc.biopet.extensions.picard.MergeSamFiles
 import nl.lumc.sasc.biopet.extensions.sambamba.{ SambambaIndex, SambambaMerge }
 import nl.lumc.sasc.biopet.extensions.svcallers.pindel.Pindel
 import nl.lumc.sasc.biopet.extensions.svcallers.{ Breakdancer, Clever }
@@ -84,24 +83,24 @@ class Yamsvp(val root: Configurable) extends QScript with MultiSampleQScript {
        mergeSamFiles.input = libraryBamfiles
        mergeSamFiles.output = alignmentDir + sampleID + ".merged.bam"
        add(mergeSamFiles)
-        
-        val bamIndex = SambambaIndex(root, mergeSamFiles.output)
-        add(bamIndex)
-        
        mergeSamFiles.output
      } else null
+      
+    val bamIndex = SambambaIndex(root, bamFile)
+    add(bamIndex)
    
    /// bamfile will be used as input for the SV callers. First run Clever
    //    val cleverVCF : File = sampleDir + "/" + sampleID + ".clever.vcf"

    val cleverDir = svcallingDir + sampleID + ".clever/"
    val clever = Clever(this, bamFile, this.reference, svcallingDir, cleverDir)
+    clever.deps = List(bamIndex.output)
    sampleOutput.vcf += ("clever" -> List(clever.outputvcf))
    add(clever)

    val breakdancerDir = svcallingDir + sampleID + ".breakdancer/"
    val breakdancer = Breakdancer(this, bamFile, this.reference, breakdancerDir)
-    sampleOutput.vcf += ("breakdancer" -> List(breakdancer.outputraw))
+    sampleOutput.vcf += ("breakdancer" -> List(breakdancer.outputvcf))
    addAll(breakdancer.functions)

    // for pindel we should use per library config collected into one config file

--- a/biopet-framework/src/main/scripts/nl/lumc/sasc/biopet/extensions/svcallers/breakdancer2vcf.py
+++ b/biopet-framework/src/main/scripts/nl/lumc/sasc/biopet/extensions/svcallers/breakdancer2vcf.py
+#!/usr/bin/env python
+
+__copyright__ = """
+Copyright (C) 2013 - Tim te Beek
+Copyright (C) 2013 - Wai Yi Leung
+Copyright (C) 2013 AllBio (see AUTHORS file)
+"""
+
+__desc__ = """Convert breakdancer output to pseudo .vcf file format."""
+__created__ = "Mar 18, 2013"
+__author__ = "tbeek"
+
+import argparse
+import csv
+import os.path
+import sys
+import datetime
+
+
+def main(tsvfile, vcffile):
+    '''
+    :param tsvfile: filename of input file.tsv
+    :type tsvfile: string
+    :param vcffile: filename of output file.vcf
+    :type vcffile: string
+    '''
+    with open(tsvfile) as reader:
+        # Parse file
+        dictreader = _parse_tsvfile(reader)
+        print dictreader.fieldnames
+
+        # Write out file
+        _format_vcffile(dictreader, vcffile)
+
+    # Quick output
+    with open(vcffile) as reader:
+        print reader.read(1000)
+
+
+def _parse_tsvfile(readable):
+    '''
+    Read readable using csv.Sniffer and csv.DictReader
+    :param readable: open file.tsv handle to read with csv.DictReader
+    :type readable: file
+    '''
+    prev, curr = 0, 0
+    while True:
+        line = readable.readline()
+        if not line.startswith('#'):
+            # lets start from prev # line, without the hash sign
+            readable.seek(prev + 1)
+            break
+        else:
+            prev = curr
+            curr = readable.tell()
+
+    # Determine dialect
+    curr = readable.tell()
+    #dialect = csv.Sniffer().sniff(readable.read(3000))
+    dialect = 'excel-tab'
+    readable.seek(curr)
+
+    # Read file
+    dictreader = csv.DictReader(readable, dialect=dialect)
+    return dictreader
+
+
+_tsv_fields = ('Chr1', 'Pos1', 'Orientation1',
+               'Chr2', 'Pos2', 'Orientation2',
+               'Type', 'Size', 'Score',
+               'num_Reads', 'num_Reads_lib',
+               'ERR031544.sort.bam')
+# 'Chr1': '1',
+# 'Pos1': '269907',
+# 'Orientation1': '39+39-',
+# 'Chr2': '1',
+# 'Pos2': '270919',
+# 'Orientation2': '39+39-',
+# 'Type': 'DEL',
+# 'Size': '99',
+# 'Score': '99',
+# 'num_Reads': '38',
+# 'num_Reads_lib': '/home/allbio/ERR031544.sort.bam|38',
+# 'ERR031544.sort.bam': 'NA'
+
+
+
+_vcf_fields = ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'default')
+
+TS_NOW = datetime.datetime.now()
+
+VCF_HEADER = """##fileformat=VCFv4.1
+##fileDate={filedate}
+##source=breakdancer-max
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
+##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Length of variation">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">""".format( filedate=TS_NOW.strftime( "%Y%m%d" ) )
+
+def _format_vcffile(dictreader, vcffile):
+    '''
+    Create a pseudo .vcf file based on values read from DictReader instance.
+    :param dictreader: DictReader instance to read data from
+    :type dictreader: csv.DictRedaer
+    :param vcffile: output file.vcf filename
+    :type vcffile: string
+    '''
+    FORMAT = "GT:DP"
+    with open(vcffile, mode='w') as writer:
+        writer.write('{header}\n#{columns}\n'.format(header=VCF_HEADER, columns='\t'.join(_vcf_fields)))
+        output_vcf = []
+        for line in dictreader:
+            CHROM = line['Chr1']
+            # TODO Figure out whether we have zero or one based positioning
+            POS = int(line['Pos1'])
+            ALT = '.'
+            SVEND = int(line['Pos2'])
+
+            INFO = 'PROGRAM=breakdancer;SVTYPE={}'.format(line['Type'])
+
+            if line['Type'] not in ['CTX']:
+                INFO += ';SVLEN={}'.format(int(line['Size']))
+                INFO += ";SVEND={}".format(SVEND)
+            
+            # write alternate ALT field for Intrachromosomal translocations
+            if line['Type'] in ['CTX']:
+                ALT = "N[{}:{}[".format(line['Chr2'], line['Pos2'])
+
+
+
+            SAMPLEINFO = "{}:{}".format( '1/.', line['num_Reads'] )
+            # Create record
+            output_vcf.append([CHROM, POS, '.', '.', ALT, '.', 'PASS', INFO, FORMAT, SAMPLEINFO])
+
+        # Sort all results
+        output_vcf.sort()
+        output = "\n".join(["\t".join(map(str,vcf_row)) for vcf_row in output_vcf])
+        # Write record
+        writer.write(output)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-i', '--breakdancertsv', dest='breakdancertsv', type=str,
+            help='Breakdancer TSV outputfile')
+    parser.add_argument('-o', '--outputvcf', dest='outputvcf', type=str,
+            help='Output vcf to')
+
+    args = parser.parse_args()
+    main(args.breakdancertsv, args.outputvcf)