Adding breakdancer python conversion

3361016a · wyleung · 8cfa3ebd · 3361016a · 3361016a · 3361016a
Commit 3361016a authored 10 years ago by wyleung
--- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/Breakdancer.scala
+++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/Breakdancer.scala
@@ -8,7 +8,6 @@ import nl.lumc.sasc.biopet.core.config.Configurable
 import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument }
 import java.io.File

-
 class BreakdancerConfig(val root: Configurable) extends BiopetCommandLineFunction {
  executable = config("exe", default = "bam2cfg.pl", freeVar = false)

@@ -74,7 +73,7 @@ object BreakdancerConfig {
 class BreakdancerCaller(val root: Configurable) extends BiopetCommandLineFunction  {
  executable = config("exe", default = "breakdancer-max", freeVar = false)
  
-  override val defaultVmem = "4G"
+  override val defaultVmem = "6G"
  override val defaultThreads = 1 // breakdancer can only work on 1 single thread
  
  override val versionRegex = """.*[Vv]ersion:? (.*)""".r
@@ -170,10 +169,12 @@ class Breakdancer(val root: Configurable) extends QScript with BiopetQScript {
  @Argument(doc = "Work directory")
  var workdir: String = _
  
-//  @Output(doc = "Breakdancer VCF output")
-//  lazy val outputvcf: File = {
-//    new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.vcf")
-//  }
+  var deps : List[File] = Nil
+  
+  @Output(doc = "Breakdancer VCF output")
+  lazy val outputvcf: File = {
+    new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.vcf")
+  }
  
  @Output(doc = "Breakdancer config")
  lazy val configfile: File = {
@@ -192,6 +193,7 @@ class Breakdancer(val root: Configurable) extends QScript with BiopetQScript {
    logger.info("Starting Breakdancer configuration")
    
    val bdcfg = BreakdancerConfig(this, input, this.configfile)
+    bdcfg.deps = this.deps
    outputFiles += ("breakdancer_cfg" -> bdcfg.output )
    add( bdcfg )
    
@@ -200,10 +202,9 @@ class Breakdancer(val root: Configurable) extends QScript with BiopetQScript {
    add( breakdancer )
    outputFiles += ("breakdancer_tsv" -> breakdancer.output )

-//    val output_vcf: File = this.outputvcf
-    // convert this tsv to vcf using the python script
-    
-    
+    val bdvcf = BreakdancerVCF( this, breakdancer.output, this.outputvcf )
+    add( bdvcf )
+    outputFiles += ("breakdancer_vcf" -> bdvcf.output )
  }
 }


--- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/BreakdancerVCF.scala
+++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/BreakdancerVCF.scala
+/*
+ * Copyright 2014 wyleung.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package nl.lumc.sasc.biopet.extensions.svcallers
+
+import java.io.File
+
+import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
+
+import argonaut._, Argonaut._
+import scalaz._, Scalaz._
+
+import nl.lumc.sasc.biopet.core.config.Configurable
+import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction
+
+class BreakdancerVCF(val root: Configurable) extends PythonCommandLineFunction {
+  setPythonScript("breakdancer2vcf.py")
+
+  @Input(doc = "Breakdancer TSV")
+  var input: File = _
+
+  @Output(doc = "Output VCF to PATH")
+  var output: File = _
+
+  def cmdLine = {
+    getPythonCommand +
+    "-i " + required(input) +
+    "-o " + required(output)
+  }
+}
+
+object BreakdancerVCF {
+  def apply(root: Configurable, input:File, output:File): BreakdancerVCF = {
+    val bd = new BreakdancerVCF(root)
+    bd.input =input
+    bd.output = output
+    return bd
+  }
+}
\ No newline at end of file
--- a/biopet-framework/src/main/scripts/nl/lumc/sasc/biopet/extension/svcallers/breakdancer2vcf.py
+++ b/biopet-framework/src/main/scripts/nl/lumc/sasc/biopet/extension/svcallers/breakdancer2vcf.py
+#!/usr/bin/env python
+
+__copyright__ = """
+Copyright (C) 2013 - Tim te Beek
+Copyright (C) 2013 - Wai Yi Leung
+Copyright (C) 2013 AllBio (see AUTHORS file)
+"""
+
+__desc__ = """Convert breakdancer output to pseudo .vcf file format."""
+__created__ = "Mar 18, 2013"
+__author__ = "tbeek"
+
+import argparse
+import csv
+import os.path
+import sys
+import datetime
+
+
+def main(tsvfile, vcffile):
+    '''
+    :param tsvfile: filename of input file.tsv
+    :type tsvfile: string
+    :param vcffile: filename of output file.vcf
+    :type vcffile: string
+    '''
+    with open(tsvfile) as reader:
+        # Parse file
+        dictreader = _parse_tsvfile(reader)
+        print dictreader.fieldnames
+
+        # Write out file
+        _format_vcffile(dictreader, vcffile)
+
+    # Quick output
+    with open(vcffile) as reader:
+        print reader.read(1000)
+
+
+def _parse_tsvfile(readable):
+    '''
+    Read readable using csv.Sniffer and csv.DictReader
+    :param readable: open file.tsv handle to read with csv.DictReader
+    :type readable: file
+    '''
+    prev, curr = 0, 0
+    while True:
+        line = readable.readline()
+        if not line.startswith('#'):
+            # lets start from prev # line, without the hash sign
+            readable.seek(prev + 1)
+            break
+        else:
+            prev = curr
+            curr = readable.tell()
+
+    # Determine dialect
+    curr = readable.tell()
+    #dialect = csv.Sniffer().sniff(readable.read(3000))
+    dialect = 'excel-tab'
+    readable.seek(curr)
+
+    # Read file
+    dictreader = csv.DictReader(readable, dialect=dialect)
+    return dictreader
+
+
+_tsv_fields = ('Chr1', 'Pos1', 'Orientation1',
+               'Chr2', 'Pos2', 'Orientation2',
+               'Type', 'Size', 'Score',
+               'num_Reads', 'num_Reads_lib',
+               'ERR031544.sort.bam')
+# 'Chr1': '1',
+# 'Pos1': '269907',
+# 'Orientation1': '39+39-',
+# 'Chr2': '1',
+# 'Pos2': '270919',
+# 'Orientation2': '39+39-',
+# 'Type': 'DEL',
+# 'Size': '99',
+# 'Score': '99',
+# 'num_Reads': '38',
+# 'num_Reads_lib': '/home/allbio/ERR031544.sort.bam|38',
+# 'ERR031544.sort.bam': 'NA'
+
+
+
+_vcf_fields = ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'default')
+
+TS_NOW = datetime.datetime.now()
+
+VCF_HEADER = """##fileformat=VCFv4.1
+##fileDate={filedate}
+##source=breakdancer-max
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
+##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Length of variation">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">""".format( filedate=TS_NOW.strftime( "%Y%m%d" ) )
+
+def _format_vcffile(dictreader, vcffile):
+    '''
+    Create a pseudo .vcf file based on values read from DictReader instance.
+    :param dictreader: DictReader instance to read data from
+    :type dictreader: csv.DictRedaer
+    :param vcffile: output file.vcf filename
+    :type vcffile: string
+    '''
+    FORMAT = "GT:DP"
+    with open(vcffile, mode='w') as writer:
+        writer.write('{header}\n#{columns}\n'.format(header=VCF_HEADER, columns='\t'.join(_vcf_fields)))
+        output_vcf = []
+        for line in dictreader:
+            CHROM = line['Chr1']
+            # TODO Figure out whether we have zero or one based positioning
+            POS = int(line['Pos1'])
+            ALT = '.'
+            SVEND = int(line['Pos2'])
+
+            INFO = 'PROGRAM=breakdancer;SVTYPE={}'.format(line['Type'])
+
+            if line['Type'] not in ['CTX']:
+                INFO += ';SVLEN={}'.format(int(line['Size']))
+                INFO += ";SVEND={}".format(SVEND)
+            
+            # write alternate ALT field for Intrachromosomal translocations
+            if line['Type'] in ['CTX']:
+                ALT = "N[{}:{}[".format(line['Chr2'], line['Pos2'])
+
+
+
+            SAMPLEINFO = "{}:{}".format( '1/.', line['num_Reads'] )
+            # Create record
+            output_vcf.append([CHROM, POS, '.', '.', ALT, '.', 'PASS', INFO, FORMAT, SAMPLEINFO])
+
+        # Sort all results
+        output_vcf.sort()
+        output = "\n".join(["\t".join(map(str,vcf_row)) for vcf_row in output_vcf])
+        # Write record
+        writer.write(output)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-i', '--breakdancertsv', dest='breakdancertsv', type=str,
+            help='Breakdancer TSV outputfile')
+    parser.add_argument('-o', '--outputvcf', dest='outputvcf', type=str,
+            help='Output vcf to')
+
+    args = parser.parse_args()
+    main(args.breakdancertsv, args.outputvcf)