diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/Breakdancer.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/Breakdancer.scala index 7e1d70a98f4ab7866af10def3c097cb96b9b5de5..a6233d71f508d10ab4de3aab14f8691af7a0ae19 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/Breakdancer.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/Breakdancer.scala @@ -8,7 +8,6 @@ import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument } import java.io.File - class BreakdancerConfig(val root: Configurable) extends BiopetCommandLineFunction { executable = config("exe", default = "bam2cfg.pl", freeVar = false) @@ -74,7 +73,7 @@ object BreakdancerConfig { class BreakdancerCaller(val root: Configurable) extends BiopetCommandLineFunction { executable = config("exe", default = "breakdancer-max", freeVar = false) - override val defaultVmem = "4G" + override val defaultVmem = "6G" override val defaultThreads = 1 // breakdancer can only work on 1 single thread override val versionRegex = """.*[Vv]ersion:? (.*)""".r @@ -170,10 +169,12 @@ class Breakdancer(val root: Configurable) extends QScript with BiopetQScript { @Argument(doc = "Work directory") var workdir: String = _ -// @Output(doc = "Breakdancer VCF output") -// lazy val outputvcf: File = { -// new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.vcf") -// } + var deps : List[File] = Nil + + @Output(doc = "Breakdancer VCF output") + lazy val outputvcf: File = { + new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.vcf") + } @Output(doc = "Breakdancer config") lazy val configfile: File = { @@ -192,6 +193,7 @@ class Breakdancer(val root: Configurable) extends QScript with BiopetQScript { logger.info("Starting Breakdancer configuration") val bdcfg = BreakdancerConfig(this, input, this.configfile) + bdcfg.deps = this.deps outputFiles += ("breakdancer_cfg" -> bdcfg.output ) add( bdcfg ) @@ -200,10 +202,9 @@ class Breakdancer(val root: Configurable) extends QScript with BiopetQScript { add( breakdancer ) outputFiles += ("breakdancer_tsv" -> breakdancer.output ) -// val output_vcf: File = this.outputvcf - // convert this tsv to vcf using the python script - - + val bdvcf = BreakdancerVCF( this, breakdancer.output, this.outputvcf ) + add( bdvcf ) + outputFiles += ("breakdancer_vcf" -> bdvcf.output ) } } diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/BreakdancerVCF.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/BreakdancerVCF.scala new file mode 100644 index 0000000000000000000000000000000000000000..731b0f8292102218a856b6aa21da72a86261f7db --- /dev/null +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/BreakdancerVCF.scala @@ -0,0 +1,52 @@ +/* + * Copyright 2014 wyleung. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package nl.lumc.sasc.biopet.extensions.svcallers + +import java.io.File + +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } + +import argonaut._, Argonaut._ +import scalaz._, Scalaz._ + +import nl.lumc.sasc.biopet.core.config.Configurable +import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction + +class BreakdancerVCF(val root: Configurable) extends PythonCommandLineFunction { + setPythonScript("breakdancer2vcf.py") + + @Input(doc = "Breakdancer TSV") + var input: File = _ + + @Output(doc = "Output VCF to PATH") + var output: File = _ + + def cmdLine = { + getPythonCommand + + "-i " + required(input) + + "-o " + required(output) + } +} + +object BreakdancerVCF { + def apply(root: Configurable, input:File, output:File): BreakdancerVCF = { + val bd = new BreakdancerVCF(root) + bd.input =input + bd.output = output + return bd + } +} \ No newline at end of file diff --git a/biopet-framework/src/main/scripts/nl/lumc/sasc/biopet/extension/svcallers/breakdancer2vcf.py b/biopet-framework/src/main/scripts/nl/lumc/sasc/biopet/extension/svcallers/breakdancer2vcf.py new file mode 100644 index 0000000000000000000000000000000000000000..be19779229fe65b7501e1ef6b4b409d522684eeb --- /dev/null +++ b/biopet-framework/src/main/scripts/nl/lumc/sasc/biopet/extension/svcallers/breakdancer2vcf.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python + +__copyright__ = """ +Copyright (C) 2013 - Tim te Beek +Copyright (C) 2013 - Wai Yi Leung +Copyright (C) 2013 AllBio (see AUTHORS file) +""" + +__desc__ = """Convert breakdancer output to pseudo .vcf file format.""" +__created__ = "Mar 18, 2013" +__author__ = "tbeek" + +import argparse +import csv +import os.path +import sys +import datetime + + +def main(tsvfile, vcffile): + ''' + :param tsvfile: filename of input file.tsv + :type tsvfile: string + :param vcffile: filename of output file.vcf + :type vcffile: string + ''' + with open(tsvfile) as reader: + # Parse file + dictreader = _parse_tsvfile(reader) + print dictreader.fieldnames + + # Write out file + _format_vcffile(dictreader, vcffile) + + # Quick output + with open(vcffile) as reader: + print reader.read(1000) + + +def _parse_tsvfile(readable): + ''' + Read readable using csv.Sniffer and csv.DictReader + :param readable: open file.tsv handle to read with csv.DictReader + :type readable: file + ''' + prev, curr = 0, 0 + while True: + line = readable.readline() + if not line.startswith('#'): + # lets start from prev # line, without the hash sign + readable.seek(prev + 1) + break + else: + prev = curr + curr = readable.tell() + + # Determine dialect + curr = readable.tell() + #dialect = csv.Sniffer().sniff(readable.read(3000)) + dialect = 'excel-tab' + readable.seek(curr) + + # Read file + dictreader = csv.DictReader(readable, dialect=dialect) + return dictreader + + +_tsv_fields = ('Chr1', 'Pos1', 'Orientation1', + 'Chr2', 'Pos2', 'Orientation2', + 'Type', 'Size', 'Score', + 'num_Reads', 'num_Reads_lib', + 'ERR031544.sort.bam') +# 'Chr1': '1', +# 'Pos1': '269907', +# 'Orientation1': '39+39-', +# 'Chr2': '1', +# 'Pos2': '270919', +# 'Orientation2': '39+39-', +# 'Type': 'DEL', +# 'Size': '99', +# 'Score': '99', +# 'num_Reads': '38', +# 'num_Reads_lib': '/home/allbio/ERR031544.sort.bam|38', +# 'ERR031544.sort.bam': 'NA' + + + +_vcf_fields = ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'default') + +TS_NOW = datetime.datetime.now() + +VCF_HEADER = """##fileformat=VCFv4.1 +##fileDate={filedate} +##source=breakdancer-max +##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data"> +##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth"> +##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency"> +##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Length of variation"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> +##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">""".format( filedate=TS_NOW.strftime( "%Y%m%d" ) ) + +def _format_vcffile(dictreader, vcffile): + ''' + Create a pseudo .vcf file based on values read from DictReader instance. + :param dictreader: DictReader instance to read data from + :type dictreader: csv.DictRedaer + :param vcffile: output file.vcf filename + :type vcffile: string + ''' + FORMAT = "GT:DP" + with open(vcffile, mode='w') as writer: + writer.write('{header}\n#{columns}\n'.format(header=VCF_HEADER, columns='\t'.join(_vcf_fields))) + output_vcf = [] + for line in dictreader: + CHROM = line['Chr1'] + # TODO Figure out whether we have zero or one based positioning + POS = int(line['Pos1']) + ALT = '.' + SVEND = int(line['Pos2']) + + INFO = 'PROGRAM=breakdancer;SVTYPE={}'.format(line['Type']) + + if line['Type'] not in ['CTX']: + INFO += ';SVLEN={}'.format(int(line['Size'])) + INFO += ";SVEND={}".format(SVEND) + + # write alternate ALT field for Intrachromosomal translocations + if line['Type'] in ['CTX']: + ALT = "N[{}:{}[".format(line['Chr2'], line['Pos2']) + + + + SAMPLEINFO = "{}:{}".format( '1/.', line['num_Reads'] ) + # Create record + output_vcf.append([CHROM, POS, '.', '.', ALT, '.', 'PASS', INFO, FORMAT, SAMPLEINFO]) + + # Sort all results + output_vcf.sort() + output = "\n".join(["\t".join(map(str,vcf_row)) for vcf_row in output_vcf]) + # Write record + writer.write(output) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-i', '--breakdancertsv', dest='breakdancertsv', type=str, + help='Breakdancer TSV outputfile') + parser.add_argument('-o', '--outputvcf', dest='outputvcf', type=str, + help='Output vcf to') + + args = parser.parse_args() + main(args.breakdancertsv, args.outputvcf)