Skip to content
Snippets Groups Projects
Commit 063c377b authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Merge branch 'feature-breakdancer' into 'develop'

Feature breakdancer

Breakdancer tool is now "feature complete", including the conversion to vcf.

Optimalisation can be scheduled to the next milestone.

See merge request !23
parents 25a8538c b3957406
No related branches found
No related tags found
No related merge requests found
......@@ -105,7 +105,7 @@
<prefix>git</prefix>
<dateFormat>dd.MM.yyyy '@' HH:mm:ss z</dateFormat>
<verbose>false</verbose>
<useNativeGit>false</useNativeGit>
<useNativeGit>true</useNativeGit>
<dotGitDirectory>${project.basedir}/../.git</dotGitDirectory>
<skipPoms>false</skipPoms>
<generateGitPropertiesFile>true</generateGitPropertiesFile>
......
......@@ -8,7 +8,6 @@ import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument }
import java.io.File
class BreakdancerConfig(val root: Configurable) extends BiopetCommandLineFunction {
executable = config("exe", default = "bam2cfg.pl", freeVar = false)
......@@ -23,22 +22,22 @@ class BreakdancerConfig(val root: Configurable) extends BiopetCommandLineFunctio
var min_insertsize: Option[Int] = config("min_insertsize", default = 450)
var solid_data: Boolean = config("solid", default = false)
var sd_cutoff: Option[Int] = config("sd_cutoff", default = 4) // Cutoff in unit of standard deviation [4]
// we set this to a higher number to avoid biases in small numbers in sorted bams
var min_observations: Option[Int] = config("min_observations", default = 10000) // Number of observation required to estimate mean and s.d. insert size [10_000]
var coefvar_cutoff: Option[Int] = config("coef_cutoff", default = 1) // Cutoff on coefficients of variation [1]
var histogram_bins: Option[Int] = config("histogram_bins", default = 50) // Number of bins in the histogram [50]
def cmdLine = required(executable) +
optional("-q", min_mq) +
conditional(use_mq, "-m") +
optional("-s", min_insertsize) +
conditional(solid_data, "-s") +
optional("-c", sd_cutoff) +
optional("-n", min_observations) +
optional("-v", coefvar_cutoff) +
optional("-b", histogram_bins) +
required(input) + " 1> " + required(output)
optional("-q", min_mq) +
conditional(use_mq, "-m") +
optional("-s", min_insertsize) +
conditional(solid_data, "-s") +
optional("-c", sd_cutoff) +
optional("-n", min_observations) +
optional("-v", coefvar_cutoff) +
optional("-b", histogram_bins) +
required(input) + " 1> " + required(output)
}
object BreakdancerConfig {
......@@ -62,35 +61,30 @@ object BreakdancerConfig {
private def swapExtension(inputFile: String) = inputFile.substring(0, inputFile.lastIndexOf(".bam")) + ".breakdancer.cfg"
}
/*
* The caller
*
**/
class BreakdancerCaller(val root: Configurable) extends BiopetCommandLineFunction {
class BreakdancerCaller(val root: Configurable) extends BiopetCommandLineFunction {
executable = config("exe", default = "breakdancer-max", freeVar = false)
override val defaultVmem = "4G"
override val defaultVmem = "6G"
override val defaultThreads = 1 // breakdancer can only work on 1 single thread
override val versionRegex = """.*[Vv]ersion:? (.*)""".r
override val versionExitcode = List(1)
override def versionCommand = executable
@Input(doc = "The breakdancer configuration file")
var input: File = _
// @Argument(doc = "Work directory")
// var workdir: String = _
@Output(doc = "Breakdancer VCF output")
// @Argument(doc = "Work directory")
// var workdir: String = _
@Output(doc = "Breakdancer TSV output")
var output: File = _
/*
Options:
-o STRING operate on a single chromosome [all chromosome]
......@@ -116,36 +110,36 @@ class BreakdancerCaller(val root: Configurable) extends BiopetCommandLineFunctio
var q: Option[Int] = config("qs", default = 35)
var r: Option[Int] = config("r", default = 2)
var x: Option[Int] = config("x", default = 1000)
var b: Option[Int] = config("b", default = 100)
var b: Option[Int] = config("b", default = 100)
var t: Boolean = config("t", default = false)
var d: String = config("d")
var g: String = config("g")
var l: Boolean = config("l", default = false)
var a: Boolean = config("a", default = false)
var h: Boolean = config("h", default = false)
var y: Option[Int] = config("y", default = 30)
var y: Option[Int] = config("y", default = 30)
override def beforeCmd {
}
def cmdLine = required(executable) +
optional("-s", s) +
optional("-c", c) +
optional("-m", m) +
optional("-q", q) +
optional("-r", r) +
optional("-x", x) +
optional("-b", b) +
conditional(t ,"-t") +
optional("-d", d) +
optional("-g", g) +
conditional(l ,"-l") +
conditional(a ,"-a") +
conditional(h ,"-h") +
optional("-y", y) +
required(input) +
">" +
required(output)
def cmdLine = required(executable) +
optional("-s", s) +
optional("-c", c) +
optional("-m", m) +
optional("-q", q) +
optional("-r", r) +
optional("-x", x) +
optional("-b", b) +
conditional(t, "-t") +
optional("-d", d) +
optional("-g", g) +
conditional(l, "-l") +
conditional(a, "-a") +
conditional(h, "-h") +
optional("-y", y) +
required(input) +
">" +
required(output)
}
object BreakdancerCaller {
......@@ -160,21 +154,18 @@ object BreakdancerCaller {
/// Breakdancer is actually a mini pipeline executing binaries from the breakdancer package
class Breakdancer(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
@Input(doc = "Input file (bam)")
var input: File = _
@Input(doc = "Reference Fasta file")
var reference: File = _
@Argument(doc = "Work directory")
var workdir: String = _
// @Output(doc = "Breakdancer VCF output")
// lazy val outputvcf: File = {
// new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.vcf")
// }
var deps: List[File] = Nil
@Output(doc = "Breakdancer config")
lazy val configfile: File = {
new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.cfg")
......@@ -183,32 +174,35 @@ class Breakdancer(val root: Configurable) extends QScript with BiopetQScript {
lazy val outputraw: File = {
new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.tsv")
}
@Output(doc = "Breakdancer VCF output")
lazy val outputvcf: File = {
new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.vcf")
}
override def init() {
}
def biopetScript() {
// read config and set all parameters for the pipeline
logger.info("Starting Breakdancer configuration")
val bdcfg = BreakdancerConfig(this, input, this.configfile)
outputFiles += ("breakdancer_cfg" -> bdcfg.output )
add( bdcfg )
val output_tsv: File = this.outputraw
val breakdancer = BreakdancerCaller( this, bdcfg.output, output_tsv )
add( breakdancer )
outputFiles += ("breakdancer_tsv" -> breakdancer.output )
// val output_vcf: File = this.outputvcf
// convert this tsv to vcf using the python script
bdcfg.deps = this.deps
outputFiles += ("cfg" -> bdcfg.output)
add(bdcfg)
val breakdancer = BreakdancerCaller(this, bdcfg.output, this.outputraw)
add(breakdancer)
outputFiles += ("tsv" -> breakdancer.output)
val bdvcf = BreakdancerVCF(this, breakdancer.output, this.outputvcf)
add(bdvcf)
outputFiles += ("vcf" -> bdvcf.output)
}
}
object Breakdancer extends PipelineCommand {
def apply(root: Configurable, input: File, reference: File, runDir: String): Breakdancer = {
def apply(root: Configurable, input: File, reference: File, runDir: String): Breakdancer = {
val breakdancer = new Breakdancer(root)
breakdancer.input = input
breakdancer.reference = reference
......
package nl.lumc.sasc.biopet.extensions.svcallers
import java.io.File
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import argonaut._, Argonaut._
import scalaz._, Scalaz._
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction
class BreakdancerVCF(val root: Configurable) extends PythonCommandLineFunction {
setPythonScript("breakdancer2vcf.py")
@Input(doc = "Breakdancer TSV")
var input: File = _
@Output(doc = "Output VCF to PATH")
var output: File = _
def cmdLine = {
getPythonCommand +
"-i " + required(input) +
"-o " + required(output)
}
}
object BreakdancerVCF {
def apply(root: Configurable, input: File, output: File): BreakdancerVCF = {
val bd = new BreakdancerVCF(root)
bd.input = input
bd.output = output
return bd
}
}
\ No newline at end of file
......@@ -8,7 +8,6 @@ import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.core.MultiSampleQScript
import nl.lumc.sasc.biopet.core.PipelineCommand
import nl.lumc.sasc.biopet.extensions.picard.MergeSamFiles
import nl.lumc.sasc.biopet.extensions.sambamba.{ SambambaIndex, SambambaMerge }
import nl.lumc.sasc.biopet.extensions.svcallers.pindel.Pindel
import nl.lumc.sasc.biopet.extensions.svcallers.{ Breakdancer, Clever }
......@@ -84,24 +83,24 @@ class Yamsvp(val root: Configurable) extends QScript with MultiSampleQScript {
mergeSamFiles.input = libraryBamfiles
mergeSamFiles.output = alignmentDir + sampleID + ".merged.bam"
add(mergeSamFiles)
val bamIndex = SambambaIndex(root, mergeSamFiles.output)
add(bamIndex)
mergeSamFiles.output
} else null
val bamIndex = SambambaIndex(root, bamFile)
add(bamIndex)
/// bamfile will be used as input for the SV callers. First run Clever
// val cleverVCF : File = sampleDir + "/" + sampleID + ".clever.vcf"
val cleverDir = svcallingDir + sampleID + ".clever/"
val clever = Clever(this, bamFile, this.reference, svcallingDir, cleverDir)
clever.deps = List(bamIndex.output)
sampleOutput.vcf += ("clever" -> List(clever.outputvcf))
add(clever)
val breakdancerDir = svcallingDir + sampleID + ".breakdancer/"
val breakdancer = Breakdancer(this, bamFile, this.reference, breakdancerDir)
sampleOutput.vcf += ("breakdancer" -> List(breakdancer.outputraw))
sampleOutput.vcf += ("breakdancer" -> List(breakdancer.outputvcf))
addAll(breakdancer.functions)
// for pindel we should use per library config collected into one config file
......
#!/usr/bin/env python
__copyright__ = """
Copyright (C) 2013 - Tim te Beek
Copyright (C) 2013 - Wai Yi Leung
Copyright (C) 2013 AllBio (see AUTHORS file)
"""
__desc__ = """Convert breakdancer output to pseudo .vcf file format."""
__created__ = "Mar 18, 2013"
__author__ = "tbeek"
import argparse
import csv
import os.path
import sys
import datetime
def main(tsvfile, vcffile):
'''
:param tsvfile: filename of input file.tsv
:type tsvfile: string
:param vcffile: filename of output file.vcf
:type vcffile: string
'''
with open(tsvfile) as reader:
# Parse file
dictreader = _parse_tsvfile(reader)
print dictreader.fieldnames
# Write out file
_format_vcffile(dictreader, vcffile)
# Quick output
with open(vcffile) as reader:
print reader.read(1000)
def _parse_tsvfile(readable):
'''
Read readable using csv.Sniffer and csv.DictReader
:param readable: open file.tsv handle to read with csv.DictReader
:type readable: file
'''
prev, curr = 0, 0
while True:
line = readable.readline()
if not line.startswith('#'):
# lets start from prev # line, without the hash sign
readable.seek(prev + 1)
break
else:
prev = curr
curr = readable.tell()
# Determine dialect
curr = readable.tell()
#dialect = csv.Sniffer().sniff(readable.read(3000))
dialect = 'excel-tab'
readable.seek(curr)
# Read file
dictreader = csv.DictReader(readable, dialect=dialect)
return dictreader
_tsv_fields = ('Chr1', 'Pos1', 'Orientation1',
'Chr2', 'Pos2', 'Orientation2',
'Type', 'Size', 'Score',
'num_Reads', 'num_Reads_lib',
'ERR031544.sort.bam')
# 'Chr1': '1',
# 'Pos1': '269907',
# 'Orientation1': '39+39-',
# 'Chr2': '1',
# 'Pos2': '270919',
# 'Orientation2': '39+39-',
# 'Type': 'DEL',
# 'Size': '99',
# 'Score': '99',
# 'num_Reads': '38',
# 'num_Reads_lib': '/home/allbio/ERR031544.sort.bam|38',
# 'ERR031544.sort.bam': 'NA'
_vcf_fields = ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'default')
TS_NOW = datetime.datetime.now()
VCF_HEADER = """##fileformat=VCFv4.1
##fileDate={filedate}
##source=breakdancer-max
##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">
##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Length of variation">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">""".format( filedate=TS_NOW.strftime( "%Y%m%d" ) )
def _format_vcffile(dictreader, vcffile):
'''
Create a pseudo .vcf file based on values read from DictReader instance.
:param dictreader: DictReader instance to read data from
:type dictreader: csv.DictRedaer
:param vcffile: output file.vcf filename
:type vcffile: string
'''
FORMAT = "GT:DP"
with open(vcffile, mode='w') as writer:
writer.write('{header}\n#{columns}\n'.format(header=VCF_HEADER, columns='\t'.join(_vcf_fields)))
output_vcf = []
for line in dictreader:
CHROM = line['Chr1']
# TODO Figure out whether we have zero or one based positioning
POS = int(line['Pos1'])
ALT = '.'
SVEND = int(line['Pos2'])
INFO = 'PROGRAM=breakdancer;SVTYPE={}'.format(line['Type'])
if line['Type'] not in ['CTX']:
INFO += ';SVLEN={}'.format(int(line['Size']))
INFO += ";SVEND={}".format(SVEND)
# write alternate ALT field for Intrachromosomal translocations
if line['Type'] in ['CTX']:
ALT = "N[{}:{}[".format(line['Chr2'], line['Pos2'])
SAMPLEINFO = "{}:{}".format( '1/.', line['num_Reads'] )
# Create record
output_vcf.append([CHROM, POS, '.', '.', ALT, '.', 'PASS', INFO, FORMAT, SAMPLEINFO])
# Sort all results
output_vcf.sort()
output = "\n".join(["\t".join(map(str,vcf_row)) for vcf_row in output_vcf])
# Write record
writer.write(output)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--breakdancertsv', dest='breakdancertsv', type=str,
help='Breakdancer TSV outputfile')
parser.add_argument('-o', '--outputvcf', dest='outputvcf', type=str,
help='Output vcf to')
args = parser.parse_args()
main(args.breakdancertsv, args.outputvcf)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment