Commit 53d48f23 authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Merge 'develop' into 'feature-basty_generate_fasta'

parent e29d7fcf
{
"bwa": {
"exe": "/usr/local/bwa/bwa-0.7.10/bwa"
},
"seqtk": {
"exe":"/data/DIV5/SASC/common/programs/seqtk/seqtk/seqtk"
},
"sickle": {
"exe":"/data/DIV5/SASC/common/programs/sickle/sickle-1.33/sickle"
},
"clever": {
"exe": "/data/DIV5/SASC/common/programs/clever/clever-toolkit-v2.0rc3/bin/clever",
"version_exe": "/data/DIV5/SASC/common/programs/clever/clever-toolkit-v2.0rc3/bin/ctk-version"
},
"pindel": {
"exe": "/data/DIV5/SASC/common/programs/pindel/pindel-0.2.5/pindel"
},
"breakdancerconfig": {
"exe": "/data/DIV5/SASC/common/programs/breakdancer/breakdancer-v1.4.4/lib/breakdancer-max1.4.4/bam2cfg.pl"
},
"breakdancercaller": {
"exe": "/data/DIV5/SASC/common/programs/breakdancer/breakdancer-v1.4.4/bin/breakdancer-max"
},
"fastqc": {
"exe": "/usr/local/FastQC/FastQC_v0.10.1/fastqc"
},
"seqstat": {
"exe": "/data/DIV5/SASC/common/programs/dQual/fastq-seqstat"
},
"stampy": {
"exe": "/usr/local/stampy/stampy-1.0.23/stampy.py"
},
"sambamba": {
"exe": "/data/DIV5/SASC/common/programs/sambamba/sambamba-0.4.7/build/sambamba"
}
}
......@@ -92,7 +92,7 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurab
protected def versionCommand: String = null
protected val versionRegex: Regex = null
protected val versionExitcode = List(0) // Can select multiple
def getVersion: String = {
private def getVersionInternal: String = {
if (versionCommand == null || versionRegex == null) return "N/A"
val stdout = new StringBuffer()
val stderr = new StringBuffer()
......@@ -114,6 +114,12 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurab
return "N/A"
}
def getVersion: String = {
if (!BiopetCommandLineFunctionTrait.versionCache.contains(executable))
BiopetCommandLineFunctionTrait.versionCache += executable -> getVersionInternal
return BiopetCommandLineFunctionTrait.versionCache(executable)
}
def getThreads(default: Int): Int = {
val maxThreads: Int = config("maxthreads", default = 8)
val threads: Int = config("threads", default = default)
......@@ -128,3 +134,8 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurab
else return maxThreads
}
}
/**
 * Companion object holding shared state for BiopetCommandLineFunctionTrait.
 *
 * Caches the detected version string per executable path so that the
 * (potentially slow) external version command is run at most once per JVM
 * for each tool, no matter how many pipeline jobs use that executable.
 */
object BiopetCommandLineFunctionTrait {
  // executable path -> version string reported by the tool
  private val versionCache: scala.collection.mutable.Map[String, String] =
    scala.collection.mutable.Map.empty[String, String]
}
\ No newline at end of file
package nl.lumc.sasc.biopet.core
import java.util.Properties
import org.apache.log4j.Logger
object BiopetExecutable {
object BiopetExecutable extends Logging {
val modules: Map[String, List[MainCommand]] = Map(
"pipeline" -> List(
......@@ -109,4 +110,17 @@ object BiopetExecutable {
prop.load(getClass.getClassLoader.getResourceAsStream("git.properties"))
prop.getProperty("git.commit.id.abbrev")
}
/**
 * Warns the user when this JAR was built from a git working tree that had
 * uncommitted changes ("dirty" build), since such builds are not reproducible.
 *
 * Reads the build-time git metadata from the `git.properties` classpath
 * resource (written by the git-commit-id maven plugin at build time).
 *
 * @param logger logger that receives the warning banner, if any
 */
def checkDirtyBuild(logger: Logger) {
  val gitProps = new Properties()
  gitProps.load(getClass.getClassLoader.getResourceAsStream("git.properties"))
  // git describe appends "-dirty" when the working tree had local modifications
  val describe = gitProps.getProperty("git.commit.id.describe-short")
  if (describe.endsWith("-dirty")) {
    logger.warn("**********************************************************")
    logger.warn("* This JAR was built while there are uncommited changes. *")
    logger.warn("* Reproducible results are *not* guaranteed. *")
    logger.warn("**********************************************************")
  }
}
checkDirtyBuild(logger)
}
package nl.lumc.sasc.biopet.core
import java.text.SimpleDateFormat
import java.util.Calendar
import org.apache.log4j.Logger
import org.apache.log4j.WriterAppender
import org.apache.log4j.helpers.DateLayout
/**
 * Mixin that equips the inheriting class with a log4j logger writing to
 * standard error.
 *
 * Mixing this trait in has construction-time side effects: it sets the
 * logger's level to INFO and attaches a stderr appender with a custom
 * "LEVEL [timestamp] [loggerName] message" layout.
 */
trait Logging {
// Logger named after the concrete class; split("\\$").last strips the
// trailing "$" that Scala appends to object class names.
protected val logger = Logger.getLogger(getClass.getSimpleName.split("\\$").last)
// Custom layout producing lines like: "INFO   [2014-01-01 12:00:00] [Name] msg"
private[core] val logLayout = new DateLayout() {
val ignoresThrowable = false
def format(event: org.apache.log4j.spi.LoggingEvent): String = {
// Convert the event's epoch-millis timestamp to a human-readable date.
val calendar: Calendar = Calendar.getInstance
calendar.setTimeInMillis(event.getTimeStamp)
val formatter: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
val formattedDate: String = formatter.format(calendar.getTime)
// Right-pad the level name to 6 characters so messages line up in columns.
var logLevel = event.getLevel.toString
while (logLevel.size < 6) logLevel += " "
logLevel + " [" + formattedDate + "] [" + event.getLoggerName + "] " + event.getMessage + "\n"
}
}
// Route all log output to stderr so stdout stays free for pipeline data.
private[core] val stderrAppender = new WriterAppender(logLayout, sys.process.stderr)
logger.setLevel(org.apache.log4j.Level.INFO)
logger.addAppender(stderrAppender)
}
package nl.lumc.sasc.biopet.core
import org.broadinstitute.gatk.queue.util.Logging
import org.broadinstitute.gatk.queue.util.{ Logging => GatkLogging }
trait PipelineCommand extends MainCommand with Logging {
trait PipelineCommand extends MainCommand with GatkLogging {
def pipeline = "/" + getClass.getName.stripSuffix("$").replaceAll("\\.", "/") + ".class"
......
package nl.lumc.sasc.biopet.core
import java.text.SimpleDateFormat
import java.util.Calendar
import org.apache.log4j.Logger
import org.apache.log4j.WriterAppender
import org.apache.log4j.helpers.DateLayout
import org.apache.log4j.spi.LoggingEvent
import java.io.File
trait ToolCommand extends MainCommand {
trait ToolCommand extends MainCommand with Logging {
abstract class AbstractArgs {
}
......@@ -45,22 +40,4 @@ trait ToolCommand extends MainCommand {
type Args <: AbstractArgs
type OptParser <: AbstractOptParser
protected val logger = Logger.getLogger(commandName)
private val logLayout = new DateLayout() {
val ignoresThrowable = false
def format(event: org.apache.log4j.spi.LoggingEvent): String = {
val calendar: Calendar = Calendar.getInstance
calendar.setTimeInMillis(event.getTimeStamp)
val formatter: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
val formattedDate: String = formatter.format(calendar.getTime)
var logLevel = event.getLevel.toString
while (logLevel.size < 6) logLevel += " "
logLevel + " [" + formattedDate + "] [" + event.getLoggerName + "] " + event.getMessage + "\n"
}
}
private val stderrAppender = new WriterAppender(logLayout, sys.process.stderr)
logger.setLevel(org.apache.log4j.Level.INFO)
logger.addAppender(stderrAppender)
}
\ No newline at end of file
......@@ -2,6 +2,7 @@ package nl.lumc.sasc.biopet.core.config
import java.io.File
import org.broadinstitute.gatk.queue.util.Logging
import scala.language.implicitConversions
trait Configurable extends Logging {
val root: Configurable
......
......@@ -86,4 +86,4 @@ class Raxml(val root: Configurable) extends BiopetCommandLineFunction {
optional("-t", t) +
optional("-z", z) +
required("-T", threads)
}
\ No newline at end of file
}
......@@ -52,6 +52,14 @@ class MarkDuplicates(val root: Configurable) extends Picard {
@Argument(doc = "OPTICAL_DUPLICATE_PIXEL_DISTANCE", required = false)
var opticalDuplicatePixelDistance: Option[Int] = config("opticalDuplicatePixelDistance")
@Output(doc = "Bam Index", required = true)
private var outputIndex: File = _
override def afterGraph {
super.afterGraph
if (createIndex) outputIndex = new File(output.getAbsolutePath.stripSuffix(".bam") + ".bai")
}
override def commandLine = super.commandLine +
repeat("INPUT=", input, spaceSeparated = false) +
required("OUTPUT=", output, spaceSeparated = false) +
......
......@@ -16,6 +16,14 @@ class SortSam(val root: Configurable) extends Picard {
@Argument(doc = "Sort order of output file Required. Possible values: {unsorted, queryname, coordinate} ", required = true)
var sortOrder: String = _
@Output(doc = "Bam Index", required = true)
private var outputIndex: File = _
override def afterGraph {
super.afterGraph
if (createIndex) outputIndex = new File(output.getAbsolutePath.stripSuffix(".bam") + ".bai")
}
override def commandLine = super.commandLine +
required("INPUT=", input, spaceSeparated = false) +
required("OUTPUT=", output, spaceSeparated = false) +
......
......@@ -3,7 +3,7 @@ package nl.lumc.sasc.biopet.pipelines.gatk
import nl.lumc.sasc.biopet.core.MultiSampleQScript
import nl.lumc.sasc.biopet.core.PipelineCommand
import nl.lumc.sasc.biopet.core.config.Configurable
import htsjdk.samtools.SAMFileReader
import htsjdk.samtools.SamReaderFactory
import scala.collection.JavaConversions._
import java.io.File
import nl.lumc.sasc.biopet.extensions.gatk.{ CombineVariants, CombineGVCFs }
......@@ -186,7 +186,7 @@ class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScri
libraryOutput.mappedBamFile = mapping.outputFiles("finalBamFile")
} else {
var readGroupOke = true
val inputSam = new SAMFileReader(bamFile)
val inputSam = SamReaderFactory.makeDefault.open(bamFile)
val header = inputSam.getFileHeader.getReadGroups
for (readGroup <- inputSam.getFileHeader.getReadGroups) {
if (readGroup.getSample != sampleID) logger.warn("Sample ID readgroup in bam file is not the same")
......
......@@ -10,6 +10,7 @@ import org.broadinstitute.gatk.queue.QScript
import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile
import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument }
import scala.collection.SortedMap
import scala.language.reflectiveCalls
class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScript {
def this() = this(null)
......@@ -83,6 +84,7 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr
if (variantcalling) {
var mergBuffer: SortedMap[String, File] = SortedMap()
def mergeList = mergBuffer map { case (key, file) => TaggedFile(removeNoneVariants(file), "name=" + key) }
if (sampleID != null && (useHaplotypecaller.get || config("joint_genotyping", default = false).getBoolean)) {
val hcGvcf = new HaplotypeCaller(this)
......@@ -140,10 +142,17 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr
mergBuffer += ("9.raw" -> scriptOutput.rawFilterVcfFile)
if (useAllelesOption.get) {
val tempFile = if (mergeList.toList.size > 1) {
val allelesTemp = CombineVariants(this, mergeList.toList, outputDir + outputName + ".alleles_temp.vcf.gz")
allelesTemp.genotypemergeoption = org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils.GenotypeMergeType.UNSORTED
add(allelesTemp, isIntermediate = true)
allelesTemp.out
} else mergeList.toList.head
val alleleOnly = new CommandLineFunction {
@Input val input: File = scriptOutput.rawFilterVcfFile
@Input val input: File = tempFile
@Output val output: File = outputDir + "raw.allele_only.vcf.gz"
@Output val outputindex: File = outputDir + "raw.allele_only.vcf.gz.tbi"
@Output val outputindex: File = outputDir + "raw.allele__temp_only.vcf.gz.tbi"
def commandLine = "zcat " + input + " | cut -f1,2,3,4,5,6,7,8 | bgzip -c > " + output + " && tabix -pvcf " + output
}
add(alleleOnly, isIntermediate = true)
......@@ -180,8 +189,8 @@ class GatkVariantcalling(val root: Configurable) extends QScript with BiopetQScr
sv.out
}
def mergeList = mergBuffer map { case (key, file) => TaggedFile(removeNoneVariants(file), "name=" + key) }
val cvFinal = CombineVariants(this, mergeList.toList, outputDir + outputName + ".final.vcf.gz")
cvFinal.genotypemergeoption = org.broadinstitute.gatk.utils.variant.GATKVariantContextUtils.GenotypeMergeType.UNSORTED
add(cvFinal)
scriptOutput.finalVcfFile = cvFinal.out
}
......
package nl.lumc.sasc.biopet.tools
import htsjdk.samtools.SAMFileReader
import htsjdk.samtools.SAMSequenceRecord
import htsjdk.samtools.{ SAMSequenceRecord, SamReaderFactory }
import java.io.File
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.ToolCommand
......@@ -68,7 +67,7 @@ object BedToInterval extends ToolCommand {
val writer = new PrintWriter(commandArgs.outputFile)
val inputSam = new SAMFileReader(commandArgs.bamFile)
val inputSam = SamReaderFactory.makeDefault.open(commandArgs.bamFile)
val refs = for (SQ <- inputSam.getFileHeader.getSequenceDictionary.getSequences.toArray) yield {
val record = SQ.asInstanceOf[SAMSequenceRecord]
writer.write("@SQ\tSN:" + record.getSequenceName + "\tLN:" + record.getSequenceLength + "\n")
......@@ -80,10 +79,10 @@ object BedToInterval extends ToolCommand {
val bedFile = Source.fromFile(commandArgs.inputFile)
for (
line <- bedFile.getLines;
val split = line.split("\t") if split.size >= 3;
val chr = split(0);
val start = split(1);
val stop = split(2) if start forall Character.isDigit if stop forall Character.isDigit
split = line.split("\t") if split.size >= 3;
chr = split(0);
start = split(1);
stop = split(2) if start forall Character.isDigit if stop forall Character.isDigit
) {
if (!refsMap.contains(chr)) throw new IllegalStateException("Chr '" + chr + "' in bed file not found in bam file")
writer.write(chr + "\t" + start + "\t" + stop + "\t")
......
package nl.lumc.sasc.biopet.tools
import htsjdk.samtools.SAMFileReader
import htsjdk.samtools.SAMRecord
import htsjdk.samtools.{ SAMRecord, SamReaderFactory }
import java.io.File
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.ToolCommand
......@@ -57,7 +56,7 @@ object BiopetFlagstat extends ToolCommand {
val argsParser = new OptParser
val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
val inputSam = new SAMFileReader(commandArgs.inputFile)
val inputSam = SamReaderFactory.makeDefault.open(commandArgs.inputFile)
val iterSam = if (commandArgs.region == None) inputSam.iterator else {
val regionRegex = """(.*):(.*)-(.*)""".r
commandArgs.region.get match {
......
package nl.lumc.sasc.biopet.tools
import htsjdk.samtools.SAMFileReader
import htsjdk.samtools.QueryInterval
import htsjdk.samtools.SAMRecord
import htsjdk.samtools.{ QueryInterval, SamReaderFactory, SAMRecord, SamReader }
import htsjdk.variant.variantcontext.VariantContext
import htsjdk.variant.variantcontext.VariantContextBuilder
import htsjdk.variant.variantcontext.writer.AsyncVariantContextWriter
......@@ -69,7 +67,8 @@ object CheckAllelesVcfInBam extends ToolCommand {
if (commandArgs.bamFiles.size != commandArgs.samples.size)
logger.warn("Number of samples is diffrent then number of bam files, left over will be removed")
val bamReaders: Map[String, SAMFileReader] = Map(commandArgs.samples zip commandArgs.bamFiles.map(x => new SAMFileReader(x)): _*)
val samReaderFactory = SamReaderFactory.makeDefault
val bamReaders: Map[String, SamReader] = Map(commandArgs.samples zip commandArgs.bamFiles.map(x => samReaderFactory.open(x)): _*)
val bamHeaders = bamReaders.map(x => (x._1, x._2.getFileHeader))
val reader = new VCFFileReader(commandArgs.inputFile, false)
......
......@@ -21,6 +21,9 @@ object ExtractAlignedFastq extends ToolCommand {
type FastqInput = (FastqRecord, Option[FastqRecord])
/** Returns the FASTQ record's ID: the read header up to (excluding) the first space. */
def fastqId(rec: FastqRecord) = rec.getReadHeader.split(" ").head
/**
* Function to create iterator over Interval given input interval string
*
......@@ -90,10 +93,10 @@ object ExtractAlignedFastq extends ToolCommand {
}
val queries: Array[QueryInterval] = iv.toList
// sort Interval
.sortBy(x => (x.getSequence, x.getStart, x.getEnd))
// transform to QueryInterval
.map(x => new QueryInterval(getSequenceIndex(x.getSequence), x.getStart, x.getEnd))
// sort Interval
.sortBy(x => (x.referenceIndex, x.start, x.end))
// cast to array
.toArray
......@@ -113,11 +116,12 @@ object ExtractAlignedFastq extends ToolCommand {
)
(pair: FastqInput) => pair._2 match {
case None => selected.contains(pair._1.getReadHeader)
case None => selected.contains(fastqId(pair._1))
case Some(x) =>
require(commonSuffixLength < pair._1.getReadHeader.length)
require(commonSuffixLength < x.getReadHeader.length)
selected.contains(pair._1.getReadHeader.dropRight(commonSuffixLength))
val rec1Id = fastqId(pair._1)
require(commonSuffixLength < rec1Id.length)
require(commonSuffixLength < fastqId(x).length)
selected.contains(rec1Id.dropRight(commonSuffixLength))
}
}
......@@ -224,6 +228,7 @@ object ExtractAlignedFastq extends ToolCommand {
minMapQ = commandArgs.minMapQ,
commonSuffixLength = commandArgs.commonSuffixLength)
logger.info("Writing to output file(s) ...")
(commandArgs.inputFastq2, commandArgs.outputFastq2) match {
case (None, None) => extractReads(memFunc,
......
/*
* Copyright 2014 pjvan_thof.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nl.lumc.sasc.biopet.tools
import htsjdk.samtools.QueryInterval
import htsjdk.samtools.SAMFileReader
import htsjdk.samtools.ValidationStringency
import htsjdk.samtools.SAMRecord
import htsjdk.samtools.{ QueryInterval, SAMRecord, SamReaderFactory, ValidationStringency }
import java.io.File
import nl.lumc.sasc.biopet.core.ToolCommand
import scala.io.Source
......@@ -44,8 +25,9 @@ object FindRepeatsPacBio extends ToolCommand {
val argsParser = new OptParser
val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
val bamReader = new SAMFileReader(commandArgs.inputBam)
bamReader.setValidationStringency(ValidationStringency.SILENT)
val bamReader = SamReaderFactory.makeDefault
.validationStringency(ValidationStringency.SILENT)
.open(commandArgs.inputBam)
val bamHeader = bamReader.getFileHeader
val header = List("chr", "startPos", "stopPos", "Repeat_seq", "repeatLength",
......@@ -55,7 +37,7 @@ object FindRepeatsPacBio extends ToolCommand {
for (
bedLine <- Source.fromFile(commandArgs.inputBed).getLines;
val values = bedLine.split("\t"); if values.size >= 3
values = bedLine.split("\t"); if values.size >= 3
) {
val interval = new QueryInterval(bamHeader.getSequenceIndex(values(0)), values(1).toInt, values(2).toInt)
val bamIter = bamReader.query(Array(interval), false)
......
......@@ -115,7 +115,7 @@ object MpileupToVcf extends ToolCommand {
class Counts(var forward: Int, var reverse: Int)
for (
line <- inputStream;
val values = line.split("\t");
values = line.split("\t");
if values.size > 5
) {
val chr = values(0)
......
......@@ -6,6 +6,7 @@ package nl.lumc.sasc.biopet.tools
import java.io.File
import scala.collection.JavaConverters._
import scala.io.Source
import scala.math.{ max, min }
import com.google.common.hash.{ Funnel, BloomFilter, PrimitiveSink }
......@@ -73,7 +74,7 @@ object WipeReads extends ToolCommand {
*
* @param inFile input interval file
*/
def makeIntervalFromFile(inFile: File): List[Interval] = {
def makeIntervalFromFile(inFile: File, gtfFeatureType: String = "exon"): List[Interval] = {
logger.info("Parsing interval file ...")
......@@ -83,20 +84,62 @@ object WipeReads extends ToolCommand {
.asScala
.map(x => new Interval(x.getChr, x.getStart, x.getEnd))
/** Function to create iterator from refFlat file */
def makeIntervalFromRefFlat(inFile: File): Iterator[Interval] = ???
// convert coordinate to 1-based fully closed
// parse chrom, start blocks, end blocks, strands
/**
* Parses a refFlat file to yield Interval objects
*
* Format description:
* http://genome.csdb.cn/cgi-bin/hgTables?hgsid=6&hgta_doSchemaDb=hg18&hgta_doSchemaTable=refFlat
*
* @param inFile input refFlat file
*/
def makeIntervalFromRefFlat(inFile: File): Iterator[Interval] =
Source.fromFile(inFile)
// read each line
.getLines()
// skip all empty lines
.filterNot(x => x.trim.isEmpty)
// split per column
.map(line => line.trim.split("\t"))
// take chromosome (column 2) and the last two columns: exonStarts and exonEnds.
// After .reverse the order is (exonEnds, exonStarts).
.map(x => (x(2), x.reverse.take(2)))
// split the comma-separated exon start and end coordinate lists
.map(x => (x._1, x._2.map(y => y.split(","))))
// zip exonStarts (_2(1)) with exonEnds (_2(0)); indices are swapped because of
// the .reverse above, so each pair is (start, end)
.map(x => (x._1, x._2(1).zip(x._2(0))))
// make Intervals; refFlat starts are 0-based half-open, so +1 converts the
// start to the 1-based fully-closed convention Interval expects
.map(x => x._2.map(y => new Interval(x._1, y._1.toInt + 1, y._2.toInt)))
// flatten the per-transcript exon lists into one iterator of Intervals
.flatten
/** Function to create iterator from GTF file */
def makeIntervalFromGtf(inFile: File): Iterator[Interval] = ???
// convert coordinate to 1-based fully closed
// parse chrom, start blocks, end blocks, strands
/**
 * Parses a GTF file to yield Interval objects for features whose type
 * (GTF column 3) equals the enclosing `gtfFeatureType` (e.g. "exon").
 *
 * GTF coordinates are already 1-based and fully closed, so start (column 4)
 * and end (column 5) are used as-is.
 *
 * @param inFile input GTF file
 * @return iterator of Interval objects, one per matching feature line
 */
def makeIntervalFromGtf(inFile: File): Iterator[Interval] =
  Source.fromFile(inFile)
    // read each line
    .getLines()
    // skip all empty lines
    .filterNot(x => x.trim.isEmpty)
    // skip UCSC track/browser lines and comment lines wherever they occur.
    // The previous `dropWhile(x => x.matches("^track | ^browser | ^#"))` never
    // fired: String.matches anchors the whole string, the alternation contained
    // literal spaces, and dropWhile only inspects a leading prefix — so header
    // lines fell through to the column parsing below and crashed it.
    .filterNot(x => x.startsWith("track") || x.startsWith("browser") || x.startsWith("#"))
    // split to columns
    .map(x => x.split("\t"))
    // exclude intervals whose type is different from the supplied one
    .filter(x => x(2) == gtfFeatureType)
    // and finally create the interval objects
    .map(x => new Interval(x(0), x(3).toInt, x(4).toInt))
// detect interval file format from extension
val iterFunc: (File => Iterator[Interval]) =
if (getExtension(inFile.toString.toLowerCase) == "bed")
makeIntervalFromBed
else if (getExtension(inFile.toString.toLowerCase) == "refflat")
makeIntervalFromRefFlat
else if (getExtension(inFile.toString.toLowerCase) == "gtf")
makeIntervalFromGtf
else
throw new IllegalArgumentException("Unexpected interval file type: " + inFile.getPath)
......@@ -107,7 +150,7 @@ object WipeReads extends ToolCommand {
acc match {
case head :: tail if x.intersects(head) =>
new Interval(x.getSequence, min(x.getStart, head.getStart), max(x.getEnd, head.getEnd)) :: tail
case _ => x :: acc
case _ => x :: acc
}
}
)
......@@ -150,12 +193,10 @@ object WipeReads extends ToolCommand {
else if (iv.getSequence.startsWith("chr") && getIndex(iv.getSequence.substring(3)) > -1) {
logger.warn("Removing 'chr' prefix from interval " + iv.toString)