Commit 8cab8c38 authored by Peter van 't Hof
Browse files

Merge remote-tracking branch 'remotes/origin/develop' into feature-snp_test

parents 0e122e1f 4caab1f4
......@@ -80,6 +80,15 @@ trait BiopetCommandLineFunction extends CommandLineResources { biopetFunction =>
def beforeGraph() {}
override def freezeFieldValues() {
this match {
case r: Reference =>
if (r.dictRequired) deps :+= r.referenceDict
if (r.faiRequired) deps :+= r.referenceFai
deps = deps.distinct
case _ =>
}
preProcessExecutable()
beforeGraph()
internalBeforeGraph()
......
......@@ -61,25 +61,25 @@ trait Reference extends Configurable {
}
/** When set override this on true the pipeline with raise an exception when fai index is not found */
protected def faiRequired = false
def faiRequired = false
/** When set override this on true the pipeline with raise an exception when dict index is not found */
protected def dictRequired = this.isInstanceOf[Summarizable] || this.isInstanceOf[SummaryQScript]
def dictRequired = this.isInstanceOf[Summarizable] || this.isInstanceOf[SummaryQScript]
/** Returns the dict file belonging to the fasta file */
def referenceDict = new File(referenceFasta().getAbsolutePath
.stripSuffix(".fa")
.stripSuffix(".fasta")
.stripSuffix(".fna") + ".dict")
/** Returns the fai file belonging to the fasta file */
def referenceFai = new File(referenceFasta().getAbsolutePath + ".fai")
/** Returns the fasta file */
def referenceFasta(): File = {
val file: File = config("reference_fasta")
if (config.contains("reference_fasta")) {
checkFasta(file)
val dict = new File(file.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta").stripSuffix(".fna") + ".dict")
val fai = new File(file.getAbsolutePath + ".fai")
this match {
case c: BiopetCommandLineFunction => c.deps :::= dict :: fai :: Nil
case _ =>
}
} else {
if (config.contains("reference_fasta")) checkFasta(file)
else {
val defaults = ConfigUtils.mergeMaps(this.defaults, this.internalDefaults)
def getReferences(map: Map[String, Any]): Set[(String, String)] = (for (
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.picard
import java.io.File
import nl.lumc.sasc.biopet.core.Reference
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
/** Extension for picard SortVcf */
class SortVcf(val root: Configurable) extends Picard with Reference {
javaMainClass = new picard.vcf.SortVcf().getClass.getName
@Input(doc = "Input VCF(s) to be sorted. Multiple inputs must have the same sample names (in order)", required = true)
var input: File = _
@Output(doc = "Output VCF to be written.", required = true)
var output: File = _
@Input(doc = "Sequence dictionary to use", required = true)
var sequenceDictionary: File = _
override val dictRequired = true
override def beforeGraph(): Unit = {
super.beforeGraph()
if (sequenceDictionary == null) sequenceDictionary = referenceDict
}
/** Returns command to execute */
override def cmdLine = super.cmdLine +
(if (inputAsStdin) required("INPUT=", new File("/dev/stdin"), spaceSeparated = false)
else required("INPUT=", input, spaceSeparated = false)) +
(if (outputAsStsout) required("OUTPUT=", new File("/dev/stdout"), spaceSeparated = false)
else required("OUTPUT=", output, spaceSeparated = false)) +
required("SEQUENCE_DICTIONARY=", sequenceDictionary, spaceSeparated = false)
}
object SortVcf {
/** Returns default SortSam */
def apply(root: Configurable, input: File, output: File): SortVcf = {
val sortVcf = new SortVcf(root)
sortVcf.input = input
sortVcf.output = output
sortVcf
}
}
\ No newline at end of file
......@@ -263,7 +263,7 @@ object VcfFilter extends ToolCommand {
def minAlternateDepth(record: VariantContext, minAlternateDepth: Int, minSamplesPass: Int = 1): Boolean = {
record.getGenotypes.count(genotype => {
val AD = if (genotype.hasAD) List(genotype.getAD: _*) else Nil
if (AD.nonEmpty) AD.tail.count(_ >= minAlternateDepth) > 0 else true
if (AD.nonEmpty && minAlternateDepth >= 0) AD.tail.count(_ >= minAlternateDepth) > 0 else true
}) >= minSamplesPass
}
......
......@@ -18,8 +18,10 @@ package nl.lumc.sasc.biopet.utils
import java.io.File
import htsjdk.samtools.{ SamReader, SamReaderFactory }
import nl.lumc.sasc.biopet.utils.intervals.{ BedRecord, BedRecordList }
import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.collection.parallel.immutable
/**
......@@ -57,39 +59,76 @@ object BamUtils {
* @param end position to stop scanning
* @param samplingSize maximum number of records to inspect per bin (default 10000)
* @param binSize size of the bins the region is divided into (default 1000000)
* @return Int with insertsize for this contig
*/
def contigInsertSize(inputBam: File, contig: String, start: Int, end: Int, samplingSize: Int = 100000): Option[Int] = {
val inputSam: SamReader = SamReaderFactory.makeDefault.open(inputBam)
val samIterator = inputSam.query(contig, start, end, true)
val insertsizes: List[Int] = (for {
read <- samIterator.toStream.takeWhile(rec => {
val paired = rec.getReadPairedFlag && rec.getProperPairFlag
val bothMapped = if (paired) ((rec.getReadUnmappedFlag == false) && (rec.getMateUnmappedFlag == false)) else false
paired && bothMapped
}).take(samplingSize)
} yield {
read.getInferredInsertSize.asInstanceOf[Int].abs
})(collection.breakOut)
val cti = insertsizes.foldLeft((0.0, 0))((t, r) => (t._1 + r, t._2 + 1))
samIterator.close()
inputSam.close()
val ret = if (cti._2 == 0) None else Some((cti._1 / cti._2).toInt)
ret
def contigInsertSize(inputBam: File, contig: String, start: Int, end: Int,
samplingSize: Int = 10000,
binSize: Int = 1000000): Option[Int] = {
// create a bedList to devide the contig into multiple pieces
val insertSizesOnAllFragments = BedRecordList.fromList(Seq(BedRecord(contig, start, end)))
.scatter(binSize)
.allRecords.par.flatMap({
bedRecord =>
// for each scatter, open the bamfile for this specific region-query
val inputSam: SamReader = SamReaderFactory.makeDefault.open(inputBam)
val samIterator = inputSam.query(bedRecord.chr, bedRecord.start, bedRecord.end, true)
val counts: mutable.Map[Int, Int] = mutable.Map()
for (i <- 0 until samplingSize if samIterator.hasNext) {
val rec = samIterator.next()
val isPaired = rec.getReadPairedFlag
val minQ10 = rec.getMappingQuality >= 10
val pairOnSameContig = rec.getContig == rec.getMateReferenceName
if (isPaired && minQ10 && pairOnSameContig) {
val insertSize = rec.getInferredInsertSize.abs
counts(insertSize) = counts.getOrElse(insertSize, 0) + 1
}
}
counts.keys.size match {
case 1 => Some(counts.keys.head)
case 0 => None
case _ => {
Some(counts.foldLeft(0)((old, observation) => {
observation match {
case (insertSize: Int, observations: Int) => {
(old + (insertSize * observations)) / (observations + 1)
}
case _ => 0
}
}))
}
}
})
insertSizesOnAllFragments.size match {
case 1 => Some(insertSizesOnAllFragments.head)
case 0 => None
case _ => {
Some(
insertSizesOnAllFragments.foldLeft(0)((old, observation) => {
(old + observation) / 2
}))
}
}
}
/**
* Estimate the insertsize for one single bamfile and return the insertsize
*
* @param bamFile bamfile to estimate average insertsize from
* @return
*/
def sampleBamInsertSize(bamFile: File, samplingSize: Int = 100000): Int = {
def sampleBamInsertSize(bamFile: File, samplingSize: Int = 10000, binSize: Int = 1000000): Int = {
val inputSam: SamReader = SamReaderFactory.makeDefault.open(bamFile)
val baminsertsizes = inputSam.getFileHeader.getSequenceDictionary.getSequences.par.map({
contig => BamUtils.contigInsertSize(bamFile, contig.getSequenceName, 1, contig.getSequenceLength, samplingSize)
val bamInsertSizes = inputSam.getFileHeader.getSequenceDictionary.getSequences.par.map({
contig => BamUtils.contigInsertSize(bamFile, contig.getSequenceName, 1, contig.getSequenceLength, samplingSize, binSize)
}).toList
val counts = baminsertsizes.flatMap(x => x)
val counts = bamInsertSizes.flatMap(x => x)
// avoid division by zero
if (counts.size != 0) {
counts.sum / counts.size
} else {
......@@ -103,8 +142,8 @@ object BamUtils {
* @param bamFiles input bam files
* @return
*/
def sampleBamsInsertSize(bamFiles: List[File], samplingSize: Int = 100000): immutable.ParMap[File, Int] = bamFiles.par.map { bamFile =>
bamFile -> sampleBamInsertSize(bamFile, samplingSize)
def sampleBamsInsertSize(bamFiles: List[File], samplingSize: Int = 10000, binSize: Int = 1000000): immutable.ParMap[File, Int] = bamFiles.par.map { bamFile =>
bamFile -> sampleBamInsertSize(bamFile, samplingSize, binSize)
}.toMap
}
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support LUMC SHARK cluster which is running
# SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
# should also be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that are
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license; For commercial users or users who do not want to follow the AGPL
# license, please contact us to obtain a separate license.
#
# Set root logger level to DEBUG and its only appender to A1.
log4j.rootLogger=ERROR, A1
# A1 is set to be a ConsoleAppender.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 uses PatternLayout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-5p [%d] [%C{1}] - %m%n
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment