From 75b7230cec865ee98e62b9eb82d810e0a6f03130 Mon Sep 17 00:00:00 2001
From: Wai Yi Leung <w.y.leung@lumc.nl>
Date: Fri, 29 Jan 2016 11:12:07 +0100
Subject: [PATCH] Added insert size estimation. Fix pindel2vcf dependency
 (depend on pindel, use beforegraph to set outputpath)

---
 .../extensions/pindel/PindelCaller.scala      | 17 ++++-----
 .../biopet/extensions/pindel/PindelVCF.scala  |  4 ++-
 .../nl/lumc/sasc/biopet/utils/BamUtils.scala  | 35 +++++++++++++------
 .../pipelines/shiva/svcallers/Pindel.scala    |  5 ++-
 4 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/pindel/PindelCaller.scala b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/pindel/PindelCaller.scala
index 426dd52dd..2e1c37985 100644
--- a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/pindel/PindelCaller.scala
+++ b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/pindel/PindelCaller.scala
@@ -59,19 +59,19 @@ class PindelCaller(val root: Configurable) extends BiopetCommandLineFunction wit
   @Output(doc = "Output file of pindel, pointing to the DEL file")
   var outputFile: File = _
 
-  @Output(doc="", required=false)
+  @Output(doc = "", required = false)
   var outputINV: File = _
-  @Output(doc="", required=false)
+  @Output(doc = "", required = false)
   var outputTD: File = _
-  @Output(doc="", required=false)
+  @Output(doc = "", required = false)
   var outputLI: File = _
-  @Output(doc="", required=false)
+  @Output(doc = "", required = false)
   var outputBP: File = _
-  @Output(doc="", required=false)
+  @Output(doc = "", required = false)
   var outputSI: File = _
-  @Output(doc="", required=false)
+  @Output(doc = "", required = false)
   var outputRP: File = _
-  @Output(doc="", required=false)
+  @Output(doc = "", required = false)
   var outputCloseEndMapped: File = _
 
   var RP: Option[Int] = config("RP")
@@ -148,7 +148,7 @@ class PindelCaller(val root: Configurable) extends BiopetCommandLineFunction wit
     if (reportLongInsertions) {
       outputLI = new File(outputPrefix + File.separator, "sample_LI")
     }
-    if (reportBreakpoints){
+    if (reportBreakpoints) {
       outputBP = new File(outputPrefix + File.separator, "sample_BP")
     }
     outputSI = new File(outputPrefix + File.separator, "sample_SI")
@@ -215,6 +215,7 @@ object PindelCaller {
     val caller = new PindelCaller(root)
     caller.configFile = Some(configFile)
     caller.outputPrefix = outputDir
+    caller.beforeGraph
     caller
   }
 }
diff --git a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/pindel/PindelVCF.scala b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/pindel/PindelVCF.scala
index 6d69531e8..7dc6ef154 100644
--- a/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/pindel/PindelVCF.scala
+++ b/public/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/pindel/PindelVCF.scala
@@ -32,9 +32,10 @@ class PindelVCF(val root: Configurable) extends BiopetCommandLineFunction with R
 
   override def beforeGraph: Unit = {
     if (reference == null) reference = referenceFasta()
+    println(pindelOutputInputHolder)
   }
 
-  @Input
+  @Input(doc = "Make this file a dependency before pindel2vcf can run. Usually a file generated by Pindel such as a _D file")
   var pindelOutputInputHolder: File = _
 
   var pindelOutput: Option[File] = config("pindel_output")
@@ -66,6 +67,7 @@ class PindelVCF(val root: Configurable) extends BiopetCommandLineFunction with R
     required("--reference", reference) +
     required("--reference_name", referenceSpecies) +
     required("--reference_date", rDate) +
+    required("--fake_biopet_input_holder", pindelOutputInputHolder) +
     optional("--pindel_output", pindelOutput) +
     optional("--pindel_output_root", pindelOutputRoot) +
     required("--vcf", outputVCF) +
diff --git a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala
index 18ba1bdbd..7b7e2ee02 100644
--- a/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala
+++ b/public/biopet-utils/src/main/scala/nl/lumc/sasc/biopet/utils/BamUtils.scala
@@ -41,13 +41,15 @@ object BamUtils {
    * @param end postion to stop scanning
    * @return Int with insertsize for this contig
    */
-  def contigInsertSize(inputBam: File, contig: String, end: Int): Option[Int] = {
+  def contigInsertSize(inputBam: File, contig: String, start: Int, end: Int, samplingSize: Int = 100000): Option[Int] = {
     val inputSam: SamReader = SamReaderFactory.makeDefault.open(inputBam)
-    val samIterator = inputSam.query(contig, 1, end, true)
+    val samIterator = inputSam.query(contig, start, end, true)
     val insertsizes: List[Int] = (for {
       read <- samIterator.toStream.takeWhile(rec => {
-        rec.getReadPairedFlag && ((rec.getReadUnmappedFlag == false) && (rec.getMateUnmappedFlag == false) && rec.getProperPairFlag)
-      }).take(100000)
+        val paired = rec.getReadPairedFlag && rec.getProperPairFlag
+        val bothMapped = (rec.getReadUnmappedFlag == false) && (rec.getMateUnmappedFlag == false)
+        paired && bothMapped
+      }).take(samplingSize)
     } yield {
       read.getInferredInsertSize.asInstanceOf[Int].abs
     })(collection.breakOut)
@@ -60,19 +62,30 @@ object BamUtils {
   }
 
   /**
-   * Estimate the insertsize for each bam file and return Map[<sampleName>, <insertSize>]
+   * Estimate the insertsize for one single bamfile and return the insertsize
    *
-   * @param bamFiles input bam files
+   * @param bamFile bamfile to estimate avg insertsize from
    * @return
    */
-  def sampleBamInsertSize(bamFiles: List[File]): immutable.ParMap[File, Int] = bamFiles.par.map { bamFile =>
+  def sampleBamInsertSize(bamFile: File): Int = {
     val inputSam: SamReader = SamReaderFactory.makeDefault.open(bamFile)
     val baminsertsizes = inputSam.getFileHeader.getSequenceDictionary.getSequences.par.map({
-      contig => BamUtils.contigInsertSize(bamFile, contig.getSequenceName, contig.getSequenceLength)
+      contig => BamUtils.contigInsertSize(bamFile, contig.getSequenceName, 1, contig.getSequenceLength)
     }).toList
-    val sum = baminsertsizes.flatMap(x => x).reduceLeft(_ + _)
-    val n = baminsertsizes.flatMap(x => x).size
-    bamFile -> (sum / n).toInt
+    val counts = baminsertsizes.flatMap(x => x)
+    val sum = counts.reduceLeft(_ + _)
+    val n = counts.size
+    sum / n
+  }
+
+  /**
+   * Estimate the insertsize for each bam file and return Map[<sampleBamFile>, <insertSize>]
+   *
+   * @param bamFiles input bam files
+   * @return
+   */
+  def sampleBamInsertSize(bamFiles: List[File]): immutable.ParMap[File, Int] = bamFiles.par.map { bamFile =>
+    bamFile -> sampleBamInsertSize(bamFile)
   }.toMap
 
 }
diff --git a/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/svcallers/Pindel.scala b/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/svcallers/Pindel.scala
index a318c5eb9..a452c0fee 100644
--- a/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/svcallers/Pindel.scala
+++ b/public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/svcallers/Pindel.scala
@@ -21,6 +21,7 @@ import java.util.Calendar
 
 import nl.lumc.sasc.biopet.core.PipelineCommand
 import nl.lumc.sasc.biopet.extensions.pindel._
+import nl.lumc.sasc.biopet.utils.BamUtils
 import nl.lumc.sasc.biopet.utils.config.Configurable
 
 /// Pindel is actually a mini pipeline executing binaries from the pindel package
@@ -42,8 +43,10 @@ class Pindel(val root: Configurable) extends SvCaller {
       val cfg = new PindelConfig(this)
       cfg.input = bamFile
 
+      val insertSize: Int = BamUtils.sampleBamInsertSize(bamFile)
+
       // FIXME: get the real insert size of the bam (from bammetrics?)
-      cfg.insertsize = 500
+      cfg.insertsize = insertSize
       cfg.sampleName = sample
       cfg.output = config_file
       add(cfg)
-- 
GitLab