From 3f2913f81f797bd025f3cd6bc80bf0b4b019f1b8 Mon Sep 17 00:00:00 2001
From: Peter van 't Hof <p.j.van_t_hof@lumc.nl>
Date: Tue, 20 May 2014 11:10:31 +0200
Subject: [PATCH] - Fixed memory issue in Variant Recalibration

---
 .../sasc/biopet/pipelines/gatk/Gatk.scala     | 89 ++++++++++---------
 .../biopet/pipelines/mapping/Mapping.scala    | 63 ++++++-------
 2 files changed, 79 insertions(+), 73 deletions(-)

diff --git a/gatk/src/main/java/nl/lumc/sasc/biopet/pipelines/gatk/Gatk.scala b/gatk/src/main/java/nl/lumc/sasc/biopet/pipelines/gatk/Gatk.scala
index 9beec1e93..ca21c3cf6 100644
--- a/gatk/src/main/java/nl/lumc/sasc/biopet/pipelines/gatk/Gatk.scala
+++ b/gatk/src/main/java/nl/lumc/sasc/biopet/pipelines/gatk/Gatk.scala
@@ -60,58 +60,63 @@ class Gatk(private var globalConfig: Config) extends QScript {
         add(genotypeGVCFs)
         
         //Snp recal
-        val snpVariantRecalibrator = new VariantRecalibrator() with gatkArguments
-        snpVariantRecalibrator.input +:= genotypeGVCFs.out
-        snpVariantRecalibrator.nt = 8
-        snpVariantRecalibrator.recal_file = swapExt(genotypeGVCFs.out,".vcf",".snp.recal")
-        snpVariantRecalibrator.tranches_file = swapExt(genotypeGVCFs.out,".vcf",".snp.tranches")
-        snpVariantRecalibrator.resource :+= new TaggedFile(config.getAsString("hapmap"), "known=false,training=true,truth=true,prior=15.0")
-        snpVariantRecalibrator.resource :+= new TaggedFile(config.getAsString("omni"), "known=false,training=true,truth=true,prior=12.0")
-        snpVariantRecalibrator.resource :+= new TaggedFile(config.getAsString("1000G"), "known=false,training=true,truth=false,prior=10.0")
-        snpVariantRecalibrator.resource :+= new TaggedFile(config.getAsString("dbsnp"), "known=true,training=false,truth=false,prior=2.0")
-        snpVariantRecalibrator.an = Seq("QD","MQ","MQRankSum","ReadPosRankSum","FS","DP","InbreedingCoeff")
-        snpVariantRecalibrator.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP
+        val snpVariantRecalibrator = new VariantRecalibrator() with gatkArguments {
+          input +:= genotypeGVCFs.out
+          nt = 4; memoryLimit = 2 * nt
+          recal_file = swapExt(genotypeGVCFs.out,".vcf",".snp.recal")
+          tranches_file = swapExt(genotypeGVCFs.out,".vcf",".snp.tranches")
+          resource = Seq(new TaggedFile(config.getAsString("hapmap"), "known=false,training=true,truth=true,prior=15.0"),
+          new TaggedFile(config.getAsString("omni"), "known=false,training=true,truth=true,prior=12.0"),
+          new TaggedFile(config.getAsString("1000G"), "known=false,training=true,truth=false,prior=10.0"),
+          new TaggedFile(config.getAsString("dbsnp"), "known=true,training=false,truth=false,prior=2.0"))
+          an = Seq("QD","MQ","MQRankSum","ReadPosRankSum","FS","DP","InbreedingCoeff")
+          mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP
+        }
         add(snpVariantRecalibrator)
         
-        val snpApplyRecalibration = new ApplyRecalibration() with gatkArguments
-        snpApplyRecalibration.input +:= genotypeGVCFs.out
-        snpApplyRecalibration.recal_file = snpVariantRecalibrator.recal_file
-        snpApplyRecalibration.tranches_file = snpVariantRecalibrator.tranches_file
-        snpApplyRecalibration.out = swapExt(genotypeGVCFs.out,".vcf",".snp.recal.vcf")
-        snpApplyRecalibration.ts_filter_level = 99.5
-        snpApplyRecalibration.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP
-        snpApplyRecalibration.nt = 3
-        snpApplyRecalibration.scatterCount = scatterCount
+        val snpApplyRecalibration = new ApplyRecalibration() with gatkArguments {
+          input +:= genotypeGVCFs.out
+          recal_file = snpVariantRecalibrator.recal_file
+          tranches_file = snpVariantRecalibrator.tranches_file
+          out = swapExt(genotypeGVCFs.out,".vcf",".snp.recal.vcf")
+          ts_filter_level = 99.5
+          mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.SNP
+          nt = 3; memoryLimit = 2 * nt
+          scatterCount = scatterCount
+        }
         add(snpApplyRecalibration)
         
         //indel recal
-        val indelVariantRecalibrator = new VariantRecalibrator() with gatkArguments
-        indelVariantRecalibrator.input +:= genotypeGVCFs.out
-        indelVariantRecalibrator.nt = 8
-        indelVariantRecalibrator.recal_file = swapExt(genotypeGVCFs.out,".vcf",".indel.recal")
-        indelVariantRecalibrator.tranches_file = swapExt(genotypeGVCFs.out,".vcf",".indel.tranches")
-        indelVariantRecalibrator.resource :+= new TaggedFile(config.getAsString("mills"), "known=false,training=true,truth=true,prior=12.0")
-        indelVariantRecalibrator.resource :+= new TaggedFile(config.getAsString("dbsnp"), "known=true,training=false,truth=false,prior=2.0")
-        indelVariantRecalibrator.an = Seq("QD","DP","FS","ReadPosRankSum","MQRankSum","InbreedingCoeff")
-        indelVariantRecalibrator.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL
+        val indelVariantRecalibrator = new VariantRecalibrator() with gatkArguments {
+          input +:= genotypeGVCFs.out
+          nt = 4; memoryLimit = 2 * nt
+          recal_file = swapExt(genotypeGVCFs.out,".vcf",".indel.recal")
+          tranches_file = swapExt(genotypeGVCFs.out,".vcf",".indel.tranches")
+          resource :+= new TaggedFile(config.getAsString("mills"), "known=false,training=true,truth=true,prior=12.0")
+          resource :+= new TaggedFile(config.getAsString("dbsnp"), "known=true,training=false,truth=false,prior=2.0")
+          an = Seq("QD","DP","FS","ReadPosRankSum","MQRankSum","InbreedingCoeff")
+          mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL
+        }
         add(indelVariantRecalibrator)
         
-        val indelApplyRecalibration = new ApplyRecalibration() with gatkArguments
-        indelApplyRecalibration.input +:= genotypeGVCFs.out
-        indelApplyRecalibration.recal_file = indelVariantRecalibrator.recal_file
-        indelApplyRecalibration.tranches_file = indelVariantRecalibrator.tranches_file
-        indelApplyRecalibration.out = swapExt(genotypeGVCFs.out,".vcf",".indel.recal.vcf")
-        indelApplyRecalibration.ts_filter_level = 99.0
-        indelApplyRecalibration.mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL
-        indelApplyRecalibration.nt = 3
-        indelApplyRecalibration.scatterCount = scatterCount
+        val indelApplyRecalibration = new ApplyRecalibration() with gatkArguments {
+          input +:= genotypeGVCFs.out
+          recal_file = indelVariantRecalibrator.recal_file
+          tranches_file = indelVariantRecalibrator.tranches_file
+          out = swapExt(genotypeGVCFs.out,".vcf",".indel.recal.vcf")
+          ts_filter_level = 99.0
+          mode = org.broadinstitute.sting.gatk.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode.INDEL
+          nt = 3; memoryLimit = 2 * nt
+          scatterCount = scatterCount
+        }
         add(indelApplyRecalibration)
         
         // merge snp and indels
-        val catVariants = new CatVariants()
-        catVariants.variant = Seq(snpApplyRecalibration.out,indelApplyRecalibration.out)
-        catVariants.outputFile = swapExt(genotypeGVCFs.out,".vcf",".recal.vcf")
-        catVariants.reference = referenceFile
+        val catVariants = new CatVariants() {
+          variant = Seq(snpApplyRecalibration.out,indelApplyRecalibration.out)
+          outputFile = swapExt(genotypeGVCFs.out,".vcf",".recal.vcf")
+          reference = referenceFile
+        }
         add(catVariants)
       } else logger.warn("No gVCFs to genotype")
       
diff --git a/mapping/src/main/java/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala b/mapping/src/main/java/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala
index 74573e2a1..7f680b3a6 100644
--- a/mapping/src/main/java/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala
+++ b/mapping/src/main/java/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala
@@ -80,7 +80,6 @@ class Mapping(private var globalConfig: Config) extends QScript {
       if (paired) fastq_R2 = flexiprep.outputFiles("output_R2")
     }
     var bamFile:File = ""
-    //var samFile:File = ""
     if (aligner == "bwa") {
       val bwaCommand = new Bwa(config) { R1 = fastq_R1; if (paired) R2 = fastq_R2; 
                                         RG = getReadGroup; output = new File(outputDir + outputName + ".sam") }
@@ -96,46 +95,48 @@ class Mapping(private var globalConfig: Config) extends QScript {
   }
     
   def addSortSam(inputSam:List[File], outputFile:File, dir:String) : File = {
-    val sortSam = new SortSam
-    sortSam.input = inputSam
-    sortSam.createIndex = true
-    sortSam.output = outputFile
-    sortSam.memoryLimit = 2
-    sortSam.nCoresRequest = 2
-    sortSam.jobResourceRequests :+= "h_vmem=4G"
+    val sortSam = new SortSam {
+      input = inputSam
+      createIndex = true
+      output = outputFile
+      memoryLimit = 2
+      nCoresRequest = 2
+      jobResourceRequests :+= "h_vmem=4G"
+    }
     add(sortSam)
     
     return sortSam.output
   }
   
   def addAddOrReplaceReadGroups(inputSam:List[File], outputFile:File, dir:String) : File = {
-    val addOrReplaceReadGroups = new AddOrReplaceReadGroups
-    addOrReplaceReadGroups.input = inputSam
-    addOrReplaceReadGroups.createIndex = true
-    addOrReplaceReadGroups.output = outputFile
-    addOrReplaceReadGroups.memoryLimit = 2
-    addOrReplaceReadGroups.nCoresRequest = 2
-    addOrReplaceReadGroups.jobResourceRequests :+= "h_vmem=4G"
-    
-    addOrReplaceReadGroups.RGLB = RGLB
-    addOrReplaceReadGroups.RGPL = RGPL
-    addOrReplaceReadGroups.RGPU = RGPU
-    addOrReplaceReadGroups.RGSM = RGSM
-    if (RGCN != null) addOrReplaceReadGroups.RGCN = RGCN
-    if (RGDS != null) addOrReplaceReadGroups.RGDS = RGDS
-    
+    val addOrReplaceReadGroups = new AddOrReplaceReadGroups {
+      input = inputSam
+      output = outputFile
+      createIndex = true
+      memoryLimit = 2
+      nCoresRequest = 2
+      jobResourceRequests :+= "h_vmem=4G"
+
+      this.RGLB = RGLB
+      this.RGPL = RGPL
+      this.RGPU = RGPU
+      this.RGSM = RGSM
+      if (RGCN != null) this.RGCN = RGCN
+      if (RGDS != null) this.RGDS = RGDS
+    }
     return addOrReplaceReadGroups.output
   }
   
   def addMarkDuplicates(inputBams:List[File], outputFile:File, dir:String) : File = {
-    val markDuplicates = new MarkDuplicates
-    markDuplicates.input = inputBams
-    markDuplicates.output = outputFile
-    markDuplicates.REMOVE_DUPLICATES = false
-    markDuplicates.metrics = swapExt(dir,outputFile,".bam",".metrics")
-    markDuplicates.outputIndex = swapExt(dir,markDuplicates.output,".bam",".bai")
-    markDuplicates.memoryLimit = 2
-    markDuplicates.jobResourceRequests :+= "h_vmem=4G"
+    val markDuplicates = new MarkDuplicates {
+      input = inputBams
+      output = outputFile
+      REMOVE_DUPLICATES = false
+      metrics = swapExt(dir,outputFile,".bam",".metrics")
+      outputIndex = swapExt(dir,this.output,".bam",".bai")
+      memoryLimit = 2
+      jobResourceRequests :+= "h_vmem=4G"
+    }
     add(markDuplicates)
     
     return markDuplicates.output
-- 
GitLab