Commit 7d053e65 authored by pjvan_thof's avatar pjvan_thof
Browse files

Adding option to limit number of bins in 1 job

parent c5d042c6
......@@ -2,6 +2,14 @@ package nl.lumc.sasc.biopet.tools.vcfstats
import java.io.File
import nl.lumc.sasc.biopet.tools.vcfstats.VcfStats.{
defaultGenotypeFields,
defaultInfoFields,
generalWiggleOptions,
genotypeWiggleOptions
}
import nl.lumc.sasc.biopet.utils.AbstractOptParser
/**
* Commandline argument for vcfstats
*
......@@ -16,9 +24,78 @@ case class VcfStatsArgs(inputFile: File = null,
allInfoTags: Boolean = false,
allGenotypeTags: Boolean = false,
binSize: Int = 10000000,
maxContigsInSingleJob: Int = 250,
writeBinStats: Boolean = false,
generalWiggle: List[String] = Nil,
genotypeWiggle: List[String] = Nil,
localThreads: Int = 1,
sparkMaster: Option[String] = None,
contigSampleOverlapPlots: Boolean = false)
/**
  * Command line parser for the vcfstats tool.
  *
  * Every option folds its parsed value into a [[VcfStatsArgs]] through `copy`.
  * The repeatable list options (`--infoTag`, `--genotypeTag`, `--generalWiggle`,
  * `--genotypeWiggle`) prepend each value, so the resulting lists are in reverse
  * command line order. Requesting any wiggle track also switches on `writeBinStats`.
  *
  * Created by pjvanthof on 18/07/2017.
  *
  * @param cmdName name of the command, handed to [[AbstractOptParser]]
  */
class VcfStatsOptParser(cmdName: String) extends AbstractOptParser[VcfStatsArgs](cmdName) {

  /** Builds a validator that only accepts strictly positive values for option `name`. */
  private def positive(name: String): Int => Either[String, Unit] =
    value =>
      if (value > 0) success
      else failure(s"Option --$name must be a positive integer, got: $value")

  opt[File]('I', "inputFile")
    .required()
    .unbounded()
    .maxOccurs(1)
    .valueName("<file>")
    .action((x, c) => c.copy(inputFile = x.getAbsoluteFile))
    // The check is about existence, so say so instead of claiming the option is missing.
    .validate(x => if (x.exists) success else failure(s"Input VCF does not exist: $x"))
    .text("Input VCF file (required)")

  opt[File]('R', "referenceFile")
    .required()
    .unbounded()
    .maxOccurs(1)
    .valueName("<file>")
    .action((x, c) => c.copy(referenceFile = x))
    .validate(x => if (x.exists) success else failure(s"Reference file does not exist: $x"))
    .text("Fasta reference which was used to call input VCF (required)")

  opt[File]('o', "outputDir")
    .required()
    .unbounded()
    .maxOccurs(1)
    .valueName("<file>")
    .action((x, c) => c.copy(outputDir = x))
    .validate { x =>
      if (x == null) failure("Valid output directory required")
      else if (x.exists) success
      else failure(s"Output directory does not exist: $x")
    }
    .text("Path to directory for output (required)")

  opt[File]('i', "intervals")
    .unbounded()
    .valueName("<file>")
    .action((x, c) => c.copy(intervals = Some(x)))
    .text("Path to interval (BED) file (optional)")

  opt[String]("infoTag")
    .unbounded()
    .valueName("<tag>")
    .action((x, c) => c.copy(infoTags = x :: c.infoTags))
    .text(s"Summarize these info tags. Default is (${defaultInfoFields.mkString(", ")})")

  opt[String]("genotypeTag")
    .unbounded()
    .valueName("<tag>")
    .action((x, c) => c.copy(genotypeTags = x :: c.genotypeTags))
    .text(s"Summarize these genotype tags. Default is (${defaultGenotypeFields.mkString(", ")})")

  opt[Unit]("allInfoTags")
    .unbounded()
    .action((_, c) => c.copy(allInfoTags = true))
    .text("Summarize all info tags. Default false")

  opt[Unit]("allGenotypeTags")
    .unbounded()
    .action((_, c) => c.copy(allGenotypeTags = true))
    .text("Summarize all genotype tags. Default false")

  opt[Int]("binSize")
    .unbounded()
    .action((x, c) => c.copy(binSize = x))
    // A zero or negative bin size cannot describe a region length.
    .validate(positive("binSize"))
    .text("Binsize in estimated base pairs")

  opt[Int]("maxContigsInSingleJob")
    .unbounded()
    .action((x, c) => c.copy(maxContigsInSingleJob = x))
    .validate(positive("maxContigsInSingleJob"))
    .text("Max number of bins to be combined, default is 250")

  opt[Unit]("writeBinStats")
    .unbounded()
    .action((_, c) => c.copy(writeBinStats = true))
    .text("Write bin statistics. Default False")

  opt[String]("generalWiggle")
    .unbounded()
    // Wiggle tracks are built from bin statistics, so asking for one enables them.
    .action((x, c) => c.copy(generalWiggle = x :: c.generalWiggle, writeBinStats = true))
    .validate { x =>
      if (generalWiggleOptions.contains(x)) success else failure(s"""Nonexistent field $x""")
    }
    .text(s"""Create a wiggle track with bin size <binSize> for any of the following statistics:
             |${generalWiggleOptions.mkString(", ")}""".stripMargin)

  opt[String]("genotypeWiggle")
    .unbounded()
    .action((x, c) => c.copy(genotypeWiggle = x :: c.genotypeWiggle, writeBinStats = true))
    .validate { x =>
      if (genotypeWiggleOptions.contains(x)) success else failure(s"""Non-existent field $x""")
    }
    .text(s"""Create a wiggle track with bin size <binSize> for any of the following genotype fields:
             |${genotypeWiggleOptions.mkString(", ")}""".stripMargin)

  opt[Int]('t', "localThreads")
    .unbounded()
    .action((x, c) => c.copy(localThreads = x))
    .validate(positive("localThreads"))
    .text("Number of local threads to use")

  opt[String]("sparkMaster")
    .unbounded()
    .action((x, c) => c.copy(sparkMaster = Some(x)))
    .text("Spark master to use")
}
package nl.lumc.sasc.biopet.tools.vcfstats
import java.io.File
import nl.lumc.sasc.biopet.tools.vcfstats.VcfStats.{
defaultGenotypeFields,
defaultInfoFields,
generalWiggleOptions,
genotypeWiggleOptions
}
import nl.lumc.sasc.biopet.utils.AbstractOptParser
/**
  * Command line parser for the vcfstats tool.
  *
  * Every option folds its parsed value into a [[VcfStatsArgs]] through `copy`.
  * The repeatable list options (`--infoTag`, `--genotypeTag`, `--generalWiggle`,
  * `--genotypeWiggle`) prepend each value, so the resulting lists are in reverse
  * command line order. Requesting any wiggle track also switches on `writeBinStats`.
  *
  * Created by pjvanthof on 18/07/2017.
  *
  * @param cmdName name of the command, handed to [[AbstractOptParser]]
  */
class VcfStatsOptParser(cmdName: String) extends AbstractOptParser[VcfStatsArgs](cmdName) {

  /** Builds a validator that only accepts strictly positive values for option `name`. */
  private def positive(name: String): Int => Either[String, Unit] =
    value =>
      if (value > 0) success
      else failure(s"Option --$name must be a positive integer, got: $value")

  opt[File]('I', "inputFile")
    .required()
    .unbounded()
    .maxOccurs(1)
    .valueName("<file>")
    .action((x, c) => c.copy(inputFile = x.getAbsoluteFile))
    // The check is about existence, so say so instead of claiming the option is missing.
    .validate(x => if (x.exists) success else failure(s"Input VCF does not exist: $x"))
    .text("Input VCF file (required)")

  opt[File]('R', "referenceFile")
    .required()
    .unbounded()
    .maxOccurs(1)
    .valueName("<file>")
    .action((x, c) => c.copy(referenceFile = x))
    .validate(x => if (x.exists) success else failure(s"Reference file does not exist: $x"))
    .text("Fasta reference which was used to call input VCF (required)")

  opt[File]('o', "outputDir")
    .required()
    .unbounded()
    .maxOccurs(1)
    .valueName("<file>")
    .action((x, c) => c.copy(outputDir = x))
    .validate { x =>
      if (x == null) failure("Valid output directory required")
      else if (x.exists) success
      else failure(s"Output directory does not exist: $x")
    }
    .text("Path to directory for output (required)")

  opt[File]('i', "intervals")
    .unbounded()
    .valueName("<file>")
    .action((x, c) => c.copy(intervals = Some(x)))
    .text("Path to interval (BED) file (optional)")

  opt[String]("infoTag")
    .unbounded()
    .valueName("<tag>")
    .action((x, c) => c.copy(infoTags = x :: c.infoTags))
    .text(s"Summarize these info tags. Default is (${defaultInfoFields.mkString(", ")})")

  opt[String]("genotypeTag")
    .unbounded()
    .valueName("<tag>")
    .action((x, c) => c.copy(genotypeTags = x :: c.genotypeTags))
    .text(s"Summarize these genotype tags. Default is (${defaultGenotypeFields.mkString(", ")})")

  opt[Unit]("allInfoTags")
    .unbounded()
    .action((_, c) => c.copy(allInfoTags = true))
    .text("Summarize all info tags. Default false")

  opt[Unit]("allGenotypeTags")
    .unbounded()
    .action((_, c) => c.copy(allGenotypeTags = true))
    .text("Summarize all genotype tags. Default false")

  opt[Int]("binSize")
    .unbounded()
    .action((x, c) => c.copy(binSize = x))
    // A zero or negative bin size cannot describe a region length.
    .validate(positive("binSize"))
    .text("Binsize in estimated base pairs")

  opt[Unit]("writeBinStats")
    .unbounded()
    .action((_, c) => c.copy(writeBinStats = true))
    .text("Write bin statistics. Default False")

  opt[String]("generalWiggle")
    .unbounded()
    // Wiggle tracks are built from bin statistics, so asking for one enables them.
    .action((x, c) => c.copy(generalWiggle = x :: c.generalWiggle, writeBinStats = true))
    .validate { x =>
      if (generalWiggleOptions.contains(x)) success else failure(s"""Nonexistent field $x""")
    }
    .text(s"""Create a wiggle track with bin size <binSize> for any of the following statistics:
             |${generalWiggleOptions.mkString(", ")}""".stripMargin)

  opt[String]("genotypeWiggle")
    .unbounded()
    .action((x, c) => c.copy(genotypeWiggle = x :: c.genotypeWiggle, writeBinStats = true))
    .validate { x =>
      if (genotypeWiggleOptions.contains(x)) success else failure(s"""Non-existent field $x""")
    }
    .text(s"""Create a wiggle track with bin size <binSize> for any of the following genotype fields:
             |${genotypeWiggleOptions.mkString(", ")}""".stripMargin)

  opt[Int]('t', "localThreads")
    .unbounded()
    .action((x, c) => c.copy(localThreads = x))
    .validate(positive("localThreads"))
    .text("Number of local threads to use")

  opt[String]("sparkMaster")
    .unbounded()
    .action((x, c) => c.copy(sparkMaster = Some(x)))
    .text("Spark master to use")
}
......@@ -72,7 +72,7 @@ object VcfStatsSpark extends ToolCommand {
BedRecordList.fromFile(i).validateContigs(cmdArgs.referenceFile)
case _ => BedRecordList.fromReference(cmdArgs.referenceFile)
}).combineOverlap
.scatter(cmdArgs.binSize)
.scatter(cmdArgs.binSize, maxContigsInSingleJob = Some(cmdArgs.maxContigsInSingleJob))
val contigs = regions.flatMap(_.map(_.chr)).distinct
val regionStats = sc
......
......@@ -106,7 +106,9 @@ case class BedRecordList(chrRecords: Map[String, List[BedRecord]], header: List[
})
}
def scatter(binSize: Int, combineContigs: Boolean = true): List[List[BedRecord]] = {
def scatter(binSize: Int,
combineContigs: Boolean = true,
maxContigsInSingleJob: Option[Int] = None): List[List[BedRecord]] = {
val list = allRecords
.flatMap(_.scatter(binSize))
.toList
......@@ -119,7 +121,9 @@ case class BedRecordList(chrRecords: Map[String, List[BedRecord]], header: List[
val bufferSize = buffer.map(_.length).sum
if (!combineContigs && buffer.head.chr != record.chr)
(buffer :: finalList, List(record))
else if (bufferSize < (binSize / 2)) (finalList, record :: buffer)
else if (bufferSize < (binSize / 2) &&
buffer.size < maxContigsInSingleJob.getOrElse(Int.MaxValue))
(finalList, record :: buffer)
else (buffer :: finalList, List(record))
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment