Commit 2c2e3865 authored by Peter van 't Hof's avatar Peter van 't Hof

Removing some meta files from the jar

parent 38103728
......@@ -155,6 +155,14 @@
</transformer>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
<executions>
......
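Background on the new shade filter (not part of the commit itself): `META-INF/*.SF`, `*.DSA` and `*.RSA` entries copied from signed dependencies no longer match the merged jar's contents and typically trigger a `SecurityException` ("Invalid signature file digest for Manifest main attributes") when the shaded jar is run, which is why they are excluded while shading. A minimal, hypothetical Scala check that a built jar contains no leftover signature entries:

```scala
import java.util.jar.JarFile
import scala.collection.JavaConverters._

object CheckShadedJar {
  def main(args: Array[String]): Unit = {
    val jar = new JarFile(args(0)) // path to the shaded jar, passed on the command line
    // Signature files that the shade filter above is meant to strip
    val leftover = jar
      .entries()
      .asScala
      .map(_.getName)
      .filter(name =>
        name.startsWith("META-INF/") &&
          (name.endsWith(".SF") || name.endsWith(".DSA") || name.endsWith(".RSA")))
      .toList
    jar.close()
    if (leftover.isEmpty) println("No signature files found in the jar")
    else leftover.foreach(name => println(s"Leftover signature file: $name"))
  }
}
```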
......@@ -163,13 +163,20 @@ case class Stats(generalStats: mutable.Map[String, mutable.Map[String, mutable.M
"info" -> infoFields.map(f => f -> getField(f, contig)).toMap,
"sample_distributions" -> sampleDistributions
.map(f => f -> getField("SampleDistribution-" + f, contig))
.toMap,
"sample_compare" -> Map(
"samples" -> samples,
"genotype_overlap" -> samples.map(sample1 => samples.map(sample2 => samplesStats(sample1).sampleToSample(sample2).genotypeOverlap)),
"allele_overlap" -> samples.map(sample1 => samples.map(sample2 => samplesStats(sample1).sampleToSample(sample2).alleleOverlap))
)
)
.toMap
) ++ (if (contig == "total")
Map(
"sample_compare" -> Map(
"samples" -> samples,
"genotype_overlap" -> samples.map(sample1 =>
samples.map(sample2 =>
samplesStats(sample1).sampleToSample(sample2).genotypeOverlap)),
"allele_overlap" -> samples.map(sample1 =>
samples.map(sample2 =>
samplesStats(sample1).sampleToSample(sample2).alleleOverlap))
)
)
else Map())
}
/** This will generate stats for total */
......
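With this change the "sample_compare" block is emitted only once, under the "total" contig, instead of being repeated for every contig. Its overlap matrices are samples × samples nested lists, indexed in the same order as the "samples" list. A hypothetical sketch of the resulting shape (the counts are arbitrary example values, not real output):

```scala
// Hypothetical example of the extra map added when contig == "total"
val samples = List("sampleA", "sampleB")
val sampleCompare = Map(
  "samples" -> samples,
  // genotypeOverlap counts for every (sample1, sample2) pair, in sample order
  "genotype_overlap" -> List(List(120, 87), List(87, 95)),
  // alleleOverlap counts for the same pairs
  "allele_overlap" -> List(List(240, 170), List(170, 190))
)
```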
......@@ -81,18 +81,18 @@ object VcfStats extends ToolCommand {
)
val genotypeWiggleOptions = List("Total",
"Het",
"HetNonRef",
"Hom",
"HomRef",
"HomVar",
"Mixed",
"NoCall",
"NonInformative",
"Available",
"Called",
"Filtered",
"Variant")
"Het",
"HetNonRef",
"Hom",
"HomRef",
"HomVar",
"Mixed",
"NoCall",
"NonInformative",
"Available",
"Called",
"Filtered",
"Variant")
/** Parsing commandline arguments */
class OptParser extends AbstractOptParser {
......@@ -149,16 +149,16 @@ object VcfStats extends ToolCommand {
if (genotypeWiggleOptions.contains(x)) success else failure(s"""Non-existent field $x""")
} text s"""Create a wiggle track with bin size <binSize> for any of the following genotype fields:
|${genotypeWiggleOptions.mkString(", ")}""".stripMargin
opt[Int]('t',"localThreads")unbounded () action { (x, c) =>
opt[Int]('t', "localThreads") unbounded () action { (x, c) =>
c.copy(localThreads = x)
} text s"Number of local threads to use"
opt[String]("sparkMaster")unbounded () action { (x, c) =>
opt[String]("sparkMaster") unbounded () action { (x, c) =>
c.copy(sparkMaster = Some(x))
} text s"Spark master to use"
}
protected var cmdArgs: Args = _
//protected var cmdArgs: Args = _
val defaultGenotypeFields =
List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general")
......@@ -184,7 +184,7 @@ object VcfStats extends ToolCommand {
def main(args: Array[String]): Unit = {
logger.info("Started")
val argsParser = new OptParser
cmdArgs = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
val cmdArgs = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
logger.info("Init spark context")
......@@ -334,14 +334,14 @@ object VcfStats extends ToolCommand {
// Write general wiggle tracks
for (field <- cmdArgs.generalWiggle) {
val file = new File(cmdArgs.outputDir, "wigs" + File.separator + "general-" + field + ".wig")
writeWiggle(intervals, field, "count", file, genotype = false)
writeWiggle(intervals, field, "count", file, genotype = false, cmdArgs.outputDir)
}
// Write sample wiggle tracks
for (field <- cmdArgs.genotypeWiggle; sample <- samples) {
val file = new File(cmdArgs.outputDir,
"wigs" + File.separator + "genotype-" + sample + "-" + field + ".wig")
writeWiggle(intervals, field, sample, file, genotype = true)
writeWiggle(intervals, field, sample, file, genotype = true, cmdArgs.outputDir)
}
writeOverlap(stats,
......@@ -362,7 +362,8 @@ object VcfStats extends ToolCommand {
row: String,
column: String,
outputFile: File,
genotype: Boolean): Unit = {
genotype: Boolean,
outputDir: File): Unit = {
val groupedIntervals =
intervals.groupBy(_.getContig).map { case (k, v) => k -> v.sortBy(_.getStart) }
outputFile.getParentFile.mkdirs()
......@@ -375,11 +376,11 @@ object VcfStats extends ToolCommand {
val file = {
if (genotype)
new File(
cmdArgs.outputDir,
outputDir,
"bins" + File.separator + chr + File.separator + "genotype-" + interval.getStart + "-" + interval.getEnd + "-general.tsv")
else
new File(
cmdArgs.outputDir,
outputDir,
"bins" + File.separator + chr + File.separator + interval.getStart + "-" + interval.getEnd + "-general.tsv")
}
writer.println(valueFromTsv(file, row, column).getOrElse(0))
......@@ -639,7 +640,8 @@ object VcfStats extends ToolCommand {
def writeOverlap(stats: Stats,
function: SampleToSampleStats => Int,
prefix: String,
samples: List[String]): Unit = {
samples: List[String],
plots: Boolean = true): Unit = {
val absFile = new File(prefix + ".abs.tsv")
val relFile = new File(prefix + ".rel.tsv")
......@@ -662,7 +664,7 @@ object VcfStats extends ToolCommand {
absWriter.close()
relWriter.close()
plotHeatmap(relFile)
if (plots) plotHeatmap(relFile)
}
/** Plots heatmaps on target tsv file */
......
......@@ -9,7 +9,9 @@ import nl.lumc.sasc.biopet.utils.intervals.{BedRecord, BedRecordList}
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.JavaConversions._
import scala.concurrent.{Await, Future}
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration.Duration
/**
* Created by pjvanthof on 14/07/2017.
......@@ -73,15 +75,14 @@ object VcfStatsSpark extends ToolCommand {
if (genotypeWiggleOptions.contains(x)) success else failure(s"""Non-existent field $x""")
} text s"""Create a wiggle track with bin size <binSize> for any of the following genotype fields:
|${genotypeWiggleOptions.mkString(", ")}""".stripMargin
opt[Int]('t',"localThreads")unbounded () action { (x, c) =>
opt[Int]('t', "localThreads") unbounded () action { (x, c) =>
c.copy(localThreads = x)
} text s"Number of local threads to use"
opt[String]("sparkMaster")unbounded () action { (x, c) =>
opt[String]("sparkMaster") unbounded () action { (x, c) =>
c.copy(sparkMaster = Some(x))
} text s"Spark master to use"
}
def main(args: Array[String]): Unit = {
logger.info("Started")
......@@ -97,7 +98,7 @@ object VcfStatsSpark extends ToolCommand {
val adInfoTags = {
(for (infoTag <- cmdArgs.infoTags if !defaultInfoFields.contains(infoTag)) yield {
require(header.getInfoHeaderLine(infoTag) != null,
"Info tag '" + infoTag + "' not found in header of vcf file")
"Info tag '" + infoTag + "' not found in header of vcf file")
infoTag
}) ::: (for (line <- header.getInfoHeaderLines if cmdArgs.allInfoTags
if !defaultInfoFields.contains(line.getID)
......@@ -109,7 +110,7 @@ object VcfStatsSpark extends ToolCommand {
val adGenotypeTags = (for (genotypeTag <- cmdArgs.genotypeTags
if !defaultGenotypeFields.contains(genotypeTag)) yield {
require(header.getFormatHeaderLine(genotypeTag) != null,
"Format tag '" + genotypeTag + "' not found in header of vcf file")
"Format tag '" + genotypeTag + "' not found in header of vcf file")
genotypeTag
}) ::: (for (line <- header.getFormatHeaderLines if cmdArgs.allGenotypeTags
if !defaultGenotypeFields.contains(line.getID)
......@@ -118,7 +119,6 @@ object VcfStatsSpark extends ToolCommand {
line.getID
}).toList ::: defaultGenotypeFields
logger.info("Init spark context")
val conf = new SparkConf()
......@@ -135,11 +135,31 @@ object VcfStatsSpark extends ToolCommand {
.scatter(cmdArgs.binSize)
.flatten
val regionStats = sc.parallelize(regions, regions.size).groupBy(_.chr).map { case (contig, records) => contig -> records.map(readBin(_, samples, cmdArgs, adInfoTags, adGenotypeTags))}
val regionStats = sc.parallelize(regions, regions.size).groupBy(_.chr).map {
case (contig, records) =>
contig -> records.map(readBin(_, samples, cmdArgs, adInfoTags, adGenotypeTags))
}
val chrStats = regionStats.map {case (contig, stats) => contig -> stats.reduce(_ += _)}
val chrStats =
regionStats.map { case (contig, stats) => contig -> stats.reduce(_ += _) }.cache()
val contigOverlap = chrStats.map {
case (contig, stats) =>
writeOverlap(stats,
_.genotypeOverlap,
cmdArgs.outputDir + s"/sample_compare/contigs/$contig/genotype_overlap",
samples,
cmdArgs.contigSampleOverlapPlots)
writeOverlap(stats,
_.alleleOverlap,
cmdArgs.outputDir + s"/sample_compare/contigs/$contig/allele_overlap",
samples,
cmdArgs.contigSampleOverlapPlots)
}
val totalStats = chrStats.values.reduce(_ += _)
val totalStats = chrStats.values.reduce(_ += _) // Blocking
//Await.ready(contigOverlap, Duration.Inf)
val allWriter = new PrintWriter(new File(cmdArgs.outputDir, "stats.json"))
val json = ConfigUtils.mapToJson(
......@@ -155,19 +175,21 @@ object VcfStatsSpark extends ToolCommand {
//TODO: write wig files
writeOverlap(totalStats,
_.genotypeOverlap,
cmdArgs.outputDir + "/sample_compare/genotype_overlap",
samples)
_.genotypeOverlap,
cmdArgs.outputDir + "/sample_compare/genotype_overlap",
samples)
writeOverlap(totalStats,
_.alleleOverlap,
cmdArgs.outputDir + "/sample_compare/allele_overlap",
samples)
_.alleleOverlap,
cmdArgs.outputDir + "/sample_compare/allele_overlap",
samples)
Thread.sleep(1000000)
sc.stop
logger.info("Done")
}
def readBin(bedRecord: BedRecord, samples: List[String],
def readBin(bedRecord: BedRecord,
samples: List[String],
cmdArgs: Args,
adInfoTags: List[String],
adGenotypeTags: List[String]): Stats = {
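A side note on the per-contig block above (a sketch under my own assumptions, not code from this commit): contigOverlap is built with an RDD map, and Spark transformations are lazy, so the writeOverlap side effects inside it only run once some action (count, foreach, ...) forces that RDD, while caching chrStats keeps the later reduce from recomputing the upstream bins. A minimal standalone sketch of that behaviour:

```scala
import org.apache.spark.{SparkConf, SparkContext}

object LazyRddSideEffects {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[1]").setAppName("lazy-demo"))

    // Stand-in for chrStats: per-contig aggregates, cached so they are computed once
    val chrStats = sc.parallelize(Seq("chr1" -> 1, "chr2" -> 2)).cache()

    // Stand-in for contigOverlap: a transformation whose body performs side effects
    val contigOutputs = chrStats.map {
      case (contig, _) => println(s"writing per-contig output for $contig")
    }

    // Nothing has been written yet: map is only a lazy transformation.
    contigOutputs.count() // an action forces the per-contig side effects to run

    // Reuses the cached chrStats instead of recomputing it
    val total = chrStats.values.reduce(_ + _)
    println(s"total = $total")

    sc.stop()
  }
}
```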
......@@ -184,7 +206,7 @@ object VcfStatsSpark extends ToolCommand {
Stats.mergeNestedStatsMap(stats.generalStats, fillGeneral(adInfoTags))
for (sample <- samples) yield {
Stats.mergeNestedStatsMap(stats.samplesStats(sample).genotypeStats,
fillGenotype(adGenotypeTags))
fillGenotype(adGenotypeTags))
}
chunkCounter += 1
}
......@@ -194,7 +216,7 @@ object VcfStatsSpark extends ToolCommand {
for (sample1 <- samples) yield {
val genotype = record.getGenotype(sample1)
Stats.mergeNestedStatsMap(stats.samplesStats(sample1).genotypeStats,
checkGenotype(record, genotype, adGenotypeTags))
checkGenotype(record, genotype, adGenotypeTags))
for (sample2 <- samples) {
val genotype2 = record.getGenotype(sample2)
if (genotype.getAlleles == genotype2.getAlleles)
......@@ -213,16 +235,17 @@ object VcfStatsSpark extends ToolCommand {
/** Commandline argument */
case class VcfStatsArgs(inputFile: File = null,
outputDir: File = null,
referenceFile: File = null,
intervals: Option[File] = None,
infoTags: List[String] = Nil,
genotypeTags: List[String] = Nil,
allInfoTags: Boolean = false,
allGenotypeTags: Boolean = false,
binSize: Int = 10000000,
writeBinStats: Boolean = false,
generalWiggle: List[String] = Nil,
genotypeWiggle: List[String] = Nil,
localThreads: Int = 1,
sparkMaster: Option[String] = None)
outputDir: File = null,
referenceFile: File = null,
intervals: Option[File] = None,
infoTags: List[String] = Nil,
genotypeTags: List[String] = Nil,
allInfoTags: Boolean = false,
allGenotypeTags: Boolean = false,
binSize: Int = 10000000,
writeBinStats: Boolean = false,
generalWiggle: List[String] = Nil,
genotypeWiggle: List[String] = Nil,
localThreads: Int = 1,
sparkMaster: Option[String] = None,
contigSampleOverlapPlots: Boolean = false)
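For completeness, a hypothetical construction of the reformatted case class showing the new contigSampleOverlapPlots flag, which is passed to writeOverlap as the plots argument for the per-contig sample-compare output; all other fields keep their defaults and the file paths are placeholders:

```scala
import java.io.File

val cmdArgs = VcfStatsArgs(
  inputFile = new File("input.vcf.gz"),
  outputDir = new File("vcfstats_output"),
  referenceFile = new File("reference.fasta"),
  contigSampleOverlapPlots = true // also plot the per-contig sample-overlap heatmaps
)
```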