diff --git a/docs/general/requirements.md b/docs/general/requirements.md new file mode 100644 index 0000000000000000000000000000000000000000..0105f7ccc29dcbd5def4b6b49a6bb1235031d858 --- /dev/null +++ b/docs/general/requirements.md @@ -0,0 +1,17 @@ +### System Requirements + +Biopet is build on top of GATK Queue, which requires having `java` installed on the analysis machine(s). + +For end-users: + + * [Java 7 JVM](http://www.oracle.com/technetwork/java/javase/downloads/index.html) or [OpenJDK 7](http://openjdk.java.net/install/) + * [Cran R 2.15.3](http://cran.r-project.org/) + +For developers: + + * [OpenJDK 7](http://openjdk.java.net/install/) + * Minimum of 4 GB RAM {todo: provide more accurate estimation on building} + * Maven 3 + * Compiled and installed version 3.4 of [GATK + Queue](https://github.com/broadgsa/gatk-protected/) in your maven repository. + * IntelliJ or Netbeans 8.0 for development + diff --git a/mkdocs.yml b/mkdocs.yml index b5cb8fdb5a2eb8564edf286f868faf57fc58ceca..87a79146a31b767e9a6c551c76df62a168853204 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -3,6 +3,7 @@ pages: - Home: 'index.md' - General: - Config: 'general/config.md' + - Requirements: 'general/requirements.md' - Pipelines: - Basty: 'pipelines/basty.md' - Bam2Wig: 'pipelines/bam2wig.md' diff --git a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/ApplyRecalibration.scala b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/ApplyRecalibration.scala index fbf4ad4e79866354f5244277290f9c553e747dd4..5813287fcb014b0bea08226428125462a2363c73 100644 --- a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/ApplyRecalibration.scala +++ b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/ApplyRecalibration.scala @@ -12,10 +12,12 @@ import nl.lumc.sasc.biopet.core.config.Configurable class ApplyRecalibration(val root: Configurable) 
extends org.broadinstitute.gatk.queue.extensions.gatk.ApplyRecalibration with GatkGeneral { scatterCount = config("scattercount", default = 0) + override val defaultThreads = 3 + override def beforeGraph() { super.beforeGraph() - nt = Option(getThreads(3)) + nt = Option(getThreads) memoryLimit = Option(nt.getOrElse(1) * 2) import org.broadinstitute.gatk.tools.walkers.variantrecalibration.VariantRecalibratorArgumentCollection.Mode diff --git a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/HaplotypeCaller.scala b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/HaplotypeCaller.scala index b805baa356814f03828123d7730c77e0baaa1c10..7dbe56c31bc3072a390c79ef8ed9b608db9103b3 100644 --- a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/HaplotypeCaller.scala +++ b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/HaplotypeCaller.scala @@ -9,6 +9,9 @@ import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.utils.variant.GATKVCFIndexType class HaplotypeCaller(val root: Configurable) extends org.broadinstitute.gatk.queue.extensions.gatk.HaplotypeCaller with GatkGeneral { + + override val defaultThreads = 1 + min_mapping_quality_score = config("minMappingQualityScore", default = 20) scatterCount = config("scattercount", default = 1) if (config.contains("dbsnp")) this.dbsnp = config("dbsnp") @@ -41,7 +44,7 @@ class HaplotypeCaller(val root: Configurable) extends org.broadinstitute.gatk.qu threads = 1 logger.warn("BamOutput is on, nct/threads is forced to set on 1, this option is only for debug") } - nct = Some(getThreads(1)) + nct = Some(getThreads) memoryLimit = Option(memoryLimit.getOrElse(2.0) * nct.getOrElse(1)) } diff --git a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/UnifiedGenotyper.scala 
b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/UnifiedGenotyper.scala index 5da68c1d94bae7b27e6b6994095aa8a5391087a1..b98495a0e3958ae0eda6cc9ecdde02657859daa2 100644 --- a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/UnifiedGenotyper.scala +++ b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/UnifiedGenotyper.scala @@ -26,11 +26,13 @@ class UnifiedGenotyper(val root: Configurable) extends org.broadinstitute.gatk.q } } + override val defaultThreads = 1 + override def beforeGraph() { super.beforeGraph() genotype_likelihoods_model = org.broadinstitute.gatk.tools.walkers.genotyper.GenotypeLikelihoodsCalculationModel.Model.BOTH - nct = Some(getThreads(1)) + nct = Some(getThreads) memoryLimit = Option(nct.getOrElse(1) * memoryLimit.getOrElse(2.0)) } } diff --git a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/VariantRecalibrator.scala b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/VariantRecalibrator.scala index d4240f01a1970d84dfd9a5e7a329e85ef670b8d6..908d01a695a956cc49f345c0d12a8506dc4af390 100644 --- a/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/VariantRecalibrator.scala +++ b/protected/biopet-gatk-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/broad/VariantRecalibrator.scala @@ -11,7 +11,9 @@ import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.queue.extensions.gatk.TaggedFile class VariantRecalibrator(val root: Configurable) extends org.broadinstitute.gatk.queue.extensions.gatk.VariantRecalibrator with GatkGeneral { - nt = Option(getThreads(4)) + override val defaultThreads = 4 + + nt = Option(getThreads) memoryLimit = Option(nt.getOrElse(1) * 2) if (config.contains("dbsnp")) resource :+= new TaggedFile(config("dbsnp").asString, 
"known=true,training=false,truth=false,prior=2.0") diff --git a/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/extensions/rscript/plotScatter.R b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/extensions/rscript/plotScatter.R new file mode 100644 index 0000000000000000000000000000000000000000..a1959a262cf868d9949b0320f57c9d54b7c50860 --- /dev/null +++ b/public/biopet-framework/src/main/resources/nl/lumc/sasc/biopet/extensions/rscript/plotScatter.R @@ -0,0 +1,40 @@ +library(reshape2) +library(ggplot2) +library(argparse) + +parser <- ArgumentParser(description='Process some integers') +parser$add_argument('--input', dest='input', type='character', help='Input tsv file', required=TRUE) +parser$add_argument('--output', dest='output', type='character', help='Output png file', required=TRUE) +parser$add_argument('--width', dest='width', type='integer', default = 500) +parser$add_argument('--height', dest='height', type='integer', default = 500) +parser$add_argument('--xlabel', dest='xlabel', type='character') +parser$add_argument('--ylabel', dest='ylabel', type='character', required=TRUE) +parser$add_argument('--llabel', dest='llabel', type='character') +parser$add_argument('--title', dest='title', type='character') +parser$add_argument('--removeZero', dest='removeZero', type='character', default="false") + +arguments <- parser$parse_args() + +png(filename = arguments$output, width = arguments$width, height = arguments$height) + +DF <- read.table(arguments$input, header=TRUE) + +if (is.null(arguments$xlabel)) xlab <- colnames(DF)[1] else xlab <- arguments$xlabel + +colnames(DF)[1] <- "Rank" + +DF1 <- melt(DF, id.var="Rank") + +if (arguments$removeZero == "true") DF1 <- DF1[DF1$value > 0, ] +if (arguments$removeZero == "true") print("Removed 0 values") + +ggplot(DF1, aes(x = Rank, y = value, group = variable, color = variable)) + + xlab(xlab) + + ylab(arguments$ylabel) + + guides(fill=guide_legend(title=arguments$llabel)) + + 
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8)) + + ggtitle(arguments$title) + + theme_bw() + + geom_point() + +dev.off() diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunctionTrait.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunctionTrait.scala index edb16ee762338f170cfe1ae3b9f081c7b226a0d0..ea2731c787eb0359ec42ef105fa3ccefe97c5f44 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunctionTrait.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetCommandLineFunctionTrait.scala @@ -199,13 +199,15 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurab BiopetCommandLineFunctionTrait.versionCache.get(versionCommand) } + def getThreads: Int = getThreads(defaultThreads) + /** * Get threads from config * @param default default when not found in config * @return number of threads */ def getThreads(default: Int): Int = { - val maxThreads: Int = config("maxthreads", default = 8) + val maxThreads: Int = config("maxthreads", default = 24) val threads: Int = config("threads", default = default) if (maxThreads > threads) threads else maxThreads @@ -218,7 +220,7 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurab * @return number of threads */ def getThreads(default: Int, module: String): Int = { - val maxThreads: Int = config("maxthreads", default = 8, submodule = module) + val maxThreads: Int = config("maxthreads", default = 24, submodule = module) val threads: Int = config("threads", default = default, submodule = module) if (maxThreads > threads) threads else maxThreads diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Bowtie.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Bowtie.scala index 
6cdea7db15affd6cf871f393d7de2cf0be3cc2fb..088123a760b7cc9f14821a1a7f0d7aa37ed27d99 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Bowtie.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Bowtie.scala @@ -58,34 +58,37 @@ class Bowtie(val root: Configurable) extends BiopetCommandLineFunction with Refe var strata: Boolean = config("strata", default = false) var maqerr: Option[Int] = config("maqerr") var maxins: Option[Int] = config("maxins") + var largeIndex: Boolean = config("large-index", default = false) override def beforeGraph() { super.beforeGraph() if (reference == null) reference = referenceFasta() + val basename = reference.getName.stripSuffix(".fasta").stripSuffix(".fa") + if (reference.getParentFile.list().toList.filter(_.startsWith(basename)).exists(_.endsWith(".ebwtl"))) + largeIndex = config("large-index", default = true) } /** return commandline to execute */ - def cmdLine = { - required(executable) + - optional("--threads", threads) + - conditional(sam, "--sam") + - conditional(best, "--best") + - conditional(strata, "--strata") + - optional("--sam-RG", sam_RG) + - optional("--seedlen", seedlen) + - optional("--seedmms", seedmms) + - optional("-k", k) + - optional("-m", m) + - optional("--maxbts", maxbts) + - optional("--maqerr", maqerr) + - optional("--maxins", maxins) + - required(reference) + - (R2 match { - case Some(r2) => - required("-1", R1) + - optional("-2", r2) - case _ => required(R1) - }) + - " > " + required(output) - } + def cmdLine = required(executable) + + optional("--threads", threads) + + conditional(sam, "--sam") + + conditional(largeIndex, "--large-index") + + conditional(best, "--best") + + conditional(strata, "--strata") + + optional("--sam-RG", sam_RG) + + optional("--seedlen", seedlen) + + optional("--seedmms", seedmms) + + optional("-k", k) + + optional("-m", m) + + optional("--maxbts", maxbts) + + optional("--maqerr", maqerr) + + optional("--maxins", maxins) 
+ + required(reference.getAbsolutePath.stripSuffix(".fa").stripSuffix(".fasta")) + + (R2 match { + case Some(r2) => + required("-1", R1) + + optional("-2", r2) + case _ => required(R1) + }) + + " > " + required(output) } \ No newline at end of file diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Raxml.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Raxml.scala index 71b4ec736449004a2644ec03c5403e6ebb70df63..55f482701d13dfe44b9bfc5db11d62e8f39065c2 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Raxml.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Raxml.scala @@ -74,7 +74,7 @@ class Raxml(val root: Configurable) extends BiopetCommandLineFunction { /** Sets correct output files to job */ override def beforeGraph() { require(w != null) - if (threads == 0) threads = getThreads(defaultThreads) + if (threads == 0) threads = getThreads executable = if (threads > 1 && executableThreads.isDefined) executableThreads.get else executableNonThreads super.beforeGraph() out :::= List(Some(getInfoFile), getBestTreeFile, getBootstrapFile, getBipartitionsFile).flatten diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Star.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Star.scala index e908a72704edc0d7acc5981c80e5a5de43a1f50d..6014b6399b4b35d4bceb49a9684fe7fd419555a1 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Star.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Star.scala @@ -63,7 +63,7 @@ class Star(val root: Configurable) extends BiopetCommandLineFunction with Refere var outFileNamePrefix: String = _ var runThreadN: Option[Int] = config("runThreadN") - override def defaultCoreMemory = 4.0 + override def defaultCoreMemory = 6.0 override def defaultThreads = 8 /** Sets output files for the graph */ @@ -72,7 +72,7 @@ 
class Star(val root: Configurable) extends BiopetCommandLineFunction with Refere if (reference == null) reference = referenceFasta() genomeDir = config("genomeDir", new File(reference.getAbsoluteFile.getParent, "star")) if (outFileNamePrefix != null && !outFileNamePrefix.endsWith(".")) outFileNamePrefix += "." - val prefix = if (outFileNamePrefix != null) outputDir + outFileNamePrefix else outputDir + val prefix = if (outFileNamePrefix != null) outputDir + File.separator + outFileNamePrefix else outputDir + File.separator if (runmode == null) { outputSam = new File(prefix + "Aligned.out.sam") outputTab = new File(prefix + "SJ.out.tab") diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Tophat.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Tophat.scala index 82332de297f12df794fa3d770c3dcc22da04f57a..708bfb0363a6d664370380695962717d55a3d6bf 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Tophat.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Tophat.scala @@ -17,14 +17,14 @@ package nl.lumc.sasc.biopet.extensions import java.io.File -import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction +import nl.lumc.sasc.biopet.core.{ Reference, BiopetCommandLineFunction } import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output } /** * Extension for Tophat */ -class Tophat(val root: Configurable) extends BiopetCommandLineFunction { +class Tophat(val root: Configurable) extends BiopetCommandLineFunction with Reference { executable = config("exe", default = "tophat", freeVar = false) @@ -264,6 +264,16 @@ class Tophat(val root: Configurable) extends BiopetCommandLineFunction { var rg_platform: Option[String] = config("rg_platform") + override def beforeGraph: Unit = { + super.beforeGraph + if (bowtie1 && !new File(bowtie_index).getParentFile.list().toList + .filter(_.startsWith(new 
File(bowtie_index).getName)).exists(_.endsWith(".ebwt"))) + throw new IllegalArgumentException("No bowtie1 index found for tophat") + else if (!new File(bowtie_index).getParentFile.list().toList + .filter(_.startsWith(new File(bowtie_index).getName)).exists(_.endsWith(".bt2"))) + throw new IllegalArgumentException("No bowtie2 index found for tophat") + } + def cmdLine: String = required(executable) + optional("-o", output_dir) + conditional(bowtie1, "--bowtie1") + diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MergeSamFiles.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MergeSamFiles.scala index 19122634b79337d0ef67642f85700b8a75c0ad5a..f347ad3574fc6b13ab2be8abb6bdb4b1f64057fb 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MergeSamFiles.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MergeSamFiles.scala @@ -45,6 +45,14 @@ class MergeSamFiles(val root: Configurable) extends Picard { @Argument(doc = "COMMENT", required = false) var comment: Option[String] = config("comment") + @Output(doc = "Bam Index", required = true) + private var outputIndex: File = _ + + override def beforeGraph() { + super.beforeGraph() + if (createIndex) outputIndex = new File(output.getAbsolutePath.stripSuffix(".bam") + ".bai") + } + /** Returns command to execute */ override def commandLine = super.commandLine + repeat("INPUT=", input, spaceSeparated = false) + diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/rscript/ScatterPlot.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/rscript/ScatterPlot.scala new file mode 100644 index 0000000000000000000000000000000000000000..f4cccdda176fb1adc09669b2caae155b981265fe --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/rscript/ScatterPlot.scala @@ -0,0 +1,56 @@ +/** + * Biopet is built on 
top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.extensions.rscript + +import java.io.File + +import nl.lumc.sasc.biopet.core.config.Configurable +import nl.lumc.sasc.biopet.extensions.RscriptCommandLineFunction +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } + +/** + * Extension for en general line plot with R + * + * Created by pjvan_thof on 4/29/15. 
+ */ +class ScatterPlot(val root: Configurable) extends RscriptCommandLineFunction { + protected var script: File = config("script", default = "plotScatter.R") + + @Input + var input: File = _ + + @Output + var output: File = _ + + var width: Option[Int] = config("width") + var height: Option[Int] = config("height") + var xlabel: Option[String] = config("xlabel") + var ylabel: Option[String] = config("ylabel") + var llabel: Option[String] = config("llabel") + var title: Option[String] = config("title") + var removeZero: Boolean = config("removeZero", default = false) + + override def cmdLine: String = super.cmdLine + + required("--input", input) + + required("--output", output) + + optional("--width", width) + + optional("--height", height) + + optional("--xlabel", xlabel) + + required("--ylabel", ylabel) + + optional("--llabel", llabel) + + optional("--title", title) + + optional("--removeZero", removeZero) +} diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/AnnotateVcfWithBed.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/AnnotateVcfWithBed.scala index 6136c146b91e45fa8ddb85e49d2ad94c903a6033..af3294c47a58f16ccf8d8f618ced98bbc5d0077b 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/AnnotateVcfWithBed.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/AnnotateVcfWithBed.scala @@ -21,6 +21,7 @@ import htsjdk.variant.variantcontext.VariantContextBuilder import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder } import htsjdk.variant.vcf.{ VCFFileReader, VCFHeaderLineCount, VCFHeaderLineType, VCFInfoHeaderLine } import nl.lumc.sasc.biopet.core.ToolCommand +import nl.lumc.sasc.biopet.utils.intervals.{ BedRecord, BedRecordList } import scala.collection.JavaConversions._ import scala.collection.mutable @@ -83,19 +84,9 @@ object AnnotateVcfWithBed extends ToolCommand { logger.info("Start") val argsParser = new OptParser 
- val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1) - - val bedRecords: mutable.Map[String, List[(Int, Int, String)]] = mutable.Map() - // Read bed file - /* - // function bedRecord.getName will not compile, not clear why - for (bedRecord <- asScalaIteratorConverter(getFeatureReader(commandArgs.bedFile.toPath.toString, new BEDCodec(), false).iterator()).asScala) { - logger.debug(bedRecord) - bedRecords(bedRecord.getChr) = (bedRecord.getStart, bedRecord.getEnd, bedRecord.getName) :: bedRecords.getOrElse(bedRecord.getChr, Nil) - } - */ + val cmdArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1) - val fieldType = commandArgs.fieldType match { + val fieldType = cmdArgs.fieldType match { case "Integer" => VCFHeaderLineType.Integer case "Flag" => VCFHeaderLineType.Flag case "Character" => VCFHeaderLineType.Character @@ -104,48 +95,32 @@ object AnnotateVcfWithBed extends ToolCommand { } logger.info("Reading bed file") - - for (line <- Source.fromFile(commandArgs.bedFile).getLines()) { - val values = line.split("\t") - if (values.size >= 4) - bedRecords(values(0)) = (values(1).toInt, values(2).toInt, values(3)) :: bedRecords.getOrElse(values(0), Nil) - else values.size >= 3 && fieldType == VCFHeaderLineType.Flag - bedRecords(values(0)) = (values(1).toInt, values(2).toInt, "") :: bedRecords.getOrElse(values(0), Nil) - } - - logger.info("Sorting bed records") - - // Sort records when needed - for ((chr, record) <- bedRecords) { - bedRecords(chr) = record.sortBy(x => (x._1, x._2)) - } + val bedRecords = BedRecordList.fromFile(cmdArgs.bedFile).sorted logger.info("Starting output file") - val reader = new VCFFileReader(commandArgs.inputFile, false) + val reader = new VCFFileReader(cmdArgs.inputFile, false) val header = reader.getFileHeader val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder(). - setOutputFile(commandArgs.outputFile). + setOutputFile(cmdArgs.outputFile). 
setReferenceDictionary(header.getSequenceDictionary). build) - header.addMetaDataLine(new VCFInfoHeaderLine(commandArgs.fieldName, - VCFHeaderLineCount.UNBOUNDED, fieldType, commandArgs.fieldDescription)) + header.addMetaDataLine(new VCFInfoHeaderLine(cmdArgs.fieldName, + VCFHeaderLineCount.UNBOUNDED, fieldType, cmdArgs.fieldDescription)) writer.writeHeader(header) logger.info("Start reading vcf records") for (record <- reader) { - val overlaps = bedRecords.getOrElse(record.getContig, Nil).filter(x => { - record.getStart <= x._2 && record.getEnd >= x._1 - }) + val overlaps = bedRecords.overlapWith(BedRecord(record.getContig, record.getStart, record.getEnd)) if (overlaps.isEmpty) { writer.add(record) } else { val builder = new VariantContextBuilder(record) - if (fieldType == VCFHeaderLineType.Flag) builder.attribute(commandArgs.fieldName, true) - else builder.attribute(commandArgs.fieldName, overlaps.map(_._3).mkString(",")) + if (fieldType == VCFHeaderLineType.Flag) builder.attribute(cmdArgs.fieldName, true) + else builder.attribute(cmdArgs.fieldName, overlaps.map(_.name).mkString(",")) writer.add(builder.make) } } diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/RegionAfCount.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/RegionAfCount.scala new file mode 100644 index 0000000000000000000000000000000000000000..4766f31d5fd78af68ece6fcb8a5a410cfb4ae951 --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/RegionAfCount.scala @@ -0,0 +1,152 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. 
+ * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.tools + +import java.io.{ PrintWriter, InputStream, File } +import java.util + +import htsjdk.variant.vcf.VCFFileReader +import nl.lumc.sasc.biopet.core.ToolCommand +import nl.lumc.sasc.biopet.extensions.rscript.ScatterPlot +import nl.lumc.sasc.biopet.utils.intervals.{ BedRecord, BedRecordList } + +import scala.collection.JavaConversions._ +import scala.collection.mutable + +object RegionAfCount extends ToolCommand { + case class Args(bedFile: File = null, + outputPrefix: String = null, + scatterpPlot: Boolean = false, + vcfFiles: List[File] = Nil) extends AbstractArgs + + class OptParser extends AbstractOptParser { + opt[File]('b', "bedFile") unbounded () required () maxOccurs 1 valueName "<file>" action { (x, c) => + c.copy(bedFile = x) + } + opt[String]('o', "outputPrefix") unbounded () required () maxOccurs 1 valueName "<file prefix>" action { (x, c) => + c.copy(outputPrefix = x) + } + opt[Unit]('s', "scatterPlot") unbounded () action { (x, c) => + c.copy(scatterpPlot = true) + } + opt[File]('V', "vcfFile") unbounded () minOccurs 1 action { (x, c) => + c.copy(vcfFiles = c.vcfFiles ::: x :: Nil) + } + } + + def main(args: Array[String]): Unit = { + val argsParser = new OptParser + val cmdArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1) + + logger.info("Start") + logger.info("Reading bed file") + + val bedRecords = BedRecordList.fromFile(cmdArgs.bedFile).sorted + + logger.info(s"Combine ${bedRecords.allRecords.size} bed records") + + val combinedBedRecords = bedRecords.combineOverlap 
+ + logger.info(s"${combinedBedRecords.allRecords.size} left") + logger.info(s"${combinedBedRecords.allRecords.size * cmdArgs.vcfFiles.size} query's to do") + logger.info("Reading vcf files") + + case class AfCounts(var names: Double = 0, + var namesExons: Double = 0, + var namesIntrons: Double = 0, + var namesCoding: Double = 0, + var utr: Double = 0, + var utr5: Double = 0, + var utr3: Double = 0) + + var c = 0 + val afCounts = (for (vcfFile <- cmdArgs.vcfFiles.par) yield vcfFile -> { + val reader = new VCFFileReader(vcfFile, true) + val afCounts: mutable.Map[String, AfCounts] = mutable.Map() + for (region <- combinedBedRecords.allRecords) yield { + val originals = region.originals() + for (variant <- reader.query(region.chr, region.start, region.end)) { + val sum = (variant.getAttribute("AF", 0) match { + case a: util.ArrayList[_] => a.map(_.toString.toDouble).toArray + case s => Array(s.toString.toDouble) + }).sum + val interval = BedRecord(variant.getContig, variant.getStart, variant.getEnd) + originals.foreach { x => + val name = x.name.getOrElse(s"${x.chr}:${x.start}-${x.end}") + if (!afCounts.contains(name)) afCounts += name -> AfCounts() + afCounts(name).names += sum + val exons = x.exons.getOrElse(Seq()).filter(_.overlapWith(interval)) + val introns = x.introns.getOrElse(Seq()).filter(_.overlapWith(interval)) + val utr5 = x.utr5.map(_.overlapWith(interval)) + val utr3 = x.utr3.map(_.overlapWith(interval)) + if (exons.nonEmpty) { + afCounts(name).namesExons += sum + if (!utr5.getOrElse(false) && !utr3.getOrElse(false)) afCounts(name).namesCoding += sum + } + if (introns.nonEmpty) afCounts(name).namesIntrons += sum + if (utr5.getOrElse(false) || utr3.getOrElse(false)) afCounts(name).utr += sum + if (utr5.getOrElse(false)) afCounts(name).utr5 += sum + if (utr3.getOrElse(false)) afCounts(name).utr3 += sum + } + } + c += 1 + if (c % 100 == 0) logger.info(s"$c regions done") + } + afCounts.toMap + }).toMap + + logger.info(s"Done reading, ${c} regions") + + 
logger.info("Writing output files") + + def writeOutput(tsvFile: File, function: AfCounts => Double): Unit = { + val writer = new PrintWriter(tsvFile) + writer.println("\t" + cmdArgs.vcfFiles.map(_.getName).mkString("\t")) + for (r <- cmdArgs.vcfFiles.foldLeft(Set[String]())((a, b) => a ++ afCounts(b).keySet)) { + writer.print(r + "\t") + writer.println(cmdArgs.vcfFiles.map(x => function(afCounts(x).getOrElse(r, AfCounts()))).mkString("\t")) + } + writer.close() + + if (cmdArgs.scatterpPlot) generatePlot(tsvFile) + } + + def generatePlot(tsvFile: File): Unit = { + logger.info(s"Generate plot for $tsvFile") + + val scatterPlot = new ScatterPlot(null) + scatterPlot.input = tsvFile + scatterPlot.output = new File(tsvFile.getAbsolutePath.stripSuffix(".tsv") + ".png") + scatterPlot.ylabel = Some("Sum of AFs") + scatterPlot.width = Some(1200) + scatterPlot.height = Some(1000) + scatterPlot.runLocal() + } + for ( + arg <- List[(File, AfCounts => Double)]( + (new File(cmdArgs.outputPrefix + ".names.tsv"), _.names), + (new File(cmdArgs.outputPrefix + ".names.exons_only.tsv"), _.namesExons), + (new File(cmdArgs.outputPrefix + ".names.introns_only.tsv"), _.namesIntrons), + (new File(cmdArgs.outputPrefix + ".names.coding.tsv"), _.namesCoding), + (new File(cmdArgs.outputPrefix + ".names.utr.tsv"), _.utr), + (new File(cmdArgs.outputPrefix + ".names.utr5.tsv"), _.utr5), + (new File(cmdArgs.outputPrefix + ".names.utr3.tsv"), _.utr3) + ).par + ) writeOutput(arg._1, arg._2) + + logger.info("Done") + } +} \ No newline at end of file diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SamplesTsvToJson.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SamplesTsvToJson.scala index 46cd2b40cd8645a432aee54081f54e9dd0ec3351..0f0e11c62ac8f935a064b5b3b042d98527e48934 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SamplesTsvToJson.scala +++ 
b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SamplesTsvToJson.scala @@ -19,6 +19,7 @@ import java.io.File import nl.lumc.sasc.biopet.core.ToolCommand import nl.lumc.sasc.biopet.utils.ConfigUtils._ +import scala.collection.mutable import scala.io.Source @@ -45,21 +46,30 @@ object SamplesTsvToJson extends ToolCommand { val header = lines.head.split("\t") val sampleColumn = header.indexOf("sample") val libraryColumn = header.indexOf("library") - if (sampleColumn == -1) throw new IllegalStateException("sample column does not exist in: " + inputFile) + if (sampleColumn == -1) throw new IllegalStateException("Sample column does not exist in: " + inputFile) + + val sampleLibCache: mutable.Set[(String, Option[String])] = mutable.Set() val librariesValues: List[Map[String, Any]] = for (tsvLine <- lines.tail) yield { val values = tsvLine.split("\t") + require(header.length == values.length, "Number of columns is not the same as the header") val sample = values(sampleColumn) - val library = if (libraryColumn != -1) values(libraryColumn) else null + val library = if (libraryColumn != -1) Some(values(libraryColumn)) else None + + //FIXME: this is a workaround, should be removed after fixing #180 + if (sample.head.isDigit || library.forall(_.head.isDigit)) + throw new IllegalStateException("Sample or library may not start with a number") + + if (sampleLibCache.contains((sample, library))) + throw new IllegalStateException(s"Combination of $sample and $library is found multiple times") + else sampleLibCache.add((sample, library)) val valuesMap = (for ( t <- 0 until values.size if !values(t).isEmpty && t != sampleColumn && t != libraryColumn ) yield header(t) -> values(t)).toMap - val map: Map[String, Any] = if (library != null) { - Map("samples" -> Map(sample -> Map("libraries" -> Map(library -> valuesMap)))) - } else { - Map("samples" -> Map(sample -> valuesMap)) + library match { + case Some(lib) => Map("samples" -> Map(sample -> Map("libraries" -> 
Map(lib -> valuesMap)))) + case _ => Map("samples" -> Map(sample -> valuesMap)) } - map } librariesValues.foldLeft(Map[String, Any]())((acc, kv) => mergeMaps(acc, kv)) } diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SquishBed.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SquishBed.scala new file mode 100644 index 0000000000000000000000000000000000000000..2d7bfcaf10fa1aa261462a9bcdb4f2fa829521f4 --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SquishBed.scala @@ -0,0 +1,57 @@ +package nl.lumc.sasc.biopet.tools + +import java.io.File + +import nl.lumc.sasc.biopet.core.ToolCommand +import nl.lumc.sasc.biopet.utils.intervals.BedRecordList + +/** + * Created by pjvanthof on 22/08/15. + */ +object SquishBed extends ToolCommand { + + case class Args(input: File = null, + output: File = null, + strandSensitive: Boolean = false) extends AbstractArgs + + class OptParser extends AbstractOptParser { + opt[File]('I', "input") required () valueName "<file>" action { (x, c) => + c.copy(input = x) + } + opt[File]('o', "output") required () unbounded () valueName "<file>" action { (x, c) => + c.copy(output = x) + } + opt[Unit]('s', "strandSensitive") unbounded () valueName "<file>" action { (x, c) => + c.copy(strandSensitive = true) + } + } + + /** + * @param args the command line arguments + */ + def main(args: Array[String]): Unit = { + val argsParser = new OptParser + val cmdArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1) + + if (!cmdArgs.input.exists) throw new IllegalStateException("Input file not found, file: " + cmdArgs.input) + + logger.info("Start") + + val records = BedRecordList.fromFile(cmdArgs.input) + val length = records.length + val refLength = records.combineOverlap.length + logger.info(s"Total bases: $length") + logger.info(s"Total bases on reference: $refLength") + logger.info("Start squishing") + val squishBed = 
records.squishBed(cmdArgs.strandSensitive).sorted + logger.info("Done squishing") + val squishLength = squishBed.length + val squishRefLength = squishBed.combineOverlap.length + logger.info(s"Total bases left: $squishLength") + logger.info(s"Total bases left on reference: $squishRefLength") + logger.info(s"Total bases removed from ref: ${refLength - squishRefLength}") + squishBed.writeToFile(cmdArgs.output) + + logger.info("Done") + } +} diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala index 47af91bf3dd14014e67b0082b399f5ca8f24f8b5..a051d2c33a4136aeeb813e6e956102644efa9520 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfStats.scala @@ -24,6 +24,7 @@ import htsjdk.variant.vcf.VCFFileReader import nl.lumc.sasc.biopet.core.config.Configurable import nl.lumc.sasc.biopet.core.summary.{ Summarizable, SummaryQScript } import nl.lumc.sasc.biopet.core.{ Reference, ToolCommand, ToolCommandFuntion } +import nl.lumc.sasc.biopet.utils.intervals.BedRecordList import org.broadinstitute.gatk.utils.commandline.{ Input, Output } import scala.collection.JavaConversions._ @@ -146,12 +147,9 @@ object VcfStats extends ToolCommand { opt[File]('o', "outputDir") required () unbounded () valueName "<file>" action { (x, c) => c.copy(outputDir = x) } - //TODO: add interval argument - /* opt[File]('i', "intervals") unbounded () valueName ("<file>") action { (x, c) => c.copy(intervals = Some(x)) } - */ opt[String]("infoTag") unbounded () valueName "<tag>" action { (x, c) => c.copy(infoTags = x :: c.infoTags) } @@ -258,7 +256,7 @@ object VcfStats extends ToolCommand { } } - protected var commandArgs: Args = _ + protected var cmdArgs: Args = _ val defaultGenotypeFields = List("DP", "GQ", "AD", "AD-ref", "AD-alt", "AD-used", "AD-not_used", "general") @@ 
-273,9 +271,9 @@ object VcfStats extends ToolCommand { def main(args: Array[String]): Unit = { logger.info("Started") val argsParser = new OptParser - commandArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1) + cmdArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1) - val reader = new VCFFileReader(commandArgs.inputFile, true) + val reader = new VCFFileReader(cmdArgs.inputFile, true) val header = reader.getFileHeader val samples = header.getSampleNamesInOrder.toList @@ -283,44 +281,36 @@ object VcfStats extends ToolCommand { val adInfoTags = { (for ( - infoTag <- commandArgs.infoTags if !defaultInfoFields.contains(infoTag) + infoTag <- cmdArgs.infoTags if !defaultInfoFields.contains(infoTag) ) yield { require(header.getInfoHeaderLine(infoTag) != null, "Info tag '" + infoTag + "' not found in header of vcf file") infoTag }) ::: (for ( - line <- header.getInfoHeaderLines if commandArgs.allInfoTags if !defaultInfoFields.contains(line.getID) if !commandArgs.infoTags.contains(line.getID) + line <- header.getInfoHeaderLines if cmdArgs.allInfoTags if !defaultInfoFields.contains(line.getID) if !cmdArgs.infoTags.contains(line.getID) ) yield { line.getID }).toList ::: defaultInfoFields } val adGenotypeTags = (for ( - genotypeTag <- commandArgs.genotypeTags if !defaultGenotypeFields.contains(genotypeTag) + genotypeTag <- cmdArgs.genotypeTags if !defaultGenotypeFields.contains(genotypeTag) ) yield { require(header.getFormatHeaderLine(genotypeTag) != null, "Format tag '" + genotypeTag + "' not found in header of vcf file") genotypeTag }) ::: (for ( - line <- header.getFormatHeaderLines if commandArgs.allGenotypeTags if !defaultGenotypeFields.contains(line.getID) if !commandArgs.genotypeTags.contains(line.getID) if line.getID != "PL" + line <- header.getFormatHeaderLines if cmdArgs.allGenotypeTags if !defaultGenotypeFields.contains(line.getID) if !cmdArgs.genotypeTags.contains(line.getID) if line.getID != "PL" ) yield { line.getID }).toList ::: 
defaultGenotypeFields - val referenceFile = new FastaSequenceFile(commandArgs.referenceFile, true) + val bedRecords = (cmdArgs.intervals match { + case Some(intervals) => BedRecordList.fromFile(intervals).validateContigs(cmdArgs.referenceFile) + case _ => BedRecordList.fromReference(cmdArgs.referenceFile) + }).combineOverlap.scatter(cmdArgs.binSize) - val intervals: List[Interval] = ( - for ( - seq <- referenceFile.getSequenceDictionary.getSequences; - chunks = (seq.getSequenceLength / commandArgs.binSize) + (if (seq.getSequenceLength % commandArgs.binSize == 0) 0 else 1); - i <- 1 to chunks - ) yield { - val size = seq.getSequenceLength / chunks - val begin = size * (i - 1) + 1 - val end = if (i >= chunks) seq.getSequenceLength else size * i - new Interval(seq.getSequenceName, begin, end) - } - ).toList + val intervals: List[Interval] = bedRecords.toSamIntervals.toList - val totalBases = intervals.foldRight(0L)(_.length() + _) + val totalBases = bedRecords.length // Reading vcf records logger.info("Start reading vcf records") @@ -352,7 +342,7 @@ object VcfStats extends ToolCommand { val stats = (for (intervals <- Random.shuffle(intervals).grouped(intervals.size / (if (intervals.size > 10) 4 else 1)).toList.par) yield { val chunkStats = for (intervals <- intervals.grouped(25)) yield { val binStats = for (interval <- intervals.par) yield { - val reader = new VCFFileReader(commandArgs.inputFile, true) + val reader = new VCFFileReader(cmdArgs.inputFile, true) var chunkCounter = 0 val stats = createStats logger.info("Starting on: " + interval) @@ -375,8 +365,8 @@ object VcfStats extends ToolCommand { } reader.close() - if (commandArgs.writeBinStats) { - val binOutputDir = new File(commandArgs.outputDir, "bins" + File.separator + interval.getContig) + if (cmdArgs.writeBinStats) { + val binOutputDir = new File(cmdArgs.outputDir, "bins" + File.separator + interval.getContig) writeGenotypeField(stats, samples, "general", binOutputDir, prefix = "genotype-" + 
interval.getStart + "-" + interval.getEnd) writeField(stats, "general", binOutputDir, prefix = interval.getStart + "-" + interval.getEnd) @@ -393,51 +383,52 @@ object VcfStats extends ToolCommand { logger.info("Done reading vcf records") // Writing info fields to tsv files - val infoOutputDir = new File(commandArgs.outputDir, "infotags") - writeField(stats, "general", commandArgs.outputDir) + val infoOutputDir = new File(cmdArgs.outputDir, "infotags") + writeField(stats, "general", cmdArgs.outputDir) for (field <- adInfoTags.distinct.par) { writeField(stats, field, infoOutputDir) - for (line <- referenceFile.getSequenceDictionary.getSequences) { + for (line <- new FastaSequenceFile(cmdArgs.referenceFile, true).getSequenceDictionary.getSequences) { val chr = line.getSequenceName writeField(stats, field, new File(infoOutputDir, "chrs" + File.separator + chr), chr = chr) } } // Write genotype field to tsv files - val genotypeOutputDir = new File(commandArgs.outputDir, "genotypetags") - writeGenotypeField(stats, samples, "general", commandArgs.outputDir, prefix = "genotype") + val genotypeOutputDir = new File(cmdArgs.outputDir, "genotypetags") + writeGenotypeField(stats, samples, "general", cmdArgs.outputDir, prefix = "genotype") for (field <- adGenotypeTags.distinct.par) { writeGenotypeField(stats, samples, field, genotypeOutputDir) - for (line <- referenceFile.getSequenceDictionary.getSequences) { + for (line <- new FastaSequenceFile(cmdArgs.referenceFile, true).getSequenceDictionary.getSequences) { val chr = line.getSequenceName writeGenotypeField(stats, samples, field, new File(genotypeOutputDir, "chrs" + File.separator + chr), chr = chr) } } // Write sample distributions to tsv files - val sampleDistributionsOutputDir = new File(commandArgs.outputDir, "sample_distributions") + val sampleDistributionsOutputDir = new File(cmdArgs.outputDir, "sample_distributions") for (field <- sampleDistributions) { writeField(stats, "SampleDistribution-" + field, 
sampleDistributionsOutputDir) } // Write general wiggle tracks - for (field <- commandArgs.generalWiggle) { - val file = new File(commandArgs.outputDir, "wigs" + File.separator + "general-" + field + ".wig") + for (field <- cmdArgs.generalWiggle) { + val file = new File(cmdArgs.outputDir, "wigs" + File.separator + "general-" + field + ".wig") writeWiggle(intervals, field, "count", file, genotype = false) } // Write sample wiggle tracks - for (field <- commandArgs.genotypeWiggle; sample <- samples) { - val file = new File(commandArgs.outputDir, "wigs" + File.separator + "genotype-" + sample + "-" + field + ".wig") + for (field <- cmdArgs.genotypeWiggle; sample <- samples) { + val file = new File(cmdArgs.outputDir, "wigs" + File.separator + "genotype-" + sample + "-" + field + ".wig") writeWiggle(intervals, field, sample, file, genotype = true) } - writeOverlap(stats, _.genotypeOverlap, commandArgs.outputDir + "/sample_compare/genotype_overlap", samples) - writeOverlap(stats, _.alleleOverlap, commandArgs.outputDir + "/sample_compare/allele_overlap", samples) + writeOverlap(stats, _.genotypeOverlap, cmdArgs.outputDir + "/sample_compare/genotype_overlap", samples) + writeOverlap(stats, _.alleleOverlap, cmdArgs.outputDir + "/sample_compare/allele_overlap", samples) logger.info("Done") } + //FIXME: does only work correct for reference and not with a bed file protected def writeWiggle(intervals: List[Interval], row: String, column: String, outputFile: File, genotype: Boolean): Unit = { val groupedIntervals = intervals.groupBy(_.getContig).map { case (k, v) => k -> v.sortBy(_.getStart) } outputFile.getParentFile.mkdirs() @@ -448,8 +439,8 @@ object VcfStats extends ToolCommand { writer.println(s"fixedStep chrom=$chr start=1 step=$length span=$length") for (interval <- intervals) { val file = { - if (genotype) new File(commandArgs.outputDir, "bins" + File.separator + chr + File.separator + "genotype-" + interval.getStart + "-" + interval.getEnd + "-general.tsv") - else new 
File(commandArgs.outputDir, "bins" + File.separator + chr + File.separator + interval.getStart + "-" + interval.getEnd + "-general.tsv") + if (genotype) new File(cmdArgs.outputDir, "bins" + File.separator + chr + File.separator + "genotype-" + interval.getStart + "-" + interval.getEnd + "-general.tsv") + else new File(cmdArgs.outputDir, "bins" + File.separator + chr + File.separator + interval.getStart + "-" + interval.getEnd + "-general.tsv") } writer.println(valueFromTsv(file, row, column).getOrElse(0)) } diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VepNormalizer.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VepNormalizer.scala index 1b7af776ee186acb2573539eeda86afbc05e07be..69d16351eff7e23d15aa60af58d042171c4fb43a 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VepNormalizer.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VepNormalizer.scala @@ -47,7 +47,7 @@ class VepNormalizer(val root: Configurable) extends ToolCommandFuntion { @Output(doc = "Output VCF", shortName = "OutputFile", required = true) var outputVcf: File = null - var mode: String = config("mode", default = "explode") + var mode: String = config("mode", default = "standard") var doNotRemove: Boolean = config("donotremove", default = false) override def defaultCoreMemory = 4.0 diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala index 96a9dc43631e832d95f0bc1f049ca7d5c0396675..845b4883f6c142a50187c70b50113ededdc84213 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala @@ -24,6 +24,7 @@ import htsjdk.tribble.AbstractFeatureReader.getFeatureReader import htsjdk.tribble.bed.BEDCodec import nl.lumc.sasc.biopet.core.config.Configurable 
import nl.lumc.sasc.biopet.core.{ ToolCommand, ToolCommandFuntion } +import nl.lumc.sasc.biopet.utils.intervals.BedRecordList import org.apache.commons.io.FilenameUtils.getExtension import org.broadinstitute.gatk.utils.commandline.{ Input, Output } @@ -84,10 +85,7 @@ object WipeReads extends ToolCommand { logger.info("Parsing interval file ...") /** Function to create iterator from BED file */ - def makeIntervalFromBed(inFile: File): Iterator[Interval] = - asScalaIteratorConverter(getFeatureReader(inFile.toPath.toString, new BEDCodec(), false).iterator) - .asScala - .map(x => new Interval(x.getContig, x.getStart, x.getEnd)) + def makeIntervalFromBed(inFile: File) = BedRecordList.fromFile(inFile).sorted.toSamIntervals.toIterator /** * Parses a refFlat file to yield Interval objects diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecord.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecord.scala new file mode 100644 index 0000000000000000000000000000000000000000..5b6b1931ab7ea604f8812d88c869893f17929d65 --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecord.scala @@ -0,0 +1,148 @@ +package nl.lumc.sasc.biopet.utils.intervals + +import htsjdk.samtools.util.Interval + +import scala.collection.mutable.ListBuffer + +/** + * Created by pjvanthof on 20/08/15. 
+ */ +case class BedRecord(chr: String, + start: Int, + end: Int, + name: Option[String] = None, + score: Option[Double] = None, + strand: Option[Boolean] = None, + thickStart: Option[Int] = None, + thickEnd: Option[Int] = None, + rgbColor: Option[(Int, Int, Int)] = None, + blockCount: Option[Int] = None, + blockSizes: IndexedSeq[Int] = IndexedSeq(), + blockStarts: IndexedSeq[Int] = IndexedSeq(), + protected[intervals] val _originals: List[BedRecord] = Nil) { + + def originals(nested: Boolean = true): List[BedRecord] = { + if (_originals.isEmpty) List(this) + else if (nested) _originals.flatMap(_.originals(true)) + else _originals + } + + def overlapWith(record: BedRecord): Boolean = { + if (chr != record.chr) false + else if (start < record.end && record.start < end) true + else false + } + + def length = end - start + + def scatter(binSize: Int) = { + val binNumber = length / binSize + if (binNumber <= 1) List(this) + else { + val size = length / binNumber + val buffer = ListBuffer[BedRecord]() + for (i <- 1 until binNumber) buffer += BedRecord(chr, start + ((i - 1) * size), start + (i * size)) + buffer += BedRecord(chr, start + ((binNumber - 1) * size), end) + buffer.toList + } + } + + lazy val exons = if (blockCount.isDefined && blockSizes.length > 0 && blockStarts.length > 0) { + Some(for (i <- 0 until blockCount.get) yield { + val exonNumber = strand match { + case Some(false) => blockCount.get - i + case _ => i + 1 + } + BedRecord(chr, start + blockStarts(i), start + blockStarts(i) + blockSizes(i), + Some(s"exon-$exonNumber"), _originals = List(this)) + }) + } else None + + lazy val introns = if (blockCount.isDefined && blockSizes.length > 0 && blockStarts.length > 0) { + Some(for (i <- 0 until (blockCount.get - 1)) yield { + val intronNumber = strand match { + case Some(false) => blockCount.get - i + case _ => i + 1 + } + BedRecord(chr, start + blockStarts(i) + blockSizes(i), start + blockStarts(i + 1), + Some(s"intron-$intronNumber"), _originals = 
List(this)) + }) + } else None + + lazy val utr5 = (strand, thickStart, thickEnd) match { + case (Some(true), Some(tStart), Some(tEnd)) if (tStart > start && tEnd < end) => + Some(BedRecord(chr, start, tStart, name.map(_ + "_utr5"))) + case (Some(false), Some(tStart), Some(tEnd)) if (tStart > start && tEnd < end) => + Some(BedRecord(chr, tEnd, end, name.map(_ + "_utr5"))) + case _ => None + } + + lazy val utr3 = (strand, thickStart, thickEnd) match { + case (Some(false), Some(tStart), Some(tEnd)) if (tStart > start && tEnd < end) => + Some(BedRecord(chr, start, tStart, name.map(_ + "_utr3"))) + case (Some(true), Some(tStart), Some(tEnd)) if (tStart > start && tEnd < end) => + Some(BedRecord(chr, tEnd, end, name.map(_ + "_utr3"))) + case _ => None + } + + override def toString = { + def arrayToOption[T](array: IndexedSeq[T]): Option[IndexedSeq[T]] = { + if (array.isEmpty) None + else Some(array) + } + List(Some(chr), Some(start), Some(end), + name, score, strand.map(if (_) "+" else "-"), + thickStart, thickEnd, rgbColor.map(x => s"${x._1},${x._2},${x._3}"), + blockCount, arrayToOption(blockSizes).map(_.mkString(",")), arrayToOption(blockStarts).map(_.mkString(","))) + .takeWhile(_.isDefined) + .flatten + .mkString("\t") + } + + def validate = { + require(start < end, "Start is greater then end") + (thickStart, thickEnd) match { + case (Some(s), Some(e)) => require(s <= e, "Thick start is greater then end") + case _ => + } + blockCount match { + case Some(count) => { + require(count == blockSizes.length, "Number of sizes is not the same as blockCount") + require(count == blockStarts.length, "Number of starts is not the same as blockCount") + } + case _ => + } + this + } + + def toSamInterval = (name, strand) match { + case (Some(name), Some(strand)) => new Interval(chr, start + 1, end, !strand, name) + case (Some(name), _) => new Interval(chr, start + 1, end, false, name) + case _ => new Interval(chr, start + 1, end) + } +} + +object BedRecord { + def fromLine(line: 
String): BedRecord = { + val values = line.split("\t") + require(values.length >= 3, "Not enough columns count for a bed file") + BedRecord( + values(0), + values(1).toInt, + values(2).toInt, + values.lift(3), + values.lift(4).map(_.toDouble), + values.lift(5).map { + case "-" => false + case "+" => true + case _ => throw new IllegalStateException("Strand (column 6) must be '+' or '-'") + }, + values.lift(6).map(_.toInt), + values.lift(7) map (_.toInt), + values.lift(8).map(_.split(",", 3).map(_.toInt)).map(x => (x.headOption.getOrElse(0), x.lift(1).getOrElse(0), x.lift(2).getOrElse(0))), + values.lift(9).map(_.toInt), + values.lift(10).map(_.split(",").map(_.toInt).toIndexedSeq).getOrElse(IndexedSeq()), + values.lift(11).map(_.split(",").map(_.toInt).toIndexedSeq).getOrElse(IndexedSeq()) + ) + } +} \ No newline at end of file diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecordList.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecordList.scala new file mode 100644 index 0000000000000000000000000000000000000000..c03c4bccc9e22b0b103e032910ea4338fdc3ed42 --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecordList.scala @@ -0,0 +1,146 @@ +package nl.lumc.sasc.biopet.utils.intervals + +import java.io.{ PrintWriter, File } + +import htsjdk.samtools.reference.FastaSequenceFile + +import scala.collection.JavaConversions._ + +import scala.collection.mutable +import scala.collection.mutable.ListBuffer +import scala.io.Source +import nl.lumc.sasc.biopet.core.Logging + +/** + * Created by pjvan_thof on 8/20/15. 
+ */ +case class BedRecordList(val chrRecords: Map[String, List[BedRecord]], val header: List[String] = Nil) { + def allRecords = for (chr <- chrRecords; record <- chr._2) yield record + + def toSamIntervals = allRecords.map(_.toSamInterval) + + lazy val sorted = { + val sorted = new BedRecordList(chrRecords.map(x => x._1 -> x._2.sortWith((a, b) => a.start < b.start))) + if (sorted.chrRecords.forall(x => x._2 == chrRecords(x._1))) this else sorted + } + + lazy val isSorted = sorted.hashCode() == this.hashCode() || sorted.chrRecords.forall(x => x._2 == chrRecords(x._1)) + + def overlapWith(record: BedRecord) = sorted.chrRecords + .getOrElse(record.chr, Nil) + .dropWhile(_.end <= record.start) + .takeWhile(_.start < record.end) + + def length = allRecords.foldLeft(0L)((a, b) => a + b.length) + + def squishBed(strandSensitive: Boolean = true, nameSensitive: Boolean = true) = BedRecordList.fromList { + (for ((chr, records) <- sorted.chrRecords; record <- records) yield { + val overlaps = overlapWith(record) + .filterNot(_ == record) + .filterNot(strandSensitive && _.strand != record.strand) + .filterNot(nameSensitive && _.name == record.name) + if (overlaps.isEmpty) { + List(record) + } else { + overlaps + .foldLeft(List(record))((result, overlap) => { + (for (r <- result) yield { + (overlap.start <= r.start, overlap.end >= r.end) match { + case (true, true) => + Nil + case (true, false) => + List(r.copy(start = overlap.end, _originals = List(r))) + case (false, true) => + List(r.copy(end = overlap.start, _originals = List(r))) + case (false, false) => + List(r.copy(end = overlap.start, _originals = List(r)), r.copy(start = overlap.end, _originals = List(r))) + } + }).flatten + }) + } + }).flatten + } + + def combineOverlap: BedRecordList = { + new BedRecordList(for ((chr, records) <- sorted.chrRecords) yield chr -> { + def combineOverlap(records: List[BedRecord], + newRecords: ListBuffer[BedRecord] = ListBuffer()): List[BedRecord] = { + if (records.nonEmpty) { + val 
chr = records.head.chr + val start = records.head.start + val overlapRecords = records.takeWhile(_.start <= records.head.end) + val end = overlapRecords.map(_.end).max + + newRecords += BedRecord(chr, start, end, _originals = overlapRecords) + combineOverlap(records.drop(overlapRecords.length), newRecords) + } else newRecords.toList + } + combineOverlap(records) + }) + } + + def scatter(binSize: Int) = BedRecordList( + chrRecords.map(x => x._1 -> x._2.flatMap(_.scatter(binSize))) + ) + + def validateContigs(reference: File) = { + val referenceFile = new FastaSequenceFile(reference, true) + val dict = referenceFile.getSequenceDictionary + val notExisting = chrRecords.keys.filter(dict.getSequence(_) == null).toList + require(notExisting.isEmpty, s"Contigs found in bed records but are not existing in reference: ${notExisting.mkString(",")}") + this + } + + def writeToFile(file: File): Unit = { + val writer = new PrintWriter(file) + header.foreach(writer.println) + allRecords.foreach(writer.println) + writer.close() + } +} + +object BedRecordList { + def fromListWithHeader(records: Traversable[BedRecord], + header: List[String]): BedRecordList = fromListWithHeader(records.toIterator, header) + + def fromListWithHeader(records: TraversableOnce[BedRecord], header: List[String]): BedRecordList = { + val map = mutable.Map[String, ListBuffer[BedRecord]]() + for (record <- records) { + if (!map.contains(record.chr)) map += record.chr -> ListBuffer() + map(record.chr) += record + } + new BedRecordList(map.toMap.map(m => m._1 -> m._2.toList), header) + } + + def fromList(records: Traversable[BedRecord]): BedRecordList = fromListWithHeader(records.toIterator, Nil) + + def fromList(records: TraversableOnce[BedRecord]): BedRecordList = fromListWithHeader(records, Nil) + + def fromFile(bedFile: File) = { + val reader = Source.fromFile(bedFile) + val all = reader.getLines().toList + val header = all.takeWhile(x => x.startsWith("browser") || x.startsWith("track")) + var lineCount = 
header.length + val content = all.drop(lineCount) + try { + fromListWithHeader(content.map(line => { + lineCount += 1 + BedRecord.fromLine(line).validate + }), header) + } catch { + case e: Exception => + Logging.logger.warn(s"Parsing line number $lineCount failed on file: ${bedFile.getAbsolutePath}") + throw e + } finally { + reader.close() + } + } + + def fromReference(file: File) = { + val referenceFile = new FastaSequenceFile(file, true) + + fromList(for (contig <- referenceFile.getSequenceDictionary.getSequences) yield { + BedRecord(contig.getSequenceName, 0, contig.getSequenceLength) + }) + } +} \ No newline at end of file diff --git a/public/biopet-framework/src/test/resources/rrna02.bed b/public/biopet-framework/src/test/resources/rrna02.bed index 191138d56b2f30f42f1dfe3a26769777e51e04d3..7d59f4c301bcfeefadafc791c589d52f6244f049 100644 --- a/public/biopet-framework/src/test/resources/rrna02.bed +++ b/public/biopet-framework/src/test/resources/rrna02.bed @@ -2,5 +2,5 @@ chrQ 300 350 rRNA03 0 + chrQ 350 400 rRNA03 0 + chrQ 450 480 rRNA02 0 - chrQ 470 475 rRNA04 0 - -chrQ 1 200 rRNA01 0 . -chrQ 150 250 rRNA01 0 . +chrQ 1 200 rRNA01 0 +chrQ 150 250 rRNA01 0 diff --git a/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecordListTest.scala b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecordListTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..b748006c2800d5e86aad9e5f918a3d4bf5894c2c --- /dev/null +++ b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecordListTest.scala @@ -0,0 +1,154 @@ +package nl.lumc.sasc.biopet.utils.intervals + +import java.io.{ PrintWriter, File } + +import htsjdk.samtools.util.Interval +import org.scalatest.Matchers +import org.scalatest.testng.TestNGSuite +import org.testng.annotations.{ Test, AfterClass, BeforeClass } + +import scala.io.Source + +/** + * Created by pjvan_thof on 8/25/15. 
+ */ +class BedRecordListTest extends TestNGSuite with Matchers { + @BeforeClass + def start: Unit = { + { + val writer = new PrintWriter(BedRecordListTest.bedFile) + writer.print(BedRecordListTest.bedContent) + writer.close() + } + { + val writer = new PrintWriter(BedRecordListTest.corruptBedFile) + writer.print(BedRecordListTest.corruptBedContent) + writer.close() + } + { + val writer = new PrintWriter(BedRecordListTest.bedFileUcscHeader) + writer.print(BedRecordListTest.ucscHeader) + writer.print(BedRecordListTest.bedContent) + writer.close() + } + } + + @Test def testReadBedFile { + val records = BedRecordList.fromFile(BedRecordListTest.bedFile) + records.allRecords.size shouldBe 2 + records.header shouldBe Nil + + val tempFile = File.createTempFile("region", ".bed") + records.writeToFile(tempFile) + BedRecordList.fromFile(tempFile) shouldBe records + tempFile.delete() + } + + @Test def testReadBedFileUcscHeader { + val records = BedRecordList.fromFile(BedRecordListTest.bedFileUcscHeader) + records.allRecords.size shouldBe 2 + records.header shouldBe BedRecordListTest.ucscHeader.split("\n").toList + + val tempFile = File.createTempFile("region", ".bed") + records.writeToFile(tempFile) + BedRecordList.fromFile(tempFile) shouldBe records + tempFile.delete() + } + + @Test def testSorted: Unit = { + val unsorted = BedRecordList.fromList(List(BedRecord("chrQ", 10, 20), BedRecord("chrQ", 0, 10))) + unsorted.isSorted shouldBe false + unsorted.sorted.isSorted shouldBe true + val sorted = BedRecordList.fromList(List(BedRecord("chrQ", 0, 10), BedRecord("chrQ", 10, 20))) + sorted.isSorted shouldBe true + sorted.sorted.isSorted shouldBe true + sorted.hashCode() shouldBe sorted.sorted.hashCode() + } + + @Test def testOverlap: Unit = { + val list = BedRecordList.fromList(List(BedRecord("chrQ", 0, 10), BedRecord("chrQ", 10, 20))) + list.overlapWith(BedRecord("chrQ", 5, 15)).size shouldBe 2 + list.overlapWith(BedRecord("chrQ", 0, 10)).size shouldBe 1 + 
list.overlapWith(BedRecord("chrQ", 10, 20)).size shouldBe 1 + list.overlapWith(BedRecord("chrQ", 19, 25)).size shouldBe 1 + list.overlapWith(BedRecord("chrQ", 20, 25)).size shouldBe 0 + } + + @Test def testLength: Unit = { + val list = BedRecordList.fromList(List(BedRecord("chrQ", 0, 10), BedRecord("chrQ", 10, 20))) + list.length shouldBe 20 + } + + @Test def testCombineOverlap: Unit = { + val noOverlapList = BedRecordList.fromList(List(BedRecord("chrQ", 0, 10), BedRecord("chrQ", 10, 20))) + noOverlapList.length shouldBe 20 + noOverlapList.combineOverlap.length shouldBe 20 + + val overlapList = BedRecordList.fromList(List(BedRecord("chrQ", 0, 10), BedRecord("chrQ", 5, 15), BedRecord("chrQ", 10, 20))) + overlapList.length shouldBe 30 + overlapList.combineOverlap.length shouldBe 20 + } + + @Test def testSquishBed: Unit = { + val noOverlapList = BedRecordList.fromList(List(BedRecord("chrQ", 0, 10), BedRecord("chrQ", 10, 20))) + noOverlapList.length shouldBe 20 + noOverlapList.squishBed().length shouldBe 20 + + val overlapList = BedRecordList.fromList(List( + BedRecord("chrQ", 0, 10), + BedRecord("chrQ", 5, 15), + BedRecord("chrQ", 10, 20), + BedRecord("chrQ", 25, 35), + BedRecord("chrQ", 50, 80), + BedRecord("chrQ", 60, 70) + )) + overlapList.length shouldBe 80 + val squishedList = overlapList.squishBed(strandSensitive = false, nameSensitive = false) + squishedList.allRecords.size shouldBe 5 + squishedList.length shouldBe 40 + } + + @Test def testSamInterval: Unit = { + val list = BedRecordList.fromList(List(BedRecord("chrQ", 0, 10), BedRecord("chrQ", 5, 15))) + list.toSamIntervals.toList shouldBe List(new Interval("chrQ", 1, 10), new Interval("chrQ", 6, 15)) + } + + @Test def testTraversable: Unit = { + val list = List(BedRecord("chrQ", 0, 10)) + BedRecordList.fromList(list) shouldBe BedRecordList.fromList(list.toIterator) + } + + @Test def testErrors: Unit = { + intercept[IllegalArgumentException] { + val records = 
BedRecordList.fromFile(BedRecordListTest.corruptBedFile) + } + } + + @Test def testScatter: Unit = { + val list = BedRecordList.fromList(List(BedRecord("chrQ", 0, 1000), BedRecord("chrQ", 3000, 3500))) + list.scatter(100).allRecords.size shouldBe 15 + list.scatter(100).length shouldBe 1500 + } + + @AfterClass + def end: Unit = { + BedRecordListTest.bedFile.delete() + BedRecordListTest.corruptBedFile.delete() + BedRecordListTest.bedFileUcscHeader.delete() + } +} + +object BedRecordListTest { + val ucscHeader = """browser position chr7:127471196-127495720 + |browser hide all + |track name="ItemRGBDemo" description="Item RGB demonstration" visibility=2 itemRgb="On" + |""".stripMargin + val bedContent = """chr22 1000 5000 cloneA 960 + 1000 5000 0 2 567,488 0,3512 + |chr22 2000 6000 cloneB 900 - 2000 6000 0 2 433,399 0,3601""".stripMargin + val corruptBedContent = """chr22 5000 1000 cloneA 960 + 1000 5000 0 2 567,488 0,3512 + |chr22 2000 6000 cloneB 900 - 2000 6000 0 2 433,399 0,3601""".stripMargin + + val bedFile = File.createTempFile("regions", ".bed") + val corruptBedFile = File.createTempFile("regions", ".bed") + val bedFileUcscHeader = File.createTempFile("regions", ".bed") +} \ No newline at end of file diff --git a/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecordTest.scala b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecordTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..a4ae25293748214d8d0d3b27494b01c34f71a684 --- /dev/null +++ b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecordTest.scala @@ -0,0 +1,163 @@ +package nl.lumc.sasc.biopet.utils.intervals + +import htsjdk.samtools.util.Interval +import org.scalatest.Matchers +import org.scalatest.testng.TestNGSuite +import org.testng.annotations.Test + +/** + * Created by pjvanthof on 24/08/15. 
+ */ +class BedRecordTest extends TestNGSuite with Matchers { + @Test def testLineParse: Unit = { + BedRecord("chrQ", 0, 4) shouldBe BedRecord("chrQ", 0, 4) + BedRecord.fromLine("chrQ\t0\t4") shouldBe BedRecord("chrQ", 0, 4) + BedRecord.fromLine("chrQ\t0\t4\tname\t3\t+") shouldBe BedRecord("chrQ", 0, 4, Some("name"), Some(3.0), Some(true)) + BedRecord.fromLine("chrQ\t0\t4\tname\t3\t+\t1\t3") shouldBe + BedRecord("chrQ", 0, 4, Some("name"), Some(3.0), Some(true), Some(1), Some(3)) + BedRecord.fromLine("chrQ\t0\t4\tname\t3\t+\t1\t3\t255,0,0") shouldBe + BedRecord("chrQ", 0, 4, Some("name"), Some(3.0), Some(true), Some(1), Some(3), Some((255, 0, 0))) + BedRecord.fromLine("chrQ\t0\t100\tname\t3\t+\t1\t3\t255,0,0\t2\t10,20\t20,50") shouldBe + BedRecord("chrQ", 0, 100, Some("name"), Some(3.0), Some(true), Some(1), Some(3), Some((255, 0, 0)), + Some(2), IndexedSeq(10, 20), IndexedSeq(20, 50)) + } + + @Test def testLineOutput: Unit = { + BedRecord("chrQ", 0, 4).toString shouldBe "chrQ\t0\t4" + BedRecord.fromLine("chrQ\t0\t4").toString shouldBe "chrQ\t0\t4" + BedRecord.fromLine("chrQ\t0\t100\tname\t3\t+\t1\t3\t255,0,0\t2\t10,20\t20,50").toString shouldBe "chrQ\t0\t100\tname\t3.0\t+\t1\t3\t255,0,0\t2\t10,20\t20,50" + } + + @Test def testOverlap: Unit = { + BedRecord("chrQ", 0, 4).overlapWith(BedRecord("chrQ", 0, 4)) shouldBe true + BedRecord("chrQ", 0, 4).overlapWith(BedRecord("chrX", 0, 4)) shouldBe false + BedRecord("chrQ", 0, 4).overlapWith(BedRecord("chrQ", 4, 8)) shouldBe false + BedRecord("chrQ", 0, 4).overlapWith(BedRecord("chrQ", 3, 8)) shouldBe true + BedRecord("chrQ", 4, 8).overlapWith(BedRecord("chrQ", 0, 4)) shouldBe false + BedRecord("chrQ", 3, 4).overlapWith(BedRecord("chrQ", 0, 4)) shouldBe true + BedRecord("chrQ", 3, 4).overlapWith(BedRecord("chrQ", 4, 5)) shouldBe false + } + + @Test def testLength: Unit = { + BedRecord("chrQ", 0, 4).length shouldBe 4 + BedRecord("chrQ", 0, 1).length shouldBe 1 + BedRecord("chrQ", 3, 4).length shouldBe 1 + } + + @Test def 
testToSamInterval: Unit = { + BedRecord("chrQ", 0, 4).toSamInterval shouldBe new Interval("chrQ", 1, 4) + BedRecord("chrQ", 0, 4, Some("name"), Some(0.0), Some(true)).toSamInterval shouldBe new Interval("chrQ", 1, 4, false, "name") + } + + @Test def testExons: Unit = { + BedRecord.fromLine("chrQ\t0\t100\tname\t3\t+\t1\t3\t255,0,0").exons shouldBe None + + val record = BedRecord.fromLine("chrQ\t0\t100\tname\t3\t+\t1\t3\t255,0,0\t2\t10,20\t0,80") + val exons = record.exons + exons should not be None + exons.get(0).originals()(0) shouldBe record + exons.get(0).originals().size shouldBe 1 + exons.get(1).originals()(0) shouldBe record + exons.get(1).originals().size shouldBe 1 + exons.get(0).start shouldBe 0 + exons.get(0).end shouldBe 10 + exons.get(1).start shouldBe 80 + exons.get(1).end shouldBe 100 + exons.get.foldLeft(0)(_ + _.length) shouldBe 30 + } + + @Test def testIntrons: Unit = { + BedRecord.fromLine("chrQ\t0\t100\tname\t3\t+\t1\t3\t255,0,0").introns shouldBe None + + val record = BedRecord.fromLine("chrQ\t0\t100\tname\t3\t+\t1\t3\t255,0,0\t2\t10,20\t0,80") + val introns = record.introns + introns should not be None + introns.get(0).originals()(0) shouldBe record + introns.get(0).originals().size shouldBe 1 + introns.get(0).start shouldBe 10 + introns.get(0).end shouldBe 80 + introns.get.foldLeft(0)(_ + _.length) shouldBe 70 + } + + @Test def testExonIntronOverlap: Unit = { + val record = BedRecord.fromLine("chrQ\t0\t100\tname\t3\t+\t1\t3\t255,0,0\t2\t10,20\t0,80") + val exons = record.exons + val introns = record.introns + for (exon <- exons.get; intron <- introns.get) { + exon.overlapWith(intron) shouldBe false + } + } + + @Test def testUtrsPositive: Unit = { + BedRecord.fromLine("chrQ\t0\t100\tname\t3\t+").utr3 shouldBe None + BedRecord.fromLine("chrQ\t0\t100\tname\t3\t+").utr5 shouldBe None + + val record = BedRecord.fromLine("chrQ\t0\t100\tname\t3\t+\t3\t93\t255,0,0\t2\t10,20\t0,80") + val utr5 = record.utr5 + val utr3 = record.utr3 + utr5 should not be 
None + utr3 should not be None + utr5.get.length shouldBe 3 + utr3.get.length shouldBe 7 + + } + + @Test def testUtrsNegative: Unit = { + BedRecord.fromLine("chrQ\t0\t100\tname\t3\t-").utr3 shouldBe None + BedRecord.fromLine("chrQ\t0\t100\tname\t3\t-").utr5 shouldBe None + + val record = BedRecord.fromLine("chrQ\t0\t100\tname\t3\t-\t3\t93\t255,0,0\t2\t10,20\t0,80") + val utr5 = record.utr5 + val utr3 = record.utr3 + utr5 should not be None + utr3 should not be None + utr5.get.length shouldBe 7 + utr3.get.length shouldBe 3 + } + + @Test def testOriginals: Unit = { + val original = BedRecord("chrQ", 1, 2) + val level1 = BedRecord("chrQ", 1, 2, _originals = List(original)) + val level2 = BedRecord("chrQ", 2, 3, _originals = List(level1)) + original.originals() shouldBe List(original) + original.originals(nested = false) shouldBe List(original) + level1.originals() shouldBe List(original) + level1.originals(nested = false) shouldBe List(original) + level2.originals() shouldBe List(original) + level2.originals(nested = false) shouldBe List(level1) + } + + @Test def testScatter: Unit = { + val list = BedRecord("chrQ", 0, 1000).scatter(10) + list.size shouldBe 100 + BedRecordList.fromList(list).length shouldBe 1000 + for (l1 <- list; l2 <- list if l1 != l2) l1.overlapWith(l2) shouldBe false + + val list2 = BedRecord("chrQ", 0, 999).scatter(10) + list2.size shouldBe 99 + BedRecordList.fromList(list2).length shouldBe 999 + for (l1 <- list2; l2 <- list2 if l1 != l2) l1.overlapWith(l2) shouldBe false + + val list3 = BedRecord("chrQ", 0, 999).scatter(9) + list3.size shouldBe 111 + BedRecordList.fromList(list3).length shouldBe 999 + for (l1 <- list3; l2 <- list3 if l1 != l2) l1.overlapWith(l2) shouldBe false + } + + @Test def testErrors: Unit = { + BedRecord("chrQ", 0, 3).validate + BedRecord.fromLine("chrQ\t0\t100\tname\t3\t+\t1\t3\t255,0,0\t2\t10,20\t20,50").validate + intercept[IllegalArgumentException] { + BedRecord("chrQ", 0, 0).validate + } + 
intercept[IllegalArgumentException] { + BedRecord("chrQ", 4, 3).validate + } + intercept[IllegalArgumentException] { + BedRecord.fromLine("chrQ\t0\t100\tname\t3\t+\t1\t3\t255,0,0\t2\t10\t50").validate + } + intercept[IllegalStateException] { + BedRecord.fromLine("chrQ\t0\t100\tname\t3\tx\t1\t3\t255,0,0\t2\t10,20\t20,50").validate + } + } +} diff --git a/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/MappingReport.scala b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/MappingReport.scala index c855736f77111a7695b812ebf9af3b3b089a6887..ab703e5b5e44a7cc0762e8cff51b1f3b23c66ab9 100644 --- a/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/MappingReport.scala +++ b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/MappingReport.scala @@ -35,18 +35,22 @@ object MappingReport extends ReportBuilder { /** Root page for single BamMetrcis report */ def indexPage = { - val bamMetricsPage = BammetricsReport.bamMetricsPage(summary, sampleId, libId) - ReportPage(List("QC" -> FlexiprepReport.flexiprepPage) ::: bamMetricsPage.subPages ::: List( - "Versions" -> ReportPage(List(), List("Executables" -> ReportSection("/nl/lumc/sasc/biopet/core/report/executables.ssp" - )), Map()), - "Files" -> ReportPage(List(), List( - "Input fastq files" -> ReportSection("/nl/lumc/sasc/biopet/pipelines/flexiprep/flexiprepInputfiles.ssp"), - "After QC fastq files" -> ReportSection("/nl/lumc/sasc/biopet/pipelines/flexiprep/flexiprepOutputfiles.ssp"), - "Bam files per lib" -> ReportSection("/nl/lumc/sasc/biopet/pipelines/mapping/outputBamfiles.ssp", Map("sampleLevel" -> false)) - ), Map()) - ), List( + val skipFlexiprep = summary.getValue(sampleId, libId, "mapping", "settings", "skip_flexiprep").getOrElse(false) == true + val bamMetricsPage = if (summary.getValue(sampleId, libId, "mapping", "settings", "skip_metrics").getOrElse(false) != true) { + Some(BammetricsReport.bamMetricsPage(summary, sampleId, libId)) + } else None 
+ ReportPage((if (skipFlexiprep) Nil else List("QC" -> FlexiprepReport.flexiprepPage)) ::: + bamMetricsPage.map(_.subPages).getOrElse(Nil) ::: List( + "Versions" -> ReportPage(List(), List("Executables" -> ReportSection("/nl/lumc/sasc/biopet/core/report/executables.ssp" + )), Map()), + "Files" -> ReportPage(List(), (if (skipFlexiprep) Nil else List( + "Input fastq files" -> ReportSection("/nl/lumc/sasc/biopet/pipelines/flexiprep/flexiprepInputfiles.ssp"), + "After QC fastq files" -> ReportSection("/nl/lumc/sasc/biopet/pipelines/flexiprep/flexiprepOutputfiles.ssp"))) ::: + List("Bam files per lib" -> ReportSection("/nl/lumc/sasc/biopet/pipelines/mapping/outputBamfiles.ssp", Map("sampleLevel" -> false)) + ), Map()) + ), List( "Report" -> ReportSection("/nl/lumc/sasc/biopet/pipelines/mapping/mappingFront.ssp") - ) ::: bamMetricsPage.sections, + ) ::: bamMetricsPage.map(_.sections).getOrElse(Nil), Map() ) } diff --git a/public/mapping/src/test/resources/ref.1.bt2 b/public/mapping/src/test/resources/ref.1.bt2 new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/public/mapping/src/test/resources/ref.1.ebwt b/public/mapping/src/test/resources/ref.1.ebwt new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/public/mapping/src/test/scala/nl/lumc/sasc/biopet/pipelines/mapping/MappingTest.scala b/public/mapping/src/test/scala/nl/lumc/sasc/biopet/pipelines/mapping/MappingTest.scala index 78b9b3dd400fa9daf5725c3407e60f32f9f38eae..71953da59f9b917d9c28a3a398198a2e823dd82a 100644 --- a/public/mapping/src/test/scala/nl/lumc/sasc/biopet/pipelines/mapping/MappingTest.scala +++ b/public/mapping/src/test/scala/nl/lumc/sasc/biopet/pipelines/mapping/MappingTest.scala @@ -142,11 +142,13 @@ object MappingTest { copyFile("ref.fa") copyFile("ref.dict") copyFile("ref.fa.fai") + copyFile("ref.1.bt2") + copyFile("ref.1.ebwt") val executables = Map( 
"reference_fasta" -> (outputDir + File.separator + "ref.fa"), "db" -> "test", - "bowtie_index" -> "test", + "bowtie_index" -> (outputDir + File.separator + "ref"), "fastqc" -> Map("exe" -> "test"), "seqtk" -> Map("exe" -> "test"), "gsnap" -> Map("exe" -> "test"), diff --git a/public/pom.xml b/public/pom.xml index 93acfae5e8cdc28359579b70eac5cddc2643f161..a59a97680ac2b080f8a6311d18a56af24f35b6aa 100644 --- a/public/pom.xml +++ b/public/pom.xml @@ -72,6 +72,7 @@ <artifactId>maven-surefire-plugin</artifactId> <version>2.18.1</version> <configuration> + <forkCount>1C</forkCount> <workingDirectory>${project.build.directory}</workingDirectory> </configuration> </plugin>