diff --git a/Jenkinsfile b/Jenkinsfile
index 9e16e30192a9e978cc14048dc80c636e05263b15..7c36f6d83c0eacb86a93c4e28bab91b27e90a7f9 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,43 +1,49 @@
 node('local') {
-    try {
+    timeout(45) {
+        try {
 
-        stage('Init') {
-            tool 'JDK 8u102'
-            tool 'Apache Maven 3.3.9'
-        }
+            stage('Init') {
+                tool 'JDK 8u102'
+                tool 'Apache Maven 3.3.9'
+            }
 
-        stage('Checkout') {
-            checkout scm
-            sh 'git submodule update --init --recursive'
-        }
+            stage('Checkout') {
+                checkout scm
+                sh 'git submodule update --init --recursive'
+            }
 
-        stage('Build and Test') {
-            withMaven(maven: 'Apache Maven 3.3.9', jdk: 'JDK 8u102') {
-                sh 'mvn -B -T 2 -Dmaven.test.failure.ignore clean package'
+            stage('Build and Test') {
+                withMaven(maven: 'Apache Maven 3.3.9', jdk: 'JDK 8u102') {
+                    sh 'mvn -B -T 2 -Dmaven.test.failure.ignore clean package'
+                }
             }
-        }
 
-        stage('Report Tests') {
-            junit '*/target/surefire-reports/*.xml'
-        }
+            stage('Report Tests') {
+                junit '*/target/surefire-reports/*.xml'
+            }
 
-        stage('Check Documentation') {
-            sh 'mkdocs build --clean --strict'
-        }
+            stage('Check git on changes') {
+                sh 'if [ $(git diff | wc -l) -eq 0 ]; then true; else echo "[ERROR] Git is not clean anymore after build"; git diff; echo "[ERROR] This might be caused by reformatted code, if so run maven locally"; false; fi'
+            }
 
-        if(currentBuild.result == null || "SUCCESS".equals(currentBuild.result)) {
-            currentBuild.result = "SUCCESS"
-            slackSend (color: '#00FF00', message: "${currentBuild.result}: Job '${env.JOB_NAME} #${env.BUILD_NUMBER}' (<${env.BUILD_URL}|Open>)", channel: '#biopet-bot', teamDomain: 'lumc', tokenCredentialId: 'lumc')
-        } else {
-            slackSend (color: '#FFFF00', message: "${currentBuild.result}: Job '${env.JOB_NAME} #${env.BUILD_NUMBER}' (<${env.BUILD_URL}|Open>)", channel: '#biopet-bot', teamDomain: 'lumc', tokenCredentialId: 'lumc')
-        }
+            stage('Check Documentation') {
+                sh 'mkdocs build --clean --strict'
+            }
 
-    } catch (e) {
-        if(currentBuild.result == null || "FAILED".equals(currentBuild.result)) {
-            currentBuild.result = "FAILED"
-        }
-        slackSend (color: '#FF0000', message: "${currentBuild.result}: Job '${env.JOB_NAME} #${env.BUILD_NUMBER}' (<${env.BUILD_URL}|Open>)", channel: '#biopet-bot', teamDomain: 'lumc', tokenCredentialId: 'lumc')
+            if (currentBuild.result == null || "SUCCESS".equals(currentBuild.result)) {
+                currentBuild.result = "SUCCESS"
+                slackSend(color: '#00FF00', message: "${currentBuild.result}: Job '${env.JOB_NAME} #${env.BUILD_NUMBER}' (<${env.BUILD_URL}|Open>)", channel: '#biopet-bot', teamDomain: 'lumc', tokenCredentialId: 'lumc')
+            } else {
+                slackSend(color: '#FFFF00', message: "${currentBuild.result}: Job '${env.JOB_NAME} #${env.BUILD_NUMBER}' (<${env.BUILD_URL}|Open>)", channel: '#biopet-bot', teamDomain: 'lumc', tokenCredentialId: 'lumc')
+            }
+
+        } catch (e) {
+            if (currentBuild.result == null || "FAILED".equals(currentBuild.result)) {
+                currentBuild.result = "FAILED"
+            }
+            slackSend(color: '#FF0000', message: "${currentBuild.result}: Job '${env.JOB_NAME} #${env.BUILD_NUMBER}' (<${env.BUILD_URL}|Open>)", channel: '#biopet-bot', teamDomain: 'lumc', tokenCredentialId: 'lumc')
 
-        throw e
+            throw e
+        }
     }
 }
diff --git a/README.md b/README.md
index ceb0e9488afac4ab327f94ba410b305cfbf722e3..b61b4a863135b039059c7d273d94381069af361d 100755
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 Biopet (Bio Pipeline Execution Toolkit) is the main pipeline development framework of the LUMC Sequencing Analysis Support Core team.
 It contains our main pipelines and some of the command line tools we develop in-house.
 It is meant to be used in the main [SHARK](https://humgenprojects.lumc.nl/trac/shark) computing cluster. While usage outside of SHARK is technically possible, some adjustments may need to be made in order to do so.
-Full documantation is here: [Biopet documantation](http://biopet-docs.readthedocs.io/en/latest/)
+Full documentation is here: [Biopet documentation](http://biopet-docs.readthedocs.io/en/latest/)
 
 ## Quick Start
 
@@ -60,7 +60,7 @@ Biopet is based on the Queue framework developed by the Broad Institute as part
 
 We welcome any kind of contribution, be it merge requests on the code base, documentation updates, or any kinds of other fixes! The main language we use is Scala, though the repository also contains a small bit of Python and R.
 Our main code repository is located at [https://github.com/biopet/biopet](https://github.com/biopet/biopet/issues), along with our issue tracker.
 
-For more information please go to our [Developer documantation](http://biopet-docs.readthedocs.io/en/develop/developer/getting-started/)
+For more information please go to our [Developer documentation](http://biopet-docs.readthedocs.io/en/develop/developer/getting-started/)
 
 ## About
diff --git a/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/report/ReportBuilder.scala b/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/report/ReportBuilder.scala
index b854a02c6039beefc4be999187c2bec8c857eb01..95b06e18b47c61d20dcb03c72806a6bb45d80a3a 100644
--- a/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/report/ReportBuilder.scala
+++ b/biopet-core/src/main/scala/nl/lumc/sasc/biopet/core/report/ReportBuilder.scala
@@ -20,7 +20,7 @@ import nl.lumc.sasc.biopet.core.ToolCommandFunction
 import nl.lumc.sasc.biopet.utils.summary.Summary
 import nl.lumc.sasc.biopet.utils.{ IoUtils, Logging, ToolCommand }
 import org.broadinstitute.gatk.utils.commandline.Input
-import org.fusesource.scalate.{ TemplateEngine, TemplateSource }
+import org.fusesource.scalate.TemplateEngine
 
 import scala.collection.mutable
 import scala.language.postfixOps
@@ -236,9 +236,7 @@ object ReportBuilder {
 
   /** Single template render engine, this will have a cache for all compile templates */
   protected val engine = new TemplateEngine()
-
-  /** Cache of temp file for templates from the classpath / jar */
-  private[report] var templateCache: Map[String, File] = Map()
+  engine.allowReload = false
 
   /** This will give the total number of pages including all nested pages */
   def countPages(page: ReportPage): Int = {
@@ -254,15 +252,6 @@ object ReportBuilder {
   def renderTemplate(location: String, args: Map[String, Any] = Map()): String = {
     Logging.logger.info("Rendering: " + location)
 
-    val templateFile: File = templateCache.get(location) match {
-      case Some(template) => template
-      case _ =>
-        val tempFile = File.createTempFile("ssp-template", new File(location).getName)
-        tempFile.deleteOnExit()
-        IoUtils.copyStreamToFile(getClass.getResourceAsStream(location), tempFile)
-        templateCache += location -> tempFile
-        tempFile
-    }
-    engine.layout(TemplateSource.fromFile(templateFile), args)
+    engine.layout(location, args)
   }
 }
\ No newline at end of file
diff --git a/biopet-core/src/test/scala/nl/lumc/sasc/biopet/core/report/ReportBuilderTest.scala b/biopet-core/src/test/scala/nl/lumc/sasc/biopet/core/report/ReportBuilderTest.scala
index ef8a194fad26c383ea110a342d4b30ca73ef3515..d71d54d9fd8d35b9f14a42d4860fd9d210473977 100644
--- a/biopet-core/src/test/scala/nl/lumc/sasc/biopet/core/report/ReportBuilderTest.scala
+++ b/biopet-core/src/test/scala/nl/lumc/sasc/biopet/core/report/ReportBuilderTest.scala
@@ -79,11 +79,8 @@ class ReportBuilderTest extends TestNGSuite with Matchers {
 
   @Test
   def testRenderTemplate: Unit = {
-    ReportBuilder.templateCache = Map()
-    ReportBuilder.templateCache shouldBe empty
     ReportBuilder.renderTemplate("/template.ssp", Map("arg" -> "test")) shouldBe "test"
-    ReportBuilder.templateCache.size shouldBe 1
     ReportBuilder.renderTemplate("/template.ssp", Map("arg" -> "bla")) shouldBe "bla"
-    ReportBuilder.templateCache.size shouldBe 1
   }
+
 }
diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Fastqc.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Fastqc.scala
index cb91edfc63e20f2330b0b8f73ea06b85dfe09796..0d86b08ca80a8979fa35b449a2008fd9289773c2 100644
--- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Fastqc.scala
+++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Fastqc.scala
@@ -75,8 +75,8 @@ class Fastqc(val root: Configurable) extends BiopetCommandLineFunction with Vers
       // otherwise, check if adapters are already present (depending on FastQC version)
       case None =>
         val defaultAdapters = getVersion match {
-          case Some("v0.11.2") => Option(new File(fastqcDir + "/Configuration/adapter_list.txt"))
-          case _               => None
+          case Some(v) if v.contains("v0.11") => Option(new File(fastqcDir + "/Configuration/adapter_list.txt"))
+          case _                              => None
         }
         defaultAdapters.collect { case adp => config("adapters", default = adp) }
     }
diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/delly/DellyCaller.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/delly/DellyCaller.scala
index f7f76ff5d6c409823391e55b683fe4429d9df88c..f3fc087e905c18bebf2c2d4a2551a76a918a8c67 100644
--- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/delly/DellyCaller.scala
+++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/delly/DellyCaller.scala
@@ -42,8 +42,19 @@ class DellyCaller(val root: Configurable) extends BiopetCommandLineFunction with
   var analysistype: String = _
 
   def cmdLine = required(executable) +
-    "-t" + required(analysistype) +
-    "-o" + required(outputvcf) +
-    required(input)
+    required("-t", analysistype) +
+    required("-o", outputvcf) +
+    required(input) +
+    createEmptyOutputIfNeeded
+
+  // When no variants are found the tool does not generate the output file at all; Biopet requires the (empty) output file to exist
+  private def createEmptyOutputIfNeeded =
+    s"""
+       |c=$$?
+       |if [ $$c -eq 0 ] && [ ! -f $outputvcf ]; then
+       |  echo '##fileformat=VCFv4.2' > $outputvcf
+       |  echo '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO' >> $outputvcf
+       |fi
+       |exit $$c""".stripMargin
 }
diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CatVariants.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CatVariants.scala
index ef97ed813055cd483b08ce41f476eb61a70914f4..f06720b5d9ce8846eda5636e44e5223f326820e2 100644
--- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CatVariants.scala
+++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/gatk/CatVariants.scala
@@ -37,6 +37,9 @@ class CatVariants(val root: Configurable) extends BiopetJavaCommandLineFunction
   @Gather(classOf[org.broadinstitute.gatk.queue.function.scattergather.SimpleTextGatherFunction])
   var outputFile: File = _
 
+  /** When GATK's CatVariants is called with empty VCF files it also outputs an empty file. When this parameter is set to true, a valid VCF header is written to the otherwise empty output file. */
+  var writeHeaderToEmptyOutput: Boolean = false
+
   /** assumeSorted should be true if the input files are already sorted (based on the position of the variants) */
   @Argument(fullName = "assumeSorted", shortName = "assumeSorted", doc = "assumeSorted should be true if the input files are already sorted (based on the position of the variants)", required = false, exclusiveOf = "", validation = "")
   var assumeSorted: Boolean = _
@@ -73,5 +76,14 @@ class CatVariants(val root: Configurable) extends BiopetJavaCommandLineFunction
     optional("--variant_index_type", variant_index_type, spaceSeparated = true, escape = true, format = "%s") +
     optional("--variant_index_parameter", variant_index_parameter, spaceSeparated = true, escape = true, format = "%s") +
     optional("-l", logging_level, spaceSeparated = true, escape = true, format = "%s") +
-    optional("-log", log_to_file, spaceSeparated = true, escape = true, format = "%s")
+    optional("-log", log_to_file, spaceSeparated = true, escape = true, format = "%s") +
+    (if (writeHeaderToEmptyOutput) s"""
+      |c=$$?
+      |if [ $$c -eq 0 ] && [ ! -s $outputFile ]; then
+      |  echo '##fileformat=VCFv4.2' > $outputFile
+      |  echo '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO' >> $outputFile
+      |fi
+      |exit $$c""".stripMargin
+    else "")
+
 }
diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/SamtoolsFlagstat.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/SamtoolsFlagstat.scala
index 387f0ef83921265a2a206bc518456b18c147a596..950c26b6d3456b85f4214b0b6f076fb015601be4 100644
--- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/SamtoolsFlagstat.scala
+++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/SamtoolsFlagstat.scala
@@ -28,7 +28,9 @@ class SamtoolsFlagstat(val root: Configurable) extends Samtools {
   var output: File = _
 
   /** Returns command to execute */
-  def cmdLine = required(executable) + required("flagstat") + required(input) + " > " + required(output)
+  def cmdLine = required(executable) + required("flagstat") +
+    (if (inputAsStdin) "-" else required(input)) +
+    (if (outputAsStsout) "" else " > " + required(output))
 }
 
 object SamtoolsFlagstat {
diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/SamtoolsView.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/SamtoolsView.scala
index ad87ddc31fb409959450e784c0e3091329c54d67..319840efece705ed1393629b68e261546439825d 100644
--- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/SamtoolsView.scala
+++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/samtools/SamtoolsView.scala
@@ -33,18 +33,19 @@ class SamtoolsView(val root: Configurable) extends Samtools {
   var f: List[String] = config("f", default = List.empty[String])
   var F: List[String] = config("F", default = List.empty[String])
 
-  def cmdBase = required(executable) +
+  @Input(required = false)
+  var L: Option[File] = None
+
+  def cmdLine = required(executable) +
     required("view") +
     optional("-q", q) +
+    optional("-L", L) +
     repeat("-f", f) +
     repeat("-F", F) +
     conditional(b, "-b") +
-    conditional(h, "-h")
-  def cmdPipeInput = cmdBase + "-"
-  def cmdPipe = cmdBase + required(input)
-
-  /** Returns command to execute */
-  def cmdLine = cmdPipe + " > " + required(output)
+    conditional(h, "-h") +
+    (if (inputAsStdin) "-" else required(input)) +
+    (if (outputAsStsout) "" else " > " + required(output))
 }
 
 object SamtoolsView {
diff --git a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/BamStats.scala b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/BamStats.scala
index 2897808348d9f5b6fff471b759e16bf2d089d36d..f4cb547662ee7262b287b58b934047fcaaf64e3a 100644
--- a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/BamStats.scala
+++ b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/BamStats.scala
@@ -39,7 +39,8 @@ object BamStats extends ToolCommand {
                   bamFile: File = null,
                   referenceFasta: Option[File] = None,
                   binSize: Int = 10000,
-                  threadBinSize: Int = 10000000) extends AbstractArgs
+                  threadBinSize: Int = 10000000,
+                  tsvOutputs: Boolean = false) extends AbstractArgs
 
   class OptParser extends AbstractOptParser {
     opt[File]('R', "reference") valueName "<file>" action { (x, c) =>
@@ -57,6 +58,9 @@ object BamStats extends ToolCommand {
     opt[Int]("threadBinSize") valueName "<int>" action { (x, c) =>
       c.copy(threadBinSize = x)
     } text "Size of region per thread"
+    opt[Unit]("tsvOutputs") action { (x, c) =>
+      c.copy(tsvOutputs = true)
+    } text "Also output tsv files, by default there is only a json"
   }
 
   /** This is the main entry to [[BamStats]], this will do the argument parsing. */
@@ -68,7 +72,7 @@ object BamStats extends ToolCommand {
 
     val sequenceDict = validateReferenceInBam(cmdArgs.bamFile, cmdArgs.referenceFasta)
 
-    init(cmdArgs.outputDir, cmdArgs.bamFile, sequenceDict, cmdArgs.binSize, cmdArgs.threadBinSize)
+    init(cmdArgs.outputDir, cmdArgs.bamFile, sequenceDict, cmdArgs.binSize, cmdArgs.threadBinSize, cmdArgs.tsvOutputs)
 
     logger.info("Done")
   }
@@ -96,13 +100,26 @@ object BamStats extends ToolCommand {
    * @param binSize stats binsize
    * @param threadBinSize Thread binsize
    */
-  def init(outputDir: File, bamFile: File, referenceDict: SAMSequenceDictionary, binSize: Int, threadBinSize: Int): Unit = {
+  def init(outputDir: File, bamFile: File, referenceDict: SAMSequenceDictionary, binSize: Int, threadBinSize: Int, tsvOutput: Boolean): Unit = {
     val contigsFutures = BedRecordList.fromDict(referenceDict).allRecords.map { contig =>
       contig.chr -> processContig(contig, bamFile, binSize, threadBinSize, outputDir)
     }.toList
 
     val stats = waitOnFutures(processUnmappedReads(bamFile) :: contigsFutures.map(_._2))
 
+    if (tsvOutput) {
+      stats.flagstat.writeAsTsv(new File(outputDir, "flagstats.tsv"))
+
+      stats.insertSizeHistogram.writeFilesAndPlot(outputDir, "insertsize", "Insert size", "Reads", "Insert size distribution")
+      stats.mappingQualityHistogram.writeFilesAndPlot(outputDir, "mappingQuality", "Mapping Quality", "Reads", "Mapping Quality distribution")
+      stats.clippingHistogram.writeFilesAndPlot(outputDir, "clipping", "Clipped bases", "Reads", "Clipping distribution")
+
+      stats.leftClippingHistogram.writeFilesAndPlot(outputDir, "left_clipping", "Clipped bases", "Reads", "Left Clipping distribution")
+      stats.rightClippingHistogram.writeFilesAndPlot(outputDir, "right_clipping", "Clipped bases", "Reads", "Right Clipping distribution")
+      stats._3_ClippingHistogram.writeFilesAndPlot(outputDir, "3prime_clipping", "Clipped bases", "Reads", "3 Prime Clipping distribution")
+      stats._5_ClippingHistogram.writeFilesAndPlot(outputDir, "5prime_clipping", "Clipped bases", "Reads", "5 Prime Clipping distribution")
+    }
+
     val statsWriter = new PrintWriter(new File(outputDir, "bamstats.json"))
     val totalStats = stats.toSummaryMap
     val statsMap = Map(
diff --git a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/Histogram.scala b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/Histogram.scala
index 13ce49736c352d30b3703c2cc41dd5dcc309ffc2..169bd4b59d41d49529b867637ba93b84a275c3d3 100644
--- a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/Histogram.scala
+++ b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/Histogram.scala
@@ -14,8 +14,10 @@
  */
 package nl.lumc.sasc.biopet.tools.bamstats
 
-import java.io.{ File, PrintWriter }
-import nl.lumc.sasc.biopet.utils.sortAnyAny
+import java.io.{ File, IOException, PrintWriter }
+
+import nl.lumc.sasc.biopet.utils.rscript.LinePlot
+import nl.lumc.sasc.biopet.utils.{ Logging, sortAnyAny }
 
 import scala.collection.mutable
 
@@ -43,7 +45,7 @@ class Counts[T](_counts: Map[T, Long] = Map[T, Long]())(implicit ord: Ordering[T
   }
 
   /** Write histogram to a tsv/count file */
-  def writeToTsv(file: File): Unit = {
+  def writeHistogramToTsv(file: File): Unit = {
     val writer = new PrintWriter(file)
     writer.println("value\tcount")
     counts.keys.toList.sorted.foreach(x => writer.println(s"$x\t${counts(x)}"))
@@ -82,4 +84,28 @@ class Histogram[T](_counts: Map[T, Long] = Map[T, Long]())(implicit ord: Numeric
     } else Map()
   }
 
+  /** Write aggregate statistics to a tsv file */
+  def writeAggregateToTsv(file: File): Unit = {
+    val writer = new PrintWriter(file)
+    aggregateStats.foreach(x => writer.println(x._1 + "\t" + x._2))
+    writer.close()
+  }
+
+  def writeFilesAndPlot(outputDir: File, prefix: String, xlabel: String, ylabel: String, title: String): Unit = {
+    writeHistogramToTsv(new File(outputDir, prefix + ".histogram.tsv"))
+    writeAggregateToTsv(new File(outputDir, prefix + ".stats.tsv"))
+    val plot = new LinePlot(null)
+    plot.input = new File(outputDir, prefix + ".histogram.tsv")
+    plot.output = new File(outputDir, prefix + ".histogram.png")
+    plot.xlabel = Some(xlabel)
+    plot.ylabel = Some(ylabel)
+    plot.title = Some(title)
+    try {
+      plot.runLocal()
+    } catch {
+      // If plotting fails the tool should not fail; this depends on R being installed
+      case e: IOException => Logging.logger.warn(s"Error found while plotting ${plot.output}: ${e.getMessage}")
+    }
+  }
+
 }
diff --git a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/Stats.scala b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/Stats.scala
index f28fbf2c6bb84f05edce6e55094af2a12a65581f..094a5e5f6d23eb14d20827ba427d342bdb1a0fb0 100644
--- a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/Stats.scala
+++ b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/bamstats/Stats.scala
@@ -50,13 +50,13 @@ case class Stats(flagstat: FlagstatCollector = new FlagstatCollector(),
   def writeStatsToFiles(outputDir: File): Unit = {
     this.flagstat.writeReportToFile(new File(outputDir, "flagstats"))
     this.flagstat.writeSummaryTofile(new File(outputDir, "flagstats.summary.json"))
-    this.mappingQualityHistogram.writeToTsv(new File(outputDir, "mapping_quality.tsv"))
-    this.insertSizeHistogram.writeToTsv(new File(outputDir, "insert_size.tsv"))
-    this.clippingHistogram.writeToTsv(new File(outputDir, "clipping.tsv"))
-    this.leftClippingHistogram.writeToTsv(new File(outputDir, "left_clipping.tsv"))
-    this.rightClippingHistogram.writeToTsv(new File(outputDir, "right_clipping.tsv"))
-    this._5_ClippingHistogram.writeToTsv(new File(outputDir, "5_prime_clipping.tsv"))
-    this._3_ClippingHistogram.writeToTsv(new File(outputDir, "3_prime_clipping.tsv"))
+    this.mappingQualityHistogram.writeHistogramToTsv(new File(outputDir, "mapping_quality.tsv"))
+    this.insertSizeHistogram.writeHistogramToTsv(new File(outputDir, "insert_size.tsv"))
+    this.clippingHistogram.writeHistogramToTsv(new File(outputDir, "clipping.tsv"))
+    this.leftClippingHistogram.writeHistogramToTsv(new File(outputDir, "left_clipping.tsv"))
+    this.rightClippingHistogram.writeHistogramToTsv(new File(outputDir, "right_clipping.tsv"))
+    this._5_ClippingHistogram.writeHistogramToTsv(new File(outputDir, "5_prime_clipping.tsv"))
+    this._3_ClippingHistogram.writeHistogramToTsv(new File(outputDir, "3_prime_clipping.tsv"))
   }
 
   def toSummaryMap = {
diff --git a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/flagstat/FlagstatCollector.scala b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/flagstat/FlagstatCollector.scala
index 362053f99cb0b4c67589388ff4f4876d020749c4..bd44dab1f42ca1226df7d268301fcfec4dbeb694 100644
--- a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/flagstat/FlagstatCollector.scala
+++ b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/flagstat/FlagstatCollector.scala
@@ -32,6 +32,12 @@ class FlagstatCollector {
   protected[FlagstatCollector] var totalCounts: Array[Long] = Array()
   protected[FlagstatCollector] var crossCounts = Array.ofDim[Long](1, 1)
 
+  def writeAsTsv(file: File): Unit = {
+    val writer = new PrintWriter(file)
+    names.foreach(x => writer.println(x._2 + "\t" + totalCounts(x._1)))
+    writer.close()
+  }
+
   def loadDefaultFunctions() {
     addFunction("All", record => true)
     addFunction("Mapped", record => !record.getReadUnmappedFlag)
diff --git a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/VcfStats.scala b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/VcfStats.scala
index e1c2f3539b605da089413f5717c4ca03a6ee0902..65801c97f19249391a0adaa6926380e1ac94345f 100644
--- a/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/VcfStats.scala
+++ b/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/vcfstats/VcfStats.scala
@@ -74,7 +74,7 @@ object VcfStats extends ToolCommand {
     opt[File]('o', "outputDir") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
       c.copy(outputDir = x)
     } validate {
-      x => if (x == null) failure("Output directory required") else success
+      x => if (x == null) failure("Valid output directory required") else if (x.exists) success else failure(s"Output directory does not exist: $x")
     } text "Path to directory for output (required)"
     opt[File]('i', "intervals") unbounded () valueName "<file>" action { (x, c) =>
       c.copy(intervals = Some(x))
diff --git a/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/VcfStatsTest.scala b/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/VcfStatsTest.scala
index f0eeac2b8df529653b1ec2f05db9fbc007c3597f..5a599461c230239c562c03f287048157bf766974 100644
--- a/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/VcfStatsTest.scala
+++ b/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/VcfStatsTest.scala
@@ -24,6 +24,7 @@ import org.scalatest.Matchers
 import org.scalatest.testng.TestNGSuite
 import org.testng.annotations.Test
 import nl.lumc.sasc.biopet.utils.sortAnyAny
+import org.apache.commons.io.FileUtils
 
 import scala.collection.mutable
 
@@ -162,6 +163,16 @@ class VcfStatsTest extends TestNGSuite with Matchers {
     valueFromTsv(i, "Sample_ID_3", "bam") should be(empty)
   }
 
+  @Test
+  def testNoExistOutputDir: Unit = {
+    val tmp = Files.createTempDirectory("vcfStats")
+    FileUtils.deleteDirectory(new File(tmp.toAbsolutePath.toString))
+    val vcf = resourcePath("/chrQ.vcf.gz")
+    val ref = resourcePath("/fake_chrQ.fa")
+
+    an[IllegalArgumentException] should be thrownBy main(Array("-I", vcf, "-R", ref, "-o", tmp.toAbsolutePath.toString))
+  }
+
   @Test
   def testMain() = {
     val tmp = Files.createTempDirectory("vcfStats")
diff --git a/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/bamstats/BamStatsTest.scala b/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/bamstats/BamStatsTest.scala
index e93076696b75f95714b4bfa7d8858e0299b96753..a11efaac89142e46e7f263d1bc3cb92409841698 100644
--- a/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/bamstats/BamStatsTest.scala
+++ b/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/bamstats/BamStatsTest.scala
@@ -34,6 +34,43 @@ class BamStatsTest extends TestNGSuite with Matchers {
 
     new File(outputDir, "bamstats.json") should exist
     new File(outputDir, "bamstats.summary.json") should exist
+
+    new File(outputDir, "flagstats.tsv") shouldNot exist
+    new File(outputDir, "insertsize.stats.tsv") shouldNot exist
+    new File(outputDir, "insertsize.histogram.tsv") shouldNot exist
+    new File(outputDir, "mappingQuality.stats.tsv") shouldNot exist
+    new File(outputDir, "mappingQuality.histogram.tsv") shouldNot exist
+    new File(outputDir, "clipping.stats.tsv") shouldNot exist
+    new File(outputDir, "clipping.histogram.tsv") shouldNot exist
+
+    new File(outputDir, "flagstats") shouldNot exist
+    new File(outputDir, "flagstats.summary.json") shouldNot exist
+    new File(outputDir, "mapping_quality.tsv") shouldNot exist
+    new File(outputDir, "insert_size.tsv") shouldNot exist
+    new File(outputDir, "clipping.tsv") shouldNot exist
+    new File(outputDir, "left_clipping.tsv") shouldNot exist
+    new File(outputDir, "right_clipping.tsv") shouldNot exist
+    new File(outputDir, "5_prime_clipping.tsv") shouldNot exist
+    new File(outputDir, "3_prime_clipping.tsv") shouldNot exist
+  }
+
+  @Test
+  def testTsvOutputs: Unit = {
+    val outputDir = Files.createTempDir()
+    outputDir.deleteOnExit()
+    BamStats.main(Array("-b", BamStatsTest.pairedBam01.getAbsolutePath, "-o", outputDir.getAbsolutePath, "--tsvOutputs"))
+
+    new File(outputDir, "bamstats.json") should exist
+    new File(outputDir, "bamstats.summary.json") should exist
+
+    new File(outputDir, "flagstats.tsv") should exist
+    new File(outputDir, "insertsize.stats.tsv") should exist
+    new File(outputDir, "insertsize.histogram.tsv") should exist
+    new File(outputDir, "mappingQuality.stats.tsv") should exist
+    new File(outputDir, "mappingQuality.histogram.tsv") should exist
+    new File(outputDir, "clipping.stats.tsv") should exist
+    new File(outputDir, "clipping.histogram.tsv") should exist
+
     new File(outputDir, "flagstats") shouldNot exist
     new File(outputDir, "flagstats.summary.json") shouldNot exist
     new File(outputDir, "mapping_quality.tsv") shouldNot exist
diff --git a/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/bamstats/CountsTest.scala b/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/bamstats/CountsTest.scala
index ca6b91eb55fe8997a06be9ede79ad5ab321ae4d8..6d518c75486a6a664f2e2e9f0614800fd98bf24e 100644
--- a/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/bamstats/CountsTest.scala
+++ b/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/bamstats/CountsTest.scala
@@ -86,7 +86,7 @@ class CountsTest extends TestNGSuite with Matchers {
     val tsvFile = File.createTempFile("counts.", ".tsv")
     tsvFile.deleteOnExit()
 
-    c1.writeToTsv(tsvFile)
+    c1.writeHistogramToTsv(tsvFile)
 
     val reader = Source.fromFile(tsvFile)
     reader.getLines().toList shouldBe List("value\tcount", "1\t1", "2\t2", "3\t3")
diff --git a/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/bamstats/HistogramTest.scala b/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/bamstats/HistogramTest.scala
index 09ac60263bf23005d2e81401d80de4caf0bd8f00..116719a29be70fa8acca645ab85a429d568dbc5e 100644
--- a/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/bamstats/HistogramTest.scala
+++ b/biopet-tools/src/test/scala/nl/lumc/sasc/biopet/tools/bamstats/HistogramTest.scala
@@ -64,7 +64,7 @@ class HistogramTest extends TestNGSuite with Matchers {
     val tsvFile = File.createTempFile("counts.", ".tsv")
     tsvFile.deleteOnExit()
 
-    c1.writeToTsv(tsvFile)
+    c1.writeHistogramToTsv(tsvFile)
 
     val reader = Source.fromFile(tsvFile)
     reader.getLines().toList shouldBe List("value\tcount", "1\t1", "2\t2", "3\t3")
diff --git a/carp/src/main/scala/nl/lumc/sasc/biopet/pipelines/carp/Carp.scala b/carp/src/main/scala/nl/lumc/sasc/biopet/pipelines/carp/Carp.scala
index f3cbe9909e649aabcf15201a9c18355f353edae3..fe37a653789b2fb34e72aae7227ac074c1c80aff 100644
--- a/carp/src/main/scala/nl/lumc/sasc/biopet/pipelines/carp/Carp.scala
+++ b/carp/src/main/scala/nl/lumc/sasc/biopet/pipelines/carp/Carp.scala
@@ -61,6 +61,8 @@ class Carp(val root: Configurable) extends QScript with MultisampleMappingTrait
 
   override def preProcessBam = Some(createFile("filter.bam"))
 
+  override def metricsPreprocessBam = false
+
   val controls: List[String] = config("control", default = Nil)
 
   override def summarySettings = super.summarySettings ++ Map("controls" -> controls)
diff --git a/docs/pipelines/carp.md b/docs/pipelines/carp.md
index 436bd7e48c5803a14afc552e93667eb159a1ef64..e0205bb2df3227e02a141f5d039511eb1cb21e7e 100644
--- a/docs/pipelines/carp.md
+++ b/docs/pipelines/carp.md
@@ -101,6 +101,12 @@ While optional settings are:
 1. `aligner`: which aligner to use (`bwa` or `bowtie`)
 2. `macs2`: Here only the callpeak modus is implemented. But one can set all the options from [macs2 callpeak](https://github.com/taoliu/MACS/#call-peaks) in this settings config. Note that the config value is: `macs2_callpeak`
 
+[Gears](gears) is run automatically for the data analysed with `Carp`. There are two levels on which this can be done and this should be specified in the [config](../general/config) file:
+
+* `mapping_to_gears: unmapped` : Unmapped reads after alignment. (default)
+* `mapping_to_gears: all` : Trimmed and clipped reads from [Flexiprep](flexiprep).
+* `mapping_to_gears: none` : Disable this functionality.
+
 ## Configuration for detection of broad peaks (ATAC-seq)
 
 Carp can do broad peak-calling by using the following config:
diff --git a/docs/pipelines/gears.md b/docs/pipelines/gears.md
index f16cf00fb7f5a5c33ee7a8b9ab6860043d9715bc..92f43889dd101da5dce35271733de88c1b054e23 100644
--- a/docs/pipelines/gears.md
+++ b/docs/pipelines/gears.md
@@ -1,10 +1,11 @@
 # Gears
 
 ## Introduction
-Gears is a metagenomics pipeline. (``GE``nome ``A``nnotation of ``R``esidual ``S``equences). One can use this pipeline to identify contamination in sequencing runs on either raw FastQ files or BAM files.
+Gears is a metagenomics pipeline. (``GE``nome ``A``nnotation of ``R``esidual ``S``equences). It can be used to identify contamination in sequencing runs on either raw FastQ files or BAM files. When a BAM file is given as input, it will extract the unaligned read (pair) sequences for analysis.
+It can also be used to analyse sequencing data obtained from metagenomics samples, containing a mix of different organisms. Taxonomic labels will be assigned to the input reads and these will be reported.
 
-Analysis result is reported in a krona graph, which is visible and navigatable in a webbrowser.
+The result of the analysis is reported in a [Krona graph](https://github.com/marbl/Krona/wiki), which can be viewed interactively in a web browser.
 
 Pipeline analysis components include:
 
@@ -17,7 +18,13 @@ Pipeline analysis components include:
 
 ## Gears
 
-This pipeline is used to analyse a group of samples. This pipeline only accepts fastq files. The fastq files first get trimmed and clipped with [Flexiprep](Flexiprep). This can be disabled with the config flags of [Flexiprep](Flexiprep). The samples can be specified with a sample config file, see [Config](../general/Config)
+This pipeline is used to analyse a group of samples and only accepts fastq files. The fastq files first get trimmed and clipped with [Flexiprep](flexiprep).
+This can be disabled with the config flags of [Flexiprep](flexiprep). The samples can be specified with a sample config file, see [Config](../general/config).
+`Gears` uses centrifuge by default as its classification engine. An indexed database, created with ```centrifuge-index```, must be specified by including ```centrifuge_index: /path/to/index``` in the [config](../general/config) file.
+More information on how to build centrifuge databases and indexes can be found [here](https://github.com/infphilo/centrifuge/blob/master/MANUAL.markdown#database-download-and-index-building).
+On LUMC's SHARK the NCBI non-redundant database (nt) is used as the default database against which taxonomic assignments for the short input reads are made.
+If you would like to use another classifier you should specify this in the [config](../general/config) file. Multiple classifiers can be used in one run, if you wish to have multiple outputs for comparison.
+Note that the Centrifuge and Kraken systems can be used for any kind of input sequences (e.g. shotgun or WGS), while the QIIME system is optimized for 16S-based analyses.
 
 ### Config
 
@@ -36,12 +43,14 @@ To start the pipeline (remove `-run` for a dry run):
 
 ``` bash
 biopet pipeline Gears -run \
--config mySettings.json -config samples.json
+-config /path/to/mySettings.json -config /path/to/samples.json
 ```
 
+Note: If you are using the LUMC High Performance Computing cluster (aka SHARK), make sure to include the ```-qsub -jobParaEnv BWA -jobQueue all.q``` flags when invoking the command.
+
 ## GearsSingle
 
-This pipeline can be used to analyse a single sample, this can be fastq files or a bam file. When a bam file is given only the unmapped reads are extracted.
+This pipeline can be used to analyse a single sample, be it fastq files or a bam file. When a bam file is provided as input only the unmapped reads are extracted and further analysed.
 
 ### Example
 
@@ -49,13 +58,14 @@ To start the pipeline (remove `-run` for a dry run):
 
 ``` bash
 biopet pipeline GearsSingle -run \
--R1 myFirstReadPair -R2 mySecondReadPair -sample mySampleName \
--library myLibname -config mySettings.json
+-R1 /path/to/myFirstReadPair -R2 /path/to/mySecondReadPair -sample mySampleName \
+-library myLibname -config /path/to/mySettings.json
 ```
 
+Note: If you are using the LUMC High Performance Computing cluster (aka SHARK), make sure to include the ```-qsub -jobParaEnv BWA -jobQueue all.q``` flags when invoking the command.
 
-### Commandline flags
+### Command line flags
 For technical reasons, single sample pipelines, such as this pipeline do **not** take a sample config.
-Input files are in stead given on the command line as a flag.
+Input files are instead given on the command line as a flag.
 
 Command line flags for Gears are:
 
@@ -77,7 +87,8 @@ Please refer [to our mapping pipeline](mapping.md) for information about how the
 
 | Key | Type | default | Function |
 | --- | ---- | ------- | -------- |
-| gears_use_kraken | Boolean | true | Run fastq file with kraken |
+| gears_use_centrifuge | Boolean | true | Run fastq file with centrifuge |
+| gears_use_kraken | Boolean | false | Run fastq file with kraken |
 | gears_use_qiime_closed | Boolean | false | Run fastq files with qiime with the closed reference module |
 | gears_use_qiime_open | Boolean | false | Run fastq files with qiime with the open reference module |
 | gears_use_qiime_rtax | Boolean | false | Run fastq files with qiime with the rtax module |
 
@@ -85,15 +96,52 @@ Please refer [to our mapping pipeline](mapping.md) for information about how the
 
 ### Result files
 
-The results of `GearsSingle` are stored in the following files:
+The results of a `Gears` run are organised in two folders: `report` and `samples`.
+In the `report` folder, one can find the html report (index.html) displaying the summarised results over all samples and providing a navigable view of the taxonomy graph and its results, per sample.
+In the `samples` folder, one can find a separate folder for each sample. The individual folders follow the input sample naming and contain the results for each analysis run per sample.
+
+## Example
+
+~~~
+OutDir
++-- report
+|   +-- index.html
++-- samples
+|   +-- <sample_name>
+|     +-- centrifuge
+|       +-- <sample_name>.centrifuge.gz
+|       +-- <sample_name>.centrifuge.kreport
+|       +-- <sample_name>.krkn.json
+|       +-- <sample_name>.centrifuge_unique.kreport
+|       +-- <sample_name>.centrifuge_unique.krkn.json
+|     +-- kraken
+|       +-- <sample_name>.krkn.raw
+|       +-- <sample_name>.krkn.full
+|       +-- <sample_name>.krkn.json
+~~~
+
+The `Gears`-specific results are contained in a folder named after each tool that was used (by default `Gears` uses centrifuge). They are stored in the following files:
+
 
 | File suffix | Application | Content | Description |
 | ----------- | ----------- | ------- | ----------- |
+| *.centrifuge.gz | centrifuge | tsv | Annotation per sequence (compressed) |
+| *.centrifuge.kreport | centrifuge-kreport | tsv | Kraken-style report of the centrifuge output including taxonomy information |
+| *.centrifuge.krkn.json | krakenReportToJson | json | JSON representation of the taxonomy report |
+| *.centrifuge_unique.kreport | centrifuge-kreport | tsv | Kraken-style report of the centrifuge output including taxonomy information for the reads with unique taxonomic assignment |
+| *.centrifuge_unique.krkn.json | krakenReportToJson | json | JSON representation of the taxonomy report for the uniquely mapped reads |
+
+Kraken specific output
+
 | *.krkn.raw | kraken | tsv | Annotation per sequence |
 | *.krkn.full | kraken-report | tsv | List of all annotation possible with counts filled in for this specific sample|
-| *.krkn.json | krakenreport2json| json | JSON representation of the taxonomy report, for postprocessing |
+| *.krkn.json | krakenReportToJson | json | JSON representation of the taxonomy report, for postprocessing |
+
+QIIME specific output
+
+| *.otu_table.biom | qiime | biom | Biom file containing counts for OTUs identified in the input |
+| *.otu_map.txt | qiime | tsv | Tab-separated file containing information about which samples a taxon has been identified in |
 
-In a seperate `report` folder, one can find the html report displaying the summary and providing a navigation view on the taxonomy graph and (its) result.
 
 ## Getting Help
 
 For questions about this pipeline and suggestions, we have a GitHub page where you can submit your ideas and thoughts .[GitHub](https://github.com/biopet/biopet).
diff --git a/docs/pipelines/gentrap.md b/docs/pipelines/gentrap.md
index cd29f9b95cf20ad378d3e2c04fb29618647229e2..599da298b2816a2424a3d459f6b6412df4c59196 100644
--- a/docs/pipelines/gentrap.md
+++ b/docs/pipelines/gentrap.md
@@ -28,6 +28,13 @@ Please refer [to our mapping pipeline](mapping.md) for information about how the
 
 As with other biopet pipelines, Gentrap relies on a JSON configuration file to run its analyses. There are two important parts here, the configuration for the samples (to determine the sample layout of your experiment) and the configuration for the pipeline settings (to determine which analyses are run).
 
 To get help creating the appropriate [configs](../general/config.md) please refer to the config page in the general section.
+
+[Gears](gears) is run automatically for the data analysed with `Gentrap`. There are two levels on which this can be done and this should be specified in the [config](../general/config) file:
+
+* `mapping_to_gears: unmapped` : Unmapped reads after alignment. (default)
+* `mapping_to_gears: all` : Trimmed and clipped reads from [Flexiprep](flexiprep).
+* `mapping_to_gears: none` : Disable this functionality.
+
 ### Sample Configuration
 
 Samples are single experimental units whose expression you want to measure. They usually consist of a single sequencing library, but in some cases (for example when the experiment demands each sample have a minimum library depth) a single sample may contain multiple sequencing libraries as well. All this is can be configured using the correct JSON nesting, with the following pattern:
diff --git a/docs/pipelines/shiva.md b/docs/pipelines/shiva.md
index 15fa14b33eb2afb8dbd02a31b0ec21c8aaa5fc92..d272884e39aed06783f4d16b9e7c249ebd0279d2 100644
--- a/docs/pipelines/shiva.md
+++ b/docs/pipelines/shiva.md
@@ -52,6 +52,12 @@ biopet pipeline shiva -config MySamples.json -config MySettings.json -run
 
 A dry run can be performed by simply removing the `-run` flag from the command line call.
 
+[Gears](gears) is run automatically for the data analysed with `Shiva`. There are two levels on which this can be done and this should be specified in the [config](../general/config) file:
+
+* `mapping_to_gears: unmapped` : Unmapped reads after alignment. (default)
+* `mapping_to_gears: all` : Trimmed and clipped reads from [Flexiprep](flexiprep).
+* `mapping_to_gears: none` : Disable this functionality.
+
 ### Only variant calling
 
 It is possible to run Shiva while only performing its variant calling steps.
@@ -97,8 +103,10 @@ At this moment the following variant callers can be used
 
 * <a href="https://samtools.github.io/bcftools/bcftools.html">bcftools</a>
 * <a href="https://samtools.github.io/bcftools/bcftools.html">bcftools_singlesample</a>
 * <a href="https://github.com/ekg/freebayes">freebayes</a>
+* <a href="http://varscan.sourceforge.net/">varscan</a>
 * [raw](../tools/MpileupToVcf)
 
+
 ## Config options
 
 To view all possible config options please navigate to our Gitlab wiki page
diff --git a/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Fastqc.scala b/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Fastqc.scala
index 440d19e1e04b86a5e637bfc911047b7f1dc8ede7..7c9e1601c05402899d5146ddbc09395a16dee19d 100644
--- a/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Fastqc.scala
+++ b/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Fastqc.scala
@@ -155,6 +155,8 @@ class Fastqc(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Fastqc(r
    */
   def foundAdapters: Set[AdapterSequence] = {
     if (dataFile.exists) { // On a dry run this file does not yet exist
+      val modules = qcModules
+
       /** Returns a list of adapter and/or contaminant sequences known to FastQC */
       def getFastqcSeqs(file: Option[File]): Set[AdapterSequence] = file match {
         case None => Set.empty[AdapterSequence]
@@ -170,7 +172,7 @@ class Fastqc(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Fastqc(r
       val adapterSet = getFastqcSeqs(adapters)
       val contaminantSet = getFastqcSeqs(contaminants)
 
-      val foundAdapterNames: Seq[String] = qcModules.get("Overrepresented sequences") match {
+      val foundAdapterNames: Seq[String] = modules.get("Overrepresented sequences") match {
         case None => Seq.empty[String]
         case Some(qcModule) =>
           for (
@@ -181,7 +183,7 @@ class Fastqc(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Fastqc(r
 
       // select full sequences from known adapters and contaminants
       // based on overrepresented sequences results
-      val fromKnownList: Set[AdapterSequence] = (adapterSet ++ contaminantSet)
+      val fromKnownList: Set[AdapterSequence] = contaminantSet
         .filter(x => foundAdapterNames.exists(_.startsWith(x.name)))
 
       val fromKnownListRC: Set[AdapterSequence] = if (enableRCtrimming) fromKnownList.map {
@@ -191,7 +193,7 @@ class Fastqc(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Fastqc(r
 
       // list all sequences found by FastQC
       val fastQCFoundSequences: Seq[AdapterSequence] = if (sensitiveAdapterSearch) {
-        qcModules.get("Overrepresented sequences") match {
+        modules.get("Overrepresented sequences") match {
           case None => Seq.empty
           case Some(qcModule) =>
             for (
@@ -199,17 +201,16 @@ class Fastqc(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Fastqc(r
               values = line.split("\t")
               if values.size >= 4
             ) yield AdapterSequence(values(3), values(0))
         }
-      } else {
-        Seq.empty
-      }
+      } else Seq()
 
-      // we only want to keep adapter sequences which are known by FastQC
-      // sequences such as "Adapter01 (100% over 12bp)" are valid because "Adapter01" is in FastQC
-      fastQCFoundSequences.filter(x => {
-        (adapterSet ++ contaminantSet).count(y => x.name.startsWith(y.name)) == 1
-      })
+      val foundAdapters = modules.get("Adapter Content").map { x =>
+        val header = x.lines.head.split("\t").tail.zipWithIndex
+        val lines = x.lines.tail.map(_.split("\t").tail)
+        val found = header.filter(h => lines.exists(x => x(h._2).toFloat > 0)).map(_._1)
+        adapterSet.filter(x => found.contains(x.name))
+      }
 
-      fromKnownList ++ fastQCFoundSequences ++ fromKnownListRC
+      fromKnownList ++ fastQCFoundSequences ++ fromKnownListRC ++ foundAdapters.getOrElse(Seq())
     } else Set()
   }
diff --git a/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/Gentrap.scala b/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/Gentrap.scala
index 8ed0df5c4f1b3ec5d77b7b7d25f076b68c54dfe0..9993e5fef382239b5d2795d717b6c9debfc9ba6d 100644
--- a/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/Gentrap.scala
+++ b/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/Gentrap.scala
@@ -90,6 +90,7 @@ class Gentrap(val root: Configurable) extends QScript
       case otherwise => throw new IllegalStateException(otherwise.toString)
     })) else Map()),
+    "merge_count_files" -> false,
     "merge_strategy" -> "preprocessmergesam",
     "gsnap" -> Map(
       "novelsplicing" -> 1,
diff --git a/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/measures/BiosBaseCounts.scala b/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/measures/BiosBaseCounts.scala
index ece7da1a09f883ca23d7ba06610db6d24c0cd292..bfe68d1b91290f6ed0f81b2a1a56d453d793380c 100644
--- a/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/measures/BiosBaseCounts.scala
+++ b/gentrap/src/main/scala/nl/lumc/sasc/biopet/pipelines/gentrap/measures/BiosBaseCounts.scala
@@ -17,6 +17,7 @@ package nl.lumc.sasc.biopet.pipelines.gentrap.measures
 
 import nl.lumc.sasc.biopet.core.annotations.AnnotationBed
 import nl.lumc.sasc.biopet.extensions.bedtools.BedtoolsCoverage
+import nl.lumc.sasc.biopet.extensions.samtools.{ SamtoolsFlagstat, SamtoolsView }
 import nl.lumc.sasc.biopet.pipelines.gentrap.scripts.{ AggrBaseCount, Hist2count }
 import nl.lumc.sasc.biopet.utils.config.Configurable
 import org.broadinstitute.gatk.queue.QScript
@@ -76,6 +77,15 @@ class BiosBaseCounts(val root: Configurable) extends QScript with Measurement wi
     exonAggr.inputLabel = sampleName
     add(exonAggr)
 
+    val samtoolsView = new SamtoolsView(this)
+    samtoolsView.input = bamFile
+    samtoolsView.b = true
+    samtoolsView.h = true
+    samtoolsView.L = Some(annotationBed)
+    val samtoolsFlagstat = new SamtoolsFlagstat(this)
+    samtoolsFlagstat.output = new File(outputDir, s"$sampleName.exon.flagstat")
+    add(samtoolsView | samtoolsFlagstat)
+
     (geneAggr.output, exonAggr.output)
   }
 }
diff --git a/gentrap/src/test/scala/nl/lumc/sasc/biopet/pipelines/gentrap/GentrapTest.scala b/gentrap/src/test/scala/nl/lumc/sasc/biopet/pipelines/gentrap/GentrapTest.scala
index 2e9015b4e1b70c742f811ce1556dcd45fa97f413..b9ad9aab833d7ba9192ad973bfcfde237d78dd2d 100644
--- a/gentrap/src/test/scala/nl/lumc/sasc/biopet/pipelines/gentrap/GentrapTest.scala
+++ b/gentrap/src/test/scala/nl/lumc/sasc/biopet/pipelines/gentrap/GentrapTest.scala
@@ -138,7 +138,7 @@ abstract class GentrapTestAbstract(val expressionMeasures: List[String]) extends
       "hisat2" -> classOf[Hisat2]
     )
 
-    val alignerClass = classMap.get(aligner.getOrElse("gsnap"))
+    val alignerClass = classMap.get(aligner.getOrElse("star"))
 
     alignerClass.foreach(c => assert(functions.keys.exists(_ == c)))
     classMap.values.filterNot(Some(_) == alignerClass).foreach(x => assert(!functions.keys.exists(_ == x)))
diff --git a/kopisu/src/main/scala/nl/lumc/sasc/biopet/pipelines/kopisu/methods/XhmmMethod.scala b/kopisu/src/main/scala/nl/lumc/sasc/biopet/pipelines/kopisu/methods/XhmmMethod.scala
index 4ef258192b9c4f4ef0fff0f5c0153012fb0561cf..741aeb87f3e7b2e2714675504cd84f633e7bafe3 100644
--- a/kopisu/src/main/scala/nl/lumc/sasc/biopet/pipelines/kopisu/methods/XhmmMethod.scala
+++ b/kopisu/src/main/scala/nl/lumc/sasc/biopet/pipelines/kopisu/methods/XhmmMethod.scala
@@ -64,13 +64,6 @@ class XhmmMethod(val root: Configurable) extends CnvMethod with Reference {
     val firstMatrix = new XhmmMatrix(this)
     firstMatrix.inputMatrix = merged.output
     firstMatrix.outputMatrix = swapExt(xhmmDir, merged.output, ".depths.data", ".filtered_centered.data")
-    firstMatrix.minTargetSize = 10
-    firstMatrix.maxTargetSize = 10000
-    firstMatrix.minMeanTargetRD = 10
-    firstMatrix.maxMeanTargetRD = 500
-    firstMatrix.minMeanSampleRD = 25
-    firstMatrix.maxMeanSampleRD = 200
-    firstMatrix.maxSdSampleRD = 150
     firstMatrix.outputExcludedSamples = Some(swapExt(xhmmDir, merged.output, ".depths.data", ".filtered.samples.txt"))
     firstMatrix.outputExcludedTargets = Some(swapExt(xhmmDir, merged.output, ".depths.data", ".filtered.targets.txt"))
     add(firstMatrix)
diff --git a/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/MultisampleMapping.scala b/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/MultisampleMapping.scala
index c278fe6b4670c184b5fbf596cefa97fb1c25d9fc..fcb539cfd04172d92b4da052818139eb942d2897 100644
--- a/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/MultisampleMapping.scala
+++ b/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/MultisampleMapping.scala
@@ -90,6 +90,8 @@ trait MultisampleMappingTrait extends MultiSampleQScript
   def makeSample(id: String) = new Sample(id)
   class Sample(sampleId: String) extends AbstractSample(sampleId) { sample =>
 
+    def metricsPreprocessBam = true
+
     def makeLibrary(id: String) = new Library(id)
     class Library(libId: String) extends AbstractLibrary(libId) { lib =>
 
@@ -263,7 +265,7 @@ trait MultisampleMappingTrait extends MultiSampleQScript
       if (mergeStrategy != MergeStrategy.None && libraries.flatMap(_._2.bamFile).nonEmpty) {
         val bamMetrics = new BamMetrics(qscript)
         bamMetrics.sampleId = Some(sampleId)
-        bamMetrics.inputBam = preProcessBam.get
+        bamMetrics.inputBam = if (metricsPreprocessBam) preProcessBam.get else bamFile.get
         bamMetrics.outputDir = new File(sampleDir, "metrics")
         add(bamMetrics)
diff --git a/pom.xml b/pom.xml
index 7e017a272714dbf63e73077541479db89dc31676..2dc16c00f9a906a54e3a402b6d3c1ac7e32181cf 100644
--- a/pom.xml
+++ b/pom.xml
@@ -127,6 +127,22 @@
                 </executions>
                 <!-- ... (see other usage or goals for details) ... -->
             </plugin>
+            <plugin>
+                <groupId>org.scalatra.scalate</groupId>
+                <artifactId>maven-scalate-plugin_2.10</artifactId>
+                <version>1.7.0</version>
+                <executions>
+                    <execution>
+                        <phase>compile</phase>
+                        <goals>
+                            <goal>precompile</goal>
+                        </goals>
+                        <configuration>
+                            <contextClass>org.fusesource.scalate.DefaultRenderContext</contextClass>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
             <plugin>
                 <groupId>org.apache.maven.plugins</groupId>
                 <artifactId>maven-jar-plugin</artifactId>
@@ -280,4 +296,12 @@
         </plugin>
     </plugins>
     </reporting>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.scalatra.scalate</groupId>
+            <artifactId>scalate-core_2.10</artifactId>
+            <version>1.7.0</version>
+        </dependency>
+    </dependencies>
 </project>
diff --git a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaSvCalling.scala b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaSvCalling.scala
index a5418b3036de74c972e45dd4278556c89b3e47de..26b43d4a04323e04181043863326ee3c81eb0d8a 100644
--- a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaSvCalling.scala
+++ b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaSvCalling.scala
@@ -69,23 +69,28 @@ class ShivaSvCalling(val root: Configurable) extends QScript with SummaryQScript
 
     // merge VCF by sample
     for ((sample, bamFile) <- inputBams) {
-      var sampleVCFS: List[Option[File]] = List.empty
-      callers.foreach { caller =>
-        sampleVCFS ::= caller.outputVCF(sample)
+      if (callers.size > 1) {
+        var sampleVCFS: List[Option[File]] = List.empty
+        callers.foreach { caller =>
+          sampleVCFS ::= caller.outputVCF(sample)
+        }
+        val mergeSVcalls = new Pysvtools(this)
+        mergeSVcalls.input = sampleVCFS.flatten
+        mergeSVcalls.output = new File(outputDir, sample + ".merged.vcf")
+        add(mergeSVcalls)
+        outputMergedVCFbySample += (sample -> mergeSVcalls.output)
+      } else {
+        outputMergedVCFbySample += (sample -> callers.head.outputVCF(sample).get)
       }
-      val mergeSVcalls = new Pysvtools(this)
-      mergeSVcalls.input = sampleVCFS.flatten
-      mergeSVcalls.output = new File(outputDir, sample + ".merged.vcf")
-      add(mergeSVcalls)
-      outputMergedVCFbySample += (sample -> mergeSVcalls.output)
     }
 
-    // merge all files from all samples in project
-    val mergeSVcallsProject = new Pysvtools(this)
-    mergeSVcallsProject.input = outputMergedVCFbySample.values.toList
-    mergeSVcallsProject.output = outputMergedVCF
-    add(mergeSVcallsProject)
-
+    if (inputBams.size > 1) {
+      // merge all files from all samples in project
+      val mergeSVcallsProject = new Pysvtools(this)
+      mergeSVcallsProject.input = outputMergedVCFbySample.values.toList
+      mergeSVcallsProject.output = outputMergedVCF
+      add(mergeSVcallsProject)
+    }
     // merging the VCF calls by project
     // basicly this will do all samples from this pipeline run
     // group by "tags"
@@ -104,7 +109,7 @@ class ShivaSvCalling(val root: Configurable) extends QScript with SummaryQScript
   def summarySettings = Map("sv_callers" -> configCallers.toList)
 
   /** Files for the summary */
-  def summaryFiles: Map[String, File] = Map("final_mergedvcf" -> outputMergedVCF)
+  def summaryFiles: Map[String, File] = Map("final_mergedvcf" -> (if (inputBams.size > 1) outputMergedVCF else outputMergedVCFbySample.values.head))
 }
 
 object ShivaSvCalling extends PipelineCommand
\ No newline at end of file
diff --git a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/svcallers/Delly.scala b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/svcallers/Delly.scala
index 0eccb89d19bb746f826c8d73e99da7da2bbd6859..fd5b06cd4e62719255a5c6a761807d08525cd7a1 100644
--- a/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/svcallers/Delly.scala
+++ b/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/svcallers/Delly.scala
@@ -36,6 +36,7 @@ class Delly(val root: Configurable) extends SvCaller {
     val catVariants = new CatVariants(this)
     catVariants.outputFile = new File(dellyDir, sample + ".delly.vcf")
     catVariants.isIntermediate = true
+    catVariants.writeHeaderToEmptyOutput = true
 
     if (del) {
      val delly = new DellyCaller(this)