Commit 0c7bc8eb authored by Peter van 't Hof, committed by GitHub

Merge branch 'develop' into biopet-609

parents cde650d3 3019cf79
@@ -20,7 +20,7 @@ import nl.lumc.sasc.biopet.core.ToolCommandFunction
import nl.lumc.sasc.biopet.utils.summary.Summary
import nl.lumc.sasc.biopet.utils.{ IoUtils, Logging, ToolCommand }
import org.broadinstitute.gatk.utils.commandline.Input
-import org.fusesource.scalate.{ TemplateEngine, TemplateSource }
+import org.fusesource.scalate.TemplateEngine
import scala.collection.mutable
import scala.language.postfixOps
@@ -236,9 +236,7 @@ object ReportBuilder {
/** Single template render engine, this will have a cache for all compiled templates */
protected val engine = new TemplateEngine()
-/** Cache of temp file for templates from the classpath / jar */
-private[report] var templateCache: Map[String, File] = Map()
+engine.allowReload = false
/** This will give the total number of pages including all nested pages */
def countPages(page: ReportPage): Int = {
@@ -254,15 +252,6 @@
def renderTemplate(location: String, args: Map[String, Any] = Map()): String = {
Logging.logger.info("Rendering: " + location)
-val templateFile: File = templateCache.get(location) match {
-  case Some(template) => template
-  case _ =>
-    val tempFile = File.createTempFile("ssp-template", new File(location).getName)
-    tempFile.deleteOnExit()
-    IoUtils.copyStreamToFile(getClass.getResourceAsStream(location), tempFile)
-    templateCache += location -> tempFile
-    tempFile
-}
-engine.layout(TemplateSource.fromFile(templateFile), args)
+engine.layout(location, args)
}
}
\ No newline at end of file
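The core of this change: Scalate's `TemplateEngine.layout` accepts a classpath URI directly, so the temp-file copy and the manual `templateCache` are unnecessary. A minimal sketch of the new rendering path (assuming `/template.ssp` is on the classpath):

~~~scala
import org.fusesource.scalate.TemplateEngine

object RenderSketch extends App {
  val engine = new TemplateEngine()
  // Classpath templates never change at runtime, so disable reload checks
  // and let the engine cache the compiled template between calls.
  engine.allowReload = false
  // Scalate resolves "/template.ssp" against the classpath itself.
  println(engine.layout("/template.ssp", Map("arg" -> "test")))
}
~~~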
@@ -79,11 +79,8 @@ class ReportBuilderTest extends TestNGSuite with Matchers {
@Test
def testRenderTemplate: Unit = {
-ReportBuilder.templateCache = Map()
-ReportBuilder.templateCache shouldBe empty
ReportBuilder.renderTemplate("/template.ssp", Map("arg" -> "test")) shouldBe "test"
-ReportBuilder.templateCache.size shouldBe 1
ReportBuilder.renderTemplate("/template.ssp", Map("arg" -> "bla")) shouldBe "bla"
-ReportBuilder.templateCache.size shouldBe 1
}
}
@@ -16,10 +16,12 @@ package nl.lumc.sasc.biopet.extensions
import java.io.File
-import nl.lumc.sasc.biopet.core.{ Version, BiopetCommandLineFunction }
+import nl.lumc.sasc.biopet.core.{ BiopetCommandLineFunction, Version }
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
+import scala.util.matching.Regex
/**
* Wrapper for the cuffquant command line tool.
* Written based on cuffquant version v2.2.1 (md5: 0765b82b11db9256f5be341a7da884d6)
@@ -116,11 +118,11 @@ class Cuffquant(val root: Configurable) extends BiopetCommandLineFunction with Version
/** Disable SCV correction */
var noScvCorrection: Boolean = config("no_scv_correction", default = false)
-def versionRegex = """cuffquant v(.*)""".r
-def versionCommand = executable
+def versionRegex: Regex = """cuffquant v(.*)""".r
+def versionCommand: String = executable
override def versionExitcode = List(0, 1)
-def cmdLine =
+def cmdLine: String =
required(executable) +
required("--output-dir", outputDir) +
optional("--mask-file", maskFile) +
......
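For context, `versionCommand` and `versionRegex` come from biopet's `Version` trait: the command is run and its output is matched against the regex. A stand-alone sketch of that match (the sample output line is hypothetical):

~~~scala
import scala.util.matching.Regex

object VersionSketch extends App {
  val versionRegex: Regex = """cuffquant v(.*)""".r
  // In the pipeline this line would come from running the executable.
  val outputLine = "cuffquant v2.2.1"
  val version = outputLine match {
    case versionRegex(v) => Some(v)
    case _               => None
  }
  println(version) // Some(2.2.1)
}
~~~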
@@ -154,7 +154,10 @@ object VcfWithVcf extends ToolCommand {
* @param header: header of secondary reader
* @return Map of fields and their values in secondary records
*/
-def createFieldMap(fields: List[Fields], record: VariantContext, secondaryRecords: List[VariantContext], header: VCFHeader): Map[String, List[Any]] = {
+def createFieldMap(fields: List[Fields],
+                   record: VariantContext,
+                   secondaryRecords: List[VariantContext],
+                   header: VCFHeader): Map[String, List[Any]] = {
val fieldMap = (for (
f <- fields if secondaryRecords.exists(_.hasAttribute(f.inputField))
) yield {
@@ -236,8 +239,8 @@ object VcfWithVcf extends ToolCommand {
* @return
*/
def numberA(referenceRecord: VariantContext, annotateRecord: VariantContext, field: String): List[Any] = {
-val refValues = referenceRecord.getAttributeAsList(field).toArray
-annotateRecord.
+val refValues = annotateRecord.getAttributeAsList(field).toArray
+referenceRecord.
getAlternateAlleles.filter(referenceRecord.hasAlternateAllele).
map(x => referenceRecord.getAlternateAlleles.indexOf(x)).
flatMap(x => refValues.lift(x)).
@@ -252,8 +255,8 @@ object VcfWithVcf extends ToolCommand {
* @return
*/
def numberR(referenceRecord: VariantContext, annotateRecord: VariantContext, field: String): List[Any] = {
-val refValues = referenceRecord.getAttributeAsList(field).toArray
-annotateRecord.
+val refValues = annotateRecord.getAttributeAsList(field).toArray
+referenceRecord.
getAlleles.
filter(referenceRecord.hasAllele).
map(x => referenceRecord.getAlleles.indexOf(x)).
......
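The substance of both fixes is the swap: Number=A and Number=R values must be read from `annotateRecord`, while allele order and membership come from `referenceRecord`. A toy model of the index-and-lift step with plain lists (hypothetical values; the real code operates on htsjdk `VariantContext`):

~~~scala
object NumberASketch extends App {
  // Number=A values from the annotation record, one per ALT allele.
  val annotationValues: List[Double] = List(0.1, 0.2, 0.3)
  // Indices of the reference record's ALT alleles.
  val referenceAltIndices: List[Int] = List(0, 2)
  // `lift` yields None for an out-of-range index instead of throwing,
  // so a value missing from the annotation record simply drops out.
  val picked = referenceAltIndices.flatMap(annotationValues.lift)
  println(picked) // List(0.1, 0.3)
}
~~~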
@@ -39,7 +39,8 @@ object BamStats extends ToolCommand {
bamFile: File = null,
referenceFasta: Option[File] = None,
binSize: Int = 10000,
-threadBinSize: Int = 10000000) extends AbstractArgs
+threadBinSize: Int = 10000000,
+tsvOutputs: Boolean = false) extends AbstractArgs
class OptParser extends AbstractOptParser {
opt[File]('R', "reference") valueName "<file>" action { (x, c) =>
@@ -57,6 +58,9 @@ object BamStats extends ToolCommand {
opt[Int]("threadBinSize") valueName "<int>" action { (x, c) =>
c.copy(threadBinSize = x)
} text "Size of region per thread"
opt[Unit]("tsvOutputs") action { (x, c) =>
c.copy(tsvOutputs = true)
} text "Also output tsv files, default there is only a json"
}
/** Main entry point for [[BamStats]]; it does the argument parsing. */
@@ -68,7 +72,7 @@ object BamStats extends ToolCommand {
val sequenceDict = validateReferenceInBam(cmdArgs.bamFile, cmdArgs.referenceFasta)
-init(cmdArgs.outputDir, cmdArgs.bamFile, sequenceDict, cmdArgs.binSize, cmdArgs.threadBinSize)
+init(cmdArgs.outputDir, cmdArgs.bamFile, sequenceDict, cmdArgs.binSize, cmdArgs.threadBinSize, cmdArgs.tsvOutputs)
logger.info("Done")
}
@@ -96,13 +100,26 @@ object BamStats extends ToolCommand {
* @param binSize stats binsize
* @param threadBinSize Thread binsize
*/
-def init(outputDir: File, bamFile: File, referenceDict: SAMSequenceDictionary, binSize: Int, threadBinSize: Int): Unit = {
+def init(outputDir: File, bamFile: File, referenceDict: SAMSequenceDictionary, binSize: Int, threadBinSize: Int, tsvOutput: Boolean): Unit = {
val contigsFutures = BedRecordList.fromDict(referenceDict).allRecords.map { contig =>
contig.chr -> processContig(contig, bamFile, binSize, threadBinSize, outputDir)
}.toList
val stats = waitOnFutures(processUnmappedReads(bamFile) :: contigsFutures.map(_._2))
+if (tsvOutput) {
+  stats.flagstat.writeAsTsv(new File(outputDir, "flagstats.tsv"))
+  stats.insertSizeHistogram.writeFilesAndPlot(outputDir, "insertsize", "Insert size", "Reads", "Insert size distribution")
+  stats.mappingQualityHistogram.writeFilesAndPlot(outputDir, "mappingQuality", "Mapping Quality", "Reads", "Mapping Quality distribution")
+  stats.clippingHistogram.writeFilesAndPlot(outputDir, "clipping", "Clipped bases", "Reads", "Clipping distribution")
+  stats.leftClippingHistogram.writeFilesAndPlot(outputDir, "left_clipping", "Clipped bases", "Reads", "Left Clipping distribution")
+  stats.rightClippingHistogram.writeFilesAndPlot(outputDir, "right_clipping", "Clipped bases", "Reads", "Right Clipping distribution")
+  stats._3_ClippingHistogram.writeFilesAndPlot(outputDir, "3prime_clipping", "Clipped bases", "Reads", "3 Prime Clipping distribution")
+  stats._5_ClippingHistogram.writeFilesAndPlot(outputDir, "5prime_clipping", "Clipped bases", "Reads", "5 Prime Clipping distribution")
+}
val statsWriter = new PrintWriter(new File(outputDir, "bamstats.json"))
val totalStats = stats.toSummaryMap
val statsMap = Map(
......
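The new `--tsvOutputs` switch uses scopt's `opt[Unit]` pattern, which biopet's `AbstractOptParser` builds on. A minimal self-contained sketch (this `Args` case class is a stand-in for the real one):

~~~scala
import scopt.OptionParser

case class Args(tsvOutputs: Boolean = false)

object ParserSketch extends App {
  val parser = new OptionParser[Args]("BamStats") {
    opt[Unit]("tsvOutputs") action { (_, c) =>
      c.copy(tsvOutputs = true)
    } text "Also output tsv files; by default only a json file is written"
  }
  println(parser.parse(Array("--tsvOutputs"), Args())) // Some(Args(true))
}
~~~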
@@ -14,8 +14,10 @@
*/
package nl.lumc.sasc.biopet.tools.bamstats
-import java.io.{ File, PrintWriter }
-import nl.lumc.sasc.biopet.utils.sortAnyAny
+import java.io.{ File, IOException, PrintWriter }
+import nl.lumc.sasc.biopet.utils.rscript.LinePlot
+import nl.lumc.sasc.biopet.utils.{ Logging, sortAnyAny }
import scala.collection.mutable
@@ -43,7 +45,7 @@ class Counts[T](_counts: Map[T, Long] = Map[T, Long]())(implicit ord: Ordering[T])
}
/** Write histogram to a tsv/count file */
-def writeToTsv(file: File): Unit = {
+def writeHistogramToTsv(file: File): Unit = {
val writer = new PrintWriter(file)
writer.println("value\tcount")
counts.keys.toList.sorted.foreach(x => writer.println(s"$x\t${counts(x)}"))
@@ -82,4 +84,28 @@ class Histogram[T](_counts: Map[T, Long] = Map[T, Long]())(implicit ord: Numeric[T])
} else Map()
}
+/** Write aggregate stats to a tsv file */
+def writeAggregateToTsv(file: File): Unit = {
+  val writer = new PrintWriter(file)
+  aggregateStats.foreach(x => writer.println(x._1 + "\t" + x._2))
+  writer.close()
+}
+def writeFilesAndPlot(outputDir: File, prefix: String, xlabel: String, ylabel: String, title: String): Unit = {
+  writeHistogramToTsv(new File(outputDir, prefix + ".histogram.tsv"))
+  writeAggregateToTsv(new File(outputDir, prefix + ".stats.tsv"))
+  val plot = new LinePlot(null)
+  plot.input = new File(outputDir, prefix + ".histogram.tsv")
+  plot.output = new File(outputDir, prefix + ".histogram.png")
+  plot.xlabel = Some(xlabel)
+  plot.ylabel = Some(ylabel)
+  plot.title = Some(title)
+  try {
+    plot.runLocal()
+  } catch {
+    // If plotting fails the tool should not fail; plotting depends on R being installed
+    case e: IOException => Logging.logger.warn(s"Error found while plotting ${plot.output}: ${e.getMessage}")
+  }
+}
}
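A hypothetical use of the new helper: one call writes both tsv files and, when R is available, the png plot; a plotting failure only logs a warning:

~~~scala
import java.io.File
import nl.lumc.sasc.biopet.tools.bamstats.Histogram

object HistogramSketch extends App {
  val hist = new Histogram[Int](Map(1 -> 2L, 2 -> 5L, 3 -> 1L))
  val outputDir = new File("bamstats-out")
  outputDir.mkdirs()
  // Writes insert_size.histogram.tsv, insert_size.stats.tsv and,
  // if R is installed, insert_size.histogram.png.
  hist.writeFilesAndPlot(outputDir, "insert_size", "Insert size", "Reads", "Insert size distribution")
}
~~~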
@@ -50,13 +50,13 @@ case class Stats(flagstat: FlagstatCollector = new FlagstatCollector(),
def writeStatsToFiles(outputDir: File): Unit = {
this.flagstat.writeReportToFile(new File(outputDir, "flagstats"))
this.flagstat.writeSummaryTofile(new File(outputDir, "flagstats.summary.json"))
-this.mappingQualityHistogram.writeToTsv(new File(outputDir, "mapping_quality.tsv"))
-this.insertSizeHistogram.writeToTsv(new File(outputDir, "insert_size.tsv"))
-this.clippingHistogram.writeToTsv(new File(outputDir, "clipping.tsv"))
-this.leftClippingHistogram.writeToTsv(new File(outputDir, "left_clipping.tsv"))
-this.rightClippingHistogram.writeToTsv(new File(outputDir, "right_clipping.tsv"))
-this._5_ClippingHistogram.writeToTsv(new File(outputDir, "5_prime_clipping.tsv"))
-this._3_ClippingHistogram.writeToTsv(new File(outputDir, "3_prime_clipping.tsv"))
+this.mappingQualityHistogram.writeHistogramToTsv(new File(outputDir, "mapping_quality.tsv"))
+this.insertSizeHistogram.writeHistogramToTsv(new File(outputDir, "insert_size.tsv"))
+this.clippingHistogram.writeHistogramToTsv(new File(outputDir, "clipping.tsv"))
+this.leftClippingHistogram.writeHistogramToTsv(new File(outputDir, "left_clipping.tsv"))
+this.rightClippingHistogram.writeHistogramToTsv(new File(outputDir, "right_clipping.tsv"))
+this._5_ClippingHistogram.writeHistogramToTsv(new File(outputDir, "5_prime_clipping.tsv"))
+this._3_ClippingHistogram.writeHistogramToTsv(new File(outputDir, "3_prime_clipping.tsv"))
}
def toSummaryMap = {
......
@@ -32,6 +32,12 @@ class FlagstatCollector {
protected[FlagstatCollector] var totalCounts: Array[Long] = Array()
protected[FlagstatCollector] var crossCounts = Array.ofDim[Long](1, 1)
+def writeAsTsv(file: File): Unit = {
+  val writer = new PrintWriter(file)
+  names.foreach(x => writer.println(x._2 + "\t" + totalCounts(x._1)))
+  writer.close()
+}
def loadDefaultFunctions() {
addFunction("All", record => true)
addFunction("Mapped", record => !record.getReadUnmappedFlag)
......
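`writeAsTsv` joins every registered function name to its total count, one row per function. A sketch of that pairing with illustrative data (not real flagstat output):

~~~scala
object FlagstatTsvSketch extends App {
  val names = Map(0 -> "All", 1 -> "Mapped") // index -> function name
  val totalCounts = Array(1000L, 987L)       // count per index
  // The same name-to-count join as writeAsTsv, printed instead of written.
  names.foreach { case (i, name) => println(s"$name\t${totalCounts(i)}") }
}
~~~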
@@ -34,6 +34,43 @@ class BamStatsTest extends TestNGSuite with Matchers {
new File(outputDir, "bamstats.json") should exist
new File(outputDir, "bamstats.summary.json") should exist
new File(outputDir, "flagstats.tsv") shouldNot exist
new File(outputDir, "insertsize.stats.tsv") shouldNot exist
new File(outputDir, "insertsize.histogram.tsv") shouldNot exist
new File(outputDir, "mappingQuality.stats.tsv") shouldNot exist
new File(outputDir, "mappingQuality.histogram.tsv") shouldNot exist
new File(outputDir, "clipping.stats.tsv") shouldNot exist
new File(outputDir, "clipping.histogram.tsv") shouldNot exist
new File(outputDir, "flagstats") shouldNot exist
new File(outputDir, "flagstats.summary.json") shouldNot exist
new File(outputDir, "mapping_quality.tsv") shouldNot exist
new File(outputDir, "insert_size.tsv") shouldNot exist
new File(outputDir, "clipping.tsv") shouldNot exist
new File(outputDir, "left_clipping.tsv") shouldNot exist
new File(outputDir, "right_clipping.tsv") shouldNot exist
new File(outputDir, "5_prime_clipping.tsv") shouldNot exist
new File(outputDir, "3_prime_clipping.tsv") shouldNot exist
}
+@Test
+def testTsvOutputs: Unit = {
+  val outputDir = Files.createTempDir()
+  outputDir.deleteOnExit()
+  BamStats.main(Array("-b", BamStatsTest.pairedBam01.getAbsolutePath, "-o", outputDir.getAbsolutePath, "--tsvOutputs"))
+  new File(outputDir, "bamstats.json") should exist
+  new File(outputDir, "bamstats.summary.json") should exist
+  new File(outputDir, "flagstats.tsv") should exist
+  new File(outputDir, "insertsize.stats.tsv") should exist
+  new File(outputDir, "insertsize.histogram.tsv") should exist
+  new File(outputDir, "mappingQuality.stats.tsv") should exist
+  new File(outputDir, "mappingQuality.histogram.tsv") should exist
+  new File(outputDir, "clipping.stats.tsv") should exist
+  new File(outputDir, "clipping.histogram.tsv") should exist
+  new File(outputDir, "flagstats") shouldNot exist
+  new File(outputDir, "flagstats.summary.json") shouldNot exist
+  new File(outputDir, "mapping_quality.tsv") shouldNot exist
......
@@ -86,7 +86,7 @@ class CountsTest extends TestNGSuite with Matchers {
val tsvFile = File.createTempFile("counts.", ".tsv")
tsvFile.deleteOnExit()
-c1.writeToTsv(tsvFile)
+c1.writeHistogramToTsv(tsvFile)
val reader = Source.fromFile(tsvFile)
reader.getLines().toList shouldBe List("value\tcount", "1\t1", "2\t2", "3\t3")
......
@@ -64,7 +64,7 @@ class HistogramTest extends TestNGSuite with Matchers {
val tsvFile = File.createTempFile("counts.", ".tsv")
tsvFile.deleteOnExit()
-c1.writeToTsv(tsvFile)
+c1.writeHistogramToTsv(tsvFile)
val reader = Source.fromFile(tsvFile)
reader.getLines().toList shouldBe List("value\tcount", "1\t1", "2\t2", "3\t3")
......
# DownloadNcbiAssembly
## Introduction

DownloadNcbiAssembly downloads a genome assembly from NCBI as a FASTA file; contigs can be named and filtered using columns from the NCBI assembly report.
## Example
The help menu:
~~~
Usage: DownloadNcbiAssembly [options]
-l <value> | --log_level <value>
Level of log information printed. Possible levels: 'debug', 'info', 'warn', 'error'
-h | --help
Print usage
-v | --version
Print version
-a <file> | --assembly_report <file>
refseq ID from NCBI
-o <file> | --output <file>
output Fasta file
--report <file>
where to write report from NCBI
--nameHeader <string>
Which column to use from the NCBI report to name the contigs.
All columns from the report can be used but these are the most common fields to choose from:
- 'Sequence-Name': Name of the contig within the assembly
- 'UCSC-style-name': Name of the contig used by UCSC ( like hg19 )
- 'RefSeq-Accn': Unique name of the contig at RefSeq (default for NCBI)
--mustHaveOne:<key>=<column_name=regex>
This can be used to filter based on the NCBI report, multiple conditions can be given, at least 1 should be true
--mustNotHave:<key>=<column_name=regex>
This can be used to filter based on the NCBI report, multiple conditions can be given, all should be false
~~~
To run the tool, use:
~~~
biopet tool DownloadNcbiAssembly
~~~
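An illustrative invocation (file names are placeholders), naming contigs by RefSeq accession:

~~~
biopet tool DownloadNcbiAssembly \
  -a assembly_report.txt \
  -o assembly.fasta \
  --nameHeader 'RefSeq-Accn'
~~~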
## Output
# FastqFilter
## Introduction

FastqFilter filters the reads in a FASTQ file by matching their IDs against a regular expression.
## Example
The help menu:
~~~
Usage: FastqFilter [options]
-l <value> | --log_level <value>
Level of log information printed. Possible levels: 'debug', 'info', 'warn', 'error'
-h | --help
Print usage
-v | --version
Print version
-I <file> | --inputFile <file>
Path to input file
-o <file> | --output <file>
Path to output file
--idRegex <file>
Regex to match ID
~~~
To run the tool, use:
~~~
biopet tool FastqFilter
~~~
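An illustrative invocation (file names and the regex are placeholders) that keeps only reads whose ID matches the regex:

~~~
biopet tool FastqFilter \
  -I input.fastq \
  -o filtered.fastq \
  --idRegex '.*_filtered'
~~~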
## Output
# FastqSync
## Introduction

FastqSync synchronizes paired-end FASTQ files, writing only the read pairs that are present in both input files.
## Example
The help menu:
~~~
FastqSync - Sync paired-end FASTQ files.
This tool works with gzipped or non-gzipped FASTQ files. The output
file will be gzipped when the input is also gzipped.
Usage: FastqSync [options]
-l <value> | --log_level <value>
Level of log information printed. Possible levels: 'debug', 'info', 'warn', 'error'
-h | --help
Print usage
-v | --version
Print version
-r <fastq> | --ref <fastq>
Reference FASTQ file
-i <fastq> | --in1 <fastq>
Input FASTQ file 1
-j <fastq[.gz]> | --in2 <fastq[.gz]>
Input FASTQ file 2
-o <fastq[.gz]> | --out1 <fastq[.gz]>
Output FASTQ file 1
-p <fastq> | --out2 <fastq>
Output FASTQ file 2
~~~
To run the tool, use:
~~~
biopet tool FastqSync
~~~
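An illustrative invocation (file names are placeholders); per the help text above, gzipped inputs give gzipped outputs:

~~~
biopet tool FastqSync \
  -r original_R1.fastq \
  -i trimmed_R1.fastq \
  -j trimmed_R2.fastq.gz \
  -o synced_R1.fastq.gz \
  -p synced_R2.fastq
~~~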
## Output
# FindOverlapMatch
## Introduction

FindOverlapMatch reads a matrix of pairwise overlap values and reports, for each sample, the samples whose value meets a given cutoff.
## Example
The help menu:
~~~
Usage: FindOverlapMatch [options]
-l <value> | --log_level <value>
Level of log information printed. Possible levels: 'debug', 'info', 'warn', 'error'
-h | --help
Print usage
-v | --version
Print version
-i <file> | --input <file>
Input should be a table where the first row and column have the ID's, those can be different
-o <file> | --output <file>
default to stdout
-c <value> | --cutoff <value>
minimum value to report it as pair
--use_same_names
Do not compare samples with the same name
--rowSampleRegex <regex>
Samples in the row should match this regex
--columnSampleRegex <regex>
Samples in the column should match this regex
~~~
To run the tool, use:
~~~
biopet tool FindOverlapMatch
~~~
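An illustrative invocation (file name and cutoff are placeholders) reporting pairs with an overlap value of at least 0.9:

~~~
biopet tool FindOverlapMatch \
  -i overlap_matrix.tsv \
  -o pairs.tsv \
  -c 0.9
~~~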
## Output
# GvcfToBed
## Introduction

GvcfToBed writes the regions of a gVCF file that pass a minimum genome quality to a BED file.
## Example
The help menu:
~~~
Usage: GvcfToBed [options]
-l <value> | --log_level <value>
Level of log information printed. Possible levels: 'debug', 'info', 'warn', 'error'
-h | --help
Print usage
-v | --version
Print version
-I <file> | --inputVcf <file>
Input vcf file
-O <file> | --outputBed <file>
Output bed file
--invertedOutputBed <file>
Output bed file
-S <sample> | --sample <sample>
Sample to consider. Will take first sample on alphabetical order by default
--minGenomeQuality <int>
Minimum genome quality to consider
~~~
To run the tool, use:
~~~
biopet tool GvcfToBed
~~~
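An illustrative invocation (file and sample names are placeholders):

~~~
biopet tool GvcfToBed \
  -I input.g.vcf.gz \
  -O output.bed \
  -S sampleA \
  --minGenomeQuality 20
~~~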
## Output
# MergeTables
## Introduction

MergeTables merges multiple tab-delimited files into a single table, joining rows on feature ID.
## Example
The help menu:
~~~
MergeTables - Tabular file merging based on feature ID equality.
Usage: MergeTables [options] [<input_tables> ...]
-l <value> | --log_level <value>
Level of log information printed. Possible levels: 'debug', 'info', 'warn', 'error'
-h | --help
Print usage
-v | --version
Print version
-i <idx1>,<idx2>, ... | --id_column_index <idx1>,<idx2>, ...
Index of feature ID column from each input file (1-based)
-a <idx> | --value_column_index <idx>
Index of column from each input file containing the value to merge (1-based)
-o <path> | --output <path>
Path to output file (default: '-' <stdout>)
-n <name> | --id_column_name <name>
Name of feature ID column in the output merged file (default: feature)
-N <name> | --column_names <name>
Name of feature ID column in the output merged file (default: feature)
-e <ext> | --strip_extension <ext>
Common extension of all input tables to strip (default: empty string)
-m <value> | --num_header_lines <value>
The number of header lines present in all input files (default: 0; no header)
-f <value> | --fallback <value>
The string to use when a value for a feature is missing in one or more sample(s) (default: '-')
-d <value> | --delimiter <value>
The character used for separating columns in the input files (default: '\t')
<input_tables> ...
Input tables to merge
This tool merges multiple tab-delimited files and outputs a single
tab delimited file whose columns are the feature IDs and a single
column from each input files.
Note that in each input file there must not be any duplicate features.
If there are, the tool will only keep one and discard the rest.
~~~
To run the tool, use:
~~~
biopet tool MergeTables
~~~
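An illustrative invocation (file names are placeholders) merging two count tables on feature ID (column 1), taking values from column 2, with one header line per file:

~~~
biopet tool MergeTables \
  -i 1 -a 2 -m 1 \
  -o merged.tsv \
  sampleA.counts.tsv sampleB.counts.tsv
~~~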