Commit b7ae99c2 authored by Wai Yi Leung's avatar Wai Yi Leung

Merge branch 'tooltests' into 'develop'

Tooltests

Fixes #188, #196 and #200

See merge request !224
parents 00ca8854 9c9edf13
......@@ -58,7 +58,7 @@ To get the above example out of the tool one should provide 2 TSV files as follo
----
| samples | library | bam |
| sample | library | bam |
| ------- | ------- | --------- |
|Sample_ID_1 |Lib_ID_1 |MyFirst.bam |
|Sample_ID_2 |Lib_ID_2 |MySecond.bam |
......
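Editor's note: a minimal sketch (not part of this commit), assuming the table above documents the SamplesTsvToJson input format that is reworked later in this diff. It shows the nested samples -> libraries map that one row of such a TSV becomes, reusing ConfigUtils.mapToJson exactly as the tool does further down; the object name is invented for illustration.

import nl.lumc.sasc.biopet.utils.ConfigUtils.mapToJson

object TsvRowToJsonSketch {
  def main(args: Array[String]): Unit = {
    // First data row of the example table: sample, library and the remaining bam column
    val row = Map("samples" -> Map(
      "Sample_ID_1" -> Map("libraries" -> Map(
        "Lib_ID_1" -> Map("bam" -> "MyFirst.bam")))))
    println(mapToJson(row).spaces2)
  }
}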
......@@ -17,7 +17,7 @@ package nl.lumc.sasc.biopet.tools
import java.io.{ File, PrintWriter }
import htsjdk.samtools.SamReaderFactory
import htsjdk.samtools.{ SAMSequenceRecord, SamReaderFactory }
import htsjdk.samtools.reference.IndexedFastaSequenceFile
import htsjdk.variant.variantcontext.VariantContext
import htsjdk.variant.vcf.VCFFileReader
......@@ -28,6 +28,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import scala.collection.JavaConversions._
import scala.collection.mutable.ListBuffer
import scala.collection.parallel.ParMap
class BastyGenerateFasta(val root: Configurable) extends ToolCommandFuntion with Reference {
javaMainClass = getClass.getName
......@@ -155,7 +156,7 @@ object BastyGenerateFasta extends ToolCommand {
}
}
protected var cmdArgs: Args = _
protected implicit var cmdArgs: Args = _
private val chunkSize = 100000
/**
......@@ -165,11 +166,18 @@ object BastyGenerateFasta extends ToolCommand {
val argsParser = new OptParser
cmdArgs = argsParser.parse(args, Args()) getOrElse sys.exit(1)
if (cmdArgs.outputVariants != null) writeVariantsOnly()
if (cmdArgs.outputConsensus != null || cmdArgs.outputConsensusVariants != null) writeConsensus()
if (cmdArgs.outputVariants != null) {
writeVariantsOnly()
}
if (cmdArgs.outputConsensus != null || cmdArgs.outputConsensusVariants != null) {
writeConsensus()
}
//FIXME: what to do if outputConsensus is set, but not outputConsensusVariants (and vice versa)?
}
protected def writeConsensus() {
//FIXME: preferably split this up in functions, so that they can be unit tested
val referenceFile = new IndexedFastaSequenceFile(cmdArgs.reference)
val referenceDict = referenceFile.getSequenceDictionary
......@@ -253,7 +261,7 @@ object BastyGenerateFasta extends ToolCommand {
}
}
protected def writeVariantsOnly() {
protected[tools] def writeVariantsOnly() {
val writer = new PrintWriter(cmdArgs.outputVariants)
writer.println(">" + cmdArgs.outputName)
val vcfReader = new VCFFileReader(cmdArgs.inputVcf, false)
......@@ -265,17 +273,34 @@ object BastyGenerateFasta extends ToolCommand {
vcfReader.close()
}
protected def getMaxAllele(vcfRecord: VariantContext): String = {
// TODO: what does this do?
// Seems to me it finds the allele in a sample with the highest AD value
// if this allele is shorter than the largest allele, it will append '-' to the string
protected[tools] def getMaxAllele(vcfRecord: VariantContext)(implicit cmdArgs: Args): String = {
val maxSize = getLongestAllele(vcfRecord).getBases.length
if (cmdArgs.sampleName == null) return fillAllele(vcfRecord.getReference.getBaseString, maxSize)
if (cmdArgs.sampleName == null) {
return fillAllele(vcfRecord.getReference.getBaseString, maxSize)
}
val genotype = vcfRecord.getGenotype(cmdArgs.sampleName)
if (genotype == null) return fillAllele("", maxSize)
if (genotype == null) {
return fillAllele("", maxSize)
}
val AD = if (genotype.hasAD) genotype.getAD else Array.fill(vcfRecord.getAlleles.size())(cmdArgs.minAD)
if (AD == null) return fillAllele("", maxSize)
if (AD == null) {
return fillAllele("", maxSize)
}
val maxADid = AD.zipWithIndex.maxBy(_._1)._2
if (AD(maxADid) < cmdArgs.minAD) return fillAllele("", maxSize)
if (AD(maxADid) < cmdArgs.minAD) {
return fillAllele("", maxSize)
}
fillAllele(vcfRecord.getAlleles()(maxADid).getBaseString, maxSize)
}
}
\ No newline at end of file
}
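Editor's note: the in-code comment above guesses that getMaxAllele picks the allele with the highest AD value and pads shorter alleles with '-'. The sketch below (not from this commit) illustrates only that padding idea; fillAllele itself is not shown in this merge request, so the helper and object names here are assumptions, not the actual implementation.

object MaxAllelePaddingSketch {
  // Assumed to mirror what fillAllele does: pad with '-' up to the longest allele length
  def fillAllele(bases: String, maxSize: Int, fillWith: Char = '-'): String =
    bases + fillWith.toString * (maxSize - bases.length)

  def main(args: Array[String]): Unit = {
    // REF = "A", ALT = "ATT": the longest allele has three bases,
    // so the reference allele is padded to keep all sample strings the same length
    println(fillAllele("A", 3))   // A--
    println(fillAllele("ATT", 3)) // ATT
  }
}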
......@@ -61,12 +61,18 @@ object BiopetFlagstat extends ToolCommand {
flagstat
}
case class Args(inputFile: File = null, summaryFile: Option[File] = None, region: Option[String] = None) extends AbstractArgs
case class Args(inputFile: File = null,
outputFile: Option[File] = None,
summaryFile: Option[File] = None,
region: Option[String] = None) extends AbstractArgs
class OptParser extends AbstractOptParser {
opt[File]('I', "inputFile") required () valueName "<file>" action { (x, c) =>
c.copy(inputFile = x)
} text "input bam file"
opt[File]('o', "outputFile") valueName "<file>" action { (x, c) =>
c.copy(outputFile = Some(x))
} text "output file"
opt[File]('s', "summaryFile") valueName "<file>" action { (x, c) =>
c.copy(summaryFile = Some(x))
} text "summary output file"
......@@ -151,7 +157,14 @@ object BiopetFlagstat extends ToolCommand {
writer.close()
}
println(flagstatCollector.report)
commandArgs.outputFile match {
case Some(file) => {
val writer = new PrintWriter(file)
writer.println(flagstatCollector.report)
writer.close()
}
case _ => println(flagstatCollector.report)
}
}
class FlagstatCollector {
......
......@@ -116,7 +116,7 @@ object CheckAllelesVcfInBam extends ToolCommand {
}
val counts = for (samRecord <- bamIter if !filterRead(samRecord)) {
checkAlles(samRecord, vcfRecord) match {
checkAlleles(samRecord, vcfRecord) match {
case Some(a) => if (countReports(sample).aCounts.contains(a)) countReports(sample).aCounts(a) += 1
else countReports(sample).aCounts += (a -> 1)
case _ => countReports(sample).notFound += 1
......@@ -142,7 +142,7 @@ object CheckAllelesVcfInBam extends ToolCommand {
writer.close()
}
def checkAlles(samRecord: SAMRecord, vcfRecord: VariantContext): Option[String] = {
def checkAlleles(samRecord: SAMRecord, vcfRecord: VariantContext): Option[String] = {
val readStartPos = List.range(0, samRecord.getReadBases.length)
.find(x => samRecord.getReferencePositionAtReadPosition(x + 1) == vcfRecord.getStart) getOrElse { return None }
val readBases = samRecord.getReadBases
......
......@@ -55,10 +55,10 @@ object FastqSplitter extends ToolCommand {
class OptParser extends AbstractOptParser {
opt[File]('I', "inputFile") required () valueName "<file>" action { (x, c) =>
c.copy(inputFile = x)
} text "out is a required file property"
} text "Path to input file"
opt[File]('o', "output") required () unbounded () valueName "<file>" action { (x, c) =>
c.copy(outputFile = x :: c.outputFile)
} text "out is a required file property"
} text "Path to output file"
}
/**
......
......@@ -15,7 +15,7 @@
*/
package nl.lumc.sasc.biopet.tools
import java.io.File
import java.io.{ PrintWriter, File }
import htsjdk.samtools.{ QueryInterval, SAMRecord, SamReaderFactory, ValidationStringency }
import nl.lumc.sasc.biopet.core.ToolCommand
......@@ -24,15 +24,20 @@ import scala.collection.JavaConversions._
import scala.io.Source
object FindRepeatsPacBio extends ToolCommand {
case class Args(inputBam: File = null, inputBed: File = null) extends AbstractArgs
case class Args(inputBam: File = null,
outputFile: Option[File] = None,
inputBed: File = null) extends AbstractArgs
class OptParser extends AbstractOptParser {
opt[File]('I', "inputBam") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(inputBam = x)
}
} text "Path to input file"
opt[File]('o', "outputFile") maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(outputFile = Some(x))
} text "Path to input file"
opt[File]('b', "inputBed") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(inputBed = x)
} text "output file, default to stdout"
} text "Path to bed file"
}
/**
......@@ -50,7 +55,6 @@ object FindRepeatsPacBio extends ToolCommand {
val header = List("chr", "startPos", "stopPos", "Repeat_seq", "repeatLength",
"original_Repeat_readLength", "Calculated_repeat_readLength",
"minLength", "maxLength", "inserts", "deletions", "notSpan")
println(header.mkString("\t"))
for (
bedLine <- Source.fromFile(commandArgs.inputBed).getLines();
......@@ -84,9 +88,21 @@ object FindRepeatsPacBio extends ToolCommand {
if (length < minLength || minLength == -1) minLength = length
}
}
println(List(chr, startPos, stopPos, typeRepeat, repeatLength, oriRepeatLength, calcRepeatLength.mkString(","), minLength,
maxLength, inserts.mkString("/"), deletions.mkString("/"), notSpan).mkString("\t"))
bamIter.close()
commandArgs.outputFile match {
case Some(file) => {
val writer = new PrintWriter(file)
writer.println(header.mkString("\t"))
writer.println(List(chr, startPos, stopPos, typeRepeat, repeatLength, oriRepeatLength, calcRepeatLength.mkString(","), minLength,
maxLength, inserts.mkString("/"), deletions.mkString("/"), notSpan).mkString("\t"))
writer.close()
}
case _ => {
println(header.mkString("\t"))
println(List(chr, startPos, stopPos, typeRepeat, repeatLength, oriRepeatLength, calcRepeatLength.mkString(","), minLength,
maxLength, inserts.mkString("/"), deletions.mkString("/"), notSpan).mkString("\t"))
}
}
}
}
......
......@@ -73,7 +73,7 @@ object SageCountFastq extends ToolCommand {
if (counts.contains(seq)) counts(seq) += 1
else counts += (seq -> 1)
count += 1
if (count % 1000000 == 0) System.err.println(count + " sequences done")
if (count % 1000000 == 0) logger.info(count + " sequences done")
}
})
logger.info(count + " sequences done")
......
......@@ -77,10 +77,10 @@ object SageCreateLibrary extends ToolCommand {
opt[Int]("length") required () unbounded () action { (x, c) =>
c.copy(length = x)
}
opt[File]("noTagsOutput") required () unbounded () valueName "<file>" action { (x, c) =>
opt[File]("noTagsOutput") unbounded () valueName "<file>" action { (x, c) =>
c.copy(noTagsOutput = x)
}
opt[File]("noAntiTagsOutput") required () unbounded () valueName "<file>" action { (x, c) =>
opt[File]("noAntiTagsOutput") unbounded () valueName "<file>" action { (x, c) =>
c.copy(noAntiTagsOutput = x)
}
opt[File]("allGenesOutput") unbounded () valueName "<file>" action { (x, c) =>
......@@ -88,8 +88,7 @@ object SageCreateLibrary extends ToolCommand {
}
}
var tagRegex: Regex = null
var geneRegex = """ENSG[0-9]{11}""".r
val geneRegex = """ENSG[0-9]{11}""".r
val tagGenesMap: mutable.Map[String, TagGenes] = mutable.Map()
......@@ -114,23 +113,24 @@ object SageCreateLibrary extends ToolCommand {
if (!commandArgs.input.exists) throw new IllegalStateException("Input file not found, file: " + commandArgs.input)
tagRegex = (commandArgs.tag + "[CATG]{" + commandArgs.length + "}").r
val tagRegex = (commandArgs.tag + "[CATG]{" + commandArgs.length + "}").r
var count = 0
System.err.println("Reading fasta file")
logger.info("Reading fasta file")
val reader = FastaReaderHelper.readFastaDNASequence(commandArgs.input)
System.err.println("Finding tags")
logger.info("Finding tags")
for ((name, seq) <- reader) {
getTags(name, seq)
val result = getTags(name, seq, tagRegex)
addTagresultToTaglib(name, result)
count += 1
if (count % 10000 == 0) System.err.println(count + " transcripts done")
if (count % 10000 == 0) logger.info(count + " transcripts done")
}
System.err.println(count + " transcripts done")
logger.info(count + " transcripts done")
System.err.println("Start sorting tags")
logger.info("Start sorting tags")
val tagGenesMapSorted: SortedMap[String, TagGenes] = SortedMap(tagGenesMap.toArray: _*)
System.err.println("Writting output files")
logger.info("Writting output files")
val writer = new PrintWriter(commandArgs.output)
writer.println("#tag\tfirstTag\tAllTags\tFirstAntiTag\tAllAntiTags")
for ((tag, genes) <- tagGenesMapSorted) {
......@@ -167,7 +167,7 @@ object SageCreateLibrary extends ToolCommand {
}
}
def addTagresultToTaglib(name: String, tagResult: TagResult) {
private def addTagresultToTaglib(name: String, tagResult: TagResult) {
val id = name.split(" ").head //.stripPrefix("hg19_ensGene_")
val geneID = geneRegex.findFirstIn(name).getOrElse("unknown_gene")
allGenes.add(geneID)
......@@ -195,15 +195,13 @@ object SageCreateLibrary extends ToolCommand {
}
}
def getTags(name: String, seq: DNASequence): TagResult = {
def getTags(name: String, seq: DNASequence, tagRegex: Regex): TagResult = {
val allTags: List[String] = for (tag <- tagRegex.findAllMatchIn(seq.getSequenceAsString).toList) yield tag.toString()
val firstTag = if (allTags.isEmpty) null else allTags.last
val allAntiTags: List[String] = for (tag <- tagRegex.findAllMatchIn(seq.getReverseComplement.getSequenceAsString).toList) yield tag.toString()
val firstAntiTag = if (allAntiTags.isEmpty) null else allAntiTags.head
val result = new TagResult(firstTag, allTags, firstAntiTag, allAntiTags)
addTagresultToTaglib(name, result)
result
}
}
\ No newline at end of file
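Editor's note: because getTags now receives tagRegex as a parameter instead of reading a mutable field, it can be exercised in isolation. The sketch below (not part of the commit) shows how the regex built in main() behaves; the tag and length values are chosen for readability, not the tool's defaults, and the object name is invented.

import scala.util.matching.Regex

object TagRegexSketch {
  def main(args: Array[String]): Unit = {
    val tag = "CATG"   // illustrative anchoring-enzyme site
    val length = 4     // illustrative tag length
    val tagRegex: Regex = (tag + "[CATG]{" + length + "}").r
    val seq = "TTCATGACGTTTCATGGGTA"
    // findAllMatchIn is what getTags applies to both strands
    println(tagRegex.findAllMatchIn(seq).map(_.toString()).toList)
    // -> List(CATGACGT, CATGGGTA)
  }
}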
......@@ -37,7 +37,7 @@ class SageCreateTagCounts(val root: Configurable) extends ToolCommandFuntion {
@Output(doc = "Sense count file", shortName = "sense", required = true)
var countSense: File = _
@Output(doc = "Sense all coun filet", shortName = "allsense", required = true)
@Output(doc = "Sense all count file", shortName = "allsense", required = true)
var countAllSense: File = _
@Output(doc = "AntiSense count file", shortName = "antisense", required = true)
......@@ -148,9 +148,18 @@ object SageCreateTagCounts extends ToolCommand {
writer.close()
}
}
writeFile(commandArgs.countSense, senseCounts)
writeFile(commandArgs.countAllSense, allSenseCounts)
writeFile(commandArgs.countAntiSense, antiSenseCounts)
writeFile(commandArgs.countAllAntiSense, allAntiSenseCounts)
if (commandArgs.countSense != null) {
writeFile(commandArgs.countSense, senseCounts)
}
if (commandArgs.countAllAntiSense != null) {
writeFile(commandArgs.countAllAntiSense, allAntiSenseCounts)
}
if (commandArgs.countAllSense != null) {
writeFile(commandArgs.countAllSense, allSenseCounts)
}
if (commandArgs.countAntiSense != null) {
writeFile(commandArgs.countAntiSense, antiSenseCounts)
}
}
}
\ No newline at end of file
......@@ -15,7 +15,7 @@
*/
package nl.lumc.sasc.biopet.tools
import java.io.File
import java.io.{ PrintWriter, File }
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.utils.ConfigUtils._
......@@ -27,12 +27,15 @@ import scala.io.Source
* This tool can convert a tsv to a json file
*/
object SamplesTsvToJson extends ToolCommand {
case class Args(inputFiles: List[File] = Nil) extends AbstractArgs
case class Args(inputFiles: List[File] = Nil, outputFile: Option[File] = None) extends AbstractArgs
class OptParser extends AbstractOptParser {
opt[File]('i', "inputFiles") required () unbounded () valueName "<file>" action { (x, c) =>
c.copy(inputFiles = x :: c.inputFiles)
} text "Input must be a tsv file, first line is seen as header and must at least have a 'sample' column, 'library' column is optional, multiple files allowed"
opt[File]('o', "outputFile") unbounded () valueName "<file>" action { (x, c) =>
c.copy(outputFile = Some(x))
}
}
/** Executes SamplesTsvToJson */
......@@ -40,41 +43,53 @@ object SamplesTsvToJson extends ToolCommand {
val argsParser = new OptParser
val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
val fileMaps = for (inputFile <- commandArgs.inputFiles) yield {
val reader = Source.fromFile(inputFile)
val lines = reader.getLines().toList.filter(!_.isEmpty)
val header = lines.head.split("\t")
val sampleColumn = header.indexOf("sample")
val libraryColumn = header.indexOf("library")
if (sampleColumn == -1) throw new IllegalStateException("Sample column does not exist in: " + inputFile)
val jsonString = stringFromInputs(commandArgs.inputFiles)
commandArgs.outputFile match {
case Some(file) => {
val writer = new PrintWriter(file)
writer.println(jsonString)
writer.close()
}
case _ => println(jsonString)
}
}
val sampleLibCache: mutable.Set[(String, Option[String])] = mutable.Set()
def mapFromFile(inputFile: File): Map[String, Any] = {
val reader = Source.fromFile(inputFile)
val lines = reader.getLines().toList.filter(!_.isEmpty)
val header = lines.head.split("\t")
val sampleColumn = header.indexOf("sample")
val libraryColumn = header.indexOf("library")
if (sampleColumn == -1) throw new IllegalStateException("Sample column does not exist in: " + inputFile)
val librariesValues: List[Map[String, Any]] = for (tsvLine <- lines.tail) yield {
val values = tsvLine.split("\t")
require(header.length == values.length, "Number of columns is not the same as the header")
val sample = values(sampleColumn)
val library = if (libraryColumn != -1) Some(values(libraryColumn)) else None
val sampleLibCache: mutable.Set[(String, Option[String])] = mutable.Set()
//FIXME: this is a workaround, should be removed after fixing #180
if (sample.head.isDigit || library.forall(_.head.isDigit))
throw new IllegalStateException("Sample or library may not start with a number")
val librariesValues: List[Map[String, Any]] = for (tsvLine <- lines.tail) yield {
val values = tsvLine.split("\t")
require(header.length == values.length, "Number of columns is not the same as the header")
val sample = values(sampleColumn)
val library = if (libraryColumn != -1) Some(values(libraryColumn)) else None
if (sampleLibCache.contains((sample, library)))
throw new IllegalStateException(s"Combination of $sample and $library is found multiple times")
else sampleLibCache.add((sample, library))
val valuesMap = (for (
t <- 0 until values.size if !values(t).isEmpty && t != sampleColumn && t != libraryColumn
) yield header(t) -> values(t)).toMap
library match {
case Some(lib) => Map("samples" -> Map(sample -> Map("libraries" -> Map(library -> valuesMap))))
case _ => Map("samples" -> Map(sample -> valuesMap))
}
//FIXME: this is a workaround, should be removed after fixing #180
if (sample.head.isDigit || library.forall(_.head.isDigit))
throw new IllegalStateException("Sample or library may not start with a number")
if (sampleLibCache.contains((sample, library)))
throw new IllegalStateException(s"Combination of $sample ${library.map("and " + _).getOrElse("")} is found multiple times")
else sampleLibCache.add((sample, library))
val valuesMap = (for (
t <- 0 until values.size if !values(t).isEmpty && t != sampleColumn && t != libraryColumn
) yield header(t) -> values(t)).toMap
library match {
case Some(lib) => Map("samples" -> Map(sample -> Map("libraries" -> Map(lib -> valuesMap))))
case _ => Map("samples" -> Map(sample -> valuesMap))
}
librariesValues.foldLeft(Map[String, Any]())((acc, kv) => mergeMaps(acc, kv))
}
val map = fileMaps.foldLeft(Map[String, Any]())((acc, kv) => mergeMaps(acc, kv))
val json = mapToJson(map)
println(json.spaces2)
librariesValues.foldLeft(Map[String, Any]())((acc, kv) => mergeMaps(acc, kv))
}
def stringFromInputs(inputs: List[File]): String = {
val map = inputs.map(f => mapFromFile(f)).foldLeft(Map[String, Any]())((acc, kv) => mergeMaps(acc, kv))
mapToJson(map).spaces2
}
}
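Editor's note: splitting main() into mapFromFile and stringFromInputs removes the dependency on parsed command-line arguments, which is what makes this tool unit-testable (the point of this merge request). Below is a hedged sketch, not one of the tests added here, of how the new entry point could be driven directly; the temp-file setup and object name are invented for illustration.

import java.io.{ File, PrintWriter }
import nl.lumc.sasc.biopet.tools.SamplesTsvToJson

object SamplesTsvToJsonSketch {
  def main(args: Array[String]): Unit = {
    val tsv = File.createTempFile("samples", ".tsv")
    tsv.deleteOnExit()
    val writer = new PrintWriter(tsv)
    writer.println("sample\tlibrary\tbam")
    writer.println("Sample_ID_1\tLib_ID_1\tMyFirst.bam")
    writer.close()
    // Prints the nested samples -> libraries JSON built from the single row
    println(SamplesTsvToJson.stringFromInputs(List(tsv)))
  }
}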
......@@ -15,7 +15,7 @@
*/
package nl.lumc.sasc.biopet.tools
import java.io.File
import java.io.{ PrintWriter, File }
import htsjdk.samtools.fastq.{ FastqReader, FastqRecord }
import nl.lumc.sasc.biopet.core.config.Configurable
......@@ -45,7 +45,7 @@ class SeqStat(val root: Configurable) extends ToolCommandFuntion with Summarizab
override def defaultCoreMemory = 2.5
override def commandLine = super.commandLine + required("-i", input) + " > " + required(output)
override def commandLine = super.commandLine + required("-i", input) + required("-o", output)
def summaryStats: Map[String, Any] = {
val map = ConfigUtils.fileToConfigMap(output)
......@@ -108,7 +108,7 @@ object SeqStat extends ToolCommand {
private var baseQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
private var readQualHistoMap: mutable.Map[Int, Long] = mutable.Map(0 -> 0)
case class Args(fastq: File = new File("")) extends AbstractArgs
case class Args(fastq: File = null, outputJson: Option[File] = None) extends AbstractArgs
class OptParser extends AbstractOptParser {
......@@ -117,11 +117,14 @@ object SeqStat extends ToolCommand {
|$commandName - Summarize FastQ
""".stripMargin)
opt[File]('i', "fastq") required () valueName "<fastq>" action { (x, c) =>
opt[File]('i', "fastq") required () unbounded () valueName "<fastq>" action { (x, c) =>
c.copy(fastq = x)
} validate {
x => if (x.exists) success else failure("FASTQ file not found")
} text "FastQ file to generate stats from"
opt[File]('o', "output") unbounded () valueName "<json>" action { (x, c) =>
c.copy(outputJson = Some(x))
} text "File to write output to, if not supplied output go to stdout"
}
/**
......@@ -317,6 +320,13 @@ object SeqStat extends ToolCommand {
))
)
println(ConfigUtils.mapToJson(report))
commandArgs.outputJson match {
case Some(file) => {
val writer = new PrintWriter(file)
writer.println(ConfigUtils.mapToJson(report))
writer.close()
}
case _ => println(ConfigUtils.mapToJson(report))
}
}
}
......@@ -15,7 +15,8 @@
*/
package nl.lumc.sasc.biopet.tools
import java.io.File
import java.io.{ PrintWriter, File }
import java.nio.file.Paths
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.core.summary.Summary
......@@ -35,15 +36,27 @@ object SummaryToTsv extends ToolCommand {
opt[File]('s', "summary") required () unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(summary = x)
}
opt[File]('o', "output") maxOccurs 1 unbounded () valueName "<file>" action { (x, c) =>
opt[File]('o', "outputFile") unbounded () maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(outputFile = Some(x))
}
opt[String]('p', "path") required () unbounded () valueName "<value>" action { (x, c) =>
opt[String]('p', "path") required () unbounded () valueName "<string>" action { (x, c) =>
c.copy(values = c.values ::: x :: Nil)
}
} text
"""
|String that determines the values extracted from the summary. Should be of the format: