Commit b76ad0ad authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Merge branch 'develop' into feature-report

Conflicts:
	public/shiva/src/main/scala/nl/lumc/sasc/biopet/pipelines/shiva/ShivaTrait.scala
parents dee7eea1 2b7a4f2f
{
"samples" : {
"sampleA" : {
"libraries" : {
"lib_1" : {
"R1" : "/path/to/inputA_R1.fq.gz",
"R2" : "/path/to/inputA_R2.fq.gz"
}
}
},
"sampleB" : {
"libraries" : {
"lib_1" : {
"R1" : "/path/to/inputB_1_R1.fq.gz",
"R2" : "/path/to/inputB_1_R2.fq.gz"
},
"lib_2": {
"R1" : "/path/to/inputB_2_R1.fq.gz",
"R2" : "/path/to/inputB_2_R2.fq.gz"
}
}
}
},
"gentrap": {
"output_dir": "/path/to/output_dir",
"expression_measures": ["fragments_per_gene", "bases_per_gene", "bases_per_exon"],
"strand_protocol": "non_specific",
"aligner": "gsnap",
"reference": "/share/isilon/system/local/Genomes-new-27-10-2011/H.Sapiens/hg19_nohap/gsnap/reference.fa",
"annotation_gtf": "/path/to/data/annotation/ucsc_refseq.gtf",
"annotation_bed": "/path/to/data/annotation/ucsc_refseq.bed",
"annotation_refflat": "/path/to/data/annotation/ucsc_refseq.refFlat",
"gsnap": {
"dir": "/share/isilon/system/local/Genomes-new-27-10-2011/H.Sapiens/hg19_nohap/gsnap",
"db": "hg19_nohap",
"quiet_if_excessive": true,
"npaths": 1
},
"cutadapt": {
"minimum_length": 20
},
"mapping": {
"flexiprep": {
"fastqc": {
"threads": 6,
"nogroup": true
}
}
},
"rawbasecounter": {
"core_memory": "20G"
}
}
}
......@@ -110,18 +110,23 @@ Thus, an example settings configuration is as follows:
}
~~~
#### Example configurations
In most cases, it's practical to combine the samples and settings configuration into one file. Here is an [example config file](/examples/gentrap_example.json) where both samples and settings are stored into one file. Note also that there are additional tool configurations in the config file.
## Running Gentrap
As with other pipelines in the Biopet suite, Gentrap can be run by specifying the pipeline after the `pipeline` subcommand:
~~~
java -jar </path/to/biopet.jar> pipeline gentrap -config </path/to/config.json> -qsub -jobParaEnv BWA -run
$ java -jar </path/to/biopet.jar> pipeline gentrap -config </path/to/config.json> -qsub -jobParaEnv BWA -run
~~~
If you already have the `biopet` environment module loaded, you can also simply call `biopet`:
You can also use the `biopet` environment module (recommended) when you are running the pipeline in SHARK:
~~~
biopet pipeline gentrap -config </path/to/config.json> -qsub -jobParaEnv BWA -run
$ module load biopet/v0.3.1
$ biopet pipeline gentrap -config </path/to/config.json> -qsub -jobParaEnv BWA -run
~~~
It is also a good idea to specify retries (we recommend `-retry 3` up to `-retry 5`) so that cluster glitches do not interfere with your pipeline runs.
......
......@@ -113,6 +113,8 @@ To view all possible config options please navigate to our Gitlab wiki page
| shiva | use_analyze_covariates | Boolean | false | Analyze covariates during base recalibration step |
| shiva | bam_to_fastq | Boolean | false | Convert bam files to fastq files |
| shiva | correct_readgroups | Boolean | false | Attempt to correct read groups |
| shiva | amplicon_bed | Path | | Path to target bed file |
| shiva | regions_of_interest | Array of paths | | Array of paths to region of interest (e.g. gene panels) bed files |
| vcffilter | min_sample_depth | Integer | 8 | Filter variants with at least x coverage |
| vcffilter | min_alternate_depth | Integer | 2 | Filter variants with at least x depth on the alternate allele |
| vcffilter | min_samples_pass | Integer | 1 | Minimum amount of samples which pass custom filter (requires additional flags) |
......
......@@ -25,7 +25,15 @@
<div class="col-md-1"></div>
<div class="col-md-6">
<p>
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc risus est, volutpat quis enim sit amet, lacinia posuere ante. Mauris eget massa efficitur, luctus nisl ut, placerat nibh. Pellentesque id nulla maximus, rutrum dui nec, lobortis odio. Fusce eu enim ac sem auctor congue. Ut ac ullamcorper quam, eget sollicitudin felis. Maecenas posuere sagittis blandit. Proin mollis magna lectus, id gravida est consectetur vitae. Nulla id risus at tellus laoreet finibus in id lacus. Duis lobortis commodo nisl viverra tempor. Curabitur sit amet pretium dui, sit amet tincidunt mauris. Duis volutpat eu purus ut molestie.
#if (sampleId.isDefined && libId.isDefined)
Here we show basic <a href="https://en.wikibooks.org/wiki/Next_Generation_Sequencing_%28NGS%29/Alignment">alignment</a> statistics for this run for sample ${sampleId} with library ${libId}. Total number of reads, number of aligned reads and number of duplicate reads are given, and the percentages thereof as a percentage of total.
#elseif(sampleId.isDefined && showPlot)
The following plot shows basic <a href="https://en.wikibooks.org/wiki/Next_Generation_Sequencing_%28NGS%29/Alignment">alignment</a> statistics for this run for sample ${sampleId}. Every library is represented by a multi-color bar. Red represents the total number of properly mapped reads for this sample. Green represents the total number of duplicate reads, which is usually caused by <a href="http://www.cureffi.org/2012/12/11/how-pcr-duplicates-arise-in-next-generation-sequencing/">PCR duplicates</a>. Blue denotes the number of unmapped reads, and purple denotes reads flagged <em>secondary</em> (this is dependent on the aligner used). A table showing similar statistics, including values represented as percent of total, can be downloaded as a tab-delimited file.
#elseif(sampleId.isDefined && !showPlot)
Here we show basic <a href="https://en.wikibooks.org/wiki/Next_Generation_Sequencing_%28NGS%29/Alignment">alignment</a> statistics for this run for every library of sample ${sampleId}. Total number of reads, number of aligned reads and number of duplicate reads are given, and the percentages thereof as a percentage of total.
#else
The following plot shows basic <a href="https://en.wikibooks.org/wiki/Next_Generation_Sequencing_%28NGS%29/Alignment">alignment</a> statistics for this run. Every sample is represented by a multi-color bar. Red represents the total number of properly mapped reads for this sample. Green represents the total number of duplicate reads, which is usually caused by <a href="http://www.cureffi.org/2012/12/11/how-pcr-duplicates-arise-in-next-generation-sequencing/">PCR duplicates</a>. Blue denotes the number of unmapped reads, and purple denotes reads flagged <em>secondary</em> (this is dependent on the aligner used). A table showing similar statistics, including values represented as percent of total, can be downloaded as a tab-delimited file.
#end
</p>
</div>
</div>
......
......@@ -29,7 +29,13 @@
<div class="col-md-1"></div>
<div class="col-md-6">
<p>
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc risus est, volutpat quis enim sit amet, lacinia posuere ante. Mauris eget massa efficitur, luctus nisl ut, placerat nibh. Pellentesque id nulla maximus, rutrum dui nec, lobortis odio. Fusce eu enim ac sem auctor congue. Ut ac ullamcorper quam, eget sollicitudin felis. Maecenas posuere sagittis blandit. Proin mollis magna lectus, id gravida est consectetur vitae. Nulla id risus at tellus laoreet finibus in id lacus. Duis lobortis commodo nisl viverra tempor. Curabitur sit amet pretium dui, sit amet tincidunt mauris. Duis volutpat eu purus ut molestie.
#if (sampleId.isDefined && libId.isDefined)
something
#elseif(sampleId.isDefined)
This plot shows the insert size distribution for the libraries of sample ${sampleId}. <a href="http://thegenomefactory.blogspot.nl/2013/08/paired-end-read-confusion-library.html">Insert size</a> denotes the size of the so-called <em>insert</em> between two read pairs in a paired-end sequencing run. This should correspond to the length of the sequence between the sequencing adaptors. The provided table shows mean and median insert size for each sample, together with the standard deviation.
#else
This plot shows the insert size distribution for each of the ${samples.size} samples. <a href="http://thegenomefactory.blogspot.nl/2013/08/paired-end-read-confusion-library.html">Insert size</a> denotes the size of the so-called <em>insert</em> between two read pairs in a paired-end sequencing run. This should correspond to the length of the sequence between the sequencing adaptors. The provided table shows mean and median insert size for each sample, together with the standard deviation.
#end
</p>
</div>
</div>
......
......@@ -29,7 +29,7 @@
<div class="col-md-1"></div>
<div class="col-md-6">
<p>
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc risus est, volutpat quis enim sit amet, lacinia posuere ante. Mauris eget massa efficitur, luctus nisl ut, placerat nibh. Pellentesque id nulla maximus, rutrum dui nec, lobortis odio. Fusce eu enim ac sem auctor congue. Ut ac ullamcorper quam, eget sollicitudin felis. Maecenas posuere sagittis blandit. Proin mollis magna lectus, id gravida est consectetur vitae. Nulla id risus at tellus laoreet finibus in id lacus. Duis lobortis commodo nisl viverra tempor. Curabitur sit amet pretium dui, sit amet tincidunt mauris. Duis volutpat eu purus ut molestie.
Here we show the total number of positions in the reference that are covered with a given coverage. This plot is whole-genome based, and will therefore be highly skewed in the case of an exome or targeted approach.
</p>
</div>
</div>
......
......@@ -38,7 +38,9 @@ class BamMetrics(val root: Configurable) extends QScript with SummaryQScript wit
/** Bed of amplicon that is used */
var ampliconBedFile: Option[File] = config("amplicon_bed")
var rnaMetrics: Boolean = config("rna_metrcis", default = false)
/** Settings for CollectRnaSeqMetrics */
var rnaMetricsSettings: Map[String, String] = Map()
var transcriptRefFlatFile: Option[File] = config("transcript_refflat")
/** return location of summary file */
def summaryFile = (sampleId, libId) match {
......@@ -89,17 +91,22 @@ class BamMetrics(val root: Configurable) extends QScript with SummaryQScript wit
add(gcBiasMetrics)
addSummarizable(gcBiasMetrics, "gc_bias")
val wgsMetrics = new CollectWgsMetrics(this)
wgsMetrics.input = inputBam
wgsMetrics.output = swapExt(outputDir, inputBam, ".bam", ".wgs.metrics")
add(wgsMetrics)
addSummarizable(wgsMetrics, "wgs")
if (transcriptRefFlatFile.isEmpty) {
val wgsMetrics = new CollectWgsMetrics(this)
wgsMetrics.input = inputBam
wgsMetrics.output = swapExt(outputDir, inputBam, ".bam", ".wgs.metrics")
add(wgsMetrics)
addSummarizable(wgsMetrics, "wgs")
}
if (rnaMetrics) {
if (transcriptRefFlatFile.isDefined) {
val rnaMetrics = new CollectRnaSeqMetrics(this)
rnaMetrics.input = inputBam
rnaMetrics.output = swapExt(outputDir, inputBam, ".bam", ".rna.metrics")
rnaMetrics.chartOutput = Some(swapExt(outputDir, inputBam, ".bam", ".rna.metrics.pdf"))
rnaMetrics.refFlat = transcriptRefFlatFile.get
rnaMetrics.ribosomalIntervals = rnaMetricsSettings.get("ribosomal_intervals").collect { case n => new File(n) }
rnaMetrics.strandSpecificity = rnaMetricsSettings.get("strand_specificity")
add(rnaMetrics)
addSummarizable(rnaMetrics, "rna")
}
......
......@@ -244,7 +244,7 @@ object BammetricsReport extends ReportBuilder {
}
}
var map: Map[Int, Map[String, Int]] = Map()
var map: Map[Int, Map[String, Long]] = Map()
def fill(sample: String, lib: Option[String]): Unit = {
......@@ -257,7 +257,7 @@ object BammetricsReport extends ReportBuilder {
case (l: List[_], l2: List[_]) => {
l.zip(l2).foreach(i => {
val insertSize = i._1.toString.toInt
val count = i._2.toString.toInt
val count = i._2.toString.toLong
val old = map.getOrElse(insertSize, Map())
if (libraryLevel) map += insertSize -> (old + ((s"$sample-" + lib.get) -> count))
else map += insertSize -> (old + (sample -> count))
......
......@@ -45,9 +45,10 @@ class BamMetricsTest extends TestNGSuite with Matchers {
@Test(dataProvider = "bammetricsOptions")
def testFlexiprep(rois: Int, amplicon: Boolean, rna: Boolean) = {
val map = ConfigUtils.mergeMaps(Map("output_dir" -> BamMetricsTest.outputDir, "rna_metrcis" -> rna
), Map(BamMetricsTest.executables.toSeq: _*)) ++
val map = ConfigUtils.mergeMaps(Map("output_dir" -> BamMetricsTest.outputDir),
Map(BamMetricsTest.executables.toSeq: _*)) ++
(if (amplicon) Map("amplicon_bed" -> "amplicon.bed") else Map()) ++
(if (rna) Map("transcript_refflat" -> "transcripts.refFlat") else Map()) ++
Map("regions_of_interest" -> (1 to rois).map("roi_" + _ + ".bed").toList)
val bammetrics: BamMetrics = initPipeline(map)
......@@ -59,7 +60,7 @@ class BamMetricsTest extends TestNGSuite with Matchers {
var regions: Int = rois + (if (amplicon) 1 else 0)
bammetrics.functions.count(_.isInstanceOf[CollectRnaSeqMetrics]) shouldBe (if (rna) 1 else 0)
bammetrics.functions.count(_.isInstanceOf[CollectWgsMetrics]) shouldBe 1
bammetrics.functions.count(_.isInstanceOf[CollectWgsMetrics]) shouldBe (if (rna) 0 else 1)
bammetrics.functions.count(_.isInstanceOf[CollectMultipleMetrics]) shouldBe 1
bammetrics.functions.count(_.isInstanceOf[CalculateHsMetrics]) shouldBe (if (amplicon) 1 else 0)
bammetrics.functions.count(_.isInstanceOf[CollectTargetedPcrMetrics]) shouldBe (if (amplicon) 1 else 0)
......
......@@ -37,7 +37,8 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurab
var vmem: Option[String] = config("vmem")
protected val defaultCoreMemory: Double = 1.0
var vmemFactor: Double = config("vmem_factor", default = 1.5)
protected val defaultVmemFactor: Double = 1.4
var vmemFactor: Double = config("vmem_factor", default = defaultVmemFactor)
var residentFactor: Double = config("resident_factor", default = 1.2)
......@@ -156,7 +157,13 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurab
protected val versionExitcode = List(0)
/** Executes the version command */
private def getVersionInternal: Option[String] = {
private[core] def getVersionInternal(): Option[String] = {
if (versionCommand == null || versionRegex == null) None
else getVersionInternal(versionCommand, versionRegex)
}
/** Executes the version command */
private[core] def getVersionInternal(versionCommand: String, versionRegex: Regex): Option[String] = {
if (versionCommand == null || versionRegex == null) return None
val exe = new File(versionCommand.trim.split(" ")(0))
if (!exe.exists()) return None
......@@ -221,7 +228,7 @@ trait BiopetCommandLineFunctionTrait extends CommandLineFunction with Configurab
/** stores global caches */
object BiopetCommandLineFunctionTrait {
import scala.collection.mutable.Map
private val versionCache: Map[String, String] = Map()
private[core] val versionCache: Map[String, String] = Map()
private[core] val executableMd5Cache: Map[String, String] = Map()
private val executableCache: Map[String, String] = Map()
private[core] val executableCache: Map[String, String] = Map()
}
......@@ -25,6 +25,8 @@ trait BiopetJavaCommandLineFunction extends JavaCommandLineFunction with BiopetC
javaGCHeapFreeLimit = config("java_gc_heap_freelimit")
javaGCTimeLimit = config("java_gc_timelimit")
override protected val defaultVmemFactor: Double = 2.0
/** Constructs java opts, this adds scala threads */
override def javaOpts = super.javaOpts +
optional("-Dscala.concurrent.context.numThreads=", threads, spaceSeparated = false, escape = false)
......@@ -37,6 +39,19 @@ trait BiopetJavaCommandLineFunction extends JavaCommandLineFunction with BiopetC
return cmd
}
def javaVersionCommand: String = executable + " -version"
def getJavaVersion: Option[String] = {
if (!BiopetCommandLineFunctionTrait.executableCache.contains(executable))
preProcesExecutable
if (!BiopetCommandLineFunctionTrait.versionCache.contains(javaVersionCommand))
getVersionInternal(javaVersionCommand, """java version "(.*)"""".r) match {
case Some(version) => BiopetCommandLineFunctionTrait.versionCache += javaVersionCommand -> version
case _ =>
}
BiopetCommandLineFunctionTrait.versionCache.get(javaVersionCommand)
}
override def setupRetry(): Unit = {
super.setupRetry()
javaMemoryLimit = memoryLimit
......
......@@ -74,11 +74,13 @@ class WriteSummary(val root: Configurable) extends InProcessFunction with Config
case f: BiopetJavaCommandLineFunction => {
f.configName -> Map("version" -> f.getVersion.getOrElse(None),
"java_md5" -> BiopetCommandLineFunctionTrait.executableMd5Cache.getOrElse(f.executable, None),
"jar_md5" -> SummaryQScript.md5sumCache.getOrElse(f.jarFile, None))
"java_version" -> f.getJavaVersion,
"jar_path" -> f.jarFile)
}
case f: BiopetCommandLineFunction => {
f.configName -> Map("version" -> f.getVersion.getOrElse(None),
"md5" -> BiopetCommandLineFunctionTrait.executableMd5Cache.getOrElse(f.executable, None))
"md5" -> BiopetCommandLineFunctionTrait.executableMd5Cache.getOrElse(f.executable, None),
"path" -> f.executable)
}
case _ => throw new IllegalStateException("This should not be possible")
}
......
......@@ -15,6 +15,8 @@ class CollectMultipleMetrics(val root: Configurable) extends Picard with Summari
javaMainClass = new picard.analysis.CollectMultipleMetrics().getClass.getName
override val defaultCoreMemory = 6.0
@Input(doc = "The input SAM or BAM files to analyze", required = true)
var input: File = null
......@@ -41,7 +43,7 @@ class CollectMultipleMetrics(val root: Configurable) extends Picard with Summari
}
case _ if p == Programs.CollectInsertSizeMetrics.toString => {
outputFiles :+= new File(outputName + ".insert_size_metrics")
outputFiles :+= new File(outputName + ".insert_size_Histogram.pdf")
outputFiles :+= new File(outputName + ".insert_size_histogram.pdf")
}
case _ if p == Programs.QualityScoreDistribution.toString => {
outputFiles :+= new File(outputName + ".quality_distribution_metrics")
......@@ -85,17 +87,25 @@ class CollectMultipleMetrics(val root: Configurable) extends Picard with Summari
case _ => None
}
val sum = new Summarizable {
override def summaryFiles: Map[String, File] = Map()
override def summaryStats = stats
override def summaryFiles: Map[String, File] = Map()
}
qscript.addSummarizable(sum, p)
})
}
def summaryFiles = Map()
def summaryStats = Map()
def summaryFiles = {
program.map {
case p if p == Programs.CollectInsertSizeMetrics.toString =>
Map(
"insert_size_histogram" -> new File(outputName + ".insert_size_histogram.pdf"),
"insert_size_metrics" -> new File(outputName + ".insert_size_metrics"))
case otherwise => Map()
}.foldLeft(Map.empty[String, File]) { case (acc, m) => (acc ++ m) }
}
}
object CollectMultipleMetrics {
......
......@@ -32,7 +32,7 @@ class CollectRnaSeqMetrics(val root: Configurable) extends Picard with Summariza
var input: File = null
@Input(doc = "Gene annotations in refFlat form", required = true)
var refFlat: File = config("refFlat")
var refFlat: File = null
@Input(doc = "Location of rRNA sequences in interval list format", required = false)
var ribosomalIntervals: Option[File] = config("ribosomal_intervals")
......@@ -68,6 +68,7 @@ class CollectRnaSeqMetrics(val root: Configurable) extends Picard with Summariza
var stopAfter: Option[Long] = config("stop_after")
override def beforeGraph: Unit = {
if (refFlat == null) refFlat = config("refFlat")
val validFlags = StrandSpecificity.values.map(_.toString).toSet
strandSpecificity match {
case Some(s) => require(validFlags.contains(s),
......@@ -84,7 +85,9 @@ class CollectRnaSeqMetrics(val root: Configurable) extends Picard with Summariza
"output_chart" -> chartOutput
).collect { case (key, Some(value)) => key -> value }
def summaryStats = Picard.getMetrics(output).getOrElse(Map())
def summaryStats = Map(
"metrics" -> Picard.getMetrics(output).getOrElse(Map()),
"histogram" -> Picard.getHistogram(output).getOrElse(Map()))
override def commandLine = super.commandLine +
required("INPUT=", input, spaceSeparated = false) +
......
......@@ -82,25 +82,29 @@ abstract class Picard extends BiopetJavaCommandLineFunction {
object Picard extends Logging {
lazy val getBiopetPicardVersion: Option[String] = {
val reader = Source.fromInputStream(getClass.getResourceAsStream("/dependency_list.txt"))
val dependencies = reader.getLines().map(_.trim.split(":")).filter(_.size == 5).map(line => Map(
"groupId" -> line(0),
"artifactId" -> line(1),
"type" -> line(2),
"version" -> line(3),
"scope" -> line(4)
)).toList
logger.debug("dependencies: " + dependencies)
val htsjdk = dependencies.find(dep => dep("groupId") == "samtools" && dep("artifactId") == "htsjdk").collect {
case dep =>
"samtools htsjdk " + dep("version")
}
Option(getClass.getResourceAsStream("/dependency_list.txt")) match {
case Some(src) =>
val dependencies = Source.fromInputStream(src)
.getLines().map(_.trim.split(":")).filter(_.size == 5).map(line => Map(
"groupId" -> line(0),
"artifactId" -> line(1),
"type" -> line(2),
"version" -> line(3),
"scope" -> line(4)
)).toList
logger.debug("dependencies: " + dependencies)
val htsjdk = dependencies.find(dep => dep("groupId") == "samtools" && dep("artifactId") == "htsjdk").collect {
case dep =>
"samtools htsjdk " + dep("version")
}
dependencies.find(dep => dep("groupId") == "picard" && dep("artifactId") == "picard").collect {
case dep =>
"Picard " + dep("version") + " using " + htsjdk.getOrElse("unknown htsjdk")
dependencies.find(dep => dep("groupId") == "picard" && dep("artifactId") == "picard").collect {
case dep =>
"Picard " + dep("version") + " using " + htsjdk.getOrElse("unknown htsjdk")
}
case otherwise => None
}
}
......
......@@ -36,7 +36,7 @@ class BiopetFlagstat(val root: Configurable) extends ToolCommandFuntion with Sum
@Output(doc = "summary output file", shortName = "output", required = false)
var summaryFile: File = _
override val defaultCoreMemory = 2.0
override val defaultCoreMemory = 6.0
override def commandLine = super.commandLine + required("-I", input) + required("-s", summaryFile) + " > " + required(output)
......
......@@ -32,7 +32,7 @@ class MergeTables(val root: Configurable) extends ToolCommandFuntion {
javaMainClass = getClass.getName
override val defaultCoreMemory = 2.0
override val defaultCoreMemory = 6.0
/** List of input tabular files */
@Input(doc = "Input table files", required = true)
......@@ -71,7 +71,7 @@ class MergeTables(val root: Configurable) extends ToolCommandFuntion {
required("-a", valueColumnIndex) +
optional("-n", idColumnName) +
optional("-e", fileExtension) +
optional("-h", numHeaderLines) +
optional("-m", numHeaderLines) +
optional("-f", fallbackString) +
optional("-d", delimiter) +
required("-o", output) +
......@@ -164,7 +164,7 @@ object MergeTables extends ToolCommand {
idColumnIndices: Seq[Int] = Seq.empty[Int],
valueColumnIndex: Int = -1,
fileExtension: String = "",
numHeaderLines: Int = 1,
numHeaderLines: Int = 0,
fallbackString: String = "-",
delimiter: Char = '\t',
out: File = new File("-")) extends AbstractArgs
......@@ -206,9 +206,9 @@ object MergeTables extends ToolCommand {
c.copy(fileExtension = x)
} text "Common extension of all input tables to strip (default: empty string)"
opt[Int]('h', "num_header_lines") optional () action { (x, c) =>
opt[Int]('m', "num_header_lines") optional () action { (x, c) =>
c.copy(numHeaderLines = x)
} text "The number of header lines present in all input files (default: 1; 1-line header)"
} text "The number of header lines present in all input files (default: 0; no header)"
opt[String]('f', "fallback") optional () action { (x, c) =>
c.copy(fallbackString = x)
......
......@@ -38,7 +38,7 @@ import nl.lumc.sasc.biopet.utils.ConfigUtils
*
* @param root Configuration object for the pipeline
*/
class Seqstat(val root: Configurable) extends ToolCommandFuntion with Summarizable {
class SeqStat(val root: Configurable) extends ToolCommandFuntion with Summarizable {
javaMainClass = getClass.getName
@Input(doc = "Input FASTQ", shortName = "input", required = true)
......@@ -76,16 +76,16 @@ object FqEncoding extends Enumeration {
val Unknown = Value(0, "Unknown")
}
object Seqstat extends ToolCommand {
def apply(root: Configurable, input: File, output: File): Seqstat = {
val seqstat = new Seqstat(root)
object SeqStat extends ToolCommand {
def apply(root: Configurable, input: File, output: File): SeqStat = {
val seqstat = new SeqStat(root)
seqstat.input = input
seqstat.output = new File(output, input.getName.substring(0, input.getName.lastIndexOf(".")) + ".seqstats.json")
seqstat
}
def apply(root: Configurable, fastqfile: File, outDir: String): Seqstat = {
val seqstat = new Seqstat(root)
def apply(root: Configurable, fastqfile: File, outDir: String): SeqStat = {
val seqstat = new SeqStat(root)
seqstat.input = fastqfile
seqstat.output = new File(outDir, fastqfile.getName.substring(0, fastqfile.getName.lastIndexOf(".")) + ".seqstats.json")
seqstat
......@@ -145,22 +145,14 @@ object Seqstat extends ToolCommand {
val h_qual = quals.length - 1
(l_qual < 59, h_qual > 74) match {
case (false, true) => {
phredEncoding = Solexa
}
case (true, true) => {
// TODO: check this later on
// complex case, we cannot tell whether this is a sanger or solexa
// but since the h_qual exceeds any Sanger/Illumina1.8 quals, we can `assume` this is solexa
phredEncoding = Solexa
}
case (true, false) => {
// this is definitely a Sanger sequence, the lower end is sanger only
phredEncoding = Sanger
}
case (_, _) => {
phredEncoding = Unknown
}
case (false, true) => phredEncoding = Solexa
// TODO: check this later on
// complex case, we cannot tell whether this is a sanger or solexa
// but since the h_qual exceeds any Sanger/Illumina1.8 quals, we can `assume` this is solexa
case (true, true) => phredEncoding = Solexa
// this is definitely a Sanger sequence, the lower end is sanger only
case (true, false) => phredEncoding = Sanger
case (_, _) => phredEncoding = Unknown
}
}
......@@ -214,7 +206,7 @@ object Seqstat extends ToolCommand {
}
// implicit conversion to Int using foldLeft(0)
val avgQual: Int = (readQual.sum / readQual.length)
val avgQual: Int = readQual.sum / readQual.length
if (readStats.qual.length <= avgQual) {
readStats.qual ++= mutable.ArrayBuffer.fill(avgQual - readStats.qual.length + 1)(0)
}
......@@ -254,6 +246,7 @@ object Seqstat extends ToolCommand {
baseStats(pos).nuc.zipWithIndex foreach { case (value, index) => nucs(index) += value }
}
detectPhredEncoding(quals)
logger.debug("Detected '" + phredEncoding.toString.toLowerCase + "' encoding in fastq file ...")
for (pos <- 0 until nucs.length) {
// always export the N-nucleotide
......@@ -269,19 +262,12 @@ object Seqstat extends ToolCommand {
}
for (pos <- 0 until quals.length) {
var key: Int = pos - phredEncoding.id
if (key > 0) {
// count till the max of baseHistogram.length
for (histokey <- 0 until key + 1) {
baseHistogram(histokey) += quals(pos)
}
val key: Int = pos - phredEncoding.id