Commit 4e9e3cb6 authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Merge branch 'feat_wipereads' into 'master'

WipeReads tool

This is the first iteration of the tool meant to remove rRNA reads. A few things are still missing: taking splicing into account for region overlap, GTF reading, refFlat reading, and the class wrapper for Queue. However, the main functionality is all there (mainly paired-end read removal and optional removal of reads with the same name), so I think this is mergeable for now.

See merge request !5
parents d1b18339 10c474fa
*.bam binary
*.bam.bai binary
......@@ -31,6 +31,11 @@
<version>6.8</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_2.11</artifactId>
<version>2.2.1</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
......@@ -56,6 +61,11 @@
<artifactId>biojava3-sequencing</artifactId>
<version>3.1.0</version>
</dependency>
<dependency>
<groupId>com.twitter</groupId>
<artifactId>algebird-core_2.10</artifactId>
<version>0.8.1</version>
</dependency>
</dependencies>
<build>
<resources>
......
package nl.lumc.sasc.biopet.core
import nl.lumc.sasc.biopet.core.apps.WipeReads
import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics
import nl.lumc.sasc.biopet.pipelines.basty.Basty
import nl.lumc.sasc.biopet.pipelines.flexiprep.Flexiprep
......@@ -15,39 +16,102 @@ import nl.lumc.sasc.biopet.pipelines.sage.Sage
object BiopetExecutable {
val pipelines: Map[String,PipelineCommand] = Map(
"flexiprep" -> Flexiprep,
"mapping" -> Mapping,
"gentrap" -> Gentrap,
"bam-metrics" -> BamMetrics,
"gatk-benchmark-genotyping" -> GatkBenchmarkGenotyping,
"gatk-genotyping" -> GatkGenotyping,
"gatk-variantcalling" -> GatkVariantcalling,
"gatk-pipeline" -> GatkPipeline,
"gatk-variant-recalibration" -> GatkVariantRecalibration,
"gatk-vcf-sample-compare" -> GatkVcfSampleCompare,
"sage" -> Sage,
"basty" -> Basty
)
val pipelines: List[MainCommand] = List(
Flexiprep,
Mapping,
Gentrap,
BamMetrics,
GatkBenchmarkGenotyping,
GatkGenotyping,
GatkVariantcalling,
GatkPipeline,
GatkVariantRecalibration,
GatkVcfSampleCompare,
Sage,
Basty)
val tools: List[MainCommand] = List(
WipeReads)
/**
* @param args the command line arguments
*/
def main(args: Array[String]): Unit = {
/** Renders the sorted names of the given commands as a bulleted list under an "Available <kind>:" heading. */
def toBulletedList(m: List[MainCommand], kind: String = "", bullet: String = "-") = {
  // The same newline/bullet prefix introduces the first entry and separates the others.
  val separator = s"\n $bullet "
  val names = m.map(_.name).sorted
  names.mkString(s"Available $kind:$separator", separator, "")
}
lazy val pipelineList: String = toBulletedList(pipelines, "pipelines")
lazy val toolList: String = toBulletedList(tools, "tools")
lazy val addendum: String =
"""Questions or comments? Email sasc@lumc.nl or check out the project page at https://git.lumc.nl/biopet/biopet.git""".stripMargin
lazy val baseUsage: String =
"""
|Usage: java -jar BiopetFramework.jar {pipeline,tool} {pipeline/tool name} {pipeline/tool-specific options}
|
|%s
|
|%s
""".stripMargin.format("%s", addendum)
lazy val mainUsage: String =
baseUsage.format(pipelineList + "\n\n" + toolList)
lazy val pipelineUsage: String = baseUsage
.replaceFirst("""\{pipeline,tool\}""", "pipeline")
.replace("""pipeline/tool""", "pipeline")
.format(pipelineList)
lazy val toolUsage: String = baseUsage
.replaceFirst("""\{pipeline,tool\}""", "tool")
.replace("""pipeline/tool""", "tool")
.format(toolList)
if (args.isEmpty) {
System.err.println(pipelineList)
System.err.println(mainUsage)
System.exit(1)
}
else if (pipelines.contains(args.head)) pipelines(args.head).main(args.tail)
else {
System.err.println("Pipeline '" + args.head + "' does not exist")
System.err.println(pipelineList)
System.exit(1)
}
def pipelineList: String = {
val pipelinesArray = for ((k,v) <- pipelines) yield k
"Available pipelines:" + pipelinesArray.mkString("\n- ", "\n- ", "\n") + "please supply a valid pipeline"
/**
 * Looks up a command by name, ignoring the case of the command's own name.
 *
 * @param q  name to look for; it is compared as-is against each lower-cased command name,
 *           so callers are expected to pass it already lower-cased
 * @param cl commands to search, in order
 * @return the first command whose lower-cased name equals `q`, or `None` if there is none
 */
def retrieveCommand(q: String, cl: List[MainCommand]): Option[MainCommand] =
  cl.find(_.name.toLowerCase == q)
// Dispatch on the leading "pipeline"/"tool" keyword. Every branch terminates the JVM through
// System.exit, and all usage/error text is written to stderr.
args match {
  // "pipeline <name> [options...]": run the named pipeline with the remaining arguments.
  case Array("pipeline", pipelineName, pipelineArgs @ _*) =>
    retrieveCommand(pipelineName.toLowerCase, pipelines) match {
      case Some(pipeline) =>
        pipeline.main(pipelineArgs.toArray)
        System.exit(0)
      case None =>
        System.err.println(s"ERROR: pipeline '$pipelineName' does not exist")
        System.err.println(pipelineUsage)
        System.exit(1)
    }
  // "pipeline" without a name: show the pipeline-specific usage.
  case Array("pipeline") =>
    System.err.println(pipelineUsage)
    System.exit(1)
  // "tool <name> [options...]": run the named tool with the remaining arguments.
  case Array("tool", toolName, toolArgs @ _*) =>
    retrieveCommand(toolName.toLowerCase, tools) match {
      case Some(tool) =>
        tool.main(toolArgs.toArray)
        System.exit(0)
      case None =>
        System.err.println(s"ERROR: tool '$toolName' does not exist")
        System.err.println(toolUsage)
        System.exit(1)
    }
  // "tool" without a name: show the tool-specific usage.
  case Array("tool") =>
    System.err.println(toolUsage)
    System.exit(1)
  // Anything else (including no arguments) is a usage error: report it on stderr like the
  // other failure branches, since the process exits with a non-zero status.
  case _ =>
    System.err.println(mainUsage)
    System.exit(1)
}
}
}
package nl.lumc.sasc.biopet.core
import org.broadinstitute.gatk.queue.util.Logging
/**
 * Base trait for commands that expose a display name and a `main` entry point.
 */
trait MainCommand extends Logging {

  /**
   * Name of this command: the simple class name, keeping only the part after the last
   * inner `$`. A trailing `$` (as found on the class of a Scala `object`) produces an empty
   * trailing piece, which `split` discards.
   */
  lazy val name: String = this.getClass.getSimpleName
    .split("\\$").last

  /**
   * Entry point of the command.
   *
   * @param args command line arguments
   */
  def main(args: Array[String]): Unit
}
package nl.lumc.sasc.biopet.core
import org.broadinstitute.gatk.queue.util.Logging
/**
 * A [[MainCommand]] whose `main` forwards to `BiopetQCommandLine`.
 */
trait PipelineCommand extends MainCommand {

  /** Value passed to `BiopetQCommandLine` right after the `-S` flag; empty unless overridden. */
  val pipeline = ""

  /**
   * Prepends `-S <pipeline>` to the given arguments and hands the result to
   * `BiopetQCommandLine.main`.
   *
   * @param args command line arguments, forwarded unchanged after the `-S` pair
   */
  def main(args: Array[String]): Unit = {
    val argv: Array[String] = Array("-S", pipeline) ++ args
    BiopetQCommandLine.main(argv)
  }
}
\ No newline at end of file
Test datasets for the Biopet Framework
======================================
* Please add an entry here when adding a new test dataset.
Filename Explanation
======== ===========
single01.sam single-end SAM file, used for testing WipeReads
single01.bam single01.sam compressed with samtools v0.1.18
single01.bam.bai index for single01.bam
single02.sam single-end SAM file, used for testing WipeReads
single02.bam single02.sam compressed with samtools v0.1.18
single02.bam.bai index for single02.bam
paired01.sam paired-end SAM file, used for testing WipeReads
paired01.bam paired01.sam compressed with samtools v0.1.18
paired01.bam.bai index for paired01.bam
@HD VN:1.0 SO:coordinate
@SQ SN:chrQ LN:10000
@RG ID:001 DS:paired-end reads SM:WipeReadsTestCase
r02 99 chrQ 50 60 10M = 90 50 TACGTACGTA EEFFGGHHII RG:Z:001
r02 147 chrQ 90 60 10M = 50 -50 ATGCATGCAT EEFFGGHHII RG:Z:001
r01 163 chrQ 150 60 10M = 190 50 AAAAAGGGGG GGGGGGGGGG RG:Z:001
r01 83 chrQ 190 60 10M = 150 -50 GGGGGAAAAA GGGGGGGGGG RG:Z:001
r01 163 chrQ 250 60 10M = 290 50 AAAAAGGGGG GGGGGGGGGG RG:Z:001
r01 83 chrQ 290 60 10M = 250 -50 GGGGGAAAAA GGGGGGGGGG RG:Z:001
r04 99 chrQ 450 60 10M = 490 50 CGTACGTACG EEFFGGHHII RG:Z:001
r04 147 chrQ 490 60 10M = 450 -50 GCATGCATGC EEFFGGHHII RG:Z:001
r03 163 chrQ 650 60 10M = 690 50 TTTTTCCCCC HHHHHHHHHH RG:Z:001
r03 83 chrQ 690 60 10M = 650 -50 CCCCCTTTTT HHHHHHHHHH RG:Z:001
r05 99 chrQ 890 60 5M200N5M = 1140 50 TACGTACGTA EEFFGGHHII RG:Z:001
r05 147 chrQ 1140 60 10M = 890 -50 ATGCATGCAT EEFFGGHHII RG:Z:001
r06 4 * 0 0 * * 0 0 ATATATATAT HIHIHIHIHI RG:Z:001
r06 4 * 0 0 * * 0 0 GCGCGCGCGC HIHIHIHIHI RG:Z:001
@HD VN:1.0 SO:coordinate
@SQ SN:chrQ LN:10000
@RG ID:001 DS:paired-end reads SM:WipeReadsTestCase
@RG ID:002 DS:paired-end reads SM:WipeReadsTestCase
r02 99 chrQ 50 60 10M = 90 50 TACGTACGTA EEFFGGHHII RG:Z:001
r02 147 chrQ 90 60 10M = 50 -50 ATGCATGCAT EEFFGGHHII RG:Z:001
r01 163 chrQ 150 30 10M = 190 50 AAAAAGGGGG GGGGGGGGGG RG:Z:002
r01 83 chrQ 190 30 10M = 150 -50 GGGGGAAAAA GGGGGGGGGG RG:Z:002
r01 163 chrQ 250 30 10M = 290 50 AAAAAGGGGG GGGGGGGGGG RG:Z:002
r01 83 chrQ 290 30 10M = 250 -50 GGGGGAAAAA GGGGGGGGGG RG:Z:002
r04 99 chrQ 450 60 10M = 490 50 CGTACGTACG EEFFGGHHII RG:Z:001
r04 147 chrQ 490 60 10M = 450 -50 GCATGCATGC EEFFGGHHII RG:Z:001
r06 4 * 0 0 * * 0 0 ATATATATAT HIHIHIHIHI RG:Z:001
r08 4 * 0 0 * * 0 0 GCGCGCGCGC HIHIHIHIHI RG:Z:002
@HD VN:1.0 SO:coordinate
@SQ SN:chrQ LN:10000
@RG ID:001 DS:paired-end reads SM:WipeReadsTestCase
r02 99 chrQ 50 60 10M = 90 50 TACGTACGTA EEFFGGHHII RG:Z:001
r02 147 chrQ 90 60 10M = 50 -50 ATGCATGCAT EEFFGGHHII RG:Z:001
r01 163 chrQ 150 60 10M = 190 50 AAAAAGGGGG GGGGGGGGGG RG:Z:001
r01 83 chrQ 190 60 10M = 150 -50 GGGGGAAAAA GGGGGGGGGG RG:Z:001
r01 163 chrQ 250 60 10M = 290 50 AAAAAGGGGG GGGGGGGGGG RG:Z:001
r01 83 chrQ 290 60 10M = 250 -50 GGGGGAAAAA GGGGGGGGGG RG:Z:001
r06 4 * 0 0 * * 0 0 ATATATATAT HIHIHIHIHI RG:Z:001
r06 4 * 0 0 * * 0 0 GCGCGCGCGC HIHIHIHIHI RG:Z:001
chrQ 290 320 rRNA01 0 +
chrQ 450 480 rRNA02 0 -
chrQ 990 1000 rRNA03 0 +
@HD VN:1.0 SO:coordinate
@SQ SN:chrQ LN:10000
@RG ID:001 DS:single-end reads SM:WipeReadsTestCase
r02 0 chrQ 50 60 10M * 0 0 TACGTACGTA EEFFGGHHII RG:Z:001
r01 16 chrQ 190 60 10M * 0 0 TACGTACGTA EEFFGGHHII RG:Z:001
r01 16 chrQ 290 60 10M * 0 0 GGGGGAAAAA GGGGGGGGGG RG:Z:001
r04 0 chrQ 450 60 10M * 0 0 CGTACGTACG EEFFGGHHII RG:Z:001
r03 16 chrQ 690 60 10M * 0 0 CCCCCTTTTT HHHHHHHHHH RG:Z:001
r05 0 chrQ 890 60 5M200N5M * 0 0 GATACGATAC FEFEFEFEFE RG:Z:001
r06 4 * 0 0 * * 0 0 ATATATATAT HIHIHIHIHI RG:Z:001
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment