diff --git a/.idea/libraries/Maven__com_baqend_bloom_filter_1_02.xml b/.idea/libraries/Maven__com_baqend_bloom_filter_1_02.xml deleted file mode 100644 index a08d14f4eb8ef4a4d8eb3ab6ef242b63e7000b28..0000000000000000000000000000000000000000 --- a/.idea/libraries/Maven__com_baqend_bloom_filter_1_02.xml +++ /dev/null @@ -1,13 +0,0 @@ -<component name="libraryTable"> - <library name="Maven: com.baqend:bloom-filter:1.02"> - <CLASSES> - <root url="jar://$MAVEN_REPOSITORY$/com/baqend/bloom-filter/1.02/bloom-filter-1.02.jar!/" /> - </CLASSES> - <JAVADOC> - <root url="jar://$MAVEN_REPOSITORY$/com/baqend/bloom-filter/1.02/bloom-filter-1.02-javadoc.jar!/" /> - </JAVADOC> - <SOURCES> - <root url="jar://$MAVEN_REPOSITORY$/com/baqend/bloom-filter/1.02/bloom-filter-1.02-sources.jar!/" /> - </SOURCES> - </library> -</component> \ No newline at end of file diff --git a/.idea/libraries/Maven__com_google_code_gson_gson_2_2_4.xml b/.idea/libraries/Maven__com_google_code_gson_gson_2_2_4.xml deleted file mode 100644 index 4533c1ba79a2f6aad70eab0b302d97e0bfb41dda..0000000000000000000000000000000000000000 --- a/.idea/libraries/Maven__com_google_code_gson_gson_2_2_4.xml +++ /dev/null @@ -1,13 +0,0 @@ -<component name="libraryTable"> - <library name="Maven: com.google.code.gson:gson:2.2.4"> - <CLASSES> - <root url="jar://$MAVEN_REPOSITORY$/com/google/code/gson/gson/2.2.4/gson-2.2.4.jar!/" /> - </CLASSES> - <JAVADOC> - <root url="jar://$MAVEN_REPOSITORY$/com/google/code/gson/gson/2.2.4/gson-2.2.4-javadoc.jar!/" /> - </JAVADOC> - <SOURCES> - <root url="jar://$MAVEN_REPOSITORY$/com/google/code/gson/gson/2.2.4/gson-2.2.4-sources.jar!/" /> - </SOURCES> - </library> -</component> \ No newline at end of file diff --git a/.idea/libraries/Maven__com_google_guava_guava_17_0.xml b/.idea/libraries/Maven__com_google_guava_guava_18_0.xml similarity index 68% rename from .idea/libraries/Maven__com_google_guava_guava_17_0.xml rename to .idea/libraries/Maven__com_google_guava_guava_18_0.xml index 2a9069ca399f4895428f9c87f5dddd978dc31766..bbd71d77e995b85a163660856a9d45a449599fcc 100644 --- a/.idea/libraries/Maven__com_google_guava_guava_17_0.xml +++ b/.idea/libraries/Maven__com_google_guava_guava_18_0.xml @@ -1,13 +1,13 @@ <component name="libraryTable"> - <library name="Maven: com.google.guava:guava:17.0"> + <library name="Maven: com.google.guava:guava:18.0"> <CLASSES> - <root url="jar://$MAVEN_REPOSITORY$/com/google/guava/guava/17.0/guava-17.0.jar!/" /> + <root url="jar://$MAVEN_REPOSITORY$/com/google/guava/guava/18.0/guava-18.0.jar!/" /> </CLASSES> <JAVADOC> - <root url="jar://$MAVEN_REPOSITORY$/com/google/guava/guava/17.0/guava-17.0-javadoc.jar!/" /> + <root url="jar://$MAVEN_REPOSITORY$/com/google/guava/guava/18.0/guava-18.0-javadoc.jar!/" /> </JAVADOC> <SOURCES> - <root url="jar://$MAVEN_REPOSITORY$/com/google/guava/guava/17.0/guava-17.0-sources.jar!/" /> + <root url="jar://$MAVEN_REPOSITORY$/com/google/guava/guava/18.0/guava-18.0-sources.jar!/" /> </SOURCES> </library> </component> \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_commons_commons_pool2_2_2.xml b/.idea/libraries/Maven__org_apache_commons_commons_pool2_2_2.xml deleted file mode 100644 index aa6a889052a2db3f805e9ec976fdce0ad9faad1d..0000000000000000000000000000000000000000 --- a/.idea/libraries/Maven__org_apache_commons_commons_pool2_2_2.xml +++ /dev/null @@ -1,13 +0,0 @@ -<component name="libraryTable"> - <library name="Maven: org.apache.commons:commons-pool2:2.2"> - <CLASSES> - <root url="jar://$MAVEN_REPOSITORY$/org/apache/commons/commons-pool2/2.2/commons-pool2-2.2.jar!/" /> - </CLASSES> - <JAVADOC> - <root url="jar://$MAVEN_REPOSITORY$/org/apache/commons/commons-pool2/2.2/commons-pool2-2.2-javadoc.jar!/" /> - </JAVADOC> - <SOURCES> - <root url="jar://$MAVEN_REPOSITORY$/org/apache/commons/commons-pool2/2.2/commons-pool2-2.2-sources.jar!/" /> - </SOURCES> - </library> -</component> \ No newline at end of file diff --git a/.idea/libraries/Maven__redis_clients_jedis_2_5_1.xml b/.idea/libraries/Maven__redis_clients_jedis_2_5_1.xml deleted file mode 100644 index 1d28363366b82a563ff8823880190ac6f4170599..0000000000000000000000000000000000000000 --- a/.idea/libraries/Maven__redis_clients_jedis_2_5_1.xml +++ /dev/null @@ -1,13 +0,0 @@ -<component name="libraryTable"> - <library name="Maven: redis.clients:jedis:2.5.1"> - <CLASSES> - <root url="jar://$MAVEN_REPOSITORY$/redis/clients/jedis/2.5.1/jedis-2.5.1.jar!/" /> - </CLASSES> - <JAVADOC> - <root url="jar://$MAVEN_REPOSITORY$/redis/clients/jedis/2.5.1/jedis-2.5.1-javadoc.jar!/" /> - </JAVADOC> - <SOURCES> - <root url="jar://$MAVEN_REPOSITORY$/redis/clients/jedis/2.5.1/jedis-2.5.1-sources.jar!/" /> - </SOURCES> - </library> -</component> \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index ccf88d1b84fcdeddbef86f014b7292d010c6e432..8a80acb0ffc07bb839ad475b6ca7b3c68aa4e03b 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -10,7 +10,7 @@ </list> </option> </component> - <component name="ProjectRootManager" version="2" languageLevel="JDK_1_6" assert-keyword="true" jdk-15="true" project-jdk-name="1.8" project-jdk-type="JavaSDK"> + <component name="ProjectRootManager" version="2" languageLevel="JDK_1_6" assert-keyword="true" jdk-15="true" project-jdk-name="1.7" project-jdk-type="JavaSDK"> <output url="file://$PROJECT_DIR$/out" /> </component> </project> diff --git a/biopet-framework/BiopetFramework.iml b/biopet-framework/BiopetFramework.iml index fbfeb2328b7a84e67339ad5ee6fe59808f0f55ad..576d75a0e85692d1617394c0fee61366299e856c 100644 --- a/biopet-framework/BiopetFramework.iml +++ b/biopet-framework/BiopetFramework.iml @@ -21,6 +21,7 @@ <sourceFolder url="file://$MODULE_DIR$/src/main/scala" isTestSource="false" /> <sourceFolder url="file://$MODULE_DIR$/src/test/scala" isTestSource="true" /> <sourceFolder url="file://$MODULE_DIR$/src/main/scripts" type="java-resource" /> + <sourceFolder url="file://$MODULE_DIR$/src/main/resources" type="java-resource" /> <sourceFolder url="file://$MODULE_DIR$/src/test/resources" type="java-test-resource" /> <excludeFolder url="file://$MODULE_DIR$/target" /> </content> @@ -46,12 +47,8 @@ <orderEntry type="library" name="Maven: com.github.julien-truffaut:monocle-core_2.11:0.5.0" level="project" /> <orderEntry type="library" name="Maven: org.biojava:biojava3-core:3.1.0" level="project" /> <orderEntry type="library" name="Maven: org.biojava:biojava3-sequencing:3.1.0" level="project" /> - <orderEntry type="library" name="Maven: com.google.guava:guava:17.0" level="project" /> - <orderEntry type="library" name="Maven: com.baqend:bloom-filter:1.02" level="project" /> - <orderEntry type="library" name="Maven: redis.clients:jedis:2.5.1" level="project" /> - <orderEntry type="library" name="Maven: org.apache.commons:commons-pool2:2.2" level="project" /> - <orderEntry type="library" name="Maven: com.google.code.gson:gson:2.2.4" level="project" /> + <orderEntry type="library" name="Maven: com.google.guava:guava:18.0" level="project" /> <orderEntry type="library" name="Maven: com.github.scopt:scopt_2.10:3.2.0" level="project" /> <orderEntry type="library" name="Maven: org.mockito:mockito-all:1.9.5" level="project" /> </component> -</module> \ No newline at end of file +</module> diff --git a/biopet-framework/pom.xml b/biopet-framework/pom.xml index 9754678dbe04a2504189bee889872488d5f09b98..f116688f95fc697f13afafa47dd191f347679c45 100644 --- a/biopet-framework/pom.xml +++ b/biopet-framework/pom.xml @@ -24,10 +24,6 @@ <name>BioJava repository</name> <url>http://www.biojava.org/download/maven/</url> </repository> - <repository> - <id>orestes-bloom-filter</id> - <url>https://raw.githubusercontent.com/Baqend/Orestes-Bloomfilter/master/maven-repo</url> - </repository> </repositories> <dependencies> <dependency> @@ -67,9 +63,9 @@ <version>3.1.0</version> </dependency> <dependency> - <groupId>com.baqend</groupId> - <artifactId>bloom-filter</artifactId> - <version>1.02</version> + <groupId>com.google.guava</groupId> + <artifactId>guava</artifactId> + <version>18.0</version> </dependency> <dependency> <groupId>com.github.scopt</groupId> @@ -114,7 +110,7 @@ <prefix>git</prefix> <dateFormat>dd.MM.yyyy '@' HH:mm:ss z</dateFormat> <verbose>false</verbose> - <useNativeGit>false</useNativeGit> + <useNativeGit>true</useNativeGit> <dotGitDirectory>${project.basedir}/../.git</dotGitDirectory> <skipPoms>false</skipPoms> <generateGitPropertiesFile>true</generateGitPropertiesFile> diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutable.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutable.scala index 172e12d2bcd192a5ad5e1c6953cdf38312735964..80308d9f6393c38a5fb278c2fba39fb67fc2f6fe 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutable.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutable.scala @@ -29,7 +29,11 @@ object BiopetExecutable { nl.lumc.sasc.biopet.tools.FindRepeatsPacBio, nl.lumc.sasc.biopet.tools.BedToInterval, nl.lumc.sasc.biopet.tools.MpileupToVcf, - nl.lumc.sasc.biopet.tools.FastqSplitter) + nl.lumc.sasc.biopet.tools.FastqSplitter, + nl.lumc.sasc.biopet.tools.BedtoolsCoverageToCounts, + nl.lumc.sasc.biopet.tools.SageCountFastq, + nl.lumc.sasc.biopet.tools.SageCreateLibrary, + nl.lumc.sasc.biopet.tools.SageCreateTagCounts) ) /** @@ -38,7 +42,7 @@ object BiopetExecutable { def main(args: Array[String]): Unit = { def toBulletedList(m: List[MainCommand], kind: String = "", bullet: String = "-") = - "Available %ss:\n ".format(kind) + bullet + " " + m.map(x => x.commandName).sorted.mkString("\n " + bullet + " ") + "Available %s(s):\n ".format(kind) + bullet + " " + m.map(x => x.commandName).sorted.mkString("\n " + bullet + " ") def usage(module: String = null): String = { if (module != null) checkModule(module) diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/Breakdancer.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/Breakdancer.scala index 7e1d70a98f4ab7866af10def3c097cb96b9b5de5..8c6cbb13b730f4ad12e9db86aa310f5dbb0af76c 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/Breakdancer.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/Breakdancer.scala @@ -8,7 +8,6 @@ import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument } import java.io.File - class BreakdancerConfig(val root: Configurable) extends BiopetCommandLineFunction { executable = config("exe", default = "bam2cfg.pl", freeVar = false) @@ -23,22 +22,22 @@ class BreakdancerConfig(val root: Configurable) extends BiopetCommandLineFunctio var min_insertsize: Option[Int] = config("min_insertsize", default = 450) var solid_data: Boolean = config("solid", default = false) var sd_cutoff: Option[Int] = config("sd_cutoff", default = 4) // Cutoff in unit of standard deviation [4] - + // we set this to a higher number to avoid biases in small numbers in sorted bams var min_observations: Option[Int] = config("min_observations", default = 10000) // Number of observation required to estimate mean and s.d. insert size [10_000] var coefvar_cutoff: Option[Int] = config("coef_cutoff", default = 1) // Cutoff on coefficients of variation [1] var histogram_bins: Option[Int] = config("histogram_bins", default = 50) // Number of bins in the histogram [50] - + def cmdLine = required(executable) + - optional("-q", min_mq) + - conditional(use_mq, "-m") + - optional("-s", min_insertsize) + - conditional(solid_data, "-s") + - optional("-c", sd_cutoff) + - optional("-n", min_observations) + - optional("-v", coefvar_cutoff) + - optional("-b", histogram_bins) + - required(input) + " 1> " + required(output) + optional("-q", min_mq) + + conditional(use_mq, "-m") + + optional("-s", min_insertsize) + + conditional(solid_data, "-s") + + optional("-c", sd_cutoff) + + optional("-n", min_observations) + + optional("-v", coefvar_cutoff) + + optional("-b", histogram_bins) + + required(input) + " 1> " + required(output) } object BreakdancerConfig { @@ -62,35 +61,30 @@ object BreakdancerConfig { private def swapExtension(inputFile: String) = inputFile.substring(0, inputFile.lastIndexOf(".bam")) + ".breakdancer.cfg" } - - - /* * The caller * **/ - -class BreakdancerCaller(val root: Configurable) extends BiopetCommandLineFunction { +class BreakdancerCaller(val root: Configurable) extends BiopetCommandLineFunction { executable = config("exe", default = "breakdancer-max", freeVar = false) - - override val defaultVmem = "4G" + + override val defaultVmem = "6G" override val defaultThreads = 1 // breakdancer can only work on 1 single thread - + override val versionRegex = """.*[Vv]ersion:? (.*)""".r override val versionExitcode = List(1) override def versionCommand = executable - - + @Input(doc = "The breakdancer configuration file") var input: File = _ - -// @Argument(doc = "Work directory") -// var workdir: String = _ - - @Output(doc = "Breakdancer VCF output") + + // @Argument(doc = "Work directory") + // var workdir: String = _ + + @Output(doc = "Breakdancer TSV output") var output: File = _ - + /* Options: -o STRING operate on a single chromosome [all chromosome] @@ -116,36 +110,36 @@ class BreakdancerCaller(val root: Configurable) extends BiopetCommandLineFunctio var q: Option[Int] = config("qs", default = 35) var r: Option[Int] = config("r", default = 2) var x: Option[Int] = config("x", default = 1000) - var b: Option[Int] = config("b", default = 100) + var b: Option[Int] = config("b", default = 100) var t: Boolean = config("t", default = false) var d: String = config("d") var g: String = config("g") var l: Boolean = config("l", default = false) var a: Boolean = config("a", default = false) var h: Boolean = config("h", default = false) - var y: Option[Int] = config("y", default = 30) - + var y: Option[Int] = config("y", default = 30) + override def beforeCmd { } - def cmdLine = required(executable) + - optional("-s", s) + - optional("-c", c) + - optional("-m", m) + - optional("-q", q) + - optional("-r", r) + - optional("-x", x) + - optional("-b", b) + - conditional(t ,"-t") + - optional("-d", d) + - optional("-g", g) + - conditional(l ,"-l") + - conditional(a ,"-a") + - conditional(h ,"-h") + - optional("-y", y) + - required(input) + - ">" + - required(output) + def cmdLine = required(executable) + + optional("-s", s) + + optional("-c", c) + + optional("-m", m) + + optional("-q", q) + + optional("-r", r) + + optional("-x", x) + + optional("-b", b) + + conditional(t, "-t") + + optional("-d", d) + + optional("-g", g) + + conditional(l, "-l") + + conditional(a, "-a") + + conditional(h, "-h") + + optional("-y", y) + + required(input) + + ">" + + required(output) } object BreakdancerCaller { @@ -160,21 +154,18 @@ object BreakdancerCaller { /// Breakdancer is actually a mini pipeline executing binaries from the breakdancer package class Breakdancer(val root: Configurable) extends QScript with BiopetQScript { def this() = this(null) - + @Input(doc = "Input file (bam)") var input: File = _ @Input(doc = "Reference Fasta file") var reference: File = _ - + @Argument(doc = "Work directory") var workdir: String = _ - -// @Output(doc = "Breakdancer VCF output") -// lazy val outputvcf: File = { -// new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.vcf") -// } - + + var deps: List[File] = Nil + @Output(doc = "Breakdancer config") lazy val configfile: File = { new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.cfg") @@ -183,32 +174,35 @@ class Breakdancer(val root: Configurable) extends QScript with BiopetQScript { lazy val outputraw: File = { new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.tsv") } - + @Output(doc = "Breakdancer VCF output") + lazy val outputvcf: File = { + new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.vcf") + } + override def init() { } def biopetScript() { // read config and set all parameters for the pipeline logger.info("Starting Breakdancer configuration") - + val bdcfg = BreakdancerConfig(this, input, this.configfile) - outputFiles += ("breakdancer_cfg" -> bdcfg.output ) - add( bdcfg ) - - val output_tsv: File = this.outputraw - val breakdancer = BreakdancerCaller( this, bdcfg.output, output_tsv ) - add( breakdancer ) - outputFiles += ("breakdancer_tsv" -> breakdancer.output ) - -// val output_vcf: File = this.outputvcf - // convert this tsv to vcf using the python script - - + bdcfg.deps = this.deps + outputFiles += ("cfg" -> bdcfg.output) + add(bdcfg) + + val breakdancer = BreakdancerCaller(this, bdcfg.output, this.outputraw) + add(breakdancer) + outputFiles += ("tsv" -> breakdancer.output) + + val bdvcf = BreakdancerVCF(this, breakdancer.output, this.outputvcf) + add(bdvcf) + outputFiles += ("vcf" -> bdvcf.output) } } object Breakdancer extends PipelineCommand { - def apply(root: Configurable, input: File, reference: File, runDir: String): Breakdancer = { + def apply(root: Configurable, input: File, reference: File, runDir: String): Breakdancer = { val breakdancer = new Breakdancer(root) breakdancer.input = input breakdancer.reference = reference diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/BreakdancerVCF.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/BreakdancerVCF.scala new file mode 100644 index 0000000000000000000000000000000000000000..6869a590dbe05f3f6743ea05c61257321ecf72ae --- /dev/null +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/BreakdancerVCF.scala @@ -0,0 +1,36 @@ +package nl.lumc.sasc.biopet.extensions.svcallers + +import java.io.File + +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } + +import argonaut._, Argonaut._ +import scalaz._, Scalaz._ + +import nl.lumc.sasc.biopet.core.config.Configurable +import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction + +class BreakdancerVCF(val root: Configurable) extends PythonCommandLineFunction { + setPythonScript("breakdancer2vcf.py") + + @Input(doc = "Breakdancer TSV") + var input: File = _ + + @Output(doc = "Output VCF to PATH") + var output: File = _ + + def cmdLine = { + getPythonCommand + + "-i " + required(input) + + "-o " + required(output) + } +} + +object BreakdancerVCF { + def apply(root: Configurable, input: File, output: File): BreakdancerVCF = { + val bd = new BreakdancerVCF(root) + bd.input = input + bd.output = output + return bd + } +} \ No newline at end of file diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala index e501a3cf10d0e0a9a3c3e77f2f20f6e49f27bf90..36da78d18051e41b59085f98340a0575ef9ea89c 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala @@ -8,7 +8,11 @@ import nl.lumc.sasc.biopet.extensions.picard.MergeSamFiles import nl.lumc.sasc.biopet.pipelines.flexiprep.Flexiprep import nl.lumc.sasc.biopet.pipelines.mapping.Mapping import nl.lumc.sasc.biopet.scripts.PrefixFastq +import nl.lumc.sasc.biopet.tools.BedtoolsCoverageToCounts import nl.lumc.sasc.biopet.scripts.SquishBed +import nl.lumc.sasc.biopet.tools.SageCountFastq +import nl.lumc.sasc.biopet.tools.SageCreateLibrary +import nl.lumc.sasc.biopet.tools.SageCreateTagCounts import org.broadinstitute.gatk.queue.QScript import org.broadinstitute.gatk.queue.function._ @@ -64,7 +68,7 @@ class Sage(val root: Configurable) extends QScript with MultiSampleQScript { } if (tagsLibrary == null) { - val cdl = new CreateDeepsageLibrary(this) + val cdl = new SageCreateLibrary(this) cdl.input = transcriptome cdl.output = outputDir + "taglib/tag.lib" cdl.noAntiTagsOutput = outputDir + "taglib/no_antisense_genes.txt" @@ -181,12 +185,12 @@ class Sage(val root: Configurable) extends QScript with MultiSampleQScript { } def addTablibCounts(fastq:File, outputPrefix: String, outputDir: String) { - val countFastq = new CountFastq(this) + val countFastq = new SageCountFastq(this) countFastq.input = fastq countFastq.output = outputDir + outputPrefix + ".raw.counts" add(countFastq) - val createTagCounts = new CreateTagCounts(this) + val createTagCounts = new SageCreateTagCounts(this) createTagCounts.input = countFastq.output createTagCounts.tagLib = tagsLibrary createTagCounts.countSense = outputDir + outputPrefix + ".tagcount.sense.counts" diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/yamsvp/Yamsvp.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/yamsvp/Yamsvp.scala index 3c00eb3cc86d1c1858dd7f1641c9cc7d898cb7d2..16460f0fd66978f98ba79268cbf93e2a10425e00 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/yamsvp/Yamsvp.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/yamsvp/Yamsvp.scala @@ -8,7 +8,6 @@ import nl.lumc.sasc.biopet.core.config.Configurable import nl.lumc.sasc.biopet.core.MultiSampleQScript import nl.lumc.sasc.biopet.core.PipelineCommand -import nl.lumc.sasc.biopet.extensions.picard.MergeSamFiles import nl.lumc.sasc.biopet.extensions.sambamba.{ SambambaIndex, SambambaMerge } import nl.lumc.sasc.biopet.extensions.svcallers.pindel.Pindel import nl.lumc.sasc.biopet.extensions.svcallers.{ Breakdancer, Clever } @@ -84,24 +83,24 @@ class Yamsvp(val root: Configurable) extends QScript with MultiSampleQScript { mergeSamFiles.input = libraryBamfiles mergeSamFiles.output = alignmentDir + sampleID + ".merged.bam" add(mergeSamFiles) - - val bamIndex = SambambaIndex(root, mergeSamFiles.output) - add(bamIndex) - mergeSamFiles.output } else null + + val bamIndex = SambambaIndex(root, bamFile) + add(bamIndex) /// bamfile will be used as input for the SV callers. First run Clever // val cleverVCF : File = sampleDir + "/" + sampleID + ".clever.vcf" val cleverDir = svcallingDir + sampleID + ".clever/" val clever = Clever(this, bamFile, this.reference, svcallingDir, cleverDir) + clever.deps = List(bamIndex.output) sampleOutput.vcf += ("clever" -> List(clever.outputvcf)) add(clever) val breakdancerDir = svcallingDir + sampleID + ".breakdancer/" val breakdancer = Breakdancer(this, bamFile, this.reference, breakdancerDir) - sampleOutput.vcf += ("breakdancer" -> List(breakdancer.outputraw)) + sampleOutput.vcf += ("breakdancer" -> List(breakdancer.outputvcf)) addAll(breakdancer.functions) // for pindel we should use per library config collected into one config file diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/BedToInterval.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/BedToInterval.scala index 0a8cbdb3accf11ef641708b5a2dd4ac44f910467..550853dff22d871690bb71631b80d6c55dc19698 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/BedToInterval.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/BedToInterval.scala @@ -25,7 +25,7 @@ class BedToInterval(val root: Configurable) extends BiopetJavaCommandLineFunctio override val defaultVmem = "8G" memoryLimit = Option(4.0) - override def commandLine = super.commandLine + required(input) + required(bamFile) + required(output) + override def commandLine = super.commandLine + required("-I", input) + required("-b", bamFile) + required("-o", output) } object BedToInterval extends ToolCommand { @@ -45,13 +45,15 @@ object BedToInterval extends ToolCommand { return bedToInterval } - case class Args (inputFile:File = null, outputFile:File = null) extends AbstractArgs + case class Args (inputFile:File = null, outputFile:File = null, bamFile:File = null) extends AbstractArgs class OptParser extends AbstractOptParser { opt[File]('I', "inputFile") required() valueName("<file>") action { (x, c) => - c.copy(inputFile = x) } text("out is a required file property") + c.copy(inputFile = x) } opt[File]('o', "output") required() valueName("<file>") action { (x, c) => - c.copy(outputFile = x) } text("out is a required file property") + c.copy(outputFile = x) } + opt[File]('b', "bam") required() valueName("<file>") action { (x, c) => + c.copy(bamFile = x) } } /** @@ -63,7 +65,7 @@ object BedToInterval extends ToolCommand { val writer = new PrintWriter(commandArgs.outputFile) - val inputSam = new SAMFileReader(commandArgs.inputFile) + val inputSam = new SAMFileReader(commandArgs.bamFile) val refs = for (SQ <- inputSam.getFileHeader.getSequenceDictionary.getSequences.toArray) yield { val record = SQ.asInstanceOf[SAMSequenceRecord] writer.write("@SQ\tSN:" + record.getSequenceName + "\tLN:" + record.getSequenceLength + "\n") @@ -72,7 +74,7 @@ object BedToInterval extends ToolCommand { inputSam.close val refsMap = Map(refs:_*) - val bedFile = Source.fromFile(args(0)) + val bedFile = Source.fromFile(commandArgs.inputFile) for ( line <- bedFile.getLines; val split = line.split("\t") diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/BedtoolsCoverageToCounts.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/BedtoolsCoverageToCounts.scala similarity index 62% rename from biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/BedtoolsCoverageToCounts.scala rename to biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/BedtoolsCoverageToCounts.scala index 439febc7dd5e00001d960eb1845441d92751ee21..004d322caabfb8fe0dd24ad793b14b141371d30b 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/BedtoolsCoverageToCounts.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/BedtoolsCoverageToCounts.scala @@ -1,8 +1,9 @@ -package nl.lumc.sasc.biopet.pipelines.sage +package nl.lumc.sasc.biopet.tools import java.io.File import java.io.PrintWriter import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction +import nl.lumc.sasc.biopet.core.ToolCommand import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.utils.commandline.{ Input, Output } import scala.collection.JavaConversions._ @@ -27,26 +28,27 @@ class BedtoolsCoverageToCounts(val root: Configurable) extends BiopetJavaCommand required("-o", output) } -object BedtoolsCoverageToCounts { - var input: File = _ - var output: File = _ +object BedtoolsCoverageToCounts extends ToolCommand { + case class Args (input:File = null, output:File = null) extends AbstractArgs + + class OptParser extends AbstractOptParser { + opt[File]('I', "input") required() valueName("<file>") action { (x, c) => + c.copy(input = x) } + opt[File]('o', "output") required() unbounded() valueName("<file>") action { (x, c) => + c.copy(output = x) } + } /** * @param args the command line arguments */ def main(args: Array[String]): Unit = { - for (t <- 0 until args.size) { - args(t) match { - case "-I" => input = new File(args(t+1)) - case "-o" => output = new File(args(t+1)) - case _ => - } - } - if (input == null || !input.exists) throw new IllegalStateException("Input file not found, use -I") - if (output == null) throw new IllegalStateException("Output file not found, use -o") + val argsParser = new OptParser + val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1) + + if (!commandArgs.input.exists) throw new IllegalStateException("Input file not found, file: " + commandArgs.input) val counts:Map[String, Long] = Map() - for (line <- Source.fromFile(input).getLines) { + for (line <- Source.fromFile(commandArgs.input).getLines) { val values = line.split("\t") val gene = values(3) val count = values(6).toLong @@ -56,7 +58,7 @@ object BedtoolsCoverageToCounts { val sortedCounts:SortedMap[String, Long] = SortedMap(counts.toArray:_*) - val writer = new PrintWriter(output) + val writer = new PrintWriter(commandArgs.output) for ((seq,count) <- sortedCounts) { if (count > 0) writer.println(seq + "\t" + count) } diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/CountFastq.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SageCountFastq.scala similarity index 58% rename from biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/CountFastq.scala rename to biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SageCountFastq.scala index e8f57b25014b87921efb4cd83ca3f49260113f06..a07cc4b8bb8eb73aeb00d1dbc2b2880b761b6acc 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/CountFastq.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SageCountFastq.scala @@ -1,8 +1,9 @@ -package nl.lumc.sasc.biopet.pipelines.sage +package nl.lumc.sasc.biopet.tools import java.io.File import java.io.PrintWriter import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction +import nl.lumc.sasc.biopet.core.ToolCommand import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.utils.commandline.{ Input, Output } import org.biojava3.sequencing.io.fastq.{SangerFastqReader, StreamListener, Fastq} @@ -11,7 +12,7 @@ import scala.collection.SortedMap import scala.collection.mutable.Map import java.io.FileReader -class CountFastq(val root: Configurable) extends BiopetJavaCommandLineFunction { +class SageCountFastq(val root: Configurable) extends BiopetJavaCommandLineFunction { javaMainClass = getClass.getName @Input(doc = "Input fasta", shortName = "input", required = true) @@ -28,29 +29,30 @@ class CountFastq(val root: Configurable) extends BiopetJavaCommandLineFunction { required("-o", output) } -object CountFastq { - var input: File = _ - var output: File = _ +object SageCountFastq extends ToolCommand { + case class Args (input:File = null, output:File = null) extends AbstractArgs + + class OptParser extends AbstractOptParser { + opt[File]('I', "input") required() valueName("<file>") action { (x, c) => + c.copy(input = x) } + opt[File]('o', "output") required() unbounded() valueName("<file>") action { (x, c) => + c.copy(output = x) } + } /** * @param args the command line arguments */ def main(args: Array[String]): Unit = { - for (t <- 0 until args.size) { - args(t) match { - case "-I" => input = new File(args(t+1)) - case "-o" => output = new File(args(t+1)) - case _ => - } - } - if (input == null || !input.exists) throw new IllegalStateException("Input file not found, use -I") - if (output == null) throw new IllegalStateException("Output file not found, use -o") + val argsParser = new OptParser + val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1) + + if (!commandArgs.input.exists) throw new IllegalStateException("Input file not found, file: " + commandArgs.input) val counts:Map[String, Long] = Map() val reader = new SangerFastqReader var count = 0 - System.err.println("Reading fastq file: " + input) - val fileReader = new FileReader(input) + logger.info("Reading fastq file: " + commandArgs.input) + val fileReader = new FileReader(commandArgs.input) reader.stream(fileReader, new StreamListener { def fastq(fastq:Fastq) { val seq = fastq.getSequence @@ -60,13 +62,13 @@ object CountFastq { if (count % 1000000 == 0) System.err.println(count + " sequences done") } }) - System.err.println(count + " sequences done") + logger.info(count + " sequences done") - System.err.println("Sorting") + logger.info("Sorting") val sortedCounts:SortedMap[String, Long] = SortedMap(counts.toArray:_*) - System.err.println("Writting outputfile: " + output) - val writer = new PrintWriter(output) + logger.info("Writting outputfile: " + commandArgs.output) + val writer = new PrintWriter(commandArgs.output) for ((seq,count) <- sortedCounts) { writer.println(seq + "\t" + count) } diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/CreateDeepsageLibrary.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SageCreateLibrary.scala similarity index 68% rename from biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/CreateDeepsageLibrary.scala rename to biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SageCreateLibrary.scala index 7ed2fb0e89e3f9eeaca81e8d7481654feadf3b69..b339ab2471e756e654fbbe621e15a1ac7f5a36cb 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/CreateDeepsageLibrary.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SageCreateLibrary.scala @@ -1,8 +1,9 @@ -package nl.lumc.sasc.biopet.pipelines.sage +package nl.lumc.sasc.biopet.tools import java.io.File import java.io.PrintWriter import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction +import nl.lumc.sasc.biopet.core.ToolCommand import nl.lumc.sasc.biopet.core.config.Configurable import org.biojava3.core.sequence.DNASequence import org.biojava3.core.sequence.io.FastaReaderHelper @@ -10,8 +11,9 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Output } import scala.collection.SortedMap import scala.collection.mutable.{Map, Set} import scala.collection.JavaConversions._ +import scala.util.matching.Regex -class CreateDeepsageLibrary(val root: Configurable) extends BiopetJavaCommandLineFunction { +class SageCreateLibrary(val root: Configurable) extends BiopetJavaCommandLineFunction { javaMainClass = getClass.getName @Input(doc = "Input fasta", shortName = "input", required = true) @@ -37,22 +39,35 @@ class CreateDeepsageLibrary(val root: Configurable) extends BiopetJavaCommandLin override def commandLine = super.commandLine + required("-I", input) + - optional("-tag", tag) + - optional("-length", length) + - optional("-notag", noTagsOutput) + - optional("-noantitag", noAntiTagsOutput) + + optional("--tag", tag) + + optional("--length", length) + + optional("--notag", noTagsOutput) + + optional("--noantitag", noAntiTagsOutput) + required("-o", output) } -object CreateDeepsageLibrary { - var tag = "CATG" - var length = 17 - var input: File = _ - var noTagsOutput: File = _ - var noAntiTagsOutput: File = _ - var allGenesOutput: File = _ - var output: File = _ - lazy val tagRegex = (tag + "[CATG]{" + length + "}").r +object SageCreateLibrary extends ToolCommand { + case class Args (input:File = null, tag:String = "CATG", length:Int = 17,output:File = null, noTagsOutput:File = null, + noAntiTagsOutput:File = null, allGenesOutput:File = null) extends AbstractArgs + + class OptParser extends AbstractOptParser { + opt[File]('I', "input") required() unbounded() valueName("<file>") action { (x, c) => + c.copy(input = x) } + opt[File]('o', "output") required() unbounded() valueName("<file>") action { (x, c) => + c.copy(output = x) } + opt[String]("tag") required() unbounded() action { (x, c) => + c.copy(tag = x) } + opt[Int]("length") required() unbounded() action { (x, c) => + c.copy(length = x) } + opt[File]("noTagsOutput") required() unbounded() valueName("<file>") action { (x, c) => + c.copy(noTagsOutput = x) } + opt[File]("noAntiTagsOutput") required() unbounded() valueName("<file>") action { (x, c) => + c.copy(noAntiTagsOutput = x) } + opt[File]("allGenesOutput") required() unbounded() valueName("<file>") action { (x, c) => + c.copy(allGenesOutput = x) } + } + + var tagRegex: Regex = null var geneRegex = """ENSG[0-9]{11}""".r val tagGenesMap: Map[String, TagGenes] = Map() @@ -73,24 +88,16 @@ object CreateDeepsageLibrary { * @param args the command line arguments */ def main(args: Array[String]): Unit = { - for (t <- 0 until args.size) { - args(t) match { - case "-I" => input = new File(args(t+1)) - case "-tag" => tag = args(t+1) - case "-length" => length = args(t+1).toInt - case "-o" => output = new File(args(t+1)) - case "-notag" => noTagsOutput = new File(args(t+1)) - case "-noantitag" => noAntiTagsOutput = new File(args(t+1)) - case "-allgenes" => allGenesOutput = new File(args(t+1)) - case _ => - } - } - if (input == null || !input.exists) throw new IllegalStateException("Input file not found, use -I") - if (output == null) throw new IllegalStateException("Output file not found, use -o") + val argsParser = new OptParser + val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1) + + if (!commandArgs.input.exists) throw new IllegalStateException("Input file not found, file: " + commandArgs.input) + + tagRegex = (commandArgs.tag + "[CATG]{" + commandArgs.length + "}").r var count = 0 System.err.println("Reading fasta file") - val reader = FastaReaderHelper.readFastaDNASequence(input) + val reader = FastaReaderHelper.readFastaDNASequence(commandArgs.input) System.err.println("Finding tags") for ((name, seq) <- reader) { getTags(name, seq) @@ -103,7 +110,7 @@ object CreateDeepsageLibrary { val tagGenesMapSorted:SortedMap[String, TagGenes] = SortedMap(tagGenesMap.toArray:_*) System.err.println("Writting output files") - val writer = new PrintWriter(output) + val writer = new PrintWriter(commandArgs.output) writer.println("#tag\tfirstTag\tAllTags\tFirstAntiTag\tAllAntiTags") for ((tag, genes) <- tagGenesMapSorted) { val line = tag + "\t" + genes.firstTag.mkString(",") + @@ -114,24 +121,24 @@ object CreateDeepsageLibrary { } writer.close() - if (noTagsOutput != null) { - val writer = new PrintWriter(noTagsOutput) + if (commandArgs.noTagsOutput != null) { + val writer = new PrintWriter(commandArgs.noTagsOutput) for (gene <- allGenes if !tagGenes.contains(gene)) { writer.println(gene) } writer.close } - if (noAntiTagsOutput != null) { - val writer = new PrintWriter(noAntiTagsOutput) + if (commandArgs.noAntiTagsOutput != null) { + val writer = new PrintWriter(commandArgs.noAntiTagsOutput) for (gene <- allGenes if !antiTagGenes.contains(gene)) { writer.println(gene) } writer.close } - if (allGenesOutput != null) { - val writer = new PrintWriter(allGenesOutput) + if (commandArgs.allGenesOutput != null) { + val writer = new PrintWriter(commandArgs.allGenesOutput) for (gene <- allGenes) { writer.println(gene) } diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/CreateTagCounts.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SageCreateTagCounts.scala similarity index 63% rename from biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/CreateTagCounts.scala rename to biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SageCreateTagCounts.scala index 0cf113051596c5ae8f7a8cf90897bf864daacb54..1bfc6e0f8606c3ed14576791f9ac50a8cfa99b04 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/CreateTagCounts.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SageCreateTagCounts.scala @@ -1,15 +1,16 @@ -package nl.lumc.sasc.biopet.pipelines.sage +package nl.lumc.sasc.biopet.tools import java.io.File import java.io.PrintWriter import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction +import nl.lumc.sasc.biopet.core.ToolCommand import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.utils.commandline.{ Input, Output } import scala.io.Source import scala.collection.mutable.Map import scala.collection.SortedMap -class CreateTagCounts(val root: Configurable) extends BiopetJavaCommandLineFunction { +class SageCreateTagCounts(val root: Configurable) extends BiopetJavaCommandLineFunction { javaMainClass = getClass.getName @Input(doc = "Raw count file", shortName = "input", required = true) @@ -35,42 +36,43 @@ class CreateTagCounts(val root: Configurable) extends BiopetJavaCommandLineFunct override def commandLine = super.commandLine + required("-I", input) + - required("-taglib", tagLib) + - optional("-sense", countSense) + - optional("-allsense", countAllSense) + - optional("-antisense", countAntiSense) + - optional("-allantisense", countAllAntiSense) + required("--taglib", tagLib) + + optional("--sense", countSense) + + optional("--allsense", countAllSense) + + optional("--antisense", countAntiSense) + + optional("--allantisense", countAllAntiSense) } -object CreateTagCounts { - var input: File = _ - var tagLib: File = _ - var countSense: File = _ - var countAllSense: File = _ - var countAntiSense: File = _ - var countAllAntiSense: File = _ +object SageCreateTagCounts extends ToolCommand { + case class Args (input:File = null, tagLib:File = null, countSense:File = null, countAllSense:File = null, + countAntiSense:File = null, countAllAntiSense:File = null) extends AbstractArgs + + class OptParser extends AbstractOptParser { + opt[File]('I', "input") required() unbounded() valueName("<file>") action { (x, c) => + c.copy(input = x) } + opt[File]('t', "tagLib") required() unbounded() valueName("<file>") action { (x, c) => + c.copy(tagLib = x) } + opt[File]("countSense") unbounded() valueName("<file>") action { (x, c) => + c.copy(countSense = x) } + opt[File]("countAllSense") unbounded() valueName("<file>") action { (x, c) => + c.copy(countAllSense = x) } + opt[File]("countAntiSense") unbounded() valueName("<file>") action { (x, c) => + c.copy(countAntiSense = x) } + opt[File]("countAllAntiSense") unbounded() valueName("<file>") action { (x, c) => + c.copy(countAllAntiSense = x) } + } /** * @param args the command line arguments */ def main(args: Array[String]): Unit = { + val argsParser = new OptParser + val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1) - for (t <- 0 until args.size) { - args(t) match { - case "-I" => input = new File(args(t+1)) - case "-taglib" => tagLib = new File(args(t+1)) - case "-sense" => countSense = new File(args(t+1)) - case "-allsense" => countAllSense = new File(args(t+1)) - case "-antisense" => countAntiSense = new File(args(t+1)) - case "-allantisense" => countAllAntiSense = new File(args(t+1)) - case _ => - } - } - if (input == null || !input.exists) throw new IllegalStateException("Input file not found, use -I") - if (tagLib == null) throw new IllegalStateException("Output file not found, use -o") + if (!commandArgs.input.exists) throw new IllegalStateException("Input file not found, file: " + commandArgs.input) val rawCounts: Map[String, Long] = Map() - for (line <- Source.fromFile(input).getLines) { + for (line <- Source.fromFile(commandArgs.input).getLines) { val values = line.split("\t") val gene = values(0) val count = values(1).toLong @@ -83,7 +85,7 @@ object CreateTagCounts { val antiSenseCounts: Map[String, Long] = Map() val allAntiSenseCounts: Map[String, Long] = Map() - for (line <- Source.fromFile(tagLib).getLines if !line.startsWith("#")) { + for (line <- Source.fromFile(commandArgs.tagLib).getLines if !line.startsWith("#")) { val values = line.split("\t") val tag = values(0) val sense = values(1) @@ -126,9 +128,9 @@ object CreateTagCounts { writer.close } } - writeFile(countSense, senseCounts) - writeFile(countAllSense, allSenseCounts) - writeFile(countAntiSense, antiSenseCounts) - writeFile(countAllAntiSense, allAntiSenseCounts) + writeFile(commandArgs.countSense, senseCounts) + writeFile(commandArgs.countAllSense, allSenseCounts) + writeFile(commandArgs.countAntiSense, antiSenseCounts) + writeFile(commandArgs.countAllAntiSense, allAntiSenseCounts) } } \ No newline at end of file diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala index a990bab67fb7dc2ed67ad91304e74814fb9ba5cc..d131692da08156dcc7b4fd085e71cb791bfca2c0 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/WipeReads.scala @@ -4,10 +4,11 @@ */ package nl.lumc.sasc.biopet.tools -import java.io.{ File, IOException } +import java.io.File import scala.collection.JavaConverters._ +import com.google.common.hash.{Funnel, BloomFilter, PrimitiveSink} import htsjdk.samtools.AlignmentBlock import htsjdk.samtools.SAMFileReader import htsjdk.samtools.QueryInterval @@ -19,8 +20,6 @@ import htsjdk.tribble.Feature import htsjdk.tribble.BasicFeature import htsjdk.tribble.bed.BEDCodec import htsjdk.tribble.index.interval.{ Interval, IntervalTree } -import orestes.bloomfilter.HashProvider.HashMethod -import orestes.bloomfilter.{ BloomFilter, FilterBuilder } import org.apache.commons.io.FilenameUtils.getExtension import org.broadinstitute.gatk.utils.commandline.{ Input, Output } @@ -39,10 +38,10 @@ class WipeReads(val root: Configurable) extends BiopetJavaCommandLineFunction { javaMainClass = getClass.getName @Input(doc = "Input BAM file (must be indexed)", shortName = "I", required = true) - var inputBAM: File = _ + var inputBam: File = _ @Output(doc = "Output BAM", shortName = "o", required = true) - var outputBAM: File = _ + var outputBam: File = _ } @@ -62,8 +61,10 @@ object WipeReads extends ToolCommand { else feat1.getStart <= feat2.getStart && feat1.getEnd >= feat2.getStart - /** Function to merge two overlapping intervals - * NOTE: we assume subtypes of Feature has a constructor with (chr, start, end) signature */ + /** + * Function to merge two overlapping intervals + * NOTE: we assume subtypes of Feature has a constructor with (chr, start, end) signature + */ private def merge[T <: Feature](feat1: T, feat2: T): T = { if (feat1.getChr != feat2.getChr) throw new IllegalArgumentException("Can not merge features from different chromosomes") @@ -73,7 +74,7 @@ object WipeReads extends ToolCommand { else if (overlaps(feat1, feat2)) new BasicFeature(feat1.getChr, feat1.getStart, feat2.getEnd).asInstanceOf[T] else - throw new IllegalArgumentException("Can not merge non-overlapping RawInterval objects") + throw new IllegalArgumentException("Can not merge non-overlapping Feature objects") } /** Function to create an iterator yielding non-overlapped Feature objects */ @@ -81,11 +82,13 @@ object WipeReads extends ToolCommand { ri.toList .sortBy(x => (x.getChr, x.getStart, x.getEnd)) .foldLeft(List.empty[T]) { - (acc, i) => acc match { - case head :: tail if overlaps(head, i) => merge(head, i) :: tail - case _ => i :: acc + (acc, i) => + acc match { + case head :: tail if overlaps(head, i) => merge(head, i) :: tail + case _ => i :: acc - }} + } + } .toIterator /** @@ -98,70 +101,60 @@ object WipeReads extends ToolCommand { logger.info("Parsing interval file ...") /** Function to create iterator from BED file */ - def makeFeatureFromBED(inFile: File): Iterator[Feature] = + def makeFeatureFromBed(inFile: File): Iterator[Feature] = asScalaIteratorConverter(getFeatureReader(inFile.toPath.toString, new BEDCodec(), false).iterator).asScala - // TODO: implementation /** Function to create iterator from refFlat file */ def makeFeatureFromRefFlat(inFile: File): Iterator[Feature] = ??? - // convert coordinate to 1-based fully closed - // parse chrom, start blocks, end blocks, strands + // convert coordinate to 1-based fully closed + // parse chrom, start blocks, end blocks, strands - // TODO: implementation /** Function to create iterator from GTF file */ - def makeFeatureFromGTF(inFile: File): Iterator[Feature] = ??? - // convert coordinate to 1-based fully closed - // parse chrom, start blocks, end blocks, strands + def makeFeatureFromGtf(inFile: File): Iterator[Feature] = ??? + // convert coordinate to 1-based fully closed + // parse chrom, start blocks, end blocks, strands // detect interval file format from extension val iterFunc: (File => Iterator[Feature]) = if (getExtension(inFile.toString.toLowerCase) == "bed") - makeFeatureFromBED + makeFeatureFromBed else throw new IllegalArgumentException("Unexpected interval file type: " + inFile.getPath) mergeOverlappingIntervals(iterFunc(inFile)) } - def bloomParamsOk(bloomSize: Long, bloomFp: Double): Boolean = { - val optimalArraySize = FilterBuilder.optimalM(bloomSize, bloomFp) - // we are currently limited to maximum integer size - // if the optimal array size equals or exceeds it, we assume - // that it's a result of a truncation and return false - optimalArraySize <= Int.MaxValue - } - // TODO: set minimum fraction for overlap /** * Function to create function to check SAMRecord for exclusion in filtered BAM file. * * The returned function evaluates all filtered-in SAMRecord to false. * - * @param iv iterator yielding RawInterval objects - * @param inBAM input BAM file - * @param inBAMIndex input BAM file index + * @param iv iterator yielding Feature objects + * @param inBam input BAM file + * @param inBamIndex input BAM file index * @param filterOutMulti whether to filter out reads with same name outside target region (default: true) * @param minMapQ minimum MapQ of reads in target region to filter out (default: 0) - * @param readGroupIDs read group IDs of reads in target region to filter out (default: all IDs) + * @param readGroupIds read group IDs of reads in target region to filter out (default: all IDs) * @param bloomSize expected size of elements to contain in the Bloom filter * @param bloomFp expected Bloom filter false positive rate * @return function that checks whether a SAMRecord or String is to be excluded */ - def makeFilterOutFunction(iv: Iterator[Feature], - inBAM: File, inBAMIndex: File = null, + def makeFilterNotFunction(iv: Iterator[Feature], + inBam: File, inBamIndex: File = null, filterOutMulti: Boolean = true, - minMapQ: Int = 0, readGroupIDs: Set[String] = Set(), + minMapQ: Int = 0, readGroupIds: Set[String] = Set(), bloomSize: Long, bloomFp: Double): (SAMRecord => Boolean) = { logger.info("Building set of reads to exclude ...") // TODO: implement optional index creation /** Function to check for BAM file index and return a SAMFileReader given a File */ - def prepIndexedInputBAM(): SAMFileReader = - if (inBAMIndex != null) - new SAMFileReader(inBAM, inBAMIndex) + def prepIndexedInputBam(): SAMFileReader = + if (inBamIndex != null) + new SAMFileReader(inBam, inBamIndex) else { - val sfr = new SAMFileReader(inBAM) + val sfr = new SAMFileReader(inBam) sfr.setValidationStringency(ValidationStringency.LENIENT) if (!sfr.hasIndex) throw new IllegalStateException("Input BAM file must be indexed") @@ -175,27 +168,28 @@ object WipeReads extends ToolCommand { * The function still works when only either one of the interval or BAM * file contig is prepended with "chr" * - * @param inBAM BAM file to query as SAMFileReader + * @param inBam BAM file to query as SAMFileReader * @param feat feature object containing query * @return QueryInterval wrapped in Option */ - def monadicMakeQueryInterval(inBAM: SAMFileReader, feat: Feature): Option[QueryInterval] = - if (inBAM.getFileHeader.getSequenceIndex(feat.getChr) > -1) - Some(inBAM.makeQueryInterval(feat.getChr, feat.getStart, feat.getEnd)) + def monadicMakeQueryInterval(inBam: SAMFileReader, feat: Feature): Option[QueryInterval] = + if (inBam.getFileHeader.getSequenceIndex(feat.getChr) > -1) + Some(inBam.makeQueryInterval(feat.getChr, feat.getStart, feat.getEnd)) else if (feat.getChr.startsWith("chr") - && inBAM.getFileHeader.getSequenceIndex(feat.getChr.substring(3)) > -1) - Some(inBAM.makeQueryInterval(feat.getChr.substring(3), feat.getStart, feat.getEnd)) + && inBam.getFileHeader.getSequenceIndex(feat.getChr.substring(3)) > -1) + Some(inBam.makeQueryInterval(feat.getChr.substring(3), feat.getStart, feat.getEnd)) else if (!feat.getChr.startsWith("chr") - && inBAM.getFileHeader.getSequenceIndex("chr" + feat.getChr) > -1) - Some(inBAM.makeQueryInterval("chr" + feat.getChr, feat.getStart, feat.getEnd)) + && inBam.getFileHeader.getSequenceIndex("chr" + feat.getChr) > -1) + Some(inBam.makeQueryInterval("chr" + feat.getChr, feat.getStart, feat.getEnd)) else None - /** function to make IntervalTree from our RawInterval objects - * - * @param ifeat iterable over feature objects - * @return IntervalTree objects, filled with intervals from RawInterval - */ + /** + * function to make IntervalTree from our Feature objects + * + * @param ifeat iterable over feature objects + * @return IntervalTree objects, filled with intervals from Feature + */ def makeIntervalTree[T <: Feature](ifeat: Iterable[T]): IntervalTree = { val ivt = new IntervalTree for (iv <- ifeat) @@ -224,104 +218,116 @@ object WipeReads extends ToolCommand { return true } false - } else - true + } else true + + /** function to create a fake SAMRecord pair ~ hack to limit querying BAM file for real pair */ + def makeMockPair(rec: SAMRecord): SAMRecord = { + require(rec.getReadPairedFlag) + val fakePair = rec.clone.asInstanceOf[SAMRecord] + fakePair.setAlignmentStart(rec.getMateAlignmentStart) + fakePair + } + + /** function to create set element from SAMRecord */ + def elemFromSam(rec: SAMRecord): String = { + if (filterOutMulti) + rec.getReadName + else + rec.getReadName + "_" + rec.getAlignmentStart.toString + } + + /** object for use by BloomFilter */ + object SAMFunnel extends Funnel[SAMRecord] { + override def funnel(rec: SAMRecord, into: PrimitiveSink): Unit = { + val elem = elemFromSam(rec) + logger.debug("Adding " + elem + " to set ...") + into.putUnencodedChars(elem) + } + } /** filter function for read IDs */ val rgFilter = - if (readGroupIDs.size == 0) + if (readGroupIds.size == 0) (r: SAMRecord) => true else - (r: SAMRecord) => readGroupIDs.contains(r.getReadGroup.getReadGroupId) + (r: SAMRecord) => readGroupIds.contains(r.getReadGroup.getReadGroupId) - /** function to get set element */ - val SAMRecordElement = - if (filterOutMulti) - (r: SAMRecord) => r.getReadName - else - (r: SAMRecord) => r.getReadName + "_" + r.getAlignmentStart.toString - - val SAMRecordMateElement = - (r: SAMRecord) => r.getReadName + "_" + r.getMateAlignmentStart.toString - val firstBAM = prepIndexedInputBAM() + val readyBam = prepIndexedInputBam() /* NOTE: the interval vector here should be bypass-able if we can make the BAM query intervals with Interval objects. This is not possible - at the moment since we can not retrieve star and end coordinates - of an Interval, so we resort to our own RawInterval vector + at the moment since we can not retrieve start and end coordinates + of an Interval, so we resort to our own Feature vector */ lazy val intervals = iv.toVector - lazy val intervalTreeMap: Map[String, IntervalTree] = intervals - .groupBy(x => x.getChr) - .map({ case (key, value) => (key, makeIntervalTree(value)) }) + lazy val queryIntervals = intervals - .flatMap(x => monadicMakeQueryInterval(firstBAM, x)) + .flatMap(x => monadicMakeQueryInterval(readyBam, x)) // makeQueryInterval only accepts a sorted QueryInterval list .sortBy(x => (x.referenceIndex, x.start, x.end)) .toArray - val filteredRecords: Iterator[SAMRecord] = firstBAM.queryOverlapping(queryIntervals).asScala + lazy val intervalTreeMap: Map[String, IntervalTree] = intervals + .groupBy(x => x.getChr) + .map({ case (key, value) => (key, makeIntervalTree(value)) }) + + lazy val filteredOutSet: BloomFilter[SAMRecord] = readyBam + // query BAM file with intervals + .queryOverlapping(queryIntervals) + // for compatibility + .asScala // ensure spliced reads have at least one block overlapping target region .filter(x => alignmentBlockOverlaps(x, intervalTreeMap)) // filter for MAPQ on target region reads .filter(x => x.getMappingQuality >= minMapQ) // filter on specific read group IDs .filter(x => rgFilter(x)) - - val filteredOutSet: BloomFilter[String] = new FilterBuilder(bloomSize.toInt, bloomFp) - .hashFunction(HashMethod.Murmur3KirschMitzenmacher) - .buildBloomFilter() - - for (rec <- filteredRecords) { - logger.debug("Adding read " + rec.getReadName + " to set ...") - if ((!filterOutMulti) && rec.getReadPairedFlag) { - filteredOutSet.add(SAMRecordElement(rec)) - filteredOutSet.add(SAMRecordMateElement(rec)) - } - else - filteredOutSet.add(SAMRecordElement(rec)) - } + // fold starting from empty set + .foldLeft(BloomFilter.create(SAMFunnel, bloomSize.toInt, bloomFp) + )((acc, rec) => { + acc.put(rec) + if (rec.getReadPairedFlag) acc.put(makeMockPair(rec)) + acc + }) if (filterOutMulti) - (rec: SAMRecord) => filteredOutSet.contains(rec.getReadName) + (rec: SAMRecord) => filteredOutSet.mightContain(rec) else (rec: SAMRecord) => { if (rec.getReadPairedFlag) - filteredOutSet.contains(SAMRecordElement(rec)) && - filteredOutSet.contains(SAMRecordMateElement(rec)) + filteredOutSet.mightContain(rec) && filteredOutSet.mightContain(makeMockPair(rec)) else - filteredOutSet.contains(SAMRecordElement(rec)) + filteredOutSet.mightContain(rec) } } - // TODO: implement stats file output /** * Function to filter input BAM and write its output to the filesystem * * @param filterFunc filter function that evaluates true for excluded SAMRecord - * @param inBAM input BAM file - * @param outBAM output BAM file + * @param inBam input BAM file + * @param outBam output BAM file * @param writeIndex whether to write index for output BAM file * @param async whether to write asynchronously - * @param filteredOutBAM whether to write excluded SAMRecords to their own BAM file + * @param filteredOutBam whether to write excluded SAMRecords to their own BAM file */ - def writeFilteredBAM(filterFunc: (SAMRecord => Boolean), inBAM: File, outBAM: File, + def writeFilteredBam(filterFunc: (SAMRecord => Boolean), inBam: File, outBam: File, writeIndex: Boolean = true, async: Boolean = true, - filteredOutBAM: File = null) = { + filteredOutBam: File = null) = { val factory = new SAMFileWriterFactory() .setCreateIndex(writeIndex) .setUseAsyncIo(async) - val templateBAM = new SAMFileReader(inBAM) - templateBAM.setValidationStringency(ValidationStringency.LENIENT) + val templateBam = new SAMFileReader(inBam) + templateBam.setValidationStringency(ValidationStringency.LENIENT) - val targetBAM = factory.makeBAMWriter(templateBAM.getFileHeader, true, outBAM) + val targetBam = factory.makeBAMWriter(templateBam.getFileHeader, true, outBam) - val filteredBAM = - if (filteredOutBAM != null) - factory.makeBAMWriter(templateBAM.getFileHeader, true, filteredOutBAM) + val filteredBam = + if (filteredOutBam != null) + factory.makeBAMWriter(templateBam.getFileHeader, true, filteredOutBam) else null @@ -329,35 +335,34 @@ object WipeReads extends ToolCommand { try { var (incl, excl) = (0, 0) - for (rec <- templateBAM.asScala) { + for (rec <- templateBam.asScala) { if (!filterFunc(rec)) { - targetBAM.addAlignment(rec) + targetBam.addAlignment(rec) incl += 1 - } - else { + } else { excl += 1 - if (filteredBAM != null) filteredBAM.addAlignment(rec) + if (filteredBam != null) filteredBam.addAlignment(rec) } } println(List("input_bam", "output_bam", "count_included", "count_excluded").mkString("\t")) - println(List(inBAM.getName, outBAM.getName, incl, excl).mkString("\t")) + println(List(inBam.getName, outBam.getName, incl, excl).mkString("\t")) } finally { - templateBAM.close() - targetBAM.close() - if (filteredBAM != null) filteredBAM.close() + templateBam.close() + targetBam.close() + if (filteredBam != null) filteredBam.close() } } - case class Args (inputBAM: File = null, - targetRegions: File = null, - outputBAM: File = null, - filteredOutBAM: File = null, - readGroupIDs: Set[String] = Set.empty[String], - minMapQ: Int = 0, - limitToRegion: Boolean = false, - noMakeIndex: Boolean = false, - bloomSize: Long = 70000000, - bloomFp: Double = 4e-7) extends AbstractArgs + case class Args(inputBam: File = null, + targetRegions: File = null, + outputBam: File = null, + filteredOutBam: File = null, + readGroupIds: Set[String] = Set.empty[String], + minMapQ: Int = 0, + limitToRegion: Boolean = false, + noMakeIndex: Boolean = false, + bloomSize: Long = 70000000, + bloomFp: Double = 4e-7) extends AbstractArgs class OptParser extends AbstractOptParser { @@ -366,43 +371,53 @@ object WipeReads extends ToolCommand { |$commandName - Region-based reads removal from an indexed BAM file """.stripMargin) - opt[File]('I', "input_file") required() valueName "<bam>" action { (x, c) => - c.copy(inputBAM = x) } validate { - x => if (x.exists) success else failure("Input BAM file not found") - } text "Input BAM file" - - opt[File]('r', "interval_file") required() valueName "<bed>" action { (x, c) => - c.copy(targetRegions = x) } validate { - x => if (x.exists) success else failure("Target regions file not found") - } text "Interval BED file" - - opt[File]('o', "output_file") required() valueName "<bam>" action { (x, c) => - c.copy(outputBAM = x) } text "Output BAM file" - - opt[File]('f', "discarded_file") optional() valueName "<bam>" action { (x, c) => - c.copy(filteredOutBAM = x) } text "Discarded reads BAM file (default: none)" - - opt[Int]('Q', "min_mapq") optional() action { (x, c) => - c.copy(minMapQ = x) } text "Minimum MAPQ of reads in target region to remove (default: 0)" - - opt[String]('G', "read_group") unbounded() optional() valueName "<rgid>" action { (x, c) => - c.copy(readGroupIDs = c.readGroupIDs + x) } text "Read group IDs to be removed (default: remove reads from all read groups)" - - opt[Unit]("limit_removal") optional() action { (_, c) => - c.copy(limitToRegion = true) } text + opt[File]('I', "input_file") required () valueName "<bam>" action { (x, c) => + c.copy(inputBam = x) + } validate { + x => if (x.exists) success else failure("Input BAM file not found") + } text "Input BAM file" + + opt[File]('r', "interval_file") required () valueName "<bed>" action { (x, c) => + c.copy(targetRegions = x) + } validate { + x => if (x.exists) success else failure("Target regions file not found") + } text "Interval BED file" + + opt[File]('o', "output_file") required () valueName "<bam>" action { (x, c) => + c.copy(outputBam = x) + } text "Output BAM file" + + opt[File]('f', "discarded_file") optional () valueName "<bam>" action { (x, c) => + c.copy(filteredOutBam = x) + } text "Discarded reads BAM file (default: none)" + + opt[Int]('Q', "min_mapq") optional () action { (x, c) => + c.copy(minMapQ = x) + } text "Minimum MAPQ of reads in target region to remove (default: 0)" + + opt[String]('G', "read_group") unbounded () optional () valueName "<rgid>" action { (x, c) => + c.copy(readGroupIds = c.readGroupIds + x) + } text "Read group IDs to be removed (default: remove reads from all read groups)" + + opt[Unit]("limit_removal") optional () action { (_, c) => + c.copy(limitToRegion = true) + } text "Whether to remove multiple-mapped reads outside the target regions (default: yes)" - opt[Unit]("no_make_index") optional() action { (_, c) => - c.copy(noMakeIndex = true) } text + opt[Unit]("no_make_index") optional () action { (_, c) => + c.copy(noMakeIndex = true) + } text "Whether to index output BAM file or not (default: yes)" note("\nAdvanced options") - opt[Long]("bloom_size") optional() action { (x, c) => - c.copy(bloomSize = x) } text "expected maximum number of reads in target regions (default: 7e7)" + opt[Long]("bloom_size") optional () action { (x, c) => + c.copy(bloomSize = x) + } text "expected maximum number of reads in target regions (default: 7e7)" - opt[Double]("false_positive") optional() action { (x, c) => - c.copy(bloomFp = x) } text "false positive rate (default: 4e-7)" + opt[Double]("false_positive") optional () action { (x, c) => + c.copy(bloomFp = x) + } text "false positive rate (default: 4e-7)" note( """ @@ -411,12 +426,6 @@ object WipeReads extends ToolCommand { |the given ones, they will also be removed. """.stripMargin) - checkConfig { c=> - if (!bloomParamsOk(c.bloomSize, c.bloomFp)) - failure("Bloom parameters combination exceed Int limitation") - else - success - } } def main(args: Array[String]): Unit = { @@ -425,22 +434,22 @@ object WipeReads extends ToolCommand { .parse(args, Args()) .getOrElse(sys.exit(1)) - val filterFunc = makeFilterOutFunction( + val filterFunc = makeFilterNotFunction( iv = makeFeatureFromFile(commandArgs.targetRegions), - inBAM = commandArgs.inputBAM, + inBam = commandArgs.inputBam, filterOutMulti = !commandArgs.limitToRegion, minMapQ = commandArgs.minMapQ, - readGroupIDs = commandArgs.readGroupIDs, + readGroupIds = commandArgs.readGroupIds, bloomSize = commandArgs.bloomSize, bloomFp = commandArgs.bloomFp ) - writeFilteredBAM( + writeFilteredBam( filterFunc, - commandArgs.inputBAM, - commandArgs.outputBAM, + commandArgs.inputBam, + commandArgs.outputBam, writeIndex = !commandArgs.noMakeIndex, - filteredOutBAM = commandArgs.filteredOutBAM + filteredOutBam = commandArgs.filteredOutBam ) } } diff --git a/biopet-framework/src/main/scripts/nl/lumc/sasc/biopet/extensions/svcallers/breakdancer2vcf.py b/biopet-framework/src/main/scripts/nl/lumc/sasc/biopet/extensions/svcallers/breakdancer2vcf.py new file mode 100644 index 0000000000000000000000000000000000000000..be19779229fe65b7501e1ef6b4b409d522684eeb --- /dev/null +++ b/biopet-framework/src/main/scripts/nl/lumc/sasc/biopet/extensions/svcallers/breakdancer2vcf.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python + +__copyright__ = """ +Copyright (C) 2013 - Tim te Beek +Copyright (C) 2013 - Wai Yi Leung +Copyright (C) 2013 AllBio (see AUTHORS file) +""" + +__desc__ = """Convert breakdancer output to pseudo .vcf file format.""" +__created__ = "Mar 18, 2013" +__author__ = "tbeek" + +import argparse +import csv +import os.path +import sys +import datetime + + +def main(tsvfile, vcffile): + ''' + :param tsvfile: filename of input file.tsv + :type tsvfile: string + :param vcffile: filename of output file.vcf + :type vcffile: string + ''' + with open(tsvfile) as reader: + # Parse file + dictreader = _parse_tsvfile(reader) + print dictreader.fieldnames + + # Write out file + _format_vcffile(dictreader, vcffile) + + # Quick output + with open(vcffile) as reader: + print reader.read(1000) + + +def _parse_tsvfile(readable): + ''' + Read readable using csv.Sniffer and csv.DictReader + :param readable: open file.tsv handle to read with csv.DictReader + :type readable: file + ''' + prev, curr = 0, 0 + while True: + line = readable.readline() + if not line.startswith('#'): + # lets start from prev # line, without the hash sign + readable.seek(prev + 1) + break + else: + prev = curr + curr = readable.tell() + + # Determine dialect + curr = readable.tell() + #dialect = csv.Sniffer().sniff(readable.read(3000)) + dialect = 'excel-tab' + readable.seek(curr) + + # Read file + dictreader = csv.DictReader(readable, dialect=dialect) + return dictreader + + +_tsv_fields = ('Chr1', 'Pos1', 'Orientation1', + 'Chr2', 'Pos2', 'Orientation2', + 'Type', 'Size', 'Score', + 'num_Reads', 'num_Reads_lib', + 'ERR031544.sort.bam') +# 'Chr1': '1', +# 'Pos1': '269907', +# 'Orientation1': '39+39-', +# 'Chr2': '1', +# 'Pos2': '270919', +# 'Orientation2': '39+39-', +# 'Type': 'DEL', +# 'Size': '99', +# 'Score': '99', +# 'num_Reads': '38', +# 'num_Reads_lib': '/home/allbio/ERR031544.sort.bam|38', +# 'ERR031544.sort.bam': 'NA' + + + +_vcf_fields = ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'default') + +TS_NOW = datetime.datetime.now() + +VCF_HEADER = """##fileformat=VCFv4.1 +##fileDate={filedate} +##source=breakdancer-max +##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data"> +##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth"> +##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency"> +##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Length of variation"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> +##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">""".format( filedate=TS_NOW.strftime( "%Y%m%d" ) ) + +def _format_vcffile(dictreader, vcffile): + ''' + Create a pseudo .vcf file based on values read from DictReader instance. + :param dictreader: DictReader instance to read data from + :type dictreader: csv.DictRedaer + :param vcffile: output file.vcf filename + :type vcffile: string + ''' + FORMAT = "GT:DP" + with open(vcffile, mode='w') as writer: + writer.write('{header}\n#{columns}\n'.format(header=VCF_HEADER, columns='\t'.join(_vcf_fields))) + output_vcf = [] + for line in dictreader: + CHROM = line['Chr1'] + # TODO Figure out whether we have zero or one based positioning + POS = int(line['Pos1']) + ALT = '.' + SVEND = int(line['Pos2']) + + INFO = 'PROGRAM=breakdancer;SVTYPE={}'.format(line['Type']) + + if line['Type'] not in ['CTX']: + INFO += ';SVLEN={}'.format(int(line['Size'])) + INFO += ";SVEND={}".format(SVEND) + + # write alternate ALT field for Intrachromosomal translocations + if line['Type'] in ['CTX']: + ALT = "N[{}:{}[".format(line['Chr2'], line['Pos2']) + + + + SAMPLEINFO = "{}:{}".format( '1/.', line['num_Reads'] ) + # Create record + output_vcf.append([CHROM, POS, '.', '.', ALT, '.', 'PASS', INFO, FORMAT, SAMPLEINFO]) + + # Sort all results + output_vcf.sort() + output = "\n".join(["\t".join(map(str,vcf_row)) for vcf_row in output_vcf]) + # Write record + writer.write(output) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-i', '--breakdancertsv', dest='breakdancertsv', type=str, + help='Breakdancer TSV outputfile') + parser.add_argument('-o', '--outputvcf', dest='outputvcf', type=str, + help='Output vcf to') + + args = parser.parse_args() + main(args.breakdancertsv, args.outputvcf) diff --git a/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala b/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala index ee438ae74e82bf157a013360daf9b20d5a1647b6..0590dba978b19e531284112d2e9adecac88f90e2 100644 --- a/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala +++ b/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/WipeReadsUnitTest.scala @@ -6,6 +6,7 @@ package nl.lumc.sasc.biopet.tools import java.io.File import java.nio.file.Paths +import scala.collection.JavaConverters._ import htsjdk.samtools._ import htsjdk.tribble._ @@ -13,17 +14,14 @@ import org.scalatest.Matchers import org.scalatest.testng.TestNGSuite import org.testng.annotations.Test -import scala.collection.JavaConverters._ - - class WipeReadsUnitTest extends TestNGSuite with Matchers { - import nl.lumc.sasc.biopet.tools.WipeReads._ + import WipeReads._ private def resourcePath(p: String): String = Paths.get(getClass.getResource(p).toURI).toString - private lazy val samP: SAMLineParser = { + private val samP: SAMLineParser = { val samh = new SAMFileHeader samh.addSequence(new SAMSequenceRecord("chrQ", 10000)) samh.addReadGroup(new SAMReadGroupRecord("001")) @@ -31,20 +29,20 @@ class WipeReadsUnitTest extends TestNGSuite with Matchers { new SAMLineParser(samh) } - private def makeSAMs(raws: String*): Seq[SAMRecord] = + private def makeSams(raws: String*): Seq[SAMRecord] = raws.map(s => samP.parseLine(s)) - private def makeTempBAM(): File = + private def makeTempBam(): File = File.createTempFile("WipeReads", java.util.UUID.randomUUID.toString + ".bam") - private def makeTempBAMIndex(bam: File): File = + private def makeTempBamIndex(bam: File): File = new File(bam.getAbsolutePath.stripSuffix(".bam") + ".bai") val bloomSize: Long = 1000 val bloomFp: Double = 1e-10 - val sBAMFile1 = new File(resourcePath("/single01.bam")) - val sBAMRecs1 = makeSAMs( + val sBamFile1 = new File(resourcePath("/single01.bam")) + val sBamRecs1 = makeSams( "r02\t0\tchrQ\t50\t60\t10M\t*\t0\t0\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001", "r01\t16\tchrQ\t190\t60\t10M\t*\t0\t0\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001", "r01\t16\tchrQ\t290\t60\t10M\t*\t0\t0\tGGGGGAAAAA\tGGGGGGGGGG\tRG:Z:001", @@ -54,8 +52,8 @@ class WipeReadsUnitTest extends TestNGSuite with Matchers { "r06\t4\t*\t0\t0\t*\t*\t0\t0\tATATATATAT\tHIHIHIHIHI\tRG:Z:001" ) - val sBAMFile2 = new File(resourcePath("/single02.bam")) - val sBAMRecs2 = makeSAMs( + val sBamFile2 = new File(resourcePath("/single02.bam")) + val sBamRecs2 = makeSams( "r02\t0\tchrQ\t50\t60\t10M\t*\t0\t0\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001", "r01\t16\tchrQ\t190\t30\t10M\t*\t0\t0\tGGGGGAAAAA\tGGGGGGGGGG\tRG:Z:002", "r01\t16\tchrQ\t290\t30\t10M\t*\t0\t0\tGGGGGAAAAA\tGGGGGGGGGG\tRG:Z:002", @@ -66,11 +64,11 @@ class WipeReadsUnitTest extends TestNGSuite with Matchers { "r08\t4\t*\t0\t0\t*\t*\t0\t0\tATATATATAT\tHIHIHIHIHI\tRG:Z:002" ) - val sBAMFile3 = new File(resourcePath("/single03.bam")) - val sBAMFile4 = new File(resourcePath("/single04.bam")) + val sBamFile3 = new File(resourcePath("/single03.bam")) + val sBamFile4 = new File(resourcePath("/single04.bam")) - val pBAMFile1 = new File(resourcePath("/paired01.bam")) - val pBAMRecs1 = makeSAMs( + val pBamFile1 = new File(resourcePath("/paired01.bam")) + val pBamRecs1 = makeSams( "r02\t99\tchrQ\t50\t60\t10M\t=\t90\t50\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001", "r02\t147\tchrQ\t90\t60\t10M\t=\t50\t-50\tATGCATGCAT\tEEFFGGHHII\tRG:Z:001", "r01\t163\tchrQ\t150\t60\t10M\t=\t190\t50\tAAAAAGGGGG\tGGGGGGGGGG\tRG:Z:001", @@ -87,8 +85,8 @@ class WipeReadsUnitTest extends TestNGSuite with Matchers { "r06\t4\t*\t0\t0\t*\t*\t0\t0\tGCGCGCGCGC\tHIHIHIHIHI\tRG:Z:001" ) - val pBAMFile2 = new File(resourcePath("/paired02.bam")) - val pBAMRecs2 = makeSAMs( + val pBamFile2 = new File(resourcePath("/paired02.bam")) + val pBamRecs2 = makeSams( "r02\t99\tchrQ\t50\t60\t10M\t=\t90\t50\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001", "r02\t147\tchrQ\t90\t60\t10M\t=\t50\t-50\tATGCATGCAT\tEEFFGGHHII\tRG:Z:001", "r01\t163\tchrQ\t150\t30\t10M\t=\t190\t50\tAAAAAGGGGG\tGGGGGGGGGG\tRG:Z:002", @@ -101,22 +99,22 @@ class WipeReadsUnitTest extends TestNGSuite with Matchers { "r08\t4\t*\t0\t0\t*\t*\t0\t0\tGCGCGCGCGC\tHIHIHIHIHI\tRG:Z:002" ) - val pBAMFile3 = new File(resourcePath("/paired03.bam")) - val BEDFile1 = new File(resourcePath("/rrna01.bed")) - val minArgList = List("-I", sBAMFile1.toString, "-l", BEDFile1.toString, "-o", "mock.bam") - - @Test def testMakeFeatureFromBED() = { - val intervals: Vector[Feature] = makeFeatureFromFile(BEDFile1).toVector - intervals.length should be (3) - intervals.head.getChr should === ("chrQ") - intervals.head.getStart should be (991) - intervals.head.getEnd should be (1000) - intervals.last.getChr should === ("chrQ") - intervals.last.getStart should be (291) - intervals.last.getEnd should be (320) + val pBamFile3 = new File(resourcePath("/paired03.bam")) + val BedFile1 = new File(resourcePath("/rrna01.bed")) + val minArgList = List("-I", sBamFile1.toString, "-l", BedFile1.toString, "-o", "mock.bam") + + @Test def testMakeFeatureFromBed() = { + val intervals: Vector[Feature] = makeFeatureFromFile(BedFile1).toVector + intervals.length should be(3) + intervals.head.getChr should ===("chrQ") + intervals.head.getStart should be(991) + intervals.head.getEnd should be(1000) + intervals.last.getChr should ===("chrQ") + intervals.last.getStart should be(291) + intervals.last.getEnd should be(320) } - @Test def testSingleBAMDefault() = { + @Test def testSingleBamDefault() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 291, 320), // overlaps r01, second hit, new BasicFeature("chrQ", 451, 480), // overlaps r04 @@ -125,317 +123,317 @@ class WipeReadsUnitTest extends TestNGSuite with Matchers { // NOTE: while it's possible to have our filter produce false positives // it is highly unlikely in our test cases as we are setting a very low FP rate // and only filling the filter with a few items - val isFilteredOut = makeFilterOutFunction(intervals, sBAMFile1, bloomSize = bloomSize, bloomFp = bloomFp) + val filterNotFunc = makeFilterNotFunction(intervals, sBamFile1, bloomSize = bloomSize, bloomFp = bloomFp) // by default, set elements are SAM record read names - assert(!isFilteredOut(sBAMRecs1(0))) - assert(isFilteredOut(sBAMRecs1(1))) - assert(isFilteredOut(sBAMRecs1(2))) - assert(isFilteredOut(sBAMRecs1(3))) - assert(!isFilteredOut(sBAMRecs1(4))) - assert(!isFilteredOut(sBAMRecs1(5))) - assert(!isFilteredOut(sBAMRecs1(6))) + filterNotFunc(sBamRecs1(0)) shouldBe false + filterNotFunc(sBamRecs1(1)) shouldBe true + filterNotFunc(sBamRecs1(2)) shouldBe true + filterNotFunc(sBamRecs1(3)) shouldBe true + filterNotFunc(sBamRecs1(4)) shouldBe false + filterNotFunc(sBamRecs1(5)) shouldBe false + filterNotFunc(sBamRecs1(6)) shouldBe false } - @Test def testSingleBAMIntervalWithoutChr() = { + @Test def testSingleBamIntervalWithoutChr() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("Q", 291, 320), new BasicFeature("chrQ", 451, 480), new BasicFeature("P", 191, 480) ) - val isFilteredOut = makeFilterOutFunction(intervals, sBAMFile1, bloomSize = bloomSize, bloomFp = bloomFp) - assert(!isFilteredOut(sBAMRecs1(0))) - assert(isFilteredOut(sBAMRecs1(1))) - assert(isFilteredOut(sBAMRecs1(2))) - assert(isFilteredOut(sBAMRecs1(3))) - assert(!isFilteredOut(sBAMRecs1(4))) - assert(!isFilteredOut(sBAMRecs1(5))) - assert(!isFilteredOut(sBAMRecs1(6))) + val filterNotFunc = makeFilterNotFunction(intervals, sBamFile1, bloomSize = bloomSize, bloomFp = bloomFp) + filterNotFunc(sBamRecs1(0)) shouldBe false + filterNotFunc(sBamRecs1(1)) shouldBe true + filterNotFunc(sBamRecs1(2)) shouldBe true + filterNotFunc(sBamRecs1(3)) shouldBe true + filterNotFunc(sBamRecs1(4)) shouldBe false + filterNotFunc(sBamRecs1(5)) shouldBe false + filterNotFunc(sBamRecs1(6)) shouldBe false } - @Test def testSingleBAMDefaultPartialExonOverlap() = { + @Test def testSingleBamDefaultPartialExonOverlap() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 881, 1000) // overlaps first exon of r05 ) - val isFilteredOut = makeFilterOutFunction(intervals, sBAMFile1, bloomSize = bloomSize, bloomFp = bloomFp) - assert(!isFilteredOut(sBAMRecs1(0))) - assert(!isFilteredOut(sBAMRecs1(1))) - assert(!isFilteredOut(sBAMRecs1(2))) - assert(!isFilteredOut(sBAMRecs1(3))) - assert(!isFilteredOut(sBAMRecs1(4))) - assert(isFilteredOut(sBAMRecs1(5))) - assert(!isFilteredOut(sBAMRecs1(6))) + val filterNotFunc = makeFilterNotFunction(intervals, sBamFile1, bloomSize = bloomSize, bloomFp = bloomFp) + filterNotFunc(sBamRecs1(0)) shouldBe false + filterNotFunc(sBamRecs1(1)) shouldBe false + filterNotFunc(sBamRecs1(2)) shouldBe false + filterNotFunc(sBamRecs1(3)) shouldBe false + filterNotFunc(sBamRecs1(4)) shouldBe false + filterNotFunc(sBamRecs1(5)) shouldBe true + filterNotFunc(sBamRecs1(6)) shouldBe false } - @Test def testSingleBAMDefaultNoExonOverlap() = { + @Test def testSingleBamDefaultNoExonOverlap() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrP", 881, 1000), new BasicFeature("chrQ", 900, 920) ) - val isFilteredOut = makeFilterOutFunction(intervals, sBAMFile1, bloomSize = bloomSize, bloomFp = bloomFp) - assert(!isFilteredOut(sBAMRecs1(0))) - assert(!isFilteredOut(sBAMRecs1(1))) - assert(!isFilteredOut(sBAMRecs1(2))) - assert(!isFilteredOut(sBAMRecs1(3))) - assert(!isFilteredOut(sBAMRecs1(4))) - assert(!isFilteredOut(sBAMRecs1(5))) - assert(!isFilteredOut(sBAMRecs1(5))) - assert(!isFilteredOut(sBAMRecs1(6))) + val filterNotFunc = makeFilterNotFunction(intervals, sBamFile1, bloomSize = bloomSize, bloomFp = bloomFp) + filterNotFunc(sBamRecs1(0)) shouldBe false + filterNotFunc(sBamRecs1(1)) shouldBe false + filterNotFunc(sBamRecs1(2)) shouldBe false + filterNotFunc(sBamRecs1(3)) shouldBe false + filterNotFunc(sBamRecs1(4)) shouldBe false + filterNotFunc(sBamRecs1(5)) shouldBe false + filterNotFunc(sBamRecs1(5)) shouldBe false + filterNotFunc(sBamRecs1(6)) shouldBe false } - @Test def testSingleBAMFilterOutMultiNotSet() = { + @Test def testSingleBamFilterOutMultiNotSet() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 291, 320), // overlaps r01, second hit, new BasicFeature("chrQ", 451, 480), // overlaps r04 new BasicFeature("chrQ", 991, 1000) // overlaps nothing; lies in the spliced region of r05 ) - val isFilteredOut = makeFilterOutFunction(intervals, sBAMFile1, bloomSize = bloomSize, bloomFp = bloomFp, + val filterNotFunc = makeFilterNotFunction(intervals, sBamFile1, bloomSize = bloomSize, bloomFp = bloomFp, filterOutMulti = false) - assert(!isFilteredOut(sBAMRecs1(0))) - assert(!isFilteredOut(sBAMRecs1(1))) - assert(isFilteredOut(sBAMRecs1(2))) - assert(isFilteredOut(sBAMRecs1(3))) - assert(!isFilteredOut(sBAMRecs1(4))) - assert(!isFilteredOut(sBAMRecs1(5))) - assert(!isFilteredOut(sBAMRecs1(6))) + filterNotFunc(sBamRecs1(0)) shouldBe false + filterNotFunc(sBamRecs1(1)) shouldBe false + filterNotFunc(sBamRecs1(2)) shouldBe true + filterNotFunc(sBamRecs1(3)) shouldBe true + filterNotFunc(sBamRecs1(4)) shouldBe false + filterNotFunc(sBamRecs1(5)) shouldBe false + filterNotFunc(sBamRecs1(6)) shouldBe false } - @Test def testSingleBAMFilterMinMapQ() = { + @Test def testSingleBamFilterMinMapQ() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 291, 320), new BasicFeature("chrQ", 451, 480) ) - val isFilteredOut = makeFilterOutFunction(intervals, sBAMFile2, bloomSize = bloomSize, bloomFp = bloomFp, + val filterNotFunc = makeFilterNotFunction(intervals, sBamFile2, bloomSize = bloomSize, bloomFp = bloomFp, minMapQ = 60) - assert(!isFilteredOut(sBAMRecs2(0))) + filterNotFunc(sBamRecs2(0)) shouldBe false // r01 is not in since it is below the MAPQ threshold - assert(!isFilteredOut(sBAMRecs2(1))) - assert(!isFilteredOut(sBAMRecs2(2))) - assert(isFilteredOut(sBAMRecs2(3))) - assert(isFilteredOut(sBAMRecs2(4))) - assert(isFilteredOut(sBAMRecs2(5))) - assert(!isFilteredOut(sBAMRecs2(6))) - assert(!isFilteredOut(sBAMRecs2(7))) + filterNotFunc(sBamRecs2(1)) shouldBe false + filterNotFunc(sBamRecs2(2)) shouldBe false + filterNotFunc(sBamRecs2(3)) shouldBe true + filterNotFunc(sBamRecs2(4)) shouldBe true + filterNotFunc(sBamRecs2(5)) shouldBe true + filterNotFunc(sBamRecs2(6)) shouldBe false + filterNotFunc(sBamRecs2(7)) shouldBe false } - @Test def testSingleBAMFilterMinMapQFilterOutMultiNotSet() = { + @Test def testSingleBamFilterMinMapQFilterOutMultiNotSet() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 291, 320), new BasicFeature("chrQ", 451, 480) ) - val isFilteredOut = makeFilterOutFunction(intervals, sBAMFile2, bloomSize = bloomSize, bloomFp = bloomFp, + val filterNotFunc = makeFilterNotFunction(intervals, sBamFile2, bloomSize = bloomSize, bloomFp = bloomFp, minMapQ = 60, filterOutMulti = false) - assert(!isFilteredOut(sBAMRecs2(0))) - assert(!isFilteredOut(sBAMRecs2(1))) + filterNotFunc(sBamRecs2(0)) shouldBe false + filterNotFunc(sBamRecs2(1)) shouldBe false // this r01 is not in since it is below the MAPQ threshold - assert(!isFilteredOut(sBAMRecs2(2))) - assert(isFilteredOut(sBAMRecs2(3))) - assert(isFilteredOut(sBAMRecs2(4))) + filterNotFunc(sBamRecs2(2)) shouldBe false + filterNotFunc(sBamRecs2(3)) shouldBe true + filterNotFunc(sBamRecs2(4)) shouldBe true // this r07 is not in since filterOuMulti is false - assert(!isFilteredOut(sBAMRecs2(5))) - assert(!isFilteredOut(sBAMRecs2(6))) - assert(!isFilteredOut(sBAMRecs2(7))) + filterNotFunc(sBamRecs2(5)) shouldBe false + filterNotFunc(sBamRecs2(6)) shouldBe false + filterNotFunc(sBamRecs2(7)) shouldBe false } - @Test def testSingleBAMFilterReadGroupIDs() = { + @Test def testSingleBamFilterReadGroupIDs() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 291, 320), new BasicFeature("chrQ", 451, 480) ) - val isFilteredOut = makeFilterOutFunction(intervals, sBAMFile2, bloomSize = bloomSize, bloomFp = bloomFp, - readGroupIDs = Set("002", "003")) - assert(!isFilteredOut(sBAMRecs2(0))) + val filterNotFunc = makeFilterNotFunction(intervals, sBamFile2, bloomSize = bloomSize, bloomFp = bloomFp, + readGroupIds = Set("002", "003")) + filterNotFunc(sBamRecs2(0)) shouldBe false // only r01 is in the set since it is RG 002 - assert(isFilteredOut(sBAMRecs2(1))) - assert(isFilteredOut(sBAMRecs2(2))) - assert(!isFilteredOut(sBAMRecs2(3))) - assert(!isFilteredOut(sBAMRecs2(4))) - assert(!isFilteredOut(sBAMRecs2(5))) - assert(!isFilteredOut(sBAMRecs2(6))) - assert(!isFilteredOut(sBAMRecs2(7))) + filterNotFunc(sBamRecs2(1)) shouldBe true + filterNotFunc(sBamRecs2(2)) shouldBe true + filterNotFunc(sBamRecs2(3)) shouldBe false + filterNotFunc(sBamRecs2(4)) shouldBe false + filterNotFunc(sBamRecs2(5)) shouldBe false + filterNotFunc(sBamRecs2(6)) shouldBe false + filterNotFunc(sBamRecs2(7)) shouldBe false } - @Test def testPairBAMDefault() = { + @Test def testPairBamDefault() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 291, 320), // overlaps r01, second hit, new BasicFeature("chrQ", 451, 480), // overlaps r04 new BasicFeature("chrQ", 991, 1000) // overlaps nothing; lies in the spliced region of r05 ) - val isFilteredOut = makeFilterOutFunction(intervals, pBAMFile1, bloomSize = bloomSize, bloomFp = bloomFp) - assert(!isFilteredOut(pBAMRecs1(0))) - assert(!isFilteredOut(pBAMRecs1(1))) - assert(isFilteredOut(pBAMRecs1(2))) - assert(isFilteredOut(pBAMRecs1(3))) - assert(isFilteredOut(pBAMRecs1(4))) - assert(isFilteredOut(pBAMRecs1(5))) - assert(isFilteredOut(pBAMRecs1(6))) - assert(isFilteredOut(pBAMRecs1(7))) - assert(!isFilteredOut(pBAMRecs1(8))) - assert(!isFilteredOut(pBAMRecs1(9))) - assert(!isFilteredOut(pBAMRecs1(10))) - assert(!isFilteredOut(pBAMRecs1(11))) - assert(!isFilteredOut(pBAMRecs1(12))) - assert(!isFilteredOut(pBAMRecs1(13))) + val filterNotFunc = makeFilterNotFunction(intervals, pBamFile1, bloomSize = bloomSize, bloomFp = bloomFp) + filterNotFunc(pBamRecs1(0)) shouldBe false + filterNotFunc(pBamRecs1(1)) shouldBe false + filterNotFunc(pBamRecs1(2)) shouldBe true + filterNotFunc(pBamRecs1(3)) shouldBe true + filterNotFunc(pBamRecs1(4)) shouldBe true + filterNotFunc(pBamRecs1(5)) shouldBe true + filterNotFunc(pBamRecs1(6)) shouldBe true + filterNotFunc(pBamRecs1(7)) shouldBe true + filterNotFunc(pBamRecs1(8)) shouldBe false + filterNotFunc(pBamRecs1(9)) shouldBe false + filterNotFunc(pBamRecs1(10)) shouldBe false + filterNotFunc(pBamRecs1(11)) shouldBe false + filterNotFunc(pBamRecs1(12)) shouldBe false + filterNotFunc(pBamRecs1(13)) shouldBe false } - @Test def testPairBAMPartialExonOverlap() = { + @Test def testPairBamPartialExonOverlap() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 891, 1000) ) - val isFilteredOut = makeFilterOutFunction(intervals, pBAMFile1, bloomSize = bloomSize, bloomFp = bloomFp) - assert(!isFilteredOut(pBAMRecs1(0))) - assert(!isFilteredOut(pBAMRecs1(1))) - assert(!isFilteredOut(pBAMRecs1(2))) - assert(!isFilteredOut(pBAMRecs1(3))) - assert(!isFilteredOut(pBAMRecs1(4))) - assert(!isFilteredOut(pBAMRecs1(5))) - assert(!isFilteredOut(pBAMRecs1(6))) - assert(!isFilteredOut(pBAMRecs1(7))) - assert(!isFilteredOut(pBAMRecs1(8))) - assert(!isFilteredOut(pBAMRecs1(9))) - assert(isFilteredOut(pBAMRecs1(10))) - assert(isFilteredOut(pBAMRecs1(11))) - assert(!isFilteredOut(pBAMRecs1(12))) - assert(!isFilteredOut(pBAMRecs1(13))) + val filterNotFunc = makeFilterNotFunction(intervals, pBamFile1, bloomSize = bloomSize, bloomFp = bloomFp) + filterNotFunc(pBamRecs1(0)) shouldBe false + filterNotFunc(pBamRecs1(1)) shouldBe false + filterNotFunc(pBamRecs1(2)) shouldBe false + filterNotFunc(pBamRecs1(3)) shouldBe false + filterNotFunc(pBamRecs1(4)) shouldBe false + filterNotFunc(pBamRecs1(5)) shouldBe false + filterNotFunc(pBamRecs1(6)) shouldBe false + filterNotFunc(pBamRecs1(7)) shouldBe false + filterNotFunc(pBamRecs1(8)) shouldBe false + filterNotFunc(pBamRecs1(9)) shouldBe false + filterNotFunc(pBamRecs1(10)) shouldBe true + filterNotFunc(pBamRecs1(11)) shouldBe true + filterNotFunc(pBamRecs1(12)) shouldBe false + filterNotFunc(pBamRecs1(13)) shouldBe false } - @Test def testPairBAMFilterOutMultiNotSet() = { + @Test def testPairBamFilterOutMultiNotSet() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 291, 320), // overlaps r01, second hit, new BasicFeature("chrQ", 451, 480), // overlaps r04 new BasicFeature("chrQ", 991, 1000) // overlaps nothing; lies in the spliced region of r05 ) - val isFilteredOut = makeFilterOutFunction(intervals, pBAMFile1, bloomSize = bloomSize, bloomFp = bloomFp, + val filterNotFunc = makeFilterNotFunction(intervals, pBamFile1, bloomSize = bloomSize, bloomFp = bloomFp, filterOutMulti = false) - assert(!isFilteredOut(pBAMRecs1(0))) - assert(!isFilteredOut(pBAMRecs1(1))) - assert(!isFilteredOut(pBAMRecs1(2))) - assert(!isFilteredOut(pBAMRecs1(3))) - assert(isFilteredOut(pBAMRecs1(4))) - assert(isFilteredOut(pBAMRecs1(5))) - assert(isFilteredOut(pBAMRecs1(6))) - assert(isFilteredOut(pBAMRecs1(7))) - assert(!isFilteredOut(pBAMRecs1(8))) - assert(!isFilteredOut(pBAMRecs1(9))) - assert(!isFilteredOut(pBAMRecs1(10))) - assert(!isFilteredOut(pBAMRecs1(11))) - assert(!isFilteredOut(pBAMRecs1(12))) - assert(!isFilteredOut(pBAMRecs1(13))) + filterNotFunc(pBamRecs1(0)) shouldBe false + filterNotFunc(pBamRecs1(1)) shouldBe false + filterNotFunc(pBamRecs1(2)) shouldBe false + filterNotFunc(pBamRecs1(3)) shouldBe false + filterNotFunc(pBamRecs1(4)) shouldBe true + filterNotFunc(pBamRecs1(5)) shouldBe true + filterNotFunc(pBamRecs1(6)) shouldBe true + filterNotFunc(pBamRecs1(7)) shouldBe true + filterNotFunc(pBamRecs1(8)) shouldBe false + filterNotFunc(pBamRecs1(9)) shouldBe false + filterNotFunc(pBamRecs1(10)) shouldBe false + filterNotFunc(pBamRecs1(11)) shouldBe false + filterNotFunc(pBamRecs1(12)) shouldBe false + filterNotFunc(pBamRecs1(13)) shouldBe false } - @Test def testPairBAMFilterMinMapQ() = { + @Test def testPairBamFilterMinMapQ() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 291, 320), new BasicFeature("chrQ", 451, 480) ) - val isFilteredOut = makeFilterOutFunction(intervals, pBAMFile2, bloomSize = bloomSize, bloomFp = bloomFp, + val filterNotFunc = makeFilterNotFunction(intervals, pBamFile2, bloomSize = bloomSize, bloomFp = bloomFp, minMapQ = 60) // r01 is not in since it is below the MAPQ threshold - assert(!isFilteredOut(pBAMRecs2(0))) - assert(!isFilteredOut(pBAMRecs2(1))) - assert(!isFilteredOut(pBAMRecs2(2))) - assert(!isFilteredOut(pBAMRecs2(3))) - assert(!isFilteredOut(pBAMRecs2(4))) - assert(!isFilteredOut(pBAMRecs2(5))) - assert(isFilteredOut(pBAMRecs2(6))) - assert(isFilteredOut(pBAMRecs2(7))) - assert(!isFilteredOut(pBAMRecs2(8))) - assert(!isFilteredOut(pBAMRecs2(9))) + filterNotFunc(pBamRecs2(0)) shouldBe false + filterNotFunc(pBamRecs2(1)) shouldBe false + filterNotFunc(pBamRecs2(2)) shouldBe false + filterNotFunc(pBamRecs2(3)) shouldBe false + filterNotFunc(pBamRecs2(4)) shouldBe false + filterNotFunc(pBamRecs2(5)) shouldBe false + filterNotFunc(pBamRecs2(6)) shouldBe true + filterNotFunc(pBamRecs2(7)) shouldBe true + filterNotFunc(pBamRecs2(8)) shouldBe false + filterNotFunc(pBamRecs2(9)) shouldBe false } - @Test def testPairBAMFilterReadGroupIDs() = { + @Test def testPairBamFilterReadGroupIDs() = { val intervals: Iterator[Feature] = Iterator( new BasicFeature("chrQ", 291, 320), new BasicFeature("chrQ", 451, 480) ) - val isFilteredOut = makeFilterOutFunction(intervals, pBAMFile2, bloomSize = bloomSize, bloomFp = bloomFp, - readGroupIDs = Set("002", "003")) + val filterNotFunc = makeFilterNotFunction(intervals, pBamFile2, bloomSize = bloomSize, bloomFp = bloomFp, + readGroupIds = Set("002", "003")) // only r01 is in the set since it is RG 002 - assert(!isFilteredOut(pBAMRecs2(0))) - assert(!isFilteredOut(pBAMRecs2(1))) - assert(isFilteredOut(pBAMRecs2(2))) - assert(isFilteredOut(pBAMRecs2(3))) - assert(isFilteredOut(pBAMRecs2(4))) - assert(isFilteredOut(pBAMRecs2(5))) - assert(!isFilteredOut(pBAMRecs2(6))) - assert(!isFilteredOut(pBAMRecs2(7))) - assert(!isFilteredOut(pBAMRecs2(8))) - assert(!isFilteredOut(pBAMRecs2(9))) + filterNotFunc(pBamRecs2(0)) shouldBe false + filterNotFunc(pBamRecs2(1)) shouldBe false + filterNotFunc(pBamRecs2(2)) shouldBe true + filterNotFunc(pBamRecs2(3)) shouldBe true + filterNotFunc(pBamRecs2(4)) shouldBe true + filterNotFunc(pBamRecs2(5)) shouldBe true + filterNotFunc(pBamRecs2(6)) shouldBe false + filterNotFunc(pBamRecs2(7)) shouldBe false + filterNotFunc(pBamRecs2(8)) shouldBe false + filterNotFunc(pBamRecs2(9)) shouldBe false } - @Test def testWriteSingleBAMDefault() = { + @Test def testWriteSingleBamDefault() = { val mockFilterOutFunc = (r: SAMRecord) => Set("r03", "r04", "r05").contains(r.getReadName) - val outBAM = makeTempBAM() - val outBAMIndex = makeTempBAMIndex(outBAM) - outBAM.deleteOnExit() - outBAMIndex.deleteOnExit() + val outBam = makeTempBam() + val outBamIndex = makeTempBamIndex(outBam) + outBam.deleteOnExit() + outBamIndex.deleteOnExit() val stdout = new java.io.ByteArrayOutputStream Console.withOut(stdout) { - writeFilteredBAM(mockFilterOutFunc, sBAMFile1, outBAM) + writeFilteredBam(mockFilterOutFunc, sBamFile1, outBam) } - stdout.toString should === ( + stdout.toString should ===( "input_bam\toutput_bam\tcount_included\tcount_excluded\n%s\t%s\t%d\t%d\n" - .format(sBAMFile1.getName, outBAM.getName, 4, 3) + .format(sBamFile1.getName, outBam.getName, 4, 3) ) - val exp = new SAMFileReader(sBAMFile3).asScala - val obs = new SAMFileReader(outBAM).asScala + val exp = new SAMFileReader(sBamFile3).asScala + val obs = new SAMFileReader(outBam).asScala for ((e, o) <- exp.zip(obs)) - e.getSAMString should === (o.getSAMString) - outBAM should be ('exists) - outBAMIndex should be ('exists) + e.getSAMString should ===(o.getSAMString) + outBam should be('exists) + outBamIndex should be('exists) } - @Test def testWriteSingleBAMAndFilteredBAM() = { + @Test def testWriteSingleBamAndFilteredBAM() = { val mockFilterOutFunc = (r: SAMRecord) => Set("r03", "r04", "r05").contains(r.getReadName) - val outBAM = makeTempBAM() - val outBAMIndex = makeTempBAMIndex(outBAM) - outBAM.deleteOnExit() - outBAMIndex.deleteOnExit() - val filteredOutBAM = makeTempBAM() - val filteredOutBAMIndex = makeTempBAMIndex(filteredOutBAM) - filteredOutBAM.deleteOnExit() - filteredOutBAMIndex.deleteOnExit() + val outBam = makeTempBam() + val outBamIndex = makeTempBamIndex(outBam) + outBam.deleteOnExit() + outBamIndex.deleteOnExit() + val filteredOutBam = makeTempBam() + val filteredOutBamIndex = makeTempBamIndex(filteredOutBam) + filteredOutBam.deleteOnExit() + filteredOutBamIndex.deleteOnExit() val stdout = new java.io.ByteArrayOutputStream Console.withOut(stdout) { - writeFilteredBAM(mockFilterOutFunc, sBAMFile1, outBAM, filteredOutBAM = filteredOutBAM) + writeFilteredBam(mockFilterOutFunc, sBamFile1, outBam, filteredOutBam = filteredOutBam) } - stdout.toString should === ( + stdout.toString should ===( "input_bam\toutput_bam\tcount_included\tcount_excluded\n%s\t%s\t%d\t%d\n" - .format(sBAMFile1.getName, outBAM.getName, 4, 3) + .format(sBamFile1.getName, outBam.getName, 4, 3) ) - val exp = new SAMFileReader(sBAMFile4).asScala - val obs = new SAMFileReader(filteredOutBAM).asScala + val exp = new SAMFileReader(sBamFile4).asScala + val obs = new SAMFileReader(filteredOutBam).asScala for ((e, o) <- exp.zip(obs)) - e.getSAMString should === (o.getSAMString) - outBAM should be ('exists) - outBAMIndex should be ('exists) - filteredOutBAM should be ('exists) - filteredOutBAMIndex should be ('exists) + e.getSAMString should ===(o.getSAMString) + outBam should be('exists) + outBamIndex should be('exists) + filteredOutBam should be('exists) + filteredOutBamIndex should be('exists) } - @Test def testWritePairBAMDefault() = { + @Test def testWritePairBamDefault() = { val mockFilterOutFunc = (r: SAMRecord) => Set("r03", "r04", "r05").contains(r.getReadName) - val outBAM = makeTempBAM() - val outBAMIndex = makeTempBAMIndex(outBAM) - outBAM.deleteOnExit() - outBAMIndex.deleteOnExit() + val outBam = makeTempBam() + val outBamIndex = makeTempBamIndex(outBam) + outBam.deleteOnExit() + outBamIndex.deleteOnExit() val stdout = new java.io.ByteArrayOutputStream Console.withOut(stdout) { - writeFilteredBAM(mockFilterOutFunc, pBAMFile1, outBAM) + writeFilteredBam(mockFilterOutFunc, pBamFile1, outBam) } - stdout.toString should === ( + stdout.toString should ===( "input_bam\toutput_bam\tcount_included\tcount_excluded\n%s\t%s\t%d\t%d\n" - .format(pBAMFile1.getName, outBAM.getName, 8, 6) + .format(pBamFile1.getName, outBam.getName, 8, 6) ) - val exp = new SAMFileReader(pBAMFile3).asScala - val obs = new SAMFileReader(outBAM).asScala + val exp = new SAMFileReader(pBamFile3).asScala + val obs = new SAMFileReader(outBam).asScala for ((e, o) <- exp.zip(obs)) - e.getSAMString should === (o.getSAMString) - outBAM should be ('exists) - outBAMIndex should be ('exists) + e.getSAMString should ===(o.getSAMString) + outBam should be('exists) + outBamIndex should be('exists) } } \ No newline at end of file