diff --git a/.idea/libraries/Maven__org_mockito_mockito_all_1_9_5.xml b/.idea/libraries/Maven__org_mockito_mockito_all_1_9_5.xml new file mode 100644 index 0000000000000000000000000000000000000000..7797878d7a6e42ddf7b4bca81d9b434485b1c1f9 --- /dev/null +++ b/.idea/libraries/Maven__org_mockito_mockito_all_1_9_5.xml @@ -0,0 +1,13 @@ +<component name="libraryTable"> + <library name="Maven: org.mockito:mockito-all:1.9.5"> + <CLASSES> + <root url="jar://$MAVEN_REPOSITORY$/org/mockito/mockito-all/1.9.5/mockito-all-1.9.5.jar!/" /> + </CLASSES> + <JAVADOC> + <root url="jar://$MAVEN_REPOSITORY$/org/mockito/mockito-all/1.9.5/mockito-all-1.9.5-javadoc.jar!/" /> + </JAVADOC> + <SOURCES> + <root url="jar://$MAVEN_REPOSITORY$/org/mockito/mockito-all/1.9.5/mockito-all-1.9.5-sources.jar!/" /> + </SOURCES> + </library> +</component> \ No newline at end of file diff --git a/Biopet.iml b/Biopet.iml index 02d0bed78622af5b387d7774db3539c061857363..87ee5a467b1042f6a3cde8c772db14b72b4d553b 100644 --- a/Biopet.iml +++ b/Biopet.iml @@ -1,5 +1,10 @@ <?xml version="1.0" encoding="UTF-8"?> <module org.jetbrains.idea.maven.project.MavenProjectsManager.isMavenModule="true" type="JAVA_MODULE" version="4"> + <component name="FacetManager"> + <facet type="Python" name="Python"> + <configuration sdkName="" /> + </facet> + </component> <component name="NewModuleRootManager" inherit-compiler-output="false"> <output url="file://$MODULE_DIR$/target/classes" /> <output-test url="file://$MODULE_DIR$/target/test-classes" /> @@ -8,6 +13,6 @@ </content> <orderEntry type="inheritedJdk" /> <orderEntry type="sourceFolder" forTests="false" /> + <orderEntry type="library" name="gatk-queue-package-distribution-3.3" level="project" /> </component> -</module> - +</module> \ No newline at end of file diff --git a/biopet-framework/BiopetFramework.iml b/biopet-framework/BiopetFramework.iml index cd52bbb064aa631f89cd2e626866d9f7ee166ef3..576d75a0e85692d1617394c0fee61366299e856c 100644 --- a/biopet-framework/BiopetFramework.iml +++ b/biopet-framework/BiopetFramework.iml @@ -29,6 +29,7 @@ <orderEntry type="sourceFolder" forTests="false" /> <orderEntry type="library" name="com.twitter:algebird-core_2.10:0.8.1" level="project" /> <orderEntry type="library" name="gatk-protected" level="project" /> + <orderEntry type="library" name="Maven: org.mockito:mockito-all:1.9.5" level="project" /> <orderEntry type="library" scope="TEST" name="Maven: org.testng:testng:6.8" level="project" /> <orderEntry type="library" scope="TEST" name="Maven: junit:junit:4.10" level="project" /> <orderEntry type="library" scope="TEST" name="Maven: org.hamcrest:hamcrest-core:1.1" level="project" /> @@ -36,9 +37,9 @@ <orderEntry type="library" scope="TEST" name="Maven: com.beust:jcommander:1.27" level="project" /> <orderEntry type="library" scope="TEST" name="Maven: org.yaml:snakeyaml:1.6" level="project" /> <orderEntry type="library" name="Maven: org.scalatest:scalatest_2.11:2.2.1" level="project" /> - <orderEntry type="library" name="Maven: org.scala-lang:scala-library:2.11.2" level="project" /> <orderEntry type="library" name="Maven: org.scala-lang:scala-reflect:2.11.2" level="project" /> <orderEntry type="library" name="Maven: org.scala-lang.modules:scala-xml_2.11:1.0.2" level="project" /> + <orderEntry type="library" name="Maven: org.scala-lang:scala-library:2.11.2" level="project" /> <orderEntry type="library" name="Maven: org.broadinstitute.gatk:gatk-queue-package-distribution:3.3" level="project" /> <orderEntry type="library" name="Maven: io.argonaut:argonaut_2.11:6.1-M4" level="project" /> <orderEntry type="library" name="Maven: org.scalaz:scalaz-core_2.11:7.1.0" level="project" /> @@ -48,6 +49,6 @@ <orderEntry type="library" name="Maven: org.biojava:biojava3-sequencing:3.1.0" level="project" /> <orderEntry type="library" name="Maven: com.google.guava:guava:18.0" level="project" /> <orderEntry type="library" name="Maven: com.github.scopt:scopt_2.10:3.2.0" level="project" /> + <orderEntry type="library" name="Maven: org.mockito:mockito-all:1.9.5" level="project" /> </component> </module> - diff --git a/biopet-framework/pom.xml b/biopet-framework/pom.xml index d9c76caa15ed86cf8ce06c527fec4702358737dc..69c3c3ff4f7ac147c87ade7ec0c9a0c790090a33 100644 --- a/biopet-framework/pom.xml +++ b/biopet-framework/pom.xml @@ -72,6 +72,11 @@ <artifactId>scopt_2.10</artifactId> <version>3.2.0</version> </dependency> + <dependency> + <groupId>org.mockito</groupId> + <artifactId>mockito-all</artifactId> + <version>1.9.5</version> + </dependency> </dependencies> <build> <resources> @@ -105,7 +110,7 @@ <prefix>git</prefix> <dateFormat>dd.MM.yyyy '@' HH:mm:ss z</dateFormat> <verbose>false</verbose> - <useNativeGit>false</useNativeGit> + <useNativeGit>true</useNativeGit> <dotGitDirectory>${project.basedir}/../.git</dotGitDirectory> <skipPoms>false</skipPoms> <generateGitPropertiesFile>true</generateGitPropertiesFile> diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutable.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutable.scala index 0b377b8800ae297bc5bcc3d9a14a7f4b5679e281..80308d9f6393c38a5fb278c2fba39fb67fc2f6fe 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutable.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutable.scala @@ -21,6 +21,7 @@ object BiopetExecutable { nl.lumc.sasc.biopet.pipelines.yamsvp.Yamsvp), "tool" -> List( nl.lumc.sasc.biopet.tools.WipeReads, + nl.lumc.sasc.biopet.tools.ExtractAlignedFastq, nl.lumc.sasc.biopet.tools.BiopetFlagstat, nl.lumc.sasc.biopet.tools.CheckAllelesVcfInBam, nl.lumc.sasc.biopet.tools.VcfToTsv, @@ -28,7 +29,11 @@ object BiopetExecutable { nl.lumc.sasc.biopet.tools.FindRepeatsPacBio, nl.lumc.sasc.biopet.tools.BedToInterval, nl.lumc.sasc.biopet.tools.MpileupToVcf, - nl.lumc.sasc.biopet.tools.FastqSplitter) + nl.lumc.sasc.biopet.tools.FastqSplitter, + nl.lumc.sasc.biopet.tools.BedtoolsCoverageToCounts, + nl.lumc.sasc.biopet.tools.SageCountFastq, + nl.lumc.sasc.biopet.tools.SageCreateLibrary, + nl.lumc.sasc.biopet.tools.SageCreateTagCounts) ) /** @@ -37,7 +42,7 @@ object BiopetExecutable { def main(args: Array[String]): Unit = { def toBulletedList(m: List[MainCommand], kind: String = "", bullet: String = "-") = - "Available %ss:\n ".format(kind) + bullet + " " + m.map(x => x.commandName).sorted.mkString("\n " + bullet + " ") + "Available %s(s):\n ".format(kind) + bullet + " " + m.map(x => x.commandName).sorted.mkString("\n " + bullet + " ") def usage(module: String = null): String = { if (module != null) checkModule(module) diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/SamToFastq.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/SamToFastq.scala new file mode 100644 index 0000000000000000000000000000000000000000..5733c8b7331742164e03af896c7045e14948226c --- /dev/null +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/SamToFastq.scala @@ -0,0 +1,85 @@ +package nl.lumc.sasc.biopet.extensions.picard + +import java.io.File +import nl.lumc.sasc.biopet.core.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument } + +class SamToFastq(val root: Configurable) extends Picard { + javaMainClass = "picard.sam.SamToFastq" + + @Input(doc = "The input SAM or BAM files to analyze.", required = true) + var input: File = _ + + @Output(doc = "R1", required = true) + var fastqR1: File = _ + + @Output(doc = "R1", required = false) + var fastqR2: File = _ + + @Output(doc = "Unpaired", required = false) + var fastqUnpaired: File = _ + + @Argument(doc = "Output per readgroup", required = false) + var outputPerRg: Boolean = config("outputPerRg", default = false) + + @Argument(doc = "Output dir", required = false) + var outputDir: String = config("outputDir") + + @Argument(doc = "re reverse", required = false) + var reReverse: Boolean = config("reReverse", default = false) + + @Argument(doc = "The output file to bam file to", required = false) + var interleave: Boolean = config("interleave", default = false) + + @Argument(doc = "includeNonPjReads", required = false) + var includeNonPjReads: Boolean = config("includeNonPjReads", default = false) + + @Argument(doc = "clippingAtribute", required = false) + var clippingAtribute: String = config("clippingAtribute") + + @Argument(doc = "clippingAction", required = false) + var clippingAction: String = config("clippingAction") + + @Argument(doc = "read1Trim", required = false) + var read1Trim: Option[Int] = config("read1Trim") + + @Argument(doc = "read1MaxBasesToWrite", required = false) + var read1MaxBasesToWrite: Option[Int] = config("read1MaxBasesToWrite") + + @Argument(doc = "read2Trim", required = false) + var read2Trim: Option[Int] = config("read2Trim") + + @Argument(doc = "read2MaxBasesToWrite", required = false) + var read2MaxBasesToWrite: Option[Int] = config("read2MaxBasesToWrite") + + @Argument(doc = "includeNonPrimaryAlignments", required = false) + var includeNonPrimaryAlignments: Boolean = config("includeNonPrimaryAlignments", default = false) + + override def commandLine = super.commandLine + + required("INPUT=", input, spaceSeparated = false) + + required("FASTQ=", fastqR1, spaceSeparated = false) + + optional("SECOND_END_FASTQ=", fastqR2, spaceSeparated = false) + + optional("UNPAIRED_FASTQ=", fastqUnpaired, spaceSeparated = false) + + conditional(outputPerRg, "OUTPUT_PER_RG=TRUE") + + optional("OUTPUT_DIR=", outputDir, spaceSeparated = false) + + conditional(reReverse, "RE_REVERSE=TRUE") + + conditional(interleave, "INTERLEAVE=TRUE") + + conditional(includeNonPjReads, "INCLUDE_NON_PF_READS=TRUE") + + optional("CLIPPING_ATTRIBUTE=", clippingAtribute, spaceSeparated = false) + + optional("CLIPPING_ACTION=", clippingAction, spaceSeparated = false) + + optional("READ1_TRIM=", read1Trim, spaceSeparated = false) + + optional("READ1_MAX_BASES_TO_WRITE=", read1MaxBasesToWrite, spaceSeparated = false) + + optional("READ2_TRIM=", read2Trim, spaceSeparated = false) + + optional("READ2_MAX_BASES_TO_WRITE=", read2MaxBasesToWrite, spaceSeparated = false) + + conditional(includeNonPrimaryAlignments, "INCLUDE_NON_PRIMARY_ALIGNMENTS=TRUE") +} + +object SamToFastq { + def apply(root: Configurable, input: File, fastqR1: File, fastqR2: File = null): SamToFastq = { + val samToFastq = new SamToFastq(root) + samToFastq.input = input + samToFastq.fastqR1 = fastqR1 + samToFastq.fastqR2 = fastqR2 + return samToFastq + } +} \ No newline at end of file diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/Breakdancer.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/Breakdancer.scala index 7e1d70a98f4ab7866af10def3c097cb96b9b5de5..8c6cbb13b730f4ad12e9db86aa310f5dbb0af76c 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/Breakdancer.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/Breakdancer.scala @@ -8,7 +8,6 @@ import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument } import java.io.File - class BreakdancerConfig(val root: Configurable) extends BiopetCommandLineFunction { executable = config("exe", default = "bam2cfg.pl", freeVar = false) @@ -23,22 +22,22 @@ class BreakdancerConfig(val root: Configurable) extends BiopetCommandLineFunctio var min_insertsize: Option[Int] = config("min_insertsize", default = 450) var solid_data: Boolean = config("solid", default = false) var sd_cutoff: Option[Int] = config("sd_cutoff", default = 4) // Cutoff in unit of standard deviation [4] - + // we set this to a higher number to avoid biases in small numbers in sorted bams var min_observations: Option[Int] = config("min_observations", default = 10000) // Number of observation required to estimate mean and s.d. insert size [10_000] var coefvar_cutoff: Option[Int] = config("coef_cutoff", default = 1) // Cutoff on coefficients of variation [1] var histogram_bins: Option[Int] = config("histogram_bins", default = 50) // Number of bins in the histogram [50] - + def cmdLine = required(executable) + - optional("-q", min_mq) + - conditional(use_mq, "-m") + - optional("-s", min_insertsize) + - conditional(solid_data, "-s") + - optional("-c", sd_cutoff) + - optional("-n", min_observations) + - optional("-v", coefvar_cutoff) + - optional("-b", histogram_bins) + - required(input) + " 1> " + required(output) + optional("-q", min_mq) + + conditional(use_mq, "-m") + + optional("-s", min_insertsize) + + conditional(solid_data, "-s") + + optional("-c", sd_cutoff) + + optional("-n", min_observations) + + optional("-v", coefvar_cutoff) + + optional("-b", histogram_bins) + + required(input) + " 1> " + required(output) } object BreakdancerConfig { @@ -62,35 +61,30 @@ object BreakdancerConfig { private def swapExtension(inputFile: String) = inputFile.substring(0, inputFile.lastIndexOf(".bam")) + ".breakdancer.cfg" } - - - /* * The caller * **/ - -class BreakdancerCaller(val root: Configurable) extends BiopetCommandLineFunction { +class BreakdancerCaller(val root: Configurable) extends BiopetCommandLineFunction { executable = config("exe", default = "breakdancer-max", freeVar = false) - - override val defaultVmem = "4G" + + override val defaultVmem = "6G" override val defaultThreads = 1 // breakdancer can only work on 1 single thread - + override val versionRegex = """.*[Vv]ersion:? (.*)""".r override val versionExitcode = List(1) override def versionCommand = executable - - + @Input(doc = "The breakdancer configuration file") var input: File = _ - -// @Argument(doc = "Work directory") -// var workdir: String = _ - - @Output(doc = "Breakdancer VCF output") + + // @Argument(doc = "Work directory") + // var workdir: String = _ + + @Output(doc = "Breakdancer TSV output") var output: File = _ - + /* Options: -o STRING operate on a single chromosome [all chromosome] @@ -116,36 +110,36 @@ class BreakdancerCaller(val root: Configurable) extends BiopetCommandLineFunctio var q: Option[Int] = config("qs", default = 35) var r: Option[Int] = config("r", default = 2) var x: Option[Int] = config("x", default = 1000) - var b: Option[Int] = config("b", default = 100) + var b: Option[Int] = config("b", default = 100) var t: Boolean = config("t", default = false) var d: String = config("d") var g: String = config("g") var l: Boolean = config("l", default = false) var a: Boolean = config("a", default = false) var h: Boolean = config("h", default = false) - var y: Option[Int] = config("y", default = 30) - + var y: Option[Int] = config("y", default = 30) + override def beforeCmd { } - def cmdLine = required(executable) + - optional("-s", s) + - optional("-c", c) + - optional("-m", m) + - optional("-q", q) + - optional("-r", r) + - optional("-x", x) + - optional("-b", b) + - conditional(t ,"-t") + - optional("-d", d) + - optional("-g", g) + - conditional(l ,"-l") + - conditional(a ,"-a") + - conditional(h ,"-h") + - optional("-y", y) + - required(input) + - ">" + - required(output) + def cmdLine = required(executable) + + optional("-s", s) + + optional("-c", c) + + optional("-m", m) + + optional("-q", q) + + optional("-r", r) + + optional("-x", x) + + optional("-b", b) + + conditional(t, "-t") + + optional("-d", d) + + optional("-g", g) + + conditional(l, "-l") + + conditional(a, "-a") + + conditional(h, "-h") + + optional("-y", y) + + required(input) + + ">" + + required(output) } object BreakdancerCaller { @@ -160,21 +154,18 @@ object BreakdancerCaller { /// Breakdancer is actually a mini pipeline executing binaries from the breakdancer package class Breakdancer(val root: Configurable) extends QScript with BiopetQScript { def this() = this(null) - + @Input(doc = "Input file (bam)") var input: File = _ @Input(doc = "Reference Fasta file") var reference: File = _ - + @Argument(doc = "Work directory") var workdir: String = _ - -// @Output(doc = "Breakdancer VCF output") -// lazy val outputvcf: File = { -// new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.vcf") -// } - + + var deps: List[File] = Nil + @Output(doc = "Breakdancer config") lazy val configfile: File = { new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.cfg") @@ -183,32 +174,35 @@ class Breakdancer(val root: Configurable) extends QScript with BiopetQScript { lazy val outputraw: File = { new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.tsv") } - + @Output(doc = "Breakdancer VCF output") + lazy val outputvcf: File = { + new File(workdir + "/" + input.getName.substring(0, input.getName.lastIndexOf(".bam")) + ".breakdancer.vcf") + } + override def init() { } def biopetScript() { // read config and set all parameters for the pipeline logger.info("Starting Breakdancer configuration") - + val bdcfg = BreakdancerConfig(this, input, this.configfile) - outputFiles += ("breakdancer_cfg" -> bdcfg.output ) - add( bdcfg ) - - val output_tsv: File = this.outputraw - val breakdancer = BreakdancerCaller( this, bdcfg.output, output_tsv ) - add( breakdancer ) - outputFiles += ("breakdancer_tsv" -> breakdancer.output ) - -// val output_vcf: File = this.outputvcf - // convert this tsv to vcf using the python script - - + bdcfg.deps = this.deps + outputFiles += ("cfg" -> bdcfg.output) + add(bdcfg) + + val breakdancer = BreakdancerCaller(this, bdcfg.output, this.outputraw) + add(breakdancer) + outputFiles += ("tsv" -> breakdancer.output) + + val bdvcf = BreakdancerVCF(this, breakdancer.output, this.outputvcf) + add(bdvcf) + outputFiles += ("vcf" -> bdvcf.output) } } object Breakdancer extends PipelineCommand { - def apply(root: Configurable, input: File, reference: File, runDir: String): Breakdancer = { + def apply(root: Configurable, input: File, reference: File, runDir: String): Breakdancer = { val breakdancer = new Breakdancer(root) breakdancer.input = input breakdancer.reference = reference diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/BreakdancerVCF.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/BreakdancerVCF.scala new file mode 100644 index 0000000000000000000000000000000000000000..6869a590dbe05f3f6743ea05c61257321ecf72ae --- /dev/null +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/svcallers/BreakdancerVCF.scala @@ -0,0 +1,36 @@ +package nl.lumc.sasc.biopet.extensions.svcallers + +import java.io.File + +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } + +import argonaut._, Argonaut._ +import scalaz._, Scalaz._ + +import nl.lumc.sasc.biopet.core.config.Configurable +import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction + +class BreakdancerVCF(val root: Configurable) extends PythonCommandLineFunction { + setPythonScript("breakdancer2vcf.py") + + @Input(doc = "Breakdancer TSV") + var input: File = _ + + @Output(doc = "Output VCF to PATH") + var output: File = _ + + def cmdLine = { + getPythonCommand + + "-i " + required(input) + + "-o " + required(output) + } +} + +object BreakdancerVCF { + def apply(root: Configurable, input: File, output: File): BreakdancerVCF = { + val bd = new BreakdancerVCF(root) + bd.input = input + bd.output = output + return bd + } +} \ No newline at end of file diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala index a8b3daf680e02d130ed10c857c27114c918a6c21..9cd93a32dc2f0663bcedf50ebb40e53945458765 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/gatk/GatkPipeline.scala @@ -8,6 +8,7 @@ import scala.collection.JavaConversions._ import java.io.File import nl.lumc.sasc.biopet.extensions.gatk.{ CombineVariants, CombineGVCFs } import nl.lumc.sasc.biopet.extensions.picard.AddOrReplaceReadGroups +import nl.lumc.sasc.biopet.extensions.picard.SamToFastq import nl.lumc.sasc.biopet.pipelines.bammetrics.BamMetrics import nl.lumc.sasc.biopet.pipelines.mapping.Mapping import org.broadinstitute.gatk.queue.QScript @@ -169,36 +170,49 @@ class GatkPipeline(val root: Configurable) extends QScript with MultiSampleQScri var bamFile = new File(runConfig("bam").toString) if (!bamFile.exists) throw new IllegalStateException("Bam in config does not exist, file: " + bamFile) - var readGroupOke = true - val inputSam = new SAMFileReader(bamFile) - val header = inputSam.getFileHeader.getReadGroups - for (readGroup <- inputSam.getFileHeader.getReadGroups) { - if (readGroup.getSample != sampleID) logger.warn("Sample ID readgroup in bam file is not the same") - if (readGroup.getLibrary != runID) logger.warn("Library ID readgroup in bam file is not the same") - if (readGroup.getSample != sampleID || readGroup.getLibrary != runID) readGroupOke = false - } - inputSam.close - - if (!readGroupOke) { - if (config("correct_readgroups", default = false)) { - logger.info("Correcting readgroups, file:" + bamFile) - val aorrg = AddOrReplaceReadGroups(this, bamFile, new File(runDir + sampleID + "-" + runID + ".bam")) - aorrg.RGID = sampleID + "-" + runID - aorrg.RGLB = runID - aorrg.RGSM = sampleID - if (runConfig.contains("PL")) aorrg.RGPL = runConfig("PL").toString - else aorrg.RGPL = "illumina" - if (runConfig.contains("PU")) aorrg.RGPU = runConfig("PU").toString - else aorrg.RGPU = "na" - if (runConfig.contains("CN")) aorrg.RGCN = runConfig("CN").toString - add(aorrg, isIntermediate = true) - bamFile = aorrg.output - } else throw new IllegalStateException("Readgroup sample and/or library of input bamfile is not correct, file: " + bamFile + - "\nPossible to set 'correct_readgroups' to true on config to automatic fix this") + if (config("bam_to_fastq", default = false).getBoolean) { + val samToFastq = SamToFastq(this, bamFile, runDir + sampleID + "-" + runID + ".R1.fastq", + runDir + sampleID + "-" + runID + ".R2.fastq") + add(samToFastq, isIntermediate = true) + val mapping = Mapping.loadFromLibraryConfig(this, runConfig, sampleConfig, runDir, startJobs = false) + mapping.input_R1 = samToFastq.fastqR1 + mapping.input_R2 = samToFastq.fastqR2 + mapping.init + mapping.biopetScript + addAll(mapping.functions) // Add functions of mapping to curent function pool + libraryOutput.mappedBamFile = mapping.outputFiles("finalBamFile") + } else { + var readGroupOke = true + val inputSam = new SAMFileReader(bamFile) + val header = inputSam.getFileHeader.getReadGroups + for (readGroup <- inputSam.getFileHeader.getReadGroups) { + if (readGroup.getSample != sampleID) logger.warn("Sample ID readgroup in bam file is not the same") + if (readGroup.getLibrary != runID) logger.warn("Library ID readgroup in bam file is not the same") + if (readGroup.getSample != sampleID || readGroup.getLibrary != runID) readGroupOke = false + } + inputSam.close + + if (!readGroupOke) { + if (config("correct_readgroups", default = false)) { + logger.info("Correcting readgroups, file:" + bamFile) + val aorrg = AddOrReplaceReadGroups(this, bamFile, new File(runDir + sampleID + "-" + runID + ".bam")) + aorrg.RGID = sampleID + "-" + runID + aorrg.RGLB = runID + aorrg.RGSM = sampleID + if (runConfig.contains("PL")) aorrg.RGPL = runConfig("PL").toString + else aorrg.RGPL = "illumina" + if (runConfig.contains("PU")) aorrg.RGPU = runConfig("PU").toString + else aorrg.RGPU = "na" + if (runConfig.contains("CN")) aorrg.RGCN = runConfig("CN").toString + add(aorrg, isIntermediate = true) + bamFile = aorrg.output + } else throw new IllegalStateException("Readgroup sample and/or library of input bamfile is not correct, file: " + bamFile + + "\nPossible to set 'correct_readgroups' to true on config to automatic fix this") + } + addAll(BamMetrics(this, bamFile, runDir + "metrics/").functions) + + libraryOutput.mappedBamFile = bamFile } - addAll(BamMetrics(this, bamFile, runDir + "metrics/").functions) - - libraryOutput.mappedBamFile = bamFile } else logger.error("Sample: " + sampleID + ": No R1 found for run: " + runConfig) val gatkVariantcalling = new GatkVariantcalling(this) diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala index 8edc9727e0f3f36e9691162a1e5bf03b21284efd..d3085d3447cafaaf8579ed684fdead1c5c3c1e36 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala @@ -305,7 +305,8 @@ class Mapping(val root: Configurable) extends QScript with BiopetQScript { } object Mapping extends PipelineCommand { - def loadFromLibraryConfig(root: Configurable, runConfig: Map[String, Any], sampleConfig: Map[String, Any], runDir: String): Mapping = { + def loadFromLibraryConfig(root: Configurable, runConfig: Map[String, Any], sampleConfig: Map[String, Any], + runDir: String, startJobs: Boolean = true): Mapping = { val mapping = new Mapping(root) logger.debug("Mapping runconfig: " + runConfig) @@ -322,9 +323,11 @@ object Mapping extends PipelineCommand { if (runConfig.contains("PU")) mapping.RGPU = runConfig("PU").toString if (runConfig.contains("CN")) mapping.RGCN = runConfig("CN").toString mapping.outputDir = runDir - - mapping.init - mapping.biopetScript + + if (startJobs) { + mapping.init + mapping.biopetScript + } return mapping } } diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala index e501a3cf10d0e0a9a3c3e77f2f20f6e49f27bf90..36da78d18051e41b59085f98340a0575ef9ea89c 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/Sage.scala @@ -8,7 +8,11 @@ import nl.lumc.sasc.biopet.extensions.picard.MergeSamFiles import nl.lumc.sasc.biopet.pipelines.flexiprep.Flexiprep import nl.lumc.sasc.biopet.pipelines.mapping.Mapping import nl.lumc.sasc.biopet.scripts.PrefixFastq +import nl.lumc.sasc.biopet.tools.BedtoolsCoverageToCounts import nl.lumc.sasc.biopet.scripts.SquishBed +import nl.lumc.sasc.biopet.tools.SageCountFastq +import nl.lumc.sasc.biopet.tools.SageCreateLibrary +import nl.lumc.sasc.biopet.tools.SageCreateTagCounts import org.broadinstitute.gatk.queue.QScript import org.broadinstitute.gatk.queue.function._ @@ -64,7 +68,7 @@ class Sage(val root: Configurable) extends QScript with MultiSampleQScript { } if (tagsLibrary == null) { - val cdl = new CreateDeepsageLibrary(this) + val cdl = new SageCreateLibrary(this) cdl.input = transcriptome cdl.output = outputDir + "taglib/tag.lib" cdl.noAntiTagsOutput = outputDir + "taglib/no_antisense_genes.txt" @@ -181,12 +185,12 @@ class Sage(val root: Configurable) extends QScript with MultiSampleQScript { } def addTablibCounts(fastq:File, outputPrefix: String, outputDir: String) { - val countFastq = new CountFastq(this) + val countFastq = new SageCountFastq(this) countFastq.input = fastq countFastq.output = outputDir + outputPrefix + ".raw.counts" add(countFastq) - val createTagCounts = new CreateTagCounts(this) + val createTagCounts = new SageCreateTagCounts(this) createTagCounts.input = countFastq.output createTagCounts.tagLib = tagsLibrary createTagCounts.countSense = outputDir + outputPrefix + ".tagcount.sense.counts" diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/yamsvp/Yamsvp.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/yamsvp/Yamsvp.scala index 3c00eb3cc86d1c1858dd7f1641c9cc7d898cb7d2..16460f0fd66978f98ba79268cbf93e2a10425e00 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/yamsvp/Yamsvp.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/yamsvp/Yamsvp.scala @@ -8,7 +8,6 @@ import nl.lumc.sasc.biopet.core.config.Configurable import nl.lumc.sasc.biopet.core.MultiSampleQScript import nl.lumc.sasc.biopet.core.PipelineCommand -import nl.lumc.sasc.biopet.extensions.picard.MergeSamFiles import nl.lumc.sasc.biopet.extensions.sambamba.{ SambambaIndex, SambambaMerge } import nl.lumc.sasc.biopet.extensions.svcallers.pindel.Pindel import nl.lumc.sasc.biopet.extensions.svcallers.{ Breakdancer, Clever } @@ -84,24 +83,24 @@ class Yamsvp(val root: Configurable) extends QScript with MultiSampleQScript { mergeSamFiles.input = libraryBamfiles mergeSamFiles.output = alignmentDir + sampleID + ".merged.bam" add(mergeSamFiles) - - val bamIndex = SambambaIndex(root, mergeSamFiles.output) - add(bamIndex) - mergeSamFiles.output } else null + + val bamIndex = SambambaIndex(root, bamFile) + add(bamIndex) /// bamfile will be used as input for the SV callers. First run Clever // val cleverVCF : File = sampleDir + "/" + sampleID + ".clever.vcf" val cleverDir = svcallingDir + sampleID + ".clever/" val clever = Clever(this, bamFile, this.reference, svcallingDir, cleverDir) + clever.deps = List(bamIndex.output) sampleOutput.vcf += ("clever" -> List(clever.outputvcf)) add(clever) val breakdancerDir = svcallingDir + sampleID + ".breakdancer/" val breakdancer = Breakdancer(this, bamFile, this.reference, breakdancerDir) - sampleOutput.vcf += ("breakdancer" -> List(breakdancer.outputraw)) + sampleOutput.vcf += ("breakdancer" -> List(breakdancer.outputvcf)) addAll(breakdancer.functions) // for pindel we should use per library config collected into one config file diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/BedToInterval.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/BedToInterval.scala index 0a8cbdb3accf11ef641708b5a2dd4ac44f910467..550853dff22d871690bb71631b80d6c55dc19698 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/BedToInterval.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/BedToInterval.scala @@ -25,7 +25,7 @@ class BedToInterval(val root: Configurable) extends BiopetJavaCommandLineFunctio override val defaultVmem = "8G" memoryLimit = Option(4.0) - override def commandLine = super.commandLine + required(input) + required(bamFile) + required(output) + override def commandLine = super.commandLine + required("-I", input) + required("-b", bamFile) + required("-o", output) } object BedToInterval extends ToolCommand { @@ -45,13 +45,15 @@ object BedToInterval extends ToolCommand { return bedToInterval } - case class Args (inputFile:File = null, outputFile:File = null) extends AbstractArgs + case class Args (inputFile:File = null, outputFile:File = null, bamFile:File = null) extends AbstractArgs class OptParser extends AbstractOptParser { opt[File]('I', "inputFile") required() valueName("<file>") action { (x, c) => - c.copy(inputFile = x) } text("out is a required file property") + c.copy(inputFile = x) } opt[File]('o', "output") required() valueName("<file>") action { (x, c) => - c.copy(outputFile = x) } text("out is a required file property") + c.copy(outputFile = x) } + opt[File]('b', "bam") required() valueName("<file>") action { (x, c) => + c.copy(bamFile = x) } } /** @@ -63,7 +65,7 @@ object BedToInterval extends ToolCommand { val writer = new PrintWriter(commandArgs.outputFile) - val inputSam = new SAMFileReader(commandArgs.inputFile) + val inputSam = new SAMFileReader(commandArgs.bamFile) val refs = for (SQ <- inputSam.getFileHeader.getSequenceDictionary.getSequences.toArray) yield { val record = SQ.asInstanceOf[SAMSequenceRecord] writer.write("@SQ\tSN:" + record.getSequenceName + "\tLN:" + record.getSequenceLength + "\n") @@ -72,7 +74,7 @@ object BedToInterval extends ToolCommand { inputSam.close val refsMap = Map(refs:_*) - val bedFile = Source.fromFile(args(0)) + val bedFile = Source.fromFile(commandArgs.inputFile) for ( line <- bedFile.getLines; val split = line.split("\t") diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/BedtoolsCoverageToCounts.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/BedtoolsCoverageToCounts.scala similarity index 62% rename from biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/BedtoolsCoverageToCounts.scala rename to biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/BedtoolsCoverageToCounts.scala index 439febc7dd5e00001d960eb1845441d92751ee21..004d322caabfb8fe0dd24ad793b14b141371d30b 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/BedtoolsCoverageToCounts.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/BedtoolsCoverageToCounts.scala @@ -1,8 +1,9 @@ -package nl.lumc.sasc.biopet.pipelines.sage +package nl.lumc.sasc.biopet.tools import java.io.File import java.io.PrintWriter import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction +import nl.lumc.sasc.biopet.core.ToolCommand import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.utils.commandline.{ Input, Output } import scala.collection.JavaConversions._ @@ -27,26 +28,27 @@ class BedtoolsCoverageToCounts(val root: Configurable) extends BiopetJavaCommand required("-o", output) } -object BedtoolsCoverageToCounts { - var input: File = _ - var output: File = _ +object BedtoolsCoverageToCounts extends ToolCommand { + case class Args (input:File = null, output:File = null) extends AbstractArgs + + class OptParser extends AbstractOptParser { + opt[File]('I', "input") required() valueName("<file>") action { (x, c) => + c.copy(input = x) } + opt[File]('o', "output") required() unbounded() valueName("<file>") action { (x, c) => + c.copy(output = x) } + } /** * @param args the command line arguments */ def main(args: Array[String]): Unit = { - for (t <- 0 until args.size) { - args(t) match { - case "-I" => input = new File(args(t+1)) - case "-o" => output = new File(args(t+1)) - case _ => - } - } - if (input == null || !input.exists) throw new IllegalStateException("Input file not found, use -I") - if (output == null) throw new IllegalStateException("Output file not found, use -o") + val argsParser = new OptParser + val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1) + + if (!commandArgs.input.exists) throw new IllegalStateException("Input file not found, file: " + commandArgs.input) val counts:Map[String, Long] = Map() - for (line <- Source.fromFile(input).getLines) { + for (line <- Source.fromFile(commandArgs.input).getLines) { val values = line.split("\t") val gene = values(3) val count = values(6).toLong @@ -56,7 +58,7 @@ object BedtoolsCoverageToCounts { val sortedCounts:SortedMap[String, Long] = SortedMap(counts.toArray:_*) - val writer = new PrintWriter(output) + val writer = new PrintWriter(commandArgs.output) for ((seq,count) <- sortedCounts) { if (count > 0) writer.println(seq + "\t" + count) } diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/ExtractAlignedFastq.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/ExtractAlignedFastq.scala new file mode 100644 index 0000000000000000000000000000000000000000..83a607de4a035013cbf82072223f533380ff942c --- /dev/null +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/ExtractAlignedFastq.scala @@ -0,0 +1,249 @@ +/** + * Copyright (c) 2014 Leiden University Medical Center - Sequencing Analysis Support Core <sasc@lumc.nl> + * @author Wibowo Arindrarto <w.arindrarto@lumc.nl> + */ +package nl.lumc.sasc.biopet.tools + +import java.io.File + +import scala.collection.mutable.{ Set => MSet } +import scala.collection.JavaConverters._ + +import htsjdk.samtools.QueryInterval +import htsjdk.samtools.SamReaderFactory +import htsjdk.samtools.ValidationStringency +import htsjdk.samtools.fastq.{ BasicFastqWriter, FastqReader, FastqRecord } +import htsjdk.samtools.util.Interval + +import nl.lumc.sasc.biopet.core.ToolCommand + +object ExtractAlignedFastq extends ToolCommand { + + type FastqPair = (FastqRecord, FastqRecord) + + /** + * Function to create iterator over Interval given input interval string + * + * Valid interval strings are either of these: + * - chr5:10000-11000 + * - chr5:10,000-11,000 + * - chr5:10.000-11.000 + * - chr5:10000-11,000 + * In all cases above, the region span base #10,000 to base number #11,000 in chromosome 5 + * (first base is numbered 1) + * + * An interval string with a single base is also allowed: + * - chr5:10000 + * - chr5:10,000 + * - chr5:10.000 + * + * @param inStrings iterable yielding input interval string + */ + def makeIntervalFromString(inStrings: Iterable[String]): Iterator[Interval] = { + + // FIXME: can we combine these two patterns into one regex? + // matches intervals with start and end coordinates + val ptn1 = """([\w_-]+):([\d.,]+)-([\d.,]+)""".r + // matches intervals with start coordinate only + val ptn2 = """([\w_-]+):([\d.,]+)""".r + // make ints from coordinate strings + // NOTE: while it is possible for coordinates to exceed Int.MaxValue, we are limited + // by the Interval constructor only accepting ints + def intFromCoord(s: String): Int = s.replaceAll(",", "").replaceAll("\\.", "").toInt + + inStrings.map(x => x match { + case ptn1(chr, start, end) => new Interval(chr, intFromCoord(start), intFromCoord(end)) + case ptn2(chr, start) => + val startCoord = intFromCoord(start) + new Interval(chr, startCoord, startCoord) + case _ => throw new IllegalArgumentException("Invalid interval string: " + x) + }) + .toIterator + } + + /** + * Function to create object that checks whether a given FASTQ record is mapped + * to the given interval or not + * + * @param iv iterable yielding features to check + * @param inAln input SAM/BAM file + * @param minMapQ minimum mapping quality of read to include + * @param commonSuffixLength length of suffix common to all read pairs + * @return + */ + def makeMembershipFunction(iv: Iterator[Interval], + inAln: File, + minMapQ: Int = 0, + commonSuffixLength: Int = 0): (FastqPair => Boolean) = { + + val inAlnReader = SamReaderFactory + .make() + .validationStringency(ValidationStringency.LENIENT) + .open(inAln) + require(inAlnReader.hasIndex) + + def getSequenceIndex(name: String): Int = inAlnReader.getFileHeader.getSequenceIndex(name) match { + case x if x >= 0 => + x + case otherwise => + throw new IllegalArgumentException("Chromosome " + name + " is not found in the alignment file") + } + + val queries: Array[QueryInterval] = iv.toList + // sort Interval + .sortBy(x => (x.getSequence, x.getStart, x.getEnd)) + // transform to QueryInterval + .map(x => new QueryInterval(getSequenceIndex(x.getSequence), x.getStart, x.getEnd)) + // cast to array + .toArray + + lazy val selected: MSet[String] = inAlnReader + // query BAM file for overlapping reads + .queryOverlapping(queries) + // for Scala compatibility + .asScala + // filter based on mapping quality + .filter(x => x.getMappingQuality >= minMapQ) + // iteratively add read name to the selected set + .foldLeft(MSet.empty[String])( + (acc, x) => { + logger.debug("Adding " + x.getReadName + " to set ...") + acc += x.getReadName + } + ) + + (pair: FastqPair) => pair._2 match { + case null => selected.contains(pair._1.getReadHeader) + case otherwise => + require(commonSuffixLength < pair._1.getReadHeader.length) + require(commonSuffixLength < pair._2.getReadHeader.length) + selected.contains(pair._1.getReadHeader.dropRight(commonSuffixLength)) + } + } + + def selectFastqReads(memFunc: FastqPair => Boolean, + inputFastq1: FastqReader, + outputFastq1: BasicFastqWriter, + inputFastq2: FastqReader = null, + outputFastq2: BasicFastqWriter = null): Unit = { + + val i1 = inputFastq1.iterator.asScala + val i2 = inputFastq2 match { + case null => Iterator.continually(null) + case otherwise => otherwise.iterator.asScala + } + val o1 = outputFastq1 + val o2 = (inputFastq2, outputFastq2) match { + case (null, null) => null + case (_, null) => throw new IllegalArgumentException("Missing output FASTQ 2") + case (null, _) => throw new IllegalArgumentException("Output FASTQ 2 supplied but there is no input FASTQ 2") + case (x, y) => outputFastq2 + } + + logger.info("Writing output file(s) ...") + // zip, filter based on function, and write to output file(s) + i1.zip(i2) + .filter(rec => memFunc(rec._1, rec._2)) + .foreach { + case (rec1, null) => + o1.write(rec1) + case (rec1, rec2) => + o1.write(rec1) + o2.write(rec2) + } + + } + + case class Args(inputBam: File = null, + intervals: List[String] = List.empty[String], + inputFastq1: File = null, + inputFastq2: File = null, + outputFastq1: File = null, + outputFastq2: File = null, + minMapQ: Int = 0, + commonSuffixLength: Int = 0) extends AbstractArgs + + class OptParser extends AbstractOptParser { + + head( + s""" + |$commandName - Select aligned FASTQ records + """.stripMargin) + + opt[File]('I', "input_file") required () valueName "<bam>" action { (x, c) => + c.copy(inputBam = x) + } validate { + x => if (x.exists) success else failure("Input BAM file not found") + } text "Input BAM file" + + opt[String]('r', "interval") required () unbounded () valueName "<interval>" action { (x, c) => + // yes, we are appending and yes it's O(n) ~ preserving order is more important than speed here + c.copy(intervals = c.intervals :+ x) + } text "Interval strings" + + opt[File]('i', "in1") required () valueName "<fastq>" action { (x, c) => + c.copy(inputFastq1 = x) + } validate { + x => if (x.exists) success else failure("Input FASTQ file 1 not found") + } text "Input FASTQ file 1" + + opt[File]('j', "in2") optional () valueName "<fastq>" action { (x, c) => + c.copy(inputFastq1 = x) + } validate { + x => if (x.exists) success else failure("Input FASTQ file 2 not found") + } text "Input FASTQ file 2 (default: none)" + + opt[File]('o', "out1") required () valueName "<fastq>" action { (x, c) => + c.copy(outputFastq1 = x) + } text "Output FASTQ file 1" + + opt[File]('p', "out2") optional () valueName "<fastq>" action { (x, c) => + c.copy(outputFastq1 = x) + } text "Output FASTQ file 2 (default: none)" + + opt[Int]('Q', "min_mapq") optional () action { (x, c) => + c.copy(minMapQ = x) + } text "Minimum MAPQ of reads in target region to remove (default: 0)" + + opt[Int]('s', "read_suffix_length") optional () action { (x, c) => + c.copy(commonSuffixLength = x) + } text "Length of common suffix from each read pair (default: 0)" + + note( + """ + |This tool creates FASTQ file(s) containing reads mapped to the given alignment intervals. + """.stripMargin) + + checkConfig { c => + if (!c.inputBam.exists) + failure("Input BAM file not found") + else if (!c.inputFastq1.exists) + failure("Input FASTQ file 1 not found") + else if (c.inputFastq2 != null && c.outputFastq2 == null) + failure("Missing output FASTQ file 2") + else if (c.inputFastq2 == null && c.outputFastq2 != null) + failure("Missing input FASTQ file 2") + else + success + } + } + + def main(args: Array[String]): Unit = { + + val commandArgs: Args = new OptParser() + .parse(args, Args()) + .getOrElse(sys.exit(1)) + + val memFunc = makeMembershipFunction( + iv = makeIntervalFromString(commandArgs.intervals), + inAln = commandArgs.inputBam, + minMapQ = commandArgs.minMapQ, + commonSuffixLength = commandArgs.commonSuffixLength) + + selectFastqReads(memFunc, + inputFastq1 = new FastqReader(commandArgs.inputFastq1), + inputFastq2 = new FastqReader(commandArgs.inputFastq2), + outputFastq1 = new BasicFastqWriter(commandArgs.outputFastq1), + outputFastq2 = new BasicFastqWriter(commandArgs.outputFastq2)) + } +} diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/CountFastq.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SageCountFastq.scala similarity index 58% rename from biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/CountFastq.scala rename to biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SageCountFastq.scala index e8f57b25014b87921efb4cd83ca3f49260113f06..a07cc4b8bb8eb73aeb00d1dbc2b2880b761b6acc 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/CountFastq.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SageCountFastq.scala @@ -1,8 +1,9 @@ -package nl.lumc.sasc.biopet.pipelines.sage +package nl.lumc.sasc.biopet.tools import java.io.File import java.io.PrintWriter import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction +import nl.lumc.sasc.biopet.core.ToolCommand import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.utils.commandline.{ Input, Output } import org.biojava3.sequencing.io.fastq.{SangerFastqReader, StreamListener, Fastq} @@ -11,7 +12,7 @@ import scala.collection.SortedMap import scala.collection.mutable.Map import java.io.FileReader -class CountFastq(val root: Configurable) extends BiopetJavaCommandLineFunction { +class SageCountFastq(val root: Configurable) extends BiopetJavaCommandLineFunction { javaMainClass = getClass.getName @Input(doc = "Input fasta", shortName = "input", required = true) @@ -28,29 +29,30 @@ class CountFastq(val root: Configurable) extends BiopetJavaCommandLineFunction { required("-o", output) } -object CountFastq { - var input: File = _ - var output: File = _ +object SageCountFastq extends ToolCommand { + case class Args (input:File = null, output:File = null) extends AbstractArgs + + class OptParser extends AbstractOptParser { + opt[File]('I', "input") required() valueName("<file>") action { (x, c) => + c.copy(input = x) } + opt[File]('o', "output") required() unbounded() valueName("<file>") action { (x, c) => + c.copy(output = x) } + } /** * @param args the command line arguments */ def main(args: Array[String]): Unit = { - for (t <- 0 until args.size) { - args(t) match { - case "-I" => input = new File(args(t+1)) - case "-o" => output = new File(args(t+1)) - case _ => - } - } - if (input == null || !input.exists) throw new IllegalStateException("Input file not found, use -I") - if (output == null) throw new IllegalStateException("Output file not found, use -o") + val argsParser = new OptParser + val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1) + + if (!commandArgs.input.exists) throw new IllegalStateException("Input file not found, file: " + commandArgs.input) val counts:Map[String, Long] = Map() val reader = new SangerFastqReader var count = 0 - System.err.println("Reading fastq file: " + input) - val fileReader = new FileReader(input) + logger.info("Reading fastq file: " + commandArgs.input) + val fileReader = new FileReader(commandArgs.input) reader.stream(fileReader, new StreamListener { def fastq(fastq:Fastq) { val seq = fastq.getSequence @@ -60,13 +62,13 @@ object CountFastq { if (count % 1000000 == 0) System.err.println(count + " sequences done") } }) - System.err.println(count + " sequences done") + logger.info(count + " sequences done") - System.err.println("Sorting") + logger.info("Sorting") val sortedCounts:SortedMap[String, Long] = SortedMap(counts.toArray:_*) - System.err.println("Writting outputfile: " + output) - val writer = new PrintWriter(output) + logger.info("Writting outputfile: " + commandArgs.output) + val writer = new PrintWriter(commandArgs.output) for ((seq,count) <- sortedCounts) { writer.println(seq + "\t" + count) } diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/CreateDeepsageLibrary.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SageCreateLibrary.scala similarity index 68% rename from biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/CreateDeepsageLibrary.scala rename to biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SageCreateLibrary.scala index 7ed2fb0e89e3f9eeaca81e8d7481654feadf3b69..b339ab2471e756e654fbbe621e15a1ac7f5a36cb 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/CreateDeepsageLibrary.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SageCreateLibrary.scala @@ -1,8 +1,9 @@ -package nl.lumc.sasc.biopet.pipelines.sage +package nl.lumc.sasc.biopet.tools import java.io.File import java.io.PrintWriter import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction +import nl.lumc.sasc.biopet.core.ToolCommand import nl.lumc.sasc.biopet.core.config.Configurable import org.biojava3.core.sequence.DNASequence import org.biojava3.core.sequence.io.FastaReaderHelper @@ -10,8 +11,9 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Output } import scala.collection.SortedMap import scala.collection.mutable.{Map, Set} import scala.collection.JavaConversions._ +import scala.util.matching.Regex -class CreateDeepsageLibrary(val root: Configurable) extends BiopetJavaCommandLineFunction { +class SageCreateLibrary(val root: Configurable) extends BiopetJavaCommandLineFunction { javaMainClass = getClass.getName @Input(doc = "Input fasta", shortName = "input", required = true) @@ -37,22 +39,35 @@ class CreateDeepsageLibrary(val root: Configurable) extends BiopetJavaCommandLin override def commandLine = super.commandLine + required("-I", input) + - optional("-tag", tag) + - optional("-length", length) + - optional("-notag", noTagsOutput) + - optional("-noantitag", noAntiTagsOutput) + + optional("--tag", tag) + + optional("--length", length) + + optional("--notag", noTagsOutput) + + optional("--noantitag", noAntiTagsOutput) + required("-o", output) } -object CreateDeepsageLibrary { - var tag = "CATG" - var length = 17 - var input: File = _ - var noTagsOutput: File = _ - var noAntiTagsOutput: File = _ - var allGenesOutput: File = _ - var output: File = _ - lazy val tagRegex = (tag + "[CATG]{" + length + "}").r +object SageCreateLibrary extends ToolCommand { + case class Args (input:File = null, tag:String = "CATG", length:Int = 17,output:File = null, noTagsOutput:File = null, + noAntiTagsOutput:File = null, allGenesOutput:File = null) extends AbstractArgs + + class OptParser extends AbstractOptParser { + opt[File]('I', "input") required() unbounded() valueName("<file>") action { (x, c) => + c.copy(input = x) } + opt[File]('o', "output") required() unbounded() valueName("<file>") action { (x, c) => + c.copy(output = x) } + opt[String]("tag") required() unbounded() action { (x, c) => + c.copy(tag = x) } + opt[Int]("length") required() unbounded() action { (x, c) => + c.copy(length = x) } + opt[File]("noTagsOutput") required() unbounded() valueName("<file>") action { (x, c) => + c.copy(noTagsOutput = x) } + opt[File]("noAntiTagsOutput") required() unbounded() valueName("<file>") action { (x, c) => + c.copy(noAntiTagsOutput = x) } + opt[File]("allGenesOutput") required() unbounded() valueName("<file>") action { (x, c) => + c.copy(allGenesOutput = x) } + } + + var tagRegex: Regex = null var geneRegex = """ENSG[0-9]{11}""".r val tagGenesMap: Map[String, TagGenes] = Map() @@ -73,24 +88,16 @@ object CreateDeepsageLibrary { * @param args the command line arguments */ def main(args: Array[String]): Unit = { - for (t <- 0 until args.size) { - args(t) match { - case "-I" => input = new File(args(t+1)) - case "-tag" => tag = args(t+1) - case "-length" => length = args(t+1).toInt - case "-o" => output = new File(args(t+1)) - case "-notag" => noTagsOutput = new File(args(t+1)) - case "-noantitag" => noAntiTagsOutput = new File(args(t+1)) - case "-allgenes" => allGenesOutput = new File(args(t+1)) - case _ => - } - } - if (input == null || !input.exists) throw new IllegalStateException("Input file not found, use -I") - if (output == null) throw new IllegalStateException("Output file not found, use -o") + val argsParser = new OptParser + val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1) + + if (!commandArgs.input.exists) throw new IllegalStateException("Input file not found, file: " + commandArgs.input) + + tagRegex = (commandArgs.tag + "[CATG]{" + commandArgs.length + "}").r var count = 0 System.err.println("Reading fasta file") - val reader = FastaReaderHelper.readFastaDNASequence(input) + val reader = FastaReaderHelper.readFastaDNASequence(commandArgs.input) System.err.println("Finding tags") for ((name, seq) <- reader) { getTags(name, seq) @@ -103,7 +110,7 @@ object CreateDeepsageLibrary { val tagGenesMapSorted:SortedMap[String, TagGenes] = SortedMap(tagGenesMap.toArray:_*) System.err.println("Writting output files") - val writer = new PrintWriter(output) + val writer = new PrintWriter(commandArgs.output) writer.println("#tag\tfirstTag\tAllTags\tFirstAntiTag\tAllAntiTags") for ((tag, genes) <- tagGenesMapSorted) { val line = tag + "\t" + genes.firstTag.mkString(",") + @@ -114,24 +121,24 @@ object CreateDeepsageLibrary { } writer.close() - if (noTagsOutput != null) { - val writer = new PrintWriter(noTagsOutput) + if (commandArgs.noTagsOutput != null) { + val writer = new PrintWriter(commandArgs.noTagsOutput) for (gene <- allGenes if !tagGenes.contains(gene)) { writer.println(gene) } writer.close } - if (noAntiTagsOutput != null) { - val writer = new PrintWriter(noAntiTagsOutput) + if (commandArgs.noAntiTagsOutput != null) { + val writer = new PrintWriter(commandArgs.noAntiTagsOutput) for (gene <- allGenes if !antiTagGenes.contains(gene)) { writer.println(gene) } writer.close } - if (allGenesOutput != null) { - val writer = new PrintWriter(allGenesOutput) + if (commandArgs.allGenesOutput != null) { + val writer = new PrintWriter(commandArgs.allGenesOutput) for (gene <- allGenes) { writer.println(gene) } diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/CreateTagCounts.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SageCreateTagCounts.scala similarity index 63% rename from biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/CreateTagCounts.scala rename to biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SageCreateTagCounts.scala index 0cf113051596c5ae8f7a8cf90897bf864daacb54..1bfc6e0f8606c3ed14576791f9ac50a8cfa99b04 100644 --- a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/pipelines/sage/CreateTagCounts.scala +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/SageCreateTagCounts.scala @@ -1,15 +1,16 @@ -package nl.lumc.sasc.biopet.pipelines.sage +package nl.lumc.sasc.biopet.tools import java.io.File import java.io.PrintWriter import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction +import nl.lumc.sasc.biopet.core.ToolCommand import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.utils.commandline.{ Input, Output } import scala.io.Source import scala.collection.mutable.Map import scala.collection.SortedMap -class CreateTagCounts(val root: Configurable) extends BiopetJavaCommandLineFunction { +class SageCreateTagCounts(val root: Configurable) extends BiopetJavaCommandLineFunction { javaMainClass = getClass.getName @Input(doc = "Raw count file", shortName = "input", required = true) @@ -35,42 +36,43 @@ class CreateTagCounts(val root: Configurable) extends BiopetJavaCommandLineFunct override def commandLine = super.commandLine + required("-I", input) + - required("-taglib", tagLib) + - optional("-sense", countSense) + - optional("-allsense", countAllSense) + - optional("-antisense", countAntiSense) + - optional("-allantisense", countAllAntiSense) + required("--taglib", tagLib) + + optional("--sense", countSense) + + optional("--allsense", countAllSense) + + optional("--antisense", countAntiSense) + + optional("--allantisense", countAllAntiSense) } -object CreateTagCounts { - var input: File = _ - var tagLib: File = _ - var countSense: File = _ - var countAllSense: File = _ - var countAntiSense: File = _ - var countAllAntiSense: File = _ +object SageCreateTagCounts extends ToolCommand { + case class Args (input:File = null, tagLib:File = null, countSense:File = null, countAllSense:File = null, + countAntiSense:File = null, countAllAntiSense:File = null) extends AbstractArgs + + class OptParser extends AbstractOptParser { + opt[File]('I', "input") required() unbounded() valueName("<file>") action { (x, c) => + c.copy(input = x) } + opt[File]('t', "tagLib") required() unbounded() valueName("<file>") action { (x, c) => + c.copy(tagLib = x) } + opt[File]("countSense") unbounded() valueName("<file>") action { (x, c) => + c.copy(countSense = x) } + opt[File]("countAllSense") unbounded() valueName("<file>") action { (x, c) => + c.copy(countAllSense = x) } + opt[File]("countAntiSense") unbounded() valueName("<file>") action { (x, c) => + c.copy(countAntiSense = x) } + opt[File]("countAllAntiSense") unbounded() valueName("<file>") action { (x, c) => + c.copy(countAllAntiSense = x) } + } /** * @param args the command line arguments */ def main(args: Array[String]): Unit = { + val argsParser = new OptParser + val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1) - for (t <- 0 until args.size) { - args(t) match { - case "-I" => input = new File(args(t+1)) - case "-taglib" => tagLib = new File(args(t+1)) - case "-sense" => countSense = new File(args(t+1)) - case "-allsense" => countAllSense = new File(args(t+1)) - case "-antisense" => countAntiSense = new File(args(t+1)) - case "-allantisense" => countAllAntiSense = new File(args(t+1)) - case _ => - } - } - if (input == null || !input.exists) throw new IllegalStateException("Input file not found, use -I") - if (tagLib == null) throw new IllegalStateException("Output file not found, use -o") + if (!commandArgs.input.exists) throw new IllegalStateException("Input file not found, file: " + commandArgs.input) val rawCounts: Map[String, Long] = Map() - for (line <- Source.fromFile(input).getLines) { + for (line <- Source.fromFile(commandArgs.input).getLines) { val values = line.split("\t") val gene = values(0) val count = values(1).toLong @@ -83,7 +85,7 @@ object CreateTagCounts { val antiSenseCounts: Map[String, Long] = Map() val allAntiSenseCounts: Map[String, Long] = Map() - for (line <- Source.fromFile(tagLib).getLines if !line.startsWith("#")) { + for (line <- Source.fromFile(commandArgs.tagLib).getLines if !line.startsWith("#")) { val values = line.split("\t") val tag = values(0) val sense = values(1) @@ -126,9 +128,9 @@ object CreateTagCounts { writer.close } } - writeFile(countSense, senseCounts) - writeFile(countAllSense, allSenseCounts) - writeFile(countAntiSense, antiSenseCounts) - writeFile(countAllAntiSense, allAntiSenseCounts) + writeFile(commandArgs.countSense, senseCounts) + writeFile(commandArgs.countAllSense, allSenseCounts) + writeFile(commandArgs.countAntiSense, antiSenseCounts) + writeFile(commandArgs.countAllAntiSense, allAntiSenseCounts) } } \ No newline at end of file diff --git a/biopet-framework/src/main/scripts/nl/lumc/sasc/biopet/extensions/svcallers/breakdancer2vcf.py b/biopet-framework/src/main/scripts/nl/lumc/sasc/biopet/extensions/svcallers/breakdancer2vcf.py new file mode 100644 index 0000000000000000000000000000000000000000..be19779229fe65b7501e1ef6b4b409d522684eeb --- /dev/null +++ b/biopet-framework/src/main/scripts/nl/lumc/sasc/biopet/extensions/svcallers/breakdancer2vcf.py @@ -0,0 +1,153 @@ +#!/usr/bin/env python + +__copyright__ = """ +Copyright (C) 2013 - Tim te Beek +Copyright (C) 2013 - Wai Yi Leung +Copyright (C) 2013 AllBio (see AUTHORS file) +""" + +__desc__ = """Convert breakdancer output to pseudo .vcf file format.""" +__created__ = "Mar 18, 2013" +__author__ = "tbeek" + +import argparse +import csv +import os.path +import sys +import datetime + + +def main(tsvfile, vcffile): + ''' + :param tsvfile: filename of input file.tsv + :type tsvfile: string + :param vcffile: filename of output file.vcf + :type vcffile: string + ''' + with open(tsvfile) as reader: + # Parse file + dictreader = _parse_tsvfile(reader) + print dictreader.fieldnames + + # Write out file + _format_vcffile(dictreader, vcffile) + + # Quick output + with open(vcffile) as reader: + print reader.read(1000) + + +def _parse_tsvfile(readable): + ''' + Read readable using csv.Sniffer and csv.DictReader + :param readable: open file.tsv handle to read with csv.DictReader + :type readable: file + ''' + prev, curr = 0, 0 + while True: + line = readable.readline() + if not line.startswith('#'): + # lets start from prev # line, without the hash sign + readable.seek(prev + 1) + break + else: + prev = curr + curr = readable.tell() + + # Determine dialect + curr = readable.tell() + #dialect = csv.Sniffer().sniff(readable.read(3000)) + dialect = 'excel-tab' + readable.seek(curr) + + # Read file + dictreader = csv.DictReader(readable, dialect=dialect) + return dictreader + + +_tsv_fields = ('Chr1', 'Pos1', 'Orientation1', + 'Chr2', 'Pos2', 'Orientation2', + 'Type', 'Size', 'Score', + 'num_Reads', 'num_Reads_lib', + 'ERR031544.sort.bam') +# 'Chr1': '1', +# 'Pos1': '269907', +# 'Orientation1': '39+39-', +# 'Chr2': '1', +# 'Pos2': '270919', +# 'Orientation2': '39+39-', +# 'Type': 'DEL', +# 'Size': '99', +# 'Score': '99', +# 'num_Reads': '38', +# 'num_Reads_lib': '/home/allbio/ERR031544.sort.bam|38', +# 'ERR031544.sort.bam': 'NA' + + + +_vcf_fields = ('CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'default') + +TS_NOW = datetime.datetime.now() + +VCF_HEADER = """##fileformat=VCFv4.1 +##fileDate={filedate} +##source=breakdancer-max +##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data"> +##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth"> +##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency"> +##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Length of variation"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality"> +##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">""".format( filedate=TS_NOW.strftime( "%Y%m%d" ) ) + +def _format_vcffile(dictreader, vcffile): + ''' + Create a pseudo .vcf file based on values read from DictReader instance. + :param dictreader: DictReader instance to read data from + :type dictreader: csv.DictRedaer + :param vcffile: output file.vcf filename + :type vcffile: string + ''' + FORMAT = "GT:DP" + with open(vcffile, mode='w') as writer: + writer.write('{header}\n#{columns}\n'.format(header=VCF_HEADER, columns='\t'.join(_vcf_fields))) + output_vcf = [] + for line in dictreader: + CHROM = line['Chr1'] + # TODO Figure out whether we have zero or one based positioning + POS = int(line['Pos1']) + ALT = '.' + SVEND = int(line['Pos2']) + + INFO = 'PROGRAM=breakdancer;SVTYPE={}'.format(line['Type']) + + if line['Type'] not in ['CTX']: + INFO += ';SVLEN={}'.format(int(line['Size'])) + INFO += ";SVEND={}".format(SVEND) + + # write alternate ALT field for Intrachromosomal translocations + if line['Type'] in ['CTX']: + ALT = "N[{}:{}[".format(line['Chr2'], line['Pos2']) + + + + SAMPLEINFO = "{}:{}".format( '1/.', line['num_Reads'] ) + # Create record + output_vcf.append([CHROM, POS, '.', '.', ALT, '.', 'PASS', INFO, FORMAT, SAMPLEINFO]) + + # Sort all results + output_vcf.sort() + output = "\n".join(["\t".join(map(str,vcf_row)) for vcf_row in output_vcf]) + # Write record + writer.write(output) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-i', '--breakdancertsv', dest='breakdancertsv', type=str, + help='Breakdancer TSV outputfile') + parser.add_argument('-o', '--outputvcf', dest='outputvcf', type=str, + help='Output vcf to') + + args = parser.parse_args() + main(args.breakdancertsv, args.outputvcf) diff --git a/biopet-framework/src/test/resources/paired01a.fq b/biopet-framework/src/test/resources/paired01a.fq new file mode 100644 index 0000000000000000000000000000000000000000..530a1bb9bada4865a047a8c8fe73dd5700d0b84d --- /dev/null +++ b/biopet-framework/src/test/resources/paired01a.fq @@ -0,0 +1,20 @@ +@r01/1 +A ++ +H +@r02/1 +T ++ +I +@r03/1 +G ++ +H +@r04/1 +C ++ +I +@r05/1 +A ++ +H diff --git a/biopet-framework/src/test/resources/paired01b.fq b/biopet-framework/src/test/resources/paired01b.fq new file mode 100644 index 0000000000000000000000000000000000000000..72cf9246dcce5c29a2500ced269dc3c17fd18847 --- /dev/null +++ b/biopet-framework/src/test/resources/paired01b.fq @@ -0,0 +1,20 @@ +@r01/2 +T ++ +I +@r02/2 +A ++ +H +@r03/2 +C ++ +I +@r04/2 +G ++ +H +@r05/2 +T ++ +I diff --git a/biopet-framework/src/test/resources/single01.fq b/biopet-framework/src/test/resources/single01.fq new file mode 100644 index 0000000000000000000000000000000000000000..acf48ca7e1a85ccaee4566da3c1c9094cce84c91 --- /dev/null +++ b/biopet-framework/src/test/resources/single01.fq @@ -0,0 +1,20 @@ +@r01 +A ++ +H +@r02 +T ++ +I +@r03 +G ++ +H +@r04 +C ++ +I +@r05 +A ++ +H diff --git a/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/ExtractAlignedFastqUnitTest.scala b/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/ExtractAlignedFastqUnitTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..0dac57875f5ae3966b009d9605a2e7c7d55b5d96 --- /dev/null +++ b/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/ExtractAlignedFastqUnitTest.scala @@ -0,0 +1,237 @@ +/** + * Copyright (c) 2014 Leiden University Medical Center - Sequencing Analysis Support Core <sasc@lumc.nl> + * @author Wibowo Arindrarto <w.arindrarto@lumc.nl> + */ +package nl.lumc.sasc.biopet.tools + +import java.io.File +import java.nio.file.Paths + +import org.mockito.Matchers._ +import org.mockito.Mockito._ +import org.scalatest.Matchers +import org.scalatest.mock.MockitoSugar +import org.scalatest.testng.TestNGSuite +import org.testng.annotations.{ DataProvider, Test } + +import htsjdk.samtools.util.Interval +import htsjdk.samtools.fastq.{ BasicFastqWriter, FastqReader, FastqRecord } + +class ExtractAlignedFastqUnitTest extends TestNGSuite with MockitoSugar with Matchers { + + import ExtractAlignedFastq._ + + private def resourceFile(p: String): File = + new File(Paths.get(getClass.getResource(p).toURI).toString) + + private def makeInterval(chr: String, start: Int, end: Int): Interval = + new Interval(chr, start, end) + + private def makeRecord(header: String): FastqRecord = + new FastqRecord(header, "ATGC", "", "HIHI") + + private def makeSingleRecords(headers: String*): Map[String, FastqPair] = + headers.map(x => (x, (makeRecord(x), null))).toMap + + private def makePairRecords(headers: (String, (String, String))*): Map[String, FastqPair] = + headers.map(x => (x._1, (makeRecord(x._2._1), makeRecord(x._2._2)))).toMap + + private def makeClue(tName: String, f: File, rName: String): String = + tName + " on " + f.getName + ", read " + rName + ": " + + @Test def testIntervalStartEnd() = { + val obs = makeIntervalFromString(List("chr5:1000-1100")).next() + val exp = new Interval("chr5", 1000, 1100) + obs.getSequence should === (exp.getSequence) + obs.getStart should === (exp.getStart) + obs.getEnd should === (exp.getEnd) + } + + @Test def testIntervalStartEndComma() = { + val obs = makeIntervalFromString(List("chr5:1,000-1,100")).next() + val exp = new Interval("chr5", 1000, 1100) + obs.getSequence should === (exp.getSequence) + obs.getStart should === (exp.getStart) + obs.getEnd should === (exp.getEnd) + } + + @Test def testIntervalStartEndDot() = { + val obs = makeIntervalFromString(List("chr5:1.000-1.100")).next() + val exp = new Interval("chr5", 1000, 1100) + obs.getSequence should === (exp.getSequence) + obs.getStart should === (exp.getStart) + obs.getEnd should === (exp.getEnd) + } + + @Test def testIntervalStart() = { + val obs = makeIntervalFromString(List("chr5:1000")).next() + val exp = new Interval("chr5", 1000, 1000) + obs.getSequence should === (exp.getSequence) + obs.getStart should === (exp.getStart) + obs.getEnd should === (exp.getEnd) + } + + @Test def testIntervalError() = { + val thrown = intercept[IllegalArgumentException] { + makeIntervalFromString(List("chr5:1000-")).next() + } + thrown.getMessage should === ("Invalid interval string: chr5:1000-") + } + + @Test def testMemFuncIntervalError() = { + val iv = Iterator(new Interval("chrP", 1, 1000)) + val inAln = resourceFile("/single01.bam") + val thrown = intercept[IllegalArgumentException] { + makeMembershipFunction(iv, inAln) + } + thrown.getMessage should === ("Chromosome chrP is not found in the alignment file") + } + + @DataProvider(name = "singleAlnProvider1", parallel = true) + def singleAlnProvider1() = { + val sFastq1 = makeSingleRecords("r01", "r02", "r03", "r04", "r05") + val sFastq1Default = sFastq1.keys.map(x => (x, false)).toMap + val sBam01 = resourceFile("/single01.bam") + + Array( + Array("adjacent left", + makeInterval("chrQ", 30, 49), sBam01, sFastq1, sFastq1Default), + Array("adjacent right", + makeInterval("chrQ", 200, 210), sBam01, sFastq1, sFastq1Default), + Array("no overlap", + makeInterval("chrQ", 220, 230), sBam01, sFastq1, sFastq1Default), + Array("partial overlap", + makeInterval("chrQ", 430, 460), sBam01, sFastq1, sFastq1Default.updated("r04", true)), + Array("enveloped", + makeInterval("chrQ", 693, 698), sBam01, sFastq1, sFastq1Default.updated("r03", true)) + ) + } + + @Test(dataProvider = "singleAlnProvider1") + def testSingleBamDefault(name: String, feat: Interval, inAln: File, + fastqMap: Map[String, FastqPair], resultMap: Map[String, Boolean]) = { + require(resultMap.keySet == fastqMap.keySet) + val memFunc = makeMembershipFunction(Iterator(feat), inAln) + for ((key, (rec1, rec2)) <- fastqMap) { + withClue(makeClue(name, inAln, key)) { + memFunc(rec1, rec2) shouldBe resultMap(key) + } + } + } + + @DataProvider(name = "singleAlnProvider2", parallel = true) + def singleAlnProvider2() = { + val sFastq2 = makeSingleRecords("r01", "r02", "r04", "r07", "r06", "r08") + val sFastq2Default = sFastq2.keys.map(x => (x, false)).toMap + val sBam02 = resourceFile("/single02.bam") + + Array( + Array("less than minimum MAPQ", + makeInterval("chrQ", 830, 890), sBam02, 60, sFastq2, sFastq2Default), + Array("greater than minimum MAPQ", + makeInterval("chrQ", 830, 890), sBam02, 20, sFastq2, sFastq2Default.updated("r07", true)), + Array("equal to minimum MAPQ", + makeInterval("chrQ", 260, 320), sBam02, 30, sFastq2, sFastq2Default.updated("r01", true)) + ) + } + + @Test(dataProvider = "singleAlnProvider2") + def testSingleBamMinMapQ(name: String, feat: Interval, inAln: File, minMapQ: Int, + fastqMap: Map[String, FastqPair], resultMap: Map[String, Boolean]) = { + require(resultMap.keySet == fastqMap.keySet) + val memFunc = makeMembershipFunction(Iterator(feat), inAln, minMapQ) + for ((key, (rec1, rec2)) <- fastqMap) { + withClue(makeClue(name, inAln, key)) { + memFunc(rec1, rec2) shouldBe resultMap(key) + } + } + } + @DataProvider(name = "pairAlnProvider1", parallel = true) + def pairAlnProvider1() = { + val pFastq1 = makePairRecords( + ("r01", ("r01/1", "r01/2")), + ("r02", ("r02/1", "r02/2")), + ("r03", ("r03/1", "r03/2")), + ("r04", ("r04/1", "r04/2")), + ("r05", ("r05/1", "r05/2"))) + val pFastq1Default = pFastq1.keys.map(x => (x, false)).toMap + val pBam01 = resourceFile("/paired01.bam") + + Array( + Array("adjacent left", + makeInterval("chrQ", 30, 49), pBam01, pFastq1, pFastq1Default), + Array("adjacent right", + makeInterval("chrQ", 200, 210), pBam01, pFastq1, pFastq1Default), + Array("no overlap", + makeInterval("chrQ", 220, 230), pBam01, pFastq1, pFastq1Default), + Array("partial overlap", + makeInterval("chrQ", 430, 460), pBam01, pFastq1, pFastq1Default.updated("r04", true)), + Array("enveloped", + makeInterval("chrQ", 693, 698), pBam01, pFastq1, pFastq1Default.updated("r03", true)), + Array("in intron", + makeInterval("chrQ", 900, 999), pBam01, pFastq1, pFastq1Default.updated("r05", true)) + ) + } + + @Test(dataProvider = "pairAlnProvider1") + def testPairBamDefault(name: String, feat: Interval, inAln: File, + fastqMap: Map[String, FastqPair], resultMap: Map[String, Boolean]) = { + require(resultMap.keySet == fastqMap.keySet) + val memFunc = makeMembershipFunction(Iterator(feat), inAln, commonSuffixLength = 2) + for ((key, (rec1, rec2)) <- fastqMap) { + withClue(makeClue(name, inAln, key)) { + memFunc(rec1, rec2) shouldBe resultMap(key) + } + } + } + + @Test def testWriteSingleBamDefault() = { + val memFunc = (recs: FastqPair) => Set("r01", "r03").contains(recs._1.getReadHeader) + val in1 = new FastqReader(resourceFile("/single01.fq")) + val mo1 = mock[BasicFastqWriter] + selectFastqReads(memFunc, in1, mo1) + verify(mo1, times(2)).write(anyObject.asInstanceOf[FastqRecord]) + verify(mo1).write(new FastqRecord("r01", "A", "", "H")) + verify(mo1).write(new FastqRecord("r03", "G", "", "H")) + } + + @Test def testWritePairBamDefault() = { + val memFunc = (recs: FastqPair) => Set("r01/1", "r01/2", "r03/1", "r03/2").contains(recs._1.getReadHeader) + val in1 = new FastqReader(resourceFile("/paired01a.fq")) + val in2 = new FastqReader(resourceFile("/paired01b.fq")) + val mo1 = mock[BasicFastqWriter] + val mo2 = mock[BasicFastqWriter] + selectFastqReads(memFunc, in1, mo1, in2, mo2) + verify(mo1, times(2)).write(anyObject.asInstanceOf[FastqRecord]) + verify(mo1).write(new FastqRecord("r01/1", "A", "", "H")) + verify(mo1).write(new FastqRecord("r03/1", "G", "", "H")) + verify(mo2, times(2)).write(anyObject.asInstanceOf[FastqRecord]) + verify(mo2).write(new FastqRecord("r01/2", "T", "", "I")) + verify(mo2).write(new FastqRecord("r03/2", "C", "", "I")) + } + + @Test def testWriteNoOutputFastq2() = { + val memFunc: (FastqPair => Boolean) = (recs) => true + val in1 = mock[FastqReader] + val in2 = mock[FastqReader] + val out1 = mock[BasicFastqWriter] + val thrown = intercept[IllegalArgumentException] { + selectFastqReads(memFunc, in1, out1, in2) + } + thrown.getMessage should === ("Missing output FASTQ 2") + verify(out1, never).write(anyObject.asInstanceOf[FastqRecord]) + } + + @Test def testWriteNoInputFastq2() = { + val memFunc: (FastqPair => Boolean) = (recs) => true + val in1 = mock[FastqReader] + val out1 = mock[BasicFastqWriter] + val out2 = mock[BasicFastqWriter] + val thrown = intercept[IllegalArgumentException] { + selectFastqReads(memFunc, in1, out1, outputFastq2 = out2) + } + thrown.getMessage should === ("Output FASTQ 2 supplied but there is no input FASTQ 2") + verify(out1, never).write(anyObject.asInstanceOf[FastqRecord]) + verify(out2, never).write(anyObject.asInstanceOf[FastqRecord]) + } +}