Commit f697f3e0 authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Adding a tool to generate a contig map

parent 8452cc90
package nl.lumc.sasc.biopet.extensions
import java.io.File
import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{Input, Output}
/**
* Created by pjvanthof on 30/05/2017.
*/
class Cp(val parent: Configurable) extends BiopetCommandLineFunction {
@Input(doc = "Source file", required = true)
var source: File = _
@Output(doc = "Target file", required = true)
var target: File = _
executable = config("exe", default = "cp")
/** Returns command to execute */
def cmdLine =
required(executable) +
required(source) +
required(target)
}
object Cp {
def apply(parent: Configurable, source: File, target: File): Cp = {
val cp = new Cp(parent)
cp.source = source
cp.target = target
cp
}
}
\ No newline at end of file
...@@ -20,9 +20,6 @@ import nl.lumc.sasc.biopet.core.ToolCommandFunction ...@@ -20,9 +20,6 @@ import nl.lumc.sasc.biopet.core.ToolCommandFunction
import nl.lumc.sasc.biopet.utils.config.Configurable import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{Input, Output} import org.broadinstitute.gatk.utils.commandline.{Input, Output}
/**
* @deprecated Use picard.util.BedToIntervalList instead
*/
class DownloadNcbiAssembly(val parent: Configurable) extends ToolCommandFunction { class DownloadNcbiAssembly(val parent: Configurable) extends ToolCommandFunction {
def toolObject = nl.lumc.sasc.biopet.tools.DownloadNcbiAssembly def toolObject = nl.lumc.sasc.biopet.tools.DownloadNcbiAssembly
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.tools
import java.io.File
import nl.lumc.sasc.biopet.core.ToolCommandFunction
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{Input, Output}
class NcbiReportToContigMap(val parent: Configurable) extends ToolCommandFunction {
def toolObject = nl.lumc.sasc.biopet.tools.NcbiReportToContigMap
@Output(doc = "Output fasta file", required = true)
var contigMap: File = _
var outputReport: File = _
@Input(required = true)
var assemblyReport: File = _
var nameHeader: String = _
override def defaultCoreMemory = 4.0
override def cmdLine =
super.cmdLine +
required("-a", assemblyReport) +
required("--report", outputReport) +
required("-o", contigMap) +
required("--nameHeader", nameHeader)
}
...@@ -56,6 +56,7 @@ object BiopetToolsExecutable extends BiopetExecutable { ...@@ -56,6 +56,7 @@ object BiopetToolsExecutable extends BiopetExecutable {
nl.lumc.sasc.biopet.tools.VcfWithVcf, nl.lumc.sasc.biopet.tools.VcfWithVcf,
nl.lumc.sasc.biopet.tools.VepNormalizer, nl.lumc.sasc.biopet.tools.VepNormalizer,
nl.lumc.sasc.biopet.tools.WipeReads, nl.lumc.sasc.biopet.tools.WipeReads,
nl.lumc.sasc.biopet.tools.NcbiReportToContigMap,
nl.lumc.sasc.biopet.tools.DownloadNcbiAssembly nl.lumc.sasc.biopet.tools.DownloadNcbiAssembly
) )
......
package nl.lumc.sasc.biopet.tools
import java.io.{File, PrintWriter}
import nl.lumc.sasc.biopet.utils.ToolCommand
import scala.io.Source
/**
* Created by pjvanthof on 30/05/2017.
*/
object NcbiReportToContigMap extends ToolCommand {
case class Args(assemblyReport: File = null,
outputFile: File = null,
reportFile: Option[File] = None,
contigNameHeader: String = null,
names: List[String] = List("Sequence-Name", "UCSC-style-name", "GenBank-Accn", "RefSeq-Accn"))
extends AbstractArgs
class OptParser extends AbstractOptParser {
opt[File]('a', "assembly_report") required() unbounded() valueName "<file>" action {
(x, c) =>
c.copy(assemblyReport = x)
} text "refseq ID from NCBI"
opt[File]('o', "output") required() unbounded() valueName "<file>" action { (x, c) =>
c.copy(outputFile = x)
} text "output Fasta file"
opt[String]("nameHeader") required() unbounded() valueName "<string>" action { (x, c) =>
c.copy(contigNameHeader = x)
} text
"""
| What column to use from the NCBI report for the name of the contigs.
| All columns in the report can be used but this are the most common field to choose from:
| - 'Sequence-Name': Name of the contig within the assembly
| - 'UCSC-style-name': Name of the contig used by UCSC ( like hg19 )
| - 'RefSeq-Accn': Unique name of the contig at RefSeq (default for NCBI)""".stripMargin
opt[String]("names") unbounded() action { (x, c) =>
c.copy(names = x.split(",").toList)
} text "output Fasta file"
}
/**
* @param args the command line arguments
*/
def main(args: Array[String]): Unit = {
val argsParser = new OptParser
val cmdargs
: Args = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
logger.info(s"Reading ${cmdargs.assemblyReport}")
val reader = Source.fromFile(cmdargs.assemblyReport)
val assamblyReport = reader.getLines().toList
reader.close()
cmdargs.reportFile.foreach { file =>
val writer = new PrintWriter(file)
assamblyReport.foreach(writer.println)
writer.close()
}
val headers = assamblyReport
.filter(_.startsWith("#"))
.last
.stripPrefix("# ")
.split("\t")
.zipWithIndex
.toMap
val allContigs = assamblyReport
.filter(!_.startsWith("#"))
.map(_.split("\t"))
val altNameIds = cmdargs.names.filter(_ != cmdargs.contigNameHeader).map(headers)
val nameId = headers(cmdargs.contigNameHeader)
val writer = new PrintWriter(cmdargs.outputFile)
for (line <- assamblyReport.filter(!_.startsWith("#"))) {
val values = line.split("\t")
val altNames = altNameIds.map(i => values(i)).filter(_ != "na").distinct
writer.println(values(nameId) + "\t" + altNames.mkString(";"))
}
writer.close()
}
}
package nl.lumc.sasc.biopet.tools
import java.io.File
import java.nio.file.Paths
import org.scalatest.Matchers
import org.scalatest.testng.TestNGSuite
import org.testng.annotations.Test
/**
* Created by pjvanthof on 30/05/2017.
*/
class NcbiReportToContigMapTest extends TestNGSuite with Matchers {
private def resourcePath(p: String): String = {
Paths.get(getClass.getResource (p).toURI).toString
}
@Test
def test: Unit = {
val report = new File(resourcePath("/GCF_000844745.1.report"))
val output = File.createTempFile("test.", ".tsv")
output.deleteOnExit()
NcbiReportToContigMap.main(Array("-a", report.getAbsolutePath, "-o", output.getAbsolutePath, "--nameHeader", "Sequence-Name"))
}
}
...@@ -22,7 +22,7 @@ import nl.lumc.sasc.biopet.core.{BiopetQScript, PipelineCommand} ...@@ -22,7 +22,7 @@ import nl.lumc.sasc.biopet.core.{BiopetQScript, PipelineCommand}
import nl.lumc.sasc.biopet.extensions._ import nl.lumc.sasc.biopet.extensions._
import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants import nl.lumc.sasc.biopet.extensions.gatk.CombineVariants
import nl.lumc.sasc.biopet.extensions.picard.NormalizeFasta import nl.lumc.sasc.biopet.extensions.picard.NormalizeFasta
import nl.lumc.sasc.biopet.extensions.tools.DownloadNcbiAssembly import nl.lumc.sasc.biopet.extensions.tools.{DownloadNcbiAssembly, NcbiReportToContigMap}
import nl.lumc.sasc.biopet.utils.ConfigUtils import nl.lumc.sasc.biopet.utils.ConfigUtils
import nl.lumc.sasc.biopet.utils.config.Configurable import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.queue.QScript import org.broadinstitute.gatk.queue.QScript
...@@ -79,14 +79,29 @@ class DownloadGenomes(val parent: Configurable) extends QScript with BiopetQScri ...@@ -79,14 +79,29 @@ class DownloadGenomes(val parent: Configurable) extends QScript with BiopetQScri
genomeConfig.get("ncbi_assembly_report") match { genomeConfig.get("ncbi_assembly_report") match {
case Some(assemblyReport: String) => case Some(assemblyReport: String) =>
val ncbiAssemblyReport = new File(genomeDir, assemblyReport.getName)
add(Cp(this, new File(assemblyReport), ncbiAssemblyReport))
renderReadme.ncbiAssemblyReport = Some(ncbiAssemblyReport)
val ncbiReportToContigMap = new NcbiReportToContigMap(this)
ncbiReportToContigMap.assemblyReport = ncbiAssemblyReport
ncbiReportToContigMap.contigMap = new File(genomeDir, "contig.map.tsv")
ncbiReportToContigMap.nameHeader = genomeConfig
.get("ncbi_assembly_header_name")
.map(_.toString)
.getOrElse("RefSeq-Accn")
add(ncbiReportToContigMap)
val downloadAssembly = new DownloadNcbiAssembly(this) val downloadAssembly = new DownloadNcbiAssembly(this)
downloadAssembly.assemblyReport = new File(assemblyReport) downloadAssembly.assemblyReport = ncbiAssemblyReport
renderReadme.ncbiAssemblyReport = Some(downloadAssembly.assemblyReport)
downloadAssembly.output = downloadFastaFile downloadAssembly.output = downloadFastaFile
downloadAssembly.outputReport = downloadAssembly.outputReport =
new File(genomeDir, s"$speciesName-$genomeName.assembly.report") new File(genomeDir, s"$speciesName-$genomeName.assembly.report")
downloadAssembly.nameHeader = downloadAssembly.nameHeader = Some(
genomeConfig.get("ncbi_assembly_header_name").map(_.toString) genomeConfig
.get("ncbi_assembly_header_name")
.map(_.toString)
.getOrElse("RefSeq-Accn"))
downloadAssembly.mustHaveOne = genomeConfig downloadAssembly.mustHaveOne = genomeConfig
.get("ncbi_assembly_must_have_one") .get("ncbi_assembly_must_have_one")
.map(_.asInstanceOf[util.ArrayList[util.LinkedHashMap[String, String]]]) .map(_.asInstanceOf[util.ArrayList[util.LinkedHashMap[String, String]]])
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment