DownloadNcbiAssembly.scala 5.03 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
15 16
package nl.lumc.sasc.biopet.tools

Peter van 't Hof's avatar
Peter van 't Hof committed
17
import java.io.{ File, PrintWriter }
18 19 20 21 22 23

import nl.lumc.sasc.biopet.utils.ToolCommand

import scala.io.Source

/**
Peter van 't Hof's avatar
Peter van 't Hof committed
24 25
 * Created by pjvan_thof on 21-9-16.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
26
object DownloadNcbiAssembly extends ToolCommand {
27

28
  case class Args(assemblyReport: File = null,
29
                  outputFile: File = null,
30
                  reportFile: Option[File] = None,
31
                  contigNameHeader: Option[String] = None,
Peter van 't Hof's avatar
Peter van 't Hof committed
32 33
                  mustHaveOne: List[(String, String)] = List(),
                  mustNotHave: List[(String, String)] = List()) extends AbstractArgs
34 35

  class OptParser extends AbstractOptParser {
36 37
    opt[File]('a', "assembly_report") required () unbounded () valueName "<file>" action { (x, c) =>
      c.copy(assemblyReport = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
38
    } text "refseq ID from NCBI"
Peter van 't Hof's avatar
Peter van 't Hof committed
39
    opt[File]('o', "output") required () unbounded () valueName "<file>" action { (x, c) =>
40
      c.copy(outputFile = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
41
    } text "output Fasta file"
Peter van 't Hof's avatar
Peter van 't Hof committed
42
    opt[File]("report") unbounded () valueName "<file>" action { (x, c) =>
43
      c.copy(reportFile = Some(x))
Peter van 't Hof's avatar
Peter van 't Hof committed
44
    } text "where to write report from ncbi"
Peter van 't Hof's avatar
Peter van 't Hof committed
45
    opt[String]("nameHeader") unbounded () valueName "<string>" action { (x, c) =>
46
      c.copy(contigNameHeader = Some(x))
47 48 49 50 51 52 53
    } text
      """
        | What column to use from the NCBI report for the name of the contigs.
        | All columns in the report can be used but this are the most common field to choose from:
        | - 'Sequence-Name': Name of the contig within the assembly
        | - 'UCSC-style-name': Name of the contig used by UCSC ( like hg19 )
        | - 'RefSeq-Accn': Unique name of the contig at RefSeq (default for NCBI)""".stripMargin
Peter van 't Hof's avatar
Peter van 't Hof committed
54
    opt[(String, String)]("mustHaveOne") unbounded () valueName "<column_name=regex>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
55
      c.copy(mustHaveOne = (x._1, x._2) :: c.mustHaveOne)
Peter van 't Hof's avatar
Peter van 't Hof committed
56 57
    } text "This can be used to filter based on the NCBI report, multiple conditions can be given, at least 1 should be true"
    opt[(String, String)]("mustNotHave") unbounded () valueName "<column_name=regex>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
58
      c.copy(mustNotHave = (x._1, x._2) :: c.mustNotHave)
Peter van 't Hof's avatar
Peter van 't Hof committed
59
    } text "This can be used to filter based on the NCBI report, multiple conditions can be given, all should be false"
60 61 62
  }

  /**
Peter van 't Hof's avatar
Peter van 't Hof committed
63 64
   * @param args the command line arguments
   */
65 66
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
67
    val cmdargs: Args = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
68

69 70
    logger.info(s"Reading ${cmdargs.assemblyReport}")
    val reader = Source.fromFile(cmdargs.assemblyReport)
71 72
    val assamblyReport = reader.getLines().toList
    reader.close()
73 74 75 76 77
    cmdargs.reportFile.foreach { file =>
      val writer = new PrintWriter(file)
      assamblyReport.foreach(writer.println)
      writer.close()
    }
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102

    val headers = assamblyReport.filter(_.startsWith("#")).last.stripPrefix("# ").split("\t").zipWithIndex.toMap
    val nameId = cmdargs.contigNameHeader.map(x => headers(x))
    val lengthId = headers.get("Sequence-Length")

    val baseUrlEutils = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

    val fastaWriter = new PrintWriter(cmdargs.outputFile)

    val allContigs = assamblyReport.filter(!_.startsWith("#"))
      .map(_.split("\t"))
    val totalLength = lengthId.map(id => allContigs.map(_.apply(id).toLong).sum)

    logger.info(s"${allContigs.size} contigs found")
    totalLength.foreach(l => logger.info(s"Total length: ${l}"))

    val filterContigs = allContigs
      .filter(values => cmdargs.mustNotHave.forall(x => values(headers(x._1)) != x._2))
      .filter(values => cmdargs.mustHaveOne.exists(x => values(headers(x._1)) == x._2) || cmdargs.mustHaveOne.isEmpty)
    val filterLength = lengthId.map(id => filterContigs.map(_.apply(id).toLong).sum)

    logger.info(s"${filterContigs.size} contigs left after filtering")
    filterLength.foreach(l => logger.info(s"Filtered length: ${l}"))

    filterContigs.foreach { values =>
Peter van 't Hof's avatar
Peter van 't Hof committed
103 104 105 106 107 108 109 110
      val id = if (values(6) == "na") values(4) else values(6)
      logger.info(s"Start download ${id}")
      val fastaReader = Source.fromURL(s"${baseUrlEutils}/efetch.fcgi?db=nuccore&id=${id}&retmode=text&rettype=fasta")
      fastaReader.getLines()
        .map(x => nameId.map(y => x.replace(">", s">${values(y)} ")).getOrElse(x))
        .foreach(fastaWriter.println)
      fastaReader.close()
    }
111 112 113 114 115 116 117

    logger.info("Downloading complete")

    fastaWriter.close()

  }
}