DownloadNcbiAssembly.scala 4.07 KB
Newer Older
1
2
package nl.lumc.sasc.biopet.tools

Peter van 't Hof's avatar
Peter van 't Hof committed
3
import java.io.{ File, PrintWriter }
4
5
6
7
8
9

import nl.lumc.sasc.biopet.utils.ToolCommand

import scala.io.Source

/**
Peter van 't Hof's avatar
Peter van 't Hof committed
10
11
 * Created by pjvan_thof on 21-9-16.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
12
object DownloadNcbiAssembly extends ToolCommand {
13

Peter van 't Hof's avatar
Peter van 't Hof committed
14
  case class Args(assemblyId: String = null,
15
                  outputFile: File = null,
16
                  reportFile: Option[File] = None,
17
                  contigNameHeader: Option[String] = None,
Peter van 't Hof's avatar
Peter van 't Hof committed
18
19
                  mustHaveOne: List[(String, String)] = List(),
                  mustNotHave: List[(String, String)] = List()) extends AbstractArgs
20
21

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
22
    opt[String]('a', "assembly id") required () unbounded () valueName "<file>" action { (x, c) =>
23
      c.copy(assemblyId = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
24
    } text "refseq ID from NCBI"
Peter van 't Hof's avatar
Peter van 't Hof committed
25
    opt[File]('o', "output") required () unbounded () valueName "<file>" action { (x, c) =>
26
      c.copy(outputFile = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
27
    } text "output Fasta file"
Peter van 't Hof's avatar
Peter van 't Hof committed
28
    opt[File]("report") unbounded () valueName "<file>" action { (x, c) =>
29
      c.copy(reportFile = Some(x))
Peter van 't Hof's avatar
Peter van 't Hof committed
30
    } text "where to write report from ncbi"
Peter van 't Hof's avatar
Peter van 't Hof committed
31
    opt[String]("nameHeader") unbounded () valueName "<string>" action { (x, c) =>
32
      c.copy(contigNameHeader = Some(x))
Peter van 't Hof's avatar
Peter van 't Hof committed
33
34
    } text "What column to use from the NCBI report for the nam,e of the contigs"
    opt[(String, String)]("mustHaveOne") unbounded () valueName "<column_name=regex>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
35
      c.copy(mustHaveOne = (x._1, x._2) :: c.mustHaveOne)
Peter van 't Hof's avatar
Peter van 't Hof committed
36
37
    } text "This can be used to filter based on the NCBI report, multiple conditions can be given, at least 1 should be true"
    opt[(String, String)]("mustNotHave") unbounded () valueName "<column_name=regex>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
38
      c.copy(mustNotHave = (x._1, x._2) :: c.mustNotHave)
Peter van 't Hof's avatar
Peter van 't Hof committed
39
    } text "This can be used to filter based on the NCBI report, multiple conditions can be given, all should be false"
40
41
42
  }

  /**
Peter van 't Hof's avatar
Peter van 't Hof committed
43
44
   * @param args the command line arguments
   */
45
46
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
47
    val cmdargs: Args = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
48
49
50
51
52

    logger.info(s"Reading ${cmdargs.assemblyId} from NCBI")
    val reader = Source.fromURL(s"ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/${cmdargs.assemblyId}.assembly.txt")
    val assamblyReport = reader.getLines().toList
    reader.close()
53
54
55
56
57
    cmdargs.reportFile.foreach { file =>
      val writer = new PrintWriter(file)
      assamblyReport.foreach(writer.println)
      writer.close()
    }
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82

    val headers = assamblyReport.filter(_.startsWith("#")).last.stripPrefix("# ").split("\t").zipWithIndex.toMap
    val nameId = cmdargs.contigNameHeader.map(x => headers(x))
    val lengthId = headers.get("Sequence-Length")

    val baseUrlEutils = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

    val fastaWriter = new PrintWriter(cmdargs.outputFile)

    val allContigs = assamblyReport.filter(!_.startsWith("#"))
      .map(_.split("\t"))
    val totalLength = lengthId.map(id => allContigs.map(_.apply(id).toLong).sum)

    logger.info(s"${allContigs.size} contigs found")
    totalLength.foreach(l => logger.info(s"Total length: ${l}"))

    val filterContigs = allContigs
      .filter(values => cmdargs.mustNotHave.forall(x => values(headers(x._1)) != x._2))
      .filter(values => cmdargs.mustHaveOne.exists(x => values(headers(x._1)) == x._2) || cmdargs.mustHaveOne.isEmpty)
    val filterLength = lengthId.map(id => filterContigs.map(_.apply(id).toLong).sum)

    logger.info(s"${filterContigs.size} contigs left after filtering")
    filterLength.foreach(l => logger.info(s"Filtered length: ${l}"))

    filterContigs.foreach { values =>
Peter van 't Hof's avatar
Peter van 't Hof committed
83
84
85
86
87
88
89
90
      val id = if (values(6) == "na") values(4) else values(6)
      logger.info(s"Start download ${id}")
      val fastaReader = Source.fromURL(s"${baseUrlEutils}/efetch.fcgi?db=nuccore&id=${id}&retmode=text&rettype=fasta")
      fastaReader.getLines()
        .map(x => nameId.map(y => x.replace(">", s">${values(y)} ")).getOrElse(x))
        .foreach(fastaWriter.println)
      fastaReader.close()
    }
91
92
93
94
95
96
97

    logger.info("Downloading complete")

    fastaWriter.close()

  }
}