DownloadNcbiAssembly.scala 5.09 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
15
16
package nl.lumc.sasc.biopet.tools

Peter van 't Hof's avatar
Peter van 't Hof committed
17
import java.io.{ File, PrintWriter }
18
19
20
21
22
23

import nl.lumc.sasc.biopet.utils.ToolCommand

import scala.io.Source

/**
Peter van 't Hof's avatar
Peter van 't Hof committed
24
25
 * Created by pjvan_thof on 21-9-16.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
26
object DownloadNcbiAssembly extends ToolCommand {
27

Peter van 't Hof's avatar
Peter van 't Hof committed
28
  case class Args(assemblyId: String = null,
29
                  outputFile: File = null,
30
                  reportFile: Option[File] = None,
31
                  contigNameHeader: Option[String] = None,
Peter van 't Hof's avatar
Peter van 't Hof committed
32
33
                  mustHaveOne: List[(String, String)] = List(),
                  mustNotHave: List[(String, String)] = List()) extends AbstractArgs
34
35

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
36
    opt[String]('a', "assembly id") required () unbounded () valueName "<file>" action { (x, c) =>
37
      c.copy(assemblyId = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
38
    } text "refseq ID from NCBI"
Peter van 't Hof's avatar
Peter van 't Hof committed
39
    opt[File]('o', "output") required () unbounded () valueName "<file>" action { (x, c) =>
40
      c.copy(outputFile = x)
Peter van 't Hof's avatar
Peter van 't Hof committed
41
    } text "output Fasta file"
Peter van 't Hof's avatar
Peter van 't Hof committed
42
    opt[File]("report") unbounded () valueName "<file>" action { (x, c) =>
43
      c.copy(reportFile = Some(x))
Peter van 't Hof's avatar
Peter van 't Hof committed
44
    } text "where to write report from ncbi"
Peter van 't Hof's avatar
Peter van 't Hof committed
45
    opt[String]("nameHeader") unbounded () valueName "<string>" action { (x, c) =>
46
      c.copy(contigNameHeader = Some(x))
47
48
49
50
51
52
53
    } text
      """
        | What column to use from the NCBI report for the name of the contigs.
        | All columns in the report can be used but this are the most common field to choose from:
        | - 'Sequence-Name': Name of the contig within the assembly
        | - 'UCSC-style-name': Name of the contig used by UCSC ( like hg19 )
        | - 'RefSeq-Accn': Unique name of the contig at RefSeq (default for NCBI)""".stripMargin
Peter van 't Hof's avatar
Peter van 't Hof committed
54
    opt[(String, String)]("mustHaveOne") unbounded () valueName "<column_name=regex>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
55
      c.copy(mustHaveOne = (x._1, x._2) :: c.mustHaveOne)
Peter van 't Hof's avatar
Peter van 't Hof committed
56
57
    } text "This can be used to filter based on the NCBI report, multiple conditions can be given, at least 1 should be true"
    opt[(String, String)]("mustNotHave") unbounded () valueName "<column_name=regex>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
58
      c.copy(mustNotHave = (x._1, x._2) :: c.mustNotHave)
Peter van 't Hof's avatar
Peter van 't Hof committed
59
    } text "This can be used to filter based on the NCBI report, multiple conditions can be given, all should be false"
60
61
62
  }

  /**
Peter van 't Hof's avatar
Peter van 't Hof committed
63
64
   * @param args the command line arguments
   */
65
66
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
67
    val cmdargs: Args = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
68
69
70
71
72

    logger.info(s"Reading ${cmdargs.assemblyId} from NCBI")
    val reader = Source.fromURL(s"ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/${cmdargs.assemblyId}.assembly.txt")
    val assamblyReport = reader.getLines().toList
    reader.close()
73
74
75
76
77
    cmdargs.reportFile.foreach { file =>
      val writer = new PrintWriter(file)
      assamblyReport.foreach(writer.println)
      writer.close()
    }
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102

    val headers = assamblyReport.filter(_.startsWith("#")).last.stripPrefix("# ").split("\t").zipWithIndex.toMap
    val nameId = cmdargs.contigNameHeader.map(x => headers(x))
    val lengthId = headers.get("Sequence-Length")

    val baseUrlEutils = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"

    val fastaWriter = new PrintWriter(cmdargs.outputFile)

    val allContigs = assamblyReport.filter(!_.startsWith("#"))
      .map(_.split("\t"))
    val totalLength = lengthId.map(id => allContigs.map(_.apply(id).toLong).sum)

    logger.info(s"${allContigs.size} contigs found")
    totalLength.foreach(l => logger.info(s"Total length: ${l}"))

    val filterContigs = allContigs
      .filter(values => cmdargs.mustNotHave.forall(x => values(headers(x._1)) != x._2))
      .filter(values => cmdargs.mustHaveOne.exists(x => values(headers(x._1)) == x._2) || cmdargs.mustHaveOne.isEmpty)
    val filterLength = lengthId.map(id => filterContigs.map(_.apply(id).toLong).sum)

    logger.info(s"${filterContigs.size} contigs left after filtering")
    filterLength.foreach(l => logger.info(s"Filtered length: ${l}"))

    filterContigs.foreach { values =>
Peter van 't Hof's avatar
Peter van 't Hof committed
103
104
105
106
107
108
109
110
      val id = if (values(6) == "na") values(4) else values(6)
      logger.info(s"Start download ${id}")
      val fastaReader = Source.fromURL(s"${baseUrlEutils}/efetch.fcgi?db=nuccore&id=${id}&retmode=text&rettype=fasta")
      fastaReader.getLines()
        .map(x => nameId.map(y => x.replace(">", s">${values(y)} ")).getOrElse(x))
        .foreach(fastaWriter.println)
      fastaReader.close()
    }
111
112
113
114
115
116
117

    logger.info("Downloading complete")

    fastaWriter.close()

  }
}