SageCreateLibrary.scala 6.47 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
16
package nl.lumc.sasc.biopet.tools
17

Peter van 't Hof's avatar
Peter van 't Hof committed
18 19
import java.io.{ File, PrintWriter }

Peter van 't Hof's avatar
Peter van 't Hof committed
20
import nl.lumc.sasc.biopet.utils.ToolCommand
21 22
import org.biojava3.core.sequence.DNASequence
import org.biojava3.core.sequence.io.FastaReaderHelper
Peter van 't Hof's avatar
Peter van 't Hof committed
23

24
import scala.collection.{ SortedMap, mutable }
25
import scala.util.matching.Regex
26
import scala.collection.JavaConversions._
27

28
object SageCreateLibrary extends ToolCommand {
Peter van 't Hof's avatar
Peter van 't Hof committed
29 30
  case class Args(input: File = null, tag: String = "CATG", length: Int = 17, output: File = null, noTagsOutput: File = null,
                  noAntiTagsOutput: File = null, allGenesOutput: File = null) extends AbstractArgs
31 32

  class OptParser extends AbstractOptParser {
Peter van 't Hof's avatar
Peter van 't Hof committed
33
    opt[File]('I', "input") required () unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
34 35
      c.copy(input = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
36
    opt[File]('o', "output") required () unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
37 38 39 40 41 42 43 44
      c.copy(output = x)
    }
    opt[String]("tag") required () unbounded () action { (x, c) =>
      c.copy(tag = x)
    }
    opt[Int]("length") required () unbounded () action { (x, c) =>
      c.copy(length = x)
    }
Sander Bollen's avatar
Sander Bollen committed
45
    opt[File]("noTagsOutput") unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
46 47
      c.copy(noTagsOutput = x)
    }
Sander Bollen's avatar
Sander Bollen committed
48
    opt[File]("noAntiTagsOutput") unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
49 50
      c.copy(noAntiTagsOutput = x)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
51
    opt[File]("allGenesOutput") unbounded () valueName "<file>" action { (x, c) =>
Peter van 't Hof's avatar
Peter van 't Hof committed
52 53
      c.copy(allGenesOutput = x)
    }
54
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
55

56
  val geneRegex = """ENSG[0-9]{11}""".r
Peter van 't Hof's avatar
Peter van 't Hof committed
57

Peter van 't Hof's avatar
Peter van 't Hof committed
58
  val tagGenesMap: mutable.Map[String, TagGenes] = mutable.Map()
Peter van 't Hof's avatar
Peter van 't Hof committed
59

Peter van 't Hof's avatar
Peter van 't Hof committed
60 61 62
  val allGenes: mutable.Set[String] = mutable.Set()
  val tagGenes: mutable.Set[String] = mutable.Set()
  val antiTagGenes: mutable.Set[String] = mutable.Set()
Peter van 't Hof's avatar
Peter van 't Hof committed
63

64
  class TagGenes {
Peter van 't Hof's avatar
Peter van 't Hof committed
65 66 67 68
    val firstTag: mutable.Set[String] = mutable.Set()
    val allTags: mutable.Set[String] = mutable.Set()
    val firstAntiTag: mutable.Set[String] = mutable.Set()
    val allAntiTags: mutable.Set[String] = mutable.Set()
69
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
70 71
  class TagResult(val firstTag: String, val allTags: List[String], val firstAntiTag: String, val allAntiTags: List[String])

72 73 74 75
  /**
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
76
    val argsParser = new OptParser
Peter van 't Hof's avatar
Peter van 't Hof committed
77
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse (throw new IllegalArgumentException)
Peter van 't Hof's avatar
Peter van 't Hof committed
78

79
    if (!commandArgs.input.exists) throw new IllegalStateException("Input file not found, file: " + commandArgs.input)
Peter van 't Hof's avatar
Peter van 't Hof committed
80

81
    val tagRegex = (commandArgs.tag + "[CATG]{" + commandArgs.length + "}").r
Peter van 't Hof's avatar
Peter van 't Hof committed
82

83
    var count = 0
84
    logger.info("Reading fasta file")
85
    val reader = FastaReaderHelper.readFastaDNASequence(commandArgs.input)
86
    logger.info("Finding tags")
87
    for ((name, seq) <- reader) {
Sander Bollen's avatar
Sander Bollen committed
88 89
      val result = getTags(name, seq, tagRegex)
      addTagresultToTaglib(name, result)
90
      count += 1
91
      if (count % 10000 == 0) logger.info(count + " transcripts done")
92
    }
93
    logger.info(count + " transcripts done")
Peter van 't Hof's avatar
Peter van 't Hof committed
94

95
    logger.info("Start sorting tags")
Peter van 't Hof's avatar
Peter van 't Hof committed
96 97
    val tagGenesMapSorted: SortedMap[String, TagGenes] = SortedMap(tagGenesMap.toArray: _*)

98
    logger.info("Writting output files")
99
    val writer = new PrintWriter(commandArgs.output)
100 101 102
    writer.println("#tag\tfirstTag\tAllTags\tFirstAntiTag\tAllAntiTags")
    for ((tag, genes) <- tagGenesMapSorted) {
      val line = tag + "\t" + genes.firstTag.mkString(",") +
Peter van 't Hof's avatar
Peter van 't Hof committed
103 104 105
        "\t" + genes.allTags.mkString(",") +
        "\t" + genes.firstAntiTag.mkString(",") +
        "\t" + genes.allAntiTags.mkString(",")
106 107 108
      writer.println(line)
    }
    writer.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
109

110 111
    if (commandArgs.noTagsOutput != null) {
      val writer = new PrintWriter(commandArgs.noTagsOutput)
112 113 114
      for (gene <- allGenes if !tagGenes.contains(gene)) {
        writer.println(gene)
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
115
      writer.close()
116
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
117

118 119
    if (commandArgs.noAntiTagsOutput != null) {
      val writer = new PrintWriter(commandArgs.noAntiTagsOutput)
120 121 122
      for (gene <- allGenes if !antiTagGenes.contains(gene)) {
        writer.println(gene)
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
123
      writer.close()
124
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
125

126 127
    if (commandArgs.allGenesOutput != null) {
      val writer = new PrintWriter(commandArgs.allGenesOutput)
128 129 130
      for (gene <- allGenes) {
        writer.println(gene)
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
131
      writer.close()
132 133
    }
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
134

Sander Bollen's avatar
Sander Bollen committed
135
  private def addTagresultToTaglib(name: String, tagResult: TagResult) {
136 137 138
    val id = name.split(" ").head //.stripPrefix("hg19_ensGene_")
    val geneID = geneRegex.findFirstIn(name).getOrElse("unknown_gene")
    allGenes.add(geneID)
Peter van 't Hof's avatar
Peter van 't Hof committed
139

140 141 142 143 144
    if (tagResult.firstTag != null) {
      if (!tagGenesMap.contains(tagResult.firstTag)) tagGenesMap += (tagResult.firstTag -> new TagGenes)
      tagGenesMap(tagResult.firstTag).firstTag.add(geneID)
      tagGenes.add(geneID)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
145

146 147 148 149
    for (tag <- tagResult.allTags) {
      if (!tagGenesMap.contains(tag)) tagGenesMap += (tag -> new TagGenes)
      tagGenesMap(tag).allTags.add(geneID)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
150

151 152 153 154 155
    if (tagResult.firstAntiTag != null) {
      if (!tagGenesMap.contains(tagResult.firstAntiTag)) tagGenesMap += (tagResult.firstAntiTag -> new TagGenes)
      tagGenesMap(tagResult.firstAntiTag).firstAntiTag.add(geneID)
      antiTagGenes.add(geneID)
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
156

157 158 159 160 161
    for (tag <- tagResult.allAntiTags) {
      if (!tagGenesMap.contains(tag)) tagGenesMap += (tag -> new TagGenes)
      tagGenesMap(tag).allAntiTags.add(geneID)
    }
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
162

Sander Bollen's avatar
Sander Bollen committed
163
  def getTags(name: String, seq: DNASequence, tagRegex: Regex): TagResult = {
Peter van 't Hof's avatar
Peter van 't Hof committed
164
    val allTags: List[String] = for (tag <- tagRegex.findAllMatchIn(seq.getSequenceAsString).toList) yield tag.toString()
165
    val firstTag = if (allTags.isEmpty) null else allTags.last
Peter van 't Hof's avatar
Peter van 't Hof committed
166
    val allAntiTags: List[String] = for (tag <- tagRegex.findAllMatchIn(seq.getReverseComplement.getSequenceAsString).toList) yield tag.toString()
Peter van 't Hof's avatar
Peter van 't Hof committed
167
    val firstAntiTag = if (allAntiTags.isEmpty) null else allAntiTags.head
168
    val result = new TagResult(firstTag, allTags, firstAntiTag, allAntiTags)
Peter van 't Hof's avatar
Peter van 't Hof committed
169

Peter van 't Hof's avatar
Peter van 't Hof committed
170
    result
171
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
172
}