FastqSplitter.scala 3.33 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
 * A dual licensing mode is applied. The source code within this project that are
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
16
package nl.lumc.sasc.biopet.tools
Peter van 't Hof's avatar
Peter van 't Hof committed
17

18
19
import java.io.File
import htsjdk.samtools.fastq.{ AsyncFastqWriter, FastqReader, BasicFastqWriter }
Peter van 't Hof's avatar
Peter van 't Hof committed
20
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
Peter van 't Hof's avatar
Peter van 't Hof committed
21
import nl.lumc.sasc.biopet.core.ToolCommand
Peter van 't Hof's avatar
Peter van 't Hof committed
22
import nl.lumc.sasc.biopet.core.config.Configurable
bow's avatar
bow committed
23
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
24
import scala.collection.JavaConversions._
Peter van 't Hof's avatar
Peter van 't Hof committed
25

Peter van 't Hof's avatar
Peter van 't Hof committed
26
27
28
29
/**
 * Queue extension that runs the FastqSplitter tool as a job
 *
 * @param root Parent object
 */
class FastqSplitter(val root: Configurable) extends BiopetJavaCommandLineFunction {
  // The job executes the main method found under the name of this class,
  // which is defined in the companion object
  javaMainClass = getClass.getName

  /** Fastq file to split, passed to the tool with -I */
  @Input(doc = "Input fastq", shortName = "input", required = true)
  var input: File = _

  /** Fastq files to write, each one is passed to the tool with its own -o */
  @Output(doc = "Output fastq files", shortName = "output", required = true)
  var output: List[File] = Nil

  override val defaultCoreMemory = 4.0

  /** Command to execute: the command of the parent followed by the input and output arguments */
  override def commandLine = {
    val javaCommand = super.commandLine
    val inputArgument = required("-I", input)
    val outputArguments = repeat("-o", output)
    javaCommand + inputArgument + outputArguments
  }
}

Peter van 't Hof's avatar
Peter van 't Hof committed
47
object FastqSplitter extends ToolCommand {

  /**
   * Args for commandline program
   * @param inputFile input fastq file
   * @param outputFile output fastq files
   */
  case class Args(inputFile: File = null, outputFile: List[File] = Nil) extends AbstractArgs

  /** Parser for the command line: one required input (-I) and one or more outputs (-o) */
  class OptParser extends AbstractOptParser {
    opt[File]('I', "inputFile") required () valueName ("<file>") action { (x, c) =>
      c.copy(inputFile = x)
    } text ("input is a required file property")
    // Every -o value is prepended, so Args.outputFile holds the files in reverse command line order
    opt[File]('o', "output") required () unbounded () valueName ("<file>") action { (x, c) =>
      c.copy(outputFile = x :: c.outputFile)
    } text ("out is a required file property")
  }

  /**
   * Program will split fastq file in multiple fastq files
   *
   * Reads are handed out in groups of 100 consecutive reads: the first group goes to the
   * first writer, the next group to the second writer, and after the last writer it starts
   * again at the first one. Writers follow the order of Args.outputFile.
   *
   * The reader and all writers are closed when splitting is done, also when it fails.
   *
   * @param args the command line arguments
   */
  def main(args: Array[String]): Unit = {
    val argsParser = new OptParser
    val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)

    // Number of consecutive reads a writer receives before the next writer takes over,
    // the same value is passed as second argument to each AsyncFastqWriter
    val groupSize = 100

    logger.info("Starting to split fastq file: " + commandArgs.inputFile)
    logger.info("Output files: " + commandArgs.outputFile.mkString(", "))

    var counter: Long = 0

    // The input is opened before the outputs, so an unreadable input does not leave
    // empty output files behind
    val reader = new FastqReader(commandArgs.inputFile)
    try {
      val output = for (file <- commandArgs.outputFile) yield new AsyncFastqWriter(new BasicFastqWriter(file), groupSize)
      try {
        while (reader.hasNext) {
          for (writer <- output) {
            // Give this writer at most groupSize reads, less when the input runs out
            var written = 0
            while (written < groupSize && reader.hasNext) {
              writer.write(reader.next())
              written += 1
              counter += 1
              if (counter % 1000000 == 0) logger.info(counter + " reads processed")
            }
          }
        }
      } finally {
        for (writer <- output) writer.close()
      }
    } finally {
      reader.close()
    }

    logger.info("Done, " + counter + " reads processed")
  }
}