ExtractAlignedFastq.scala 9.68 KB
Newer Older
bow's avatar
bow committed
1
2
3
4
5
6
7
/**
 * Copyright (c) 2014 Leiden University Medical Center - Sequencing Analysis Support Core <sasc@lumc.nl>
 * @author Wibowo Arindrarto <w.arindrarto@lumc.nl>
 */
package nl.lumc.sasc.biopet.tools

import java.io.File
8

bow's avatar
bow committed
9
10
import scala.collection.mutable.{ Set => MSet }
import scala.collection.JavaConverters._
bow's avatar
bow committed
11

bow's avatar
bow committed
12
import htsjdk.samtools.QueryInterval
bow's avatar
bow committed
13
14
import htsjdk.samtools.SamReaderFactory
import htsjdk.samtools.ValidationStringency
15
import htsjdk.samtools.fastq.{ BasicFastqWriter, FastqReader, FastqRecord }
bow's avatar
bow committed
16
import htsjdk.samtools.util.Interval
bow's avatar
bow committed
17
18
19
20
21

import nl.lumc.sasc.biopet.core.ToolCommand

object ExtractAlignedFastq extends ToolCommand {

bow's avatar
bow committed
22
  /** type alias for Fastq input (may or may not be paired) */
23
  type FastqInput = (FastqRecord, Option[FastqRecord])
bow's avatar
bow committed
24

bow's avatar
bow committed
25
  /** Get the FastqRecord ID */
26
27
  def fastqId(rec: FastqRecord) = rec.getReadHeader.split(" ")(0)

bow's avatar
bow committed
28
  /**
bow's avatar
bow committed
29
   * Function to create iterator over Interval given input interval string
bow's avatar
bow committed
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
   *
   * Valid interval strings are either of these:
   *    - chr5:10000-11000
   *    - chr5:10,000-11,000
   *    - chr5:10.000-11.000
   *    - chr5:10000-11,000
   * In all cases above, the region span base #10,000 to base number #11,000 in chromosome 5
   * (first base is numbered 1)
   *
   * An interval string with a single base is also allowed:
   *    - chr5:10000
   *    - chr5:10,000
   *    - chr5:10.000
   *
   * @param inStrings iterable yielding input interval string
   */
bow's avatar
bow committed
46
  def makeIntervalFromString(inStrings: Iterable[String]): Iterator[Interval] = {
bow's avatar
bow committed
47
48
49
50
51
52
53
54

    // FIXME: can we combine these two patterns into one regex?
    // matches intervals with start and end coordinates
    val ptn1 = """([\w_-]+):([\d.,]+)-([\d.,]+)""".r
    // matches intervals with start coordinate only
    val ptn2 = """([\w_-]+):([\d.,]+)""".r
    // make ints from coordinate strings
    // NOTE: while it is possible for coordinates to exceed Int.MaxValue, we are limited
bow's avatar
bow committed
55
    // by the Interval constructor only accepting ints
bow's avatar
bow committed
56
57
    def intFromCoord(s: String): Int = s.replaceAll(",", "").replaceAll("\\.", "").toInt

58
    inStrings.map {
bow's avatar
bow committed
59
60
61
62
      case ptn1(chr, start, end) => new Interval(chr, intFromCoord(start), intFromCoord(end))
      case ptn2(chr, start) =>
        val startCoord = intFromCoord(start)
        new Interval(chr, startCoord, startCoord)
Peter van 't Hof's avatar
Peter van 't Hof committed
63
      case otherwise => throw new IllegalArgumentException("Invalid interval string: " + otherwise)
64
    }.toIterator
bow's avatar
bow committed
65
66
  }

bow's avatar
bow committed
67
  /**
bow's avatar
bow committed
68
   * Function to create object that checks whether a given FASTQ record is mapped
bow's avatar
bow committed
69
70
   * to the given interval or not
   *
bow's avatar
bow committed
71
   * @param iv iterable yielding features to check
bow's avatar
bow committed
72
   * @param inAln input SAM/BAM file
bow's avatar
bow committed
73
   * @param minMapQ minimum mapping quality of read to include
bow's avatar
bow committed
74
   * @param commonSuffixLength length of suffix common to all read pairs
bow's avatar
bow committed
75
76
   * @return
   */
bow's avatar
bow committed
77
  def makeMembershipFunction(iv: Iterator[Interval],
bow's avatar
bow committed
78
                             inAln: File,
bow's avatar
bow committed
79
                             minMapQ: Int = 0,
80
                             commonSuffixLength: Int = 0): (FastqInput => Boolean) = {
bow's avatar
bow committed
81

bow's avatar
bow committed
82
83
84
85
    val inAlnReader = SamReaderFactory
      .make()
      .validationStringency(ValidationStringency.LENIENT)
      .open(inAln)
bow's avatar
bow committed
86
    require(inAlnReader.hasIndex)
bow's avatar
bow committed
87

bow's avatar
bow committed
88
    def getSequenceIndex(name: String): Int = inAlnReader.getFileHeader.getSequenceIndex(name) match {
Peter van 't Hof's avatar
Peter van 't Hof committed
89
      case x if x >= 0 =>
bow's avatar
bow committed
90
        x
Peter van 't Hof's avatar
Peter van 't Hof committed
91
      case otherwise =>
bow's avatar
bow committed
92
93
94
        throw new IllegalArgumentException("Chromosome " + name + " is not found in the alignment file")
    }

bow's avatar
bow committed
95
    val queries: Array[QueryInterval] = iv.toList
bow's avatar
bow committed
96
97
      // transform to QueryInterval
      .map(x => new QueryInterval(getSequenceIndex(x.getSequence), x.getStart, x.getEnd))
bow's avatar
bow committed
98
99
      // sort Interval
      .sortBy(x => (x.referenceIndex, x.start, x.end))
bow's avatar
bow committed
100
      // cast to array
bow's avatar
bow committed
101
102
103
104
105
106
107
      .toArray

    lazy val selected: MSet[String] = inAlnReader
      // query BAM file for overlapping reads
      .queryOverlapping(queries)
      // for Scala compatibility
      .asScala
bow's avatar
bow committed
108
109
      // filter based on mapping quality
      .filter(x => x.getMappingQuality >= minMapQ)
bow's avatar
bow committed
110
111
      // iteratively add read name to the selected set
      .foldLeft(MSet.empty[String])(
bow's avatar
bow committed
112
113
114
115
        (acc, x) => {
          logger.debug("Adding " + x.getReadName + " to set ...")
          acc += x.getReadName
        }
bow's avatar
bow committed
116
      )
bow's avatar
bow committed
117

118
    (pair: FastqInput) => pair._2 match {
119
      case None => selected.contains(fastqId(pair._1))
120
      case Some(x) =>
121
122
123
124
        val rec1Id = fastqId(pair._1)
        require(commonSuffixLength < rec1Id.length)
        require(commonSuffixLength < fastqId(x).length)
        selected.contains(rec1Id.dropRight(commonSuffixLength))
bow's avatar
bow committed
125
126
127
    }
  }

bow's avatar
bow committed
128
129
130
131
132
133
134
  /**
   * Extracts reads from the given input Fastq file and writes to a new output Fastq file
   *
   * @param memFunc Predicate for extracting reads. If evaluates to true, the read is extracted.
   * @param inputFastq1 Input [[FastqReader]] object.
   * @param outputFastq1 Output [[BasicFastqWriter]] object.
   */
135
  def extractReads(memFunc: FastqInput => Boolean,
bow's avatar
bow committed
136
                   inputFastq1: FastqReader, outputFastq1: BasicFastqWriter): Unit =
137
138
    inputFastq1.iterator.asScala
      .zip(Iterator.continually(None))
139
      .filter(rec => memFunc(rec._1, rec._2))
140
141
      .foreach(rec => outputFastq1.write(rec._1))

bow's avatar
bow committed
142
143
144
145
146
147
148
149
150
  /**
   * Extracts reads from the given input Fastq pairs and writes to new output Fastq pair files
   *
   * @param memFunc Predicate for extracting reads. If evaluates to true, the read is extracted.
   * @param inputFastq1 Input [[FastqReader]] object for pair 1.
   * @param outputFastq1 Input [[FastqReader]] object for pair 2.
   * @param inputFastq2 Output [[BasicFastqWriter]] object for pair 1.
   * @param outputFastq2 Output [[BasicFastqWriter]] object for pair 2.
   */
151
  def extractReads(memFunc: FastqInput => Boolean,
bow's avatar
bow committed
152
153
                   inputFastq1: FastqReader, outputFastq1: BasicFastqWriter,
                   inputFastq2: FastqReader, outputFastq2: BasicFastqWriter): Unit =
154
155
156
157
158
159
160
    inputFastq1.iterator.asScala
      .zip(inputFastq2.iterator.asScala)
      .filter(rec => memFunc(rec._1, Some(rec._2)))
      .foreach(rec => {
        outputFastq1.write(rec._1)
        outputFastq2.write(rec._2)
      })
bow's avatar
bow committed
161

bow's avatar
bow committed
162
  /** Default arguments */
163
  case class Args(inputBam: File = new File(""),
bow's avatar
bow committed
164
                  intervals: List[String] = List.empty[String],
165
166
167
168
                  inputFastq1: File = new File(""),
                  inputFastq2: Option[File] = None,
                  outputFastq1: File = new File(""),
                  outputFastq2: Option[File] = None,
bow's avatar
bow committed
169
170
                  minMapQ: Int = 0,
                  commonSuffixLength: Int = 0) extends AbstractArgs
bow's avatar
bow committed
171

bow's avatar
bow committed
172
  /** Command line argument parser */
bow's avatar
bow committed
173
174
175
176
177
178
179
  class OptParser extends AbstractOptParser {

    head(
      s"""
        |$commandName - Select aligned FASTQ records
      """.stripMargin)

bow's avatar
bow committed
180
181
182
    opt[File]('I', "input_file") required () valueName "<bam>" action { (x, c) =>
      c.copy(inputBam = x)
    } validate {
bow's avatar
bow committed
183
184
185
      x => if (x.exists) success else failure("Input BAM file not found")
    } text "Input BAM file"

bow's avatar
bow committed
186
    opt[String]('r', "interval") required () unbounded () valueName "<interval>" action { (x, c) =>
bow's avatar
bow committed
187
      // yes, we are appending and yes it's O(n) ~ preserving order is more important than speed here
bow's avatar
bow committed
188
189
      c.copy(intervals = c.intervals :+ x)
    } text "Interval strings"
bow's avatar
bow committed
190

bow's avatar
bow committed
191
192
193
    opt[File]('i', "in1") required () valueName "<fastq>" action { (x, c) =>
      c.copy(inputFastq1 = x)
    } validate {
bow's avatar
bow committed
194
195
196
      x => if (x.exists) success else failure("Input FASTQ file 1 not found")
    } text "Input FASTQ file 1"

bow's avatar
bow committed
197
    opt[File]('j', "in2") optional () valueName "<fastq>" action { (x, c) =>
198
      c.copy(inputFastq2 = Option(x))
bow's avatar
bow committed
199
    } validate {
bow's avatar
bow committed
200
201
202
      x => if (x.exists) success else failure("Input FASTQ file 2 not found")
    } text "Input FASTQ file 2 (default: none)"

bow's avatar
bow committed
203
204
205
    opt[File]('o', "out1") required () valueName "<fastq>" action { (x, c) =>
      c.copy(outputFastq1 = x)
    } text "Output FASTQ file 1"
bow's avatar
bow committed
206

bow's avatar
bow committed
207
    opt[File]('p', "out2") optional () valueName "<fastq>" action { (x, c) =>
208
      c.copy(outputFastq2 = Option(x))
bow's avatar
bow committed
209
    } text "Output FASTQ file 2 (default: none)"
bow's avatar
bow committed
210

bow's avatar
bow committed
211
212
213
    opt[Int]('Q', "min_mapq") optional () action { (x, c) =>
      c.copy(minMapQ = x)
    } text "Minimum MAPQ of reads in target region to remove (default: 0)"
bow's avatar
bow committed
214

bow's avatar
bow committed
215
216
217
    opt[Int]('s', "read_suffix_length") optional () action { (x, c) =>
      c.copy(commonSuffixLength = x)
    } text "Length of common suffix from each read pair (default: 0)"
218

bow's avatar
bow committed
219
220
221
222
223
    note(
      """
        |This tool creates FASTQ file(s) containing reads mapped to the given alignment intervals.
      """.stripMargin)

bow's avatar
bow committed
224
    checkConfig { c =>
225
      if (c.inputFastq2 != None && c.outputFastq2 == None)
bow's avatar
bow committed
226
        failure("Missing output FASTQ file 2")
227
      else if (c.inputFastq2 == None && c.outputFastq2 != None)
bow's avatar
bow committed
228
229
230
231
232
233
        failure("Missing input FASTQ file 2")
      else
        success
    }
  }

bow's avatar
bow committed
234
  /** Parses the command line argument */
bow's avatar
bow committed
235
236
  def parseArgs(args: Array[String]): Args =
    new OptParser()
bow's avatar
bow committed
237
238
      .parse(args, Args())
      .getOrElse(sys.exit(1))
bow's avatar
bow committed
239

bow's avatar
bow committed
240
241
242
243
  def main(args: Array[String]): Unit = {

    val commandArgs: Args = parseArgs(args)

bow's avatar
bow committed
244
    val memFunc = makeMembershipFunction(
bow's avatar
bow committed
245
      iv = makeIntervalFromString(commandArgs.intervals),
bow's avatar
bow committed
246
247
248
249
      inAln = commandArgs.inputBam,
      minMapQ = commandArgs.minMapQ,
      commonSuffixLength = commandArgs.commonSuffixLength)

250
    logger.info("Writing to output file(s) ...")
251
252
253
254
255
256
257
258
259
260
261
262
    (commandArgs.inputFastq2, commandArgs.outputFastq2) match {

      case (None, None) => extractReads(memFunc,
        new FastqReader(commandArgs.inputFastq1),
        new BasicFastqWriter(commandArgs.inputFastq1))

      case (Some(i2), Some(o2)) => extractReads(memFunc,
        new FastqReader(commandArgs.inputFastq1),
        new BasicFastqWriter(commandArgs.outputFastq1),
        new FastqReader(i2),
        new BasicFastqWriter(o2))

bow's avatar
bow committed
263
      case _ => // handled by the command line config check above
264
    }
bow's avatar
bow committed
265
266
  }
}