BedRecordList.scala 7.2 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1
2
3
4
5
6
7
8
9
10
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
11
 * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
Peter van 't Hof's avatar
Peter van 't Hof committed
12
13
14
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
15
16
package nl.lumc.sasc.biopet.utils.intervals

17
import java.io.{ File, PrintWriter }
Peter van 't Hof's avatar
Peter van 't Hof committed
18

19
import htsjdk.samtools.SAMSequenceDictionary
20

Peter van 't Hof's avatar
Peter van 't Hof committed
21
import scala.collection.JavaConversions._
Peter van 't Hof's avatar
Peter van 't Hof committed
22
23
24
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
import scala.io.Source
Peter van 't Hof's avatar
Peter van 't Hof committed
25
import nl.lumc.sasc.biopet.utils.{ FastaUtils, Logging }
Peter van 't Hof's avatar
Peter van 't Hof committed
26
27
28
29

/**
 * Created by pjvan_thof on 8/20/15.
 */
30
case class BedRecordList(val chrRecords: Map[String, List[BedRecord]], val header: List[String] = Nil) {
Peter van 't Hof's avatar
Peter van 't Hof committed
31
32
  def allRecords = for (chr <- chrRecords; record <- chr._2) yield record

Peter van 't Hof's avatar
Peter van 't Hof committed
33
  def toSamIntervals = allRecords.map(_.toSamInterval)
34

35
  lazy val sorted = {
Peter van 't Hof's avatar
Peter van 't Hof committed
36
37
    val sorted = new BedRecordList(chrRecords.map(x => x._1 -> x._2.sortWith((a, b) => a.start < b.start)))
    if (sorted.chrRecords.forall(x => x._2 == chrRecords(x._1))) this else sorted
Peter van 't Hof's avatar
Peter van 't Hof committed
38
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
39

40
  lazy val isSorted = sorted.hashCode() == this.hashCode() || sorted.chrRecords.forall(x => x._2 == chrRecords(x._1))
Peter van 't Hof's avatar
Peter van 't Hof committed
41

42
  def overlapWith(record: BedRecord) = sorted.chrRecords
Peter van 't Hof's avatar
Peter van 't Hof committed
43
    .getOrElse(record.chr, Nil)
Peter van 't Hof's avatar
Peter van 't Hof committed
44
45
    .dropWhile(_.end <= record.start)
    .takeWhile(_.start < record.end)
Peter van 't Hof's avatar
Peter van 't Hof committed
46

47
  def length = allRecords.foldLeft(0L)((a, b) => a + b.length)
Peter van 't Hof's avatar
Peter van 't Hof committed
48

49
  def squishBed(strandSensitive: Boolean = true, nameSensitive: Boolean = true) = BedRecordList.fromList {
50
    (for ((chr, records) <- sorted.chrRecords; record <- records) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
51
      val overlaps = overlapWith(record)
52
        .filterNot(_ == record)
Peter van 't Hof's avatar
Peter van 't Hof committed
53
        .filterNot(strandSensitive && _.strand != record.strand)
54
        .filterNot(nameSensitive && _.name == record.name)
Peter van 't Hof's avatar
Peter van 't Hof committed
55
56
57
58
59
      if (overlaps.isEmpty) {
        List(record)
      } else {
        overlaps
          .foldLeft(List(record))((result, overlap) => {
60
            (for (r <- result) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
61
62
63
64
65
66
67
68
69
70
71
72
              if (r.overlapWith(overlap)) {
                (overlap.start <= r.start, overlap.end >= r.end) match {
                  case (true, true) =>
                    Nil
                  case (true, false) =>
                    List(r.copy(start = overlap.end, _originals = List(r)))
                  case (false, true) =>
                    List(r.copy(end = overlap.start, _originals = List(r)))
                  case (false, false) =>
                    List(r.copy(end = overlap.start, _originals = List(r)), r.copy(start = overlap.end, _originals = List(r)))
                }
              } else List(r)
73
74
            }).flatten
          })
Peter van 't Hof's avatar
Peter van 't Hof committed
75
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
76
77
    }).flatten
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
78

79
  def combineOverlap: BedRecordList = {
80
    new BedRecordList(for ((chr, records) <- sorted.chrRecords) yield chr -> {
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
      def combineOverlap(records: List[BedRecord],
                         newRecords: ListBuffer[BedRecord] = ListBuffer()): List[BedRecord] = {
        if (records.nonEmpty) {
          val chr = records.head.chr
          val start = records.head.start
          val overlapRecords = records.takeWhile(_.start <= records.head.end)
          val end = overlapRecords.map(_.end).max

          newRecords += BedRecord(chr, start, end, _originals = overlapRecords)
          combineOverlap(records.drop(overlapRecords.length), newRecords)
        } else newRecords.toList
      }
      combineOverlap(records)
    })
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
97
98
99
100
  def scatter(binSize: Int) = BedRecordList(
    chrRecords.map(x => x._1 -> x._2.flatMap(_.scatter(binSize)))
  )

101
  def validateContigs(reference: File) = {
Peter van 't Hof's avatar
Peter van 't Hof committed
102
    val dict = FastaUtils.getCachedDict(reference)
103
104
105
106
107
    val notExisting = chrRecords.keys.filter(dict.getSequence(_) == null).toList
    require(notExisting.isEmpty, s"Contigs found in bed records but are not existing in reference: ${notExisting.mkString(",")}")
    this
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
108
109
  def writeToFile(file: File): Unit = {
    val writer = new PrintWriter(file)
110
    header.foreach(writer.println)
Peter van 't Hof's avatar
Peter van 't Hof committed
111
112
113
    allRecords.foreach(writer.println)
    writer.close()
  }
114
115
116
117
118
119
120
121
122

  /** This return the fraction of the regions comparing to a length */
  def fractionOf(length: Long): Double = length.toDouble / length

  /** This return the fraction of the regions comparing to a reference */
  def fractionOfReference(dict: SAMSequenceDictionary): Double = fractionOf(dict.getReferenceLength)

  /** This return the fraction of the regions comparing to a reference */
  def fractionOfReference(file: File): Double = fractionOfReference(FastaUtils.getCachedDict(file))
Peter van 't Hof's avatar
Peter van 't Hof committed
123
124
125
}

object BedRecordList {
Peter van 't Hof's avatar
Peter van 't Hof committed
126
  def fromListWithHeader(records: Traversable[BedRecord],
127
                         header: List[String]): BedRecordList = fromListWithHeader(records.toIterator, header)
Peter van 't Hof's avatar
Peter van 't Hof committed
128

Peter van 't Hof's avatar
Peter van 't Hof committed
129
  def fromListWithHeader(records: TraversableOnce[BedRecord], header: List[String]): BedRecordList = {
Peter van 't Hof's avatar
Peter van 't Hof committed
130
131
132
133
134
    val map = mutable.Map[String, ListBuffer[BedRecord]]()
    for (record <- records) {
      if (!map.contains(record.chr)) map += record.chr -> ListBuffer()
      map(record.chr) += record
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
135
    new BedRecordList(map.toMap.map(m => m._1 -> m._2.toList), header)
Peter van 't Hof's avatar
Peter van 't Hof committed
136
137
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
138
139
140
141
  def fromList(records: Traversable[BedRecord]): BedRecordList = fromListWithHeader(records.toIterator, Nil)

  def fromList(records: TraversableOnce[BedRecord]): BedRecordList = fromListWithHeader(records, Nil)

142
  /**
Peter van 't Hof's avatar
Peter van 't Hof committed
143
144
145
146
147
   * This creates a [[BedRecordList]] based on multiple files. This method combines overlapping regions
   *
   * @param bedFiles Input bed files
   * @return
   */
148
149
150
151
152
  def fromFilesCombine(bedFiles: File*) = {
    fromFiles(bedFiles, combine = true)
  }

  /**
Peter van 't Hof's avatar
Peter van 't Hof committed
153
154
155
156
157
158
   * This creates a [[BedRecordList]] based on multiple files
   *
   * @param bedFiles Input bed files
   * @param combine When true overlaping regions are merged
   * @return
   */
159
  def fromFiles(bedFiles: Seq[File], combine: Boolean = false) = {
Peter van 't Hof's avatar
Peter van 't Hof committed
160
    val list = bedFiles.foldLeft(empty)((a, b) => fromList(fromFile(b).allRecords ++ a.allRecords))
161
162
163
164
165
166
167
    if (combine) list.combineOverlap
    else list
  }

  /** This created a empty [[BedRecordList]] */
  def empty = fromList(Nil)

Peter van 't Hof's avatar
Peter van 't Hof committed
168
  def fromFile(bedFile: File) = {
Peter van 't Hof's avatar
Peter van 't Hof committed
169
170
171
172
173
174
175
176
    val reader = Source.fromFile(bedFile)
    val all = reader.getLines().toList
    val header = all.takeWhile(x => x.startsWith("browser") || x.startsWith("track"))
    var lineCount = header.length
    val content = all.drop(lineCount)
    try {
      fromListWithHeader(content.map(line => {
        lineCount += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
177
        BedRecord.fromLine(line).validate
Peter van 't Hof's avatar
Peter van 't Hof committed
178
179
180
      }), header)
    } catch {
      case e: Exception =>
Peter van 't Hof's avatar
Peter van 't Hof committed
181
        Logging.logger.warn(s"Parsing line number $lineCount failed on file: ${bedFile.getAbsolutePath}")
Peter van 't Hof's avatar
Peter van 't Hof committed
182
        throw e
183
184
    } finally {
      reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
185
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
186
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
187

Peter van 't Hof's avatar
Peter van 't Hof committed
188
  def fromReference(file: File) = fromDict(FastaUtils.getCachedDict(file))
Peter van 't Hof's avatar
Peter van 't Hof committed
189

190
191
  def fromDict(dict: SAMSequenceDictionary) = {
    fromList(for (contig <- dict.getSequences) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
192
193
194
      BedRecord(contig.getSequenceName, 0, contig.getSequenceLength)
    })
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
195
}