BedRecordList.scala 6.04 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1
2
3
4
5
6
7
8
9
10
/**
 * Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
 * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
 * should also be able to execute Biopet tools and pipelines.
 *
 * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
 *
 * Contact us at: sasc@lumc.nl
 *
11
 * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
Peter van 't Hof's avatar
Peter van 't Hof committed
12
13
14
 * license; For commercial users or users who do not want to follow the AGPL
 * license, please contact us to obtain a separate license.
 */
Peter van 't Hof's avatar
Peter van 't Hof committed
15
16
package nl.lumc.sasc.biopet.utils.intervals

17
import java.io.{ File, PrintWriter }
Peter van 't Hof's avatar
Peter van 't Hof committed
18

19
import htsjdk.samtools.SAMSequenceDictionary
Peter van 't Hof's avatar
Peter van 't Hof committed
20
import htsjdk.samtools.reference.FastaSequenceFile
21

Peter van 't Hof's avatar
Peter van 't Hof committed
22
import scala.collection.JavaConversions._
Peter van 't Hof's avatar
Peter van 't Hof committed
23
24
25
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
import scala.io.Source
Peter van 't Hof's avatar
Peter van 't Hof committed
26
import nl.lumc.sasc.biopet.utils.{ FastaUtils, Logging }
Peter van 't Hof's avatar
Peter van 't Hof committed
27
28
29
30

/**
 * Created by pjvan_thof on 8/20/15.
 */
31
case class BedRecordList(val chrRecords: Map[String, List[BedRecord]], val header: List[String] = Nil) {
Peter van 't Hof's avatar
Peter van 't Hof committed
32
33
  def allRecords = for (chr <- chrRecords; record <- chr._2) yield record

Peter van 't Hof's avatar
Peter van 't Hof committed
34
  def toSamIntervals = allRecords.map(_.toSamInterval)
35

36
  lazy val sorted = {
Peter van 't Hof's avatar
Peter van 't Hof committed
37
38
    val sorted = new BedRecordList(chrRecords.map(x => x._1 -> x._2.sortWith((a, b) => a.start < b.start)))
    if (sorted.chrRecords.forall(x => x._2 == chrRecords(x._1))) this else sorted
Peter van 't Hof's avatar
Peter van 't Hof committed
39
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
40

41
  lazy val isSorted = sorted.hashCode() == this.hashCode() || sorted.chrRecords.forall(x => x._2 == chrRecords(x._1))
Peter van 't Hof's avatar
Peter van 't Hof committed
42

43
  def overlapWith(record: BedRecord) = sorted.chrRecords
Peter van 't Hof's avatar
Peter van 't Hof committed
44
    .getOrElse(record.chr, Nil)
Peter van 't Hof's avatar
Peter van 't Hof committed
45
46
    .dropWhile(_.end <= record.start)
    .takeWhile(_.start < record.end)
Peter van 't Hof's avatar
Peter van 't Hof committed
47

48
  def length = allRecords.foldLeft(0L)((a, b) => a + b.length)
Peter van 't Hof's avatar
Peter van 't Hof committed
49

50
  def squishBed(strandSensitive: Boolean = true, nameSensitive: Boolean = true) = BedRecordList.fromList {
51
    (for ((chr, records) <- sorted.chrRecords; record <- records) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
52
      val overlaps = overlapWith(record)
53
        .filterNot(_ == record)
Peter van 't Hof's avatar
Peter van 't Hof committed
54
        .filterNot(strandSensitive && _.strand != record.strand)
55
        .filterNot(nameSensitive && _.name == record.name)
Peter van 't Hof's avatar
Peter van 't Hof committed
56
57
58
59
60
      if (overlaps.isEmpty) {
        List(record)
      } else {
        overlaps
          .foldLeft(List(record))((result, overlap) => {
61
            (for (r <- result) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
62
63
64
65
66
67
68
69
70
71
72
73
              if (r.overlapWith(overlap)) {
                (overlap.start <= r.start, overlap.end >= r.end) match {
                  case (true, true) =>
                    Nil
                  case (true, false) =>
                    List(r.copy(start = overlap.end, _originals = List(r)))
                  case (false, true) =>
                    List(r.copy(end = overlap.start, _originals = List(r)))
                  case (false, false) =>
                    List(r.copy(end = overlap.start, _originals = List(r)), r.copy(start = overlap.end, _originals = List(r)))
                }
              } else List(r)
74
75
            }).flatten
          })
Peter van 't Hof's avatar
Peter van 't Hof committed
76
      }
Peter van 't Hof's avatar
Peter van 't Hof committed
77
78
    }).flatten
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
79

80
  def combineOverlap: BedRecordList = {
81
    new BedRecordList(for ((chr, records) <- sorted.chrRecords) yield chr -> {
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
      def combineOverlap(records: List[BedRecord],
                         newRecords: ListBuffer[BedRecord] = ListBuffer()): List[BedRecord] = {
        if (records.nonEmpty) {
          val chr = records.head.chr
          val start = records.head.start
          val overlapRecords = records.takeWhile(_.start <= records.head.end)
          val end = overlapRecords.map(_.end).max

          newRecords += BedRecord(chr, start, end, _originals = overlapRecords)
          combineOverlap(records.drop(overlapRecords.length), newRecords)
        } else newRecords.toList
      }
      combineOverlap(records)
    })
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
98
99
100
101
  def scatter(binSize: Int) = BedRecordList(
    chrRecords.map(x => x._1 -> x._2.flatMap(_.scatter(binSize)))
  )

102
  def validateContigs(reference: File) = {
Peter van 't Hof's avatar
Peter van 't Hof committed
103
    val dict = FastaUtils.getCachedDict(reference)
104
105
106
107
108
    val notExisting = chrRecords.keys.filter(dict.getSequence(_) == null).toList
    require(notExisting.isEmpty, s"Contigs found in bed records but are not existing in reference: ${notExisting.mkString(",")}")
    this
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
109
110
  def writeToFile(file: File): Unit = {
    val writer = new PrintWriter(file)
111
    header.foreach(writer.println)
Peter van 't Hof's avatar
Peter van 't Hof committed
112
113
114
    allRecords.foreach(writer.println)
    writer.close()
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
115
116
117
}

object BedRecordList {
Peter van 't Hof's avatar
Peter van 't Hof committed
118
  def fromListWithHeader(records: Traversable[BedRecord],
119
                         header: List[String]): BedRecordList = fromListWithHeader(records.toIterator, header)
Peter van 't Hof's avatar
Peter van 't Hof committed
120

Peter van 't Hof's avatar
Peter van 't Hof committed
121
  def fromListWithHeader(records: TraversableOnce[BedRecord], header: List[String]): BedRecordList = {
Peter van 't Hof's avatar
Peter van 't Hof committed
122
123
124
125
126
    val map = mutable.Map[String, ListBuffer[BedRecord]]()
    for (record <- records) {
      if (!map.contains(record.chr)) map += record.chr -> ListBuffer()
      map(record.chr) += record
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
127
    new BedRecordList(map.toMap.map(m => m._1 -> m._2.toList), header)
Peter van 't Hof's avatar
Peter van 't Hof committed
128
129
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
130
131
132
133
  def fromList(records: Traversable[BedRecord]): BedRecordList = fromListWithHeader(records.toIterator, Nil)

  def fromList(records: TraversableOnce[BedRecord]): BedRecordList = fromListWithHeader(records, Nil)

Peter van 't Hof's avatar
Peter van 't Hof committed
134
  def fromFile(bedFile: File) = {
Peter van 't Hof's avatar
Peter van 't Hof committed
135
136
137
138
139
140
141
142
    val reader = Source.fromFile(bedFile)
    val all = reader.getLines().toList
    val header = all.takeWhile(x => x.startsWith("browser") || x.startsWith("track"))
    var lineCount = header.length
    val content = all.drop(lineCount)
    try {
      fromListWithHeader(content.map(line => {
        lineCount += 1
Peter van 't Hof's avatar
Peter van 't Hof committed
143
        BedRecord.fromLine(line).validate
Peter van 't Hof's avatar
Peter van 't Hof committed
144
145
146
      }), header)
    } catch {
      case e: Exception =>
Peter van 't Hof's avatar
Peter van 't Hof committed
147
        Logging.logger.warn(s"Parsing line number $lineCount failed on file: ${bedFile.getAbsolutePath}")
Peter van 't Hof's avatar
Peter van 't Hof committed
148
        throw e
149
150
    } finally {
      reader.close()
Peter van 't Hof's avatar
Peter van 't Hof committed
151
    }
Peter van 't Hof's avatar
Peter van 't Hof committed
152
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
153

Peter van 't Hof's avatar
Peter van 't Hof committed
154
  def fromReference(file: File) = fromDict(FastaUtils.getCachedDict(file))
Peter van 't Hof's avatar
Peter van 't Hof committed
155

156
157
  def fromDict(dict: SAMSequenceDictionary) = {
    fromList(for (contig <- dict.getSequences) yield {
Peter van 't Hof's avatar
Peter van 't Hof committed
158
159
160
      BedRecord(contig.getSequenceName, 0, contig.getSequenceLength)
    })
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
161
}