From ffe6ea8cff029df52682280a703f20393e9a0e14 Mon Sep 17 00:00:00 2001 From: Peter van 't Hof <p.j.van_t_hof@lumc.nl> Date: Thu, 20 Aug 2015 17:53:43 +0200 Subject: [PATCH] Added a generic bed reader --- .../biopet/utils/intervals/BedRecord.scala | 4 +- .../utils/intervals/BedRecordList.scala | 57 +++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecordList.scala diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecord.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecord.scala index 441c0652d..eb3e2ac82 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecord.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecord.scala @@ -15,6 +15,8 @@ case class BedRecord(chr: String, blockCount: Option[Int] = None, blockSizes: Array[Int] = Array(), blockStarts: Array[Int] = Array()) { + + //TODO: Complete bed line output override def toString = { s"$chr\t$start\t$end" } @@ -37,7 +39,7 @@ object BedRecord { }, values.lift(6).map(_.toInt), values.lift(7)map(_.toInt), - values.lift(8).map(_.split(",", 3).map(_.toInt)).map(x => (x(0),x(1),x(2))), + values.lift(8).map(_.split(",", 3).map(_.toInt)).map(x => (x.lift(0).getOrElse(0),x.lift(1).getOrElse(0),x.lift(2).getOrElse(0))), values.lift(9).map(_.toInt), values.lift(10).map(_.split(",").map(_.toInt)).getOrElse(Array()), values.lift(11).map(_.split(",").map(_.toInt)).getOrElse(Array()) diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecordList.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecordList.scala new file mode 100644 index 000000000..f734464a3 --- /dev/null +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecordList.scala @@ -0,0 
+1,89 @@
+package nl.lumc.sasc.biopet.utils.intervals
+
+import java.io.File
+
+import htsjdk.samtools.util.Interval
+
+import scala.annotation.tailrec
+import scala.collection.mutable
+import scala.collection.mutable.ListBuffer
+import scala.io.Source
+
+/**
+ * Bed records grouped per chromosome name.
+ *
+ * Created by pjvan_thof on 8/20/15.
+ *
+ * @param chrRecords records of each chromosome, keyed by chromosome name
+ */
+class BedRecordList(val chrRecords: Map[String, List[BedRecord]]) {
+
+  /** All records of all chromosomes in one collection. */
+  def allRecords = for (chr <- chrRecords; record <- chr._2) yield record
+
+  /** New list in which the records of every chromosome are ordered by start position. */
+  def sort: BedRecordList =
+    new BedRecordList(chrRecords.map { case (chr, records) => chr -> records.sortBy(_.start) })
+
+  /**
+   * Records on the chromosome of `record` whose range overlaps or touches the range of `record`.
+   *
+   * Both boundaries are compared inclusively. Every stored record is tested on its own, so the
+   * result is correct for unsorted lists and for records that are nested in a longer one.
+   *
+   * @param record region to query
+   * @return matching records in stored order, empty when the chromosome is unknown
+   */
+  def overlapWith(record: BedRecord): List[BedRecord] = chrRecords
+    .getOrElse(record.chr, Nil)
+    .filter(other => other.end >= record.start && other.start <= record.end)
+}
+
+object BedRecordList {
+
+  /** Groups the records per chromosome; within a chromosome the input order is kept. */
+  def fromList(records: Traversable[BedRecord]): BedRecordList = fromList(records.toIterator)
+
+  /**
+   * Groups the records per chromosome; within a chromosome the input order is kept.
+   *
+   * @param records records to group, consumed exactly once
+   */
+  def fromList(records: TraversableOnce[BedRecord]): BedRecordList =
+    new BedRecordList(records.toList.groupBy(_.chr))
+
+  /**
+   * Parses every line of a bed file with `BedRecord.fromLine` and groups the result per chromosome.
+   *
+   * @param bedFile file to read
+   */
+  def fromFile(bedFile: File): BedRecordList = {
+    val reader = Source.fromFile(bedFile)
+    // fromList consumes the line iterator completely, so the file can be closed right after it
+    try fromList(reader.getLines().map(BedRecord.fromLine(_)))
+    finally reader.close()
+  }
+
+  /**
+   * Merges overlapping or touching records per chromosome.
+   *
+   * The merged records only carry chr, start and end; the output is ordered by start.
+   *
+   * @param list records to merge, does not need to be sorted
+   */
+  def combineOverlap(list: BedRecordList): BedRecordList =
+    new BedRecordList(list.sort.chrRecords.map {
+      case (chr, records) => chr -> mergeSorted(records)
+    })
+
+  /** Merges a start sorted list of records of one chromosome into non overlapping regions. */
+  private def mergeSorted(records: List[BedRecord]): List[BedRecord] =
+    records.foldLeft(List.empty[BedRecord]) {
+      // Extending the running region (instead of only looking at the first record) also merges
+      // chains such as 1-10, 5-20, 15-30 into a single region.
+      case (current :: done, record) if record.start <= current.end =>
+        BedRecord(current.chr, current.start, math.max(current.end, record.end)) :: done
+      case (done, record) =>
+        BedRecord(record.chr, record.start, record.end) :: done
+    }.reverse
+}
\ No newline at end of file
-- 
GitLab