From 68b01fdc371823840539bc51bd2ebf686409d8ba Mon Sep 17 00:00:00 2001 From: Peter van 't Hof <p.j.van_t_hof@lumc.nl> Date: Sun, 23 Aug 2015 13:49:46 +0200 Subject: [PATCH] Added UCSC header skipping --- .../biopet/utils/intervals/BedRecord.scala | 2 +- .../utils/intervals/BedRecordList.scala | 37 ++++++++++++------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecord.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecord.scala index c395d0bc5..9a798fbaf 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecord.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecord.scala @@ -81,7 +81,7 @@ case class BedRecord(chr: String, object BedRecord { def fromLine(line: String): BedRecord = { val values = line.split("\t") - require(values.length >= 3) + require(values.length >= 3, "Not enough columns count for a bed file") BedRecord( values(0), values(1).toInt, diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecordList.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecordList.scala index 783f32481..029356e5f 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecordList.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/utils/intervals/BedRecordList.scala @@ -10,7 +10,7 @@ import nl.lumc.sasc.biopet.core.Logging /** * Created by pjvan_thof on 8/20/15. */ -class BedRecordList(val chrRecords: Map[String, List[BedRecord]]) { +class BedRecordList(val chrRecords: Map[String, List[BedRecord]], header: List[String] = Nil) { def allRecords = for (chr <- chrRecords; record <- chr._2) yield record lazy val sort = { @@ -58,29 +58,38 @@ class BedRecordList(val chrRecords: Map[String, List[BedRecord]]) { } object BedRecordList { - def fromList(records: Traversable[BedRecord]): BedRecordList = fromList(records.toIterator) + def fromListWithHeader(records: Traversable[BedRecord], + header: List[String]): BedRecordList = fromListWithHeader(records.toIterator, header) - def fromList(records: TraversableOnce[BedRecord]): BedRecordList = { + def fromListWithHeader(records: TraversableOnce[BedRecord], header: List[String]): BedRecordList = { val map = mutable.Map[String, ListBuffer[BedRecord]]() for (record <- records) { if (!map.contains(record.chr)) map += record.chr -> ListBuffer() map(record.chr) += record } - new BedRecordList(map.toMap.map(m => m._1 -> m._2.toList)) + new BedRecordList(map.toMap.map(m => m._1 -> m._2.toList), header) } + def fromList(records: Traversable[BedRecord]): BedRecordList = fromListWithHeader(records.toIterator, Nil) + + def fromList(records: TraversableOnce[BedRecord]): BedRecordList = fromListWithHeader(records, Nil) + def fromFile(bedFile: File) = { - var lineCount = 0L - fromList(Source.fromFile(bedFile).getLines().map(line => { - lineCount += 1 - try { + val reader = Source.fromFile(bedFile) + val all = reader.getLines().toList + val header = all.takeWhile(x => x.startsWith("browser") || x.startsWith("track")) + var lineCount = header.length + val content = all.drop(lineCount) + try { + fromListWithHeader(content.map(line => { + lineCount += 1 BedRecord.fromLine(line).validate - } catch { - case e: Exception => - Logging.logger.error(s"Parsing line number $lineCount failed on file: ${bedFile.getAbsolutePath}") - throw e - } - })) + }), header) + } catch { + case e: Exception => + Logging.logger.error(s"Parsing line number $lineCount failed on file: ${bedFile.getAbsolutePath}") + throw e + } } def combineOverlap(list: BedRecordList): BedRecordList = { -- GitLab