Commit 9b5aa989 authored by Peter van 't Hof

Merge branch 'feature-wipereads' into 'develop'

Feature wipereads (final)

Now we have simple GTF + refFlat support, with more tests + some cleanup :).

The feature branch can be deleted afterwards.

See merge request !33
parents a92bc861 96d99632
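The core of the change is that makeIntervalFromFile now dispatches on the interval file's extension (.bed, .refflat or .gtf) and, for GTF input, takes the feature type to keep. A minimal sketch of the new call surface, assuming the Biopet/htsjdk classpath is available; the file paths and the wrapping object are illustrative, and only makeIntervalFromFile with its gtfFeatureType parameter comes from this merge request:

import java.io.File
import htsjdk.samtools.util.Interval // import path assumed; the diff only shows how Interval is used
import nl.lumc.sasc.biopet.tools.WipeReads.makeIntervalFromFile

object IntervalExample {
  def main(args: Array[String]): Unit = {
    // BED and refFlat input need no extra arguments; the format is picked from the extension.
    val bedIvs: List[Interval] = makeIntervalFromFile(new File("targets.bed"))
    // For GTF input only records whose feature column matches gtfFeatureType
    // (default "exon", settable on the command line via -t/--feature_type) become intervals.
    val gtfIvs: List[Interval] = makeIntervalFromFile(new File("targets.gtf"), gtfFeatureType = "exon")
    (bedIvs ++ gtfIvs).foreach(iv => println(s"${iv.getSequence}:${iv.getStart}-${iv.getEnd}"))
  }
}

Any other extension raises an IllegalArgumentException ("Unexpected interval file type: ..."), which the new testMakeIntervalFromUnknown test below exercises.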
@@ -6,6 +6,7 @@ package nl.lumc.sasc.biopet.tools
import java.io.File
import scala.collection.JavaConverters._
import scala.io.Source
import scala.math.{ max, min }
import com.google.common.hash.{ Funnel, BloomFilter, PrimitiveSink }
@@ -73,7 +74,7 @@ object WipeReads extends ToolCommand {
*
* @param inFile input interval file
*/
def makeIntervalFromFile(inFile: File): List[Interval] = {
def makeIntervalFromFile(inFile: File, gtfFeatureType: String = "exon"): List[Interval] = {
logger.info("Parsing interval file ...")
@@ -83,20 +84,62 @@ object WipeReads extends ToolCommand {
.asScala
.map(x => new Interval(x.getChr, x.getStart, x.getEnd))
/** Function to create iterator from refFlat file */
def makeIntervalFromRefFlat(inFile: File): Iterator[Interval] = ???
// convert coordinate to 1-based fully closed
// parse chrom, start blocks, end blocks, strands
/**
* Parses a refFlat file to yield Interval objects
*
* Format description:
* http://genome.csdb.cn/cgi-bin/hgTables?hgsid=6&hgta_doSchemaDb=hg18&hgta_doSchemaTable=refFlat
*
* @param inFile input refFlat file
*/
def makeIntervalFromRefFlat(inFile: File): Iterator[Interval] =
Source.fromFile(inFile)
// read each line
.getLines()
// skip all empty lines
.filterNot(x => x.trim.isEmpty)
// split per column
.map(line => line.trim.split("\t"))
// take chromosome and exonEnds and exonStarts
.map(x => (x(2), x.reverse.take(2)))
// split starts and ends based on comma
.map(x => (x._1, x._2.map(y => y.split(","))))
// zip exonStarts and exonEnds, note the index was reversed because we did .reverse above
.map(x => (x._1, x._2(1).zip(x._2(0))))
// make Intervals, accounting for the fact that refFlat coordinates are 0-based
.map(x => x._2.map(y => new Interval(x._1, y._1.toInt + 1, y._2.toInt)))
// flatten sublist
.flatten
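To make the 0-based to 1-based shift above concrete, here is how the single-exon chrS record from the rrna01.refFlat test fixture (added in this merge request) maps through the pipeline; the resulting coordinates are exactly what testMakeIntervalFromRefFlat asserts for intervals.head:

// refFlat record:  gene1  trans1  chrS  +  100  500  100  500  1  100,  500,
// columns kept:    chrom = "chrS", exonStarts = "100,", exonEnds = "500,"
// result:          new Interval("chrS", 101, 500)   // start + 1, end unchanged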
/** Function to create iterator from GTF file */
def makeIntervalFromGtf(inFile: File): Iterator[Interval] = ???
// convert coordinate to 1-based fully closed
// parse chrom, start blocks, end blocks, strands
/**
* Parses a GTF file to yield Interval objects
*
* @param inFile input GTF file
* @return iterator of Interval objects parsed from the GTF records
*/
def makeIntervalFromGtf(inFile: File): Iterator[Interval] =
Source.fromFile(inFile)
// read each line
.getLines()
// skip all empty lines
.filterNot(x => x.trim.isEmpty)
// skip all UCSC track lines and/or ensembl comment lines
.dropWhile(x => x.matches("^track | ^browser | ^#"))
// split to columns
.map(x => x.split("\t"))
// exclude intervals whose type is different from the supplied one
.filter(x => x(2) == gtfFeatureType)
// and finally create the interval objects
.map(x => new Interval(x(0), x(3).toInt, x(4).toInt))
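GTF coordinates are already 1-based and fully closed, so unlike the refFlat case no shift is needed; only the feature-type filter matters. Using the first exon record of the rrna01.gtf fixture (the interval testMakeIntervalFromGtf asserts for intervals.head):

// GTF record:  chrQ  ensembl  exon  669  778  .  -  .  gene_id "ENSG00000252956";
// x(2) == "exon" passes the filter; the gene and transcript records on the same
// coordinates are dropped.
// result:      new Interval("chrQ", 669, 778)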
// detect interval file format from extension
val iterFunc: (File => Iterator[Interval]) =
if (getExtension(inFile.toString.toLowerCase) == "bed")
makeIntervalFromBed
else if (getExtension(inFile.toString.toLowerCase) == "refflat")
makeIntervalFromRefFlat
else if (getExtension(inFile.toString.toLowerCase) == "gtf")
makeIntervalFromGtf
else
throw new IllegalArgumentException("Unexpected interval file type: " + inFile.getPath)
@@ -107,7 +150,7 @@ object WipeReads extends ToolCommand {
acc match {
case head :: tail if x.intersects(head) =>
new Interval(x.getSequence, min(x.getStart, head.getStart), max(x.getEnd, head.getEnd)) :: tail
case _ => x :: acc
case _ => x :: acc
}
}
)
@@ -150,12 +193,10 @@ object WipeReads extends ToolCommand {
else if (iv.getSequence.startsWith("chr") && getIndex(iv.getSequence.substring(3)) > -1) {
logger.warn("Removing 'chr' prefix from interval " + iv.toString)
Some(new QueryInterval(getIndex(iv.getSequence.substring(3)), iv.getStart, iv.getEnd))
}
else if (!iv.getSequence.startsWith("chr") && getIndex("chr" + iv.getSequence) > -1) {
} else if (!iv.getSequence.startsWith("chr") && getIndex("chr" + iv.getSequence) > -1) {
logger.warn("Adding 'chr' prefix to interval " + iv.toString)
Some(new QueryInterval(getIndex("chr" + iv.getSequence), iv.getStart, iv.getEnd))
}
else {
} else {
logger.warn("Sequence " + iv.getSequence + " does not exist in alignment")
None
}
@@ -303,6 +344,7 @@ object WipeReads extends ToolCommand {
minMapQ: Int = 0,
limitToRegion: Boolean = false,
noMakeIndex: Boolean = false,
featureType: String = "exon",
bloomSize: Long = 70000000,
bloomFp: Double = 4e-7) extends AbstractArgs
@@ -319,7 +361,7 @@ object WipeReads extends ToolCommand {
x => if (x.exists) success else failure("Input BAM file not found")
} text "Input BAM file"
opt[File]('r', "interval_file") required () valueName "<bed>" action { (x, c) =>
opt[File]('r', "interval_file") required () valueName "<bed/gtf/refflat>" action { (x, c) =>
c.copy(targetRegions = x)
} validate {
x => if (x.exists) success else failure("Target regions file not found")
@@ -351,15 +393,21 @@ object WipeReads extends ToolCommand {
} text
"Whether to index output BAM file or not (default: yes)"
note("\nAdvanced options")
note("\nGTF-only options:")
opt[String]('t', "feature_type") optional () valueName "<gtf_feature_type>" action { (x, c) =>
c.copy(featureType = x)
} text "GTF feature containing intervals (default: exon)"
note("\nAdvanced options:")
opt[Long]("bloom_size") optional () action { (x, c) =>
c.copy(bloomSize = x)
} text "expected maximum number of reads in target regions (default: 7e7)"
} text "Expected maximum number of reads in target regions (default: 7e7)"
opt[Double]("false_positive") optional () action { (x, c) =>
c.copy(bloomFp = x)
} text "false positive rate (default: 4e-7)"
} text "False positive rate (default: 4e-7)"
note(
"""
@@ -386,7 +434,7 @@ object WipeReads extends ToolCommand {
// cannot use SamReader as inBam directly since it only allows one active iterator at any given time
val filterFunc = makeFilterNotFunction(
ivl = makeIntervalFromFile(commandArgs.targetRegions),
ivl = makeIntervalFromFile(commandArgs.targetRegions, gtfFeatureType = commandArgs.featureType),
inBam = commandArgs.inputBam,
filterOutMulti = !commandArgs.limitToRegion,
minMapQ = commandArgs.minMapQ,
@@ -402,4 +450,4 @@ object WipeReads extends ToolCommand {
commandArgs.filteredOutBam.map(x => prepOutBam(x, commandArgs.inputBam, writeIndex = !commandArgs.noMakeIndex))
)
}
}
}
\ No newline at end of file
chrQ ensembl gene 669 778 . - . gene_id "ENSG00000252956";
chrQ ensembl transcript 669 778 . - . gene_id "ENSG00000252956";
chrQ ensembl exon 669 778 . - . gene_id "ENSG00000252956";
chrQ ensembl gene 184 284 . - . gene_id "ENSG00000222952";
chrQ ensembl transcript 184 284 . - . gene_id "ENSG00000222952";
chrQ ensembl exon 184 284 . - . gene_id "ENSG00000222952";
chrP ensembl gene 2949 3063 . + . gene_id "ENSG00000201148";
chrP ensembl transcript 2949 3063 . + . gene_id "ENSG00000201148";
chrP ensembl exon 2949 3063 . + . gene_id "ENSG00000201148";
chrP ensembl gene 677 786 . - . gene_id "ENSG00000252368";
gene1 trans1 chrS + 100 500 100 500 1 100, 500,
gene1 trans1 chrR + 100 500 200 400 1 100, 500,
gene1 trans1 chrQ + 100 1000 200 900 3 100,300,800 200,400,1000
chrQ 300 350 rRNA03 0 +
chrQ 350 400 rRNA03 0 +
chrQ 450 480 rRNA02 0 -
chrQ 470 475 rRNA04 0 -
chrQ 1 200 rRNA01 0 .
chrQ 150 250 rRNA01 0 .
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
@HD VN:1.0 SO:coordinate
@SQ SN:chrQ LN:10000
@SQ SN:chrR LN:10000
@RG ID:001 DS:single-end reads SM:WipeReadsTestCase
r02 16 chrQ 50 60 10M * 0 0 TACGTACGTA EEFFGGHHII RG:Z:001
r04 0 chrQ 500 60 10M * 0 0 CGTACGTACG EEFFGGHHII RG:Z:001
r01 0 chrR 50 60 10M * 0 0 TACGTACGTA EEFFGGHHII RG:Z:001
r03 16 chrR 500 60 10M * 0 0 GGGGGAAAAA GGGGGGGGGG RG:Z:001
r05 4 * 0 0 * * 0 0 ATATATATAT HIHIHIHIHI RG:Z:001
@@ -35,6 +35,7 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers {
private val samP: SAMLineParser = {
val samh = new SAMFileHeader
samh.addSequence(new SAMSequenceRecord("chrQ", 10000))
samh.addSequence(new SAMSequenceRecord("chrR", 10000))
samh.addReadGroup(new SAMReadGroupRecord("001"))
samh.addReadGroup(new SAMReadGroupRecord("002"))
new SAMLineParser(samh)
@@ -43,12 +44,6 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers {
private def makeSams(raws: String*): Seq[SAMRecord] =
raws.map(s => samP.parseLine(s))
private def makeTempBam(): File =
File.createTempFile("WipeReads", java.util.UUID.randomUUID.toString + ".bam")
private def makeTempBamIndex(bam: File): File =
new File(bam.getAbsolutePath.stripSuffix(".bam") + ".bai")
private def makeSamReader(f: File): SamReader = SamReaderFactory
.make()
.validationStringency(ValidationStringency.LENIENT)
@@ -83,6 +78,15 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers {
val sBamFile3 = new File(resourcePath("/single03.bam"))
val sBamFile4 = new File(resourcePath("/single04.bam"))
val sBamFile5 = new File(resourcePath("/single05.bam"))
val sBamRecs5 = makeSams(
"r02\t16\tchrR\t50\t60\t10M\t*\t0\t0\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001",
"r04\t0\tchrQ\t500\t60\t10M\t*\t0\t0\tCGTACGTACG\tEEFFGGHHII\tRG:Z:001",
"r01\t0\tchrR\t50\t60\t10M\t*\t0\t0\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001",
"r03\t16\tchrQ\t500\t60\t10M\t*\t0\t0\tGGGGGAAAAA\tGGGGGGGGGG\tRG:Z:001",
"r05\t4\t*\t0\t0\t*\t*\t0\t0\tATATATATAT\tHIHIHIHIHI\tRG:Z:001"
)
val pBamFile1 = new File(resourcePath("/paired01.bam"))
val pBamRecs1 = makeSams(
"r02\t99\tchrQ\t50\t60\t10M\t=\t90\t50\tTACGTACGTA\tEEFFGGHHII\tRG:Z:001",
@@ -116,18 +120,64 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers {
)
val pBamFile3 = new File(resourcePath("/paired03.bam"))
val BedFile1 = new File(resourcePath("/rrna01.bed"))
val minArgList = List("-I", sBamFile1.toString, "-l", BedFile1.toString, "-o", "mock.bam")
val BedFile2 = new File(resourcePath("/rrna02.bed"))
val RefFlatFile1 = new File(resourcePath("/rrna01.refFlat"))
val GtfFile1 = new File(resourcePath("/rrna01.gtf"))
@Test def testMakeFeatureFromBed() = {
@Test def testMakeIntervalFromUnknown() = {
val thrown = intercept[IllegalArgumentException] {
makeIntervalFromFile(new File("false.bam"))
}
thrown.getMessage should ===("Unexpected interval file type: false.bam")
}
@Test def testMakeIntervalFromBed() = {
val intervals: List[Interval] = makeIntervalFromFile(BedFile1)
intervals.length should be(3)
intervals.length shouldBe 3
intervals.head.getSequence should ===("chrQ")
intervals.head.getStart shouldBe 991
intervals.head.getEnd shouldBe 1000
intervals.last.getSequence should ===("chrQ")
intervals.last.getStart shouldBe 291
intervals.last.getEnd shouldBe 320
}
@Test def testMakeIntervalFromRefFlat() = {
val intervals: List[Interval] = makeIntervalFromFile(RefFlatFile1)
intervals.length shouldBe 5
intervals.head.getSequence should ===("chrS")
intervals.head.getStart shouldBe 101
intervals.head.getEnd shouldBe 500
intervals(2).getSequence should ===("chrQ")
intervals(2).getStart shouldBe 801
intervals(2).getEnd shouldBe 1000
intervals.last.getSequence should ===("chrQ")
intervals.last.getStart shouldBe 101
intervals.last.getEnd shouldBe 200
}
@Test def testMakeIntervalFromGtf() = {
val intervals: List[Interval] = makeIntervalFromFile(GtfFile1, "exon")
intervals.length shouldBe 3
intervals.head.getSequence should ===("chrQ")
intervals.head.getStart should be(991)
intervals.head.getEnd should be(1000)
intervals.head.getStart shouldBe 669
intervals.head.getEnd shouldBe 778
intervals.last.getSequence should ===("chrP")
intervals.last.getStart shouldBe 2949
intervals.last.getEnd shouldBe 3063
}
@Test def testMakeIntervalFromBedOverlap() = {
val intervals: List[Interval] = makeIntervalFromFile(BedFile2)
intervals.length shouldBe 4
intervals.head.getSequence should ===("chrQ")
intervals.head.getStart shouldBe 451
intervals.head.getEnd shouldBe 480
intervals.last.getSequence should ===("chrQ")
intervals.last.getStart should be(291)
intervals.last.getEnd should be(320)
intervals.last.getStart shouldBe 2
intervals.last.getEnd shouldBe 250
}
@Test def testSingleBamDefault() = {
@@ -196,6 +246,19 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers {
filterNotFunc(sBamRecs1(6)) shouldBe false
}
@Test def testSingleBamDifferentChromosomes() = {
val intervals: List[Interval] = List(
new Interval("chrQ", 50, 55),
new Interval("chrR", 500, 505)
)
val filterNotFunc = makeFilterNotFunction(intervals, sBamFile5, bloomSize = bloomSize, bloomFp = bloomFp)
filterNotFunc(sBamRecs5(0)) shouldBe true
filterNotFunc(sBamRecs5(1)) shouldBe false
filterNotFunc(sBamRecs5(2)) shouldBe false
filterNotFunc(sBamRecs5(3)) shouldBe true
filterNotFunc(sBamRecs5(4)) shouldBe false
}
@Test def testSingleBamFilterOutMultiNotSet() = {
val intervals: List[Interval] = List(
new Interval("chrQ", 291, 320), // overlaps r01, second hit,
@@ -244,7 +307,7 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers {
filterNotFunc(sBamRecs2(2)) shouldBe false
filterNotFunc(sBamRecs2(3)) shouldBe true
filterNotFunc(sBamRecs2(4)) shouldBe true
// this r07 is not in since filterOuMulti is false
// this r07 is not in since filterOutMulti is false
filterNotFunc(sBamRecs2(5)) shouldBe false
filterNotFunc(sBamRecs2(6)) shouldBe false
filterNotFunc(sBamRecs2(7)) shouldBe false
@@ -460,6 +523,7 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers {
"-G", "002",
"--limit_removal",
"--no_make_index",
"--feature_type", "gene",
"--bloom_size", "10000",
"--false_positive", "1e-8"
))
@@ -472,6 +536,7 @@ class WipeReadsUnitTest extends TestNGSuite with MockitoSugar with Matchers {
parsed.readGroupIds should contain("002")
parsed.limitToRegion shouldBe true
parsed.noMakeIndex shouldBe true
parsed.featureType should ===("gene")
parsed.bloomSize shouldBe 10000
parsed.bloomFp shouldBe 1e-8
}