Skip to content
Snippets Groups Projects
Commit 7bedb39b authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Merge bed records

parent 798f296b
No related branches found
No related tags found
No related merge requests found
......@@ -4,7 +4,7 @@ import java.io.{ File, PrintWriter }
import htsjdk.variant.variantcontext.VariantContext
import htsjdk.variant.vcf.VCFFileReader
import nl.lumc.sasc.biopet.utils.ToolCommand
import nl.lumc.sasc.biopet.utils.{ VcfUtils, ToolCommand }
import nl.lumc.sasc.biopet.utils.intervals.BedRecord
import scala.collection.JavaConversions._
......@@ -17,6 +17,7 @@ object GvcfToBed extends ToolCommand {
case class Args(inputVcf: File = null,
outputBed: File = null,
invertedOutputBed: Option[File] = None,
sample: Option[String] = None,
minGenomeQuality: Int = 0,
inverse: Boolean = false) extends AbstractArgs
......@@ -28,15 +29,15 @@ object GvcfToBed extends ToolCommand {
opt[File]('O', "outputBed") required () maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(outputBed = x)
} text "Output bed file"
opt[File]("invertedOutputBed") maxOccurs 1 valueName "<file>" action { (x, c) =>
c.copy(invertedOutputBed = Some(x))
} text "Output bed file"
opt[String]('S', "sample") unbounded () maxOccurs 1 valueName "<sample>" action { (x, c) =>
c.copy(sample = Some(x))
} text "Sample to consider. Will take first sample on alphabetical order by default"
opt[Int]("minGenomeQuality") unbounded () maxOccurs 1 valueName "<int>" action { (x, c) =>
c.copy(minGenomeQuality = x)
} text "Minimum genome quality to consider"
opt[Boolean]("invert") unbounded () maxOccurs 1 valueName "<boolean>" action { (x, c) =>
c.copy(inverse = x)
} text "Invert filter (i.e. emit only records NOT passing filter)"
}
def main(args: Array[String]): Unit = {
......@@ -47,54 +48,61 @@ object GvcfToBed extends ToolCommand {
val reader = new VCFFileReader(cmdArgs.inputVcf, false)
logger.debug("Opening writer")
val writer = new PrintWriter(cmdArgs.outputBed)
val invertedWriter = cmdArgs.invertedOutputBed.collect {
case file =>
logger.debug("Opening inverted writer")
new PrintWriter(file)
}
var counter = 0
val sample = cmdArgs.sample.getOrElse(reader.getFileHeader.getSampleNamesInOrder.head)
val it = reader.iterator()
val firstRecord = it.next()
var contig = firstRecord.getContig
var start = firstRecord.getStart
var end = firstRecord.getEnd
var pass = VcfUtils.hasMinGenomeQuality(firstRecord, sample, cmdArgs.minGenomeQuality)
def writeResetCachedRecord(newRecord: VariantContext): Unit = {
writeCachedRecord()
contig = newRecord.getContig
start = newRecord.getStart
end = newRecord.getEnd
pass = VcfUtils.hasMinGenomeQuality(newRecord, sample, cmdArgs.minGenomeQuality)
}
def writeCachedRecord(): Unit = {
if (pass) writer.println(new BedRecord(contig, start - 1, end))
else invertedWriter.foreach(_.println(new BedRecord(contig, start - 1, end)))
}
var counter = 1
logger.info("Start")
for (r <- reader) {
if (!hasMinGenomeQuality(r, cmdArgs.sample, cmdArgs.minGenomeQuality) && cmdArgs.inverse) {
writer.println(createBedRecord(r).toString)
} else if (hasMinGenomeQuality(r, cmdArgs.sample, cmdArgs.minGenomeQuality) && !cmdArgs.inverse) {
writer.println(createBedRecord(r).toString)
}
for (r <- it) {
if (contig == r.getContig) {
val p = VcfUtils.hasMinGenomeQuality(r, sample, cmdArgs.minGenomeQuality)
if (p != pass || r.getStart > (end + 1)) writeResetCachedRecord(r)
else end = r.getEnd
} else writeResetCachedRecord(r)
counter += 1
if (counter % 100000 == 0) {
logger.info(s"Processed $counter records")
}
}
writeCachedRecord()
logger.info(s"Processed $counter records")
logger.debug("Closing writer")
writer.close()
invertedWriter.foreach { w =>
logger.debug("Closing inverted writer")
w.close()
}
logger.debug("Closing reader")
reader.close()
logger.info("Finished!")
}
/**
* Check whether record has minimum genome qality
* @param record variant context
* @param sample Option[String] with sample name
* @param minGQ minimum genome quality value
* @return
*/
def hasMinGenomeQuality(record: VariantContext, sample: Option[String], minGQ: Int): Boolean = {
sample foreach { x =>
if (!record.getSampleNamesOrderedByName.contains(x))
throw new IllegalArgumentException("Sample does not exist")
}
val gt = record.getGenotype(sample.getOrElse(record.getSampleNamesOrderedByName.head))
gt.hasGQ && gt.getGQ >= minGQ
}
/**
* Create bed record from variantcontext
* @param record variant context
* @return BedRecord
*/
def createBedRecord(record: VariantContext): BedRecord = {
new BedRecord(record.getContig, record.getStart, record.getEnd)
}
}
......@@ -4,6 +4,7 @@ import java.io.File
import java.nio.file.Paths
import htsjdk.variant.vcf.VCFFileReader
import nl.lumc.sasc.biopet.utils.VcfUtils
import org.scalatest.Matchers
import org.scalatest.mock.MockitoSugar
import org.scalatest.testng.TestNGSuite
......@@ -30,43 +31,12 @@ class GvcfToBedTest extends TestNGSuite with Matchers with MockitoSugar {
val reader = new VCFFileReader(vepped, false)
val record = reader.iterator().next()
hasMinGenomeQuality(record, None, 99) shouldBe true
hasMinGenomeQuality(record, Some("Sample_101"), 99) shouldBe true
VcfUtils.hasMinGenomeQuality(record, "Sample_101", 99) shouldBe true
val reader2 = new VCFFileReader(unvepped, false)
val record2 = reader2.iterator.next()
hasMinGenomeQuality(record2, None, 99) shouldBe false
hasMinGenomeQuality(record2, None, 0) shouldBe false
hasMinGenomeQuality(record2, Some("Sample_102"), 3) shouldBe true
hasMinGenomeQuality(record2, Some("Sample_102"), 99) shouldBe false
VcfUtils.hasMinGenomeQuality(record2, "Sample_102", 3) shouldBe true
VcfUtils.hasMinGenomeQuality(record2, "Sample_102", 99) shouldBe false
}
@Test def wrongSample = {
val reader = new VCFFileReader(vepped, false)
val record = reader.iterator().next()
an[IllegalArgumentException] should be thrownBy hasMinGenomeQuality(record, Some("dummy"), 99)
}
@Test def testCreateBedRecord = {
val reader = new VCFFileReader(vepped, false)
val record = reader.iterator().next()
val bed = createBedRecord(record)
bed.chr shouldBe "chr1"
bed.start shouldBe 871042
bed.end shouldBe 871042
val reader2 = new VCFFileReader(unvepped, false)
val record2 = reader2.iterator.next()
val bed2 = createBedRecord(record2)
bed2.chr shouldBe "chr1"
bed2.start shouldBe 14599
bed2.end shouldBe 14599
//TODO: add GVCF-block vcf file to test
}
}
......@@ -18,7 +18,7 @@ package nl.lumc.sasc.biopet.utils
import java.io.File
import java.util
import htsjdk.variant.variantcontext.VariantContext
import htsjdk.variant.variantcontext.{ Genotype, VariantContext }
import htsjdk.variant.vcf.{ VCFFileReader, VCFHeader, VCFFilterHeaderLine }
import scala.collection.JavaConversions._
......@@ -103,4 +103,28 @@ object VcfUtils {
reader.close()
samples
}
/**
* Check whether record has minimum genome Quality
* @param record variant context
* @param sample sample name
* @param minGQ minimum genome quality value
* @return
*/
def hasMinGenomeQuality(record: VariantContext, sample: String, minGQ: Int): Boolean = {
if (!record.getSampleNamesOrderedByName.contains(sample))
throw new IllegalArgumentException("Sample does not exist")
val gt = record.getGenotype(sample)
hasMinGenomeQuality(gt, minGQ)
}
/**
* Check whether genotype has minimum genome Quality
* @param gt Genotype
* @param minGQ minimum genome quality value
* @return
*/
def hasMinGenomeQuality(gt: Genotype, minGQ: Int): Boolean = {
gt.hasGQ && gt.getGQ >= minGQ
}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment