From 0499783ff1a964d9604b140124f316df197481df Mon Sep 17 00:00:00 2001 From: Sander Bollen <a.h.b.bollen@lumc.nl> Date: Thu, 3 Sep 2015 15:50:31 +0200 Subject: [PATCH] VcfWithVCf refactor --- .../lumc/sasc/biopet/tools/VcfWithVcf.scala | 127 +++++++++++++----- 1 file changed, 90 insertions(+), 37 deletions(-) diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfWithVcf.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfWithVcf.scala index ef67616f2..15c71efa6 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfWithVcf.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfWithVcf.scala @@ -16,8 +16,9 @@ package nl.lumc.sasc.biopet.tools import java.io.File +import java.util -import htsjdk.variant.variantcontext.VariantContextBuilder +import htsjdk.variant.variantcontext.{VariantContext, VariantContextBuilder} import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder } import htsjdk.variant.vcf._ import nl.lumc.sasc.biopet.core.{ ToolCommandFuntion, ToolCommand } @@ -25,6 +26,7 @@ import nl.lumc.sasc.biopet.core.config.Configurable import org.broadinstitute.gatk.utils.commandline.{ Output, Input } import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ /** * Biopet extension for tool VcfWithVcf @@ -140,44 +142,11 @@ object VcfWithVcf extends ToolCommand { var counter = 0 for (record <- reader) { - val secondaryRecords = if (commandArgs.matchAllele) { - secondaryReader.query(record.getContig, record.getStart, record.getEnd).toList. - filter(x => record.getAlternateAlleles.exists(x.hasAlternateAllele)) - } else { - secondaryReader.query(record.getContig, record.getStart, record.getEnd).toList - } + val secondaryRecords = getSecondaryRecords(secondaryReader, record, commandArgs.matchAllele) - val fieldMap = (for ( - f <- commandArgs.fields if secondaryRecords.exists(_.hasAttribute(f.inputField)) - ) yield { - f.outputField -> (for ( - secondRecord <- secondaryRecords if secondRecord.hasAttribute(f.inputField) - ) yield { - secondRecord.getAttribute(f.inputField) match { - case l: List[_] => l - case x => List(x) - } - }).fold(Nil)(_ ::: _) - }).toMap + val fieldMap = createFieldMap(commandArgs.fields, secondaryRecords) - writer.add(fieldMap.foldLeft(new VariantContextBuilder(record))((builder, attribute) => { - builder.attribute(attribute._1, commandArgs.fields.filter(_.outputField == attribute._1).head.fieldMethod match { - case FieldMethod.max => - header.getInfoHeaderLine(attribute._1).getType match { - case VCFHeaderLineType.Integer => Array(attribute._2.map(_.toString.toInt).max) - case VCFHeaderLineType.Float => Array(attribute._2.map(_.toString.toFloat).max) - case _ => throw new IllegalArgumentException("Type of field " + attribute._1 + " is not numeric") - } - case FieldMethod.min => - header.getInfoHeaderLine(attribute._1).getType match { - case VCFHeaderLineType.Integer => Array(attribute._2.map(_.toString.toInt).min) - case VCFHeaderLineType.Float => Array(attribute._2.map(_.toString.toFloat).min) - case _ => throw new IllegalArgumentException("Type of field " + attribute._1 + " is not numeric") - } - case FieldMethod.unique => attribute._2.distinct.toArray - case _ => attribute._2.toArray - }) - }).make()) + writer.add(createRecord(fieldMap, record, commandArgs.fields, header)) counter += 1 if (counter % 100000 == 0) { @@ -192,4 +161,88 @@ object VcfWithVcf extends ToolCommand { secondaryReader.close() logger.info("Done") } + + + /** + * Create Map of field -> List of attributes in secondary records + * @param fields List of Field + * @param secondaryRecords List of VariantContext with secondary records + * @return Map of fields and their values in secondary records + */ + def createFieldMap(fields: List[Fields], secondaryRecords: List[VariantContext]) : Map[String, List[Any]] = { + val fieldMap = (for ( + f <- fields if secondaryRecords.exists(_.hasAttribute(f.inputField)) + ) yield { + f.outputField -> (for ( + secondRecord <- secondaryRecords if secondRecord.hasAttribute(f.inputField) + ) yield { + secondRecord.getAttribute(f.inputField) match { + case l: List[_] => l + case x => List(x) + } + }).fold(Nil)(_ ::: _) + }).toMap + fieldMap + } + + /** + * Get secondary records matching the query record + * @param secondaryReader reader for secondary records + * @param record query record + * @param matchAllele allele has to match query allele? + * @return List of VariantContext + */ + def getSecondaryRecords(secondaryReader: VCFFileReader, + record: VariantContext, matchAllele: Boolean) : List[VariantContext] = { + if (matchAllele) { + secondaryReader.query(record.getContig, record.getStart, record.getEnd).toList. + filter(x => record.getAlternateAlleles.exists(x.hasAlternateAllele)) + } else { + secondaryReader.query(record.getContig, record.getStart, record.getEnd).toList + } + } + + def createRecord(fieldMap: Map[String, List[Any]], record: VariantContext, + fields: List[Fields], header: VCFHeader): VariantContext = { + fieldMap.foldLeft(new VariantContextBuilder(record))((builder, attribute) => { + builder.attribute(attribute._1, fields.filter(_.outputField == attribute._1).head.fieldMethod match { + case FieldMethod.max => + header.getInfoHeaderLine(attribute._1).getType match { + case VCFHeaderLineType.Integer => sL2JOAL(List(attribute._2.map(_.toString.toInt).max)) + case VCFHeaderLineType.Float => sL2JOAL(List(attribute._2.map(_.toString.toFloat).max)) + case _ => throw new IllegalArgumentException("Type of field " + attribute._1 + " is not numeric") + } + case FieldMethod.min => + header.getInfoHeaderLine(attribute._1).getType match { + case VCFHeaderLineType.Integer => sL2JOAL(List(attribute._2.map(_.toString.toInt).min)) + case VCFHeaderLineType.Float => sL2JOAL(List(attribute._2.map(_.toString.toFloat).min)) + case _ => throw new IllegalArgumentException("Type of field " + attribute._1 + " is not numeric") + } + case FieldMethod.unique => sL2JOAL(attribute._2.distinct) + case _ => sL2JOAL(attribute._2) + }) + }).make() + } + + + /** + * HACK!! + * Stands for scalaListToJavaObjectArrayList + * Convert a scala List[Any] to a java ArrayList[Object]. This is necessary for BCF conversions + * As scala ints and floats cannot be directly cast to java objects (they aren't objects), + * we need to box them. + * For items not int and float, we assume them to be strings (TODO: sane assumption?) + * @param array scala List[Any] + * @return converted java ArrayList[Object] + */ + private def sL2JOAL(array: List[Any]): util.ArrayList[Object] = { + val out = new util.ArrayList[Object]() + + array.foreach { + case x: Int => out.add(Int.box(x)) + case x: Float => out.add(Float.box(x)) + case x => out.add(x.toString) + } + out + } } -- GitLab