diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VEPNormalizer.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VEPNormalizer.scala index 94b2d8883c6bcfd24294baaa781610a09bd02770..54f39071f44592e833ee0703133d433b4068c8f0 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VEPNormalizer.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VEPNormalizer.scala @@ -180,44 +180,55 @@ object VEPNormalizer extends ToolCommand { * Explode a single VEP-annotated record to multiple normal records * Based on the number of annotated transcripts in the CSQ tag * @param record the record as a VariantContext object - * @param csq_infos An array with names of new info tags + * @param csqInfos An array with names of new info tags * @return An array with the new records */ - def explodeTranscripts(record: VariantContext, csq_infos: Array[String], remove_CSQ: Boolean): Array[VariantContext] = { + def explodeTranscripts(record: VariantContext, csqInfos: Array[String], removeCsq: Boolean): Array[VariantContext] = { val csq = record.getAttributeAsString("CSQ", "unknown") - val attributes = if (remove_CSQ) record.getAttributes.toMap - "CSQ" else record.getAttributes.toMap - csq. + val builder = { + if (removeCsq) new VariantContextBuilder(record).rmAttribute("CSQ") + else new VariantContextBuilder(record) + } + + // atributes for each transcript (transcript)(csq field index) + val arti = csq. stripPrefix("["). stripSuffix("]"). - split(","). - map(x => attributes ++ csq_infos.zip(x.split("""\|""", -1))). - map(x => { - if (remove_CSQ) new VariantContextBuilder(record) - .attributes(x) - .make() - else new VariantContextBuilder(record).attributes(x).make() - }) + split(",").map(_.split("""\|""")) + + for (transcript <- arti) yield { + (for ( + fieldId <- 0 until csqInfos.size if transcript.isDefinedAt(fieldId) && !transcript(fieldId).isEmpty + ) yield csqInfos(fieldId) -> transcript(fieldId).trim) + .filterNot(_._2.isEmpty) + .foldLeft(builder)((builder, artibute) => builder.attribute(artibute._1, artibute._2)) + .make() + } } def standardTranscripts(record: VariantContext, csqInfos: Array[String], removeCsq: Boolean): VariantContext = { val csq = record.getAttributeAsString("CSQ", "unknown") - val attributes = if (removeCsq) record.getAttributes.toMap - "CSQ" else record.getAttributes.toMap - val newAttrs = attributes ++ csqInfos.zip(csq. + val builder = { + if (removeCsq) new VariantContextBuilder(record).rmAttribute("CSQ") + else new VariantContextBuilder(record) + } + + // atributes for each transcript (transcript)(csq field index) + val arti = csq. stripPrefix("["). stripSuffix("]"). - split(","). - // This makes a list of lists with each annotation for every transcript in a top-level list element - foldLeft(List.fill(csqInfos.length) { List.empty[String] })( - (acc, x) => { - val broken = x.split("""\|""", -1) - acc.zip(broken).map(x => x._2 :: x._1) - } - ). - map(x => x.mkString(","))) - - new VariantContextBuilder(record).attributes(newAttrs).make() + split(",").map(_.split("""\|""")) + + (for (fieldId <- 0 until csqInfos.size) yield csqInfos(fieldId) -> { + for ( + transcript <- arti if transcript.isDefinedAt(fieldId) && !transcript(fieldId).isEmpty + ) yield transcript(fieldId).trim + }) + .filterNot(_._2.isEmpty) + .foldLeft(builder)((builder, artibute) => builder.attribute(artibute._1, artibute._2)) + .make() } case class Args(inputVCF: File = null, diff --git a/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VEPNormalizerTest.scala b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VEPNormalizerTest.scala index 9c821ed21761ab99e8bcad868c10add18d194d00..3d26895fdf321db1f82a435eee7e3d907694e595 100644 --- a/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VEPNormalizerTest.scala +++ b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VEPNormalizerTest.scala @@ -71,7 +71,10 @@ class VEPNormalizerTest extends TestNGSuite with MockitoSugar with Matchers { } def check(item: String) = { - record.getAttribute(item).toString.split(""",""", -1).length should be(11) + record.getAttribute(item) match { + case l: List[_] => l.length should be(11) + case _ => + } } val items = Array("AA_MAF", "AFR_MAF", "ALLELE_NUM", "AMR_MAF", "ASN_MAF", "Allele",