From c46b0592b8a56af2834fe17981b755beeba3b247 Mon Sep 17 00:00:00 2001 From: Peter van 't Hof <p.j.van_t_hof@lumc.nl> Date: Sat, 8 Nov 2014 17:34:46 +0100 Subject: [PATCH] Added MergeAlleles tool --- .../lumc/sasc/biopet/tools/MergeAlleles.scala | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/MergeAlleles.scala diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/MergeAlleles.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/MergeAlleles.scala new file mode 100644 index 000000000..24b36d3ee --- /dev/null +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/MergeAlleles.scala @@ -0,0 +1,91 @@ +package nl.lumc.sasc.biopet.tools + +import htsjdk.samtools.reference.FastaSequenceFile +import htsjdk.variant.variantcontext.Allele +import htsjdk.variant.variantcontext.VariantContext +import htsjdk.variant.variantcontext.VariantContextBuilder +import htsjdk.variant.variantcontext.writer.AsyncVariantContextWriter +import htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder +import htsjdk.variant.vcf.VCFFileReader +import htsjdk.variant.vcf.VCFHeader +import java.io.File +import nl.lumc.sasc.biopet.core.ToolCommand +import scala.collection.mutable.{ Map, Set } +import scala.collection.JavaConversions._ + +object MergeAlleles extends ToolCommand { + case class Args(inputFiles: List[File] = Nil, outputFile: File = null, reference: File = null) extends AbstractArgs + + class OptParser extends AbstractOptParser { + opt[File]('I', "inputVcf") minOccurs (2) required () unbounded () valueName ("<file>") action { (x, c) => + c.copy(inputFiles = x :: c.inputFiles) + } + opt[File]('o', "outputVcf") required () unbounded () maxOccurs (1) valueName ("<file>") action { (x, c) => + c.copy(outputFile = x) + } + opt[File]('R', "reference") required () unbounded () maxOccurs (1) valueName ("<file>") action { (x, c) => + c.copy(reference = x) + } + } + + private val chunkSize = 50000 + + /** + * @param args the command line arguments + */ + def main(args: Array[String]): Unit = { + val argsParser = new OptParser + val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1) + + val referenceFile = new FastaSequenceFile(commandArgs.reference, true) + val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder().setOutputFile(commandArgs.outputFile).build) + val header = new VCFHeader + val referenceDict = referenceFile.getSequenceDictionary + header.setSequenceDictionary(referenceDict) + writer.writeHeader(header) + + for (chr <- referenceDict.getSequences; chunk <- (0 to (chr.getSequenceLength / chunkSize))) { + val readers = commandArgs.inputFiles.map(new VCFFileReader(_, true)) + val output: Map[Int, List[VariantContext]] = Map() + + val chrName = chr.getSequenceName + val begin = chunk * chunkSize + 1 + val end = { + val e = (chunk + 1) * chunkSize + if (e > chr.getSequenceLength) chr.getSequenceLength else e + } + + for (reader <- readers.par; variant <- reader.query(chrName, begin, end)) { + val start = variant.getStart + if (output.contains(start)) output += variant.getStart -> (variant :: output(start)) + else output += variant.getStart -> List(variant) + } + + for ((_, l) <- output) { + writer.add(mergeAlleles(l)) + } + } + } + + def mergeAlleles(records: List[VariantContext]): VariantContext = { + val longestRef = { + var l: Array[Byte] = Array() + for (a <- records.map(_.getReference.getBases) if (a.length > l.size)) l = a + Allele.create(l, true) + } + val alleles: Set[Allele] = Set(longestRef) + val builder = new VariantContextBuilder + builder.chr(records.head.getChr) + builder.start(records.head.getStart) + + for (record <- records) { + if (record.getReference == longestRef) alleles ++= record.getAlternateAlleles + else { + val suffix = longestRef.getBaseString.stripPrefix(record.getReference.getBaseString) + for (r <- record.getAlternateAlleles) alleles += Allele.create(r.getBaseString + suffix) + } + } + builder.alleles(alleles.toSeq) + builder.make + } +} -- GitLab