diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfToTsv.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfToTsv.scala index dc1d98156e2a67122d6ef299df742549faafb1af..72dbf81c95336340483620d1e515cde61c1cd128 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfToTsv.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/tools/VcfToTsv.scala @@ -15,6 +15,9 @@ */ package nl.lumc.sasc.biopet.tools +import java.text.DecimalFormat +import java.util + import htsjdk.variant.vcf.VCFFileReader import java.io.File import java.io.PrintStream @@ -28,8 +31,9 @@ class VcfToTsv { object VcfToTsv extends ToolCommand { case class Args(inputFile: File = null, outputFile: File = null, fields: List[String] = Nil, infoFields: List[String] = Nil, - sampleFileds: List[String] = Nil, disableDefaults: Boolean = false, - allInfo: Boolean = false, allFormat: Boolean = false) extends AbstractArgs + sampleFields: List[String] = Nil, disableDefaults: Boolean = false, + allInfo: Boolean = false, allFormat: Boolean = false, + separator: String = "\t", listSeparator: String = ",", maxDecimals: Int = 2) extends AbstractArgs class OptParser extends AbstractOptParser { opt[File]('I', "inputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) => @@ -51,19 +55,35 @@ object VcfToTsv extends ToolCommand { c.copy(allFormat = true) } opt[String]('s', "sample_field") unbounded () action { (x, c) => - c.copy(sampleFileds = x :: c.sampleFileds) + c.copy(sampleFields = x :: c.sampleFields) } opt[Unit]('d', "disable_defaults") unbounded () action { (x, c) => c.copy(disableDefaults = true) } + opt[String]("separator") maxOccurs (1) action { (x, c) => + c.copy(separator = x) + } text ("Optional separator. Default is tab-delimited") + opt[String]("list_separator") maxOccurs (1) action { (x, c) => + c.copy(listSeparator = x) + } text ("Optional list separator. 
By default, lists are separated by a comma") + opt[Int]("max_decimals") maxOccurs(1) action { (x, c) => + c.copy(maxDecimals = x) + } text ("Number of decimal places for numbers. Default is 2") } - val defaultFields = List("chr", "pos", "id", "ref", "alt", "qual") + val defaultFields = List("CHROM", "POS", "ID", "REF", "ALT", "QUAL") def main(args: Array[String]): Unit = { val argsParser = new OptParser val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1) + // Throw exception if separator and listSeparator are identical + if (commandArgs.separator == commandArgs.listSeparator) throw new IllegalArgumentException( + "Separator and list_separator should not be identical" + ) + + val formatter = createFormatter(commandArgs.maxDecimals) + val reader = new VCFFileReader(commandArgs.inputFile, false) val header = reader.getFileHeader val samples = header.getSampleNamesInOrder @@ -75,49 +95,37 @@ object VcfToTsv extends ToolCommand { commandArgs.fields.toSet[String] ++ (if (commandArgs.allInfo) allInfoFields else commandArgs.infoFields).map("INFO-" + _) ++ { val buffer: ListBuffer[String] = ListBuffer() - for (f <- (if (commandArgs.allFormat) allFormatFields else commandArgs.sampleFileds); sample <- samples) { + for (f <- (if (commandArgs.allFormat) allFormatFields else commandArgs.sampleFields); sample <- samples) { buffer += sample + "-" + f } buffer.toSet[String] } - val sortedFields = fields.toList.sortWith((a, b) => { - val aT = if (a.startsWith("INFO-")) 'i' else if (samples.exists(x => a.startsWith(x + "-"))) 'f' else 'g' - val bT = if (b.startsWith("INFO-")) 'i' else if (samples.exists(x => b.startsWith(x + "-"))) 'f' else 'g' - if (aT == 'g' && bT == 'g') { - val ai = defaultFields.indexOf(a) - val bi = defaultFields.indexOf(b) - if (bi < 0) true - else ai <= bi - } else if (aT == 'g') true - else if (bT == 'g') false - else if (aT == bT) (if (a.compareTo(b) > 0) false else true) - else if (aT == 'i') true - else false - }) - val witter = 
if (commandArgs.outputFile != null) new PrintStream(commandArgs.outputFile) + val sortedFields = sortFields(fields, samples.toList) + + val writer = if (commandArgs.outputFile != null) new PrintStream(commandArgs.outputFile) else sys.process.stdout - witter.println(sortedFields.mkString("#", "\t", "")) + writer.println(sortedFields.mkString("#", commandArgs.separator, "")) for (vcfRecord <- reader) { val values: Map[String, Any] = Map() - values += "chr" -> vcfRecord.getChr - values += "pos" -> vcfRecord.getStart - values += "id" -> vcfRecord.getID - values += "ref" -> vcfRecord.getReference.getBaseString - values += "alt" -> { + values += "CHROM" -> vcfRecord.getChr + values += "POS" -> vcfRecord.getStart + values += "ID" -> vcfRecord.getID + values += "REF" -> vcfRecord.getReference.getBaseString + values += "ALT" -> { val t = for (a <- vcfRecord.getAlternateAlleles) yield a.getBaseString - t.mkString(",") + t.mkString(commandArgs.listSeparator) } - values += "qual" -> (if (vcfRecord.getPhredScaledQual == -10) "." else scala.math.round(vcfRecord.getPhredScaledQual * 100.0) / 100.0) - values += "filter" -> vcfRecord.getFilters + values += "QUAL" -> (if (vcfRecord.getPhredScaledQual == -10) "." else formatter.format(vcfRecord.getPhredScaledQual)) + values += "INFO" -> vcfRecord.getFilters for ((field, content) <- vcfRecord.getAttributes) { values += "INFO-" + field -> { content match { - case a: List[_] => a.mkString(",") - case a: Array[_] => a.mkString(",") - case a: java.util.ArrayList[_] => a.mkString(",") + case a: List[_] => a.mkString(commandArgs.listSeparator) + case a: Array[_] => a.mkString(commandArgs.listSeparator) + case a: java.util.ArrayList[_] => a.mkString(commandArgs.listSeparator) case _ => content } } @@ -129,10 +137,10 @@ object VcfToTsv extends ToolCommand { val l = for (g <- genotype.getAlleles) yield vcfRecord.getAlleleIndex(g) l.map(x => if (x < 0) "." 
else x).mkString("/") } - if (genotype.hasAD) values += sample + "-AD" -> List(genotype.getAD: _*).mkString(",") - if (genotype.hasDP) values += sample + "-DP" -> genotype.getDP - if (genotype.hasGQ) values += sample + "-GQ" -> genotype.getGQ - if (genotype.hasPL) values += sample + "-PL" -> List(genotype.getPL: _*).mkString(",") + if (genotype.hasAD) values += sample + "-AD" -> List(genotype.getAD: _*).mkString(commandArgs.listSeparator) + if (genotype.hasDP) values += sample + "-DP" -> genotype.getDP + if (genotype.hasGQ) values += sample + "-GQ" -> genotype.getGQ + if (genotype.hasPL) values += sample + "-PL" -> List(genotype.getPL: _*).mkString(commandArgs.listSeparator) for ((field, content) <- genotype.getExtendedAttributes) { values += sample + "-" + field -> content } @@ -142,7 +150,58 @@ object VcfToTsv extends ToolCommand { values(f) } else "" } - witter.println(line.mkString("\t")) + writer.println(line.mkString(commandArgs.separator)) } } + + /** + * This function creates a correct DecimalFormat for a specific length of decimals + * @param len number of decimal places + * @return DecimalFormat formatter + */ + def createFormatter(len: Int): DecimalFormat = { + val patternString = "###." + (for (x <- (1 to len)) yield "#").mkString("") + new DecimalFormat(patternString) + } + + + /** + * This function sorts fields, such that non-info and non-sample specific fields (e.g. 
general ones) are in front + * followed by info fields + * followed by sample-specific fields + * @param fields fields + * @param samples samples + * @return sorted fields + */ + def sortFields(fields: Set[String], samples: List[String]): List[String] = { + def fieldType(x: String) = x match { + case _ if x.startsWith("INFO-") => 'i' + case _ if (samples.exists(y => x.startsWith(y + "-"))) => 'f' + case _ => 'g' + } + + fields.toList.sortWith((a, b) => { + (fieldType(a), fieldType(b)) match { + case ('g','g') => { + val ai = defaultFields.indexOf(a) + val bi = defaultFields.indexOf(b) + if (bi < 0) true else ai <= bi + } + case ('f', 'f') => { + val sampleA = a.split("-").head + val sampleB = b.split("-").head + sampleA.compareTo(sampleB) match { + case 0 => !(a.compareTo(b) > 0) + case i if (i > 0) => false + case _ => true + } + } + case ('g', _) => true + case (_, 'g') => false + case (a, b) if a == b => !(a.compareTo(b) > 0) + case ('i', _) => true + case _ => false + } + }) + } } \ No newline at end of file diff --git a/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VcfToTsvTest.scala b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VcfToTsvTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..598f4550d3f9743ae63f6ca3196c236b39b774d9 --- /dev/null +++ b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/tools/VcfToTsvTest.scala @@ -0,0 +1,70 @@ +package nl.lumc.sasc.biopet.tools + +import java.nio.file.Paths +import java.util +import scala.collection.JavaConversions._ + +import org.scalatest.Matchers +import org.scalatest.mock.MockitoSugar +import org.scalatest.testng.TestNGSuite +import org.testng.annotations.Test + +import scala.util.Random + +/** + * Created by ahbbollen on 13-4-15. 
+ */ +class VcfToTsvTest extends TestNGSuite with MockitoSugar with Matchers { + import VcfToTsv._ + private def resourcePath(p: String): String = { + Paths.get(getClass.getResource(p).toURI).toString + } + + val rand = new Random() + + val vepped = resourcePath("/VEP_oneline.vcf") + val unvepped = resourcePath("/unvepped.vcf") + + @Test def testAllFields() = { + val tmp_path = "/tmp/VcfToTsv_" + rand.nextString(10) + ".tsv" + val arguments = Array("-I", unvepped, "-o", tmp_path, "--all_info") + main(arguments) + } + + @Test def testSpecificField() = { + val tmp_path = "/tmp/VcfToTsv_" + rand.nextString(10) + ".tsv" + val arguments = Array("-I", vepped, "-o", tmp_path, "-i", "CSQ") + main(arguments) + } + + @Test def testNewSeparators() = { + val tmp_path = "/tmp/VcfToTsv_" + rand.nextString(10) + ".tsv" + val arguments = Array("-I", vepped, "-o", tmp_path, "--all_info", "--separator", ",", "--list_separator", "|") + main(arguments) + } + + @Test(expectedExceptions = Array(classOf[IllegalArgumentException])) + def testIdenticalSeparators() = { + val tmp_path = "/tmp/VcfToTsv_" + rand.nextString(10) + ".tsv" + val arguments = Array("-I", vepped, "-o", tmp_path, "--all_info", "--separator", ",") + main(arguments) + } + + @Test def testFormatter() = { + val formatter = createFormatter(2) + formatter.format(5000.12345) should be("5000.12") + val nformatter = createFormatter(3) + nformatter.format(5000.12345) should be("5000.123") + } + + @Test def testSortFields() = { + val unsortedFields = Set("Child01-GT", "Mother02-GT", "Father03-GT", "INFO-Something", "INFO-ScoreSomething", + "INFO-AlleleScoreSomething", "WeirdField") + val samples = List("Child01", "Father03", "Mother02") + + val sorted = sortFields(unsortedFields, samples) + sorted should be(List("WeirdField", "INFO-AlleleScoreSomething", "INFO-ScoreSomething", "INFO-Something", + "Child01-GT", "Father03-GT", "Mother02-GT")) + } + +}