Skip to content
Snippets Groups Projects
Commit f7e873b0 authored by Wai Yi Leung's avatar Wai Yi Leung
Browse files

Merge branch 'tsv_mod' into 'develop'

Tsv mod

This fixes/closes #73

Changes:
* Use VCF naming conventions for default fields
* Fix spelling/typos
* Add an option to change the separator
* Add an option to change the number of decimals for numeric values
* Fix the sorting function
* Add tests

See merge request !153
parents aa097f5e 51141a27
No related branches found
No related tags found
No related merge requests found
......@@ -15,6 +15,9 @@
*/
package nl.lumc.sasc.biopet.tools
import java.text.DecimalFormat
import java.util
import htsjdk.variant.vcf.VCFFileReader
import java.io.File
import java.io.PrintStream
......@@ -28,8 +31,9 @@ class VcfToTsv {
object VcfToTsv extends ToolCommand {
case class Args(inputFile: File = null, outputFile: File = null, fields: List[String] = Nil, infoFields: List[String] = Nil,
sampleFileds: List[String] = Nil, disableDefaults: Boolean = false,
allInfo: Boolean = false, allFormat: Boolean = false) extends AbstractArgs
sampleFields: List[String] = Nil, disableDefaults: Boolean = false,
allInfo: Boolean = false, allFormat: Boolean = false,
separator: String = "\t", listSeparator: String = ",", maxDecimals: Int = 2) extends AbstractArgs
class OptParser extends AbstractOptParser {
opt[File]('I', "inputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
......@@ -51,19 +55,35 @@ object VcfToTsv extends ToolCommand {
c.copy(allFormat = true)
}
opt[String]('s', "sample_field") unbounded () action { (x, c) =>
c.copy(sampleFileds = x :: c.sampleFileds)
c.copy(sampleFields = x :: c.sampleFields)
}
opt[Unit]('d', "disable_defaults") unbounded () action { (x, c) =>
c.copy(disableDefaults = true)
}
opt[String]("separator") maxOccurs (1) action { (x, c) =>
c.copy(separator = x)
} text ("Optional separator. Default is tab-delimited")
opt[String]("list_separator") maxOccurs (1) action { (x, c) =>
c.copy(listSeparator = x)
} text ("Optional list separator. By default, lists are separated by a comma")
opt[Int]("max_decimals") maxOccurs(1) action { (x, c) =>
c.copy(maxDecimals = x)
} text ("Number of decimal places for numbers. Default is 2")
}
val defaultFields = List("chr", "pos", "id", "ref", "alt", "qual")
val defaultFields = List("CHROM", "POS", "ID", "REF", "ALT", "QUAL")
def main(args: Array[String]): Unit = {
val argsParser = new OptParser
val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
// Throw exception if separator and listSeparator are identical
if (commandArgs.separator == commandArgs.listSeparator) throw new IllegalArgumentException(
"Separator and list_separator should not be identical"
)
val formatter = createFormatter(commandArgs.maxDecimals)
val reader = new VCFFileReader(commandArgs.inputFile, false)
val header = reader.getFileHeader
val samples = header.getSampleNamesInOrder
......@@ -75,49 +95,37 @@ object VcfToTsv extends ToolCommand {
commandArgs.fields.toSet[String] ++
(if (commandArgs.allInfo) allInfoFields else commandArgs.infoFields).map("INFO-" + _) ++ {
val buffer: ListBuffer[String] = ListBuffer()
for (f <- (if (commandArgs.allFormat) allFormatFields else commandArgs.sampleFileds); sample <- samples) {
for (f <- (if (commandArgs.allFormat) allFormatFields else commandArgs.sampleFields); sample <- samples) {
buffer += sample + "-" + f
}
buffer.toSet[String]
}
val sortedFields = fields.toList.sortWith((a, b) => {
val aT = if (a.startsWith("INFO-")) 'i' else if (samples.exists(x => a.startsWith(x + "-"))) 'f' else 'g'
val bT = if (b.startsWith("INFO-")) 'i' else if (samples.exists(x => b.startsWith(x + "-"))) 'f' else 'g'
if (aT == 'g' && bT == 'g') {
val ai = defaultFields.indexOf(a)
val bi = defaultFields.indexOf(b)
if (bi < 0) true
else ai <= bi
} else if (aT == 'g') true
else if (bT == 'g') false
else if (aT == bT) (if (a.compareTo(b) > 0) false else true)
else if (aT == 'i') true
else false
})
val witter = if (commandArgs.outputFile != null) new PrintStream(commandArgs.outputFile)
val sortedFields = sortFields(fields, samples.toList)
val writer = if (commandArgs.outputFile != null) new PrintStream(commandArgs.outputFile)
else sys.process.stdout
witter.println(sortedFields.mkString("#", "\t", ""))
writer.println(sortedFields.mkString("#", commandArgs.separator, ""))
for (vcfRecord <- reader) {
val values: Map[String, Any] = Map()
values += "chr" -> vcfRecord.getChr
values += "pos" -> vcfRecord.getStart
values += "id" -> vcfRecord.getID
values += "ref" -> vcfRecord.getReference.getBaseString
values += "alt" -> {
values += "CHROM" -> vcfRecord.getChr
values += "POS" -> vcfRecord.getStart
values += "ID" -> vcfRecord.getID
values += "REF" -> vcfRecord.getReference.getBaseString
values += "ALT" -> {
val t = for (a <- vcfRecord.getAlternateAlleles) yield a.getBaseString
t.mkString(",")
t.mkString(commandArgs.listSeparator)
}
values += "qual" -> (if (vcfRecord.getPhredScaledQual == -10) "." else scala.math.round(vcfRecord.getPhredScaledQual * 100.0) / 100.0)
values += "filter" -> vcfRecord.getFilters
values += "QUAL" -> (if (vcfRecord.getPhredScaledQual == -10) "." else formatter.format(vcfRecord.getPhredScaledQual))
values += "INFO" -> vcfRecord.getFilters
for ((field, content) <- vcfRecord.getAttributes) {
values += "INFO-" + field -> {
content match {
case a: List[_] => a.mkString(",")
case a: Array[_] => a.mkString(",")
case a: java.util.ArrayList[_] => a.mkString(",")
case a: List[_] => a.mkString(commandArgs.listSeparator)
case a: Array[_] => a.mkString(commandArgs.listSeparator)
case a: java.util.ArrayList[_] => a.mkString(commandArgs.listSeparator)
case _ => content
}
}
......@@ -129,10 +137,10 @@ object VcfToTsv extends ToolCommand {
val l = for (g <- genotype.getAlleles) yield vcfRecord.getAlleleIndex(g)
l.map(x => if (x < 0) "." else x).mkString("/")
}
if (genotype.hasAD) values += sample + "-AD" -> List(genotype.getAD: _*).mkString(",")
if (genotype.hasDP) values += sample + "-DP" -> genotype.getDP
if (genotype.hasGQ) values += sample + "-GQ" -> genotype.getGQ
if (genotype.hasPL) values += sample + "-PL" -> List(genotype.getPL: _*).mkString(",")
if (genotype.hasAD) values += sample + "-AD" -> List(genotype.getAD: _*).mkString(commandArgs.listSeparator)
if (genotype.hasDP) values += sample + "-DP" -> genotype.getDP
if (genotype.hasGQ) values += sample + "-GQ" -> genotype.getGQ
if (genotype.hasPL) values += sample + "-PL" -> List(genotype.getPL: _*).mkString(commandArgs.listSeparator)
for ((field, content) <- genotype.getExtendedAttributes) {
values += sample + "-" + field -> content
}
......@@ -142,7 +150,58 @@ object VcfToTsv extends ToolCommand {
values(f)
} else ""
}
witter.println(line.mkString("\t"))
writer.println(line.mkString(commandArgs.separator))
}
}
/**
 * Creates a DecimalFormat that prints at most `len` decimal places.
 *
 * The formatter is pinned to US decimal symbols, so the decimal separator is always '.'
 * regardless of the JVM default locale. Without this, locales that use ',' as decimal
 * separator would emit values that clash with a comma list separator.
 *
 * @param len maximum number of decimal places; zero or a negative value yields an
 *            integer-only format (no trailing decimal separator)
 * @return DecimalFormat formatter
 */
def createFormatter(len: Int): DecimalFormat = {
  // A pattern ending in '.' makes DecimalFormat always print the separator ("5000."),
  // so the fractional part is only added when at least one decimal is requested.
  val patternString = if (len > 0) "###." + ("#" * len) else "###"
  val symbols = java.text.DecimalFormatSymbols.getInstance(java.util.Locale.US)
  new DecimalFormat(patternString, symbols)
}
/**
 * Sorts output fields into three consecutive groups:
 *  1. general fields (neither "INFO-" prefixed nor prefixed with a sample name):
 *     fields listed in `defaultFields` come first, in that list's order, followed by
 *     any other general fields in lexicographic order
 *  2. info fields ("INFO-" prefix), in lexicographic order
 *  3. sample-specific fields ("<sample>-" prefix), ordered by sample name and then
 *     lexicographically by field name
 *
 * Sorting is done on a derived key rather than with a hand-written comparator, so the
 * ordering is total and the result is deterministic for any input set.
 *
 * @param fields fields
 * @param samples samples
 * @return sorted fields
 */
def sortFields(fields: Set[String], samples: List[String]): List[String] = {
  val generalGroup = 0
  val infoGroup = 1
  val sampleGroup = 2

  // Key layout: (group, position within defaultFields, owning sample, field name)
  def sortKey(field: String): (Int, Int, String, String) = {
    if (field.startsWith("INFO-")) (infoGroup, 0, "", field)
    else samples.filter(sample => field.startsWith(sample + "-")) match {
      case Nil =>
        val defaultIdx = defaultFields.indexOf(field)
        // general fields that are not defaults are placed after all default fields
        (generalGroup, if (defaultIdx < 0) Int.MaxValue else defaultIdx, "", field)
      case owners =>
        // the longest matching name wins, so sample names containing '-' are attributed correctly
        (sampleGroup, 0, owners.maxBy(_.length), field)
    }
  }

  fields.toList.sortBy(sortKey)
}
}
\ No newline at end of file
package nl.lumc.sasc.biopet.tools
import java.nio.file.Paths
import java.util
import scala.collection.JavaConversions._
import org.scalatest.Matchers
import org.scalatest.mock.MockitoSugar
import org.scalatest.testng.TestNGSuite
import org.testng.annotations.Test
import scala.util.Random
/**
 * Tests for the VcfToTsv tool: runs `main` against small VCF resources with several
 * option combinations and unit-tests the `createFormatter` and `sortFields` helpers.
 *
 * Created by ahbbollen on 13-4-15.
 */
class VcfToTsvTest extends TestNGSuite with MockitoSugar with Matchers {
  import VcfToTsv._

  /** Resolves a classpath resource to an absolute file system path. */
  private def resourcePath(p: String): String = {
    Paths.get(getClass.getResource(p).toURI).toString
  }

  /**
   * Creates an empty temporary output file and returns its path.
   *
   * The file lives in the platform's temp directory, gets a name that is safe for the
   * file system, and is removed when the JVM exits.
   */
  private def tmpOutputPath(): String = {
    val tmpFile = java.io.File.createTempFile("VcfToTsv_", ".tsv")
    tmpFile.deleteOnExit()
    tmpFile.getAbsolutePath
  }

  // No longer used for file names; kept so the class keeps exposing the same members.
  val rand = new Random()

  val vepped = resourcePath("/VEP_oneline.vcf")
  val unvepped = resourcePath("/unvepped.vcf")

  // Test methods declare Unit explicitly: TestNG by default skips @Test methods that
  // return a value, and an inferred type could silently become non-Unit.

  /** Conversion with all INFO fields should run without throwing. */
  @Test def testAllFields(): Unit = {
    val arguments = Array("-I", unvepped, "-o", tmpOutputPath(), "--all_info")
    main(arguments)
  }

  /** Conversion restricted to a single INFO field should run without throwing. */
  @Test def testSpecificField(): Unit = {
    val arguments = Array("-I", vepped, "-o", tmpOutputPath(), "-i", "CSQ")
    main(arguments)
  }

  /** Distinct custom column and list separators should be accepted. */
  @Test def testNewSeparators(): Unit = {
    val arguments = Array("-I", vepped, "-o", tmpOutputPath(), "--all_info", "--separator", ",", "--list_separator", "|")
    main(arguments)
  }

  /** A column separator equal to the (default, comma) list separator must be rejected. */
  @Test(expectedExceptions = Array(classOf[IllegalArgumentException]))
  def testIdenticalSeparators(): Unit = {
    val arguments = Array("-I", vepped, "-o", tmpOutputPath(), "--all_info", "--separator", ",")
    main(arguments)
  }

  /** The formatter should cut numbers down to the requested number of decimals. */
  @Test def testFormatter(): Unit = {
    val formatter = createFormatter(2)
    formatter.format(5000.12345) should be("5000.12")

    val nformatter = createFormatter(3)
    nformatter.format(5000.12345) should be("5000.123")
  }

  /** General fields come first, then INFO fields, then per-sample fields grouped by sample. */
  @Test def testSortFields(): Unit = {
    val unsortedFields = Set("Child01-GT", "Mother02-GT", "Father03-GT", "INFO-Something", "INFO-ScoreSomething",
      "INFO-AlleleScoreSomething", "WeirdField")
    val samples = List("Child01", "Father03", "Mother02")

    val sorted = sortFields(unsortedFields, samples)
    sorted should be(List("WeirdField", "INFO-AlleleScoreSomething", "INFO-ScoreSomething", "INFO-Something",
      "Child01-GT", "Father03-GT", "Mother02-GT"))
  }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment