Commit 52a9a955 authored by Sander Bollen's avatar Sander Bollen
Browse files

sample fields back like they were

Quality scores based on formatter
max nr of decimals can be set by flag
separators can be changed by flag
parent d6f8ddff
......@@ -15,6 +15,9 @@
*/
package nl.lumc.sasc.biopet.tools
import java.text.DecimalFormat
import java.util
import htsjdk.variant.vcf.VCFFileReader
import java.io.File
import java.io.PrintStream
......@@ -30,7 +33,7 @@ object VcfToTsv extends ToolCommand {
case class Args(inputFile: File = null, outputFile: File = null, fields: List[String] = Nil, infoFields: List[String] = Nil,
sampleFields: List[String] = Nil, disableDefaults: Boolean = false,
allInfo: Boolean = false, allFormat: Boolean = false,
separator: String = "\t", listSeparator: String = ",") extends AbstractArgs
separator: String = "\t", listSeparator: String = ",", maxDecimals: Int = 2) extends AbstractArgs
class OptParser extends AbstractOptParser {
opt[File]('I', "inputFile") required () maxOccurs (1) valueName ("<file>") action { (x, c) =>
......@@ -63,6 +66,9 @@ object VcfToTsv extends ToolCommand {
opt[String]("list_separator") maxOccurs (1) action { (x, c) =>
c.copy(listSeparator = x)
} text ("Optional list separator. By default, lists are separated by a comma")
opt[Int]("max_decimals") maxOccurs(1) action { (x, c) =>
c.copy(maxDecimals = x)
} text ("Number of decimal places for numbers. Default is 2")
}
val defaultFields = List("CHROM", "POS", "ID", "REF", "ALT", "QUAL")
......@@ -71,6 +77,13 @@ object VcfToTsv extends ToolCommand {
val argsParser = new OptParser
val commandArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
// Throw exception if separator and listSeparator are identical
if (commandArgs.separator == commandArgs.listSeparator) throw new IllegalArgumentException(
"Separator and list_separator should not be identical"
)
val formatter = createFormatter(commandArgs.maxDecimals)
val reader = new VCFFileReader(commandArgs.inputFile, false)
val header = reader.getFileHeader
val samples = header.getSampleNamesInOrder
......@@ -88,20 +101,8 @@ object VcfToTsv extends ToolCommand {
buffer.toSet[String]
}
val sortedFields = fields.toList.sortWith((a, b) => {
val aT = if (a.startsWith("INFO-")) 'i' else if (samples.exists(x => a.startsWith(x + "-"))) 'f' else 'g'
val bT = if (b.startsWith("INFO-")) 'i' else if (samples.exists(x => b.startsWith(x + "-"))) 'f' else 'g'
if (aT == 'g' && bT == 'g') {
val ai = defaultFields.indexOf(a)
val bi = defaultFields.indexOf(b)
if (bi < 0) true
else ai <= bi
} else if (aT == 'g') true
else if (bT == 'g') false
else if (aT == bT) (if (a.compareTo(b) > 0) false else true)
else if (aT == 'i') true
else false
})
val sortedFields = sortFields(fields, samples.toList)
val writer = if (commandArgs.outputFile != null) new PrintStream(commandArgs.outputFile)
else sys.process.stdout
......@@ -117,7 +118,7 @@ object VcfToTsv extends ToolCommand {
val t = for (a <- vcfRecord.getAlternateAlleles) yield a.getBaseString
t.mkString(commandArgs.listSeparator)
}
values += "QUAL" -> (if (vcfRecord.getPhredScaledQual == -10) "." else scala.math.round(vcfRecord.getPhredScaledQual * 100.0) / 100.0)
values += "QUAL" -> (if (vcfRecord.getPhredScaledQual == -10) "." else formatter.format(vcfRecord.getPhredScaledQual))
values += "INFO" -> vcfRecord.getFilters
for ((field, content) <- vcfRecord.getAttributes) {
values += "INFO-" + field -> {
......@@ -136,10 +137,10 @@ object VcfToTsv extends ToolCommand {
val l = for (g <- genotype.getAlleles) yield vcfRecord.getAlleleIndex(g)
l.map(x => if (x < 0) "." else x).mkString("/")
}
if (genotype.hasAD) values += "AD-" + sample -> List(genotype.getAD: _*).mkString(commandArgs.listSeparator)
if (genotype.hasDP) values += "DP-" + sample -> genotype.getDP
if (genotype.hasGQ) values += "GQ-" + sample -> genotype.getGQ
if (genotype.hasPL) values += "PL-" + sample -> List(genotype.getPL: _*).mkString(commandArgs.listSeparator)
if (genotype.hasAD) values += sample + "-AD" -> List(genotype.getAD: _*).mkString(commandArgs.listSeparator)
if (genotype.hasDP) values += sample + "-DP" -> genotype.getDP
if (genotype.hasGQ) values += sample + "-GQ" -> genotype.getGQ
if (genotype.hasPL) values += sample + "-PL" -> List(genotype.getPL: _*).mkString(commandArgs.listSeparator)
for ((field, content) <- genotype.getExtendedAttributes) {
values += sample + "-" + field -> content
}
......@@ -152,4 +153,55 @@ object VcfToTsv extends ToolCommand {
writer.println(line.mkString(commandArgs.separator))
}
}
/**
* This function creates a correct DecimalFormat for a specific length of decimals
* @param len number of decimal places
* @return DecimalFormat formatter
*/
def createFormatter(len: Int): DecimalFormat = {
val patternString = "###." + (for (x <- (1 to len)) yield "#").mkString("")
new DecimalFormat(patternString)
}
/**
* This fields sorts fields, such that non-info and non-sample specific fields (e.g. general ones) are on front
* followed by info fields
* followed by sample-specific fields
* @param fields fields
* @param samples samples
* @return sorted samples
*/
def sortFields(fields: Set[String], samples: List[String]): List[String] = {
def fieldType(x: String) = x match {
case _ if x.startsWith("INFO-") => 'i'
case _ if (samples.exists(y => x.startsWith(y + "-"))) => 'f'
case _ => 'g'
}
fields.toList.sortWith((a, b) => {
(fieldType(a), fieldType(b)) match {
case ('g','g') => {
val ai = defaultFields.indexOf(a)
val bi = defaultFields.indexOf(b)
if (bi < 0) true else ai <= bi
}
case ('f', 'f') => {
val sampleA = a.split("-").head
val sampleB = b.split("-").head
sampleA.compareTo(sampleB) match {
case 0 => !(a.compareTo(b) > 0)
case i if (i > 0) => false
case _ => true
}
}
case ('g', _) => true
case (_, 'g') => false
case (a, b) if a == b => !(a.compareTo(b) > 0)
case ('i', _) => true
case _ => false
}
})
}
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment