Commit 8ad8f6c9 authored by Peter van 't Hof

Merge branch 'develop' into feature-centrifuge_pipe

parents ff5e56e1 0cea2b66
Showing changes with 290 additions and 16 deletions
@@ -38,6 +38,8 @@ trait BiopetQScript extends Configurable with GatkLogging { qscript: QScript =>
if (config.contains("output_dir", path = Nil)) config("output_dir", path = Nil).asFile
else new File(".")
}
+ require(outputDir.canRead, s"No permission to read output directory: $outputDir")
+ require(outputDir.canWrite, s"No permission to write output directory: $outputDir")
@Argument(doc = "Disable all scatters", shortName = "DSC", required = false)
var disableScatter: Boolean = false
@@ -67,6 +69,9 @@ trait BiopetQScript extends Configurable with GatkLogging { qscript: QScript =>
final def script() {
outputDir = config("output_dir")
outputDir = outputDir.getAbsoluteFile
+ BiopetQScript.checkOutputDir(outputDir)
init()
biopetScript()
logger.info("Biopet script done")
@@ -153,4 +158,13 @@ trait BiopetQScript extends Configurable with GatkLogging { qscript: QScript =>
object BiopetQScript {
case class InputFile(file: File, md5: Option[String] = None)
+ def checkOutputDir(outputDir: File): Unit = {
+ // Sanity checks
+ require(outputDir.getParentFile.canRead, s"No permission to read parent of output directory: ${outputDir.getParentFile}")
+ require(outputDir.getParentFile.canWrite, s"No permission to write parent of output directory: ${outputDir.getParentFile}")
+ outputDir.mkdir()
+ require(outputDir.canRead, s"No permission to read output directory: $outputDir")
+ require(outputDir.canWrite, s"No permission to write output directory: $outputDir")
+ }
}
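For context, Scala's require throws an IllegalArgumentException carrying the supplied message whenever its predicate is false, so a pipeline pointed at an unreadable or unwritable output location now fails before any jobs are built. A minimal sketch of the new check in use (the path is a hypothetical example, not taken from the commit):

import java.io.File
// Verifies read/write access on the parent, creates the directory if needed (mkdir),
// then verifies read/write access on the directory itself. An unwritable parent aborts with
// IllegalArgumentException: "No permission to write parent of output directory: ..."
BiopetQScript.checkOutputDir(new File("/not/writable/by/me/pipeline_output"))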
@@ -77,6 +77,7 @@ trait PipelineCommand extends MainCommand with GatkLogging with ImplicitConversi
val pipelineName = this.getClass.getSimpleName.toLowerCase.split("""\$""").head
val pipelineConfig = globalConfig.map.getOrElse(pipelineName, Map()).asInstanceOf[Map[String, Any]]
val pipelineOutputDir = new File(globalConfig.map.getOrElse("output_dir", pipelineConfig.getOrElse("output_dir", "./")).toString)
+ BiopetQScript.checkOutputDir(pipelineOutputDir)
val logDir: File = new File(pipelineOutputDir, ".log")
logDir.mkdirs()
new File(logDir, "biopet." + BiopetQCommandLine.timestamp + ".log")
@@ -17,7 +17,6 @@ package nl.lumc.sasc.biopet.tools
import java.io.File
import java.util
- import htsjdk.samtools.reference.FastaSequenceFile
import htsjdk.variant.variantcontext.{ VariantContext, VariantContextBuilder }
import htsjdk.variant.variantcontext.writer.{ AsyncVariantContextWriter, VariantContextWriterBuilder }
import htsjdk.variant.vcf._
@@ -66,7 +65,10 @@ object VcfWithVcf extends ToolCommand {
else c.copy(fields = Fields(x, x) :: c.fields)
} text """| If only <field> is given, the field's identifier in the output VCF will be identical to <field>.
| By default we will return all values found for a given field.
- | With <method> the values will processed after getting it from the secondary VCF file, posible methods are:
+ | For INFO fields with type R or A we will take the respective alleles present in the input file.
+ | If a <method> is supplied, a method will be applied over the contents of the field.
+ | In this case, all values will be considered.
+ | The following methods are available:
| - max : takes maximum of found value, only works for numeric (integer/float) fields
| - min : takes minimum of found value, only works for numeric (integer/float) fields
| - unique: takes only unique values """.stripMargin
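To make the field syntax above concrete, here is a usage sketch mirroring the argument arrays used in VcfWithVcfTest further down; the input, secondary, and reference paths are placeholders rather than files from this commit:

// -f <field>                         copy an INFO field from the secondary VCF under its own name
// -f <field>:<output_field>          e.g. "AF:MULTI_AF" copies AF into the output as MULTI_AF
// -f <field>:<output_field>:<method> additionally apply max, min, or unique to the collected values
VcfWithVcf.main(Array(
  "-I", "input.vcf.gz",
  "-s", "secondary.vcf.gz",
  "-o", "annotated.vcf.gz",
  "-R", "reference.fa",
  "-f", "AF:MULTI_AF"))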
@@ -126,7 +128,7 @@ object VcfWithVcf extends ToolCommand {
require(vcfDict.getSequence(record.getContig) != null, s"Contig ${record.getContig} does not exist on reference")
val secondaryRecords = getSecondaryRecords(secondaryReader, record, commandArgs.matchAllele)
- val fieldMap = createFieldMap(commandArgs.fields, secondaryRecords)
+ val fieldMap = createFieldMap(commandArgs.fields, record, secondaryRecords, secondHeader)
writer.add(createRecord(fieldMap, record, commandArgs.fields, header))
@@ -147,17 +149,19 @@ object VcfWithVcf extends ToolCommand {
/**
* Create Map of field -> List of attributes in secondary records
* @param fields List of Field
+ * @param record Original record
* @param secondaryRecords List of VariantContext with secondary records
+ * @param header Header of the secondary reader
* @return Map of fields and their values in secondary records
*/
- def createFieldMap(fields: List[Fields], secondaryRecords: List[VariantContext]): Map[String, List[Any]] = {
+ def createFieldMap(fields: List[Fields], record: VariantContext, secondaryRecords: List[VariantContext], header: VCFHeader): Map[String, List[Any]] = {
val fieldMap = (for (
f <- fields if secondaryRecords.exists(_.hasAttribute(f.inputField))
) yield {
f.outputField -> (for (
secondRecord <- secondaryRecords if secondRecord.hasAttribute(f.inputField)
) yield {
- secondRecord.getAttribute(f.inputField) match {
+ getSecondaryField(record, secondRecord, f.inputField, header) match {
case l: List[_] => l
case y: util.ArrayList[_] => y.toList
case x => List(x)
@@ -207,4 +211,53 @@ object VcfWithVcf extends ToolCommand {
})
}).make()
}
+ /**
+ * Get the proper representation of a field from a secondary record given an original record
+ * @param record original record
+ * @param secondaryRecord secondary record
+ * @param field the INFO field to look up
+ * @param header header of secondary record
+ * @return the field value, adjusted to the alleles of the original record when the header defines Number=A or Number=R
+ */
+ def getSecondaryField(record: VariantContext, secondaryRecord: VariantContext, field: String, header: VCFHeader): Any = {
+ header.getInfoHeaderLine(field).getCountType match {
+ case VCFHeaderLineCount.A => numberA(record, secondaryRecord, field)
+ case VCFHeaderLineCount.R => numberR(record, secondaryRecord, field)
+ case _ => secondaryRecord.getAttribute(field)
+ }
+ }
+ /**
+ * Get the correct values from a field that has number=A
+ * @param referenceRecord the reference record
+ * @param annotateRecord the to-be-annotated record
+ * @param field the field to annotate
+ * @return the reference record's values for the alternate alleles shared by both records
+ */
+ def numberA(referenceRecord: VariantContext, annotateRecord: VariantContext, field: String): List[Any] = {
+ val refValues = referenceRecord.getAttributeAsList(field).toArray
+ annotateRecord.
+ getAlternateAlleles.filter(referenceRecord.hasAlternateAllele).
+ map(x => referenceRecord.getAlternateAlleles.indexOf(x)).
+ flatMap(x => refValues.lift(x)).
+ toList
+ }
+ /**
+ * Get the correct values from a field that has number=R
+ * @param referenceRecord the reference record
+ * @param annotateRecord the to-be-annotated record
+ * @param field the field to annotate
+ * @return the reference record's values for the alleles (REF and ALT) shared by both records
+ */
+ def numberR(referenceRecord: VariantContext, annotateRecord: VariantContext, field: String): List[Any] = {
+ val refValues = referenceRecord.getAttributeAsList(field).toArray
+ annotateRecord.
+ getAlleles.
+ filter(referenceRecord.hasAllele).
+ map(x => referenceRecord.getAlleles.indexOf(x)).
+ flatMap(x => refValues.lift(x)).
+ toList
+ }
}
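As a worked example of the new Number=A and Number=R handling, take the two chrQ test records added below: the multi-allelic record has ALT alleles A,T with AF=0.333,0.667 and ALL_ALLELE=C,A,T, while the mono-allelic record only has ALT allele A. A sketch of what the helpers return for that pair, matching the assertions in VcfWithVcfTest (monoPath and multiPath are the test resource paths defined there):

val multiRecord = new VCFFileReader(new File(multiPath)).iterator().next() // ALT=A,T
val monoRecord = new VCFFileReader(new File(monoPath)).iterator().next()   // ALT=A
// Number=A: only the AF value belonging to the shared ALT allele A is kept.
numberA(multiRecord, monoRecord, "AF") // List("0.333")
// Number=R: values for REF C and the shared ALT A are kept; the value for T is dropped.
numberR(multiRecord, monoRecord, "ALL_ALLELE") // List("C", "A")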
##fileformat=VCFv4.2
##INFO=<ID=DN,Number=1,Type=Integer,Description="inDbSNP">
##INFO=<ID=DT,Number=0,Type=Flag,Description="in1000Genomes">
##INFO=<ID=DA,Number=1,Type=String,Description="allelesDBSNP">
##INFO=<ID=FG,Number=.,Type=String,Description="functionGVS">
##INFO=<ID=FD,Number=.,Type=String,Description="functionDBSNP">
##INFO=<ID=GM,Number=.,Type=String,Description="accession">
##INFO=<ID=GL,Number=.,Type=String,Description="geneList">
##INFO=<ID=AAC,Number=.,Type=String,Description="aminoAcids">
##INFO=<ID=PP,Number=.,Type=String,Description="proteinPosition">
##INFO=<ID=CDP,Number=.,Type=String,Description="cDNAPosition">
##INFO=<ID=PH,Number=.,Type=String,Description="polyPhen">
##INFO=<ID=CP,Number=1,Type=String,Description="scorePhastCons">
##INFO=<ID=CG,Number=1,Type=String,Description="consScoreGERP">
##INFO=<ID=AA,Number=1,Type=String,Description="chimpAllele">
##INFO=<ID=CN,Number=.,Type=String,Description="CNV">
##INFO=<ID=HA,Number=1,Type=String,Description="AfricanHapMapFreq">
##INFO=<ID=HE,Number=1,Type=String,Description="EuropeanHapMapFreq">
##INFO=<ID=HC,Number=1,Type=String,Description="AsianHapMapFreq">
##INFO=<ID=DG,Number=0,Type=Flag,Description="hasGenotypes">
##INFO=<ID=DV,Number=.,Type=String,Description="dbSNPValidation">
##INFO=<ID=RM,Number=.,Type=String,Description="repeatMasker">
##INFO=<ID=RT,Number=.,Type=String,Description="tandemRepeat">
##INFO=<ID=CA,Number=0,Type=Flag,Description="clinicalAssociation">
##INFO=<ID=DSP,Number=1,Type=Integer,Description="distanceToSplice">
##INFO=<ID=GS,Number=.,Type=String,Description="granthamScore">
##INFO=<ID=MR,Number=.,Type=String,Description="microRNAs">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
##INFO=<ID=Dels,Number=1,Type=Float,Description="Fraction of Reads Containing Spanning Deletions">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=NEGATIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the negative training set of bad variants">
##INFO=<ID=POSITIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the positive training set of good variants">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=RPA,Number=.,Type=Integer,Description="Number of times tandem repeat unit is repeated, for each allele (including reference)">
##INFO=<ID=RU,Number=1,Type=String,Description="Tandem repeat unit (bases)">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##INFO=<ID=STR,Number=0,Type=Flag,Description="Variant is a short tandem repeat">
##INFO=<ID=VQSLOD,Number=1,Type=Float,Description="Log odds ratio of being a true variant versus being false under the trained gaussian mixture model">
##INFO=<ID=culprit,Number=1,Type=String,Description="The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out">
##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
##INFO=<ID=GATKCaller,Number=.,Type=String,Description="GATK variant caller used to call the variant">
##INFO=<ID=PartOfCompound,Number=.,Type=String,Description="Whether the record was originally part of a record containing compound variants">
##INFO=<ID=ALL_ALLELE,Number=R,Type=String,Description="A field with number R">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FILTER=<ID=LowQual,Description="Low quality">
##FILTER=<ID=VQSRTrancheINDEL99.00to99.90,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -1.4714 <= x < -0.3324">
##FILTER=<ID=VQSRTrancheINDEL99.90to100.00+,Description="Truth sensitivity tranche level for INDEL model at VQS Lod < -6.093">
##FILTER=<ID=VQSRTrancheINDEL99.90to100.00,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -6.093 <= x < -1.4714">
##FILTER=<ID=VQSRTrancheSNP99.00to99.90,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -4.8126 <= x < 0.2264">
##FILTER=<ID=VQSRTrancheSNP99.90to100.00+,Description="Truth sensitivity tranche level for SNP model at VQS Lod < -39474.9285">
##FILTER=<ID=VQSRTrancheSNP99.90to100.00,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -39474.9285 <= x < -4.8126">
##FILTER=<ID=TooHigh1000GAF,Description="Allele frequency in 1000G is more than 5%">
##FILTER=<ID=TooHighGoNLAF,Description="Allele frequency in 1000G is more than 5%">
##FILTER=<ID=IndexNotCalled,Description="Position in index sample is not called">
##FILTER=<ID=IndexIsVariant,Description="Index call is a variant">
##FILTER=<ID=InArtificialChrom,Description="Variant found in an artificial chromosome">
##FILTER=<ID=IsIntergenic,Description="Variant found in intergenic region">
##contig=<ID=chrQ,length=16571>
##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence type as predicted by VEP. Format: Allele|Gene|Feature|Feature_type|Consequence|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|AA_MAF|EA_MAF|ALLELE_NUM|DISTANCE|STRAND|CLIN_SIG|SYMBOL|SYMBOL_SOURCE|GMAF|HGVSc|HGVSp|AFR_MAF|AMR_MAF|ASN_MAF|EUR_MAF|PUBMED">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample_101
chrQ 1042 rs199537431 C A 1541.12 PASS AF=0.333;ALL_ALLELE=C,A GT:AD:DP:GQ:PL 1/2:24,21:45:99:838,0,889
File added
File added
##fileformat=VCFv4.2
##INFO=<ID=DN,Number=1,Type=Integer,Description="inDbSNP">
##INFO=<ID=DT,Number=0,Type=Flag,Description="in1000Genomes">
##INFO=<ID=DA,Number=1,Type=String,Description="allelesDBSNP">
##INFO=<ID=FG,Number=.,Type=String,Description="functionGVS">
##INFO=<ID=FD,Number=.,Type=String,Description="functionDBSNP">
##INFO=<ID=GM,Number=.,Type=String,Description="accession">
##INFO=<ID=GL,Number=.,Type=String,Description="geneList">
##INFO=<ID=AAC,Number=.,Type=String,Description="aminoAcids">
##INFO=<ID=PP,Number=.,Type=String,Description="proteinPosition">
##INFO=<ID=CDP,Number=.,Type=String,Description="cDNAPosition">
##INFO=<ID=PH,Number=.,Type=String,Description="polyPhen">
##INFO=<ID=CP,Number=1,Type=String,Description="scorePhastCons">
##INFO=<ID=CG,Number=1,Type=String,Description="consScoreGERP">
##INFO=<ID=AA,Number=1,Type=String,Description="chimpAllele">
##INFO=<ID=CN,Number=.,Type=String,Description="CNV">
##INFO=<ID=HA,Number=1,Type=String,Description="AfricanHapMapFreq">
##INFO=<ID=HE,Number=1,Type=String,Description="EuropeanHapMapFreq">
##INFO=<ID=HC,Number=1,Type=String,Description="AsianHapMapFreq">
##INFO=<ID=DG,Number=0,Type=Flag,Description="hasGenotypes">
##INFO=<ID=DV,Number=.,Type=String,Description="dbSNPValidation">
##INFO=<ID=RM,Number=.,Type=String,Description="repeatMasker">
##INFO=<ID=RT,Number=.,Type=String,Description="tandemRepeat">
##INFO=<ID=CA,Number=0,Type=Flag,Description="clinicalAssociation">
##INFO=<ID=DSP,Number=1,Type=Integer,Description="distanceToSplice">
##INFO=<ID=GS,Number=.,Type=String,Description="granthamScore">
##INFO=<ID=MR,Number=.,Type=String,Description="microRNAs">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
##INFO=<ID=Dels,Number=1,Type=Float,Description="Fraction of Reads Containing Spanning Deletions">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=NEGATIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the negative training set of bad variants">
##INFO=<ID=POSITIVE_TRAIN_SITE,Number=0,Type=Flag,Description="This variant was used to build the positive training set of good variants">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=RPA,Number=.,Type=Integer,Description="Number of times tandem repeat unit is repeated, for each allele (including reference)">
##INFO=<ID=RU,Number=1,Type=String,Description="Tandem repeat unit (bases)">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##INFO=<ID=STR,Number=0,Type=Flag,Description="Variant is a short tandem repeat">
##INFO=<ID=VQSLOD,Number=1,Type=Float,Description="Log odds ratio of being a true variant versus being false under the trained gaussian mixture model">
##INFO=<ID=culprit,Number=1,Type=String,Description="The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out">
##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
##INFO=<ID=GATKCaller,Number=.,Type=String,Description="GATK variant caller used to call the variant">
##INFO=<ID=PartOfCompound,Number=.,Type=String,Description="Whether the record was originally part of a record containing compound variants">
##INFO=<ID=ALL_ALLELE,Number=R,Type=String,Description="A field with number R">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FILTER=<ID=LowQual,Description="Low quality">
##FILTER=<ID=VQSRTrancheINDEL99.00to99.90,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -1.4714 <= x < -0.3324">
##FILTER=<ID=VQSRTrancheINDEL99.90to100.00+,Description="Truth sensitivity tranche level for INDEL model at VQS Lod < -6.093">
##FILTER=<ID=VQSRTrancheINDEL99.90to100.00,Description="Truth sensitivity tranche level for INDEL model at VQS Lod: -6.093 <= x < -1.4714">
##FILTER=<ID=VQSRTrancheSNP99.00to99.90,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -4.8126 <= x < 0.2264">
##FILTER=<ID=VQSRTrancheSNP99.90to100.00+,Description="Truth sensitivity tranche level for SNP model at VQS Lod < -39474.9285">
##FILTER=<ID=VQSRTrancheSNP99.90to100.00,Description="Truth sensitivity tranche level for SNP model at VQS Lod: -39474.9285 <= x < -4.8126">
##FILTER=<ID=TooHigh1000GAF,Description="Allele frequency in 1000G is more than 5%">
##FILTER=<ID=TooHighGoNLAF,Description="Allele frequency in 1000G is more than 5%">
##FILTER=<ID=IndexNotCalled,Description="Position in index sample is not called">
##FILTER=<ID=IndexIsVariant,Description="Index call is a variant">
##FILTER=<ID=InArtificialChrom,Description="Variant found in an artificial chromosome">
##FILTER=<ID=IsIntergenic,Description="Variant found in intergenic region">
##contig=<ID=chrQ,length=16571>
##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence type as predicted by VEP. Format: Allele|Gene|Feature|Feature_type|Consequence|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|AA_MAF|EA_MAF|ALLELE_NUM|DISTANCE|STRAND|CLIN_SIG|SYMBOL|SYMBOL_SOURCE|GMAF|HGVSc|HGVSp|AFR_MAF|AMR_MAF|ASN_MAF|EUR_MAF|PUBMED">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample_101
chrQ 1042 rs199537431 C A,T 1541.12 PASS AF=0.333,0.667;ALL_ALLELE=C,A,T GT:AD:DP:GQ:PL 1/2:24,21:45:99:838,0,889
File added
File added
@@ -18,6 +18,7 @@ import java.io.File
import java.nio.file.Paths
import java.util
+ import htsjdk.variant.vcf
import htsjdk.variant.vcf.VCFFileReader
import org.scalatest.Matchers
import org.scalatest.mock.MockitoSugar
@@ -26,7 +27,6 @@ import org.testng.annotations.Test
import scala.util.Random
import scala.collection.JavaConversions._
- import nl.lumc.sasc.biopet.utils.VcfUtils.identicalVariantContext
/**
@@ -44,6 +44,8 @@ class VcfWithVcfTest extends TestNGSuite with MockitoSugar with Matchers {
val veppedPath = resourcePath("/VEP_oneline.vcf.gz")
val unveppedPath = resourcePath("/unvep_online.vcf.gz")
val referenceFasta = resourcePath("/fake_chrQ.fa")
+ val monoPath = resourcePath("/chrQ_monoallelic.vcf.gz")
+ val multiPath = resourcePath("/chrQ_multiallelic.vcf.gz")
val rand = new Random()
@Test
@@ -71,7 +73,7 @@ class VcfWithVcfTest extends TestNGSuite with MockitoSugar with Matchers {
}
@Test
- def testOutputFieldException = {
+ def testOutputFieldException() = {
val tmpFile = File.createTempFile("VCFWithVCf", ".vcf")
tmpFile.deleteOnExit()
val args = Array("-I", unveppedPath, "-s", veppedPath, "-o", tmpFile.getAbsolutePath, "-f", "CSQ:AC", "-R", referenceFasta)
@@ -81,7 +83,7 @@ class VcfWithVcfTest extends TestNGSuite with MockitoSugar with Matchers {
}
@Test
- def testInputFieldException = {
+ def testInputFieldException() = {
val tmpFile = File.createTempFile("VCFWithVCf", ".vcf")
tmpFile.deleteOnExit()
val args = Array("-I", unveppedPath, "-s", unveppedPath, "-o", tmpFile.getAbsolutePath, "-f", "CSQ:NEW_CSQ", "-R", referenceFasta)
@@ -91,7 +93,7 @@ class VcfWithVcfTest extends TestNGSuite with MockitoSugar with Matchers {
}
@Test
- def testMinMethodException = {
+ def testMinMethodException() = {
val tmpFile = File.createTempFile("VcfWithVcf_", ".vcf")
tmpFile.deleteOnExit()
val args = Array("-I", unveppedPath, "-s", veppedPath, "-o", tmpFile.getAbsolutePath, "-f", "CSQ:CSQ:min", "-R", referenceFasta)
@@ -101,7 +103,7 @@ class VcfWithVcfTest extends TestNGSuite with MockitoSugar with Matchers {
}
@Test
- def testMaxMethodException = {
+ def testMaxMethodException() = {
val tmpFile = File.createTempFile("VcfWithVcf_", ".vcf")
tmpFile.deleteOnExit()
val args = Array("-I", unveppedPath, "-s", veppedPath, "-o", tmpFile.getAbsolutePath, "-f", "CSQ:CSQ:max", "-R", referenceFasta)
@@ -111,8 +113,10 @@ class VcfWithVcfTest extends TestNGSuite with MockitoSugar with Matchers {
}
@Test
- def testFieldMap = {
+ def testFieldMap() = {
- val unvepRecord = new VCFFileReader(new File(unveppedPath)).iterator().next()
+ val unvepReader = new VCFFileReader(new File(unveppedPath))
+ val header = unvepReader.getFileHeader
+ val unvepRecord = unvepReader.iterator().next()
var fields = List(new Fields("FG", "FG"))
fields :::= List(new Fields("FD", "FD"))
@@ -140,7 +144,7 @@ class VcfWithVcfTest extends TestNGSuite with MockitoSugar with Matchers {
fields :::= List(new Fields("VQSLOD", "VQSLOD"))
fields :::= List(new Fields("culprit", "culprit"))
- val fieldMap = createFieldMap(fields, List(unvepRecord))
+ val fieldMap = createFieldMap(fields, unvepRecord, List(unvepRecord), header)
fieldMap("FG") shouldBe List("intron")
fieldMap("FD") shouldBe List("unknown")
@@ -170,7 +174,7 @@ class VcfWithVcfTest extends TestNGSuite with MockitoSugar with Matchers {
}
@Test
- def testGetSecondaryRecords = {
+ def testGetSecondaryRecords() = {
val unvepRecord = new VCFFileReader(new File(unveppedPath)).iterator().next()
val vepReader = new VCFFileReader(new File(veppedPath))
val vepRecord = vepReader.iterator().next()
@@ -181,7 +185,7 @@ class VcfWithVcfTest extends TestNGSuite with MockitoSugar with Matchers {
}
@Test
- def testCreateRecord = {
+ def testCreateRecord() = {
val unvepRecord = new VCFFileReader(new File(unveppedPath)).iterator().next()
val vepReader = new VCFFileReader(new File(veppedPath))
val header = vepReader.getFileHeader
@@ -189,9 +193,53 @@ class VcfWithVcfTest extends TestNGSuite with MockitoSugar with Matchers {
val secRec = getSecondaryRecords(vepReader, unvepRecord, false)
- val fieldMap = createFieldMap(List(new Fields("CSQ", "CSQ")), secRec)
+ val fieldMap = createFieldMap(List(new Fields("CSQ", "CSQ")), vepRecord, secRec, header)
val createdRecord = createRecord(fieldMap, unvepRecord, List(new Fields("CSQ", "CSQ")), header)
identicalVariantContext(createdRecord, vepRecord) shouldBe true
}
+ @Test
+ def testNumberA() = {
+ val multiRecord = new VCFFileReader(new File(multiPath)).iterator().next()
+ val monoRecord = new VCFFileReader(new File(monoPath)).iterator().next()
+ val annot = numberA(multiRecord, monoRecord, "AF")
+ annot shouldBe List("0.333")
+ }
+ @Test
+ def testNumberR() = {
+ val multiRecord = new VCFFileReader(new File(multiPath)).iterator().next()
+ val monoRecord = new VCFFileReader(new File(monoPath)).iterator().next()
+ val annot = numberR(multiRecord, monoRecord, "ALL_ALLELE")
+ annot shouldBe List("C", "A")
+ }
+ @Test
+ def testNumberAOutput() = {
+ val tmpFile = File.createTempFile("numberA", ".vcf.gz")
+ tmpFile.deleteOnExit()
+ val arguments = Array("-I", monoPath, "-s", multiPath, "-o", tmpFile.getAbsolutePath, "-f", "AF:MULTI_AF", "-R", referenceFasta)
+ main(arguments)
+ val annotatedRecord = new VCFFileReader(tmpFile).iterator().next()
+ annotatedRecord.getAttribute("MULTI_AF").toString shouldBe "0.333"
+ }
+ @Test
+ def testNumberROutput() = {
+ val tmpFile = File.createTempFile("numberR", ".vcf.gz")
+ tmpFile.deleteOnExit()
+ val arguments = Array("-I", monoPath, "-s", multiPath, "-o", tmpFile.getAbsolutePath, "-f", "ALL_ALLELE:MULTI_ALL_ALLELE", "-R", referenceFasta)
+ main(arguments)
+ val annotatedRecord = new VCFFileReader(tmpFile).iterator().next()
+ annotatedRecord.getAttribute("MULTI_ALL_ALLELE") match {
+ case l: List[_] => l shouldBe List("C", "A")
+ case u: util.ArrayList[_] => u.toList shouldBe List("C", "A")
+ case _ => throw new IllegalStateException("Not a list")
+ }
+ }
}