diff --git a/public/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/SnptestToVcf.scala b/public/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/SnptestToVcf.scala new file mode 100644 index 0000000000000000000000000000000000000000..1352a977d2419f48fbc779de01c33f8a23d11a60 --- /dev/null +++ b/public/biopet-tools-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/tools/SnptestToVcf.scala @@ -0,0 +1,57 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.extensions.tools + +import java.io.File + +import nl.lumc.sasc.biopet.core.{Reference, ToolCommandFunction} +import nl.lumc.sasc.biopet.utils.config.Configurable +import org.broadinstitute.gatk.utils.commandline.{Input, Output} + +/** + * + */ +class SnptestToVcf(val root: Configurable) extends ToolCommandFunction with Reference { + def toolObject = nl.lumc.sasc.biopet.tools.SnptestToVcf + + @Input(doc = "input Info file", required = true) + var inputInfo: File = null + + @Input(required = true) + var reference: File = _ + + @Output(required = true) + var outputVcf: File = _ + + var contig: String = _ + + override def defaultCoreMemory = 6.0 + + override def beforeGraph(): Unit = { + super.beforeGraph() + if (reference == null) reference = referenceFasta() + if (contig == null) throw new IllegalStateException + //if (outputVcf.getName.endsWith(".vcf.gz")) outputFiles :+= new File(outputVcf.getAbsolutePath + ".tbi") + } + + override def cmdLine = super.cmdLine + + required("--inputInfo", inputInfo) + + required("--outputVcf", outputVcf) + + optional("--contig", contig) + + required("--referenceFasta", reference) + +} + diff --git a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/SnptestToVcf.scala b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/SnptestToVcf.scala index 1ae91e1be727c1cb0eba8f2ec0268365c4742f58..f7cb33976959e8f0d69bca990b58ae3a656564f7 100644 --- a/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/SnptestToVcf.scala +++ b/public/biopet-tools/src/main/scala/nl/lumc/sasc/biopet/tools/SnptestToVcf.scala @@ -4,7 +4,7 @@ import java.io.File import java.util import htsjdk.samtools.reference.{FastaSequenceFile, ReferenceSequenceFileFactory} -import htsjdk.variant.variantcontext.writer.{AsyncVariantContextWriter, VariantContextWriterBuilder} +import htsjdk.variant.variantcontext.writer.{AsyncVariantContextWriter, Options, VariantContextWriterBuilder} import htsjdk.variant.variantcontext.{Allele, GenotypeBuilder, VariantContextBuilder} import htsjdk.variant.vcf._ import nl.lumc.sasc.biopet.utils.ToolCommand @@ -72,15 +72,38 @@ object SnptestToVcf extends ToolCommand { val writer = new AsyncVariantContextWriter(new VariantContextWriterBuilder() .setOutputFile(cmdArgs.outputVcf) .setReferenceDictionary(vcfHeader.getSequenceDictionary) + .unsetOption(Options.INDEX_ON_THE_FLY) .build) writer.writeHeader(vcfHeader) + val infoKeys = for (key <- headerKeys if key != "rsid" if key != "chromosome" if key != "position" + if key != "alleleA" if key != "alleleB" if key != "alleleA") yield key + + var counter = 0 for (line <- lineIt if !line.startsWith("#")) { val values = line.split(" ") require(values.size == headerKeys.size, "Number of values are not the same as number of header keys") - + val alleles = List(Allele.create(values(headerMap("alleleA")), true), Allele.create(values(headerMap("alleleB")))) + val start = values(headerMap("position")).toLong + val end = alleles.head.length() + start - 1 + val rsid = values(headerMap("rsid")) + val builder = (new VariantContextBuilder) + .chr(cmdArgs.contig) + .alleles(alleles) + .start(start) + .stop(end) + .noGenotypes() + + val infoBuilder = infoKeys.foldLeft(builder) { case (a,b) => a.attribute("ST_" + b, values(headerMap(b))) } + + writer.add(builder.id(rsid).make()) + + counter += 1 + if (counter % 10000 == 0) logger.info(s"$counter lines processed") } + logger.info(s"$counter lines processed") + writer.close() }