From a61296b2fd0fdd5c64ca8ac1e9868c5c9a4c322a Mon Sep 17 00:00:00 2001 From: Peter van 't Hof <p.j.van_t_hof@lumc.nl> Date: Thu, 19 Feb 2015 13:06:56 +0100 Subject: [PATCH] Added Markduplicates to summary --- .../extensions/picard/MarkDuplicates.scala | 19 ++- .../src/test/resources/picard.dedup.metrics | 112 ++++++++++++++++++ .../picard/MarkDuplicatesTest.scala | 23 ++++ .../biopet/pipelines/mapping/Mapping.scala | 13 +- 4 files changed, 164 insertions(+), 3 deletions(-) create mode 100644 public/biopet-framework/src/test/resources/picard.dedup.metrics create mode 100644 public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/extensions/picard/MarkDuplicatesTest.scala diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MarkDuplicates.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MarkDuplicates.scala index 181494ae1..fd0f28e73 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MarkDuplicates.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/picard/MarkDuplicates.scala @@ -17,9 +17,10 @@ package nl.lumc.sasc.biopet.extensions.picard import java.io.File import nl.lumc.sasc.biopet.core.config.Configurable +import nl.lumc.sasc.biopet.core.summary.Summarizable import org.broadinstitute.gatk.utils.commandline.{ Input, Output, Argument } -class MarkDuplicates(val root: Configurable) extends Picard { +class MarkDuplicates(val root: Configurable) extends Picard with Summarizable { javaMainClass = "picard.sam.MarkDuplicates" @Input(doc = "The input SAM or BAM files to analyze. Must be coordinate sorted.", required = true) @@ -91,6 +92,22 @@ class MarkDuplicates(val root: Configurable) extends Picard { optional("SORTING_COLLECTION_SIZE_RATIO=", sortingCollectionSizeRatio, spaceSeparated = false) + optional("READ_NAME_REGEX=", readNameRegex, spaceSeparated = false) + optional("OPTICAL_DUPLICATE_PIXEL_DISTANCE=", opticalDuplicatePixelDistance, spaceSeparated = false) + + def summaryFiles: Map[String, File] = Map() + + def summaryStats: Map[String, Any] = { + val (header, content) = Picard.getMetrics(outputMetrics) + + (for (category <- 0 until content.size) yield { + content(category)(0) -> ( + for ( + i <- 1 until header.size if i < content(category).size + ) yield { + header(i).toLowerCase -> content(category)(i) + }).toMap + } + ).toMap + } } object MarkDuplicates { def apply(root: Configurable, input: List[File], outputDir: String): MarkDuplicates = { diff --git a/public/biopet-framework/src/test/resources/picard.dedup.metrics b/public/biopet-framework/src/test/resources/picard.dedup.metrics new file mode 100644 index 000000000..5734917b1 --- /dev/null +++ b/public/biopet-framework/src/test/resources/picard.dedup.metrics @@ -0,0 +1,112 @@ +## htsjdk.samtools.metrics.StringHeader +# picard.sam.MarkDuplicates INPUT=[/data/DIV5/SASC/project-049-SNPtypingbac/analysis/runs/sp/samples/8080_2#43/lib_8080_1/8080_2#43-8080_1.bam] OUTPUT=/data/DIV5/SASC/project-049-SNPtypingbac/analysis/runs/sp/samples/8080_2#43/lib_8080_1/8080_2#43-8080_1.dedup.bam METRICS_FILE=/data/DIV5/SASC/project-049-SNPtypingbac/analysis/runs/sp/samples/8080_2#43/lib_8080_1/8080_2#43-8080_1.dedup.metrics TMP_DIR=[/data/DIV5/SASC/project-049-SNPtypingbac/analysis/runs/sp/.queue/tmp] CREATE_INDEX=true PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates REMOVE_DUPLICATES=false ASSUME_SORTED=false MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 READ_NAME_REGEX=[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).* OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 VERBOSITY=INFO QUIET=false VALIDATION_STRINGENCY=STRICT COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_MD5_FILE=false +## htsjdk.samtools.metrics.StringHeader +# Started on: Wed Feb 18 17:32:02 CET 2015 + +## METRICS CLASS picard.sam.DuplicationMetrics +LIBRARY UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED UNMAPPED_READS UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES PERCENT_DUPLICATION ESTIMATED_LIBRARY_SIZE +8080_1 5238 603803 115660 1077 5760 26 0.010386 31586584 + +## HISTOGRAM java.lang.Double +BIN VALUE +1.0 1.000043 +2.0 1.98115 +3.0 2.943681 +4.0 3.887988 +5.0 4.814414 +6.0 5.723299 +7.0 6.614976 +8.0 7.489769 +9.0 8.347998 +10.0 9.189977 +11.0 10.016015 +12.0 10.826412 +13.0 11.621464 +14.0 12.401463 +15.0 13.166693 +16.0 13.917434 +17.0 14.653961 +18.0 15.376541 +19.0 16.085441 +20.0 16.780918 +21.0 17.463226 +22.0 18.132615 +23.0 18.78933 +24.0 19.433611 +25.0 20.065693 +26.0 20.685806 +27.0 21.294179 +28.0 21.891032 +29.0 22.476584 +30.0 23.051049 +31.0 23.614637 +32.0 24.167554 +33.0 24.710002 +34.0 25.24218 +35.0 25.76428 +36.0 26.276495 +37.0 26.779012 +38.0 27.272014 +39.0 27.755681 +40.0 28.230191 +41.0 28.695716 +42.0 29.152426 +43.0 29.60049 +44.0 30.040069 +45.0 30.471325 +46.0 30.894416 +47.0 31.309496 +48.0 31.716716 +49.0 32.116227 +50.0 32.508172 +51.0 32.892697 +52.0 33.269941 +53.0 33.640042 +54.0 34.003135 +55.0 34.359354 +56.0 34.708828 +57.0 35.051684 +58.0 35.388049 +59.0 35.718046 +60.0 36.041794 +61.0 36.359412 +62.0 36.671016 +63.0 36.97672 +64.0 37.276636 +65.0 37.570873 +66.0 37.859539 +67.0 38.14274 +68.0 38.420578 +69.0 38.693155 +70.0 38.960572 +71.0 39.222925 +72.0 39.480311 +73.0 39.732823 +74.0 39.980554 +75.0 40.223595 +76.0 40.462034 +77.0 40.695958 +78.0 40.925453 +79.0 41.150602 +80.0 41.371489 +81.0 41.588193 +82.0 41.800794 +83.0 42.00937 +84.0 42.213996 +85.0 42.414748 +86.0 42.611699 +87.0 42.804921 +88.0 42.994484 +89.0 43.180458 +90.0 43.362911 +91.0 43.541909 +92.0 43.717518 +93.0 43.889802 +94.0 44.058824 +95.0 44.224645 +96.0 44.387327 +97.0 44.546929 +98.0 44.703508 +99.0 44.857123 +100.0 45.00783 + diff --git a/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/extensions/picard/MarkDuplicatesTest.scala b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/extensions/picard/MarkDuplicatesTest.scala new file mode 100644 index 000000000..ca34b83d8 --- /dev/null +++ b/public/biopet-framework/src/test/scala/nl/lumc/sasc/biopet/extensions/picard/MarkDuplicatesTest.scala @@ -0,0 +1,23 @@ +package nl.lumc.sasc.biopet.extensions.picard + +import java.io.File +import java.nio.file.Paths + +import org.scalatest.Matchers +import org.scalatest.testng.TestNGSuite +import org.testng.annotations.Test + +/** + * Created by pjvan_thof on 2/19/15. + */ +class MarkDuplicatesTest extends TestNGSuite with Matchers { + + @Test + def summaryData: Unit = { + val file = new File(Paths.get(getClass.getResource("/picard.dedup.metrics").toURI).toString) + val job = new MarkDuplicates(null) + job.outputMetrics = file + + job.summaryStats + } +} \ No newline at end of file diff --git a/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala index 307b160a6..3ee791dc2 100644 --- a/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala +++ b/public/mapping/src/main/scala/nl/lumc/sasc/biopet/pipelines/mapping/Mapping.scala @@ -96,7 +96,14 @@ class Mapping(val root: Configurable) extends QScript with SummaryQScript with S def summaryFiles = Map() - def summarySettings = Map() + def summarySettings = Map( + "skip_metrics" -> skipMetrics, + "skip_flexiprep" -> skipFlexiprep, + "skip_markduplicates" -> skipMarkduplicates, + "aligner" -> aligner, + "chunking" -> chunking, + "numberChunks" -> numberChunks.getOrElse(1) + ) def init() { require(outputDir != null, "Missing output directory on mapping module") @@ -208,7 +215,9 @@ class Mapping(val root: Configurable) extends QScript with SummaryQScript with S var bamFile = bamFiles.head if (!skipMarkduplicates) { bamFile = new File(outputDir, outputName + ".dedup.bam") - add(MarkDuplicates(this, bamFiles, bamFile)) + val md = MarkDuplicates(this, bamFiles, bamFile) + add(md) + addSummarizable(md, "mark_duplicates") } else if (skipMarkduplicates && chunking) { val mergeSamFile = MergeSamFiles(this, bamFiles, outputDir) add(mergeSamFile) -- GitLab