Stats.scala 7.06 KB
Newer Older
Peter van 't Hof's avatar
Peter van 't Hof committed
1
/**
2
3
4
5
6
7
8
9
10
11
12
13
14
  * Biopet is built on top of GATK Queue for building bioinformatic
  * pipelines. It is mainly intended to support LUMC SHARK cluster which is running
  * SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
  * should also be able to execute Biopet tools and pipelines.
  *
  * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
  *
  * Contact us at: sasc@lumc.nl
  *
  * A dual licensing mode is applied. The source code within this project is freely available for non-commercial use under an AGPL
  * license; For commercial users or users who do not want to follow the AGPL
  * license, please contact us to obtain a separate license.
  */
Peter van 't Hof's avatar
Peter van 't Hof committed
15
16
package nl.lumc.sasc.biopet.tools.vcfstats

17
import java.io.{File, PrintWriter}
Peter van 't Hof's avatar
Peter van 't Hof committed
18

19
import nl.lumc.sasc.biopet.tools.vcfstats.VcfStats.sampleDistributions
Peter van 't Hof's avatar
Peter van 't Hof committed
20

21
22
import scala.collection.mutable
import nl.lumc.sasc.biopet.utils.{ConfigUtils, sortAnyAny}
Peter van 't Hof's avatar
Peter van 't Hof committed
23

Peter van 't Hof's avatar
Peter van 't Hof committed
24
/**
25
26
27
28
29
  * General stats class to store vcf stats
  *
  * @param generalStats Stores are general stats
  * @param samplesStats Stores all sample/genotype specific stats
  */
30
case class Stats(generalStats: mutable.Map[String, mutable.Map[Any, Int]] =
31
                   mutable.Map(),
Peter van 't Hof's avatar
Peter van 't Hof committed
32
                 samplesStats: mutable.Map[String, SampleStats] = mutable.Map()) {
33

Peter van 't Hof's avatar
Peter van 't Hof committed
34
35
36
37
38
39
  /** Add an other class */
  def +=(other: Stats): Stats = {
    for ((key, value) <- other.samplesStats) {
      if (this.samplesStats.contains(key)) this.samplesStats(key) += value
      else this.samplesStats(key) = value
    }
40
41
    for ((field, fieldMap) <- other.generalStats) {
      val thisField = this.generalStats.get(field)
Peter van 't Hof's avatar
Peter van 't Hof committed
42
      if (thisField.isDefined) Stats.mergeStatsMap(thisField.get, fieldMap)
43
      else this.generalStats += field -> fieldMap
Peter van 't Hof's avatar
Peter van 't Hof committed
44
45
46
    }
    this
  }
Peter van 't Hof's avatar
Peter van 't Hof committed
47
48

  /** Function to write 1 specific general field */
49
50
  def writeField(field: String,
                 outputDir: File,
51
52
53
54
                 prefix: String = ""): File = {
    val file = prefix match {
      case "" => new File(outputDir, field + ".tsv")
      case _ => new File(outputDir, prefix + "-" + field + ".tsv")
Peter van 't Hof's avatar
Peter van 't Hof committed
55
56
    }

57
58
    val data = this.generalStats
      .getOrElse(field, mutable.Map[Any, Int]())
Peter van 't Hof's avatar
Peter van 't Hof committed
59
60
61
62
63
64
65
66
67
68
69

    file.getParentFile.mkdirs()
    val writer = new PrintWriter(file)
    writer.println("value\tcount")
    for (key <- data.keySet.toList.sortWith(sortAnyAny)) {
      writer.println(key + "\t" + data(key))
    }
    writer.close()
    file
  }

70
  /** Function to write 1 specific general field */
71
  def getField(field: String): Map[String, Array[Any]] = {
72

73
74
    val data = this.generalStats
      .getOrElse(field, mutable.Map[Any, Int]())
75
76
77
78
79
80
    val rows = for (key <- data.keySet.toArray.sortWith(sortAnyAny)) yield {
      (key, data(key))
    }
    Map("value" -> rows.map(_._1), "count" -> rows.map(_._2))
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
81
  /** Function to write 1 specific genotype field */
82
83
84
  def writeGenotypeField(samples: List[String],
                         field: String,
                         outputDir: File,
85
86
87
88
                         prefix: String = ""): Unit = {
    val file = prefix match {
      case "" => new File(outputDir, field + ".tsv")
      case _ => new File(outputDir, prefix + "-" + field + ".tsv")
Peter van 't Hof's avatar
Peter van 't Hof committed
89
90
91
92
93
    }

    file.getParentFile.mkdirs()
    val writer = new PrintWriter(file)
    writer.println(samples.mkString(field + "\t", "\t", ""))
94
95
96
97
98
99
100
    val keySet = (for (sample <- samples)
      yield
        this
          .samplesStats(sample)
          .genotypeStats
          .getOrElse(field, Map[Any, Int]())
          .keySet).fold(Set[Any]())(_ ++ _)
Peter van 't Hof's avatar
Peter van 't Hof committed
101
    for (key <- keySet.toList.sortWith(sortAnyAny)) {
102
103
104
105
106
107
108
      val values = for (sample <- samples)
        yield
          this
            .samplesStats(sample)
            .genotypeStats
            .getOrElse(field, Map[Any, Int]())
            .getOrElse(key, 0)
Peter van 't Hof's avatar
Peter van 't Hof committed
109
110
111
112
113
      writer.println(values.mkString(key + "\t", "\t", ""))
    }
    writer.close()
  }

114
  /** Function to write 1 specific genotype field */
115
  def getGenotypeField(samples: List[String],
116
                       field: String): Map[String, Map[String, Any]] = {
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
    val keySet = (for (sample <- samples)
      yield
        this
          .samplesStats(sample)
          .genotypeStats
          .getOrElse(field, Map[Any, Int]())
          .keySet).fold(Set[Any]())(_ ++ _)

    (for (sample <- samples)
      yield
        sample -> {
          keySet
            .map(
              key =>
                key.toString -> this
                  .samplesStats(sample)
                  .genotypeStats
                  .getOrElse(field, Map[Any, Int]())
                  .get(key))
            .filter(_._2.isDefined)
            .toMap
        }).toMap
139
  }
140
141

  /** This will generate stats for one contig */
142
  def getStatsAsMap(samples: List[String],
Peter van 't Hof's avatar
Peter van 't Hof committed
143
                     genotypeFields: List[String] = Nil,
144
145
                     infoFields: List[String] = Nil,
                     sampleDistributions: List[String] = Nil): Map[String, Any] = {
146
    Map(
147
148
      "genotype" -> genotypeFields.map(f => f -> getGenotypeField(samples, f)).toMap,
      "info" -> infoFields.map(f => f -> getField(f)).toMap,
149
      "sample_distributions" -> sampleDistributions
150
        .map(f => f -> getField("SampleDistribution-" + f))
151
        .toMap
152
    ) ++ Map(
153
154
155
156
157
158
159
160
161
162
              "sample_compare" -> Map(
                "samples" -> samples,
                "genotype_overlap" -> samples.map(sample1 =>
                  samples.map(sample2 =>
                    samplesStats(sample1).sampleToSample(sample2).genotypeOverlap)),
                "allele_overlap" -> samples.map(sample1 =>
                  samples.map(sample2 =>
                    samplesStats(sample1).sampleToSample(sample2).alleleOverlap))
              )
            )
163
164
  }

165
  def writeToFile(outputFile: File,
166
                  samples: List[String],
Peter van 't Hof's avatar
Peter van 't Hof committed
167
                  genotypeFields: List[String] = Nil,
168
                  infoFields: List[String] = Nil,
169
170
171
172
173
174
175
                  sampleDistributions: List[String] = Nil): Unit = {
    val allWriter = new PrintWriter(outputFile)
    val json = ConfigUtils.mapToJson(
      this.getStatsAsMap(samples, genotypeFields, infoFields, sampleDistributions))
    allWriter.println(json.nospaces)
    allWriter.close()

176
  }
177

Peter van 't Hof's avatar
Peter van 't Hof committed
178
179
180
}

object Stats {
181

pjvan_thof's avatar
pjvan_thof committed
182
183
184
185
186
187
188
189
190
191
192
193
  def emptyStats(samples: List[String]): Stats = {
    val stats = new Stats
    //init stats
    for (sample1 <- samples) {
      stats.samplesStats += sample1 -> new SampleStats
      for (sample2 <- samples) {
        stats.samplesStats(sample1).sampleToSample += sample2 -> new SampleToSampleStats
      }
    }
    stats
  }

Peter van 't Hof's avatar
Peter van 't Hof committed
194
195
196
197
198
199
200
  /** Merge m2 into m1 */
  def mergeStatsMap(m1: mutable.Map[Any, Int], m2: mutable.Map[Any, Int]): Unit = {
    for (key <- m2.keySet)
      m1(key) = m1.getOrElse(key, 0) + m2(key)
  }

  /** Merge m2 into m1 */
201
202
203
204
205
206
207
208
209
  def mergeNestedStatsMap(m1: mutable.Map[String, mutable.Map[Any, Int]],
                          m2: Map[String, Map[Any, Int]]): Unit = {
    for ((field, fieldMap) <- m2) {
      if (m1.contains(field)) {
        for ((key, value) <- fieldMap) {
          if (m1(field).contains(key)) m1(field)(key) += value
          else m1(field)(key) = value
        }
      } else m1(field) = mutable.Map(fieldMap.toList: _*)
Peter van 't Hof's avatar
Peter van 't Hof committed
210
211
    }
  }
212
}