PedigreeQscript.scala 6.47 KB
Newer Older
1 2 3 4 5
package nl.lumc.sasc.biopet.core

import java.io.PrintWriter

import nl.lumc.sasc.biopet.core.PedigreeQscript.PedMergeStrategy
6
import nl.lumc.sasc.biopet.utils.Logging
7 8 9 10 11 12
import org.broadinstitute.gatk.queue.QScript

import scala.io.Source

/**
 * Created by Sander Bollen on 28-3-17.
Sander Bollen's avatar
Sander Bollen committed
13 14 15 16 17 18
  *
  * A multi-sample Qscript with additional Pedigree information.
  * Pedigrees follow the PED standard.
  * See: http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml for the format
  *
  * Pedigrees may be parsed from the sample config and/or a supplied PED file.
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
 */
trait PedigreeQscript extends MultiSampleQScript { qscript: QScript =>

  /* Optionally parse from ped file */
  def ped: Option[File] = None

  /* The merge stategy to use when we have both a ped file and sample tag information */
  def mergeStrategy: PedMergeStrategy.Value = PedMergeStrategy.Concatenate

  /**
   * Case class representing a PED samples
   * For the PED format, see:
   * http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml
   * @param familyId family id
   * @param individualId individual id
   * @param paternalId Optional paternal id
   * @param maternalId Optional maternal id
   * @param gender gender
   * @param affectedPhenotype Optional boolean
   * @param genotypeFields optional genotype fileds
   */
  case class PedSample(familyId: String, individualId: String,
                       paternalId: Option[String],
                       maternalId: Option[String],
                       gender: MultiSampleQScript.Gender.Value,
                       affectedPhenotype: Option[Boolean],
                       genotypeFields: List[String] = Nil)

  lazy val pedSamples: List[PedSample] = {
    mergeStrategy match {
      case PedMergeStrategy.Concatenate => pedSamplesFromConfig() ::: parsePedFile()
      case _                            => throw new NotImplementedError() // todo
    }
  }

  /**
   * Get affected samples
   *
   * @return List[PedSample]
   */
  def getIndexSamples: List[PedSample] = {
    pedSamples.filter(x => x.affectedPhenotype)
  }

  /**
   * Get pedSamples from sample tags in config
   * May return empty list if no pedigree can be constructed
66 67
   * PedSamples can only be constructed for those samples where family is defined
   * Furthermore, if a father or mother is given, another sample with this id must also exist
68 69 70 71
   *
   * @return
   */
  def pedSamplesFromConfig(): List[PedSample] = {
72
    val withFam = samples.values.filter(_.family.isDefined)
Sander Bollen's avatar
Sander Bollen committed
73 74 75
    val sampleIds = withFam.map(_.sampleId).toSet
    val fathers = withFam.flatMap(_.father)
    val mothers = withFam.flatMap(_.mother)
76
    fathers.foreach { f =>
Sander Bollen's avatar
Sander Bollen committed
77
      if (!sampleIds.contains(f)) {
78
        Logging.addError(s"Father $f does not exist in samples")
79
      }
80 81
    }
    mothers.foreach { m =>
Sander Bollen's avatar
Sander Bollen committed
82
      if (!sampleIds.contains(m)) {
83 84 85 86 87
        Logging.addError(s"Mother $m does not exist in samples")
      }
    }
    withFam.map { s =>
      PedSample(s.family.get, s.sampleId, s.father, s.mother, s.gender, None)
88 89 90
    }.toList
  }

91
  /* Parse ped file to list of PedSamples */
92 93 94 95 96 97 98
  def parsePedFile(): List[PedSample] = {
    ped match {
      case Some(p) => Source.fromFile(p).getLines().map { x => parseSinglePedLine(x) }.toList
      case _       => Nil
    }
  }

99
  /* Parse a single Ped line to a PedSample */
100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
  def parseSinglePedLine(line: String): PedSample = {
    val arr = line.split("\\s")
    var genotypeFields: List[String] = Nil
    if (arr.size < 6) {
      throw new IllegalArgumentException("Ped file contains less than 6 columns")
    } else if (arr.length == 6) {
      genotypeFields = Nil
    } else {
      genotypeFields = arr.drop(6).toList
    }
    val gender = arr(4) match {
      case "1" => MultiSampleQScript.Gender.Male
      case "2" => MultiSampleQScript.Gender.Female
      case _   => MultiSampleQScript.Gender.Unknown
    }
    val affected = arr(5) match {
      case "1" => Some(false)
      case "2" => Some(true)
      case _   => None
    }
    val paternalId = arr(2) match {
      case "0"       => None
      case otherwise => Some(otherwise)
    }
    val maternalId = arr(3) match {
      case "0"       => None
      case otherwise => Some(otherwise)
    }
    PedSample(arr(0), arr(1), paternalId, maternalId, gender, affected, genotypeFields)
  }

131
  /* Check whether sample is a mother */
132 133 134 135 136
  def isMother(pedSample: PedSample): Boolean = {
    val motherIds = pedSamples.flatMap(_.maternalId)
    motherIds.contains(pedSample.individualId)
  }

137
  /* Check whether sample is a father */
138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202
  def isFather(pedSample: PedSample): Boolean = {
    val fatherIds = pedSamples.flatMap(_.paternalId)
    fatherIds.contains(pedSample.individualId)
  }

  /**
   * Convenience method for checking whether current pedigree constitutes a single patient pedigree
   */
  def isSingle: Boolean = {
    pedSamples.size == 1 && pedSamples.head.maternalId.isEmpty && pedSamples.head.paternalId.isEmpty
  }

  /**
   * Convenience method for checking whether current pedigree constitutes a trio pedigree
   */
  def isTrio: Boolean = {
    getIndexSamples.size == 1 &&
      pedSamples.size == 3 &&
      (getIndexSamples.head.maternalId.isDefined && pedSamples.map(_.individualId).contains(getIndexSamples.head.maternalId.get)) &&
      (getIndexSamples.head.paternalId.isDefined && pedSamples.map(_.individualId).contains(getIndexSamples.head.paternalId.get))
  }

  /**
   * Write list of ped samples to a PED file
   *
   * For the PED format see:
   * http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml
   */
  def writeToPedFile(outFile: File): Unit = {
    val writer = new PrintWriter(outFile)
    pedSamples.foreach { p =>
      val paternalField = p.paternalId match {
        case Some(s) => s
        case _       => "0"
      }
      val maternalField = p.maternalId match {
        case Some(s) => s
        case _       => "0"
      }
      val genderField = p.gender match {
        case MultiSampleQScript.Gender.Male   => "1"
        case MultiSampleQScript.Gender.Female => "2"
        case _                                => "-9"
      }
      val affectedField = p.affectedPhenotype match {
        case Some(b) => b match {
          case true => "2"
          case _    => "1"
        }
        case _ => "0"
      }
      val line: String = s"${p.familyId}\t${p.individualId}\t$paternalField\t$maternalField\t$genderField\t$affectedField\t" +
        p.genotypeFields.mkString("\t")
      writer.write(line + "\n")
    }
    writer.close()
  }

}

object PedigreeQscript {
  object PedMergeStrategy extends Enumeration {
    val Concatenate, Update, ConcatenatedAndUpdate = Value
  }
}