Commit 142c815a authored by Peter van 't Hof's avatar Peter van 't Hof

Merge remote-tracking branch 'remotes/origin/develop' into feature-index_generation

parents ab30e376 7eb9af74
......@@ -17,15 +17,16 @@ package nl.lumc.sasc.biopet.pipelines.bammetrics
import java.io.File
import nl.lumc.sasc.biopet.core.annotations.{ RibosomalRefFlat, AnnotationRefFlat }
import nl.lumc.sasc.biopet.core.annotations.{ AnnotationRefFlat, RibosomalRefFlat }
import nl.lumc.sasc.biopet.utils.config.Configurable
import nl.lumc.sasc.biopet.core.summary.SummaryQScript
import nl.lumc.sasc.biopet.core.{ Reference, BiopetFifoPipe, PipelineCommand, SampleLibraryTag }
import nl.lumc.sasc.biopet.core.{ BiopetFifoPipe, PipelineCommand, Reference, SampleLibraryTag }
import nl.lumc.sasc.biopet.extensions.bedtools.{ BedtoolsCoverage, BedtoolsIntersect }
import nl.lumc.sasc.biopet.extensions.picard._
import nl.lumc.sasc.biopet.extensions.samtools.SamtoolsFlagstat
import nl.lumc.sasc.biopet.pipelines.bammetrics.scripts.CoverageStats
import nl.lumc.sasc.biopet.extensions.tools.BiopetFlagstat
import nl.lumc.sasc.biopet.utils.intervals.BedCheck
import org.broadinstitute.gatk.queue.QScript
class BamMetrics(val root: Configurable) extends QScript
......@@ -72,6 +73,8 @@ class BamMetrics(val root: Configurable) extends QScript
/** executed before script */
def init(): Unit = {
inputFiles :+= new InputFile(inputBam)
ampliconBedFile.foreach(BedCheck.checkBedFileToReference(_, referenceFasta(), biopetError = true))
roiBedFiles.foreach(BedCheck.checkBedFileToReference(_, referenceFasta(), biopetError = true))
}
/** Script to add jobs */
......
......@@ -61,9 +61,9 @@ class BamMetricsTest extends TestNGSuite with Matchers {
def testBamMetrics(rois: Int, amplicon: Boolean, rna: Boolean, wgs: Boolean) = {
val map = ConfigUtils.mergeMaps(Map("output_dir" -> BamMetricsTest.outputDir, "rna_metrics" -> rna, "wgs_metrics" -> wgs),
Map(BamMetricsTest.executables.toSeq: _*)) ++
(if (amplicon) Map("amplicon_bed" -> "amplicon.bed") else Map()) ++
(if (amplicon) Map("amplicon_bed" -> BamMetricsTest.ampliconBed.getAbsolutePath) else Map()) ++
(if (rna) Map("annotation_refflat" -> "transcripts.refFlat") else Map()) ++
Map("regions_of_interest" -> (1 to rois).map("roi_" + _ + ".bed").toList)
Map("regions_of_interest" -> (1 to rois).map(BamMetricsTest.roi(_).getAbsolutePath).toList)
val bammetrics: BamMetrics = initPipeline(map)
bammetrics.inputBam = BamMetricsTest.bam
......@@ -94,6 +94,14 @@ object BamMetricsTest {
val bam = new File(outputDir, "input" + File.separator + "bla.bam")
Files.touch(bam)
val ampliconBed = new File(outputDir, "input" + File.separator + "amplicon_bed.bed")
Files.touch(ampliconBed)
def roi(i: Int): File = {
val roi = new File(outputDir, "input" + File.separator + s"roi${i}.bed")
Files.touch(roi)
roi
}
private def copyFile(name: String): Unit = {
val is = getClass.getResourceAsStream("/" + name)
......
......@@ -26,7 +26,6 @@ import nl.lumc.sasc.biopet.core.{ MultiSampleQScript, PipelineCommand }
import nl.lumc.sasc.biopet.extensions.{ Cat, Raxml, RunGubbins }
import nl.lumc.sasc.biopet.pipelines.shiva.Shiva
import nl.lumc.sasc.biopet.extensions.tools.BastyGenerateFasta
import nl.lumc.sasc.biopet.utils.ConfigUtils
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.queue.QScript
......
......@@ -73,6 +73,8 @@ class BiopetFifoPipe(val root: Configurable,
_pipesJobs :::= commands
_pipesJobs = _pipesJobs.distinct
analysisName = commands.map(_.analysisName).mkString("_")
}
override def beforeCmd(): Unit = {
......
......@@ -132,7 +132,7 @@ trait ReportBuilder extends ToolCommand {
logger.info("Start")
val argsParser = new OptParser
val cmdArgs: Args = argsParser.parse(args, Args()) getOrElse sys.exit(1)
val cmdArgs: Args = argsParser.parse(args, Args()) getOrElse(throw new IllegalArgumentException)
require(cmdArgs.outputDir.exists(), "Output dir does not exist")
require(cmdArgs.outputDir.isDirectory, "Output dir is not a directory")
......
#!/usr/bin/env Rscript
suppressPackageStartupMessages(library('cn.mops'))
suppressPackageStartupMessages(library('optparse'))
# Script from https://git.lumc.nl/lgtc-bioinformatics/gapss3/blob/master/src/CNV/makeCnmops.sh
# modified to take arguments
option_list <- list(
make_option(c("--rawoutput"), dest="rawoutput"),
make_option(c("--cnv"), dest="cnv"),
make_option(c("--cnr"), dest="cnr"),
make_option(c("--chr"), dest="chr"),
make_option(c("--threads"), dest="threads", default=8, type="integer")
)
parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
arguments = parse_args(parser, positional_arguments=TRUE)
opt = arguments$options
args = arguments$args
chromosome <- opt$chr
CNVoutput <- opt$cnv
CNRoutput <- opt$cnr
bamFile <- args
BAMFiles <- c(bamFile)
bamDataRanges <- getReadCountsFromBAM(BAMFiles, mode="paired", refSeqName=chromosome, WL=1000, parallel=opt$threads)
write.table(as.data.frame( bamDataRanges ), quote = FALSE, opt$rawoutput, row.names=FALSE)
res <- cn.mops(bamDataRanges)
res <- calcIntegerCopyNumbers(res)
write.table(as.data.frame(cnvs(res)), quote = FALSE, CNVoutput, row.names=FALSE)
write.table(as.data.frame(cnvr(res)), quote = FALSE, CNRoutput, row.names=FALSE)
ppi <- 300
plot_margins <- c(3,4,1,2)+0.1
label_positions <- c(2,0.5,0)
dir.create(chromosome, showWarnings=FALSE, recursive=TRUE, mode="0744")
# Plot chromosome per sample.
for ( i in 1:length(BAMFiles)){
png(file=paste(chromosome,"/",chromosome,"-segplot-",i,".png", sep=""),
width = 16 * ppi, height = 10 * ppi,
res=ppi, bg = "white"
)
par(mfrow = c(1,1))
par(mar=plot_margins)
par(mgp=label_positions)
segplot(res,sampleIdx=i)
dev.off()
}
# Plot cnvr regions.
for ( i in 1:nrow(as.data.frame(cnvr(res)))) {
png(file=paste(chromosome,"/",chromosome,"-cnv-",i,".png",sep=""),
width = 16 * ppi, height = 10 * ppi,
res=ppi, bg = "white")
par(mfrow = c(1,1))
par(mar=plot_margins)
par(mgp=label_positions)
plot(res,which=i,toFile=TRUE)
dev.off()
}
#!/usr/bin/env Rscript
suppressPackageStartupMessages(library('cn.mops'))
suppressPackageStartupMessages(library('optparse'))
# Script from https://git.lumc.nl/lgtc-bioinformatics/gapss3/blob/master/src/CNV/makeCnmops.sh
# modified to take arguments
option_list <- list(
make_option(c("--rawoutput"), dest="rawoutput"),
make_option(c("--cnv"), dest="cnv"),
make_option(c("--cnr"), dest="cnr"),
make_option(c("--chr"), dest="chr"),
make_option(c("--targetBed"), dest="targetBed"),
make_option(c("--threads"), dest="threads", default=8, type="integer")
)
parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
arguments = parse_args(parser, positional_arguments=TRUE)
opt = arguments$options
args = arguments$args
chromosome <- opt$chr
CNVoutput <- opt$cnv
CNRoutput <- opt$cnr
bamFile <- args
BAMFiles <- c(bamFile)
### WES Specific code
segments <- read.table(opt$targetBed, sep="\t", as.is=TRUE)
# filter the segments by the requested chromosome
segments <- segments[ segments[,1] == chromosome, ]
gr <- GRanges(segments[,1],IRanges(segments[,2],segments[,3]))
### END WES Specific code
bamDataRanges <- getSegmentReadCountsFromBAM(BAMFiles, GR=gr, mode="paired", parallel=opt$threads)
write.table(as.data.frame( bamDataRanges ), quote = FALSE, opt$rawoutput, row.names=FALSE)
res <- exomecn.mops(bamDataRanges)
res <- calcIntegerCopyNumbers(res)
write.table(as.data.frame(cnvs(res)), quote = FALSE, CNVoutput, row.names=FALSE)
write.table(as.data.frame(cnvr(res)), quote = FALSE, CNRoutput, row.names=FALSE)
ppi <- 300
plot_margins <- c(3,4,1,2)+0.1
label_positions <- c(2,0.5,0)
dir.create(chromosome, showWarnings=FALSE, recursive=TRUE, mode="0744")
# Plot chromosome per sample.
for ( i in 1:length(BAMFiles)){
png(file=paste(chromosome,"/",chromosome,"-segplot-",i,".png", sep=""),
width = 16 * ppi, height = 10 * ppi,
res=ppi, bg = "white"
)
par(mfrow = c(1,1))
par(mar=plot_margins)
par(mgp=label_positions)
segplot(res,sampleIdx=i)
dev.off()
}
# Plot cnvr regions.
for ( i in 1:nrow(as.data.frame(cnvr(res)))) {
png(file=paste(chromosome,"/",chromosome,"-cnv-",i,".png",sep=""),
width = 16 * ppi, height = 10 * ppi,
res=ppi, bg = "white")
par(mfrow = c(1,1))
par(mar=plot_margins)
par(mgp=label_positions)
plot(res,which=i,toFile=TRUE)
dev.off()
}
library('optparse')
# Script taken from http://bioinfo-out.curie.fr/projects/freec/tutorial.html and modified for biopet
option_list <- list(
make_option(c("-i", "--input"), dest="input"),
make_option(c("-o", "--output"), dest="output")
)
parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
opt = parse_args(parser)
#
# Load Data
#
dataTable <-read.table(opt$input, header=TRUE);
BAF<-data.frame(dataTable)
chromosomes <- levels(dataTable$Chromosome)
ppi <- 300
plot_margins <- c(3,4,1,2)+0.1
label_positions <- c(2,0.5,0)
png(filename = opt$output, width = 16 * ppi, height = 10 * ppi,
res=ppi, bg = "white")
par(mfrow = c(6,4))
par(mar=plot_margins)
par(mgp=label_positions)
for (i in chromosomes) {
tt <- which(BAF$Chromosome==i)
if (length(tt)>0){
lBAF <-BAF[tt,]
plot(lBAF$Position,
lBAF$BAF,
ylim = c(-0.1,1.1),
xlab = paste ("position, chr",i),
ylab = "BAF",
pch = ".",
col = colors()[1])
tt <- which(lBAF$A==0.5)
points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[92])
tt <- which(lBAF$A!=0.5 & lBAF$A>=0)
points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[62])
tt <- 1
pres <- 1
if (length(lBAF$A)>4) {
for (j in c(2:(length(lBAF$A)-pres-1))) {
if (lBAF$A[j]==lBAF$A[j+pres]) {
tt[length(tt)+1] <- j
}
}
points(lBAF$Position[tt],lBAF$A[tt],pch = ".",col = colors()[24],cex=4)
points(lBAF$Position[tt],lBAF$B[tt],pch = ".",col = colors()[24],cex=4)
}
tt <- 1
pres <- 1
if (length(lBAF$FittedA)>4) {
for (j in c(2:(length(lBAF$FittedA)-pres-1))) {
if (lBAF$FittedA[j]==lBAF$FittedA[j+pres]) {
tt[length(tt)+1] <- j
}
}
points(lBAF$Position[tt],lBAF$FittedA[tt],pch = ".",col = colors()[463],cex=4)
points(lBAF$Position[tt],lBAF$FittedB[tt],pch = ".",col = colors()[463],cex=4)
}
}
}
dev.off()
library('optparse')
library('naturalsort')
# Script taken from http://bioinfo-out.curie.fr/projects/freec/tutorial.html and modified for biopet
option_list <- list(
make_option(c("-m", "--mappability"), dest="mappability"),
make_option(c("-p", "--ploidy"), default=2, type="integer", dest="ploidy"),
make_option(c("-i", "--input"), dest="input"),
make_option(c("-o", "--output"), dest="output")
)
parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
opt = parse_args(parser)
#
# Load mappability track
#
mappabilityFile <- opt$mappability
mappabilityTrack <- read.table(mappabilityFile, header=FALSE, col.names=c("chrom", "start", "end", "score"))
mappabilityTrack$Start <- mappabilityTrack$start+1
mappabilityTrack$Chromosome <- gsub("chr", "", mappabilityTrack$chrom)
#
# Load Data
#
dataTable <- read.table( opt$input , header=TRUE)
input_ratio <- data.frame(dataTable)
chromosomes <- naturalsort(levels(input_ratio$Chromosome))
input_ratio$Chromosome <- factor(input_ratio$Chromosome, levels=chromosomes, ordered=T)
sorted_ratio <- input_ratio[order(input_ratio$Chromosome),]
ratio <- merge(sorted_ratio, mappabilityTrack, sort=TRUE)
ratio <- ratio[order(ratio$Chromosome, ratio$Start),]
ploidy <- opt$ploidy
ppi <- 300
plot_margins <- c(3,4,1,2)+0.1
label_positions <- c(2,0.5,0)
maxLevelToPlot <- 3
for (i in c(1:length(ratio$Ratio))) {
if (ratio$Ratio[i]>maxLevelToPlot) {
ratio$Ratio[i]=maxLevelToPlot
}
}
#
# Plot the graphs per chromosome
#
for (i in chromosomes) {
png(filename = paste(opt$output, ".", i,".png",sep=""), width = 4 * ppi, height = 2.5 * ppi,
res=ppi, bg = "white")
par(mfrow = c(1,1))
par(mar=plot_margins)
par(mgp=label_positions)
tt <- which(ratio$Chromosome==i)
if (length(tt)>0) {
plot(ratio$Start[tt],
ratio$Ratio[tt]*ploidy,
ylim = c(0,maxLevelToPlot*ploidy),
xlab = paste ("position, chr",i),
ylab = "normalized CN",
pch = ".",
col = colors()[88])
title(outer=TRUE)
tt <- which(ratio$Chromosome==i & ratio$CopyNumber>ploidy )
points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136])
tt <- which(ratio$Chromosome==i & ratio$Ratio==maxLevelToPlot & ratio$CopyNumber>ploidy)
points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136],cex=4)
tt <- which(ratio$Chromosome==i & ratio$CopyNumber<ploidy & ratio$CopyNumber!= -1)
points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[461])
tt <- which(ratio$Chromosome==i)
#UNCOMMENT HERE TO SEE THE PREDICTED COPY NUMBER LEVEL:
#points(ratio$Start[tt],ratio$CopyNumber[tt], pch = ".", col = colors()[24],cex=4)
}
#tt <- which(ratio$Chromosome==i)
#UNCOMMENT HERE TO SEE THE EVALUATED MEDIAN LEVEL PER SEGMENT:
#points(ratio$Start[tt],ratio$MedianRatio[tt]*ploidy, pch = ".", col = colors()[463],cex=4)
dev.off()
}
png(filename = paste(opt$output, ".png",sep=""), width = 16 * ppi, height = 10 * ppi,
res=ppi, bg = "white")
par(mfrow = c(6,4))
par(mar=plot_margins)
par(mgp=label_positions)
for (i in chromosomes) {
tt <- which(ratio$Chromosome==i)
if (length(tt)>0) {
plot(ratio$Start[tt],
ratio$Ratio[tt]*ploidy,
ylim = c(0,maxLevelToPlot*ploidy),
xlab = paste ("position, chr",i),
ylab = "normalized CN",
pch = ".",
col = colors()[88])
tt <- which(ratio$Chromosome==i & ratio$CopyNumber>ploidy )
points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136])
tt <- which(ratio$Chromosome==i & ratio$Ratio==maxLevelToPlot & ratio$CopyNumber>ploidy)
points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136],cex=4)
tt <- which(ratio$Chromosome==i & ratio$CopyNumber<ploidy & ratio$CopyNumber!= -1)
points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[461])
tt <- which(ratio$Chromosome==i)
#UNCOMMENT HERE TO SEE THE PREDICTED COPY NUMBER LEVEL:
#points(ratio$Start[tt],ratio$CopyNumber[tt], pch = ".", col = colors()[24],cex=4)
}
#tt <- which(ratio$Chromosome==i)
#UNCOMMENT HERE TO SEE THE EVALUATED MEDIAN LEVEL PER SEGMENT:
#points(ratio$Start[tt],ratio$MedianRatio[tt]*ploidy, pch = ".", col = colors()[463],cex=4)
}
dev.off()
# Export the whole genome graph
png(filename = paste(opt$output, ".wg.png",sep=""), width = 16 * ppi, height = 10 * ppi,
res=ppi, bg = "white")
plot_margins <- c(3,4,2,2)+0.1
label_positions <- c(2,0.5,0)
par(mfrow = c(1,1))
par(mar=plot_margins)
par(mgp=label_positions)
par(xaxs="i", yaxs="i")
maxLevelToPlot <- 3
for (i in c(1:length(ratio$Ratio))) {
if (ratio$Ratio[i]>maxLevelToPlot) {
ratio$Ratio[i]=maxLevelToPlot
}
}
for (i in c(1:length(ratio$Start))) {
ratio$Position[i] = (i-1) *5000 +1
}
plotRatioLT <- 0.10
filteredSet <- ratio[ ratio$score > plotRatioLT, ]
plot(filteredSet$Position,
filteredSet$Ratio*ploidy,
ylim = c(0,maxLevelToPlot*ploidy),
xlab = paste ("Chr. on genome"),
ylab = "normalized CN",
pch = ".",
col = colors()[88])
title(outer=TRUE)
tt <- which(filteredSet$CopyNumber>ploidy)
points(filteredSet$Position[tt],filteredSet$Ratio[tt]*ploidy,pch = ".",col = colors()[136])
tt <- which(filteredSet$Ratio==maxLevelToPlot & filteredSet$CopyNumber>ploidy)
points(filteredSet$Position[tt],filteredSet$Ratio[tt]*ploidy,pch = ".",col = colors()[136],cex=4)
tt <- which(filteredSet$CopyNumber<ploidy & filteredSet$CopyNumber!= -1)
points(filteredSet$Position[tt],filteredSet$Ratio[tt]*ploidy,pch = ".",col = colors()[461], bg="black")
for (chrom in chromosomes) {
tt <- which(filteredSet$Chromosome == chrom)
print(filteredSet[tt[1],])
xpos <- filteredSet$Position[tt][1]
abline(v=xpos, col="grey")
axis(3, at=xpos, labels=chrom , las=2)
}
dev.off()
library(rtracklayer)
library('optparse')
library(GenomicRanges);
library(IRanges);
# Script taken from http://bioinfo-out.curie.fr/projects/freec/tutorial.html and modified for biopet
option_list <- list(
make_option(c("-c", "--cnv"), dest="cnv"),
make_option(c("-r", "--ratios"), dest="ratios"),
make_option(c("-o", "--output"), dest="output")
)
parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
opt = parse_args(parser)
dataTable <-read.table(opt$ratios, header=TRUE);
ratio<-data.frame(dataTable)
dataTable <- read.table(opt$cnv, header=FALSE)
cnvs<- data.frame(dataTable)
ratio$Ratio[which(ratio$Ratio==-1)]=NA
cnvs.bed=GRanges(cnvs[,1],IRanges(cnvs[,2],cnvs[,3]))
ratio.bed=GRanges(ratio$Chromosome,IRanges(ratio$Start,ratio$Start),score=ratio$Ratio)
overlaps <- subsetByOverlaps(ratio.bed,cnvs.bed)
normals <- setdiff(ratio.bed,cnvs.bed)
normals <- subsetByOverlaps(ratio.bed,normals)
#mu <- mean(score(normals),na.rm=TRUE)
#sigma<- sd(score(normals),na.rm=TRUE)
#hist(score(normals),n=500,xlim=c(0,2))
#hist(log(score(normals)),n=500,xlim=c(-1,1))
#shapiro.test(score(normals)[which(!is.na(score(normals)))][5001:10000])
#qqnorm (score(normals)[which(!is.na(score(normals)))],ylim=(c(0,10)))
#qqline(score(normals)[which(!is.na(score(normals)))], col = 2)
#shapiro.test(log(score(normals))[which(!is.na(score(normals)))][5001:10000])
#qqnorm (log(score(normals))[which(!is.na(score(normals)))],ylim=(c(-6,10)))
#qqline(log(score(normals))[which(!is.na(score(normals)))], col = 2)
numberOfCol=length(cnvs)
for (i in c(1:length(cnvs[,1]))) {
values <- score(subsetByOverlaps(ratio.bed,cnvs.bed[i]))
#wilcox.test(values,mu=mu)
W <- function(values,normals){resultw <- try(wilcox.test(values,score(normals)), silent = TRUE)
if(class(resultw)=="try-error") return(list("statistic"=NA,"parameter"=NA,"p.value"=NA,"null.value"=NA,"alternative"=NA,"method"=NA,"data.name"=NA)) else resultw}
KS <- function(values,normals){resultks <- try(ks.test(values,score(normals)), silent = TRUE)
if(class(resultks)=="try-error") return(list("statistic"=NA,"p.value"=NA,"alternative"=NA,"method"=NA,"data.name"=NA)) else resultks}
#resultks <- try(KS <- ks.test(values,score(normals)), silent = TRUE)
# if(class(resultks)=="try-error") NA) else resultks
cnvs[i,numberOfCol+1]=W(values,normals)$p.value
cnvs[i,numberOfCol+2]=KS(values,normals)$p.value
}
if (numberOfCol==5) {
names(cnvs)=c("chr","start","end","copy number","status","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")
}
if (numberOfCol==7) {
names(cnvs)=c("chr","start","end","copy number","status","genotype","uncertainty","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")
}
if (numberOfCol==9) {
names(cnvs)=c("chr","start","end","copy number","status","genotype","uncertainty","somatic/germline","precentageOfGermline","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")
}
write.table(cnvs, file=opt$output, sep="\t",quote=F,row.names=F)
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.