Commit 5ca76501 authored by bow's avatar bow
Browse files

Merge branch 'feature-cnvpipeline' into 'develop'

Feature cnvpipeline

Work in progress

* [ ] - ConiferExome
* [ ] - ConiferExome Testing
* [x] - FreeC
* [ ] - FreeC testing
* [ ] - cnMOPS
* [ ] - cnMOPS testing
* [ ] - TITAN

Moved to next release:

* [ ] - HMMCopy
* [ ] - HMMCopy testing

See merge request !100
parents 0eeaa7b5 bb66d720
......@@ -26,7 +26,6 @@ import nl.lumc.sasc.biopet.core.{ MultiSampleQScript, PipelineCommand }
import nl.lumc.sasc.biopet.extensions.{ Cat, Raxml, RunGubbins }
import nl.lumc.sasc.biopet.pipelines.shiva.Shiva
import nl.lumc.sasc.biopet.extensions.tools.BastyGenerateFasta
import nl.lumc.sasc.biopet.utils.ConfigUtils
import nl.lumc.sasc.biopet.utils.config.Configurable
import org.broadinstitute.gatk.queue.QScript
......
#!/usr/bin/env Rscript
suppressPackageStartupMessages(library('cn.mops'))
suppressPackageStartupMessages(library('optparse'))
# Script from https://git.lumc.nl/lgtc-bioinformatics/gapss3/blob/master/src/CNV/makeCnmops.sh
# modified to take arguments
option_list <- list(
make_option(c("--rawoutput"), dest="rawoutput"),
make_option(c("--cnv"), dest="cnv"),
make_option(c("--cnr"), dest="cnr"),
make_option(c("--chr"), dest="chr"),
make_option(c("--threads"), dest="threads", default=8, type="integer")
)
parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
arguments = parse_args(parser, positional_arguments=TRUE)
opt = arguments$options
args = arguments$args
chromosome <- opt$chr
CNVoutput <- opt$cnv
CNRoutput <- opt$cnr
bamFile <- args
BAMFiles <- c(bamFile)
bamDataRanges <- getReadCountsFromBAM(BAMFiles, mode="paired", refSeqName=chromosome, WL=1000, parallel=opt$threads)
write.table(as.data.frame( bamDataRanges ), quote = FALSE, opt$rawoutput, row.names=FALSE)
res <- cn.mops(bamDataRanges)
res <- calcIntegerCopyNumbers(res)
write.table(as.data.frame(cnvs(res)), quote = FALSE, CNVoutput, row.names=FALSE)
write.table(as.data.frame(cnvr(res)), quote = FALSE, CNRoutput, row.names=FALSE)
ppi <- 300
plot_margins <- c(3,4,1,2)+0.1
label_positions <- c(2,0.5,0)
dir.create(chromosome, showWarnings=FALSE, recursive=TRUE, mode="0744")
# Plot chromosome per sample.
for ( i in 1:length(BAMFiles)){
png(file=paste(chromosome,"/",chromosome,"-segplot-",i,".png", sep=""),
width = 16 * ppi, height = 10 * ppi,
res=ppi, bg = "white"
)
par(mfrow = c(1,1))
par(mar=plot_margins)
par(mgp=label_positions)
segplot(res,sampleIdx=i)
dev.off()
}
# Plot cnvr regions.
for ( i in 1:nrow(as.data.frame(cnvr(res)))) {
png(file=paste(chromosome,"/",chromosome,"-cnv-",i,".png",sep=""),
width = 16 * ppi, height = 10 * ppi,
res=ppi, bg = "white")
par(mfrow = c(1,1))
par(mar=plot_margins)
par(mgp=label_positions)
plot(res,which=i,toFile=TRUE)
dev.off()
}
#!/usr/bin/env Rscript
suppressPackageStartupMessages(library('cn.mops'))
suppressPackageStartupMessages(library('optparse'))
# Script from https://git.lumc.nl/lgtc-bioinformatics/gapss3/blob/master/src/CNV/makeCnmops.sh
# modified to take arguments
option_list <- list(
make_option(c("--rawoutput"), dest="rawoutput"),
make_option(c("--cnv"), dest="cnv"),
make_option(c("--cnr"), dest="cnr"),
make_option(c("--chr"), dest="chr"),
make_option(c("--targetBed"), dest="targetBed"),
make_option(c("--threads"), dest="threads", default=8, type="integer")
)
parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
arguments = parse_args(parser, positional_arguments=TRUE)
opt = arguments$options
args = arguments$args
chromosome <- opt$chr
CNVoutput <- opt$cnv
CNRoutput <- opt$cnr
bamFile <- args
BAMFiles <- c(bamFile)
### WES Specific code
segments <- read.table(opt$targetBed, sep="\t", as.is=TRUE)
# filter the segments by the requested chromosome
segments <- segments[ segments[,1] == chromosome, ]
gr <- GRanges(segments[,1],IRanges(segments[,2],segments[,3]))
### END WES Specific code
bamDataRanges <- getSegmentReadCountsFromBAM(BAMFiles, GR=gr, mode="paired", parallel=opt$threads)
write.table(as.data.frame( bamDataRanges ), quote = FALSE, opt$rawoutput, row.names=FALSE)
res <- exomecn.mops(bamDataRanges)
res <- calcIntegerCopyNumbers(res)
write.table(as.data.frame(cnvs(res)), quote = FALSE, CNVoutput, row.names=FALSE)
write.table(as.data.frame(cnvr(res)), quote = FALSE, CNRoutput, row.names=FALSE)
ppi <- 300
plot_margins <- c(3,4,1,2)+0.1
label_positions <- c(2,0.5,0)
dir.create(chromosome, showWarnings=FALSE, recursive=TRUE, mode="0744")
# Plot chromosome per sample.
for ( i in 1:length(BAMFiles)){
png(file=paste(chromosome,"/",chromosome,"-segplot-",i,".png", sep=""),
width = 16 * ppi, height = 10 * ppi,
res=ppi, bg = "white"
)
par(mfrow = c(1,1))
par(mar=plot_margins)
par(mgp=label_positions)
segplot(res,sampleIdx=i)
dev.off()
}
# Plot cnvr regions.
for ( i in 1:nrow(as.data.frame(cnvr(res)))) {
png(file=paste(chromosome,"/",chromosome,"-cnv-",i,".png",sep=""),
width = 16 * ppi, height = 10 * ppi,
res=ppi, bg = "white")
par(mfrow = c(1,1))
par(mar=plot_margins)
par(mgp=label_positions)
plot(res,which=i,toFile=TRUE)
dev.off()
}
library('optparse')
# Script taken from http://bioinfo-out.curie.fr/projects/freec/tutorial.html and modified for biopet
option_list <- list(
make_option(c("-i", "--input"), dest="input"),
make_option(c("-o", "--output"), dest="output")
)
parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
opt = parse_args(parser)
#
# Load Data
#
dataTable <-read.table(opt$input, header=TRUE);
BAF<-data.frame(dataTable)
chromosomes <- levels(dataTable$Chromosome)
ppi <- 300
plot_margins <- c(3,4,1,2)+0.1
label_positions <- c(2,0.5,0)
png(filename = opt$output, width = 16 * ppi, height = 10 * ppi,
res=ppi, bg = "white")
par(mfrow = c(6,4))
par(mar=plot_margins)
par(mgp=label_positions)
for (i in chromosomes) {
tt <- which(BAF$Chromosome==i)
if (length(tt)>0){
lBAF <-BAF[tt,]
plot(lBAF$Position,
lBAF$BAF,
ylim = c(-0.1,1.1),
xlab = paste ("position, chr",i),
ylab = "BAF",
pch = ".",
col = colors()[1])
tt <- which(lBAF$A==0.5)
points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[92])
tt <- which(lBAF$A!=0.5 & lBAF$A>=0)
points(lBAF$Position[tt],lBAF$BAF[tt],pch = ".",col = colors()[62])
tt <- 1
pres <- 1
if (length(lBAF$A)>4) {
for (j in c(2:(length(lBAF$A)-pres-1))) {
if (lBAF$A[j]==lBAF$A[j+pres]) {
tt[length(tt)+1] <- j
}
}
points(lBAF$Position[tt],lBAF$A[tt],pch = ".",col = colors()[24],cex=4)
points(lBAF$Position[tt],lBAF$B[tt],pch = ".",col = colors()[24],cex=4)
}
tt <- 1
pres <- 1
if (length(lBAF$FittedA)>4) {
for (j in c(2:(length(lBAF$FittedA)-pres-1))) {
if (lBAF$FittedA[j]==lBAF$FittedA[j+pres]) {
tt[length(tt)+1] <- j
}
}
points(lBAF$Position[tt],lBAF$FittedA[tt],pch = ".",col = colors()[463],cex=4)
points(lBAF$Position[tt],lBAF$FittedB[tt],pch = ".",col = colors()[463],cex=4)
}
}
}
dev.off()
library('optparse')
library('naturalsort')
# Script taken from http://bioinfo-out.curie.fr/projects/freec/tutorial.html and modified for biopet
option_list <- list(
make_option(c("-m", "--mappability"), dest="mappability"),
make_option(c("-p", "--ploidy"), default=2, type="integer", dest="ploidy"),
make_option(c("-i", "--input"), dest="input"),
make_option(c("-o", "--output"), dest="output")
)
parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
opt = parse_args(parser)
#
# Load mappability track
#
mappabilityFile <- opt$mappability
mappabilityTrack <- read.table(mappabilityFile, header=FALSE, col.names=c("chrom", "start", "end", "score"))
mappabilityTrack$Start <- mappabilityTrack$start+1
mappabilityTrack$Chromosome <- gsub("chr", "", mappabilityTrack$chrom)
#
# Load Data
#
dataTable <- read.table( opt$input , header=TRUE)
input_ratio <- data.frame(dataTable)
chromosomes <- naturalsort(levels(input_ratio$Chromosome))
input_ratio$Chromosome <- factor(input_ratio$Chromosome, levels=chromosomes, ordered=T)
sorted_ratio <- input_ratio[order(input_ratio$Chromosome),]
ratio <- merge(sorted_ratio, mappabilityTrack, sort=TRUE)
ratio <- ratio[order(ratio$Chromosome, ratio$Start),]
ploidy <- opt$ploidy
ppi <- 300
plot_margins <- c(3,4,1,2)+0.1
label_positions <- c(2,0.5,0)
maxLevelToPlot <- 3
for (i in c(1:length(ratio$Ratio))) {
if (ratio$Ratio[i]>maxLevelToPlot) {
ratio$Ratio[i]=maxLevelToPlot
}
}
#
# Plot the graphs per chromosome
#
for (i in chromosomes) {
png(filename = paste(opt$output, ".", i,".png",sep=""), width = 4 * ppi, height = 2.5 * ppi,
res=ppi, bg = "white")
par(mfrow = c(1,1))
par(mar=plot_margins)
par(mgp=label_positions)
tt <- which(ratio$Chromosome==i)
if (length(tt)>0) {
plot(ratio$Start[tt],
ratio$Ratio[tt]*ploidy,
ylim = c(0,maxLevelToPlot*ploidy),
xlab = paste ("position, chr",i),
ylab = "normalized CN",
pch = ".",
col = colors()[88])
title(outer=TRUE)
tt <- which(ratio$Chromosome==i & ratio$CopyNumber>ploidy )
points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136])
tt <- which(ratio$Chromosome==i & ratio$Ratio==maxLevelToPlot & ratio$CopyNumber>ploidy)
points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136],cex=4)
tt <- which(ratio$Chromosome==i & ratio$CopyNumber<ploidy & ratio$CopyNumber!= -1)
points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[461])
tt <- which(ratio$Chromosome==i)
#UNCOMMENT HERE TO SEE THE PREDICTED COPY NUMBER LEVEL:
#points(ratio$Start[tt],ratio$CopyNumber[tt], pch = ".", col = colors()[24],cex=4)
}
#tt <- which(ratio$Chromosome==i)
#UNCOMMENT HERE TO SEE THE EVALUATED MEDIAN LEVEL PER SEGMENT:
#points(ratio$Start[tt],ratio$MedianRatio[tt]*ploidy, pch = ".", col = colors()[463],cex=4)
dev.off()
}
png(filename = paste(opt$output, ".png",sep=""), width = 16 * ppi, height = 10 * ppi,
res=ppi, bg = "white")
par(mfrow = c(6,4))
par(mar=plot_margins)
par(mgp=label_positions)
for (i in chromosomes) {
tt <- which(ratio$Chromosome==i)
if (length(tt)>0) {
plot(ratio$Start[tt],
ratio$Ratio[tt]*ploidy,
ylim = c(0,maxLevelToPlot*ploidy),
xlab = paste ("position, chr",i),
ylab = "normalized CN",
pch = ".",
col = colors()[88])
tt <- which(ratio$Chromosome==i & ratio$CopyNumber>ploidy )
points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136])
tt <- which(ratio$Chromosome==i & ratio$Ratio==maxLevelToPlot & ratio$CopyNumber>ploidy)
points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[136],cex=4)
tt <- which(ratio$Chromosome==i & ratio$CopyNumber<ploidy & ratio$CopyNumber!= -1)
points(ratio$Start[tt],ratio$Ratio[tt]*ploidy,pch = ".",col = colors()[461])
tt <- which(ratio$Chromosome==i)
#UNCOMMENT HERE TO SEE THE PREDICTED COPY NUMBER LEVEL:
#points(ratio$Start[tt],ratio$CopyNumber[tt], pch = ".", col = colors()[24],cex=4)
}
#tt <- which(ratio$Chromosome==i)
#UNCOMMENT HERE TO SEE THE EVALUATED MEDIAN LEVEL PER SEGMENT:
#points(ratio$Start[tt],ratio$MedianRatio[tt]*ploidy, pch = ".", col = colors()[463],cex=4)
}
dev.off()
# Export the whole genome graph
png(filename = paste(opt$output, ".wg.png",sep=""), width = 16 * ppi, height = 10 * ppi,
res=ppi, bg = "white")
plot_margins <- c(3,4,2,2)+0.1
label_positions <- c(2,0.5,0)
par(mfrow = c(1,1))
par(mar=plot_margins)
par(mgp=label_positions)
par(xaxs="i", yaxs="i")
maxLevelToPlot <- 3
for (i in c(1:length(ratio$Ratio))) {
if (ratio$Ratio[i]>maxLevelToPlot) {
ratio$Ratio[i]=maxLevelToPlot
}
}
for (i in c(1:length(ratio$Start))) {
ratio$Position[i] = (i-1) *5000 +1
}
plotRatioLT <- 0.10
filteredSet <- ratio[ ratio$score > plotRatioLT, ]
plot(filteredSet$Position,
filteredSet$Ratio*ploidy,
ylim = c(0,maxLevelToPlot*ploidy),
xlab = paste ("Chr. on genome"),
ylab = "normalized CN",
pch = ".",
col = colors()[88])
title(outer=TRUE)
tt <- which(filteredSet$CopyNumber>ploidy)
points(filteredSet$Position[tt],filteredSet$Ratio[tt]*ploidy,pch = ".",col = colors()[136])
tt <- which(filteredSet$Ratio==maxLevelToPlot & filteredSet$CopyNumber>ploidy)
points(filteredSet$Position[tt],filteredSet$Ratio[tt]*ploidy,pch = ".",col = colors()[136],cex=4)
tt <- which(filteredSet$CopyNumber<ploidy & filteredSet$CopyNumber!= -1)
points(filteredSet$Position[tt],filteredSet$Ratio[tt]*ploidy,pch = ".",col = colors()[461], bg="black")
for (chrom in chromosomes) {
tt <- which(filteredSet$Chromosome == chrom)
print(filteredSet[tt[1],])
xpos <- filteredSet$Position[tt][1]
abline(v=xpos, col="grey")
axis(3, at=xpos, labels=chrom , las=2)
}
dev.off()
library(rtracklayer)
library('optparse')
library(GenomicRanges);
library(IRanges);
# Script taken from http://bioinfo-out.curie.fr/projects/freec/tutorial.html and modified for biopet
option_list <- list(
make_option(c("-c", "--cnv"), dest="cnv"),
make_option(c("-r", "--ratios"), dest="ratios"),
make_option(c("-o", "--output"), dest="output")
)
parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
opt = parse_args(parser)
dataTable <-read.table(opt$ratios, header=TRUE);
ratio<-data.frame(dataTable)
dataTable <- read.table(opt$cnv, header=FALSE)
cnvs<- data.frame(dataTable)
ratio$Ratio[which(ratio$Ratio==-1)]=NA
cnvs.bed=GRanges(cnvs[,1],IRanges(cnvs[,2],cnvs[,3]))
ratio.bed=GRanges(ratio$Chromosome,IRanges(ratio$Start,ratio$Start),score=ratio$Ratio)
overlaps <- subsetByOverlaps(ratio.bed,cnvs.bed)
normals <- setdiff(ratio.bed,cnvs.bed)
normals <- subsetByOverlaps(ratio.bed,normals)
#mu <- mean(score(normals),na.rm=TRUE)
#sigma<- sd(score(normals),na.rm=TRUE)
#hist(score(normals),n=500,xlim=c(0,2))
#hist(log(score(normals)),n=500,xlim=c(-1,1))
#shapiro.test(score(normals)[which(!is.na(score(normals)))][5001:10000])
#qqnorm (score(normals)[which(!is.na(score(normals)))],ylim=(c(0,10)))
#qqline(score(normals)[which(!is.na(score(normals)))], col = 2)
#shapiro.test(log(score(normals))[which(!is.na(score(normals)))][5001:10000])
#qqnorm (log(score(normals))[which(!is.na(score(normals)))],ylim=(c(-6,10)))
#qqline(log(score(normals))[which(!is.na(score(normals)))], col = 2)
numberOfCol=length(cnvs)
for (i in c(1:length(cnvs[,1]))) {
values <- score(subsetByOverlaps(ratio.bed,cnvs.bed[i]))
#wilcox.test(values,mu=mu)
W <- function(values,normals){resultw <- try(wilcox.test(values,score(normals)), silent = TRUE)
if(class(resultw)=="try-error") return(list("statistic"=NA,"parameter"=NA,"p.value"=NA,"null.value"=NA,"alternative"=NA,"method"=NA,"data.name"=NA)) else resultw}
KS <- function(values,normals){resultks <- try(ks.test(values,score(normals)), silent = TRUE)
if(class(resultks)=="try-error") return(list("statistic"=NA,"p.value"=NA,"alternative"=NA,"method"=NA,"data.name"=NA)) else resultks}
#resultks <- try(KS <- ks.test(values,score(normals)), silent = TRUE)
# if(class(resultks)=="try-error") NA) else resultks
cnvs[i,numberOfCol+1]=W(values,normals)$p.value
cnvs[i,numberOfCol+2]=KS(values,normals)$p.value
}
if (numberOfCol==5) {
names(cnvs)=c("chr","start","end","copy number","status","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")
}
if (numberOfCol==7) {
names(cnvs)=c("chr","start","end","copy number","status","genotype","uncertainty","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")
}
if (numberOfCol==9) {
names(cnvs)=c("chr","start","end","copy number","status","genotype","uncertainty","somatic/germline","precentageOfGermline","WilcoxonRankSumTestPvalue","KolmogorovSmirnovPvalue")
}
write.table(cnvs, file=opt$output, sep="\t",quote=F,row.names=F)
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2015 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/