Commit 51da5c65 authored by bow

Merge branch 'develop' into feature-Chipcap

Conflicts:
	public/biopet-public-package/pom.xml
	public/biopet-public-package/src/main/scala/nl/lumc/sasc/biopet/core/BiopetExecutablePublic.scala
parents 3e4e6bad c874be08
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="EntryPointsManager">
<entry_points version="2.0" />
</component>
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
......
#!/usr/bin/env python
#
# Biopet is built on top of GATK Queue for building bioinformatic
# pipelines. It is mainly intended to support the LUMC SHARK cluster, which runs
# SGE, but other HPC systems supported by GATK Queue (such as PBS) should also
# be able to execute Biopet tools and pipelines.
#
# Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
#
# Contact us at: sasc@lumc.nl
#
# A dual licensing mode is applied. The source code within this project that is
# not part of GATK Queue is freely available for non-commercial use under an AGPL
# license. Commercial users, and users who do not want to follow the AGPL
# license, should contact us to obtain a separate license.
#
"""
(Re-)sync two filtered paired end FASTQ files.
Given two filtered paired end read files and one of the original read files,
re-sync the filtered reads by filtering out anything that is only present in
one of the two files.
Usage:
{command} <orig.fq> <reads_1.fq> <reads_2.fq> \\
<reads_1.synced.fq> <reads_2.synced.fq>
The synced reads are written to disk as <reads_1.synced.fq> and
<reads_2.synced.fq>. Afterwards some counts are printed.
Both Illumina old-style and new-style paired-end header lines are supported.
The original read file is used to speed up processing: it contains every read
that can occur in the filtered files, in the same order, so all files can be
processed line by line without reading any file entirely into memory. Some
ideas were taken from [1].
[1] https://gist.github.com/588841/
2011-11-03, Martijn Vermaat <m.vermaat.hg@lumc.nl>
"""
import sys
import re
def sync_paired_end_reads(original, reads_a, reads_b, synced_a, synced_b):
"""
Filter out reads from two paired end read files that are not present in
both of them. Do this in a reasonable amount of time by using a file
containing all of the reads for one of the paired ends.
All arguments are open file handles.
@arg original: File containing all original reads for one of the paired
ends.
    @arg reads_a: First of the paired end read files.
    @arg reads_b: Second of the paired end read files.
    @arg synced_a: Filtered reads from the first paired end read file.
    @arg synced_b: Filtered reads from the second paired end read file.
@return: Triple (filtered_a, filtered_b, kept) containing counts
of the number of reads filtered from both input files and
the total number of reads kept in the synced results.
@todo: Print warnings if obvious things are not right (a or b still has
lines after original is processed).
"""
    # This matches 1, 2, or 3 preceded by '/', '_', or whitespace. Its
    # rightmost match in a header line is used to identify the read pair.
    sep = re.compile(r'[\s_/][123]')
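    # For example (illustrative header lines, not taken from this commit):
    #   old-style:  '@HWUSI-EAS100R:6:73:941:1973#0/1'               -> '/1'
    #   new-style:  '@EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG' -> ' 1'
    # head() below keeps the pieces before the final separator match, so the
    # two mates of a pair compare equal regardless of their /1, /2 or
    # '1:...', '2:...' suffix.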
def next_record(fh):
return [fh.readline().strip() for i in range(4)]
def head(record):
return sep.split(record[0])[:-1]
headers = (sep.split(x.strip())[:-1] for i, x in enumerate(original)
if not (i % 4))
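    # Every FASTQ record is four lines, so i % 4 == 0 selects the header
    # lines; they are normalised with the same split used in head() above.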
filtered_a = filtered_b = kept = 0
a, b = next_record(reads_a), next_record(reads_b)
for header in headers:
if header == head(a) and head(b) != header:
a = next_record(reads_a)
filtered_a += 1
if header == head(b) and head(a) != header:
b = next_record(reads_b)
filtered_b += 1
if header == head(a) == head(b):
print >>synced_a, '\n'.join(a)
print >>synced_b, '\n'.join(b)
a, b = next_record(reads_a), next_record(reads_b)
kept += 1
return filtered_a, filtered_b, kept
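# Worked example (hypothetical counts): if earlier QC dropped 10 reads from
# reads_1.fq only and 8 other reads from reads_2.fq only, out of 1000 original
# pairs, then sync_paired_end_reads returns (8, 10, 982): the 8 reads whose
# mate is missing in file 2 are filtered from file 1, the 10 whose mate is
# missing in file 1 are filtered from file 2, and 982 pairs are kept.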
if __name__ == '__main__':
if len(sys.argv) < 6:
sys.stderr.write(__doc__.split('\n\n\n')[0].strip().format(
command=sys.argv[0]) + '\n')
sys.exit(1)
try:
original = open(sys.argv[1], 'r')
reads_a = open(sys.argv[2], 'r')
reads_b = open(sys.argv[3], 'r')
synced_a = open(sys.argv[4], 'w')
synced_b = open(sys.argv[5], 'w')
filtered_a, filtered_b, kept = \
sync_paired_end_reads(original, reads_a, reads_b,
synced_a, synced_b)
print 'Filtered %i reads from first read file.' % filtered_a
print 'Filtered %i reads from second read file.' % filtered_b
print 'Synced read files contain %i reads.' % kept
except IOError as (_, message):
sys.stderr.write('Error: %s\n' % message)
sys.exit(1)
......
@@ -28,7 +28,14 @@ trait PythonCommandLineFunction extends BiopetCommandLineFunction {
executable = config("exe", default = "python", submodule = "python")
protected var python_script_name: String = _
def setPythonScript(script: String) { setPythonScript(script, "") }
def setPythonScript(script: String) {
python_script = new File(script)
if (!python_script.exists()) {
setPythonScript(script, "")
} else {
python_script_name = script
}
}
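  // A sketch of the intended behaviour (paths hypothetical): an existing file
  // such as setPythonScript("/data/scripts/sync.py") is used in place, while a
  // bare resource name such as setPythonScript("sync.py") falls through to the
  // two-argument overload below, which points at the copy under .queue/tmp/.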
def setPythonScript(script: String, subpackage: String) {
python_script_name = script
python_script = new File(".queue/tmp/" + subpackage + python_script_name)
......
......
@@ -6,7 +6,11 @@ import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
/**
 * Created by pjvan_thof on 1/16/15.
 * Wrapper for the `bwa sampe` subcommand.
 *
 * Based on executable version 0.7.10-r789.
*
* @param root Configurable
*/
class BwaSampe(val root: Configurable) extends Bwa {
@Input(doc = "Fastq file R1", required = true)
......
@@ -39,11 +43,14 @@ class BwaSampe(val root: Configurable) extends Bwa {
var r: String = _
def cmdLine = required(executable) +
required("samse") +
required("sampe") +
optional("-a", a) +
optional("-o", o) +
optional("-n", n) +
optional("-N", N) +
optional("-c", c) +
optional("-f", output) +
optional("-r", r) +
optional("-c", c) +
conditional(P, "-P") +
conditional(s, "-s") +
conditional(A, "-A") +
......
/**
* Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support the LUMC SHARK cluster, which runs
 * SGE, but other HPC systems supported by GATK Queue (such as PBS) should also
 * be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
 * A dual licensing mode is applied. The source code within this project that is
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license. Commercial users, and users who do not want to follow the AGPL
 * license, should contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.conifer
import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction
import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction
abstract class Conifer extends PythonCommandLineFunction {
override def subPath = "conifer" :: super.subPath
// executable = config("exe", default = "conifer")
setPythonScript(config("script", default = "conifer"))
override val versionRegex = """(.*)""".r
override val versionExitcode = List(0)
override def versionCommand = executable + " " + python_script + " --version"
override val defaultVmem = "8G"
override val defaultThreads = 1
def cmdLine = getPythonCommand
}
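// For instance, with the defaults above, the version check rendered by
// versionCommand is roughly `python .queue/tmp/conifer --version` (script
// location as set up by setPythonScript; the exact path depends on packaging).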
/**
* Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support the LUMC SHARK cluster, which runs
 * SGE, but other HPC systems supported by GATK Queue (such as PBS) should also
 * be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
 * A dual licensing mode is applied. The source code within this project that is
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license. Commercial users, and users who do not want to follow the AGPL
 * license, should contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.conifer
import java.io.File
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.Ln
import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output }
class ConiferAnalyze(val root: Configurable) extends Conifer {
@Input(doc = "Probes / capture kit definition as bed file: chr,start,stop,gene-annot", required = true)
var probes: File = _
// @Input(doc = "Path to Conifer RPKM files", required = true)
var rpkmDir: File = _
@Output(doc = "Output analyse.hdf5", shortName = "out")
var output: File = _
@Argument(doc = "SVD, number of components to remove", minRecommendedValue = 2, maxRecommendedValue = 5,
minValue = 2, maxValue = 20, required = false)
var svd: Option[Int] = config("svd", default = 1)
@Argument(doc = "Minimum population median RPKM per probe", required = false)
var min_rpkm: Option[Double] = config("min_rpkm")
override def cmdLine = super.cmdLine +
" analyze " +
" --probes" + required(probes) +
" --rpkm_dir" + required(rpkmDir) +
" --output" + required(output) +
optional("--svd", svd) +
optional("--min_rpkm", min_rpkm)
}
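// Illustrative rendered command (file names hypothetical, quoting as done by
// Queue's required()):
//   python .queue/tmp/conifer analyze --probes 'probes.bed' --rpkm_dir 'rpkm/'
//     --output 'analysis.hdf5' --svd 1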
/**
* Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support the LUMC SHARK cluster, which runs
 * SGE, but other HPC systems supported by GATK Queue (such as PBS) should also
 * be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
 * A dual licensing mode is applied. The source code within this project that is
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license. Commercial users, and users who do not want to follow the AGPL
 * license, should contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.conifer
import java.io.File
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Argument, Input, Output }
class ConiferCall(val root: Configurable) extends Conifer {
@Input(doc = "Input analysis.hdf5", required = true)
var input: File = _
@Output(doc = "Output calls.txt", shortName = "out")
var output: File = _
override def cmdLine = super.cmdLine +
" call " +
" --input" + required(input) +
" --output" + required(output)
}
/**
* Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support the LUMC SHARK cluster, which runs
 * SGE, but other HPC systems supported by GATK Queue (such as PBS) should also
 * be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
 * A dual licensing mode is applied. The source code within this project that is
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license. Commercial users, and users who do not want to follow the AGPL
 * license, should contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.conifer
import java.io.File
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
class ConiferExport(val root: Configurable) extends Conifer {
@Input(doc = "Input analysis.hdf5", required = true)
var input: File = _
@Output(doc = "Output <sample>.svdzrpkm.bed", shortName = "out", required = true)
var output: File = _
override def afterGraph {
this.checkExecutable
}
override def cmdLine = super.cmdLine +
" export " +
" --input" + required(input) +
" --output" + required(output)
}
/**
* Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support the LUMC SHARK cluster, which runs
 * SGE, but other HPC systems supported by GATK Queue (such as PBS) should also
 * be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
 * A dual licensing mode is applied. The source code within this project that is
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license. Commercial users, and users who do not want to follow the AGPL
 * license, should contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.extensions.conifer
import java.io.File
import nl.lumc.sasc.biopet.core.config.Configurable
import org.broadinstitute.gatk.utils.commandline.{ Output, Input }
class ConiferRPKM(val root: Configurable) extends Conifer {
@Input(doc = "Bam file", required = true)
var bamFile: File = _
@Input(doc = "Probes / capture kit definition as bed file: chr,start,stop,gene-annot", required = true)
var probes: File = _
  /** The RPKM output should be written to a directory that contains all the RPKM files from previous experiments. */
@Output(doc = "Output RPKM.txt", shortName = "out")
var output: File = _
override def cmdLine = super.cmdLine +
" rpkm " +
" --probes" + required(probes) +
" --input" + required(bamFile) +
" --output" + required(output)
}
/**
* Biopet is built on top of GATK Queue for building bioinformatic
 * pipelines. It is mainly intended to support the LUMC SHARK cluster, which runs
 * SGE, but other HPC systems supported by GATK Queue (such as PBS) should also
 * be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
 * A dual licensing mode is applied. The source code within this project that is
 * not part of GATK Queue is freely available for non-commercial use under an AGPL
 * license. Commercial users, and users who do not want to follow the AGPL
 * license, should contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.scripts
import java.io.File
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import argonaut._, Argonaut._
import scalaz._, Scalaz._
import nl.lumc.sasc.biopet.core.config.Configurable
import nl.lumc.sasc.biopet.extensions.PythonCommandLineFunction
import scala.io.Source
class FastqSync(val root: Configurable) extends PythonCommandLineFunction {
setPythonScript("sync_paired_end_reads.py")
@Input(doc = "Start fastq")
var input_start_fastq: File = _
@Input(doc = "R1 input")
var input_R1: File = _
@Input(doc = "R2 input")
var input_R2: File = _
@Output(doc = "R1 output")
var output_R1: File = _
@Output(doc = "R2 output")
var output_R2: File = _
  // No @Output annotation, so this stats file is not tracked as a Queue output
var output_stats: File = _
def cmdLine = {
getPythonCommand +
required(input_start_fastq) +
required(input_R1) +
required(input_R2) +
required(output_R1) +
required(output_R2) +
" > " +
required(output_stats)
}
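  // Illustrative rendered command (file names hypothetical):
  //   python sync_paired_end_reads.py 'orig.fq' 'r1.fq' 'r2.fq' \
  //     'r1.synced.fq' 'r2.synced.fq' > 'sync.stats'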
def getSummary: Json = {
val R1_filteredR = """Filtered (\d*) reads from first read file.""".r
val R2_filteredR = """Filtered (\d*) reads from second read file.""".r
val readsLeftR = """Synced read files contain (\d*) reads.""".r
var R1_filtered = 0
var R2_filtered = 0
var readsLeft = 0
if (output_stats.exists) for (line <- Source.fromFile(output_stats).getLines) {
line match {
case R1_filteredR(m) => R1_filtered = m.toInt
case R2_filteredR(m) => R2_filtered = m.toInt
case readsLeftR(m) => readsLeft = m.toInt
case _ =>
}
}
return ("num_reads_discarded_R1" := R1_filtered) ->:
("num_reads_discarded_R2" := R2_filtered) ->:
("num_reads_kept" := readsLeft) ->:
jEmptyObject
}
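  // getSummary then yields e.g. (counts hypothetical):
  //   {"num_reads_discarded_R1": 8, "num_reads_discarded_R2": 10, "num_reads_kept": 982}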
}
object FastqSync {
def apply(root: Configurable, input_start_fastq: File, input_R1: File, input_R2: File,
output_R1: File, output_R2: File, output_stats: File): FastqSync = {
val fastqSync = new FastqSync(root)
fastqSync.input_start_fastq = input_start_fastq
fastqSync.input_R1 = input_R1
fastqSync.input_R2 = input_R2
fastqSync.output_R1 = output_R1
fastqSync.output_R2 = output_R2
fastqSync.output_stats = output_stats
return fastqSync
}
def mergeSummaries(jsons: List[Json]): Json = {
var R1_filtered = 0
var R2_filtered = 0
var readsLeft = 0
for (json <- jsons) {
R1_filtered += json.field("num_reads_discarded_R1").get.numberOrZero.toInt
R2_filtered += json.field("num_reads_discarded_R2").get.numberOrZero.toInt
readsLeft += json.field("num_reads_kept").get.numberOrZero.toInt
}
return ("num_reads_discarded_R1" := R1_filtered) ->:
("num_reads_discarded_R2" := R2_filtered) ->:
("num_reads_kept" := readsLeft) ->:
jEmptyObject
}
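  // e.g. merging two chunk summaries with "num_reads_kept" of 500 and 482
  // yields "num_reads_kept": 982, with the two discard counters summed the
  // same way (values hypothetical).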
}
\ No newline at end of file
/**
* Copyright (c) 2014 Leiden University Medical Center - Sequencing Analysis Support Core <sasc@lumc.nl>
* @author Wibowo Arindrarto <w.arindrarto@lumc.nl>
*
 * This tool is a port of a Python implementation written by Martijn Vermaat [1].
*
* [1] https://github.com/martijnvermaat/bio-playground/blob/master/sync-paired-end-reads/sync_paired_end_reads.py
*/
package nl.lumc.sasc.biopet.tools
import java.io.File
import scala.io.Source
import scala.util.matching.Regex
import scala.annotation.tailrec
import scala.collection.JavaConverters._
import argonaut._, Argonaut._
import scalaz._, Scalaz._
import htsjdk.samtools.fastq.{ AsyncFastqWriter, BasicFastqWriter, FastqReader, FastqRecord }
import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import nl.lumc.sasc.biopet.core.BiopetJavaCommandLineFunction
import nl.lumc.sasc.biopet.core.ToolCommand
import nl.lumc.sasc.biopet.core.config.Configurable
/**
* FastqSync function class for usage in Biopet pipelines
*
* @param root Configuration object for the pipeline
*/
class FastqSync(val root: Configurable) extends BiopetJavaCommandLineFunction {
javaMainClass = getClass.getName
@Input(doc = "Original FASTQ file (read 1 or 2)", shortName = "r", required = true)
var refFastq: File = _
@Input(doc = "Input read 1 FASTQ file", shortName = "i", required = true)
var inputFastq1: File = _
@Input(doc = "Input read 2 FASTQ file", shortName = "j", required = true)
var inputFastq2: File = _
@Output(doc = "Output read 1 FASTQ file", shortName = "o", required = true)
var outputFastq1: File = _
@Output(doc = "Output read 2 FASTQ file", shortName = "p", required = true)
var outputFastq2: File = _
@Output(doc = "Sync statistics", required = true)
var outputStats: File = _
// executed command line
override def commandLine =
super.commandLine +
required("-r", refFastq) +
required("-i", inputFastq1) +
required("-j", inputFastq2) +
required("-o", outputFastq1) +
required("-p", outputFastq2) + " > " +
required(outputStats)
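// Illustrative rendered command (memory flags omitted, jar and file names
// hypothetical):
//   java -cp Biopet.jar nl.lumc.sasc.biopet.tools.FastqSync -r 'orig.fq' \
//     -i 'r1.fq' -j 'r2.fq' -o 'r1.synced.fq' -p 'r2.synced.fq' > 'sync.stats'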