diff --git a/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/function/Seqtk.scala b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/function/Seqtk.scala new file mode 100644 index 0000000000000000000000000000000000000000..286b7235b4e3ab8bccb0e3fbb19f752ef9cfc93f --- /dev/null +++ b/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/function/Seqtk.scala @@ -0,0 +1,121 @@ +/** + * Copyright (c) 2014 Leiden University Medical Center + * + * @author Wibowo Arindrarto + */ + +package nl.lumc.sasc.biopet.function + +import java.io.File +import org.broadinstitute.gatk.utils.commandline.{ Input, Output } +import nl.lumc.sasc.biopet.core.BiopetCommandLineFunction +import nl.lumc.sasc.biopet.core.config.Configurable + +/** + * Abstract class for all seqtk wrappers. + */ +abstract class Seqtk extends BiopetCommandLineFunction { + executable = config("exe", default = "seqtk", submodule = "seqtk") + override def versionCommand = executable + override val versionRegex = """Version: (.*)""".r +} + +/** + * Wrapper for the seqtk seq subcommand. + * Written based on seqtk version 1.0-r63-dirty. 
+ */ +class SeqtkSeq(val root: Configurable) extends Seqtk { + + /** input file */ + @Input(doc = "Input file (FASTQ or FASTA)") + var input: File = _ + + /** output file */ + @Output(doc = "Output file") + var output: File = _ + + /** mask bases with quality lower than INT [0] */ + var q: Option[Int] = config("q") + + /** masked bases converted to CHAR; 0 for lowercase [0] */ + var n: String = config("n") + + /** number of residues per line; 0 for 2^32-1 [0] */ + var l: Option[Int] = config("l") + + /** quality shift: ASCII-INT gives base quality [33] */ + var Q: Option[Int] = config("Q") + + /** random seed (effective with -f) [11] */ + var s: Option[Int] = config("s") + + /** sample FLOAT fraction of sequences [1] */ + var f: Option[Int] = config("f") + + /** mask regions in BED or name list FILE [null] */ + var M: File = config("M") + + /** drop sequences with length shorter than INT [0] */ + var L: Option[Int] = config("L") + + /** mask complement region (effective with -M) */ + var c: Boolean = config("c") + + /** reverse complement */ + var r: Boolean = config("r") + + /** force FASTA output (discard quality) */ + var A: Boolean = config("A") + + /** drop comments at the header lines */ + var C: Boolean = config("C") + + /** drop sequences containing ambiguous bases */ + var N: Boolean = config("N") + + /** output the 2n-1 reads only */ + var flag1: Boolean = config("1") + + /** output the 2n reads only */ + var flag2: Boolean = config("2") + + /** shift quality by '(-Q) - 33' */ + var V: Boolean = config("V") + + def cmdLine = { + required(executable) + + " seq " + + optional("-q", q) + + optional("-n", n) + + optional("-l", l) + + optional("-Q", Q) + + optional("-s", s) + + optional("-f", f) + + optional("-M", M) + + optional("-L", L) + + conditional(c, "-c") + + conditional(r, "-r") + + conditional(A, "-A") + + conditional(C, "-C") + + conditional(N, "-N") + + conditional(flag1, "-1") + + conditional(flag2, "-2") + + conditional(V, "-V") + + 
required(input) + + " > " + required(output) + } + + /** + * Calculates the offset required for the -Q flag for format conversion (-V flag set). + * This is required since seqtk computes the encoding offset indirectly from the input + * and output offsets. + * + * @param inQualOffset ASCII offset of the input file encoding + * @param outQualOffset ASCII offset of the output file encoding + * @return the value to be used with the -Q flag with -V set + */ + def calcQForV(inQualOffset: Int, outQualOffset: Int): Int = { + // For the input for the -Q flag for seqtk, together with -V + inQualOffset - (outQualOffset - 33) + } +} diff --git a/extras/git.pre-commit b/extras/git.pre-commit new file mode 100755 index 0000000000000000000000000000000000000000..5a627198950f191e4c2c64fe44d23cf170cfcf50 --- /dev/null +++ b/extras/git.pre-commit @@ -0,0 +1,139 @@ +#!/usr/bin/env python + +# Adapted from: http://tech.yipit.com/2011/11/16/183772396/ +# Changes by Wibowo Arindrarto +# Changes: +# - Allow code modification by linters to be committed +# - Updated CHECKS +# - Python 3 calls + code style updates +# +# Usage: save this file into your .git/hooks directory as `pre-commit` +# and set it to executable + +import os +import re +import subprocess +import sys + +modified = re.compile(r"^[MA]\s+(?P<name>.*)$") + +CHECKS = [ + { + "exe": "scalariform", + "output": "Formatting code with scalariform ...", + # Remove lines without filenames + "command": "scalariform -s=2.11.1 -p=scalariformStyle.properties --quiet %s", + "match_files": [".*scala$"], + "print_filename": False, + "commit_changes": True, + }, +] + + +def matches_file(file_name, match_files): + return any(re.compile(match_file).match(file_name) for match_file + in match_files) + + +def check_files(files, check): + result = 0 + print(check["output"]) + for file_name in files: + + if not "match_files" in check or \ + matches_file(file_name, check["match_files"]): + + if not "ignore_files" in check or \ + not 
matches_file(file_name, check["ignore_files"]): + + process = subprocess.Popen(check["command"] % file_name, + stdout=subprocess.PIPE, stderr=subprocess.PIPE, + shell=True, universal_newlines=True) # text mode: out/err are str on Python 2 and 3 + + out, err = process.communicate() + if out or err: + if check["print_filename"]: + prefix = "\t%s:" % file_name + else: + prefix = "\t" + output_lines = ["%s%s" % (prefix, line) for + line in out.splitlines()] + print("\n".join(output_lines)) + if err: + print(err) + result = 1 + elif check["commit_changes"]: + p = subprocess.Popen(["git", "add", file_name], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p.communicate() + return result + + +def main(all_files): + # Check that the required linters and code checkers are all present + for check in CHECKS: + p = subprocess.Popen(["which", check["exe"]], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + out, err = p.communicate() + if not out: + print("Required commit hook executable '%s' not found." % check["exe"]) + sys.exit(1) + + # Stash any changes to the working tree that are not going to be committed + subprocess.call(["git", "stash", "-u", "--keep-index"], stdout=subprocess.PIPE) + + files = [] + if all_files: + for root, dirs, file_names in os.walk("."): + for file_name in file_names: + files.append(os.path.join(root, file_name)) + else: + p = subprocess.Popen(["git", "status", "--porcelain"], + stdout=subprocess.PIPE, universal_newlines=True) # str lines, so the str regex `modified` can match + out, err = p.communicate() + for line in out.splitlines(): + match = modified.match(line) + if match: + files.append(match.group("name")) + + result = 0 + for check in CHECKS: + result = check_files(files, check) or result + + # Strategy: + # - Check if the linters made any changes + # - If there are no changes, pop the stash and commit + # - Otherwise: + # - Stash the change + # - Pop stash@{1} + # - Checkout stash@{0} + # - Drop stash@{0} (cannot pop directly since stash may conflict) + # - Commit + # This is because the initial stash will conflict with any possible + # changes made by the linters + p = 
subprocess.Popen(["git", "status", "--porcelain"], + stdout=subprocess.PIPE) + out, err = p.communicate() + if not out.strip(): + subprocess.call(["git", "stash", "pop"], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + else: + subprocess.call(["git", "stash"], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + subprocess.call(["git", "stash", "pop", "--quiet", "--index", "stash@{1}"], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + subprocess.call(["git", "checkout", "stash", "--", "."], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + subprocess.call(["git", "stash", "drop"], + stdout=subprocess.PIPE, stderr=subprocess.PIPE) + sys.exit(result) + + +if __name__ == "__main__": + + all_files = False + + if len(sys.argv) > 1 and sys.argv[1] == "--all-files": + all_files = True + + main(all_files) diff --git a/flexiprep/src/main/scala/nl/lumc/sasc/biopet/function/fastq/Cutadapt.scala b/flexiprep/src/main/scala/nl/lumc/sasc/biopet/function/fastq/Cutadapt.scala index 17df80bf4cede5cc5c530a5ba83aa6dbcc668efd..2760ead8853b1cdd4c61a1906a13062bdb363045 100644 --- a/flexiprep/src/main/scala/nl/lumc/sasc/biopet/function/fastq/Cutadapt.scala +++ b/flexiprep/src/main/scala/nl/lumc/sasc/biopet/function/fastq/Cutadapt.scala @@ -19,6 +19,9 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction { @Output(doc = "Output fastq file") var fastq_output: File = _ + @Output(doc = "Output statistics file") + var stats_output: File = _ + executable = config("exe", default = "cutadapt") override def versionCommand = executable + " --version" override val versionRegex = """(.*)""".r @@ -49,7 +52,8 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction { optional("-M", opt_maximum_length) + // input / output required(fastq_input) + - " > " + required(fastq_output) + required("--output", fastq_output) + + " > " + required(stats_output) } else { analysisName = getClass.getSimpleName + "-ln" "ln -sf " + diff --git 
a/flexiprep/src/main/scala/nl/lumc/sasc/biopet/function/fastq/Sickle.scala b/flexiprep/src/main/scala/nl/lumc/sasc/biopet/function/fastq/Sickle.scala index d31ce0ffbad4d582b1d37b663acc640aad187971..e29f8a5997cfe333f7bb9691613837ccf86aa564 100644 --- a/flexiprep/src/main/scala/nl/lumc/sasc/biopet/function/fastq/Sickle.scala +++ b/flexiprep/src/main/scala/nl/lumc/sasc/biopet/function/fastq/Sickle.scala @@ -11,25 +11,25 @@ import nl.lumc.sasc.biopet.core.config._ class Sickle(val root: Configurable) extends BiopetCommandLineFunction { @Input(doc = "R1 input") - var input_R1: File = null + var input_R1: File = _ @Input(doc = "R2 input", required = false) - var input_R2: File = null + var input_R2: File = _ @Input(doc = "qualityType file", required = false) var qualityTypeFile: File = _ @Output(doc = "R1 output") - var output_R1: File = null + var output_R1: File = _ @Output(doc = "R2 output", required = false) - var output_R2: File = null + var output_R2: File = _ @Output(doc = "singles output", required = false) - var output_singles: File = null + var output_singles: File = _ @Output(doc = "stats output") - var output_stats: File = null + var output_stats: File = _ executable = config("exe", default = "sickle") var qualityType: String = config("qualitytype") diff --git a/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala b/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala index 00eb4cba9f707b308c25504e1578c02911cf4b7f..ecc911a7720341d2a8e9d4957adceb0875f72238 100644 --- a/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala +++ b/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Flexiprep.scala @@ -128,13 +128,19 @@ class Flexiprep(val root: Configurable) extends QScript with BiopetQScript { var R2: File = new File(R2_in) if (!skipClip) { // Adapter clipping + val cutadapt_R1 = new Cutadapt(this) + if (!skipTrim || paired) cutadapt_R1.isIntermediate = true 
cutadapt_R1.fastq_input = R1 cutadapt_R1.fastq_output = swapExt(outDir, R1, R1_ext, ".clip" + R1_ext) + cutadapt_R1.stats_output = swapExt(outDir, R1, R1_ext, ".clip.stats") + if (outputFiles.contains("contams_R1")) cutadapt_R1.contams_file = outputFiles("contams_R1") + add(cutadapt_R1) R1 = cutadapt_R1.fastq_output + if (paired) { val cutadapt_R2 = new Cutadapt(this) if (!skipTrim || paired) cutadapt_R2.isIntermediate = true diff --git a/scalariformStyle.properties b/scalariformStyle.properties index 7e5f561df9c86dfbe0cc7608f49c6248027e0ba3..eb4298d0ee410be3bffedcc178bd4ecba1c94234 100644 --- a/scalariformStyle.properties +++ b/scalariformStyle.properties @@ -9,6 +9,7 @@ formatXml=true indentLocalDefs=false indentPackageBlocks=true indentSpaces=2 +placeScaladocAsterisksBeneathSecondAsterisk=false preserveDanglingCloseParenthesis=false preserveSpaceBeforeArguments=false rewriteArrowSymbols=false