diff --git a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Fastqc.scala b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Fastqc.scala index 6c36a4f0f4bb5bb24dc4fca1fbca78ad35936356..fed5e514be34618f41d2e8348eba6d15e67fab29 100644 --- a/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Fastqc.scala +++ b/public/biopet-framework/src/main/scala/nl/lumc/sasc/biopet/extensions/Fastqc.scala @@ -25,16 +25,16 @@ import nl.lumc.sasc.biopet.core.config.Configurable class Fastqc(val root: Configurable) extends BiopetCommandLineFunction { @Input(doc = "Contaminants", required = false) - var contaminants: File = _ + var contaminants: Option[File] = None @Input(doc = "Adapters", required = false) - var adapters: File = _ + var adapters: Option[File] = None @Input(doc = "Fastq file", shortName = "FQ") - var fastqfile: File = _ + var fastqfile: File = null @Output(doc = "Output", shortName = "out") - var output: File = _ + var output: File = null executable = config("exe", default = "fastqc") var java_exe: String = config("exe", default = "java", submodule = "java", freeVar = false) @@ -50,17 +50,31 @@ class Fastqc(val root: Configurable) extends BiopetCommandLineFunction { override def afterGraph { this.checkExecutable - if (contaminants == null) { - val fastqcDir = executable.substring(0, executable.lastIndexOf("/")) - val defaultContams = getVersion match { - case "v0.11.2" => new File(fastqcDir + "/Configuration/contaminant_list.txt") - case _ => new File(fastqcDir + "/Contaminants/contaminant_list.txt") - } - val defaultAdapters = getVersion match { - case "v0.11.2" => new File(fastqcDir + "/Configuration/adapter_list.txt") - case _ => null - } - contaminants = config("contaminants", default = defaultContams) + + val fastqcDir = executable.substring(0, executable.lastIndexOf("/")) + + contaminants = contaminants match { + // user-defined contaminants file take precedence + case userDefinedValue @ Some(_) => 
userDefinedValue + // otherwise, use default contaminants file (depending on FastQC version) + case None => + val defaultContams = getVersion match { + case "v0.11.2" => new File(fastqcDir + "/Configuration/contaminant_list.txt") + case _ => new File(fastqcDir + "/Contaminants/contaminant_list.txt") + } + config("contaminants", default = defaultContams) + } + + adapters = adapters match { + // user-defined adapters file takes precedence + case userDefinedValue @ Some(_) => userDefinedValue + // otherwise, check if adapters are already present (depending on FastQC version) + case None => + val defaultAdapters = getVersion match { + case "v0.11.2" => Option(new File(fastqcDir + "/Configuration/adapter_list.txt")) + case _ => None + } + defaultAdapters.collect { case adp => config("adapters", default = adp) } } } @@ -74,6 +88,6 @@ class Fastqc(val root: Configurable) extends BiopetCommandLineFunction { conditional(noextract, "--noextract") + conditional(extract, "--extract") + conditional(quiet, "--quiet") + - required("-o", output.getParent()) + + required("-o", output.getParent) + required(fastqfile) } diff --git a/public/flexiprep/pom.xml b/public/flexiprep/pom.xml index e9b58ab28a615ac8ce4e76063285b125f2b66b1b..86666db29645cd35f9d8f71c6b1aa775d0f22444 100644 --- a/public/flexiprep/pom.xml +++ b/public/flexiprep/pom.xml @@ -39,5 +39,17 @@ <artifactId>BiopetFramework</artifactId> <version>${project.version}</version> </dependency> + <dependency> + <groupId>org.testng</groupId> + <artifactId>testng</artifactId> + <version>6.8</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.scalatest</groupId> + <artifactId>scalatest_2.11</artifactId> + <version>2.2.1</version> + <scope>test</scope> + </dependency> </dependencies> </project> diff --git a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala index 
1bd84bb36e21c8e577adaf5e9ad33d02b1db47fa..9aaca5f66336e38b16b215a9c175781fafc97fe0 100644 --- a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala +++ b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala @@ -33,14 +33,14 @@ class Cutadapt(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Cutada override def beforeCmd() { super.beforeCmd - val foundAdapters = fastqc.getFoundAdapters.map(_.seq) + val foundAdapters = fastqc.foundAdapters.map(_.seq) if (default_clip_mode == "3") opt_adapter ++= foundAdapters else if (default_clip_mode == "5") opt_front ++= foundAdapters else if (default_clip_mode == "both") opt_anywhere ++= foundAdapters } override def cmdLine = { - if (!opt_adapter.isEmpty || !opt_anywhere.isEmpty || !opt_front.isEmpty) { + if (opt_adapter.nonEmpty || opt_anywhere.nonEmpty || opt_front.nonEmpty) { analysisName = getClass.getSimpleName super.cmdLine } else { diff --git a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Fastqc.scala b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Fastqc.scala index 4ee01c2605d5449dac33b19ac9c2ab360b383d45..50ee00b2c5625226a00e0985bbe761ded375237c 100644 --- a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Fastqc.scala +++ b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Fastqc.scala @@ -16,82 +16,154 @@ package nl.lumc.sasc.biopet.pipelines.flexiprep -import java.io.File -import nl.lumc.sasc.biopet.core.config.Configurable +import java.io.{ File, FileNotFoundException } + import scala.io.Source import argonaut._, Argonaut._ import scalaz._, Scalaz._ +import nl.lumc.sasc.biopet.core.config.Configurable +import nl.lumc.sasc.biopet.utils.ConfigUtils + +/** + * FastQC wrapper with added functionality for the Flexiprep pipeline + * + * This wrapper implements additional methods for parsing FastQC output files and aggregating everything in a summary 
+ * object. The current implementation is based on FastQC v0.10.1. + */ class Fastqc(root: Configurable) extends nl.lumc.sasc.biopet.extensions.Fastqc(root) { - def getDataBlock(name: String): Array[String] = { // Based on Fastqc v0.10.1 - val outputDir = output.getAbsolutePath.stripSuffix(".zip") - val dataFile = new File(outputDir + "/fastqc_data.txt") - if (!dataFile.exists) return null - val data = Source.fromFile(dataFile).mkString - for (block <- data.split(">>END_MODULE\n")) { - val b = if (block.startsWith("##FastQC")) block.substring(block.indexOf("\n") + 1) else block - if (b.startsWith(">>" + name)) - return for (line <- b.split("\n")) - yield line - } - return null - } - def getEncoding: String = { - val block = getDataBlock("Basic Statistics") - if (block == null) return null - for ( - line <- block if (line.startsWith("Encoding")) - ) return line.stripPrefix("Encoding\t") - return null // Could be default Sanger with a warning in the log + /** Class for storing a single FastQC module result */ + protected case class FastQCModule(name: String, status: String, lines: Seq[String]) + + /** Default FastQC output directory containing actual results */ + // this is a def instead of a val since the value depends on the variable `output`, which is null on class creation + def outputDir: File = new File(output.getAbsolutePath.stripSuffix(".zip")) + + /** Default FastQC output data file */ + // this is a def instead of a val since the value depends on the variable `output`, which is null on class creation + def dataFile: File = new File(outputDir, "fastqc_data.txt") + + /** + * FastQC QC modules. + * + * @return Mapping of FastQC module names to their [[FastQCModule]] results (name, status, and content lines) + * @throws FileNotFoundException if the FastQC data file cannot be found. + * @throws IllegalStateException if the parsed module mapping is empty. 
+ */ + @throws(classOf[FileNotFoundException]) + @throws(classOf[IllegalStateException]) + def qcModules: Map[String, FastQCModule] = { + + val fqModules = Source.fromFile(dataFile) + // drop all the characters before the first module delimiter (i.e. '>>') + .dropWhile(_ != '>') + // pull everything into a string + .mkString + // split into modules + .split(">>END_MODULE\n") + // make map of module name -> module lines + .map { + case (modString) => + // module name is in the first line, without '>>' and before the tab character + val Array(firstLine, otherLines) = modString + // drop all '>>' character (start of module) + .dropWhile(_ == '>') + // split first line and others + .split("\n", 2) + // and slice them + .slice(0, 2) + // extract module name and module status + val Array(modName, modStatus) = firstLine + .split("\t", 2) + .slice(0, 2) + modName -> FastQCModule(modName, modStatus, otherLines.split("\n").toSeq) + } + .toMap + + if (fqModules.isEmpty) throw new IllegalStateException("Empty FastQC data file " + dataFile.toString) + else fqModules } - protected case class Sequence(name: String, seq: String) - def getFoundAdapters: List[Sequence] = { - def getSeqs(file: File) = { - if (file != null) { - (for ( - line <- Source.fromFile(file).getLines(); if line.startsWith("#"); - values = line.split("\t*") if values.size >= 2 - ) yield Sequence(values(0), values(1))).toList - } else Nil - } + /** + * Retrieves the FASTQ file encoding as computed by FastQC. + * + * @return encoding name + * @throws NoSuchElementException when the "Basic Statistics" key does not exist in the mapping or + * when a line starting with "Encoding" does not exist. 
+ */ + @throws(classOf[NoSuchElementException]) + def encoding: String = + qcModules("Basic Statistics") + .lines + .dropWhile(!_.startsWith("Encoding")) + .head + .stripPrefix("Encoding\t") + .stripSuffix("\t") + + /** Case class representing a known adapter sequence */ + protected case class AdapterSequence(name: String, seq: String) - val seqs = getSeqs(adapters) ::: getSeqs(contaminants) + /** + * Retrieves known adapter/contaminant sequences matching the overrepresented sequences reported by FastQC. + * + * @return a [[Set]] of [[AdapterSequence]] objects. + */ + def foundAdapters: Set[AdapterSequence] = { - val block = getDataBlock("Overrepresented sequences") - if (block == null) return Nil + /** Returns the set of adapter and/or contaminant sequences listed in the given FastQC file */ + def getFastqcSeqs(file: Option[File]): Set[AdapterSequence] = file match { + case None => Set.empty[AdapterSequence] + case Some(f) => + (for { + line <- Source.fromFile(f).getLines() + if !line.startsWith("#") + values = line.split("\t+") + if values.size >= 2 + } yield AdapterSequence(values(0), values(1))).toSet + } - val found = for ( - line <- block if !line.startsWith("#"); - values = line.split("\t") if values.size >= 4 - ) yield values(3) + val found = qcModules.get("Overrepresented sequences") match { + case None => Seq.empty[String] + case Some(qcModule) => + for ( + line <- qcModule.lines if !(line.startsWith("#") || line.startsWith(">")); + values = line.split("\t") if values.size >= 4 + ) yield values(3) + } - seqs.filter(x => found.exists(_.startsWith(x.name))) + // select full sequences from known adapters and contaminants + // based on overrepresented sequences results + (getFastqcSeqs(adapters) ++ getFastqcSeqs(contaminants)) + .filter(x => found.exists(_.startsWith(x.name))) } - def getSummary: Json = { - val subfixs = Map("plot_duplication_levels" -> "Images/duplication_levels.png", - "plot_kmer_profiles" -> "Images/kmer_profiles.png", - "plot_per_base_gc_content" -> "Images/per_base_gc_content.png", - 
"plot_per_base_n_content" -> "Images/per_base_n_content.png", - "plot_per_base_quality" -> "Images/per_base_quality.png", - "plot_per_base_sequence_content" -> "Images/per_base_sequence_content.png", - "plot_per_sequence_gc_content" -> "Images/per_sequence_gc_content.png", - "plot_per_sequence_quality" -> "Images/per_sequence_quality.png", - "plot_sequence_length_distribution" -> "Images/sequence_length_distribution.png", - "fastqc_data" -> "fastqc_data.txt") - val dir = output.getAbsolutePath.stripSuffix(".zip") + "/" - var outputMap: Map[String, Map[String, String]] = Map() - for ((k, v) <- subfixs) outputMap += (k -> Map("path" -> (dir + v))) - - val temp = ("" := outputMap) ->: jEmptyObject - return temp.fieldOrEmptyObject("") + /** Summary of the FastQC run, stored in a [[Json]] object */ + def summary: Json = { + + val outputMap = + Map("plot_duplication_levels" -> "Images/duplication_levels.png", + "plot_kmer_profiles" -> "Images/kmer_profiles.png", + "plot_per_base_gc_content" -> "Images/per_base_gc_content.png", + "plot_per_base_n_content" -> "Images/per_base_n_content.png", + "plot_per_base_quality" -> "Images/per_base_quality.png", + "plot_per_base_sequence_content" -> "Images/per_base_sequence_content.png", + "plot_per_sequence_gc_content" -> "Images/per_sequence_gc_content.png", + "plot_per_sequence_quality" -> "Images/per_sequence_quality.png", + "plot_sequence_length_distribution" -> "Images/sequence_length_distribution.png", + "fastqc_data" -> "fastqc_data.txt") + .map { + case (name, relPath) => + name -> Map("path" -> (outputDir + File.separator + relPath)) + } + + ConfigUtils.mapToJson(outputMap) } } object Fastqc { + def apply(root: Configurable, fastqfile: File, outDir: String): Fastqc = { val fastqcCommand = new Fastqc(root) fastqcCommand.fastqfile = fastqfile @@ -102,6 +174,6 @@ object Fastqc { //if (filename.endsWith(".fq")) filename = filename.substring(0,filename.size - 3) fastqcCommand.output = new File(outDir + "/" + filename + 
"_fastqc.zip") fastqcCommand.afterGraph - return fastqcCommand + fastqcCommand } } diff --git a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepSummary.scala b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepSummary.scala index 353fb1acb83c75fe121d064c1ab6b0149f87565d..4ff18fb7cc90d7c3a90255c3c9a49ffb0a191eda 100644 --- a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepSummary.scala +++ b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FlexiprepSummary.scala @@ -201,7 +201,7 @@ class FlexiprepSummary(val root: Configurable) extends InProcessFunction with Co def fastqcSummary(fastqc: Fastqc): Option[Json] = { if (fastqc == null) return None - else return Option(fastqc.getSummary) + else return Option(fastqc.summary) } def clipstatSummary(): Option[Json] = { diff --git a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/SeqtkSeq.scala b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/SeqtkSeq.scala index f6d6ac9d7c2727445723bd26c61d77398d081e95..0fdeee289de9672d917264e3e9dc2d556f6bc48b 100644 --- a/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/SeqtkSeq.scala +++ b/public/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/SeqtkSeq.scala @@ -25,7 +25,7 @@ class SeqtkSeq(root: Configurable) extends nl.lumc.sasc.biopet.extensions.seqtk. 
override def beforeCmd { super.beforeCmd if (fastqc != null && Q == None) { - val encoding = fastqc.getEncoding + val encoding = fastqc.encoding Q = encoding match { case null => None case s if (s.contains("Sanger / Illumina 1.9")) => None diff --git a/public/flexiprep/src/test/resources/fqc_contaminants_v0101.txt b/public/flexiprep/src/test/resources/fqc_contaminants_v0101.txt new file mode 100644 index 0000000000000000000000000000000000000000..13c6a999940201a402c3a7f9dd931ab9102de360 --- /dev/null +++ b/public/flexiprep/src/test/resources/fqc_contaminants_v0101.txt @@ -0,0 +1,170 @@ +# This file contains a list of potential contaminants which are +# frequently found in high throughput sequencing reactions. These +# are mostly sequences of adapters / primers used in the various +# sequencing chemistries. +# +# Please DO NOT rely on these sequences to design your own oligos, some +# of them are truncated at ambiguous positions, and none of them are +# definitive sequences from the manufacturers so don't blame us if you +# try to use them and they don't work. +# +# You can add more sequences to the file by putting one line per entry +# and specifying a name[tab]sequence. If the contaminant you add is +# likely to be of use to others please consider sending it to the FastQ +# authors, either via a bug report at www.bioinformatics.bbsrc.ac.uk/bugzilla/ +# or by directly emailing simon.andrews@bbsrc.ac.uk so other users of +# the program can benefit. 
+ +Illumina Single End Adapter 1 GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG +Illumina Single End Adapter 2 CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT +Illumina Single End PCR Primer 1 AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Single End PCR Primer 2 CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT +Illumina Single End Sequencing Primer ACACTCTTTCCCTACACGACGCTCTTCCGATCT + +Illumina Paired End Adapter 1 ACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Paired End Adapter 2 GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG +Illumina Paried End PCR Primer 1 AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Paired End PCR Primer 2 CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT +Illumina Paried End Sequencing Primer 1 ACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Paired End Sequencing Primer 2 CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT + +Illumina DpnII expression Adapter 1 ACAGGTTCAGAGTTCTACAGTCCGAC +Illumina DpnII expression Adapter 2 CAAGCAGAAGACGGCATACGA +Illumina DpnII expression PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina DpnII expression PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina DpnII expression Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC + +Illumina NlaIII expression Adapter 1 ACAGGTTCAGAGTTCTACAGTCCGACATG +Illumina NlaIII expression Adapter 2 CAAGCAGAAGACGGCATACGA +Illumina NlaIII expression PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina NlaIII expression PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina NlaIII expression Sequencing Primer CCGACAGGTTCAGAGTTCTACAGTCCGACATG + +Illumina Small RNA Adapter 1 GTTCAGAGTTCTACAGTCCGACGATC +Illumina Small RNA Adapter 2 TGGAATTCTCGGGTGCCAAGG +Illumina Small RNA RT Primer CAAGCAGAAGACGGCATACGA +Illumina Small RNA PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina Small RNA PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina Small RNA Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC + +Illumina Multiplexing Adapter 1 GATCGGAAGAGCACACGTCT +Illumina Multiplexing 
Adapter 2 ACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Multiplexing PCR Primer 1.01 AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Multiplexing PCR Primer 2.01 GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +Illumina Multiplexing Read1 Sequencing Primer ACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Multiplexing Index Sequencing Primer GATCGGAAGAGCACACGTCTGAACTCCAGTCAC +Illumina Multiplexing Read2 Sequencing Primer GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT + +Illumina PCR Primer Index 1 CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTC +Illumina PCR Primer Index 2 CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTC +Illumina PCR Primer Index 3 CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTC +Illumina PCR Primer Index 4 CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTC +Illumina PCR Primer Index 5 CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTC +Illumina PCR Primer Index 6 CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTC +Illumina PCR Primer Index 7 CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTC +Illumina PCR Primer Index 8 CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTC +Illumina PCR Primer Index 9 CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTC +Illumina PCR Primer Index 10 CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTC +Illumina PCR Primer Index 11 CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTC +Illumina PCR Primer Index 12 CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTC + +Illumina DpnII Gex Adapter 1 GATCGTCGGACTGTAGAACTCTGAAC +Illumina DpnII Gex Adapter 1.01 ACAGGTTCAGAGTTCTACAGTCCGAC +Illumina DpnII Gex Adapter 2 CAAGCAGAAGACGGCATACGA +Illumina DpnII Gex Adapter 2.01 TCGTATGCCGTCTTCTGCTTG +Illumina DpnII Gex PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina DpnII Gex PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina DpnII Gex Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC + +Illumina NlaIII Gex Adapter 1.01 TCGGACTGTAGAACTCTGAAC +Illumina NlaIII Gex Adapter 1.02 ACAGGTTCAGAGTTCTACAGTCCGACATG +Illumina NlaIII Gex Adapter 2.01 CAAGCAGAAGACGGCATACGA +Illumina NlaIII Gex Adapter 2.02 
TCGTATGCCGTCTTCTGCTTG +Illumina NlaIII Gex PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina NlaIII Gex PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina NlaIII Gex Sequencing Primer CCGACAGGTTCAGAGTTCTACAGTCCGACATG + +Illumina Small RNA RT Primer CAAGCAGAAGACGGCATACGA +Illumina 5p RNA Adapter GTTCAGAGTTCTACAGTCCGACGATC +Illumina RNA Adapter1 TGGAATTCTCGGGTGCCAAGG + +Illumina Small RNA 3p Adapter 1 ATCTCGTATGCCGTCTTCTGCTTG +Illumina Small RNA PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina Small RNA PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina Small RNA Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC + +TruSeq Universal Adapter AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +TruSeq Adapter, Index 1 GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 2 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 3 GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 4 GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 5 GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 6 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 7 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 8 GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 9 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 10 GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 11 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 12 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG + +Illumina RNA RT Primer GCCTTGGCACCCGAGAATTCCA +Illumina RNA PCR Primer AATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGA + +RNA PCR Primer, Index 1 
CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 2 CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 3 CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 4 CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 5 CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 6 CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 7 CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 8 CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 9 CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 10 CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 11 CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 12 CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 13 CAAGCAGAAGACGGCATACGAGATTTGACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 14 CAAGCAGAAGACGGCATACGAGATGGAACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 15 CAAGCAGAAGACGGCATACGAGATTGACATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 16 CAAGCAGAAGACGGCATACGAGATGGACGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 17 CAAGCAGAAGACGGCATACGAGATCTCTACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 18 CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 19 CAAGCAGAAGACGGCATACGAGATTTTCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 20 CAAGCAGAAGACGGCATACGAGATGGCCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 21 CAAGCAGAAGACGGCATACGAGATCGAAACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 22 CAAGCAGAAGACGGCATACGAGATCGTACGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 23 
CAAGCAGAAGACGGCATACGAGATCCACTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 24 CAAGCAGAAGACGGCATACGAGATGCTACCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 25 CAAGCAGAAGACGGCATACGAGATATCAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 26 CAAGCAGAAGACGGCATACGAGATGCTCATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 27 CAAGCAGAAGACGGCATACGAGATAGGAATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 28 CAAGCAGAAGACGGCATACGAGATCTTTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 29 CAAGCAGAAGACGGCATACGAGATTAGTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 30 CAAGCAGAAGACGGCATACGAGATCCGGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 31 CAAGCAGAAGACGGCATACGAGATATCGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 32 CAAGCAGAAGACGGCATACGAGATTGAGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 33 CAAGCAGAAGACGGCATACGAGATCGCCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 34 CAAGCAGAAGACGGCATACGAGATGCCATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 35 CAAGCAGAAGACGGCATACGAGATAAAATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 36 CAAGCAGAAGACGGCATACGAGATTGTTGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 37 CAAGCAGAAGACGGCATACGAGATATTCCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 38 CAAGCAGAAGACGGCATACGAGATAGCTAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 39 CAAGCAGAAGACGGCATACGAGATGTATAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 40 CAAGCAGAAGACGGCATACGAGATTCTGAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 41 CAAGCAGAAGACGGCATACGAGATGTCGTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 42 CAAGCAGAAGACGGCATACGAGATCGATTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 43 CAAGCAGAAGACGGCATACGAGATGCTGTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 44 CAAGCAGAAGACGGCATACGAGATATTATAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 45 
CAAGCAGAAGACGGCATACGAGATGAATGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 46 CAAGCAGAAGACGGCATACGAGATTCGGGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 47 CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 48 CAAGCAGAAGACGGCATACGAGATTGCCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA + +ABI Dynabead EcoP Oligo CTGATCTAGAGGTACCGGATCCCAGCAGT +ABI Solid3 Adapter A CTGCCCCGGGTTCCTCATTCTCTCAGCAGCATG +ABI Solid3 Adapter B CCACTACGCCTCCGCTTTCCTCTCTATGGGCAGTCGGTGAT +ABI Solid3 5' AMP Primer CCACTACGCCTCCGCTTTCCTCTCTATG +ABI Solid3 3' AMP Primer CTGCCCCGGGTTCCTCATTCT +ABI Solid3 EF1 alpha Sense Primer CATGTGTGTTGAGAGCTTC +ABI Solid3 EF1 alpha Antisense Primer GAAAACCAAAGTGGTCCAC +ABI Solid3 GAPDH Forward Primer TTAGCACCCCTGGCCAAGG +ABI Solid3 GAPDH Reverse Primer CTTACTCCTTGGAGGCCATG diff --git a/public/flexiprep/src/test/resources/v0101.fq_fastqc/fastqc_data.txt b/public/flexiprep/src/test/resources/v0101.fq_fastqc/fastqc_data.txt new file mode 100644 index 0000000000000000000000000000000000000000..6d44bfae6fa962cd5d3e88084107b22efed3b025 --- /dev/null +++ b/public/flexiprep/src/test/resources/v0101.fq_fastqc/fastqc_data.txt @@ -0,0 +1,838 @@ +##FastQC 0.10.1 +>>Basic Statistics pass +#Measure Value +Filename ct_r1.fq +File type Conventional base calls +Encoding Sanger / Illumina 1.9 +Total Sequences 1000 +Filtered Sequences 0 +Sequence length 100 +%GC 53 +>>END_MODULE +>>Per base sequence quality fail +#Base Mean Median Lower Quartile Upper Quartile 10th Percentile 90th Percentile +1 32.244 33.0 31.0 34.0 30.0 34.0 +2 32.589 34.0 31.0 34.0 31.0 34.0 +3 32.814 34.0 31.0 34.0 31.0 34.0 +4 36.231 37.0 35.0 37.0 35.0 37.0 +5 35.907 37.0 35.0 37.0 35.0 37.0 +6 35.934 37.0 35.0 37.0 35.0 37.0 +7 35.783 37.0 35.0 37.0 35.0 37.0 +8 36.008 37.0 35.0 37.0 35.0 37.0 +9 37.706 39.0 37.0 39.0 35.0 39.0 +10-14 37.857600000000005 39.2 37.2 39.4 34.8 39.4 +15-19 38.9788 40.2 38.0 41.0 35.0 41.0 +20-24 38.8246 40.0 38.0 41.0 34.8 41.0 
+25-29 38.589600000000004 40.0 38.0 41.0 34.4 41.0 +30-34 38.3568 40.0 38.0 41.0 33.8 41.0 +35-39 38.1592 40.0 37.4 41.0 33.6 41.0 +40-44 37.4808 39.8 36.0 41.0 32.6 41.0 +45-49 36.9478 39.0 35.0 40.8 31.2 41.0 +50-54 35.845600000000005 37.8 34.6 40.0 29.4 41.0 +55-59 34.739 36.6 33.6 40.0 27.4 41.0 +60-64 34.1336 35.4 33.4 38.6 27.2 40.2 +65-69 32.7464 35.0 32.6 37.2 24.6 39.6 +70-74 29.3478 34.0 29.6 35.6 2.0 38.6 +75-79 27.4908 33.2 26.4 35.0 2.0 36.6 +80-84 25.893000000000008 33.0 21.8 35.0 2.0 35.4 +85-89 25.031799999999997 32.4 16.2 34.6 2.0 35.0 +90-94 23.9446 31.4 6.4 34.0 2.0 35.0 +95-99 22.9358 30.4 2.0 34.0 2.0 35.0 +100 21.984 30.0 2.0 34.0 2.0 35.0 +>>END_MODULE +>>Per sequence quality scores pass +#Quality Count +11 1.0 +12 4.0 +13 3.0 +14 1.0 +15 4.0 +16 4.0 +17 6.0 +18 7.0 +19 4.0 +20 2.0 +21 7.0 +22 9.0 +23 9.0 +24 17.0 +25 23.0 +26 30.0 +27 52.0 +28 39.0 +29 28.0 +30 23.0 +31 33.0 +32 43.0 +33 47.0 +34 74.0 +35 88.0 +36 148.0 +37 202.0 +38 89.0 +39 3.0 +>>END_MODULE +>>Per base sequence content fail +#Base G A T C +1 52.35707121364093 17.251755265797392 11.735205616850552 18.655967903711137 +2 34.300000000000004 11.1 24.8 29.799999999999997 +3 41.0 6.5 20.200000000000003 32.300000000000004 +4 37.5 8.7 26.0 27.800000000000004 +5 35.4 12.4 31.8 20.4 +6 57.3 11.1 1.6 30.0 +7 20.9 24.7 32.6 21.8 +8 20.0 27.200000000000003 30.0 22.8 +9 24.5 21.5 27.800000000000004 26.200000000000003 +10-14 25.22 23.28 26.26 25.240000000000002 +15-19 26.44 21.34 26.1 26.119999999999997 +20-24 25.240000000000002 22.1 24.6 28.060000000000002 +25-29 24.62 22.06 25.119999999999997 28.199999999999996 +30-34 26.240000000000002 21.44 24.279999999999998 28.04 +35-39 24.8 22.439999999999998 24.34 28.42 +40-44 25.8 22.84 23.9 27.46 +45-49 26.26 22.64 23.66 27.439999999999998 +50-54 26.72 22.58 23.18 27.52 +55-59 25.019999999999996 22.58 24.38 28.02 +60-64 26.251501802162597 22.00640768922707 23.28794553464157 28.454144973968766 +65-69 25.683829444891394 23.873692679002414 
23.049074818986323 27.39340305711987 +70-74 25.554134697357206 25.44757033248082 21.717817561807333 27.28047740835465 +75-79 25.818501428257523 23.643155350472423 23.071852340145025 27.466490881125026 +80-84 26.973532796317606 23.95857307249712 21.74913693901036 27.318757192174914 +85-89 25.452016689847014 24.849327770050998 22.624014835419565 27.07464070468243 +90-94 24.547101449275363 22.35054347826087 24.139492753623188 28.962862318840582 +95-99 25.318837549655026 24.231653773782146 23.186284758519758 27.263223918043067 +100 24.0 26.0 21.9 28.1 +>>END_MODULE +>>Per base GC content fail +#Base %GC +1 71.01303911735206 +2 64.1 +3 73.3 +4 65.3 +5 55.800000000000004 +6 87.3 +7 42.699999999999996 +8 42.8 +9 50.7 +10-14 50.46000000000001 +15-19 52.559999999999995 +20-24 53.300000000000004 +25-29 52.82 +30-34 54.279999999999994 +35-39 53.22 +40-44 53.26 +45-49 53.7 +50-54 54.24 +55-59 53.04 +60-64 54.70564677613135 +65-69 53.07723250201126 +70-74 52.834612105711855 +75-79 53.28499230938255 +80-84 54.29228998849251 +85-89 52.526657394529444 +90-94 53.509963768115945 +95-99 52.5820614676981 +100 52.1 +>>END_MODULE +>>Per sequence GC content fail +#GC Content Count +0 0.0 +1 0.0 +2 0.0 +3 0.0 +4 0.0 +5 0.0 +6 0.0 +7 0.0 +8 0.0 +9 0.0 +10 0.0 +11 0.0 +12 0.0 +13 0.0 +14 0.0 +15 0.0 +16 0.0 +17 0.0 +18 0.0 +19 0.0 +20 0.0 +21 0.0 +22 0.0 +23 0.5 +24 0.5 +25 0.5 +26 1.0 +27 1.5 +28 2.0 +29 3.5 +30 5.5 +31 6.0 +32 6.5 +33 6.0 +34 4.5 +35 6.0 +36 11.0 +37 17.0 +38 21.0 +39 16.5 +40 15.0 +41 24.0 +42 28.5 +43 33.0 +44 35.5 +45 32.5 +46 32.0 +47 32.0 +48 29.5 +49 30.5 +50 30.0 +51 29.5 +52 30.0 +53 27.5 +54 26.5 +55 27.0 +56 29.5 +57 34.0 +58 36.0 +59 36.0 +60 37.0 +61 31.5 +62 24.0 +63 22.5 +64 27.0 +65 28.5 +66 20.5 +67 15.0 +68 17.0 +69 13.5 +70 8.0 +71 7.0 +72 9.0 +73 8.0 +74 5.5 +75 4.5 +76 2.0 +77 2.0 +78 3.0 +79 2.0 +80 1.5 +81 1.0 +82 0.0 +83 0.5 +84 1.0 +85 0.5 +86 0.0 +87 0.0 +88 0.0 +89 0.0 +90 0.0 +91 0.0 +92 0.0 +93 0.0 +94 0.0 +95 0.0 +96 0.0 +97 0.0 +98 0.0 +99 
0.0 +100 0.0 +>>END_MODULE +>>Per base N content warn +#Base N-Count +1 0.3 +2 0.0 +3 0.0 +4 0.0 +5 0.0 +6 0.0 +7 0.0 +8 0.0 +9 0.0 +10-14 0.0 +15-19 0.0 +20-24 0.0 +25-29 0.0 +30-34 0.0 +35-39 0.0 +40-44 0.0 +45-49 0.0 +50-54 0.0 +55-59 0.0 +60-64 0.12 +65-69 0.5599999999999999 +70-74 6.16 +75-79 8.98 +80-84 13.100000000000001 +85-89 13.719999999999999 +90-94 11.68 +95-99 4.34 +100 0.0 +>>END_MODULE +>>Sequence Length Distribution pass +#Length Count +100 1000.0 +>>END_MODULE +>>Sequence Duplication Levels pass +#Total Duplicate Percentage 3.4 +#Duplication Level Relative count +1 100.0 +2 0.4140786749482402 +3 0.0 +4 0.0 +5 0.0 +6 0.0 +7 0.0 +8 0.0 +9 0.0 +10++ 0.2070393374741201 +>>END_MODULE +>>Overrepresented sequences fail +#Sequence Count Percentage Possible Source +AGATCGGAAGAGCACACGTCTGAACTCCAGTCACTTCCAAGATCTCGTAT 14 1.4000000000000001 TruSeq Adapter, Index 1 (97% over 36bp) +GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTCCAAGATCTCGTATG 12 1.2 TruSeq Adapter, Index 1 (97% over 36bp) +AGGGGGAATGATGGTTGTCTTTGGATATACTACAGCGATGGCTATTGAGG 2 0.2 No Hit +GGCTTGTTTTATTTTAATGGCTGATCTATGTAATCACAGAGGCCAGTATG 2 0.2 No Hit +GTGGGGTGGTGTTTGTGGGGGACTTCATCATCTCAGGCTTCCCAGGGTCC 2 0.2 No Hit +CGGAAGAGCACACGTCTGAACTCCAGTCACTTCCAAGATCTCGTATGCCG 2 0.2 TruSeq Adapter, Index 1 (96% over 33bp) +>>END_MODULE +>>Kmer Content fail +#Sequence Count Obs/Exp Overall Obs/Exp Max Max Obs/Exp Position +AAAAA 385 7.3597403 68.038994 65-69 +AGATC 435 5.4375157 23.135067 1 +GAAGA 375 5.258809 32.443344 6 +GGAAG 420 5.044668 33.345257 5 +TCCAG 475 4.8355613 14.131038 2 +AAGAG 320 4.487517 25.954676 7 +CCAGG 475 4.4180827 17.21471 3 +GAGCA 380 4.3399205 21.1377 9 +AGCAC 395 4.2895336 15.0741825 7 +CTCCA 415 4.0171337 12.105032 95-96 +AGAGC 340 3.883087 21.137697 8 +TTTTT 280 3.8749053 8.964593 10-14 +CTTCT 370 3.8646336 11.598914 55-59 +CTGAA 305 3.812511 13.130004 90-94 +CGGAA 320 3.65467 26.422123 5 +ACCAG 335 3.6379597 10.049457 7 +TCTGA 310 3.6325634 12.308498 90-94 +CACAC 340 3.5108058 14.806036 
85-89 +ATCGG 325 3.4795394 24.768969 3 +TCGGA 320 3.426008 19.815174 3 +GATCG 320 3.426008 19.815174 1 +CGTCT 355 3.387832 11.578538 85-89 +CTGCT 355 3.387832 17.662533 3 +GCACA 310 3.3664696 15.0741825 8 +TCTTC 320 3.3423858 7.7326093 50-54 +CAGCA 305 3.3121717 10.049455 6 +GAACT 260 3.2500093 13.130004 90-94 +GTCTG 320 3.2116532 12.65067 90-94 +CAGGA 280 3.197836 15.8532715 3 +AACTC 265 3.1497202 23.781752 95-96 +TGAAC 250 3.125009 13.130004 90-94 +CCAGC 350 3.0954454 6.6359653 95-96 +AGTCA 240 3.0000086 10.41078 25-29 +CACCA 290 2.9945107 6.079907 70-74 +TGCTG 295 2.960743 9.2877 2 +CAGAT 230 2.875008 11.040063 70-74 +CTTCC 315 2.8583732 10.916445 30-34 +CACGT 280 2.8504362 12.351324 85-89 +CAGGG 290 2.8367646 22.630535 9 +ACACG 260 2.8234906 13.175687 85-89 +TTCCA 250 2.7855206 9.279795 30-34 +TTCTT 230 2.765239 6.6755276 50-54 +AGCAG 240 2.7410026 15.853272 2 +TTCTG 240 2.6363494 10.165324 55-59 +ACTCC 270 2.6135564 14.526036 95-96 +GCCAG 280 2.6043434 8.607355 1 +ACGTC 255 2.595933 10.105629 85-89 +GATCT 220 2.5779483 8.675031 40-44 +TCTGC 265 2.5289452 13.2469015 2 +AAGAT 160 2.4557784 12.783248 35-39 +ATCTC 220 2.4512577 9.279794 40-44 +CAGTC 240 2.4432309 8.554544 90-94 +TCCAA 205 2.4365761 10.999062 7 +CTTTT 200 2.4045558 16.688818 6 +TTCCT 230 2.40234 9.665762 7 +CCAGT 235 2.3923304 9.4206915 25-29 +TTTCT 195 2.3444414 16.688818 8 +CTGGG 255 2.3383298 6.004135 80-84 +TGCTT 210 2.3068056 10.165323 4 +TCTTT 190 2.284328 5.5629396 15-19 +TTTTC 190 2.2843277 11.125878 7 +GGGGG 255 2.2468696 16.307867 2 +AGGAA 160 2.2437584 19.466007 5 +GTCAC 220 2.2396283 10.184532 95-96 +TCACT 200 2.2284167 8.360176 95-96 +CACTT 200 2.2284167 10.3108835 30-34 +GAAAA 135 2.2103586 10.606119 60-64 +ACTTC 195 2.172706 9.279794 30-34 +TTGAA 150 2.1582448 11.9834385 60-64 +CTCCT 235 2.1324375 16.794533 4 +TCCTC 235 2.1324372 8.397265 5 +ATCTT 165 2.11616 7.1210704 10-14 +GGGGA 205 2.1089406 14.2801 3 +ACACA 165 2.092039 11.7331705 8 +TGCAG 195 2.0877237 9.907587 5 +GACCA 190 
2.0633202 10.049455 6 +AGGGG 200 2.057503 9.520067 1 +CCTCC 260 2.049668 14.590484 5 +AGGAG 170 2.0418897 5.557543 2 +TCCTT 195 2.0367663 14.498643 4 +GTCTT 185 2.032186 15.247986 7 +GCTGG 220 2.0173824 8.485845 1 +CCAGA 185 2.0090222 5.3284492 70-74 +CCTGG 230 2.0054333 8.068818 3 +GCAGG 205 2.005299 9.052214 3 +GGACC 215 1.9997637 8.607355 5 +TTCAT 155 1.987908 5.934226 2 +CCTTT 190 1.9845415 14.498643 5 +TTTCC 190 1.9845415 5.799457 15-19 +TGGCA 185 1.980661 14.861383 2 +TCTTG 180 1.977262 10.165323 5 +CCAAG 180 1.9547247 9.044511 35-39 +CTTCA 175 1.9498644 10.310883 6 +CAAGA 145 1.933477 12.339583 35-39 +CTGGA 180 1.9271295 9.907587 6 +GGCTG 210 1.9256833 16.97169 2 +AATGA 125 1.918577 7.677627 95-96 +TGAAA 125 1.918577 15.623971 60-64 +GCTTC 200 1.9086379 13.2469015 2 +GTCCA 185 1.8833237 14.131036 1 +AGAAA 115 1.882898 7.5757995 7 +TGGGG 195 1.8805519 13.386638 1 +TTCTC 180 1.880092 5.799457 25-29 +CTTGA 160 1.8748715 8.675031 60-64 +ACAAA 120 1.8682072 5.762797 40-44 +TCTCG 195 1.8609219 8.831266 5 +GGGAC 190 1.8585701 9.052216 5 +TGAGG 165 1.8578365 5.209824 2 +TGAAG 140 1.8404517 6.082693 2 +CATCT 165 1.8384434 5.155441 4 +CACTG 180 1.8324232 9.4206915 6 +CTGCA 180 1.8324231 5.3465896 90-94 +GCTGC 210 1.8310483 8.068819 1 +GCAGA 160 1.8273348 10.568848 3 +CCTTC 200 1.8148402 8.397265 9 +AGGGA 150 1.8016673 6.0081544 95-96 +TTTCA 140 1.7955297 7.1210704 15-19 +CACAG 165 1.7918309 5.432139 95-96 +AAACA 115 1.7903653 7.6389136 70-74 +ATTTT 120 1.7715117 13.661307 6 +TTTTG 140 1.7701824 17.551357 7 +GGGGC 210 1.7594293 11.629828 3 +GATTT 130 1.7534488 12.481857 6 +CAAAT 120 1.7513192 6.7527947 50-54 +GAGGG 170 1.7488776 9.520067 1 +GAAGG 145 1.7416117 6.0081544 95-96 +CATTT 135 1.7314036 5.9342256 5 +ATTTC 135 1.7314036 5.9342256 7 +CCTCT 190 1.7240983 8.397266 1 +ATCCA 145 1.7234317 5.49953 4 +GCAGC 185 1.7207267 6.9789357 95-96 +TCCTG 180 1.717774 13.2469 2 +CTCTG 180 1.717774 13.2469 2 +AAAAC 110 1.7125233 7.6389136 70-74 +CTTGG 170 1.7061908 9.2877 2 
+AAAAT 95 1.7024158 8.291661 9 +TCACC 175 1.693972 8.957724 8 +TCCAC 175 1.693972 8.957724 5 +GAGAA 120 1.6828189 6.488669 6 +TCTCC 185 1.6787271 5.038359 55-59 +GAGCC 180 1.6742208 8.607355 9 +TCATC 150 1.6713123 5.1554413 2 +AGACA 125 1.6667906 6.169792 2 +TGATG 135 1.6636823 11.404236 9 +GGGAG 160 1.6460025 9.520067 1 +AGCCA 150 1.6289369 6.029673 10-14 +ATGCC 160 1.6288207 8.478622 45-49 +CTCGT 170 1.6223421 8.831266 3 +GAGGA 135 1.6215005 11.115086 3 +TGTTG 140 1.6173534 10.690706 2 +CTCAT 145 1.6156021 5.1554418 2 +CAGGT 150 1.6059413 9.907587 4 +GCTTG 160 1.6058266 9.2877 60-64 +GGGTC 175 1.6047363 12.728768 2 +TCATT 125 1.6031516 5.934226 9 +GTTGA 130 1.6020645 5.702118 1 +ACAGA 120 1.6001189 10.005068 95-96 +GGAGG 155 1.5945649 9.520067 2 +GGGGT 165 1.5912362 13.386638 1 +TGGGA 140 1.5763463 10.419649 2 +GGATG 140 1.5763462 15.629472 6 +GCCTC 190 1.575248 7.672287 2 +CCTGC 190 1.5752479 11.508429 2 +GCTCC 190 1.5752479 11.508429 6 +TCTCT 150 1.5667434 5.224736 95-96 +GGGAA 130 1.561445 11.115086 4 +TCCAT 140 1.5598917 10.3108835 8 +GGCTT 155 1.5556445 13.93155 1 +TTGAT 115 1.5511277 6.240928 4 +CATCA 130 1.5451456 5.49953 2 +AGAGA 110 1.542584 6.488669 9 +AGGAC 135 1.541814 6.341309 55-59 +GTATG 125 1.5404466 9.123388 45-49 +AACAT 105 1.5324043 13.5055895 9 +AGCTC 150 1.5270194 9.4206915 5 +TTTGT 120 1.5172992 17.551357 8 +GATGA 115 1.5117996 6.082693 5 +GAGAT 115 1.5117996 6.082693 4 +AGGAT 115 1.5117996 12.165386 4 +TGAGA 115 1.5117996 6.082693 5 +CTGGT 150 1.5054625 9.2877 4 +GCTGT 150 1.5054625 18.5754 3 +TTCAC 135 1.504181 10.310883 7 +CCCAG 170 1.5035021 12.276537 2 +CAGTG 140 1.4988785 9.907587 5 +CTCCC 190 1.4978343 7.295242 1 +CCCTG 180 1.4923402 11.5084305 2 +CAGAG 130 1.4847097 7.398194 20-24 +CTTTG 135 1.4829465 10.165323 2 +CAAAA 95 1.4789973 7.203496 9 +TCTCA 130 1.4484707 5.1554413 8 +GAATG 110 1.4460692 12.165386 7 +GGAAT 110 1.4460692 12.165386 5 +TTTGG 125 1.4440656 5.345353 7 +GGCCT 165 1.4386805 12.103227 1 +GCTCT 150 1.4314783 
6.1818867 20-24 +TCTGT 130 1.4280226 15.247986 3 +CTGTT 130 1.4280226 15.247986 4 +AGGTT 115 1.4172109 11.404235 8 +TTGAG 115 1.4172107 5.702117 4 +TTTGA 105 1.416247 7.4891143 10-14 +ATCTG 120 1.4061534 5.4218936 2 +GGTCT 140 1.4050984 9.287701 6 +TTTTA 95 1.4024467 7.384491 95-96 +GGGTG 145 1.3983592 13.386638 2 +GGCAC 150 1.3951839 8.607355 4 +AAAGA 85 1.3917071 7.5757985 8 +AAGAA 85 1.3917071 5.254889 75-79 +TTGTT 110 1.3908576 5.850453 4 +GGAGA 115 1.3812783 5.557543 3 +ATGAC 110 1.3750039 6.252721 95-96 +TGTTC 125 1.3730987 10.165325 5 +GGGCA 140 1.3694727 9.052216 4 +ATGAT 95 1.3668885 6.6574664 6 +CCACT 140 1.3551775 5.3746343 30-34 +TGGCT 135 1.3549163 13.931552 3 +GATGG 120 1.3511539 10.419648 9 +TCGTA 115 1.3475639 5.421894 40-44 +TGTCA 115 1.3475639 5.421894 5 +GCTGA 125 1.3382844 9.907587 6 +CAGAA 100 1.3334324 5.6025352 90-94 +CCAAA 105 1.3312978 5.8665853 8 +GGGCT 145 1.3296387 12.728768 1 +TAGGA 100 1.3146083 12.165386 4 +GACAG 115 1.313397 5.2844243 1 +GGTCC 150 1.3078917 8.068819 6 +CCATC 135 1.3067783 8.957724 9 +AAATG 85 1.3046323 7.101804 6 +TTCAA 95 1.2997144 6.330293 9 +CGTAT 110 1.2889742 8.675031 45-49 +TGACT 110 1.2889742 5.421894 3 +TATGC 110 1.2889739 8.67503 45-49 +GCCCT 155 1.2850707 7.672287 3 +TGGGC 140 1.283789 8.485846 7 +ACTTT 100 1.2825212 5.9342256 1 +ATGTT 95 1.2813665 6.2409286 1 +ATTTG 95 1.2813663 12.481856 9 +TGGTT 110 1.2707777 5.345353 5 +TGGTG 120 1.2666163 9.767722 7 +GTTTT 100 1.2644161 5.8504534 6 +GCCTG 145 1.2642952 12.103229 1 +TTGCT 115 1.2632507 6.0991945 50-54 +CCACC 150 1.2614243 7.7821474 5 +GGACA 110 1.2562928 15.853274 6 +GAAGC 110 1.2562928 10.568849 9 +TGACA 100 1.2500036 5.7837667 9 +GACAT 100 1.2500035 11.567533 7 +TGGAA 95 1.248878 6.082693 5 +ACAGC 115 1.2488517 10.049455 5 +AATCC 105 1.2480024 5.499531 7 +TGCCT 130 1.2406145 8.831266 3 +AGGTG 110 1.2385577 5.209824 4 +GTGGC 135 1.2379395 12.728768 1 +CATGT 105 1.2303842 5.4218936 1 +TAGAT 85 1.2230055 6.0453725 90-94 +CCCTC 155 1.2219174 7.295242 4 
+GCCGT 140 1.2206988 8.068819 3 +AGTTT 90 1.2139261 6.2409286 7 +TTTAG 90 1.213926 6.240928 8 +TTGGG 115 1.2138406 9.767722 2 +ACCTC 125 1.20998 8.957724 1 +AGCAA 90 1.2000892 6.169792 9 +CAAAG 90 1.2000891 6.169791 5 +AAAGC 90 1.2000891 6.169791 6 +ACAGG 105 1.1991886 10.568849 8 +AGGCA 105 1.1991886 5.712891 95-96 +ATCAG 95 1.1875033 5.7837663 6 +ATGAG 90 1.1831475 6.082693 25-29 +CAGTT 100 1.1717947 5.1698627 85-89 +ATGCT 100 1.1717947 5.421894 8 +TCAAT 85 1.1629024 6.3302937 10-14 +TGTGT 100 1.1552525 10.690706 3 +GCCCA 130 1.1497369 12.276536 1 +TGATT 85 1.1464858 12.481857 5 +TGCTC 120 1.1451827 8.831267 4 +TGTCC 120 1.1451827 13.2469015 2 +TCCCC 145 1.143084 7.295242 2 +AAGGC 100 1.1420842 5.493164 65-69 +CAACA 90 1.1411123 5.8665853 8 +CACAA 90 1.1411123 11.7331705 9 +ACATC 95 1.129145 5.4995303 8 +AAGCT 90 1.1250031 6.2527194 95-96 +GAAAG 80 1.1218792 12.977338 7 +AAGGA 80 1.1218792 6.488669 3 +GCACT 110 1.1198142 9.4206915 5 +CCTGA 110 1.119814 9.420691 9 +ACCTT 100 1.1142083 5.1554418 7 +GTCAT 95 1.113205 5.421894 1 +TGATC 95 1.113205 10.843788 5 +TCATG 95 1.113205 5.421894 3 +TGGAT 90 1.1091216 5.702118 9 +GTGGG 115 1.1090435 8.924425 1 +CTGTG 110 1.1040058 9.2877 4 +GCTTT 100 1.0984789 5.4947696 95-96 +TGTCT 100 1.0984789 10.165323 5 +TTGGT 95 1.0974898 5.345353 4 +CTGTC 115 1.0974668 17.662535 4 +CAGAC 100 1.0859579 5.0247273 5 +GGAAC 95 1.0849801 5.2844243 6 +CCTCG 130 1.0778012 7.672287 6 +GCGGC 135 1.075477 7.372196 1 +ATAAA 60 1.0752101 8.291662 7 +GGGAT 95 1.0696635 10.419649 3 +CATCC 110 1.0647823 8.957723 3 +ACAGT 85 1.062503 5.7837663 4 +ACTGA 85 1.062503 11.567533 7 +GTTGG 100 1.0555136 9.767722 1 +TGTGG 100 1.0555136 9.767722 5 +GGAAA 75 1.0517617 19.466007 6 +GTGAA 80 1.0516868 6.082693 1 +GAAGT 80 1.0516866 6.082693 5 +GTCTC 110 1.0497508 8.831267 1 +CGGCT 120 1.046313 8.068818 1 +TTTAT 70 1.0333818 5.4645233 10-14 +GACAC 95 1.0316601 10.049455 7 +GGCAA 90 1.0278759 10.56885 3 +TCATA 75 1.0260904 6.330293 5 +ATTCA 75 1.0260903 6.3302927 7 
+TAACA 70 1.0216029 6.7527957 8 +GGTCA 95 1.0170963 9.907589 3 +ATGGC 95 1.0170962 9.907587 1 +TCAGG 95 1.0170962 9.907587 8 +GGTGA 90 1.0133655 15.629474 3 +TGTTT 80 1.0115329 5.8504534 5 +TGAAT 70 1.007181 6.6574664 5 +ATTGA 70 1.0071809 6.6574664 7 +AAGTT 70 1.0071809 6.6574664 6 +TTGCC 105 1.0020349 8.831267 2 +CTTGC 105 1.0020349 8.831267 6 +GCAAA 75 1.0000744 6.169792 4 +CATAG 80 1.0000029 6.2527204 95-96 +GACTT 85 0.99602544 5.421894 1 +CTGAT 85 0.99602544 5.421894 4 +CTTGT 90 0.988631 10.165323 3 +AATGG 75 0.98595625 6.082693 8 +AAGGT 75 0.9859562 6.0826926 4 +GATGT 80 0.98588586 5.7021174 7 +GGATT 80 0.98588586 11.404235 5 +GGCGG 115 0.96349704 7.753219 1 +AGAGG 80 0.9608892 5.557543 8 +GAGGT 85 0.95706743 5.2098246 3 +ATGGG 85 0.9570673 5.209824 1 +CCGTC 115 0.95343953 7.672287 4 +TAGCA 75 0.9375027 5.7837667 1 +ACATG 75 0.9375026 5.7837663 2 +TTGCA 80 0.93743575 5.421894 4 +GTTCA 80 0.93743575 5.421894 6 +ATGTC 80 0.93743575 5.421894 5 +TTCAG 80 0.93743575 5.421894 8 +TTGAC 80 0.9374356 5.4218936 2 +GTTCT 85 0.93370706 5.0826616 1 +TTGTC 85 0.93370706 5.0826616 9 +TTTGC 85 0.93370706 5.0826616 3 +ATGGT 75 0.924268 5.7021174 4 +ATGAA 60 0.920917 7.1018047 9 +AGATG 70 0.92022586 6.082693 5 +GCTCA 90 0.91621155 5.092265 95-96 +AGTGC 85 0.9100334 9.907587 2 +AGGGT 80 0.90076935 10.419649 1 +GTAGG 80 0.90076923 10.419648 6 +AGTGG 80 0.90076923 5.209824 2 +TAAAA 50 0.89600843 8.291662 8 +CACAT 75 0.89143026 5.499531 6 +CCATT 80 0.89136666 10.3108835 9 +ATACT 65 0.8892783 6.330293 9 +ACATT 65 0.88927823 6.3302927 7 +GCGGG 105 0.87971467 7.753219 2 +ACACC 85 0.8777014 9.555587 9 +CATAA 60 0.8756596 6.7527947 6 +ACCCT 90 0.8711856 13.436585 1 +GAACA 65 0.8667311 6.169792 7 +ACTGC 85 0.8653109 5.092265 95-96 +GGTAT 70 0.86265016 17.106354 6 +AGTTG 70 0.86265016 5.702118 7 +GAGAC 75 0.85656327 5.2844243 1 +GTGTC 85 0.8530954 13.93155 1 +GTTGC 85 0.8530954 9.2877 1 +ATAGA 55 0.84417385 7.1018047 8 +GAAAT 55 0.84417385 7.1018047 5 +CATTC 75 0.83565605 5.155441 6 
+TCACA 70 0.83200157 5.499531 3 +TGCGG 90 0.8252928 8.485845 3 +GCATT 70 0.8202563 5.421894 4 +GAACC 75 0.8144686 5.0247283 6 +CTCGA 80 0.81441027 9.420691 6 +GAATC 65 0.8125023 5.7837667 6 +TACAG 65 0.81250226 11.567533 7 +TGGTA 65 0.80103225 11.404236 5 +AAGAC 60 0.80005944 6.169791 8 +CAAGG 70 0.7994591 5.2844243 2 +ATGTA 55 0.7913565 6.6574664 4 +AATGT 55 0.7913565 6.6574664 3 +CGGCA 85 0.7906042 8.607354 2 +GAGAG 65 0.7807225 5.557543 8 +ACCAT 65 0.7725729 5.499531 8 +TTCTA 60 0.7695128 5.934226 9 +TAGAA 50 0.7674308 7.1018047 9 +GCATC 75 0.7635097 9.4206915 1 +GTTCC 80 0.76345515 8.831267 6 +AGCTT 65 0.76166654 5.421894 1 +TTAGC 65 0.76166654 5.421894 9 +CTGTA 65 0.76166654 5.421894 2 +ACTTG 65 0.7616664 5.4218936 2 +GTGCT 75 0.7527313 9.287701 3 +ATCAT 55 0.7524662 6.3302927 3 +GTTTG 65 0.7509141 5.345353 9 +GTGTT 65 0.7509141 10.690706 1 +GTCAA 60 0.75000215 11.5675335 6 +AATGC 60 0.75000215 6.252721 95-96 +CAAGT 60 0.7500021 5.7837663 9 +GCAAT 60 0.7500021 5.7837663 4 +GCAAG 65 0.74235487 5.2844243 1 +AGTGT 60 0.7394144 5.7021174 1 +TTAGG 60 0.7394144 5.702118 7 +AGCGG 75 0.73364604 9.052214 1 +ATCCT 65 0.72423524 5.155441 4 +ACTCT 65 0.72423524 5.155441 9 +AGTGA 55 0.7230346 6.082693 6 +AATAA 40 0.71680677 8.291662 6 +AACCT 60 0.71314424 5.4995303 1 +ATTCT 55 0.70538664 5.9342256 7 +AGTCT 60 0.7030768 5.421894 3 +GTGCA 65 0.69590795 9.907589 6 +AAAGT 45 0.69068766 7.101804 8 +AACTG 55 0.6875019 5.7837663 1 +CGAAG 60 0.68525064 5.2844243 4 +GATTG 55 0.67779654 5.702118 6 +GTGAT 55 0.67779654 11.404236 4 +TGTTA 50 0.67440337 12.481857 5 +TTGTA 50 0.6744033 6.240928 9 +TATTG 50 0.6744033 6.240928 7 +CTCTA 60 0.6685249 5.1554413 7 +TACCT 60 0.66852486 10.310882 8 +ATGGA 50 0.65730417 6.082693 8 +ATACA 45 0.6567447 6.7527957 6 +ATCAA 45 0.65674466 6.7527947 9 +TGTAA 45 0.6474735 6.6574664 7 +GCGGT 70 0.6418945 8.485846 4 +GGCCG 80 0.63731974 7.372196 2 +GGTTT 55 0.63538885 10.690706 9 +TTGTG 55 0.63538885 5.345353 1 +TATAT 40 0.62991583 7.2865515 8 +CCTGT 65 
0.62030727 8.831266 3 +GTGAG 55 0.6192789 5.2098246 1 +TAGGG 55 0.61927885 5.209824 8 +GAGTT 50 0.6161787 5.7021174 6 +ATGTG 50 0.6161787 5.702118 2 +GAATA 40 0.61394465 7.1018047 6 +CTGCG 70 0.6103493 8.068818 2 +CGGTG 65 0.59604484 8.485845 2 +TAAGG 45 0.5915738 6.082693 9 +AAGTG 45 0.5915737 6.0826926 1 +TATTT 40 0.5905039 6.8306537 8 +GGCAT 55 0.5888452 14.861383 3 +GTATC 50 0.5858973 5.421894 4 +ATAAC 40 0.5837731 13.505591 7 +TTACT 45 0.57713455 5.934226 9 +GTATA 40 0.575532 13.314933 7 +GAGTG 50 0.5629808 5.209824 1 +GTACA 45 0.5625016 5.7837667 6 +ATAGC 45 0.5625016 5.7837667 9 +TCTAC 50 0.5571041 5.1554413 8 +GCGAG 55 0.53800714 9.052216 1 +ACGGG 55 0.5380071 9.052214 1 +GATAA 35 0.5372016 7.1018047 6 +AATAG 35 0.5372016 7.101805 7 +CAACT 45 0.53485817 5.4995303 6 +CATAC 45 0.53485817 5.4995303 5 +GATTC 45 0.52730757 5.421894 6 +AGGTA 40 0.5258433 12.165386 5 +CGGTC 60 0.52315664 8.068819 5 +ACGAG 45 0.51393795 5.2844243 7 +TATTC 40 0.5130085 5.9342256 7 +CTAAA 35 0.51080143 6.7527957 9 +TACAA 35 0.51080143 5.402236 35-39 +CCTTA 45 0.5013937 5.1554413 6 +CAGTA 40 0.50000143 5.7837667 4 +GTGTA 40 0.49294293 5.702118 4 +TAACT 35 0.47884214 6.330293 8 +CTTAA 35 0.47884214 6.330293 7 +CTATA 35 0.47884214 6.330293 4 +TTAAC 35 0.47884214 6.330293 8 +TATCA 35 0.4788421 6.3302927 5 +TCAAC 40 0.47542948 5.499531 7 +ACTCA 40 0.47542942 5.49953 8 +TTAGT 35 0.47208238 10.120425 95-96 +TGTAT 35 0.47208238 6.2409286 3 +ATTGT 35 0.47208235 6.240928 8 +GTTAC 40 0.46871787 5.421894 6 +TGTAC 40 0.46871787 10.843788 7 +AGAGT 35 0.46011293 6.082693 5 +AGTAG 35 0.46011293 6.082693 5 +CTCCG 55 0.45599285 7.672287 6 +GGTAG 40 0.45038468 5.2098246 2 +TTTAC 35 0.44888243 5.9342256 8 +CTACT 40 0.44568333 5.1554418 4 +AACTA 30 0.4378298 6.7527947 9 +TATAG 30 0.43164897 6.6574664 5 +ATATA 25 0.4199739 7.7728767 9 +CTCAA 35 0.41600078 5.499531 9 +TATAC 30 0.4104361 6.3302927 5 +ACTAT 30 0.4104361 6.3302927 6 +TACTA 30 0.4104361 6.3302927 5 +TCGAT 35 0.41012815 10.843788 7 +ACGTT 35 
0.41012815 5.421894 4 +CGAAA 30 0.40002972 6.169792 9 +GTAAG 30 0.3943825 6.082693 8 +ATAGG 30 0.3943825 6.082693 3 +TCCTA 35 0.38997287 5.1554413 5 +TTACC 35 0.38997287 5.1554413 7 +ACCGA 35 0.3800853 5.0247273 7 +GCATA 30 0.37500107 5.7837667 1 +TCGAA 30 0.37500107 5.7837667 4 +GCTAA 30 0.37500107 5.7837667 8 +TAGGT 30 0.3697072 5.7021174 7 +GTTAG 30 0.3697072 5.702118 6 +CAATA 25 0.36485815 6.7527947 5 +ATACC 30 0.35657212 5.499531 6 +GACGA 30 0.3426253 5.284424 6 +AAGCG 30 0.3426253 10.568848 7 +GTTTA 25 0.33720168 6.2409286 7 +GTATT 25 0.33720168 12.481857 6 +AGATA 20 0.30697232 7.1018047 5 +CGTCA 30 0.30540386 9.420691 5 +CCTAA 25 0.29714343 5.499531 7 +TACCA 25 0.2971434 5.49953 9 +TGCTA 25 0.29294866 5.421894 7 +TACGT 25 0.29294863 5.4218936 9 +AGACG 25 0.2855211 5.284425 9 +CCTAT 25 0.2785521 5.1554418 3 +TAAGC 20 0.25000072 5.7837667 9 +CTAAG 20 0.25000072 5.7837667 8 +CGATT 20 0.23435894 5.421894 9 +GGGTA 20 0.22519234 5.2098246 2 +ACGCA 20 0.21719159 5.0247273 5 +GCGAA 15 0.17131266 5.284425 3 +CGAAC 15 0.16289368 5.0247273 5 +>>END_MODULE diff --git a/public/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FastqcV0101Test.scala b/public/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FastqcV0101Test.scala new file mode 100644 index 0000000000000000000000000000000000000000..0951bea84834b611c323c8e0b1b77ae55f0461b1 --- /dev/null +++ b/public/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FastqcV0101Test.scala @@ -0,0 +1,80 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. 
The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.pipelines.flexiprep + +import java.io.File +import java.nio.file.Paths + +import org.scalatest.Matchers +import org.scalatest.testng.TestNGSuite +import org.testng.annotations.Test + +class FastqcV0101Test extends TestNGSuite with Matchers { + + /** Returns the absolute path to test resource directory as a File object */ + private val resourceDir: File = new File(Paths.get(getClass.getResource("/").toURI).toString) + + /** Given a resource file name, returns the the absolute path to it as a File object */ + private def resourceFile(p: String): File = new File(resourceDir, p) + + /** Mock output file of a FastQC v0.10.1 run */ + // the file doesn't actually exist, we just need it so the outputDir value can be computed correctly + private val outputv0101: File = resourceFile("v0101.fq_fastqc.zip") + + @Test def testOutputDir() = { + val fqc = new Fastqc(null) + fqc.output = outputv0101 + fqc.outputDir shouldBe new File(resourceDir, "v0101.fq_fastqc") + } + + @Test def testQcModules() = { + val fqc = new Fastqc(null) + fqc.output = outputv0101 + // 11 QC modules + fqc.qcModules.size shouldBe 11 + // first module + fqc.qcModules.keySet should contain("Basic Statistics") + // mid (6th module) + fqc.qcModules.keySet should contain("Per sequence GC content") + // last module + fqc.qcModules.keySet should contain("Kmer Content") + } + + @Test def testSingleQcModule() = { + val fqc = new Fastqc(null) + fqc.output = outputv0101 + fqc.qcModules("Basic Statistics").name should ===("Basic Statistics") + fqc.qcModules("Basic Statistics").status should ===("pass") + fqc.qcModules("Basic Statistics").lines.size shouldBe 8 + } + + @Test def testEncoding() = { + val fqc = new 
Fastqc(null) + fqc.output = outputv0101 + fqc.encoding shouldBe "Sanger / Illumina 1.9" + } + + @Test def testFoundAdapter() = { + val fqc = new Fastqc(null) + fqc.output = outputv0101 + fqc.contaminants = Option(resourceFile("fqc_contaminants_v0101.txt")) + val adapters = fqc.foundAdapters + adapters.size shouldBe 1 + adapters.head.name should ===("TruSeq Adapter, Index 1") + // from fqc_contaminants_v0101.txt + adapters.head.seq should ===("GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG") + } +} \ No newline at end of file