diff --git a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Cutadapt.scala b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Cutadapt.scala index 60c25a5a69820b72a7e5bbd0f17cc8b5f0dac3fe..fb99be4a4f8716c8f96fb6be12d9caa264b249be 100644 --- a/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Cutadapt.scala +++ b/biopet-extensions/src/main/scala/nl/lumc/sasc/biopet/extensions/Cutadapt.scala @@ -24,6 +24,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Output } import scala.collection.mutable import scala.io.Source +import scala.util.matching.Regex /** * Extension for cutadapt @@ -163,6 +164,51 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction with Su (if (outputAsStsout) "" else required("--output", fastqOutput) + " > " + required(statsOutput)) + def extractClippedAdapters(statsOutput: File): Map[String, Any] = { + val histoCountRow: Regex = """([\d]+)\t([\d]+)\t.*""".r + val adapterR = """Sequence: ([C|T|A|G]+);.*Trimmed: ([\d]+) times\.""".r + + val statsFile = Source.fromFile(statsOutput) + val adapterRawStats: Array[String] = statsFile.mkString + .split("=== Adapter [\\d]+ ===") + .filter(_.contains("Sequence") + ) + statsFile.close() + + adapterRawStats.map(adapter => { + var adapterName = "" + var adapterCount = 0 + // identify the adapter name and count + for (line <- adapter.split("\n")) { + line match { + case adapterR(adapter, count) => { + adapterName = adapter + adapterCount = count.toInt + } + case _ => + } + } + + // parse the block that gives the histogram of clipped bases and from which end + val counts = adapter.split("Overview of removed sequences ") + .filter(x => x.contains("length")) + .map(clipSideRawStats => { + val clipSideLabel = if (clipSideRawStats.contains("5'")) { "5p" } else { "3p" } + + val histogramValues = clipSideRawStats.split("\n").flatMap({ + case histoCountRow(length, count) => Some(length.toInt -> count.toInt) + case _ => None + }) + clipSideLabel -> histogramValues.toMap + }) + + adapterName -> Map( + "count" -> adapterCount, + "histogram" -> counts.toMap + ) + }).toMap // converting the Array[String] containing map-items to Map with 'toMap' + } + /** Output summary stats */ def summaryStats: Map[String, Any] = { /** @@ -177,7 +223,6 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction with Su val tooLongR = """.* that were too long: *([,\d]+) .*""".r val tooManyN = """.* with too many N: *([,\d]+) .*""".r - val adapterR = """Sequence ([C|T|A|G]*);.*Trimmed: ([,\d]+) times.""".r val basePairsProcessed = """Total basepairs processed: *([,\d]+) bp""".r val basePairsWritten = """Total written \(filtered\): *([,\d]+) bp .*""".r @@ -192,24 +237,28 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction with Su "bpoutput" -> 0, "toomanyn" -> 0 ) - val adapterStats: mutable.Map[String, Long] = mutable.Map() + + // extract the adapters with its histogram + val adapterStats = if (statsOutput.exists) { + extractClippedAdapters(statsOutput) + } else Map.empty if (statsOutput.exists) { val statsFile = Source.fromFile(statsOutput) for (line <- statsFile.getLines()) { line match { - case processedReads(m) => stats("processed") = m.replaceAll(",", "").toLong - case withAdapters(m) => stats("withadapters") = m.replaceAll(",", "").toLong - case readsPassingFilters(m) => stats("passingfilters") = m.replaceAll(",", "").toLong - case tooShortR(m) => stats("tooshort") = m.replaceAll(",", "").toLong - case tooLongR(m) => stats("toolong") = m.replaceAll(",", "").toLong - case tooManyN(m) => stats("toomanyn") = m.replaceAll(",", "").toLong - case basePairsProcessed(m) => stats("bpinput") = m.replaceAll(",", "").toLong - case basePairsWritten(m) => stats("bpoutput") = m.replaceAll(",", "").toLong - case adapterR(adapter, count) => adapterStats += (adapter -> count.toLong) - case _ => + case processedReads(m) => stats("processed") = m.replaceAll(",", "").toLong + case withAdapters(m) => stats("withadapters") = m.replaceAll(",", "").toLong + case readsPassingFilters(m) => stats("passingfilters") = m.replaceAll(",", "").toLong + case tooShortR(m) => stats("tooshort") = m.replaceAll(",", "").toLong + case tooLongR(m) => stats("toolong") = m.replaceAll(",", "").toLong + case tooManyN(m) => stats("toomanyn") = m.replaceAll(",", "").toLong + case basePairsProcessed(m) => stats("bpinput") = m.replaceAll(",", "").toLong + case basePairsWritten(m) => stats("bpoutput") = m.replaceAll(",", "").toLong + case _ => } } + statsFile.close() } val cleanReads = stats("processed") - stats("withadapters") @@ -223,8 +272,8 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction with Su "num_reads_discarded_too_long" -> stats("toolong"), "num_reads_discarded_many_n" -> stats("toomanyn"), "num_bases_input" -> stats("bpinput"), - "num_based_output" -> stats("bpoutput"), - adaptersStatsName -> adapterStats.toMap + "num_bases_output" -> stats("bpoutput"), + adaptersStatsName -> adapterStats ) } diff --git a/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala b/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala index fc8db7ab30f7581c7638f15c48bba6e9443eb195..3cb06df0e160cb97b98710de74f7ca9fa31ce919 100644 --- a/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala +++ b/flexiprep/src/main/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/Cutadapt.scala @@ -16,6 +16,7 @@ package nl.lumc.sasc.biopet.pipelines.flexiprep import nl.lumc.sasc.biopet.utils.config.Configurable +import scala.collection.JavaConversions._ /** * Cutadapt wrapper specific for Flexiprep. @@ -41,23 +42,26 @@ class Cutadapt(root: Configurable, fastqc: Fastqc) extends nl.lumc.sasc.biopet.e val adapterCounts: Map[String, Any] = initStats.get(adaptersStatsName) match { // "adapters" key found in statistics case Some(m: Map[_, _]) => m.flatMap { - case (seq: String, count) => - seqToNameMap.get(seq) match { + case (adapterSequence: String, adapterStats: Map[_, _]) => + seqToNameMap.get(adapterSequence) match { // adapter sequence is found by FastQC - case Some(n) => Some(n -> Map("sequence" -> seq, "count" -> count)) + case Some(adapterSeqName) => { + Some(adapterSeqName -> + Map("sequence" -> adapterSequence, "stats" -> adapterStats.toMap) + ) + } // adapter sequence is clipped but not found by FastQC ~ should not happen since all clipped adapter // sequences come from FastQC case _ => - throw new IllegalStateException(s"Adapter '$seq' is clipped but not found by FastQC in '$fastqInput'.") + throw new IllegalStateException(s"Adapter '$adapterSequence' is clipped but not found by FastQC in '$fastqInput'.") } // FastQC found no adapters case otherwise => - ; logger.debug(s"No adapters found for summarizing in '$fastqInput'.") None } // "adapters" key not found ~ something went wrong in our part - case _ => throw new RuntimeException(s"Required key 'adapters' not found in stats entry '$fastqInput'.") + case _ => throw new RuntimeException(s"Required key '${adaptersStatsName}' not found in stats entry '${fastqInput}'.") } initStats.updated(adaptersStatsName, adapterCounts) } diff --git a/flexiprep/src/test/resources/ct-test.R1.clip.stats b/flexiprep/src/test/resources/ct-test.R1.clip.stats new file mode 100644 index 0000000000000000000000000000000000000000..4a280ef0a7d2588169c02b5e40432f4f903c69b8 --- /dev/null +++ b/flexiprep/src/test/resources/ct-test.R1.clip.stats @@ -0,0 +1,160 @@ +This is cutadapt 1.9.1 with Python 2.7.6 +Command line parameters: -b CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC -b GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCATCTCGTATGCCGTCTTCTGCTTG -b CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC -b GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG --error-rate 0.2 --times 2 -m 15 ct_r1.fq.gz.seqtk.fq --output ct_r1.fq.gz.cutadapt.fq +Trimming 4 adapters with at most 20.0% errors in single-end mode ... +Finished in 0.19 s (189 us/read; 0.32 M reads/minute). + +=== Summary === + +Total reads processed: 1,000 +Reads with adapters: 440 (44.0%) +Reads that were too short: 15 (1.5%) +Reads written (passing filters): 985 (98.5%) + +Total basepairs processed: 100,000 bp +Total written (filtered): 89,423 bp (89.4%) + +=== Adapter 1 === + +Sequence: CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC; Type: variable 5'/3'; Length: 63; Trimmed: 94 times. +18 times, it overlapped the 5' end of a read +76 times, it overlapped the 3' end or was within the read + +No. of allowed errors: +0-4 bp: 0; 5-9 bp: 1; 10-14 bp: 2; 15-19 bp: 3; 20-24 bp: 4; 25-29 bp: 5; 30-34 bp: 6; 35-39 bp: 7; 40-44 bp: 8; 45-49 bp: 9; 50-54 bp: 10; 55-59 bp: 11; 60-63 bp: 12 + +Overview of removed sequences (5') +length count expect max.err error counts +3 8 15.6 0 8 +4 3 3.9 0 2 1 +5 2 1.0 1 0 2 +6 4 0.2 1 1 3 +9 1 0.0 1 0 0 1 + + +Overview of removed sequences (3' or within) +length count expect max.err error counts +3 13 15.6 0 13 +4 19 3.9 0 3 16 +5 21 1.0 1 0 21 +6 18 0.2 1 1 17 +7 2 0.1 1 0 2 +9 1 0.0 1 0 0 1 +11 1 0.0 2 0 0 1 +12 1 0.0 2 0 0 1 + +=== Adapter 2 === + +Sequence: GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCATCTCGTATGCCGTCTTCTGCTTG; Type: variable 5'/3'; Length: 63; Trimmed: 340 times. +117 times, it overlapped the 5' end of a read +223 times, it overlapped the 3' end or was within the read + +No. of allowed errors: +0-4 bp: 0; 5-9 bp: 1; 10-14 bp: 2; 15-19 bp: 3; 20-24 bp: 4; 25-29 bp: 5; 30-34 bp: 6; 35-39 bp: 7; 40-44 bp: 8; 45-49 bp: 9; 50-54 bp: 10; 55-59 bp: 11; 60-63 bp: 12 + +Overview of removed sequences (5') +length count expect max.err error counts +3 14 15.6 0 14 +4 29 3.9 0 6 23 +5 32 1.0 1 3 29 +6 36 0.2 1 0 36 +8 1 0.0 1 0 1 +9 1 0.0 1 0 0 1 +10 1 0.0 2 0 0 1 +11 2 0.0 2 0 0 2 +37 1 0.0 7 0 0 0 0 0 1 + + +Overview of removed sequences (3' or within) +length count expect max.err error counts +3 18 15.6 0 18 +4 9 3.9 0 5 4 +5 15 1.0 1 8 7 +6 10 0.2 1 8 2 +7 7 0.1 1 5 2 +8 10 0.0 1 9 1 +9 6 0.0 1 5 1 +10 8 0.0 2 5 0 3 +11 4 0.0 2 4 +12 4 0.0 2 4 +13 9 0.0 2 9 +14 4 0.0 2 3 0 1 +15 7 0.0 3 7 +16 2 0.0 3 2 +17 4 0.0 3 2 1 0 1 +18 2 0.0 3 2 +19 2 0.0 3 2 +20 2 0.0 4 0 1 1 +21 7 0.0 4 6 1 +22 7 0.0 4 7 +23 2 0.0 4 2 +24 3 0.0 4 3 +25 5 0.0 5 5 +26 5 0.0 5 5 +27 8 0.0 5 8 +28 6 0.0 5 5 1 +29 2 0.0 5 2 +30 5 0.0 6 5 +31 3 0.0 6 3 +32 8 0.0 6 8 +33 1 0.0 6 1 +34 5 0.0 6 0 5 +35 2 0.0 7 0 0 0 0 0 0 2 +36 3 0.0 7 0 0 0 0 0 0 3 +37 4 0.0 7 0 0 0 0 0 0 0 2 2 +38 2 0.0 7 0 0 0 0 0 0 0 0 0 2 +39 4 0.0 7 0 0 0 0 1 0 0 0 0 3 +40 3 0.0 8 0 0 0 0 0 0 0 3 +41 1 0.0 8 0 0 0 0 0 0 0 1 +42 4 0.0 8 0 0 0 0 0 0 0 0 4 +43 5 0.0 8 0 0 0 0 0 0 0 0 0 5 +44 3 0.0 8 0 0 0 0 0 0 0 0 0 0 3 +46 1 0.0 9 0 0 0 0 0 0 0 0 0 0 1 +49 1 0.0 9 0 0 0 0 0 1 + +=== Adapter 3 === + +Sequence: CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC; Type: variable 5'/3'; Length: 63; Trimmed: 0 times. + +=== Adapter 4 === + +Sequence: GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG; Type: variable 5'/3'; Length: 63; Trimmed: 82 times. +15 times, it overlapped the 5' end of a read +67 times, it overlapped the 3' end or was within the read + +No. of allowed errors: +0-4 bp: 0; 5-9 bp: 1; 10-14 bp: 2; 15-19 bp: 3; 20-24 bp: 4; 25-29 bp: 5; 30-34 bp: 6; 35-39 bp: 7; 40-44 bp: 8; 45-49 bp: 9; 50-54 bp: 10; 55-59 bp: 11; 60-63 bp: 12 + +Overview of removed sequences (5') +length count expect max.err error counts +26 1 0.0 5 0 1 +61 2 0.0 12 0 0 0 2 +64 11 0.0 12 0 0 0 11 +72 1 0.0 12 0 0 0 0 0 0 0 0 0 0 0 1 + + +Overview of removed sequences (3' or within) +length count expect max.err error counts +45 3 0.0 9 0 0 0 3 +46 2 0.0 9 0 0 0 2 +47 3 0.0 9 0 0 0 3 +48 3 0.0 9 0 0 0 3 +49 2 0.0 9 0 0 0 2 +50 3 0.0 10 0 0 0 3 +51 2 0.0 10 0 0 0 2 +52 6 0.0 10 0 0 0 6 +53 1 0.0 10 0 0 0 1 +54 5 0.0 10 0 0 0 4 0 1 +56 2 0.0 11 0 0 0 2 +57 2 0.0 11 0 0 0 2 +58 2 0.0 11 0 0 0 2 +59 3 0.0 11 0 0 0 2 0 0 0 0 0 1 +61 1 0.0 12 0 0 0 0 0 1 +62 3 0.0 12 0 0 0 2 1 +63 1 0.0 12 0 0 0 0 1 +66 3 0.0 12 0 0 0 3 +67 3 0.0 12 0 0 0 3 +70 1 0.0 12 0 0 0 1 +72 1 0.0 12 0 0 0 1 +80 1 0.0 12 0 0 0 1 +99 14 0.0 12 0 0 0 14 + diff --git a/flexiprep/src/test/resources/fqc_contaminants_v0112.txt b/flexiprep/src/test/resources/fqc_contaminants_v0112.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2c29bee8171e0454994c6d7d6f0f4780efb3921 --- /dev/null +++ b/flexiprep/src/test/resources/fqc_contaminants_v0112.txt @@ -0,0 +1,182 @@ +# This file contains a list of potential contaminants which are +# frequently found in high throughput sequencing reactions. These +# are mostly sequences of adapters / primers used in the various +# sequencing chemistries. +# +# Please DO NOT rely on these sequences to design your own oligos, some +# of them are truncated at ambiguous positions, and none of them are +# definitive sequences from the manufacturers so don't blame us if you +# try to use them and they don't work. +# +# You can add more sequences to the file by putting one line per entry +# and specifying a name[tab]sequence. If the contaminant you add is +# likely to be of use to others please consider sending it to the FastQ +# authors, either via a bug report at www.bioinformatics.babraham.ac.uk/bugzilla/ +# or by directly emailing simon.andrews@babraham.ac.uk so other users of +# the program can benefit. + +Illumina Single End Adapter 1 GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG +Illumina Single End Adapter 2 CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT +Illumina Single End PCR Primer 1 AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Single End PCR Primer 2 CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT +Illumina Single End Sequencing Primer ACACTCTTTCCCTACACGACGCTCTTCCGATCT + +Illumina Paired End Adapter 1 ACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Paired End Adapter 2 GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG +Illumina Paried End PCR Primer 1 AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Paired End PCR Primer 2 CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT +Illumina Paried End Sequencing Primer 1 ACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Paired End Sequencing Primer 2 CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT + +Illumina DpnII expression Adapter 1 ACAGGTTCAGAGTTCTACAGTCCGAC +Illumina DpnII expression Adapter 2 CAAGCAGAAGACGGCATACGA +Illumina DpnII expression PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina DpnII expression PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina DpnII expression Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC + +Illumina NlaIII expression Adapter 1 ACAGGTTCAGAGTTCTACAGTCCGACATG +Illumina NlaIII expression Adapter 2 CAAGCAGAAGACGGCATACGA +Illumina NlaIII expression PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina NlaIII expression PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina NlaIII expression Sequencing Primer CCGACAGGTTCAGAGTTCTACAGTCCGACATG + +Illumina Small RNA Adapter 1 GTTCAGAGTTCTACAGTCCGACGATC +Illumina Small RNA Adapter 2 TGGAATTCTCGGGTGCCAAGG +Illumina Small RNA RT Primer CAAGCAGAAGACGGCATACGA +Illumina Small RNA PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina Small RNA PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina Small RNA Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC + +Illumina Multiplexing Adapter 1 GATCGGAAGAGCACACGTCT +Illumina Multiplexing Adapter 2 ACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Multiplexing PCR Primer 1.01 AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Multiplexing PCR Primer 2.01 GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +Illumina Multiplexing Read1 Sequencing Primer ACACTCTTTCCCTACACGACGCTCTTCCGATCT +Illumina Multiplexing Index Sequencing Primer GATCGGAAGAGCACACGTCTGAACTCCAGTCAC +Illumina Multiplexing Read2 Sequencing Primer GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT + +Illumina PCR Primer Index 1 CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTC +Illumina PCR Primer Index 2 CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTC +Illumina PCR Primer Index 3 CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTC +Illumina PCR Primer Index 4 CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTC +Illumina PCR Primer Index 5 CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTC +Illumina PCR Primer Index 6 CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTC +Illumina PCR Primer Index 7 CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTC +Illumina PCR Primer Index 8 CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTC +Illumina PCR Primer Index 9 CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTC +Illumina PCR Primer Index 10 CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTC +Illumina PCR Primer Index 11 CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTC +Illumina PCR Primer Index 12 CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTC + +Illumina DpnII Gex Adapter 1 GATCGTCGGACTGTAGAACTCTGAAC +Illumina DpnII Gex Adapter 1.01 ACAGGTTCAGAGTTCTACAGTCCGAC +Illumina DpnII Gex Adapter 2 CAAGCAGAAGACGGCATACGA +Illumina DpnII Gex Adapter 2.01 TCGTATGCCGTCTTCTGCTTG +Illumina DpnII Gex PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina DpnII Gex PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina DpnII Gex Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC + +Illumina NlaIII Gex Adapter 1.01 TCGGACTGTAGAACTCTGAAC +Illumina NlaIII Gex Adapter 1.02 ACAGGTTCAGAGTTCTACAGTCCGACATG +Illumina NlaIII Gex Adapter 2.01 CAAGCAGAAGACGGCATACGA +Illumina NlaIII Gex Adapter 2.02 TCGTATGCCGTCTTCTGCTTG +Illumina NlaIII Gex PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina NlaIII Gex PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina NlaIII Gex Sequencing Primer CCGACAGGTTCAGAGTTCTACAGTCCGACATG + +Illumina Small RNA RT Primer CAAGCAGAAGACGGCATACGA +Illumina 5p RNA Adapter GTTCAGAGTTCTACAGTCCGACGATC +Illumina RNA Adapter1 TGGAATTCTCGGGTGCCAAGG + +Illumina Small RNA 3p Adapter 1 ATCTCGTATGCCGTCTTCTGCTTG +Illumina Small RNA PCR Primer 1 CAAGCAGAAGACGGCATACGA +Illumina Small RNA PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA +Illumina Small RNA Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC + +TruSeq Universal Adapter AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +TruSeq Adapter, Index 1 GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 2 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 3 GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 4 GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 5 GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 6 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 7 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 8 GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 9 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 10 GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 11 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 12 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 13 GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTCAACTCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 14 GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTTCCGTCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 15 GATCGGAAGAGCACACGTCTGAACTCCAGTCACATGTCAGTCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 16 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCGTCCCTCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 18 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 19 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGAAACTCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 20 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGGCCTTCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 21 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTTTCGGTCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 22 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGTACGTTCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 23 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCACTCTTCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 25 GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTGATATCTCGTATGCCGTCTTCTGCTTG +TruSeq Adapter, Index 27 GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCCTTTCTCGTATGCCGTCTTCTGCTTG + +Illumina RNA RT Primer GCCTTGGCACCCGAGAATTCCA +Illumina RNA PCR Primer AATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGA + +RNA PCR Primer, Index 1 CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 2 CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 3 CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 4 CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 5 CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 6 CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 7 CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 8 CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 9 CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 10 CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 11 CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 12 CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 13 CAAGCAGAAGACGGCATACGAGATTTGACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 14 CAAGCAGAAGACGGCATACGAGATGGAACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 15 CAAGCAGAAGACGGCATACGAGATTGACATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 16 CAAGCAGAAGACGGCATACGAGATGGACGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 17 CAAGCAGAAGACGGCATACGAGATCTCTACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 18 CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 19 CAAGCAGAAGACGGCATACGAGATTTTCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 20 CAAGCAGAAGACGGCATACGAGATGGCCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 21 CAAGCAGAAGACGGCATACGAGATCGAAACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 22 CAAGCAGAAGACGGCATACGAGATCGTACGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 23 CAAGCAGAAGACGGCATACGAGATCCACTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 24 CAAGCAGAAGACGGCATACGAGATGCTACCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 25 CAAGCAGAAGACGGCATACGAGATATCAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 26 CAAGCAGAAGACGGCATACGAGATGCTCATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 27 CAAGCAGAAGACGGCATACGAGATAGGAATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 28 CAAGCAGAAGACGGCATACGAGATCTTTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 29 CAAGCAGAAGACGGCATACGAGATTAGTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 30 CAAGCAGAAGACGGCATACGAGATCCGGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 31 CAAGCAGAAGACGGCATACGAGATATCGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 32 CAAGCAGAAGACGGCATACGAGATTGAGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 33 CAAGCAGAAGACGGCATACGAGATCGCCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 34 CAAGCAGAAGACGGCATACGAGATGCCATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 35 CAAGCAGAAGACGGCATACGAGATAAAATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 36 CAAGCAGAAGACGGCATACGAGATTGTTGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 37 CAAGCAGAAGACGGCATACGAGATATTCCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 38 CAAGCAGAAGACGGCATACGAGATAGCTAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 39 CAAGCAGAAGACGGCATACGAGATGTATAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 40 CAAGCAGAAGACGGCATACGAGATTCTGAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 41 CAAGCAGAAGACGGCATACGAGATGTCGTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 42 CAAGCAGAAGACGGCATACGAGATCGATTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 43 CAAGCAGAAGACGGCATACGAGATGCTGTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 44 CAAGCAGAAGACGGCATACGAGATATTATAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 45 CAAGCAGAAGACGGCATACGAGATGAATGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 46 CAAGCAGAAGACGGCATACGAGATTCGGGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 47 CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA +RNA PCR Primer, Index 48 CAAGCAGAAGACGGCATACGAGATTGCCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA + +ABI Dynabead EcoP Oligo CTGATCTAGAGGTACCGGATCCCAGCAGT +ABI Solid3 Adapter A CTGCCCCGGGTTCCTCATTCTCTCAGCAGCATG +ABI Solid3 Adapter B CCACTACGCCTCCGCTTTCCTCTCTATGGGCAGTCGGTGAT +ABI Solid3 5' AMP Primer CCACTACGCCTCCGCTTTCCTCTCTATG +ABI Solid3 3' AMP Primer CTGCCCCGGGTTCCTCATTCT +ABI Solid3 EF1 alpha Sense Primer CATGTGTGTTGAGAGCTTC +ABI Solid3 EF1 alpha Antisense Primer GAAAACCAAAGTGGTCCAC +ABI Solid3 GAPDH Forward Primer TTAGCACCCCTGGCCAAGG +ABI Solid3 GAPDH Reverse Primer CTTACTCCTTGGAGGCCATG diff --git a/flexiprep/src/test/resources/v0101.fq_fastqc/fastqc_data.txt b/flexiprep/src/test/resources/v0101.fq_fastqc/fastqc_data.txt index 74938a52b7d505b1185b1962ffe7234ddb304a52..02b9e3f0cbf01c6ce54fa715df93d7cfc6ba4bab 100644 --- a/flexiprep/src/test/resources/v0101.fq_fastqc/fastqc_data.txt +++ b/flexiprep/src/test/resources/v0101.fq_fastqc/fastqc_data.txt @@ -1,29 +1,13 @@ -==== - Biopet is built on top of GATK Queue for building bioinformatic - pipelines. It is mainly intended to support LUMC SHARK cluster which is running - SGE. But other types of HPC that are supported by GATK Queue (such as PBS) - should also be able to execute Biopet tools and pipelines. - - Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center - - Contact us at: sasc@lumc.nl - - A dual licensing mode is applied. The source code within this project that are - not part of GATK Queue is freely available for non-commercial use under an AGPL - license; For commercial users or users who do not want to follow the AGPL - license, please contact us to obtain a separate license. -==== - -##FastQC 0.10.1 +##FastQC 0.11.2 >>Basic Statistics pass -#Measure Value -Filename ct_r1.fq -File type Conventional base calls -Encoding Sanger / Illumina 1.9 -Total Sequences 1000 -Filtered Sequences 0 -Sequence length 100 -%GC 53 +#Measure Value +Filename ct_r1.fq.gz +File type Conventional base calls +Encoding Sanger / Illumina 1.9 +Total Sequences 1000 +Sequences flagged as poor quality 0 +Sequence length 100 +%GC 53 >>END_MODULE >>Per base sequence quality fail #Base Mean Median Lower Quartile Upper Quartile 10th Percentile 90th Percentile @@ -36,26 +20,111 @@ Sequence length 100 7 35.783 37.0 35.0 37.0 35.0 37.0 8 36.008 37.0 35.0 37.0 35.0 37.0 9 37.706 39.0 37.0 39.0 35.0 39.0 -10-14 37.857600000000005 39.2 37.2 39.4 34.8 39.4 -15-19 38.9788 40.2 38.0 41.0 35.0 41.0 -20-24 38.8246 40.0 38.0 41.0 34.8 41.0 -25-29 38.589600000000004 40.0 38.0 41.0 34.4 41.0 -30-34 38.3568 40.0 38.0 41.0 33.8 41.0 -35-39 38.1592 40.0 37.4 41.0 33.6 41.0 -40-44 37.4808 39.8 36.0 41.0 32.6 41.0 -45-49 36.9478 39.0 35.0 40.8 31.2 41.0 -50-54 35.845600000000005 37.8 34.6 40.0 29.4 41.0 -55-59 34.739 36.6 33.6 40.0 27.4 41.0 -60-64 34.1336 35.4 33.4 38.6 27.2 40.2 -65-69 32.7464 35.0 32.6 37.2 24.6 39.6 -70-74 29.3478 34.0 29.6 35.6 2.0 38.6 -75-79 27.4908 33.2 26.4 35.0 2.0 36.6 -80-84 25.893000000000008 33.0 21.8 35.0 2.0 35.4 -85-89 25.031799999999997 32.4 16.2 34.6 2.0 35.0 -90-94 23.9446 31.4 6.4 34.0 2.0 35.0 -95-99 22.9358 30.4 2.0 34.0 2.0 35.0 +10-11 37.709 39.0 37.0 39.0 35.0 39.0 +12-13 37.6135 39.0 37.0 39.0 35.0 39.0 +14-15 38.793 40.0 38.0 41.0 34.5 41.0 +16-17 39.033500000000004 40.5 38.0 41.0 35.0 41.0 +18-19 38.942 40.0 38.0 41.0 35.0 41.0 +20-21 38.888 40.0 38.0 41.0 35.0 41.0 +22-23 38.807 40.0 38.0 41.0 35.0 41.0 +24-25 38.702 40.0 38.0 41.0 34.0 41.0 +26-27 38.65 40.0 38.0 41.0 34.5 41.0 +28-29 38.4885 40.0 38.0 41.0 34.5 41.0 +30-31 38.307 40.0 38.0 41.0 34.0 41.0 +32-33 38.433499999999995 40.0 38.0 41.0 34.0 41.0 +34-35 38.3425 40.0 38.0 41.0 33.5 41.0 +36-37 38.1185 40.0 37.5 41.0 33.5 41.0 +38-39 38.088499999999996 40.0 37.0 41.0 33.5 41.0 +40-41 37.555 40.0 36.0 41.0 32.5 41.0 +42-43 37.504999999999995 40.0 36.0 41.0 33.0 41.0 +44-45 37.167 39.0 35.5 41.0 32.0 41.0 +46-47 36.980999999999995 39.0 35.0 41.0 31.0 41.0 +48-49 36.8635 39.0 35.0 40.5 31.0 41.0 +50-51 36.4125 38.5 35.0 40.0 30.5 41.0 +52-53 35.528000000000006 37.5 34.5 40.0 28.5 41.0 +54-55 34.925 37.0 33.5 40.0 27.5 41.0 +56-57 34.8735 37.0 34.0 40.0 27.5 41.0 +58-59 34.7225 36.0 33.5 40.0 28.0 41.0 +60-61 34.67400000000001 36.0 34.0 39.0 28.5 40.5 +62-63 33.841499999999996 35.0 33.0 38.5 26.5 40.0 +64-65 33.549 35.0 33.0 38.0 26.0 40.0 +66-67 32.971999999999994 35.0 33.0 37.0 26.0 40.0 +68-69 32.1635 35.0 32.0 37.0 22.5 39.0 +70-71 30.002000000000002 34.0 30.5 36.0 2.0 39.0 +72-73 29.0695 34.0 29.0 35.5 2.0 38.5 +74-75 28.641 34.0 29.0 35.0 2.0 38.0 +76-77 27.8495 33.0 27.5 35.0 2.0 36.0 +78-79 26.5345 33.0 24.0 35.0 2.0 36.5 +80-81 26.140500000000003 33.0 23.0 35.0 2.0 36.0 +82-83 25.784 33.0 21.5 35.0 2.0 35.0 +84-85 25.6115 33.0 20.0 35.0 2.0 35.0 +86-87 25.1755 33.0 17.0 35.0 2.0 35.0 +88-89 24.600499999999997 31.5 13.5 34.0 2.0 35.0 +90-91 24.088 31.5 6.5 34.0 2.0 35.0 +92-93 24.16 32.0 8.5 34.0 2.0 35.0 +94-95 23.02 30.0 2.0 34.0 2.0 35.0 +96-97 23.183 30.5 2.0 34.0 2.0 35.0 +98-99 22.75 30.5 2.0 34.0 2.0 35.0 100 21.984 30.0 2.0 34.0 2.0 35.0 >>END_MODULE +>>Per tile sequence quality pass +#Tile Base Mean +1101 1 0.0 +1101 2 0.0 +1101 3 0.0 +1101 4 0.0 +1101 5 0.0 +1101 6 0.0 +1101 7 0.0 +1101 8 0.0 +1101 9 0.0 +1101 10-11 0.0 +1101 12-13 0.0 +1101 14-15 0.0 +1101 16-17 0.0 +1101 18-19 0.0 +1101 20-21 0.0 +1101 22-23 0.0 +1101 24-25 0.0 +1101 26-27 0.0 +1101 28-29 0.0 +1101 30-31 0.0 +1101 32-33 0.0 +1101 34-35 0.0 +1101 36-37 0.0 +1101 38-39 0.0 +1101 40-41 0.0 +1101 42-43 0.0 +1101 44-45 0.0 +1101 46-47 0.0 +1101 48-49 0.0 +1101 50-51 0.0 +1101 52-53 0.0 +1101 54-55 0.0 +1101 56-57 0.0 +1101 58-59 0.0 +1101 60-61 0.0 +1101 62-63 0.0 +1101 64-65 0.0 +1101 66-67 0.0 +1101 68-69 0.0 +1101 70-71 0.0 +1101 72-73 0.0 +1101 74-75 0.0 +1101 76-77 0.0 +1101 78-79 0.0 +1101 80-81 0.0 +1101 82-83 0.0 +1101 84-85 0.0 +1101 86-87 0.0 +1101 88-89 0.0 +1101 90-91 0.0 +1101 92-93 0.0 +1101 94-95 0.0 +1101 96-97 0.0 +1101 98-99 0.0 +1101 100 0.0 +>>END_MODULE >>Per sequence quality scores pass #Quality Count 11 1.0 @@ -99,57 +168,53 @@ Sequence length 100 7 20.9 24.7 32.6 21.8 8 20.0 27.200000000000003 30.0 22.8 9 24.5 21.5 27.800000000000004 26.200000000000003 -10-14 25.22 23.28 26.26 25.240000000000002 -15-19 26.44 21.34 26.1 26.119999999999997 -20-24 25.240000000000002 22.1 24.6 28.060000000000002 -25-29 24.62 22.06 25.119999999999997 28.199999999999996 -30-34 26.240000000000002 21.44 24.279999999999998 28.04 -35-39 24.8 22.439999999999998 24.34 28.42 -40-44 25.8 22.84 23.9 27.46 -45-49 26.26 22.64 23.66 27.439999999999998 -50-54 26.72 22.58 23.18 27.52 -55-59 25.019999999999996 22.58 24.38 28.02 -60-64 26.251501802162597 22.00640768922707 23.28794553464157 28.454144973968766 -65-69 25.683829444891394 23.873692679002414 23.049074818986323 27.39340305711987 -70-74 25.554134697357206 25.44757033248082 21.717817561807333 27.28047740835465 -75-79 25.818501428257523 23.643155350472423 23.071852340145025 27.466490881125026 -80-84 26.973532796317606 23.95857307249712 21.74913693901036 27.318757192174914 -85-89 25.452016689847014 24.849327770050998 22.624014835419565 27.07464070468243 -90-94 24.547101449275363 22.35054347826087 24.139492753623188 28.962862318840582 -95-99 25.318837549655026 24.231653773782146 23.186284758519758 27.263223918043067 +10-11 25.15 24.0 27.55 23.3 +12-13 26.200000000000003 22.3 24.65 26.85 +14-15 24.75 21.95 26.3 27.0 +16-17 25.4 21.7 26.55 26.35 +18-19 27.650000000000002 21.6 25.85 24.9 +20-21 24.8 21.8 24.3 29.099999999999998 +22-23 25.900000000000002 23.05 24.15 26.900000000000002 +24-25 24.85 21.4 25.900000000000002 27.85 +26-27 24.7 20.849999999999998 25.0 29.45 +28-29 24.4 23.3 24.95 27.35 +30-31 27.35 20.95 25.15 26.55 +32-33 24.9 22.05 23.400000000000002 29.65 +34-35 25.6 22.15 25.900000000000002 26.35 +36-37 24.95 21.2 23.400000000000002 30.45 +38-39 24.8 23.35 23.7 28.15 +40-41 27.0 23.35 23.599999999999998 26.05 +42-43 25.15 22.35 23.799999999999997 28.7 +44-45 26.200000000000003 20.7 24.3 28.799999999999997 +46-47 26.3 24.0 23.150000000000002 26.55 +48-49 25.5 23.3 24.05 27.150000000000002 +50-51 27.55 22.75 23.7 26.0 +52-53 24.45 23.400000000000002 23.1 29.049999999999997 +54-55 27.450000000000003 21.85 23.0 27.700000000000003 +56-57 25.85 22.15 23.5 28.499999999999996 +58-59 24.05 22.75 25.6 27.6 +60-61 25.25 20.95 23.45 30.349999999999998 +62-63 27.3 21.9 23.7 27.1 +64-65 26.178535606820464 24.57372116349047 22.617853560682047 26.629889669007024 +66-67 25.7 23.75 22.05 28.499999999999996 +68-69 25.405679513184587 23.52941176470588 24.036511156186613 27.028397565922923 +70-71 25.159574468085104 23.085106382978722 23.138297872340424 28.617021276595743 +72-73 26.031065881092662 26.513122656668454 20.51419389394751 26.941617568291377 +74-75 25.197680548234054 26.56826568265683 21.929362150764366 26.304691618344755 +76-77 25.911812738160044 23.51660315732172 24.550898203592812 26.02068590092542 +78-79 26.16345062429058 22.985244040862657 21.793416572077184 29.05788876276958 +80-81 26.98324022346369 25.474860335195533 21.005586592178773 26.536312849162012 +82-83 26.46370023419204 24.355971896955502 22.131147540983605 27.049180327868854 +84-85 26.124567474048444 23.18339100346021 22.145328719723185 28.546712802768166 +86-87 25.976331360946748 25.443786982248522 22.36686390532544 26.21301775147929 +88-89 25.503742084052966 23.54634427173287 23.316062176165804 27.63385146804836 +90-91 23.832052040212893 21.525724423418097 25.901833234772326 28.74039030159669 +92-93 24.525139664804467 22.849162011173185 23.743016759776538 28.88268156424581 +94-95 25.161987041036717 24.028077753779698 22.4622030237581 28.347732181425485 +96-97 25.37393162393162 24.412393162393162 23.664529914529915 26.549145299145298 +98-99 25.67703109327984 23.620862587763288 22.71815446339017 27.9839518555667 100 24.0 26.0 21.9 28.1 >>END_MODULE ->>Per base GC content fail -#Base %GC -1 71.01303911735206 -2 64.1 -3 73.3 -4 65.3 -5 55.800000000000004 -6 87.3 -7 42.699999999999996 -8 42.8 -9 50.7 -10-14 50.46000000000001 -15-19 52.559999999999995 -20-24 53.300000000000004 -25-29 52.82 -30-34 54.279999999999994 -35-39 53.22 -40-44 53.26 -45-49 53.7 -50-54 54.24 -55-59 53.04 -60-64 54.70564677613135 -65-69 53.07723250201126 -70-74 52.834612105711855 -75-79 53.28499230938255 -80-84 54.29228998849251 -85-89 52.526657394529444 -90-94 53.509963768115945 -95-99 52.5820614676981 -100 52.1 ->>END_MODULE >>Per sequence GC content fail #GC Content Count 0 0.0 @@ -265,24 +330,51 @@ Sequence length 100 7 0.0 8 0.0 9 0.0 -10-14 0.0 -15-19 0.0 -20-24 0.0 -25-29 0.0 -30-34 0.0 -35-39 0.0 -40-44 0.0 -45-49 0.0 -50-54 0.0 -55-59 0.0 -60-64 0.12 -65-69 0.5599999999999999 -70-74 6.16 -75-79 8.98 -80-84 13.100000000000001 -85-89 13.719999999999999 -90-94 11.68 -95-99 4.34 +10-11 0.0 +12-13 0.0 +14-15 0.0 +16-17 0.0 +18-19 0.0 +20-21 0.0 +22-23 0.0 +24-25 0.0 +26-27 0.0 +28-29 0.0 +30-31 0.0 +32-33 0.0 +34-35 0.0 +36-37 0.0 +38-39 0.0 +40-41 0.0 +42-43 0.0 +44-45 0.0 +46-47 0.0 +48-49 0.0 +50-51 0.0 +52-53 0.0 +54-55 0.0 +56-57 0.0 +58-59 0.0 +60-61 0.0 +62-63 0.0 +64-65 0.3 +66-67 0.0 +68-69 1.4000000000000001 +70-71 6.0 +72-73 6.65 +74-75 5.1499999999999995 +76-77 8.15 +78-79 11.899999999999999 +80-81 10.5 +82-83 14.6 +84-85 13.3 +86-87 15.5 +88-89 13.15 +90-91 15.45 +92-93 10.5 +94-95 7.3999999999999995 +96-97 6.4 +98-99 0.3 100 0.0 >>END_MODULE >>Sequence Length Distribution pass @@ -290,565 +382,85 @@ Sequence length 100 100 1000.0 >>END_MODULE >>Sequence Duplication Levels pass -#Total Duplicate Percentage 3.4 -#Duplication Level Relative count -1 100.0 -2 0.4140786749482402 -3 0.0 -4 0.0 -5 0.0 -6 0.0 -7 0.0 -8 0.0 -9 0.0 -10++ 0.2070393374741201 +#Total Deduplicated Percentage 97.2 +#Duplication Level Percentage of deduplicated Percentage of total +1 99.38271604938271 96.6 +2 0.411522633744856 0.8 +3 0.0 0.0 +4 0.0 0.0 +5 0.0 0.0 +6 0.0 0.0 +7 0.0 0.0 +8 0.0 0.0 +9 0.0 0.0 +>10 0.205761316872428 2.6 +>50 0.0 0.0 +>100 0.0 0.0 +>500 0.0 0.0 +>1k 0.0 0.0 +>5k 0.0 0.0 +>10k+ 0.0 0.0 >>END_MODULE >>Overrepresented sequences fail #Sequence Count Percentage Possible Source -AGATCGGAAGAGCACACGTCTGAACTCCAGTCACTTCCAAGATCTCGTAT 14 1.4000000000000001 TruSeq Adapter, Index 1 (97% over 36bp) -GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTCCAAGATCTCGTATG 12 1.2 TruSeq Adapter, Index 1 (97% over 36bp) +AGATCGGAAGAGCACACGTCTGAACTCCAGTCACTTCCAAGATCTCGTAT 14 1.4000000000000001 TruSeq Adapter, Index 18 (97% over 37bp) +GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTCCAAGATCTCGTATG 12 1.2 TruSeq Adapter, Index 18 (97% over 37bp) AGGGGGAATGATGGTTGTCTTTGGATATACTACAGCGATGGCTATTGAGG 2 0.2 No Hit GGCTTGTTTTATTTTAATGGCTGATCTATGTAATCACAGAGGCCAGTATG 2 0.2 No Hit GTGGGGTGGTGTTTGTGGGGGACTTCATCATCTCAGGCTTCCCAGGGTCC 2 0.2 No Hit -CGGAAGAGCACACGTCTGAACTCCAGTCACTTCCAAGATCTCGTATGCCG 2 0.2 TruSeq Adapter, Index 1 (96% over 33bp) +CGGAAGAGCACACGTCTGAACTCCAGTCACTTCCAAGATCTCGTATGCCG 2 0.2 TruSeq Adapter, Index 18 (97% over 34bp) +>>END_MODULE +>>Adapter Content fail +#Position Illumina Universal Adapter Illumina Small RNA Adapter Nextera Transposase Sequence +1 1.4 0.0 0.0 +2 1.4 0.0 0.0 +3 1.4 0.0 0.0 +4 1.4 0.0 0.0 +5 1.4 0.0 0.0 +6 1.4 0.0 0.0 +7 1.4 0.0 0.0 +8 1.4 0.0 0.0 +9 1.4 0.0 0.0 +10-11 1.4 0.0 0.0 +12-13 1.4 0.0 0.0 +14-15 1.4 0.0 0.0 +16-17 1.4 0.0 0.0 +18-19 1.4 0.0 0.0 +20-21 1.5 0.0 0.0 +22-23 1.5 0.0 0.0 +24-25 1.5 0.0 0.0 +26-27 1.5 0.0 0.0 +28-29 1.6 0.0 0.0 +30-31 1.7 0.0 0.0 +32-33 1.9 0.0 0.0 +34-35 2.4 0.0 0.0 +36-37 2.45 0.0 0.0 +38-39 2.95 0.0 0.0 +40-41 3.25 0.0 0.0 +42-43 3.75 0.0 0.0 +44-45 4.2 0.0 0.0 +46-47 4.9 0.0 0.0 +48-49 5.699999999999999 0.0 0.0 +50-51 6.300000000000001 0.0 0.0 +52-53 6.949999999999999 0.0 0.0 +54-55 7.65 0.0 0.0 +56-57 8.399999999999999 0.0 0.0 +58-59 9.350000000000001 0.0 0.0 +60-61 9.899999999999999 0.0 0.0 +62-63 10.600000000000001 0.0 0.0 +64-65 11.3 0.0 0.0 +66-67 12.0 0.0 0.0 +68-69 13.05 0.0 0.0 +70-71 13.6 0.0 0.0 +72-73 14.5 0.0 0.0 +74-75 15.55 0.0 0.0 +76-77 16.15 0.0 0.0 +78-79 17.2 0.0 0.0 +80-81 17.700000000000003 0.0 0.0 +82-83 18.15 0.0 0.0 +84-85 18.75 0.0 0.0 +86-87 19.799999999999997 0.0 0.0 +88 20.6 0.0 0.0 >>END_MODULE ->>Kmer Content fail -#Sequence Count Obs/Exp Overall Obs/Exp Max Max Obs/Exp Position -AAAAA 385 7.3597403 68.038994 65-69 -AGATC 435 5.4375157 23.135067 1 -GAAGA 375 5.258809 32.443344 6 -GGAAG 420 5.044668 33.345257 5 -TCCAG 475 4.8355613 14.131038 2 -AAGAG 320 4.487517 25.954676 7 -CCAGG 475 4.4180827 17.21471 3 -GAGCA 380 4.3399205 21.1377 9 -AGCAC 395 4.2895336 15.0741825 7 -CTCCA 415 4.0171337 12.105032 95-96 -AGAGC 340 3.883087 21.137697 8 -TTTTT 280 3.8749053 8.964593 10-14 -CTTCT 370 3.8646336 11.598914 55-59 -CTGAA 305 3.812511 13.130004 90-94 -CGGAA 320 3.65467 26.422123 5 -ACCAG 335 3.6379597 10.049457 7 -TCTGA 310 3.6325634 12.308498 90-94 -CACAC 340 3.5108058 14.806036 85-89 -ATCGG 325 3.4795394 24.768969 3 -TCGGA 320 3.426008 19.815174 3 -GATCG 320 3.426008 19.815174 1 -CGTCT 355 3.387832 11.578538 85-89 -CTGCT 355 3.387832 17.662533 3 -GCACA 310 3.3664696 15.0741825 8 -TCTTC 320 3.3423858 7.7326093 50-54 -CAGCA 305 3.3121717 10.049455 6 -GAACT 260 3.2500093 13.130004 90-94 -GTCTG 320 3.2116532 12.65067 90-94 -CAGGA 280 3.197836 15.8532715 3 -AACTC 265 3.1497202 23.781752 95-96 -TGAAC 250 3.125009 13.130004 90-94 -CCAGC 350 3.0954454 6.6359653 95-96 -AGTCA 240 3.0000086 10.41078 25-29 -CACCA 290 2.9945107 6.079907 70-74 -TGCTG 295 2.960743 9.2877 2 -CAGAT 230 2.875008 11.040063 70-74 -CTTCC 315 2.8583732 10.916445 30-34 -CACGT 280 2.8504362 12.351324 85-89 -CAGGG 290 2.8367646 22.630535 9 -ACACG 260 2.8234906 13.175687 85-89 -TTCCA 250 2.7855206 9.279795 30-34 -TTCTT 230 2.765239 6.6755276 50-54 -AGCAG 240 2.7410026 15.853272 2 -TTCTG 240 2.6363494 10.165324 55-59 -ACTCC 270 2.6135564 14.526036 95-96 -GCCAG 280 2.6043434 8.607355 1 -ACGTC 255 2.595933 10.105629 85-89 -GATCT 220 2.5779483 8.675031 40-44 -TCTGC 265 2.5289452 13.2469015 2 -AAGAT 160 2.4557784 12.783248 35-39 -ATCTC 220 2.4512577 9.279794 40-44 -CAGTC 240 2.4432309 8.554544 90-94 -TCCAA 205 2.4365761 10.999062 7 -CTTTT 200 2.4045558 16.688818 6 -TTCCT 230 2.40234 9.665762 7 -CCAGT 235 2.3923304 9.4206915 25-29 -TTTCT 195 2.3444414 16.688818 8 -CTGGG 255 2.3383298 6.004135 80-84 -TGCTT 210 2.3068056 10.165323 4 -TCTTT 190 2.284328 5.5629396 15-19 -TTTTC 190 2.2843277 11.125878 7 -GGGGG 255 2.2468696 16.307867 2 -AGGAA 160 2.2437584 19.466007 5 -GTCAC 220 2.2396283 10.184532 95-96 -TCACT 200 2.2284167 8.360176 95-96 -CACTT 200 2.2284167 10.3108835 30-34 -GAAAA 135 2.2103586 10.606119 60-64 -ACTTC 195 2.172706 9.279794 30-34 -TTGAA 150 2.1582448 11.9834385 60-64 -CTCCT 235 2.1324375 16.794533 4 -TCCTC 235 2.1324372 8.397265 5 -ATCTT 165 2.11616 7.1210704 10-14 -GGGGA 205 2.1089406 14.2801 3 -ACACA 165 2.092039 11.7331705 8 -TGCAG 195 2.0877237 9.907587 5 -GACCA 190 2.0633202 10.049455 6 -AGGGG 200 2.057503 9.520067 1 -CCTCC 260 2.049668 14.590484 5 -AGGAG 170 2.0418897 5.557543 2 -TCCTT 195 2.0367663 14.498643 4 -GTCTT 185 2.032186 15.247986 7 -GCTGG 220 2.0173824 8.485845 1 -CCAGA 185 2.0090222 5.3284492 70-74 -CCTGG 230 2.0054333 8.068818 3 -GCAGG 205 2.005299 9.052214 3 -GGACC 215 1.9997637 8.607355 5 -TTCAT 155 1.987908 5.934226 2 -CCTTT 190 1.9845415 14.498643 5 -TTTCC 190 1.9845415 5.799457 15-19 -TGGCA 185 1.980661 14.861383 2 -TCTTG 180 1.977262 10.165323 5 -CCAAG 180 1.9547247 9.044511 35-39 -CTTCA 175 1.9498644 10.310883 6 -CAAGA 145 1.933477 12.339583 35-39 -CTGGA 180 1.9271295 9.907587 6 -GGCTG 210 1.9256833 16.97169 2 -AATGA 125 1.918577 7.677627 95-96 -TGAAA 125 1.918577 15.623971 60-64 -GCTTC 200 1.9086379 13.2469015 2 -GTCCA 185 1.8833237 14.131036 1 -AGAAA 115 1.882898 7.5757995 7 -TGGGG 195 1.8805519 13.386638 1 -TTCTC 180 1.880092 5.799457 25-29 -CTTGA 160 1.8748715 8.675031 60-64 -ACAAA 120 1.8682072 5.762797 40-44 -TCTCG 195 1.8609219 8.831266 5 -GGGAC 190 1.8585701 9.052216 5 -TGAGG 165 1.8578365 5.209824 2 -TGAAG 140 1.8404517 6.082693 2 -CATCT 165 1.8384434 5.155441 4 -CACTG 180 1.8324232 9.4206915 6 -CTGCA 180 1.8324231 5.3465896 90-94 -GCTGC 210 1.8310483 8.068819 1 -GCAGA 160 1.8273348 10.568848 3 -CCTTC 200 1.8148402 8.397265 9 -AGGGA 150 1.8016673 6.0081544 95-96 -TTTCA 140 1.7955297 7.1210704 15-19 -CACAG 165 1.7918309 5.432139 95-96 -AAACA 115 1.7903653 7.6389136 70-74 -ATTTT 120 1.7715117 13.661307 6 -TTTTG 140 1.7701824 17.551357 7 -GGGGC 210 1.7594293 11.629828 3 -GATTT 130 1.7534488 12.481857 6 -CAAAT 120 1.7513192 6.7527947 50-54 -GAGGG 170 1.7488776 9.520067 1 -GAAGG 145 1.7416117 6.0081544 95-96 -CATTT 135 1.7314036 5.9342256 5 -ATTTC 135 1.7314036 5.9342256 7 -CCTCT 190 1.7240983 8.397266 1 -ATCCA 145 1.7234317 5.49953 4 -GCAGC 185 1.7207267 6.9789357 95-96 -TCCTG 180 1.717774 13.2469 2 -CTCTG 180 1.717774 13.2469 2 -AAAAC 110 1.7125233 7.6389136 70-74 -CTTGG 170 1.7061908 9.2877 2 -AAAAT 95 1.7024158 8.291661 9 -TCACC 175 1.693972 8.957724 8 -TCCAC 175 1.693972 8.957724 5 -GAGAA 120 1.6828189 6.488669 6 -TCTCC 185 1.6787271 5.038359 55-59 -GAGCC 180 1.6742208 8.607355 9 -TCATC 150 1.6713123 5.1554413 2 -AGACA 125 1.6667906 6.169792 2 -TGATG 135 1.6636823 11.404236 9 -GGGAG 160 1.6460025 9.520067 1 -AGCCA 150 1.6289369 6.029673 10-14 -ATGCC 160 1.6288207 8.478622 45-49 -CTCGT 170 1.6223421 8.831266 3 -GAGGA 135 1.6215005 11.115086 3 -TGTTG 140 1.6173534 10.690706 2 -CTCAT 145 1.6156021 5.1554418 2 -CAGGT 150 1.6059413 9.907587 4 -GCTTG 160 1.6058266 9.2877 60-64 -GGGTC 175 1.6047363 12.728768 2 -TCATT 125 1.6031516 5.934226 9 -GTTGA 130 1.6020645 5.702118 1 -ACAGA 120 1.6001189 10.005068 95-96 -GGAGG 155 1.5945649 9.520067 2 -GGGGT 165 1.5912362 13.386638 1 -TGGGA 140 1.5763463 10.419649 2 -GGATG 140 1.5763462 15.629472 6 -GCCTC 190 1.575248 7.672287 2 -CCTGC 190 1.5752479 11.508429 2 -GCTCC 190 1.5752479 11.508429 6 -TCTCT 150 1.5667434 5.224736 95-96 -GGGAA 130 1.561445 11.115086 4 -TCCAT 140 1.5598917 10.3108835 8 -GGCTT 155 1.5556445 13.93155 1 -TTGAT 115 1.5511277 6.240928 4 -CATCA 130 1.5451456 5.49953 2 -AGAGA 110 1.542584 6.488669 9 -AGGAC 135 1.541814 6.341309 55-59 -GTATG 125 1.5404466 9.123388 45-49 -AACAT 105 1.5324043 13.5055895 9 -AGCTC 150 1.5270194 9.4206915 5 -TTTGT 120 1.5172992 17.551357 8 -GATGA 115 1.5117996 6.082693 5 -GAGAT 115 1.5117996 6.082693 4 -AGGAT 115 1.5117996 12.165386 4 -TGAGA 115 1.5117996 6.082693 5 -CTGGT 150 1.5054625 9.2877 4 -GCTGT 150 1.5054625 18.5754 3 -TTCAC 135 1.504181 10.310883 7 -CCCAG 170 1.5035021 12.276537 2 -CAGTG 140 1.4988785 9.907587 5 -CTCCC 190 1.4978343 7.295242 1 -CCCTG 180 1.4923402 11.5084305 2 -CAGAG 130 1.4847097 7.398194 20-24 -CTTTG 135 1.4829465 10.165323 2 -CAAAA 95 1.4789973 7.203496 9 -TCTCA 130 1.4484707 5.1554413 8 -GAATG 110 1.4460692 12.165386 7 -GGAAT 110 1.4460692 12.165386 5 -TTTGG 125 1.4440656 5.345353 7 -GGCCT 165 1.4386805 12.103227 1 -GCTCT 150 1.4314783 6.1818867 20-24 -TCTGT 130 1.4280226 15.247986 3 -CTGTT 130 1.4280226 15.247986 4 -AGGTT 115 1.4172109 11.404235 8 -TTGAG 115 1.4172107 5.702117 4 -TTTGA 105 1.416247 7.4891143 10-14 -ATCTG 120 1.4061534 5.4218936 2 -GGTCT 140 1.4050984 9.287701 6 -TTTTA 95 1.4024467 7.384491 95-96 -GGGTG 145 1.3983592 13.386638 2 -GGCAC 150 1.3951839 8.607355 4 -AAAGA 85 1.3917071 7.5757985 8 -AAGAA 85 1.3917071 5.254889 75-79 -TTGTT 110 1.3908576 5.850453 4 -GGAGA 115 1.3812783 5.557543 3 -ATGAC 110 1.3750039 6.252721 95-96 -TGTTC 125 1.3730987 10.165325 5 -GGGCA 140 1.3694727 9.052216 4 -ATGAT 95 1.3668885 6.6574664 6 -CCACT 140 1.3551775 5.3746343 30-34 -TGGCT 135 1.3549163 13.931552 3 -GATGG 120 1.3511539 10.419648 9 -TCGTA 115 1.3475639 5.421894 40-44 -TGTCA 115 1.3475639 5.421894 5 -GCTGA 125 1.3382844 9.907587 6 -CAGAA 100 1.3334324 5.6025352 90-94 -CCAAA 105 1.3312978 5.8665853 8 -GGGCT 145 1.3296387 12.728768 1 -TAGGA 100 1.3146083 12.165386 4 -GACAG 115 1.313397 5.2844243 1 -GGTCC 150 1.3078917 8.068819 6 -CCATC 135 1.3067783 8.957724 9 -AAATG 85 1.3046323 7.101804 6 -TTCAA 95 1.2997144 6.330293 9 -CGTAT 110 1.2889742 8.675031 45-49 -TGACT 110 1.2889742 5.421894 3 -TATGC 110 1.2889739 8.67503 45-49 -GCCCT 155 1.2850707 7.672287 3 -TGGGC 140 1.283789 8.485846 7 -ACTTT 100 1.2825212 5.9342256 1 -ATGTT 95 1.2813665 6.2409286 1 -ATTTG 95 1.2813663 12.481856 9 -TGGTT 110 1.2707777 5.345353 5 -TGGTG 120 1.2666163 9.767722 7 -GTTTT 100 1.2644161 5.8504534 6 -GCCTG 145 1.2642952 12.103229 1 -TTGCT 115 1.2632507 6.0991945 50-54 -CCACC 150 1.2614243 7.7821474 5 -GGACA 110 1.2562928 15.853274 6 -GAAGC 110 1.2562928 10.568849 9 -TGACA 100 1.2500036 5.7837667 9 -GACAT 100 1.2500035 11.567533 7 -TGGAA 95 1.248878 6.082693 5 -ACAGC 115 1.2488517 10.049455 5 -AATCC 105 1.2480024 5.499531 7 -TGCCT 130 1.2406145 8.831266 3 -AGGTG 110 1.2385577 5.209824 4 -GTGGC 135 1.2379395 12.728768 1 -CATGT 105 1.2303842 5.4218936 1 -TAGAT 85 1.2230055 6.0453725 90-94 -CCCTC 155 1.2219174 7.295242 4 -GCCGT 140 1.2206988 8.068819 3 -AGTTT 90 1.2139261 6.2409286 7 -TTTAG 90 1.213926 6.240928 8 -TTGGG 115 1.2138406 9.767722 2 -ACCTC 125 1.20998 8.957724 1 -AGCAA 90 1.2000892 6.169792 9 -CAAAG 90 1.2000891 6.169791 5 -AAAGC 90 1.2000891 6.169791 6 -ACAGG 105 1.1991886 10.568849 8 -AGGCA 105 1.1991886 5.712891 95-96 -ATCAG 95 1.1875033 5.7837663 6 -ATGAG 90 1.1831475 6.082693 25-29 -CAGTT 100 1.1717947 5.1698627 85-89 -ATGCT 100 1.1717947 5.421894 8 -TCAAT 85 1.1629024 6.3302937 10-14 -TGTGT 100 1.1552525 10.690706 3 -GCCCA 130 1.1497369 12.276536 1 -TGATT 85 1.1464858 12.481857 5 -TGCTC 120 1.1451827 8.831267 4 -TGTCC 120 1.1451827 13.2469015 2 -TCCCC 145 1.143084 7.295242 2 -AAGGC 100 1.1420842 5.493164 65-69 -CAACA 90 1.1411123 5.8665853 8 -CACAA 90 1.1411123 11.7331705 9 -ACATC 95 1.129145 5.4995303 8 -AAGCT 90 1.1250031 6.2527194 95-96 -GAAAG 80 1.1218792 12.977338 7 -AAGGA 80 1.1218792 6.488669 3 -GCACT 110 1.1198142 9.4206915 5 -CCTGA 110 1.119814 9.420691 9 -ACCTT 100 1.1142083 5.1554418 7 -GTCAT 95 1.113205 5.421894 1 -TGATC 95 1.113205 10.843788 5 -TCATG 95 1.113205 5.421894 3 -TGGAT 90 1.1091216 5.702118 9 -GTGGG 115 1.1090435 8.924425 1 -CTGTG 110 1.1040058 9.2877 4 -GCTTT 100 1.0984789 5.4947696 95-96 -TGTCT 100 1.0984789 10.165323 5 -TTGGT 95 1.0974898 5.345353 4 -CTGTC 115 1.0974668 17.662535 4 -CAGAC 100 1.0859579 5.0247273 5 -GGAAC 95 1.0849801 5.2844243 6 -CCTCG 130 1.0778012 7.672287 6 -GCGGC 135 1.075477 7.372196 1 -ATAAA 60 1.0752101 8.291662 7 -GGGAT 95 1.0696635 10.419649 3 -CATCC 110 1.0647823 8.957723 3 -ACAGT 85 1.062503 5.7837663 4 -ACTGA 85 1.062503 11.567533 7 -GTTGG 100 1.0555136 9.767722 1 -TGTGG 100 1.0555136 9.767722 5 -GGAAA 75 1.0517617 19.466007 6 -GTGAA 80 1.0516868 6.082693 1 -GAAGT 80 1.0516866 6.082693 5 -GTCTC 110 1.0497508 8.831267 1 -CGGCT 120 1.046313 8.068818 1 -TTTAT 70 1.0333818 5.4645233 10-14 -GACAC 95 1.0316601 10.049455 7 -GGCAA 90 1.0278759 10.56885 3 -TCATA 75 1.0260904 6.330293 5 -ATTCA 75 1.0260903 6.3302927 7 -TAACA 70 1.0216029 6.7527957 8 -GGTCA 95 1.0170963 9.907589 3 -ATGGC 95 1.0170962 9.907587 1 -TCAGG 95 1.0170962 9.907587 8 -GGTGA 90 1.0133655 15.629474 3 -TGTTT 80 1.0115329 5.8504534 5 -TGAAT 70 1.007181 6.6574664 5 -ATTGA 70 1.0071809 6.6574664 7 -AAGTT 70 1.0071809 6.6574664 6 -TTGCC 105 1.0020349 8.831267 2 -CTTGC 105 1.0020349 8.831267 6 -GCAAA 75 1.0000744 6.169792 4 -CATAG 80 1.0000029 6.2527204 95-96 -GACTT 85 0.99602544 5.421894 1 -CTGAT 85 0.99602544 5.421894 4 -CTTGT 90 0.988631 10.165323 3 -AATGG 75 0.98595625 6.082693 8 -AAGGT 75 0.9859562 6.0826926 4 -GATGT 80 0.98588586 5.7021174 7 -GGATT 80 0.98588586 11.404235 5 -GGCGG 115 0.96349704 7.753219 1 -AGAGG 80 0.9608892 5.557543 8 -GAGGT 85 0.95706743 5.2098246 3 -ATGGG 85 0.9570673 5.209824 1 -CCGTC 115 0.95343953 7.672287 4 -TAGCA 75 0.9375027 5.7837667 1 -ACATG 75 0.9375026 5.7837663 2 -TTGCA 80 0.93743575 5.421894 4 -GTTCA 80 0.93743575 5.421894 6 -ATGTC 80 0.93743575 5.421894 5 -TTCAG 80 0.93743575 5.421894 8 -TTGAC 80 0.9374356 5.4218936 2 -GTTCT 85 0.93370706 5.0826616 1 -TTGTC 85 0.93370706 5.0826616 9 -TTTGC 85 0.93370706 5.0826616 3 -ATGGT 75 0.924268 5.7021174 4 -ATGAA 60 0.920917 7.1018047 9 -AGATG 70 0.92022586 6.082693 5 -GCTCA 90 0.91621155 5.092265 95-96 -AGTGC 85 0.9100334 9.907587 2 -AGGGT 80 0.90076935 10.419649 1 -GTAGG 80 0.90076923 10.419648 6 -AGTGG 80 0.90076923 5.209824 2 -TAAAA 50 0.89600843 8.291662 8 -CACAT 75 0.89143026 5.499531 6 -CCATT 80 0.89136666 10.3108835 9 -ATACT 65 0.8892783 6.330293 9 -ACATT 65 0.88927823 6.3302927 7 -GCGGG 105 0.87971467 7.753219 2 -ACACC 85 0.8777014 9.555587 9 -CATAA 60 0.8756596 6.7527947 6 -ACCCT 90 0.8711856 13.436585 1 -GAACA 65 0.8667311 6.169792 7 -ACTGC 85 0.8653109 5.092265 95-96 -GGTAT 70 0.86265016 17.106354 6 -AGTTG 70 0.86265016 5.702118 7 -GAGAC 75 0.85656327 5.2844243 1 -GTGTC 85 0.8530954 13.93155 1 -GTTGC 85 0.8530954 9.2877 1 -ATAGA 55 0.84417385 7.1018047 8 -GAAAT 55 0.84417385 7.1018047 5 -CATTC 75 0.83565605 5.155441 6 -TCACA 70 0.83200157 5.499531 3 -TGCGG 90 0.8252928 8.485845 3 -GCATT 70 0.8202563 5.421894 4 -GAACC 75 0.8144686 5.0247283 6 -CTCGA 80 0.81441027 9.420691 6 -GAATC 65 0.8125023 5.7837667 6 -TACAG 65 0.81250226 11.567533 7 -TGGTA 65 0.80103225 11.404236 5 -AAGAC 60 0.80005944 6.169791 8 -CAAGG 70 0.7994591 5.2844243 2 -ATGTA 55 0.7913565 6.6574664 4 -AATGT 55 0.7913565 6.6574664 3 -CGGCA 85 0.7906042 8.607354 2 -GAGAG 65 0.7807225 5.557543 8 -ACCAT 65 0.7725729 5.499531 8 -TTCTA 60 0.7695128 5.934226 9 -TAGAA 50 0.7674308 7.1018047 9 -GCATC 75 0.7635097 9.4206915 1 -GTTCC 80 0.76345515 8.831267 6 -AGCTT 65 0.76166654 5.421894 1 -TTAGC 65 0.76166654 5.421894 9 -CTGTA 65 0.76166654 5.421894 2 -ACTTG 65 0.7616664 5.4218936 2 -GTGCT 75 0.7527313 9.287701 3 -ATCAT 55 0.7524662 6.3302927 3 -GTTTG 65 0.7509141 5.345353 9 -GTGTT 65 0.7509141 10.690706 1 -GTCAA 60 0.75000215 11.5675335 6 -AATGC 60 0.75000215 6.252721 95-96 -CAAGT 60 0.7500021 5.7837663 9 -GCAAT 60 0.7500021 5.7837663 4 -GCAAG 65 0.74235487 5.2844243 1 -AGTGT 60 0.7394144 5.7021174 1 -TTAGG 60 0.7394144 5.702118 7 -AGCGG 75 0.73364604 9.052214 1 -ATCCT 65 0.72423524 5.155441 4 -ACTCT 65 0.72423524 5.155441 9 -AGTGA 55 0.7230346 6.082693 6 -AATAA 40 0.71680677 8.291662 6 -AACCT 60 0.71314424 5.4995303 1 -ATTCT 55 0.70538664 5.9342256 7 -AGTCT 60 0.7030768 5.421894 3 -GTGCA 65 0.69590795 9.907589 6 -AAAGT 45 0.69068766 7.101804 8 -AACTG 55 0.6875019 5.7837663 1 -CGAAG 60 0.68525064 5.2844243 4 -GATTG 55 0.67779654 5.702118 6 -GTGAT 55 0.67779654 11.404236 4 -TGTTA 50 0.67440337 12.481857 5 -TTGTA 50 0.6744033 6.240928 9 -TATTG 50 0.6744033 6.240928 7 -CTCTA 60 0.6685249 5.1554413 7 -TACCT 60 0.66852486 10.310882 8 -ATGGA 50 0.65730417 6.082693 8 -ATACA 45 0.6567447 6.7527957 6 -ATCAA 45 0.65674466 6.7527947 9 -TGTAA 45 0.6474735 6.6574664 7 -GCGGT 70 0.6418945 8.485846 4 -GGCCG 80 0.63731974 7.372196 2 -GGTTT 55 0.63538885 10.690706 9 -TTGTG 55 0.63538885 5.345353 1 -TATAT 40 0.62991583 7.2865515 8 -CCTGT 65 0.62030727 8.831266 3 -GTGAG 55 0.6192789 5.2098246 1 -TAGGG 55 0.61927885 5.209824 8 -GAGTT 50 0.6161787 5.7021174 6 -ATGTG 50 0.6161787 5.702118 2 -GAATA 40 0.61394465 7.1018047 6 -CTGCG 70 0.6103493 8.068818 2 -CGGTG 65 0.59604484 8.485845 2 -TAAGG 45 0.5915738 6.082693 9 -AAGTG 45 0.5915737 6.0826926 1 -TATTT 40 0.5905039 6.8306537 8 -GGCAT 55 0.5888452 14.861383 3 -GTATC 50 0.5858973 5.421894 4 -ATAAC 40 0.5837731 13.505591 7 -TTACT 45 0.57713455 5.934226 9 -GTATA 40 0.575532 13.314933 7 -GAGTG 50 0.5629808 5.209824 1 -GTACA 45 0.5625016 5.7837667 6 -ATAGC 45 0.5625016 5.7837667 9 -TCTAC 50 0.5571041 5.1554413 8 -GCGAG 55 0.53800714 9.052216 1 -ACGGG 55 0.5380071 9.052214 1 -GATAA 35 0.5372016 7.1018047 6 -AATAG 35 0.5372016 7.101805 7 -CAACT 45 0.53485817 5.4995303 6 -CATAC 45 0.53485817 5.4995303 5 -GATTC 45 0.52730757 5.421894 6 -AGGTA 40 0.5258433 12.165386 5 -CGGTC 60 0.52315664 8.068819 5 -ACGAG 45 0.51393795 5.2844243 7 -TATTC 40 0.5130085 5.9342256 7 -CTAAA 35 0.51080143 6.7527957 9 -TACAA 35 0.51080143 5.402236 35-39 -CCTTA 45 0.5013937 5.1554413 6 -CAGTA 40 0.50000143 5.7837667 4 -GTGTA 40 0.49294293 5.702118 4 -TAACT 35 0.47884214 6.330293 8 -CTTAA 35 0.47884214 6.330293 7 -CTATA 35 0.47884214 6.330293 4 -TTAAC 35 0.47884214 6.330293 8 -TATCA 35 0.4788421 6.3302927 5 -TCAAC 40 0.47542948 5.499531 7 -ACTCA 40 0.47542942 5.49953 8 -TTAGT 35 0.47208238 10.120425 95-96 -TGTAT 35 0.47208238 6.2409286 3 -ATTGT 35 0.47208235 6.240928 8 -GTTAC 40 0.46871787 5.421894 6 -TGTAC 40 0.46871787 10.843788 7 -AGAGT 35 0.46011293 6.082693 5 -AGTAG 35 0.46011293 6.082693 5 -CTCCG 55 0.45599285 7.672287 6 -GGTAG 40 0.45038468 5.2098246 2 -TTTAC 35 0.44888243 5.9342256 8 -CTACT 40 0.44568333 5.1554418 4 -AACTA 30 0.4378298 6.7527947 9 -TATAG 30 0.43164897 6.6574664 5 -ATATA 25 0.4199739 7.7728767 9 -CTCAA 35 0.41600078 5.499531 9 -TATAC 30 0.4104361 6.3302927 5 -ACTAT 30 0.4104361 6.3302927 6 -TACTA 30 0.4104361 6.3302927 5 -TCGAT 35 0.41012815 10.843788 7 -ACGTT 35 0.41012815 5.421894 4 -CGAAA 30 0.40002972 6.169792 9 -GTAAG 30 0.3943825 6.082693 8 -ATAGG 30 0.3943825 6.082693 3 -TCCTA 35 0.38997287 5.1554413 5 -TTACC 35 0.38997287 5.1554413 7 -ACCGA 35 0.3800853 5.0247273 7 -GCATA 30 0.37500107 5.7837667 1 -TCGAA 30 0.37500107 5.7837667 4 -GCTAA 30 0.37500107 5.7837667 8 -TAGGT 30 0.3697072 5.7021174 7 -GTTAG 30 0.3697072 5.702118 6 -CAATA 25 0.36485815 6.7527947 5 -ATACC 30 0.35657212 5.499531 6 -GACGA 30 0.3426253 5.284424 6 -AAGCG 30 0.3426253 10.568848 7 -GTTTA 25 0.33720168 6.2409286 7 -GTATT 25 0.33720168 12.481857 6 -AGATA 20 0.30697232 7.1018047 5 -CGTCA 30 0.30540386 9.420691 5 -CCTAA 25 0.29714343 5.499531 7 -TACCA 25 0.2971434 5.49953 9 -TGCTA 25 0.29294866 5.421894 7 -TACGT 25 0.29294863 5.4218936 9 -AGACG 25 0.2855211 5.284425 9 -CCTAT 25 0.2785521 5.1554418 3 -TAAGC 20 0.25000072 5.7837667 9 -CTAAG 20 0.25000072 5.7837667 8 -CGATT 20 0.23435894 5.421894 9 -GGGTA 20 0.22519234 5.2098246 2 -ACGCA 20 0.21719159 5.0247273 5 -GCGAA 15 0.17131266 5.284425 3 -CGAAC 15 0.16289368 5.0247273 5 +>>Kmer Content pass >>END_MODULE diff --git a/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/CutadaptTest.scala b/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/CutadaptTest.scala new file mode 100644 index 0000000000000000000000000000000000000000..2b537d9767cbc1ddbd9f2e528a1c122dfe973d7c --- /dev/null +++ b/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/CutadaptTest.scala @@ -0,0 +1,84 @@ +/** + * Biopet is built on top of GATK Queue for building bioinformatic + * pipelines. It is mainly intended to support LUMC SHARK cluster which is running + * SGE. But other types of HPC that are supported by GATK Queue (such as PBS) + * should also be able to execute Biopet tools and pipelines. + * + * Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center + * + * Contact us at: sasc@lumc.nl + * + * A dual licensing mode is applied. The source code within this project that are + * not part of GATK Queue is freely available for non-commercial use under an AGPL + * license; For commercial users or users who do not want to follow the AGPL + * license, please contact us to obtain a separate license. + */ +package nl.lumc.sasc.biopet.pipelines.flexiprep + +import java.io.File + +import org.testng.annotations.Test + +class CutadaptTest extends FastqcV0101Test { + /** Mock output file of a Cutadapt 1.9 run */ + private[flexiprep] val cutadaptOut: File = resourceFile("ct-test.R1.clip.stats") + + def testFastQCinstance: Fastqc = { + val fqc = new Fastqc(null) + fqc.output = outputv0101 + fqc.contaminants = Option(resourceFile("fqc_contaminants_v0112.txt")) + // fqc.beforeGraph() + fqc + } + + def testCutadaptInst: Cutadapt = { + val caExe = new Cutadapt(null, testFastQCinstance) + caExe.statsOutput = cutadaptOut + caExe + } + + @Test def testAdapterFound() = { + val cutadapt = testCutadaptInst + val adapters = cutadapt.extractClippedAdapters(cutadaptOut) + adapters.keys.size shouldBe 4 + + adapters.get("CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC") shouldBe Some( + Map( + "count" -> 94, + "histogram" -> Map( + "5p" -> Map(5 -> 2, 6 -> 4, 9 -> 1, 3 -> 8, 4 -> 3), + "3p" -> Map(5 -> 21, 6 -> 18, 9 -> 1, 12 -> 1, 7 -> 2, 3 -> 13, 11 -> 1, 4 -> 19) + ) + ) + ) + + adapters.get("CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC") shouldBe Some( + Map( + "count" -> 0, + "histogram" -> Map() + ) + ) + } + + @Test def testSummary() = { + val cutadapt = testCutadaptInst + val summary = cutadapt.summaryStats + + summary.keys shouldBe Set("num_bases_input", "num_reads_input", "num_reads_output", + "num_reads_with_adapters", "num_reads_affected", "num_reads_discarded_too_long", + "adapters", "num_reads_discarded_many_n", "num_reads_discarded_too_short", "num_bases_output") + + summary.keys.size shouldBe 10 + summary("adapters").asInstanceOf[Map[String, Map[String, Any]]].keys.size shouldBe 4 + + summary("num_bases_input") shouldBe 100000 + summary("num_reads_input") shouldBe 1000 + summary("num_reads_output") shouldBe 985 + summary("num_reads_with_adapters") shouldBe 440 + summary("num_reads_affected") shouldBe 425 + summary("num_reads_discarded_too_long") shouldBe 0 + summary("num_reads_discarded_many_n") shouldBe 0 + summary("num_reads_discarded_too_short") shouldBe 15 + summary("num_bases_output") shouldBe 89423 + } +} diff --git a/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FastqcV0101Test.scala b/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FastqcV0101Test.scala index 4cb68fdfc44d5a30c3ed76aabc9570d6f62529f3..3cf24e8c60a570e8e51fe528ece4f81d0b66a01a 100644 --- a/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FastqcV0101Test.scala +++ b/flexiprep/src/test/scala/nl/lumc/sasc/biopet/pipelines/flexiprep/FastqcV0101Test.scala @@ -25,14 +25,14 @@ import org.testng.annotations.Test class FastqcV0101Test extends TestNGSuite with Matchers { /** Returns the absolute path to test resource directory as a File object */ - private val resourceDir: File = new File(Paths.get(getClass.getResource("/").toURI).toString) + private[flexiprep] val resourceDir: File = new File(Paths.get(getClass.getResource("/").toURI).toString) /** Given a resource file name, returns the the absolute path to it as a File object */ - private def resourceFile(p: String): File = new File(resourceDir, p) + private[flexiprep] def resourceFile(p: String): File = new File(resourceDir, p) /** Mock output file of a FastQC v0.10.1 run */ // the file doesn't actually exist, we just need it so the outputDir value can be computed correctly - private val outputv0101: File = resourceFile("v0101.fq_fastqc.zip") + private[flexiprep] val outputv0101: File = resourceFile("v0101.fq_fastqc.zip") @Test def testOutputDir() = { val fqc = new Fastqc(null) @@ -44,7 +44,7 @@ class FastqcV0101Test extends TestNGSuite with Matchers { val fqc = new Fastqc(null) fqc.output = outputv0101 // 11 QC modules - fqc.qcModules.size shouldBe 11 + fqc.qcModules.size shouldBe 12 // first module fqc.qcModules.keySet should contain("Basic Statistics") // mid (6th module) @@ -83,4 +83,23 @@ class FastqcV0101Test extends TestNGSuite with Matchers { adapters.last.seq shouldEqual "GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG" } + + @Test def testPerBaseSequenceQuality() = { + val fqc = new Fastqc(null) + fqc.output = outputv0101 + + val perBaseSequenceQuality = fqc.perBaseSequenceQuality + perBaseSequenceQuality.size shouldBe 55 + perBaseSequenceQuality.keys should contain("54-55") + } + + @Test def testPerBaseSequenceContent() = { + val fqc = new Fastqc(null) + fqc.output = outputv0101 + + val perBaseSequenceContent: Map[String, Map[String, Double]] = fqc.perBaseSequenceContent + perBaseSequenceContent.size shouldBe 55 + perBaseSequenceContent.keys should contain("1") + } + } \ No newline at end of file