Skip to content
Snippets Groups Projects
Commit e980fd75 authored by Peter van 't Hof's avatar Peter van 't Hof
Browse files

Merge branch 'feature-cutadapt-adapterreporting' into 'develop'

Fix reporting of found Adapters with counts by Cutadapt in Json.

The adapters used by Cutadapt were not reported in the JSON output; this change fixes that.

fixes #319 
fixes #325 

See merge request !369
parents 9e3d9653 d01a6749
No related branches found
No related tags found
No related merge requests found
......@@ -24,6 +24,7 @@ import org.broadinstitute.gatk.utils.commandline.{ Input, Output }
import scala.collection.mutable
import scala.io.Source
import scala.util.matching.Regex
/**
* Extension for cutadapt
......@@ -163,6 +164,51 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction with Su
(if (outputAsStsout) "" else required("--output", fastqOutput) +
" > " + required(statsOutput))
/**
 * Extracts the per-adapter clipping statistics from a cutadapt report.
 *
 * The report is cut into its "=== Adapter N ===" sections. For every section the adapter sequence
 * and its "Trimmed: N times." count are read, together with the "Overview of removed sequences"
 * histograms (clipped length -> number of reads), labelled "5p" or "3p".
 *
 * @param statsOutput cutadapt report file to parse
 * @return map keyed by adapter sequence; each value is a map with the keys "count" (Int) and
 *         "histogram" (side label -> (length -> count)). An adapter without histogram blocks gets
 *         an empty histogram map.
 */
def extractClippedAdapters(statsOutput: File): Map[String, Any] = {
  // a histogram row starts with "<length>\t<count>\t"; the remaining columns are ignored
  val histoCountRow: Regex = """([\d]+)\t([\d]+)\t.*""".r
  // '|' is a literal inside a character class, so the bases are listed without separators
  val adapterR: Regex = """Sequence: ([ACGT]+);.*Trimmed: ([\d]+) times\.""".r
  // accept both LF and CRLF line endings; a stray '\r' would make the anchored regexes fail
  val lineBreak = "\\r?\\n"

  // always release the file handle, also when reading fails
  val statsFile = Source.fromFile(statsOutput)
  val report: String = try statsFile.mkString finally statsFile.close()

  val adapterSections: Array[String] = report
    .split("=== Adapter [\\d]+ ===")
    .filter(_.contains("Sequence"))

  adapterSections.map { section =>
    // identify the adapter sequence and its count; the last matching line wins, and a section
    // without a matching line falls back to the name "" with count 0
    val (adapterName, adapterCount) = section.split(lineBreak)
      .collect { case adapterR(sequence, count) => (sequence, count.toInt) }
      .lastOption
      .getOrElse(("", 0))

    // parse the blocks that give the histogram of clipped bases and from which end
    val histogram = section.split("Overview of removed sequences ")
      .filter(_.contains("length"))
      .map { clipSideRawStats =>
        val clipSideLabel = if (clipSideRawStats.contains("5'")) "5p" else "3p"
        val lengthCounts = clipSideRawStats.split(lineBreak).collect {
          case histoCountRow(length, count) => length.toInt -> count.toInt
        }
        clipSideLabel -> lengthCounts.toMap
      }
      .toMap

    adapterName -> Map(
      "count" -> adapterCount,
      "histogram" -> histogram
    )
  }.toMap
}
/** Output summary stats */
def summaryStats: Map[String, Any] = {
/**
......@@ -177,7 +223,6 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction with Su
val tooLongR = """.* that were too long: *([,\d]+) .*""".r
val tooManyN = """.* with too many N: *([,\d]+) .*""".r
val adapterR = """Sequence ([C|T|A|G]*);.*Trimmed: ([,\d]+) times.""".r
val basePairsProcessed = """Total basepairs processed: *([,\d]+) bp""".r
val basePairsWritten = """Total written \(filtered\): *([,\d]+) bp .*""".r
......@@ -192,24 +237,28 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction with Su
"bpoutput" -> 0,
"toomanyn" -> 0
)
val adapterStats: mutable.Map[String, Long] = mutable.Map()
// extract the adapters with its histogram
val adapterStats = if (statsOutput.exists) {
extractClippedAdapters(statsOutput)
} else Map.empty
if (statsOutput.exists) {
val statsFile = Source.fromFile(statsOutput)
for (line <- statsFile.getLines()) {
line match {
case processedReads(m) => stats("processed") = m.replaceAll(",", "").toLong
case withAdapters(m) => stats("withadapters") = m.replaceAll(",", "").toLong
case readsPassingFilters(m) => stats("passingfilters") = m.replaceAll(",", "").toLong
case tooShortR(m) => stats("tooshort") = m.replaceAll(",", "").toLong
case tooLongR(m) => stats("toolong") = m.replaceAll(",", "").toLong
case tooManyN(m) => stats("toomanyn") = m.replaceAll(",", "").toLong
case basePairsProcessed(m) => stats("bpinput") = m.replaceAll(",", "").toLong
case basePairsWritten(m) => stats("bpoutput") = m.replaceAll(",", "").toLong
case adapterR(adapter, count) => adapterStats += (adapter -> count.toLong)
case _ =>
case processedReads(m) => stats("processed") = m.replaceAll(",", "").toLong
case withAdapters(m) => stats("withadapters") = m.replaceAll(",", "").toLong
case readsPassingFilters(m) => stats("passingfilters") = m.replaceAll(",", "").toLong
case tooShortR(m) => stats("tooshort") = m.replaceAll(",", "").toLong
case tooLongR(m) => stats("toolong") = m.replaceAll(",", "").toLong
case tooManyN(m) => stats("toomanyn") = m.replaceAll(",", "").toLong
case basePairsProcessed(m) => stats("bpinput") = m.replaceAll(",", "").toLong
case basePairsWritten(m) => stats("bpoutput") = m.replaceAll(",", "").toLong
case _ =>
}
}
statsFile.close()
}
val cleanReads = stats("processed") - stats("withadapters")
......@@ -223,8 +272,8 @@ class Cutadapt(val root: Configurable) extends BiopetCommandLineFunction with Su
"num_reads_discarded_too_long" -> stats("toolong"),
"num_reads_discarded_many_n" -> stats("toomanyn"),
"num_bases_input" -> stats("bpinput"),
"num_based_output" -> stats("bpoutput"),
adaptersStatsName -> adapterStats.toMap
"num_bases_output" -> stats("bpoutput"),
adaptersStatsName -> adapterStats
)
}
......
......@@ -16,6 +16,7 @@
package nl.lumc.sasc.biopet.pipelines.flexiprep
import nl.lumc.sasc.biopet.utils.config.Configurable
import scala.collection.JavaConversions._
/**
* Cutadapt wrapper specific for Flexiprep.
......@@ -41,23 +42,26 @@ class Cutadapt(root: Configurable, fastqc: Fastqc) extends nl.lumc.sasc.biopet.e
val adapterCounts: Map[String, Any] = initStats.get(adaptersStatsName) match {
// "adapters" key found in statistics
case Some(m: Map[_, _]) => m.flatMap {
case (seq: String, count) =>
seqToNameMap.get(seq) match {
case (adapterSequence: String, adapterStats: Map[_, _]) =>
seqToNameMap.get(adapterSequence) match {
// adapter sequence is found by FastQC
case Some(n) => Some(n -> Map("sequence" -> seq, "count" -> count))
case Some(adapterSeqName) => {
Some(adapterSeqName ->
Map("sequence" -> adapterSequence, "stats" -> adapterStats.toMap)
)
}
// adapter sequence is clipped but not found by FastQC ~ should not happen since all clipped adapter
// sequences come from FastQC
case _ =>
throw new IllegalStateException(s"Adapter '$seq' is clipped but not found by FastQC in '$fastqInput'.")
throw new IllegalStateException(s"Adapter '$adapterSequence' is clipped but not found by FastQC in '$fastqInput'.")
}
// FastQC found no adapters
case otherwise =>
;
logger.debug(s"No adapters found for summarizing in '$fastqInput'.")
None
}
// "adapters" key not found ~ something went wrong in our part
case _ => throw new RuntimeException(s"Required key 'adapters' not found in stats entry '$fastqInput'.")
case _ => throw new RuntimeException(s"Required key '${adaptersStatsName}' not found in stats entry '${fastqInput}'.")
}
initStats.updated(adaptersStatsName, adapterCounts)
}
......
This is cutadapt 1.9.1 with Python 2.7.6
Command line parameters: -b CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC -b GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCATCTCGTATGCCGTCTTCTGCTTG -b CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC -b GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG --error-rate 0.2 --times 2 -m 15 ct_r1.fq.gz.seqtk.fq --output ct_r1.fq.gz.cutadapt.fq
Trimming 4 adapters with at most 20.0% errors in single-end mode ...
Finished in 0.19 s (189 us/read; 0.32 M reads/minute).
=== Summary ===
Total reads processed: 1,000
Reads with adapters: 440 (44.0%)
Reads that were too short: 15 (1.5%)
Reads written (passing filters): 985 (98.5%)
Total basepairs processed: 100,000 bp
Total written (filtered): 89,423 bp (89.4%)
=== Adapter 1 ===
Sequence: CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC; Type: variable 5'/3'; Length: 63; Trimmed: 94 times.
18 times, it overlapped the 5' end of a read
76 times, it overlapped the 3' end or was within the read
No. of allowed errors:
0-4 bp: 0; 5-9 bp: 1; 10-14 bp: 2; 15-19 bp: 3; 20-24 bp: 4; 25-29 bp: 5; 30-34 bp: 6; 35-39 bp: 7; 40-44 bp: 8; 45-49 bp: 9; 50-54 bp: 10; 55-59 bp: 11; 60-63 bp: 12
Overview of removed sequences (5')
length count expect max.err error counts
3 8 15.6 0 8
4 3 3.9 0 2 1
5 2 1.0 1 0 2
6 4 0.2 1 1 3
9 1 0.0 1 0 0 1
Overview of removed sequences (3' or within)
length count expect max.err error counts
3 13 15.6 0 13
4 19 3.9 0 3 16
5 21 1.0 1 0 21
6 18 0.2 1 1 17
7 2 0.1 1 0 2
9 1 0.0 1 0 0 1
11 1 0.0 2 0 0 1
12 1 0.0 2 0 0 1
=== Adapter 2 ===
Sequence: GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCATCTCGTATGCCGTCTTCTGCTTG; Type: variable 5'/3'; Length: 63; Trimmed: 340 times.
117 times, it overlapped the 5' end of a read
223 times, it overlapped the 3' end or was within the read
No. of allowed errors:
0-4 bp: 0; 5-9 bp: 1; 10-14 bp: 2; 15-19 bp: 3; 20-24 bp: 4; 25-29 bp: 5; 30-34 bp: 6; 35-39 bp: 7; 40-44 bp: 8; 45-49 bp: 9; 50-54 bp: 10; 55-59 bp: 11; 60-63 bp: 12
Overview of removed sequences (5')
length count expect max.err error counts
3 14 15.6 0 14
4 29 3.9 0 6 23
5 32 1.0 1 3 29
6 36 0.2 1 0 36
8 1 0.0 1 0 1
9 1 0.0 1 0 0 1
10 1 0.0 2 0 0 1
11 2 0.0 2 0 0 2
37 1 0.0 7 0 0 0 0 0 1
Overview of removed sequences (3' or within)
length count expect max.err error counts
3 18 15.6 0 18
4 9 3.9 0 5 4
5 15 1.0 1 8 7
6 10 0.2 1 8 2
7 7 0.1 1 5 2
8 10 0.0 1 9 1
9 6 0.0 1 5 1
10 8 0.0 2 5 0 3
11 4 0.0 2 4
12 4 0.0 2 4
13 9 0.0 2 9
14 4 0.0 2 3 0 1
15 7 0.0 3 7
16 2 0.0 3 2
17 4 0.0 3 2 1 0 1
18 2 0.0 3 2
19 2 0.0 3 2
20 2 0.0 4 0 1 1
21 7 0.0 4 6 1
22 7 0.0 4 7
23 2 0.0 4 2
24 3 0.0 4 3
25 5 0.0 5 5
26 5 0.0 5 5
27 8 0.0 5 8
28 6 0.0 5 5 1
29 2 0.0 5 2
30 5 0.0 6 5
31 3 0.0 6 3
32 8 0.0 6 8
33 1 0.0 6 1
34 5 0.0 6 0 5
35 2 0.0 7 0 0 0 0 0 0 2
36 3 0.0 7 0 0 0 0 0 0 3
37 4 0.0 7 0 0 0 0 0 0 0 2 2
38 2 0.0 7 0 0 0 0 0 0 0 0 0 2
39 4 0.0 7 0 0 0 0 1 0 0 0 0 3
40 3 0.0 8 0 0 0 0 0 0 0 3
41 1 0.0 8 0 0 0 0 0 0 0 1
42 4 0.0 8 0 0 0 0 0 0 0 0 4
43 5 0.0 8 0 0 0 0 0 0 0 0 0 5
44 3 0.0 8 0 0 0 0 0 0 0 0 0 0 3
46 1 0.0 9 0 0 0 0 0 0 0 0 0 0 1
49 1 0.0 9 0 0 0 0 0 1
=== Adapter 3 ===
Sequence: CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC; Type: variable 5'/3'; Length: 63; Trimmed: 0 times.
=== Adapter 4 ===
Sequence: GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG; Type: variable 5'/3'; Length: 63; Trimmed: 82 times.
15 times, it overlapped the 5' end of a read
67 times, it overlapped the 3' end or was within the read
No. of allowed errors:
0-4 bp: 0; 5-9 bp: 1; 10-14 bp: 2; 15-19 bp: 3; 20-24 bp: 4; 25-29 bp: 5; 30-34 bp: 6; 35-39 bp: 7; 40-44 bp: 8; 45-49 bp: 9; 50-54 bp: 10; 55-59 bp: 11; 60-63 bp: 12
Overview of removed sequences (5')
length count expect max.err error counts
26 1 0.0 5 0 1
61 2 0.0 12 0 0 0 2
64 11 0.0 12 0 0 0 11
72 1 0.0 12 0 0 0 0 0 0 0 0 0 0 0 1
Overview of removed sequences (3' or within)
length count expect max.err error counts
45 3 0.0 9 0 0 0 3
46 2 0.0 9 0 0 0 2
47 3 0.0 9 0 0 0 3
48 3 0.0 9 0 0 0 3
49 2 0.0 9 0 0 0 2
50 3 0.0 10 0 0 0 3
51 2 0.0 10 0 0 0 2
52 6 0.0 10 0 0 0 6
53 1 0.0 10 0 0 0 1
54 5 0.0 10 0 0 0 4 0 1
56 2 0.0 11 0 0 0 2
57 2 0.0 11 0 0 0 2
58 2 0.0 11 0 0 0 2
59 3 0.0 11 0 0 0 2 0 0 0 0 0 1
61 1 0.0 12 0 0 0 0 0 1
62 3 0.0 12 0 0 0 2 1
63 1 0.0 12 0 0 0 0 1
66 3 0.0 12 0 0 0 3
67 3 0.0 12 0 0 0 3
70 1 0.0 12 0 0 0 1
72 1 0.0 12 0 0 0 1
80 1 0.0 12 0 0 0 1
99 14 0.0 12 0 0 0 14
# This file contains a list of potential contaminants which are
# frequently found in high throughput sequencing reactions. These
# are mostly sequences of adapters / primers used in the various
# sequencing chemistries.
#
# Please DO NOT rely on these sequences to design your own oligos, some
# of them are truncated at ambiguous positions, and none of them are
# definitive sequences from the manufacturers so don't blame us if you
# try to use them and they don't work.
#
# You can add more sequences to the file by putting one line per entry
# and specifying a name[tab]sequence. If the contaminant you add is
# likely to be of use to others please consider sending it to the FastQC
# authors, either via a bug report at www.bioinformatics.babraham.ac.uk/bugzilla/
# or by directly emailing simon.andrews@babraham.ac.uk so other users of
# the program can benefit.
Illumina Single End Adapter 1 GATCGGAAGAGCTCGTATGCCGTCTTCTGCTTG
Illumina Single End Adapter 2 CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT
Illumina Single End PCR Primer 1 AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
Illumina Single End PCR Primer 2 CAAGCAGAAGACGGCATACGAGCTCTTCCGATCT
Illumina Single End Sequencing Primer ACACTCTTTCCCTACACGACGCTCTTCCGATCT
Illumina Paired End Adapter 1 ACACTCTTTCCCTACACGACGCTCTTCCGATCT
Illumina Paired End Adapter 2 GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
Illumina Paried End PCR Primer 1 AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
Illumina Paired End PCR Primer 2 CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT
Illumina Paried End Sequencing Primer 1 ACACTCTTTCCCTACACGACGCTCTTCCGATCT
Illumina Paired End Sequencing Primer 2 CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT
Illumina DpnII expression Adapter 1 ACAGGTTCAGAGTTCTACAGTCCGAC
Illumina DpnII expression Adapter 2 CAAGCAGAAGACGGCATACGA
Illumina DpnII expression PCR Primer 1 CAAGCAGAAGACGGCATACGA
Illumina DpnII expression PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
Illumina DpnII expression Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC
Illumina NlaIII expression Adapter 1 ACAGGTTCAGAGTTCTACAGTCCGACATG
Illumina NlaIII expression Adapter 2 CAAGCAGAAGACGGCATACGA
Illumina NlaIII expression PCR Primer 1 CAAGCAGAAGACGGCATACGA
Illumina NlaIII expression PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
Illumina NlaIII expression Sequencing Primer CCGACAGGTTCAGAGTTCTACAGTCCGACATG
Illumina Small RNA Adapter 1 GTTCAGAGTTCTACAGTCCGACGATC
Illumina Small RNA Adapter 2 TGGAATTCTCGGGTGCCAAGG
Illumina Small RNA RT Primer CAAGCAGAAGACGGCATACGA
Illumina Small RNA PCR Primer 1 CAAGCAGAAGACGGCATACGA
Illumina Small RNA PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
Illumina Small RNA Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC
Illumina Multiplexing Adapter 1 GATCGGAAGAGCACACGTCT
Illumina Multiplexing Adapter 2 ACACTCTTTCCCTACACGACGCTCTTCCGATCT
Illumina Multiplexing PCR Primer 1.01 AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
Illumina Multiplexing PCR Primer 2.01 GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT
Illumina Multiplexing Read1 Sequencing Primer ACACTCTTTCCCTACACGACGCTCTTCCGATCT
Illumina Multiplexing Index Sequencing Primer GATCGGAAGAGCACACGTCTGAACTCCAGTCAC
Illumina Multiplexing Read2 Sequencing Primer GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT
Illumina PCR Primer Index 1 CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTC
Illumina PCR Primer Index 2 CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTC
Illumina PCR Primer Index 3 CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTC
Illumina PCR Primer Index 4 CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTC
Illumina PCR Primer Index 5 CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTC
Illumina PCR Primer Index 6 CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTC
Illumina PCR Primer Index 7 CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTC
Illumina PCR Primer Index 8 CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTC
Illumina PCR Primer Index 9 CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTC
Illumina PCR Primer Index 10 CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTC
Illumina PCR Primer Index 11 CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTC
Illumina PCR Primer Index 12 CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTC
Illumina DpnII Gex Adapter 1 GATCGTCGGACTGTAGAACTCTGAAC
Illumina DpnII Gex Adapter 1.01 ACAGGTTCAGAGTTCTACAGTCCGAC
Illumina DpnII Gex Adapter 2 CAAGCAGAAGACGGCATACGA
Illumina DpnII Gex Adapter 2.01 TCGTATGCCGTCTTCTGCTTG
Illumina DpnII Gex PCR Primer 1 CAAGCAGAAGACGGCATACGA
Illumina DpnII Gex PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
Illumina DpnII Gex Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC
Illumina NlaIII Gex Adapter 1.01 TCGGACTGTAGAACTCTGAAC
Illumina NlaIII Gex Adapter 1.02 ACAGGTTCAGAGTTCTACAGTCCGACATG
Illumina NlaIII Gex Adapter 2.01 CAAGCAGAAGACGGCATACGA
Illumina NlaIII Gex Adapter 2.02 TCGTATGCCGTCTTCTGCTTG
Illumina NlaIII Gex PCR Primer 1 CAAGCAGAAGACGGCATACGA
Illumina NlaIII Gex PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
Illumina NlaIII Gex Sequencing Primer CCGACAGGTTCAGAGTTCTACAGTCCGACATG
Illumina Small RNA RT Primer CAAGCAGAAGACGGCATACGA
Illumina 5p RNA Adapter GTTCAGAGTTCTACAGTCCGACGATC
Illumina RNA Adapter1 TGGAATTCTCGGGTGCCAAGG
Illumina Small RNA 3p Adapter 1 ATCTCGTATGCCGTCTTCTGCTTG
Illumina Small RNA PCR Primer 1 CAAGCAGAAGACGGCATACGA
Illumina Small RNA PCR Primer 2 AATGATACGGCGACCACCGACAGGTTCAGAGTTCTACAGTCCGA
Illumina Small RNA Sequencing Primer CGACAGGTTCAGAGTTCTACAGTCCGACGATC
TruSeq Universal Adapter AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
TruSeq Adapter, Index 1 GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 2 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 3 GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 4 GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 5 GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 6 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 7 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 8 GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 9 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 10 GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 11 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 12 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 13 GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTCAACTCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 14 GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTTCCGTCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 15 GATCGGAAGAGCACACGTCTGAACTCCAGTCACATGTCAGTCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 16 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCGTCCCTCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 18 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCATCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 19 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGAAACTCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 20 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGGCCTTCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 21 GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTTTCGGTCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 22 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGTACGTTCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 23 GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCACTCTTCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 25 GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTGATATCTCGTATGCCGTCTTCTGCTTG
TruSeq Adapter, Index 27 GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCCTTTCTCGTATGCCGTCTTCTGCTTG
Illumina RNA RT Primer GCCTTGGCACCCGAGAATTCCA
Illumina RNA PCR Primer AATGATACGGCGACCACCGAGATCTACACGTTCAGAGTTCTACAGTCCGA
RNA PCR Primer, Index 1 CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 2 CAAGCAGAAGACGGCATACGAGATACATCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 3 CAAGCAGAAGACGGCATACGAGATGCCTAAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 4 CAAGCAGAAGACGGCATACGAGATTGGTCAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 5 CAAGCAGAAGACGGCATACGAGATCACTGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 6 CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 7 CAAGCAGAAGACGGCATACGAGATGATCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 8 CAAGCAGAAGACGGCATACGAGATTCAAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 9 CAAGCAGAAGACGGCATACGAGATCTGATCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 10 CAAGCAGAAGACGGCATACGAGATAAGCTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 11 CAAGCAGAAGACGGCATACGAGATGTAGCCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 12 CAAGCAGAAGACGGCATACGAGATTACAAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 13 CAAGCAGAAGACGGCATACGAGATTTGACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 14 CAAGCAGAAGACGGCATACGAGATGGAACTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 15 CAAGCAGAAGACGGCATACGAGATTGACATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 16 CAAGCAGAAGACGGCATACGAGATGGACGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 17 CAAGCAGAAGACGGCATACGAGATCTCTACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 18 CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 19 CAAGCAGAAGACGGCATACGAGATTTTCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 20 CAAGCAGAAGACGGCATACGAGATGGCCACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 21 CAAGCAGAAGACGGCATACGAGATCGAAACGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 22 CAAGCAGAAGACGGCATACGAGATCGTACGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 23 CAAGCAGAAGACGGCATACGAGATCCACTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 24 CAAGCAGAAGACGGCATACGAGATGCTACCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 25 CAAGCAGAAGACGGCATACGAGATATCAGTGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 26 CAAGCAGAAGACGGCATACGAGATGCTCATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 27 CAAGCAGAAGACGGCATACGAGATAGGAATGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 28 CAAGCAGAAGACGGCATACGAGATCTTTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 29 CAAGCAGAAGACGGCATACGAGATTAGTTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 30 CAAGCAGAAGACGGCATACGAGATCCGGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 31 CAAGCAGAAGACGGCATACGAGATATCGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 32 CAAGCAGAAGACGGCATACGAGATTGAGTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 33 CAAGCAGAAGACGGCATACGAGATCGCCTGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 34 CAAGCAGAAGACGGCATACGAGATGCCATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 35 CAAGCAGAAGACGGCATACGAGATAAAATGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 36 CAAGCAGAAGACGGCATACGAGATTGTTGGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 37 CAAGCAGAAGACGGCATACGAGATATTCCGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 38 CAAGCAGAAGACGGCATACGAGATAGCTAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 39 CAAGCAGAAGACGGCATACGAGATGTATAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 40 CAAGCAGAAGACGGCATACGAGATTCTGAGGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 41 CAAGCAGAAGACGGCATACGAGATGTCGTCGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 42 CAAGCAGAAGACGGCATACGAGATCGATTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 43 CAAGCAGAAGACGGCATACGAGATGCTGTAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 44 CAAGCAGAAGACGGCATACGAGATATTATAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 45 CAAGCAGAAGACGGCATACGAGATGAATGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 46 CAAGCAGAAGACGGCATACGAGATTCGGGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 47 CAAGCAGAAGACGGCATACGAGATCTTCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
RNA PCR Primer, Index 48 CAAGCAGAAGACGGCATACGAGATTGCCGAGTGACTGGAGTTCCTTGGCACCCGAGAATTCCA
ABI Dynabead EcoP Oligo CTGATCTAGAGGTACCGGATCCCAGCAGT
ABI Solid3 Adapter A CTGCCCCGGGTTCCTCATTCTCTCAGCAGCATG
ABI Solid3 Adapter B CCACTACGCCTCCGCTTTCCTCTCTATGGGCAGTCGGTGAT
ABI Solid3 5' AMP Primer CCACTACGCCTCCGCTTTCCTCTCTATG
ABI Solid3 3' AMP Primer CTGCCCCGGGTTCCTCATTCT
ABI Solid3 EF1 alpha Sense Primer CATGTGTGTTGAGAGCTTC
ABI Solid3 EF1 alpha Antisense Primer GAAAACCAAAGTGGTCCAC
ABI Solid3 GAPDH Forward Primer TTAGCACCCCTGGCCAAGG
ABI Solid3 GAPDH Reverse Primer CTTACTCCTTGGAGGCCATG
/**
* Biopet is built on top of GATK Queue for building bioinformatic
* pipelines. It is mainly intended to support LUMC SHARK cluster which is running
* SGE. But other types of HPC that are supported by GATK Queue (such as PBS)
* should also be able to execute Biopet tools and pipelines.
*
* Copyright 2014 Sequencing Analysis Support Core - Leiden University Medical Center
*
* Contact us at: sasc@lumc.nl
*
* A dual licensing mode is applied. The source code within this project that are
* not part of GATK Queue is freely available for non-commercial use under an AGPL
* license; For commercial users or users who do not want to follow the AGPL
* license, please contact us to obtain a separate license.
*/
package nl.lumc.sasc.biopet.pipelines.flexiprep
import java.io.File
import org.testng.annotations.Test
/**
 * Tests for the Flexiprep Cutadapt wrapper, driven by a stored cutadapt report and the
 * FastQC fixtures inherited from [[FastqcV0101Test]].
 */
class CutadaptTest extends FastqcV0101Test {

  /** Mock output file of a Cutadapt 1.9 run */
  private[flexiprep] val cutadaptOut: File = resourceFile("ct-test.R1.clip.stats")

  /** Creates a Fastqc instance that points at the mock FastQC output and the test contaminant list. */
  def testFastQCinstance: Fastqc = {
    val fastqc = new Fastqc(null)
    fastqc.output = outputv0101
    fastqc.contaminants = Option(resourceFile("fqc_contaminants_v0112.txt"))
    // fastqc.beforeGraph()
    fastqc
  }

  /** Creates a Cutadapt instance whose stats output is the mock cutadapt report. */
  def testCutadaptInst: Cutadapt = {
    val wrapper = new Cutadapt(null, testFastQCinstance)
    wrapper.statsOutput = cutadaptOut
    wrapper
  }

  /** The report lists four adapters; check the parsed entries of a clipped and an unclipped one. */
  @Test def testAdapterFound() = {
    val clippedAdapter = "CAAGCAGAAGACGGCATACGAGATCGTGATGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC"
    val unclippedAdapter = "CAAGCAGAAGACGGCATACGAGATGCGGACGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC"

    val expectedClipped = Map(
      "count" -> 94,
      "histogram" -> Map(
        "5p" -> Map(5 -> 2, 6 -> 4, 9 -> 1, 3 -> 8, 4 -> 3),
        "3p" -> Map(5 -> 21, 6 -> 18, 9 -> 1, 12 -> 1, 7 -> 2, 3 -> 13, 11 -> 1, 4 -> 19)
      )
    )
    val expectedUnclipped = Map(
      "count" -> 0,
      "histogram" -> Map()
    )

    val parsed = testCutadaptInst.extractClippedAdapters(cutadaptOut)

    parsed.size shouldBe 4
    parsed.get(clippedAdapter) shouldBe Some(expectedClipped)
    parsed.get(unclippedAdapter) shouldBe Some(expectedUnclipped)
  }

  /** The summary must expose exactly the expected keys, with the values taken from the report. */
  @Test def testSummary() = {
    val stats = testCutadaptInst.summaryStats

    val expectedKeys = Set("num_bases_input", "num_reads_input", "num_reads_output",
      "num_reads_with_adapters", "num_reads_affected", "num_reads_discarded_too_long",
      "adapters", "num_reads_discarded_many_n", "num_reads_discarded_too_short", "num_bases_output")
    stats.keySet shouldBe expectedKeys
    stats.size shouldBe 10

    stats("adapters").asInstanceOf[Map[String, Map[String, Any]]].size shouldBe 4

    stats("num_bases_input") shouldBe 100000
    stats("num_reads_input") shouldBe 1000
    stats("num_reads_output") shouldBe 985
    stats("num_reads_with_adapters") shouldBe 440
    stats("num_reads_affected") shouldBe 425
    stats("num_reads_discarded_too_long") shouldBe 0
    stats("num_reads_discarded_many_n") shouldBe 0
    stats("num_reads_discarded_too_short") shouldBe 15
    stats("num_bases_output") shouldBe 89423
  }
}
......@@ -25,14 +25,14 @@ import org.testng.annotations.Test
class FastqcV0101Test extends TestNGSuite with Matchers {
/** Returns the absolute path to test resource directory as a File object */
private val resourceDir: File = new File(Paths.get(getClass.getResource("/").toURI).toString)
private[flexiprep] val resourceDir: File = new File(Paths.get(getClass.getResource("/").toURI).toString)
/** Given a resource file name, returns the absolute path to it as a File object */
private def resourceFile(p: String): File = new File(resourceDir, p)
private[flexiprep] def resourceFile(p: String): File = new File(resourceDir, p)
/** Mock output file of a FastQC v0.10.1 run */
// the file doesn't actually exist, we just need it so the outputDir value can be computed correctly
private val outputv0101: File = resourceFile("v0101.fq_fastqc.zip")
private[flexiprep] val outputv0101: File = resourceFile("v0101.fq_fastqc.zip")
@Test def testOutputDir() = {
val fqc = new Fastqc(null)
......@@ -44,7 +44,7 @@ class FastqcV0101Test extends TestNGSuite with Matchers {
val fqc = new Fastqc(null)
fqc.output = outputv0101
// 11 QC modules
fqc.qcModules.size shouldBe 11
fqc.qcModules.size shouldBe 12
// first module
fqc.qcModules.keySet should contain("Basic Statistics")
// mid (6th module)
......@@ -83,4 +83,23 @@ class FastqcV0101Test extends TestNGSuite with Matchers {
adapters.last.seq shouldEqual "GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG"
}
/** The per-base sequence quality of the mock FastQC output has 55 entries, one of them keyed "54-55". */
@Test def testPerBaseSequenceQuality() = {
  val fastqc = new Fastqc(null)
  fastqc.output = outputv0101

  val qualityByPosition = fastqc.perBaseSequenceQuality
  qualityByPosition.size shouldBe 55
  qualityByPosition.keys should contain("54-55")
}
/** The per-base sequence content of the mock FastQC output has 55 entries, one of them keyed "1". */
@Test def testPerBaseSequenceContent() = {
  val fastqc = new Fastqc(null)
  fastqc.output = outputv0101

  val contentByPosition: Map[String, Map[String, Double]] = fastqc.perBaseSequenceContent
  contentByPosition.size shouldBe 55
  contentByPosition.keys should contain("1")
}
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment