diff --git a/README.md b/README.md
index a0daca4c8f6e27ed2fabc8e9246aba649de4747a..64fdc060ab2894fa462a7092adf83dc034082b72 100644
--- a/README.md
+++ b/README.md
@@ -71,18 +71,20 @@ motif from [PRODORIC](https://www.prodoric.de/), and the
 
 - Score every position of the genome with the motif weight matrix:
   ```console
-  running 2 tests
-  test bench_avx     ... bench:  13,294,196 ns/iter (+/- 73,022) = 349 MB/s
-  test bench_generic ... bench: 316,647,932 ns/iter (+/- 1,420,798) = 14 MB/s
+  running 3 tests
+  test bench_avx2    ... bench:  13,053,752 ns/iter (+/- 45,411) = 355 MB/s
+  test bench_ssse3   ... bench:  37,203,277 ns/iter (+/- 2,416,572) = 124 MB/s
+  test bench_generic ... bench: 314,682,807 ns/iter (+/- 1,072,174) = 14 MB/s
   ```
 
-- Find the highest-scoring position for a motif in a sequence 
+- Find the highest-scoring position for a motif in a 10kb sequence 
   (compared to the PSSM algorithm implemented in 
   [`bio::pattern_matching::pssm`](https://docs.rs/bio/1.1.0/bio/pattern_matching/pssm/index.html)):
   ```console
-  test bench_avx     ... bench:      47,069 ns/iter (+/- 10) = 212 MB/s
-  test bench_bio     ... bench:   1,437,308 ns/iter (+/- 5,419) = 6 MB/s
-  test bench_generic ... bench:     740,348 ns/iter (+/- 2,277) = 13 MB/s
+  test bench_avx2    ... bench:      46,390 ns/iter (+/- 115) = 215 MB/s
+  test bench_ssse3   ... bench:      97,691 ns/iter (+/- 2,720) = 102 MB/s
+  test bench_generic ... bench:     740,305 ns/iter (+/- 2,527) = 13 MB/s
+  test bench_bio     ... bench:   1,575,504 ns/iter (+/- 2,799) = 6 MB/s
   ```
 
 
diff --git a/lightmotif-bench/dna.rs b/lightmotif-bench/dna.rs
index 30a97cf393f0d426075c82d6c4291f77c34eb756..609e2c6aea6b6bdb4ea674f6fe441fa7a4a41b68 100644
--- a/lightmotif-bench/dna.rs
+++ b/lightmotif-bench/dna.rs
@@ -8,6 +8,9 @@ extern crate test;
 #[cfg(target_feature = "avx2")]
 use std::arch::x86_64::__m256;
 
+#[cfg(target_feature = "sse2")]
+use std::arch::x86_64::__m128;
+
 use lightmotif::Alphabet;
 use lightmotif::Background;
 use lightmotif::CountMatrix;
@@ -45,10 +48,37 @@ fn bench_generic(bencher: &mut test::Bencher) {
 
 #[cfg(target_feature = "avx2")]
 #[bench]
-fn bench_avx(bencher: &mut test::Bencher) {
+fn bench_sse2(bencher: &mut test::Bencher) {
     let seq = &SEQUENCE[..10000];
     let encoded = EncodedSequence::<DnaAlphabet>::from_text(seq).unwrap();
-    let mut striped = encoded.to_striped::<32>();
+    let mut striped = encoded.to_striped();
+
+    let bg = Background::<DnaAlphabet, { DnaAlphabet::K }>::uniform();
+    let cm = CountMatrix::<DnaAlphabet, { DnaAlphabet::K }>::from_sequences(&[
+        EncodedSequence::from_text("GTTGACCTTATCAAC").unwrap(),
+        EncodedSequence::from_text("GTTGATCCAGTCAAC").unwrap(),
+    ])
+    .unwrap();
+    let pbm = cm.to_probability(0.1);
+    let pwm = pbm.to_weight(bg);
+
+    striped.configure(&pwm);
+    let pli = Pipeline::<_, __m128>::new();
+
+    let mut scores = StripedScores::new_for(&striped, &pwm);
+    bencher.bytes = seq.len() as u64;
+    bencher.iter(|| {
+        pli.score_into(&striped, &pwm, &mut scores);
+        test::black_box(scores.argmax());
+    });
+}
+
+#[cfg(target_feature = "avx2")]
+#[bench]
+fn bench_avx2(bencher: &mut test::Bencher) {
+    let seq = &SEQUENCE[..10000];
+    let encoded = EncodedSequence::<DnaAlphabet>::from_text(seq).unwrap();
+    let mut striped = encoded.to_striped();
 
     let bg = Background::<DnaAlphabet, { DnaAlphabet::K }>::uniform();
     let cm = CountMatrix::<DnaAlphabet, { DnaAlphabet::K }>::from_sequences(&[
diff --git a/lightmotif/benches/ecoli.rs b/lightmotif/benches/ecoli.rs
index 70f873bcc973a86947b868dd5c71387b8e8abab6..e156b99c956162299e4b0b491a9a625a14a60acf 100644
--- a/lightmotif/benches/ecoli.rs
+++ b/lightmotif/benches/ecoli.rs
@@ -3,6 +3,9 @@
 extern crate lightmotif;
 extern crate test;
 
+#[cfg(target_feature = "ssse3")]
+use std::arch::x86_64::__m128;
+
 #[cfg(target_feature = "avx2")]
 use std::arch::x86_64::__m256;
 
@@ -37,14 +40,37 @@ fn bench_generic(bencher: &mut test::Bencher) {
     bencher.iter(|| test::black_box(pli.score_into(&striped, &pwm, &mut scores)));
 }
 
+#[cfg(target_feature = "ssse3")]
+#[bench]
+fn bench_ssse3(bencher: &mut test::Bencher) {
+    let encoded = EncodedSequence::<DnaAlphabet>::from_text(SEQUENCE).unwrap();
+    let mut striped = encoded.to_striped();
+
+    let bg = Background::uniform();
+    let cm = CountMatrix::from_sequences(&[
+        EncodedSequence::from_text("GTTGACCTTATCAAC").unwrap(),
+        EncodedSequence::from_text("GTTGATCCAGTCAAC").unwrap(),
+    ])
+    .unwrap();
+    let pbm = cm.to_probability(0.1);
+    let pwm = pbm.to_weight(bg);
+
+    striped.configure(&pwm);
+    let pli = Pipeline::<_, __m128>::new();
+
+    let mut scores = StripedScores::new_for(&striped, &pwm);
+    bencher.bytes = SEQUENCE.len() as u64;
+    bencher.iter(|| test::black_box(pli.score_into(&striped, &pwm, &mut scores)));
+}
+
 #[cfg(target_feature = "avx2")]
 #[bench]
-fn bench_avx(bencher: &mut test::Bencher) {
+fn bench_avx2(bencher: &mut test::Bencher) {
     let encoded = EncodedSequence::<DnaAlphabet>::from_text(SEQUENCE).unwrap();
-    let mut striped = encoded.to_striped::<32>();
+    let mut striped = encoded.to_striped();
 
-    let bg = Background::<DnaAlphabet, { DnaAlphabet::K }>::uniform();
-    let cm = CountMatrix::<DnaAlphabet, { DnaAlphabet::K }>::from_sequences(&[
+    let bg = Background::uniform();
+    let cm = CountMatrix::from_sequences(&[
         EncodedSequence::from_text("GTTGACCTTATCAAC").unwrap(),
         EncodedSequence::from_text("GTTGATCCAGTCAAC").unwrap(),
     ])
diff --git a/lightmotif/src/pli.rs b/lightmotif/src/pli.rs
index 3283efa9cd8ea90a8c717d9d303b750f58e0ab84..bbd80f5efe488d7309d37e15fdf5d4405bf7e7f3 100644
--- a/lightmotif/src/pli.rs
+++ b/lightmotif/src/pli.rs
@@ -1,4 +1,4 @@
-#[cfg(target_feature = "avx2")]
+#[cfg(target_feature = "ssse3")]
 use std::arch::x86_64::*;
 
 use self::seal::Vector;
@@ -16,6 +16,9 @@ mod seal {
 
     #[cfg(target_feature = "avx2")]
     impl Vector for std::arch::x86_64::__m256 {}
+
+    #[cfg(target_feature = "ssse3")]
+    impl Vector for std::arch::x86_64::__m128 {}
 }
 
 pub struct Pipeline<A: Alphabet, V: Vector> {
@@ -76,6 +79,112 @@ impl Pipeline<DnaAlphabet, f32> {
     }
 }
 
+#[cfg(target_feature = "ssse3")]
+impl Pipeline<DnaAlphabet, __m128> {
+    pub fn score_into<S, M>(
+        &self,
+        seq: S,
+        pwm: M,
+        scores: &mut StripedScores<__m128, { std::mem::size_of::<__m128i>() }>,
+    ) where
+        S: AsRef<StripedSequence<DnaAlphabet, { std::mem::size_of::<__m128i>() }>>,
+        M: AsRef<WeightMatrix<DnaAlphabet, { DnaAlphabet::K }>>,
+    {
+        let seq = seq.as_ref();
+        let pwm = pwm.as_ref();
+        let result = &mut scores.data;
+
+        if seq.wrap < pwm.len() - 1 {
+            panic!("not enough wrapping rows for motif of length {}", pwm.len());
+        }
+        if result.rows() < (seq.data.rows() - seq.wrap) {
+            panic!("not enough rows for scores: {}", pwm.len());
+        }
+
+        scores.length = seq.length - pwm.len() + 1;
+        unsafe {
+            // mask vectors for broadcasting uint8x16_t to uint32x4_t to floatx4_t
+            let m1 = _mm_set_epi32(
+                0xFFFFFF03u32 as i32,
+                0xFFFFFF02u32 as i32,
+                0xFFFFFF01u32 as i32,
+                0xFFFFFF00u32 as i32,
+            );
+            let m2 = _mm_set_epi32(
+                0xFFFFFF07u32 as i32,
+                0xFFFFFF06u32 as i32,
+                0xFFFFFF05u32 as i32,
+                0xFFFFFF04u32 as i32,
+            );
+            let m3 = _mm_set_epi32(
+                0xFFFFFF0Bu32 as i32,
+                0xFFFFFF0Au32 as i32,
+                0xFFFFFF09u32 as i32,
+                0xFFFFFF08u32 as i32,
+            );
+            let m4 = _mm_set_epi32(
+                0xFFFFFF0Fu32 as i32,
+                0xFFFFFF0Eu32 as i32,
+                0xFFFFFF0Du32 as i32,
+                0xFFFFFF0Cu32 as i32,
+            );
+            //
+            // process every position of the sequence data
+            for i in 0..seq.data.rows() - seq.wrap {
+                // reset sums for current position
+                let mut s1 = _mm_setzero_ps();
+                let mut s2 = _mm_setzero_ps();
+                let mut s3 = _mm_setzero_ps();
+                let mut s4 = _mm_setzero_ps();
+                // advance position in the position weight matrix
+                for j in 0..pwm.len() {
+                    // load sequence row and broadcast to f32
+                    let x = _mm_load_si128(seq.data[i + j].as_ptr() as *const __m128i);
+                    let x1 = _mm_shuffle_epi8(x, m1);
+                    let x2 = _mm_shuffle_epi8(x, m2);
+                    let x3 = _mm_shuffle_epi8(x, m3);
+                    let x4 = _mm_shuffle_epi8(x, m4);
+                    // load row for current weight matrix position
+                    let row = pwm.data[j].as_ptr();
+                    // index lookup table with each bases incrementally
+                    for i in 0..DnaAlphabet::K {
+                        let sym = _mm_set1_epi32(i as i32);
+                        let lut = _mm_set1_ps(*row.add(i as usize));
+                        let p1 = _mm_castsi128_ps(_mm_cmpeq_epi32(x1, sym));
+                        let p2 = _mm_castsi128_ps(_mm_cmpeq_epi32(x2, sym));
+                        let p3 = _mm_castsi128_ps(_mm_cmpeq_epi32(x3, sym));
+                        let p4 = _mm_castsi128_ps(_mm_cmpeq_epi32(x4, sym));
+                        s1 = _mm_add_ps(s1, _mm_and_ps(lut, p1));
+                        s2 = _mm_add_ps(s2, _mm_and_ps(lut, p2));
+                        s3 = _mm_add_ps(s3, _mm_and_ps(lut, p3));
+                        s4 = _mm_add_ps(s4, _mm_and_ps(lut, p4));
+                    }
+                }
+                // record the score for the current position
+                let row = &mut result[i];
+                _mm_store_ps(row[0..].as_mut_ptr(), s1);
+                _mm_store_ps(row[4..].as_mut_ptr(), s2);
+                _mm_store_ps(row[8..].as_mut_ptr(), s3);
+                _mm_store_ps(row[12..].as_mut_ptr(), s4);
+            }
+        }
+    }
+
+    pub fn score<S, M>(
+        &self,
+        seq: S,
+        pwm: M,
+    ) -> StripedScores<__m128, { std::mem::size_of::<__m128i>() }>
+    where
+        S: AsRef<StripedSequence<DnaAlphabet, { std::mem::size_of::<__m128i>() }>>,
+        M: AsRef<WeightMatrix<DnaAlphabet, { DnaAlphabet::K }>>,
+    {
+        let mut scores = StripedScores::new_for(&seq, &pwm);
+        self.score_into(seq, pwm, &mut scores);
+        scores
+    }
+}
+
 #[cfg(target_feature = "avx2")]
 impl Pipeline<DnaAlphabet, __m256> {
     pub fn score_into<S, M>(
@@ -287,6 +396,43 @@ impl<const C: usize> StripedScores<f32, C> {
     }
 }
 
+#[cfg(target_feature = "ssse3")]
+impl<const C: usize> StripedScores<__m128, C> {
+    /// Convert the striped scores to a vector of scores.
+    pub fn to_vec(&self) -> Vec<f32> {
+        let mut vec = Vec::with_capacity(self.length);
+        for i in 0..self.length {
+            let col = i / self.data.rows();
+            let row = i % self.data.rows();
+            vec.push(self.data[row][col]);
+        }
+        vec
+    }
+
+    /// Get the index of the highest scoring position.
+    ///
+    /// ## Panic
+    /// Panics if the data buffer is empty.
+    pub fn argmax(&self) -> usize {
+        let mut best_pos = 0;
+        let mut best_score = self.data[0][0];
+        let mut col = 0;
+        let mut row = 0;
+        for i in 0..self.length {
+            if self.data[row][col] > best_score {
+                best_pos = i;
+                best_score = self.data[row][col];
+            }
+            row += 1;
+            if row == self.data.rows() {
+                row = 0;
+                col += 1;
+            }
+        }
+        best_pos
+    }
+}
+
 #[cfg(target_feature = "avx2")]
 impl<const C: usize> StripedScores<__m256, C> {
     /// Convert the striped scores to a vector of scores.
diff --git a/lightmotif/tests/dna.rs b/lightmotif/tests/dna.rs
index 3c617325c54a25614049e8942d2556f27cdecee2..26c3315371529eda24251a9f9dc549797289e24b 100644
--- a/lightmotif/tests/dna.rs
+++ b/lightmotif/tests/dna.rs
@@ -1,5 +1,7 @@
 extern crate lightmotif;
 
+#[cfg(target_feature = "ssse3")]
+use std::arch::x86_64::__m128;
 #[cfg(target_feature = "avx2")]
 use std::arch::x86_64::__m256;
 
@@ -62,10 +64,45 @@ fn test_score_generic() {
     // assert_eq!(result.data[0][0], -23.07094); // -23.07094
 }
 
+#[cfg(target_feature = "ssse3")]
+#[test]
+fn test_score_ssse3() {
+    let encoded = EncodedSequence::<DnaAlphabet>::from_text(SEQUENCE).unwrap();
+    let mut striped = encoded.to_striped();
+
+    let cm = CountMatrix::from_sequences(
+        PATTERNS
+            .iter()
+            .map(|x| EncodedSequence::from_text(x).unwrap()),
+    )
+    .unwrap();
+    let pbm = cm.to_probability([0.1, 0.1, 0.1, 0.1, 0.0]);
+    let pwm = pbm.to_weight([0.25, 0.25, 0.25, 0.25, 0.0]);
+
+    striped.configure(&pwm);
+    let pli = Pipeline::<_, __m128>::new();
+    let result = pli.score(&striped, &pwm);
+
+    // for i in 0..result.data.rows() {
+    //     println!("i={} {:?}", i, &result.data[i]);
+    // }
+
+    let scores = result.to_vec();
+    assert_eq!(scores.len(), EXPECTED.len());
+    for i in 0..EXPECTED.len() {
+        assert!(
+            (scores[i] - EXPECTED[i]).abs() < 1e-5,
+            "{} != {} at position {}",
+            scores[i],
+            EXPECTED[i],
+            i
+        );
+    }
+}
+
 #[cfg(target_feature = "avx2")]
 #[test]
 fn test_score_avx2() {
-    // let seq = "ATGTCCCAACAACGATACCCCGAGCCCATCGCCGTCATCGGCTCGGCATGCAGATTCCCAGGCGCATCCAGCTCGCCCTCCAAGCTGTGGAGTCTGCTCCAGGAACCTCGCGACGTCCTCAAGAAGTTCGACCCAGACCGCCTCAACCTGAAACGATTCCATCATACCAACGGTGACACTCACGGTGCGACCGACGTCAACAACAAATCATATCTCCTCGAAGAAAACACCCGACTCTTCGATGCCTCGTTCTTCGGAATCAGCCCCCTGGAGGCGGCCGGTATGGACCCCCAGCAGCGTCTGTTGCTGGAAACCGTCTACGAGTCGTTTGAGGCGGCTGGCGTGACCCTCGATCAGCTCAAGGGTTCTTTGACCTCGGTTCATGTTGGCGTCATGACCAACGACTACTCCTTTATCCAGCTCCGTGACCCAGAAACGCTGTCGAAGTACAACGCGACTGGCACGGCCAACAGCATCATGTCGAACCGTATTTCATATGTCTTTGACTTGAAAGGTCCATCAGAGACCATCGACACGGCGTGCTCCAGCTCGCTGGTCGCCCTGCACCACGCTGCTCAGGGCCTGCTCAGCGGCGACTGCGAGACTGCCGTCGTCGCCGGCGTCAACCTCATCTTCGACCCCTCTCCATACATCACAGAGTCCAAGCTACACATGCTGTCACCCGACTCCCAGTCTCGCATGTGGGACAAGTCTGCAAATGGCTACGCCCGCGGCGAGGGCGCTGCCGCGCTGCTCCTGAAGCCCCTCAGCCGCGCCCTGAGGGACGGCGATCACATCGAGGGCATTGTCCGAGGCACAGGAGTCAACTCGGACGGCCAGAGCTCCGGCATCACCATGCCTTTTGCCCCTGCGCAGTCGGCGCTCATTCGCCAAACTTATCTCCGTGCTGGCCTCGACCCGATCAAGGACCGGCCTCAGTACTTCGAGTGCCACGGCACCGGAACTCCAGCTGGTGACCCCGTGGAAGCGCGAGCCATCAGCGAGTCGTTGTTGGACGGTGAAAATGTCCCAACAACGATACCCCGAGCCCATCGCCGTCATCGGCTCGGCATGCAGATTCCCAGGCGCATCCAGCTCGCCCTCCAAGCTGTGGAGTCTGCTCCAGGAACCTCGCGACGTCCTCAAGAAGTTCGACCCAGACCGCCTCAACCTGAAACGATTCCATCATACCAACGGTGACACTCACGGTGCGACCGACGTCAACAACAAATCATATCTCCTCGAAGAAAACACCCGACTCTTCGATGCCTCGTTCTTCGGAATCAGCCCCCTGGAGGCGGCCGGTATGGACCCCCAGCAGCGTCTGTTGCTGGAAACCGTCTACGAGTCGTTTGAGGCGGCTGGCGTGACCCTCGATCAGCTCAAGGGTTCTTTGACCTCGGTTCATGTTGGCGTCATGACCAACGACTACTCCTTTATCCAGCTCCGTGACCCAGAAACGCTGTCGAAGTACAACGCGACTGGCACGGCCAACAGCATCATGTCGAACCGTATTTCATATGTCTTTGACTTGAAAGGTCCATCAGAGACCATCGACACGGCGTGCTCCAGCTCGCTGGTCGCCCTGCACCACGCTGCTCAGGGCCTGCTCAGCGGCGACTGCGAGACTGCCGTCGTCGCCGGCGTCAACCTCATCTTCGACCCCTCTCCATACATCACAGAGTCCAAGCTACACATGCTGTCACCCGACTCCCAGTCTCGCATGTGGGACAAGTCTGCAAATGGCTACGCCCGCGGCGAGGGCGCTGCCGCGCTGCTCCTGAAGCCCCTCAGCCGCGCCCTGAGGGACGGCGATCACATCGAGGGCATTGTCCGAGGCACAGGAGTCAACTCGGACGGCCAGAGCTCCGGCATCACCATGCCTTTTGCCCCTGCGCAGTCGGCGCTCATTCGCCAAACTTATCTCCGTGCTGGCCTCGACCCGATCAAGGACCGGCCTCAGTACTTCGAGTGCCACGGCACCGGAACTCCAGCTGGTGACCCCGTGGAAGCGCGAGCCATCAGCGAGTCGTTGTTGGACGGTGAAA";
     let encoded = EncodedSequence::<DnaAlphabet>::from_text(SEQUENCE).unwrap();
     let mut striped = encoded.to_striped::<{ std::mem::size_of::<__m256>() }>();