From 0dee6009ba67ab93b4b08d5911f3d7c162728874 Mon Sep 17 00:00:00 2001 From: Martin Larralde <martin.larralde@embl.de> Date: Tue, 16 May 2023 13:04:35 +0200 Subject: [PATCH] Reorganize pipeline code into several submodules --- README.md | 7 +- lightmotif/benches/ecoli.rs | 21 +- lightmotif/src/lib.rs | 14 +- lightmotif/src/pli/mod.rs | 326 +++++++--------------- lightmotif/src/pli/{ => platform}/avx2.rs | 77 +++-- lightmotif/src/pli/platform/generic.rs | 28 ++ lightmotif/src/pli/platform/mod.rs | 19 ++ lightmotif/src/pli/{ => platform}/sse2.rs | 82 ++++-- lightmotif/src/pli/scores.rs | 156 +++++++++++ lightmotif/src/pli/vector.rs | 49 ++++ lightmotif/src/seq.rs | 2 + lightmotif/tests/dna.rs | 28 +- 12 files changed, 495 insertions(+), 314 deletions(-) rename lightmotif/src/pli/{ => platform}/avx2.rs (77%) create mode 100644 lightmotif/src/pli/platform/generic.rs create mode 100644 lightmotif/src/pli/platform/mod.rs rename lightmotif/src/pli/{ => platform}/sse2.rs (71%) create mode 100644 lightmotif/src/pli/scores.rs create mode 100644 lightmotif/src/pli/vector.rs diff --git a/README.md b/README.md index c7b471d..269d6fb 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ use lightmotif::*; use typenum::U32; // Create a count matrix from an iterable of motif sequences -let counts = CountMatrix::from_sequences(&[ +let counts = CountMatrix::<Dna>::from_sequences(&[ EncodedSequence::encode("GTTGACCTTATCAAC").unwrap(), EncodedSequence::encode("GTTGATCCAGTCAAC").unwrap(), ]).unwrap(); @@ -62,7 +62,8 @@ let mut striped = encoded.to_striped::<U32>(); striped.configure(&pssm); // Use a pipeline to compute scores for every position of the matrix. -let scores = Pipeline::<Dna>::score(&striped, &pssm); +let pli = Pipeline::generic(); +let scores = pli.score(&striped, &pssm); // Scores can be extracted into a Vec<f32>, or indexed directly. let v = scores.to_vec(); @@ -70,7 +71,7 @@ assert_eq!(scores[0], -23.07094); assert_eq!(v[0], -23.07094); // The highest scoring position can be searched with a pipeline as well. -let best = Pipeline::<Dna>::best_position(&scores).unwrap(); +let best = pli.best_position(&scores).unwrap(); assert_eq!(best, 18); ``` diff --git a/lightmotif/benches/ecoli.rs b/lightmotif/benches/ecoli.rs index f2cdb68..e076bbf 100644 --- a/lightmotif/benches/ecoli.rs +++ b/lightmotif/benches/ecoli.rs @@ -35,9 +35,10 @@ fn bench_generic(bencher: &mut test::Bencher) { let pssm = pbm.to_scoring(None); striped.configure(&pssm); + let pli = Pipeline::generic(); let mut scores = StripedScores::new_for(&striped, &pssm); bencher.bytes = SEQUENCE.len() as u64; - bencher.iter(|| test::black_box(Pipeline::<_, u8>::score_into(&striped, &pssm, &mut scores))); + bencher.iter(|| test::black_box(pli.score_into(&striped, &pssm, &mut scores))); } #[cfg(target_feature = "sse2")] @@ -55,15 +56,10 @@ fn bench_sse2(bencher: &mut test::Bencher) { let pssm = pbm.to_scoring(None); striped.configure(&pssm); + let pli = Pipeline::sse2().unwrap(); let mut scores = StripedScores::new_for(&striped, &pssm); bencher.bytes = SEQUENCE.len() as u64; - bencher.iter(|| { - test::black_box(Pipeline::<_, __m128i>::score_into( - &striped, - &pssm, - &mut scores, - )) - }); + bencher.iter(|| test::black_box(pli.score_into(&striped, &pssm, &mut scores))); } #[cfg(target_feature = "avx2")] @@ -81,13 +77,8 @@ fn bench_avx2(bencher: &mut test::Bencher) { let pssm = pbm.to_scoring(None); striped.configure(&pssm); + let pli = Pipeline::avx2().unwrap(); let mut scores = StripedScores::new_for(&striped, &pssm); bencher.bytes = SEQUENCE.len() as u64; - bencher.iter(|| { - test::black_box(Pipeline::<_, __m256i>::score_into( - &striped, - &pssm, - &mut scores, - )) - }); + bencher.iter(|| test::black_box(pli.score_into(&striped, &pssm, &mut scores))); } diff --git a/lightmotif/src/lib.rs b/lightmotif/src/lib.rs index 87c49fd..7339ff8 100644 --- a/lightmotif/src/lib.rs +++ b/lightmotif/src/lib.rs @@ -3,12 +3,12 @@ extern crate generic_array; extern crate typenum; -mod abc; -mod dense; -mod err; -mod pli; -mod pwm; -mod seq; +pub mod abc; +pub mod dense; +pub mod err; +pub mod pli; +pub mod pwm; +pub mod seq; pub use abc::Alphabet; pub use abc::AminoAcid; @@ -22,10 +22,10 @@ pub use abc::Pseudocounts; pub use abc::Symbol; pub use dense::DenseMatrix; pub use err::InvalidSymbol; +pub use pli::BestPosition; pub use pli::Pipeline; pub use pli::Score; pub use pli::StripedScores; -pub use pli::Vector; pub use pwm::CountMatrix; pub use pwm::FrequencyMatrix; pub use pwm::ScoringMatrix; diff --git a/lightmotif/src/pli/mod.rs b/lightmotif/src/pli/mod.rs index 7336f11..702443b 100644 --- a/lightmotif/src/pli/mod.rs +++ b/lightmotif/src/pli/mod.rs @@ -5,8 +5,14 @@ use std::ops::Range; use typenum::marker_traits::NonZero; use typenum::marker_traits::Unsigned; -pub use self::vector::Vector; - +pub use self::platform::Avx2; +pub use self::platform::Backend; +pub use self::platform::Generic; +pub use self::platform::Sse2; +pub use self::platform::UnsupportedBackend; +pub use self::scores::StripedScores; + +use self::vector::Vector; use super::abc::Alphabet; use super::abc::Dna; use super::abc::Symbol; @@ -14,75 +20,43 @@ use super::dense::DenseMatrix; use super::pwm::ScoringMatrix; use super::seq::StripedSequence; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -mod avx2; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -mod sse2; - -// --- Vector ------------------------------------------------------------------ - -mod vector { - use typenum::consts::U1; - use typenum::consts::U16; - use typenum::consts::U32; - use typenum::marker_traits::NonZero; - use typenum::marker_traits::Unsigned; - - mod seal { - pub trait Sealed {} - - impl Sealed for u8 {} - - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - impl Sealed for std::arch::x86_64::__m128i {} - - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - impl Sealed for std::arch::x86_64::__m256i {} - } - - /// Sealed trait for concrete vector implementations. - /// - /// The trait is defined for the loading vector type, which has `LANES` - /// lanes of `u8` values. These values are then splat into 4 vectors with - /// `f32` values to actually compute the scores. - pub trait Vector: seal::Sealed { - type LANES: Unsigned + NonZero; - } - - impl Vector for u8 { - type LANES = U1; - } - - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - impl Vector for std::arch::x86_64::__m128i { - type LANES = U16; - } - - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - impl Vector for std::arch::x86_64::__m256i { - type LANES = U32; - } - - #[cfg(target_feature = "avx2")] - pub type Best = std::arch::x86_64::__m256i; - #[cfg(all(not(target_feature = "avx2"), target_feature = "ssse3"))] - pub type Best = std::arch::x86_64::__m128i; - #[cfg(all(not(target_feature = "avx2"), not(target_feature = "ssse3")))] - pub type Best = u8; -} +mod platform; +mod scores; +mod vector; // --- Score ------------------------------------------------------------------- /// Generic trait for computing sequence scores with a PSSM. -pub trait Score<A: Alphabet, V: Vector, C: NonZero + Unsigned = <V as Vector>::LANES> { +pub trait Score<A: Alphabet, C: NonZero + Unsigned> { /// Compute the PSSM scores into the given buffer. - fn score_into<S, M>(seq: S, pssm: M, scores: &mut StripedScores<C>) + fn score_into<S, M>(&self, seq: S, pssm: M, scores: &mut StripedScores<C>) where S: AsRef<StripedSequence<A, C>>, - M: AsRef<ScoringMatrix<A>>; + M: AsRef<ScoringMatrix<A>>, + { + let seq = seq.as_ref(); + let pssm = pssm.as_ref(); + + let seq_rows = seq.data.rows() - seq.wrap; + scores.resize(seq.length - pssm.len() + 1, seq_rows); + + let result = scores.matrix_mut(); + for i in 0..seq.length - pssm.len() + 1 { + let mut score = 0.0; + for j in 0..pssm.len() { + let offset = i + j; + let col = offset / seq_rows; + let row = offset % seq_rows; + score += pssm.weights()[j][seq.data[row][col].as_index()]; + } + let col = i / result.rows(); + let row = i % result.rows(); + result[row][col] = score; + } + } /// Compute the PSSM scores for every sequence positions. - fn score<S, M>(seq: S, pssm: M) -> StripedScores<C> + fn score<S, M>(&self, seq: S, pssm: M) -> StripedScores<C> where S: AsRef<StripedSequence<A, C>>, M: AsRef<ScoringMatrix<A>>, @@ -91,24 +65,27 @@ pub trait Score<A: Alphabet, V: Vector, C: NonZero + Unsigned = <V as Vector>::L let pssm = pssm.as_ref(); let data = unsafe { DenseMatrix::uninitialized(seq.data.rows() - seq.wrap) }; let length = seq.length - pssm.len() + 1; - let mut scores = StripedScores { length, data }; - Self::score_into(seq, pssm, &mut scores); + let mut scores = StripedScores::new(length, data); + self.score_into(seq, pssm, &mut scores); scores } +} +pub trait BestPosition<C: NonZero + Unsigned> { /// Find the sequence position with the highest score. - fn best_position(scores: &StripedScores<C>) -> Option<usize> { - if scores.length == 0 { + fn best_position(&self, scores: &StripedScores<C>) -> Option<usize> { + if scores.len() == 0 { return None; } + let data = scores.matrix(); let mut best_pos = 0; - let mut best_score = scores.data[0][0]; - for i in 0..scores.length { - let col = i / scores.data.rows(); - let row = i % scores.data.rows(); - if scores.data[row][col] > best_score { - best_score = scores.data[row][col]; + let mut best_score = data[0][0]; + for i in 0..scores.len() { + let col = i / data.rows(); + let row = i % data.rows(); + if data[row][col] > best_score { + best_score = data[row][col]; best_pos = i; } } @@ -121,183 +98,92 @@ pub trait Score<A: Alphabet, V: Vector, C: NonZero + Unsigned = <V as Vector>::L /// Wrapper implementing score computation for different platforms. #[derive(Debug, Default, Clone)] -pub struct Pipeline<A: Alphabet, V: Vector = vector::Best> { +pub struct Pipeline<A: Alphabet, B: Backend> { alphabet: std::marker::PhantomData<A>, - vector: std::marker::PhantomData<V>, + backend: std::marker::PhantomData<B>, } -/// Scalar scoring implementation. -impl<A: Alphabet, C: NonZero + Unsigned> Score<A, u8, C> for Pipeline<A, u8> { - fn score_into<S, M>(seq: S, pssm: M, scores: &mut StripedScores<C>) - where - S: AsRef<StripedSequence<A, C>>, - M: AsRef<ScoringMatrix<A>>, - { - let seq = seq.as_ref(); - let pssm = pssm.as_ref(); +// --- Generic pipeline -------------------------------------------------------- - let seq_rows = seq.data.rows() - seq.wrap; - let result = &mut scores.data; - if result.rows() < seq_rows { - panic!("not enough rows for scores: {}", pssm.len()); - } - - for i in 0..seq.length - pssm.len() + 1 { - let mut score = 0.0; - for j in 0..pssm.len() { - let offset = i + j; - let col = offset / seq_rows; - let row = offset % seq_rows; - score += pssm.weights()[j][seq.data[row][col].as_index()]; - } - let col = i / result.rows(); - let row = i % result.rows(); - result[row][col] = score; +impl<A: Alphabet> Pipeline<A, Generic> { + /// Create a new generic pipeline. + pub const fn generic() -> Self { + Self { + alphabet: std::marker::PhantomData, + backend: std::marker::PhantomData, } } } -// --- SSSE3 ------------------------------------------------------------------- +impl<A: Alphabet, C: NonZero + Unsigned> Score<A, C> for Pipeline<A, Generic> {} -// --- StripedScores ----------------------------------------------------------- +impl<A: Alphabet, C: NonZero + Unsigned> BestPosition<C> for Pipeline<A, Generic> {} -#[derive(Clone, Debug)] -pub struct StripedScores<C: Unsigned + NonZero> { - data: DenseMatrix<f32, C>, - length: usize, -} +// --- SSE2 pipeline ----------------------------------------------------------- -impl<C: Unsigned + NonZero> StripedScores<C> { - /// Create a new striped score matrix with the given length and rows. - pub fn new(length: usize, rows: usize) -> Self { - Self { - length, - data: DenseMatrix::new(rows), +impl<A: Alphabet> Pipeline<A, Sse2> { + /// Attempt to create a new SSE2-accelerated pipeline. + pub fn sse2() -> Result<Self, UnsupportedBackend> { + #[cfg(target_arch = "x86")] + if std::is_x86_feature_detected!("sse2") { + return Ok(Self::default()); } + #[cfg(target_arch = "x86_64")] + return Ok(Self::default()); + #[allow(unreachable_code)] + Err(UnsupportedBackend) } +} - /// Return the number of scored positions. - pub fn len(&self) -> usize { - self.length - } - - /// Return a reference to the striped matrix storing the scores. - pub fn matrix(&self) -> &DenseMatrix<f32, C> { - &self.data - } - - /// Return a mutable reference to the striped matrix storing the scores. - pub fn matrix_mut(&mut self) -> &mut DenseMatrix<f32, C> { - &mut self.data - } - - /// Create a new matrix large enough to store the scores of `pssm` applied to `seq`. - pub fn new_for<S, M, A>(seq: S, pssm: M) -> Self - where - A: Alphabet, - S: AsRef<StripedSequence<A, C>>, - M: AsRef<ScoringMatrix<A>>, - { - let seq = seq.as_ref(); - let pssm = pssm.as_ref(); - Self::new(seq.length - pssm.len() + 1, seq.data.rows() - seq.wrap) - } - - /// Resize the striped scores storage to the given length and number of rows. - pub fn resize(&mut self, length: usize, rows: usize) { - self.length = length; - self.data.resize(rows); - } - - /// Resize the striped scores storage to store the scores of `pssm` applied to `seq`. - pub fn resize_for<S, M, A>(&mut self, seq: S, pssm: M) - where - A: Alphabet, - S: AsRef<StripedSequence<A, C>>, +impl<A: Alphabet> Score<A, <Sse2 as Backend>::LANES> for Pipeline<A, Sse2> { + fn score_into<S, M>( + &self, + seq: S, + pssm: M, + scores: &mut StripedScores<<Sse2 as Backend>::LANES>, + ) where + S: AsRef<StripedSequence<A, <Sse2 as Backend>::LANES>>, M: AsRef<ScoringMatrix<A>>, { - let seq = seq.as_ref(); - let pssm = pssm.as_ref(); - self.resize(seq.length - pssm.len() + 1, seq.data.rows() - seq.wrap); - } - - /// Iterate over scores of individual sequence positions. - pub fn iter(&self) -> Iter<'_, C> { - Iter::new(&self) - } - - /// Convert the striped scores to a vector of scores. - pub fn to_vec(&self) -> Vec<f32> { - self.iter().cloned().collect() + Sse2::score_into(seq, pssm, scores) } } -impl<C: Unsigned + NonZero> AsRef<DenseMatrix<f32, C>> for StripedScores<C> { - fn as_ref(&self) -> &DenseMatrix<f32, C> { - self.matrix() +impl<A: Alphabet> BestPosition<<Sse2 as Backend>::LANES> for Pipeline<A, Sse2> { + fn best_position(&self, scores: &StripedScores<<Sse2 as Backend>::LANES>) -> Option<usize> { + Sse2::best_position(scores) } } -impl<C: Unsigned + NonZero> AsMut<DenseMatrix<f32, C>> for StripedScores<C> { - fn as_mut(&mut self) -> &mut DenseMatrix<f32, C> { - self.matrix_mut() - } -} +// --- AVX2 pipeline ----------------------------------------------------------- -impl<C: Unsigned + NonZero> Index<usize> for StripedScores<C> { - type Output = f32; - fn index(&self, index: usize) -> &f32 { - let col = index / self.data.rows(); - let row = index % self.data.rows(); - &self.data[row][col] - } -} - -impl<C: Unsigned + NonZero> From<StripedScores<C>> for Vec<f32> { - fn from(scores: StripedScores<C>) -> Self { - scores.iter().cloned().collect() - } -} - -// --- Iter -------------------------------------------------------------------- - -pub struct Iter<'a, C: Unsigned + NonZero> { - scores: &'a StripedScores<C>, - indices: Range<usize>, -} - -impl<'a, C: Unsigned + NonZero> Iter<'a, C> { - fn new(scores: &'a StripedScores<C>) -> Self { - Self { - scores, - indices: 0..scores.length, +impl<A: Alphabet> Pipeline<A, Avx2> { + /// Attempt to create a new AVX2-accelerated pipeline. + pub fn avx2() -> Result<Self, UnsupportedBackend> { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + if std::is_x86_feature_detected!("avx2") { + return Ok(Self::default()); } - } - - fn get(&self, i: usize) -> &'a f32 { - let col = i / self.scores.data.rows(); - let row = i % self.scores.data.rows(); - &self.scores.data[row][col] - } -} - -impl<'a, C: Unsigned + NonZero> Iterator for Iter<'a, C> { - type Item = &'a f32; - fn next(&mut self) -> Option<Self::Item> { - self.indices.next().map(|i| self.get(i)) + Err(UnsupportedBackend) } } -impl<'a, C: Unsigned + NonZero> ExactSizeIterator for Iter<'a, C> { - fn len(&self) -> usize { - self.indices.len() +impl Score<Dna, <Avx2 as Backend>::LANES> for Pipeline<Dna, Avx2> { + fn score_into<S, M>( + &self, + seq: S, + pssm: M, + scores: &mut StripedScores<<Avx2 as Backend>::LANES>, + ) where + S: AsRef<StripedSequence<Dna, <Avx2 as Backend>::LANES>>, + M: AsRef<ScoringMatrix<Dna>>, + { + Avx2::score_into(seq, pssm, scores) } } -impl<'a, C: Unsigned + NonZero> FusedIterator for Iter<'a, C> {} - -impl<'a, C: Unsigned + NonZero> DoubleEndedIterator for Iter<'a, C> { - fn next_back(&mut self) -> Option<Self::Item> { - self.indices.next_back().map(|i| self.get(i)) +impl<A: Alphabet> BestPosition<<Avx2 as Backend>::LANES> for Pipeline<A, Avx2> { + fn best_position(&self, scores: &StripedScores<<Avx2 as Backend>::LANES>) -> Option<usize> { + Avx2::best_position(scores) } } diff --git a/lightmotif/src/pli/avx2.rs b/lightmotif/src/pli/platform/avx2.rs similarity index 77% rename from lightmotif/src/pli/avx2.rs rename to lightmotif/src/pli/platform/avx2.rs index 2b3aefd..cdf042a 100644 --- a/lightmotif/src/pli/avx2.rs +++ b/lightmotif/src/pli/platform/avx2.rs @@ -1,24 +1,35 @@ use std::arch::x86_64::*; -use typenum::Unsigned; +use typenum::consts::U32; +use typenum::marker_traits::Unsigned; -use super::Pipeline; -use super::Score; -use super::StripedScores; -use super::Vector; +use super::Backend; use crate::abc::Alphabet; use crate::abc::Dna; use crate::abc::Nucleotide; use crate::abc::Symbol; +use crate::pli::scores::StripedScores; +use crate::pli::Pipeline; +use crate::pli::Vector; use crate::pwm::ScoringMatrix; use crate::seq::StripedSequence; +/// A marker type for the AVX2 implementation of the pipeline. +#[derive(Clone, Debug, Default)] +pub struct Avx2; + +impl Backend for Avx2 { + type LANES = U32; +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "avx2")] unsafe fn score_avx2( - seq: &StripedSequence<Dna, <__m256i as Vector>::LANES>, + seq: &StripedSequence<Dna, <Avx2 as Backend>::LANES>, pssm: &ScoringMatrix<Dna>, - scores: &mut StripedScores<<__m256i as Vector>::LANES>, + scores: &mut StripedScores<<Avx2 as Backend>::LANES>, ) { + let data = scores.matrix_mut(); // constant vector for comparing unknown bases let n = _mm256_set1_epi8(Nucleotide::N as i8); // mask vectors for broadcasting uint8x32_t to uint32x8_t to floatx8_t @@ -110,7 +121,7 @@ unsafe fn score_avx2( let r3 = _mm256_permute2f128_ps(s1, s2, 0x31); let r4 = _mm256_permute2f128_ps(s3, s4, 0x31); // record the score for the current position - let row = &mut scores.data[i]; + let row = &mut data[i]; _mm256_store_ps(row[0x00..].as_mut_ptr(), r1); _mm256_store_ps(row[0x08..].as_mut_ptr(), r2); _mm256_store_ps(row[0x10..].as_mut_ptr(), r3); @@ -118,11 +129,13 @@ unsafe fn score_avx2( } } +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "avx2")] -unsafe fn best_position_avx2(scores: &StripedScores<<__m256i as Vector>::LANES>) -> Option<usize> { - if scores.length == 0 { +unsafe fn best_position_avx2(scores: &StripedScores<<Avx2 as Backend>::LANES>) -> Option<usize> { + if scores.len() == 0 { None } else { + let data = scores.matrix(); unsafe { // the row index for the best score in each column // (these are 32-bit integers but for use with `_mm256_blendv_ps` @@ -132,12 +145,12 @@ unsafe fn best_position_avx2(scores: &StripedScores<<__m256i as Vector>::LANES>) let mut p3 = _mm256_setzero_ps(); let mut p4 = _mm256_setzero_ps(); // store the best scores for each column - let mut s1 = _mm256_load_ps(scores.data[0][0x00..].as_ptr()); - let mut s2 = _mm256_load_ps(scores.data[0][0x08..].as_ptr()); - let mut s3 = _mm256_load_ps(scores.data[0][0x10..].as_ptr()); - let mut s4 = _mm256_load_ps(scores.data[0][0x18..].as_ptr()); + let mut s1 = _mm256_load_ps(data[0][0x00..].as_ptr()); + let mut s2 = _mm256_load_ps(data[0][0x08..].as_ptr()); + let mut s3 = _mm256_load_ps(data[0][0x10..].as_ptr()); + let mut s4 = _mm256_load_ps(data[0][0x18..].as_ptr()); // process all rows iteratively - for (i, row) in scores.data.iter().enumerate() { + for (i, row) in data.iter().enumerate() { // record the current row index let index = _mm256_castsi256_ps(_mm256_set1_epi32(i as i32)); // load scores for the current row @@ -170,9 +183,9 @@ unsafe fn best_position_avx2(scores: &StripedScores<<__m256i as Vector>::LANES>) let mut best_pos = 0; let mut best_score = -f32::INFINITY; for (col, &row) in x.iter().enumerate() { - if scores.data[row as usize][col] > best_score { - best_score = scores.data[row as usize][col]; - best_pos = col * scores.data.rows() + row as usize; + if data[row as usize][col] > best_score { + best_score = data[row as usize][col]; + best_pos = col * data.rows() + row as usize; } } Some(best_pos) @@ -181,15 +194,14 @@ unsafe fn best_position_avx2(scores: &StripedScores<<__m256i as Vector>::LANES>) } /// Intel 256-bit vector implementation, for 32 elements column width. -impl Score<Dna, __m256i> for Pipeline<Dna, __m256i> { - fn score_into<S, M>(seq: S, pssm: M, scores: &mut StripedScores<<__m256i as Vector>::LANES>) +impl Avx2 { + pub fn score_into<S, M>(seq: S, pssm: M, scores: &mut StripedScores<<Avx2 as Backend>::LANES>) where - S: AsRef<StripedSequence<Dna, <__m256i as Vector>::LANES>>, + S: AsRef<StripedSequence<Dna, <Avx2 as Backend>::LANES>>, M: AsRef<ScoringMatrix<Dna>>, { let seq = seq.as_ref(); let pssm = pssm.as_ref(); - let result = &mut scores.data; if seq.wrap < pssm.len() - 1 { panic!( @@ -197,17 +209,22 @@ impl Score<Dna, __m256i> for Pipeline<Dna, __m256i> { pssm.len() ); } - if result.rows() < (seq.data.rows() - seq.wrap) { - panic!("not enough rows for scores: {}", pssm.len()); - } - scores.length = seq.length - pssm.len() + 1; + scores.resize(seq.length - pssm.len() + 1, seq.data.rows() - seq.wrap); + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] unsafe { - score_avx2(seq, pssm, scores); - } + score_avx2(seq, pssm, scores) + }; + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + panic!("attempting to run AVX2 code on a non-x86 host") } - fn best_position(scores: &StripedScores<<__m256i as Vector>::LANES>) -> Option<usize> { - unsafe { best_position_avx2(scores) } + pub fn best_position(scores: &StripedScores<<Avx2 as Backend>::LANES>) -> Option<usize> { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + unsafe { + best_position_avx2(scores) + } + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + panic!("attempting to run AVX2 code on a non-x86 host") } } diff --git a/lightmotif/src/pli/platform/generic.rs b/lightmotif/src/pli/platform/generic.rs new file mode 100644 index 0000000..50beef4 --- /dev/null +++ b/lightmotif/src/pli/platform/generic.rs @@ -0,0 +1,28 @@ +use typenum::consts::U1; +use typenum::marker_traits::NonZero; +use typenum::marker_traits::Unsigned; + +use super::Backend; +use crate::abc::Alphabet; +use crate::abc::Dna; +use crate::abc::Nucleotide; +use crate::abc::Symbol; +use crate::pli::scores::StripedScores; +use crate::pli::BestPosition; +use crate::pli::Pipeline; +use crate::pli::Score; +use crate::pli::Vector; +use crate::pwm::ScoringMatrix; +use crate::seq::StripedSequence; + +/// A marker type for the generic implementation of the pipeline. +#[derive(Clone, Debug, Default)] +pub struct Generic; + +impl Backend for Generic { + type LANES = U1; +} + +impl<A: Alphabet, C: NonZero + Unsigned> Score<A, C> for Generic {} + +impl<C: NonZero + Unsigned> BestPosition<C> for Generic {} diff --git a/lightmotif/src/pli/platform/mod.rs b/lightmotif/src/pli/platform/mod.rs new file mode 100644 index 0000000..30e3112 --- /dev/null +++ b/lightmotif/src/pli/platform/mod.rs @@ -0,0 +1,19 @@ +mod avx2; +mod generic; +mod sse2; + +pub use self::avx2::Avx2; +pub use self::generic::Generic; +pub use self::sse2::Sse2; + +use typenum::marker_traits::NonZero; +use typenum::marker_traits::Unsigned; + +/// A marker trait for backends. +pub trait Backend { + type LANES: Unsigned + NonZero; +} + +/// An error marker when a pipeline backend is unsupported on the host platform. +#[derive(Debug, Clone)] +pub struct UnsupportedBackend; diff --git a/lightmotif/src/pli/sse2.rs b/lightmotif/src/pli/platform/sse2.rs similarity index 71% rename from lightmotif/src/pli/sse2.rs rename to lightmotif/src/pli/platform/sse2.rs index 78838ef..3ac41d7 100644 --- a/lightmotif/src/pli/sse2.rs +++ b/lightmotif/src/pli/platform/sse2.rs @@ -1,24 +1,38 @@ +//! Intel 128-bit vector implementation, for 16 elements column width. use std::arch::x86_64::*; -use typenum::Unsigned; +use typenum::consts::U16; +use typenum::marker_traits::Unsigned; -use super::Pipeline; -use super::Score; -use super::StripedScores; -use super::Vector; +use super::Backend; use crate::abc::Alphabet; use crate::abc::Dna; use crate::abc::Nucleotide; use crate::abc::Symbol; +use crate::pli::scores::StripedScores; +use crate::pli::BestPosition; +use crate::pli::Pipeline; +use crate::pli::Score; +use crate::pli::Vector; use crate::pwm::ScoringMatrix; use crate::seq::StripedSequence; +/// A marker type for the SSE2 implementation of the pipeline. +#[derive(Clone, Debug, Default)] +pub struct Sse2; + +impl Backend for Sse2 { + type LANES = U16; +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "sse2")] unsafe fn score_sse2<A: Alphabet>( - seq: &StripedSequence<A, <__m128i as Vector>::LANES>, + seq: &StripedSequence<A, <Sse2 as Backend>::LANES>, pssm: &ScoringMatrix<A>, - scores: &mut StripedScores<<__m128i as Vector>::LANES>, + scores: &mut StripedScores<<Sse2 as Backend>::LANES>, ) { + let data = scores.matrix_mut(); // mask vectors for broadcasting uint8x16_t to uint32x4_t to floatx4_t let zero = _mm_setzero_si128(); // process every position of the sequence data @@ -55,7 +69,7 @@ unsafe fn score_sse2<A: Alphabet>( } } // record the score for the current position - let row = &mut scores.data[i]; + let row = &mut data[i]; _mm_storeu_ps(row[0..].as_mut_ptr(), s1); _mm_storeu_ps(row[4..].as_mut_ptr(), s2); _mm_storeu_ps(row[8..].as_mut_ptr(), s3); @@ -63,11 +77,13 @@ unsafe fn score_sse2<A: Alphabet>( } } +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "sse2")] -unsafe fn best_position_sse2(scores: &StripedScores<<__m128i as Vector>::LANES>) -> Option<usize> { - if scores.length == 0 { +unsafe fn best_position_sse2(scores: &StripedScores<<Sse2 as Backend>::LANES>) -> Option<usize> { + if scores.len() == 0 { None } else { + let data = scores.matrix(); unsafe { // the row index for the best score in each column // (these are 32-bit integers but for use with `_mm256_blendv_ps` @@ -77,12 +93,12 @@ unsafe fn best_position_sse2(scores: &StripedScores<<__m128i as Vector>::LANES>) let mut p3 = _mm_setzero_ps(); let mut p4 = _mm_setzero_ps(); // store the best scores for each column - let mut s1 = _mm_load_ps(scores.data[0][0x00..].as_ptr()); - let mut s2 = _mm_load_ps(scores.data[0][0x04..].as_ptr()); - let mut s3 = _mm_load_ps(scores.data[0][0x08..].as_ptr()); - let mut s4 = _mm_load_ps(scores.data[0][0x0c..].as_ptr()); + let mut s1 = _mm_load_ps(data[0][0x00..].as_ptr()); + let mut s2 = _mm_load_ps(data[0][0x04..].as_ptr()); + let mut s3 = _mm_load_ps(data[0][0x08..].as_ptr()); + let mut s4 = _mm_load_ps(data[0][0x0c..].as_ptr()); // process all rows iteratively - for (i, row) in scores.data.iter().enumerate() { + for (i, row) in data.iter().enumerate() { // record the current row index let index = _mm_castsi128_ps(_mm_set1_epi32(i as i32)); // load scores for the current row @@ -119,9 +135,9 @@ unsafe fn best_position_sse2(scores: &StripedScores<<__m128i as Vector>::LANES>) let mut best_pos = 0; let mut best_score = -f32::INFINITY; for (col, &row) in x.iter().enumerate() { - if scores.data[row as usize][col] > best_score { - best_score = scores.data[row as usize][col]; - best_pos = col * scores.data.rows() + row as usize; + if data[row as usize][col] > best_score { + best_score = data[row as usize][col]; + best_pos = col * data.rows() + row as usize; } } Some(best_pos) @@ -129,11 +145,14 @@ unsafe fn best_position_sse2(scores: &StripedScores<<__m128i as Vector>::LANES>) } } -/// Intel 128-bit vector implementation, for 16 elements column width. -impl<A: Alphabet> Score<A, __m128i> for Pipeline<A, __m128i> { - fn score_into<S, M>(seq: S, pssm: M, scores: &mut StripedScores<<__m128i as Vector>::LANES>) - where - S: AsRef<StripedSequence<A, <__m128i as Vector>::LANES>>, +impl Sse2 { + pub fn score_into<A, S, M>( + seq: S, + pssm: M, + scores: &mut StripedScores<<Sse2 as Backend>::LANES>, + ) where + A: Alphabet, + S: AsRef<StripedSequence<A, <Sse2 as Backend>::LANES>>, M: AsRef<ScoringMatrix<A>>, { let seq = seq.as_ref(); @@ -145,17 +164,22 @@ impl<A: Alphabet> Score<A, __m128i> for Pipeline<A, __m128i> { pssm.len() ); } - if scores.data.rows() < (seq.data.rows() - seq.wrap) { - panic!("not enough rows for scores: {}", pssm.len()); - } - scores.length = seq.length - pssm.len() + 1; + scores.resize(seq.length - pssm.len() + 1, seq.data.rows() - seq.wrap); + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] unsafe { score_sse2(seq, pssm, scores); } + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + panic!("attempting to run SSE2 code on a non-x86 host") } - fn best_position(scores: &StripedScores<<__m128i as Vector>::LANES>) -> Option<usize> { - unsafe { best_position_sse2(scores) } + pub fn best_position(scores: &StripedScores<<Sse2 as Backend>::LANES>) -> Option<usize> { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + unsafe { + best_position_sse2(scores) + } + #[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] + panic!("attempting to run SSE2 code on a non-x86 host") } } diff --git a/lightmotif/src/pli/scores.rs b/lightmotif/src/pli/scores.rs new file mode 100644 index 0000000..134290a --- /dev/null +++ b/lightmotif/src/pli/scores.rs @@ -0,0 +1,156 @@ +use std::iter::DoubleEndedIterator; +use std::iter::ExactSizeIterator; +use std::iter::FusedIterator; +use std::ops::Index; +use std::ops::Range; + +use typenum::marker_traits::NonZero; +use typenum::marker_traits::Unsigned; + +use crate::abc::Alphabet; +use crate::dense::DenseMatrix; +use crate::pwm::ScoringMatrix; +use crate::seq::StripedSequence; + +#[derive(Clone, Debug)] +pub struct StripedScores<C: Unsigned + NonZero> { + data: DenseMatrix<f32, C>, + length: usize, +} + +impl<C: Unsigned + NonZero> StripedScores<C> { + /// Create a new striped score matrix with the given length and data. + pub fn new(length: usize, data: DenseMatrix<f32, C>) -> Self { + Self { length, data } + } + + /// Create an empty score matrix with the given length and row count. + pub fn empty(length: usize, rows: usize) -> Self { + Self::new(length, DenseMatrix::new(rows)) + } + + /// Return the number of scored positions. + pub fn len(&self) -> usize { + self.length + } + + /// Return a reference to the striped matrix storing the scores. + pub fn matrix(&self) -> &DenseMatrix<f32, C> { + &self.data + } + + /// Return a mutable reference to the striped matrix storing the scores. + pub fn matrix_mut(&mut self) -> &mut DenseMatrix<f32, C> { + &mut self.data + } + + /// Create a new matrix large enough to store the scores of `pssm` applied to `seq`. + pub fn new_for<S, M, A>(seq: S, pssm: M) -> Self + where + A: Alphabet, + S: AsRef<StripedSequence<A, C>>, + M: AsRef<ScoringMatrix<A>>, + { + let seq = seq.as_ref(); + let pssm = pssm.as_ref(); + Self::empty(seq.length - pssm.len() + 1, seq.data.rows() - seq.wrap) + } + + /// Resize the striped scores storage to the given length and number of rows. + pub fn resize(&mut self, length: usize, rows: usize) { + self.length = length; + self.data.resize(rows); + } + + /// Resize the striped scores storage to store the scores of `pssm` applied to `seq`. + pub fn resize_for<S, M, A>(&mut self, seq: S, pssm: M) + where + A: Alphabet, + S: AsRef<StripedSequence<A, C>>, + M: AsRef<ScoringMatrix<A>>, + { + let seq = seq.as_ref(); + let pssm = pssm.as_ref(); + self.resize(seq.length - pssm.len() + 1, seq.data.rows() - seq.wrap); + } + + /// Iterate over scores of individual sequence positions. + pub fn iter(&self) -> Iter<'_, C> { + Iter::new(&self) + } + + /// Convert the striped scores to a vector of scores. + pub fn to_vec(&self) -> Vec<f32> { + self.iter().cloned().collect() + } +} + +impl<C: Unsigned + NonZero> AsRef<DenseMatrix<f32, C>> for StripedScores<C> { + fn as_ref(&self) -> &DenseMatrix<f32, C> { + self.matrix() + } +} + +impl<C: Unsigned + NonZero> AsMut<DenseMatrix<f32, C>> for StripedScores<C> { + fn as_mut(&mut self) -> &mut DenseMatrix<f32, C> { + self.matrix_mut() + } +} + +impl<C: Unsigned + NonZero> Index<usize> for StripedScores<C> { + type Output = f32; + fn index(&self, index: usize) -> &f32 { + let col = index / self.data.rows(); + let row = index % self.data.rows(); + &self.data[row][col] + } +} + +impl<C: Unsigned + NonZero> From<StripedScores<C>> for Vec<f32> { + fn from(scores: StripedScores<C>) -> Self { + scores.iter().cloned().collect() + } +} + +// --- Iter -------------------------------------------------------------------- + +pub struct Iter<'a, C: Unsigned + NonZero> { + scores: &'a StripedScores<C>, + indices: Range<usize>, +} + +impl<'a, C: Unsigned + NonZero> Iter<'a, C> { + fn new(scores: &'a StripedScores<C>) -> Self { + Self { + scores, + indices: 0..scores.length, + } + } + + fn get(&self, i: usize) -> &'a f32 { + let col = i / self.scores.data.rows(); + let row = i % self.scores.data.rows(); + &self.scores.data[row][col] + } +} + +impl<'a, C: Unsigned + NonZero> Iterator for Iter<'a, C> { + type Item = &'a f32; + fn next(&mut self) -> Option<Self::Item> { + self.indices.next().map(|i| self.get(i)) + } +} + +impl<'a, C: Unsigned + NonZero> ExactSizeIterator for Iter<'a, C> { + fn len(&self) -> usize { + self.indices.len() + } +} + +impl<'a, C: Unsigned + NonZero> FusedIterator for Iter<'a, C> {} + +impl<'a, C: Unsigned + NonZero> DoubleEndedIterator for Iter<'a, C> { + fn next_back(&mut self) -> Option<Self::Item> { + self.indices.next_back().map(|i| self.get(i)) + } +} diff --git a/lightmotif/src/pli/vector.rs b/lightmotif/src/pli/vector.rs new file mode 100644 index 0000000..dc5f16e --- /dev/null +++ b/lightmotif/src/pli/vector.rs @@ -0,0 +1,49 @@ +use typenum::consts::U1; +use typenum::consts::U16; +use typenum::consts::U32; +use typenum::consts::U4; +use typenum::marker_traits::NonZero; +use typenum::marker_traits::Unsigned; + +/// Trait for concrete vector implementations. +/// +/// The trait is defined for the loading vector type, which has `LANES` +/// lanes of `Item` values. +pub trait Vector { + type Item; + type LANES: Unsigned + NonZero; +} + +impl Vector for u8 { + type Item = u8; + type LANES = U1; +} + +impl Vector for f32 { + type Item = f32; + type LANES = U1; +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl Vector for std::arch::x86_64::__m128i { + type Item = u8; + type LANES = U16; +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl Vector for std::arch::x86_64::__m256i { + type Item = u8; + type LANES = U32; +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl Vector for std::arch::x86_64::__m128 { + type Item = f32; + type LANES = U4; +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +impl Vector for std::arch::x86_64::__m256 { + type Item = f32; + type LANES = U4; +} diff --git a/lightmotif/src/seq.rs b/lightmotif/src/seq.rs index 970ed6b..fdd47f8 100644 --- a/lightmotif/src/seq.rs +++ b/lightmotif/src/seq.rs @@ -38,11 +38,13 @@ impl<A: Alphabet> EncodedSequence<A> { } /// Return the number of symbols in the sequence. + #[inline] pub fn len(&self) -> usize { self.data.len() } /// Iterate over the symbols in the sequence. + #[inline] pub fn iter(&self) -> impl IntoIterator<Item = &A::Symbol> { self.data.iter() } diff --git a/lightmotif/tests/dna.rs b/lightmotif/tests/dna.rs index 25d6d88..1809132 100644 --- a/lightmotif/tests/dna.rs +++ b/lightmotif/tests/dna.rs @@ -7,11 +7,13 @@ use std::arch::x86_64::__m128i; use std::arch::x86_64::__m256i; use std::str::FromStr; +use lightmotif::pli::BestPosition; +use lightmotif::pli::Generic; +use lightmotif::pli::Score; use lightmotif::CountMatrix; use lightmotif::Dna; use lightmotif::EncodedSequence; use lightmotif::Pipeline; -use lightmotif::Score; use typenum::U32; @@ -52,7 +54,8 @@ fn test_score_generic() { let pssm = pwm.into(); striped.configure(&pssm); - let result = Pipeline::<Dna, u8>::score(&striped, &pssm); + let pli = Pipeline::generic(); + let result = pli.score(&striped, &pssm); let scores = result.to_vec(); assert_eq!(scores.len(), EXPECTED.len()); @@ -83,8 +86,9 @@ fn test_best_position_generic() { let pssm = pwm.into(); striped.configure(&pssm); - let result = Pipeline::<Dna, u8>::score(&striped, &pssm); - assert_eq!(Pipeline::<Dna, u8>::best_position(&result), Some(18)); + let pli = Pipeline::generic(); + let result = pli.score(&striped, &pssm); + assert_eq!(pli.best_position(&result), Some(18)); } #[cfg(target_feature = "sse2")] @@ -104,7 +108,8 @@ fn test_score_sse2() { let pssm = pwm.into(); striped.configure(&pssm); - let result = Pipeline::<Dna, __m128i>::score(&striped, &pssm); + let pli = Pipeline::sse2().unwrap(); + let result = pli.score(&striped, &pssm); // for i in 0..result.data.rows() { // println!("i={} {:?}", i, &result.data[i]); @@ -140,8 +145,9 @@ fn test_best_position_sse2() { let pssm = pwm.into(); striped.configure(&pssm); - let result = Pipeline::<Dna, __m128i>::score(&striped, &pssm); - assert_eq!(Pipeline::<Dna, __m128i>::best_position(&result), Some(18)); + let pli = Pipeline::sse2().unwrap(); + let result = pli.score(&striped, &pssm); + assert_eq!(pli.best_position(&result), Some(18)); } #[cfg(target_feature = "avx2")] @@ -161,7 +167,8 @@ fn test_score_avx2() { let pssm = pwm.into(); striped.configure(&pssm); - let result = Pipeline::<_, __m256i>::score(&striped, &pssm); + let pli = Pipeline::avx2().unwrap(); + let result = pli.score(&striped, &pssm); let scores = result.to_vec(); assert_eq!(scores.len(), EXPECTED.len()); @@ -193,6 +200,7 @@ fn test_best_position_avx2() { let pssm = pwm.into(); striped.configure(&pssm); - let result = Pipeline::<Dna, __m256i>::score(&striped, &pssm); - assert_eq!(Pipeline::<Dna, __m256i>::best_position(&result), Some(18)); + let pli = Pipeline::avx2().unwrap(); + let result = pli.score(&striped, &pssm); + assert_eq!(pli.best_position(&result), Some(18)); } -- GitLab