From 72f427051aa6ccd6764d08eb577a7d8a5191a6a5 Mon Sep 17 00:00:00 2001
From: Martin Larralde <martin.larralde@embl.de>
Date: Sun, 16 Jun 2024 18:35:09 +0200
Subject: [PATCH] Fix default alignment used for x86-64 and Aarch64

---
 lightmotif/src/dense.rs             | 33 +++++++++++++++--------------
 lightmotif/src/pli/platform/avx2.rs |  3 +++
 2 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/lightmotif/src/dense.rs b/lightmotif/src/dense.rs
index f16ef3a..edae74d 100644
--- a/lightmotif/src/dense.rs
+++ b/lightmotif/src/dense.rs
@@ -15,11 +15,16 @@ use crate::num::Unsigned;
 // --- DefaultAlignment --------------------------------------------------------
 
 #[cfg(target_arch = "x86_64")]
-type _DefaultAlignment = typenum::consts::U64;
-#[cfg(target_arch = "x86")]
 type _DefaultAlignment = typenum::consts::U32;
-#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+#[cfg(any(target_arch = "x86", target_arch = "arm", target_arch = "aarch64"))]
 type _DefaultAlignment = typenum::consts::U16;
+#[cfg(not(any(
+    target_arch = "x86",
+    target_arch = "x86_64",
+    target_arch = "arm",
+    target_arch = "aarch64"
+)))]
+type _DefaultAlignment = typenum::consts::U1;
 
 /// The default alignment used in dense matrices.
 pub type DefaultAlignment = _DefaultAlignment;
@@ -70,28 +75,24 @@ impl<T: Default + Copy, C: Unsigned, A: Unsigned + PowerOfTwo> DenseMatrix<T, C,
     /// Create a new *uninitialized* matrix with the given number of rows.
     pub unsafe fn uninitialized(rows: usize) -> Self {
         // Always over-allocate columns to avoid alignment issues.
-        let c = C::USIZE + (A::USIZE - C::USIZE % A::USIZE) * (C::USIZE % A::USIZE > 0) as usize;
+        let mut m = Self::new(0);
+        let c = m.stride();
 
         // NOTE: this is unsafe but given that we require `T` to be
         //       copy, this should be fine, as `Copy` prevents the
         //       type to be `Drop` as well.
         // reserve the vector without initializing the data
-        let mut data = Vec::with_capacity((rows + 1) * c);
-        data.set_len((rows + 1) * c);
+        m.data.reserve((rows + 1) * c);
+        m.data.set_len((rows + 1) * c);
 
         // compute offset to aligned memory
-        let mut offset = 0;
-        while data[offset..].as_ptr() as usize % c > 0 {
-            offset += 1
+        m.offset = 0;
+        while m.data[m.offset..].as_ptr() as usize % c > 0 {
+            m.offset += 1
         }
 
-        Self {
-            data,
-            offset,
-            rows,
-            _columns: std::marker::PhantomData,
-            _alignment: std::marker::PhantomData,
-        }
+        m.rows = rows;
+        m
     }
 
     /// Create a new dense matrix from an iterable of rows.
diff --git a/lightmotif/src/pli/platform/avx2.rs b/lightmotif/src/pli/platform/avx2.rs
index 7fddb07..01e4cee 100644
--- a/lightmotif/src/pli/platform/avx2.rs
+++ b/lightmotif/src/pli/platform/avx2.rs
@@ -145,12 +145,14 @@ unsafe fn score_avx2_permute<A>(
         // advance position in the position weight matrix
         for _ in 0..pssm.len() {
             // load sequence row and broadcast to f32
+            debug_assert_eq!(seqptr as usize & 0x1f, 0);
             let x = _mm256_load_si256(seqptr as *const __m256i);
             let x1 = _mm256_shuffle_epi8(x, m1);
             let x2 = _mm256_shuffle_epi8(x, m2);
             let x3 = _mm256_shuffle_epi8(x, m3);
             let x4 = _mm256_shuffle_epi8(x, m4);
             // load row for current weight matrix position
+            // debug_assert_eq!(pssmptr as usize & 0x1f, 0); // not enforced: unaligned load below
             let t = _mm256_loadu_ps(pssmptr);
             // index A/T/G/C/N lookup table with the bases
             let b1 = _mm256_permutevar8x32_ps(t, x1);
@@ -227,6 +229,7 @@ unsafe fn score_avx2_gather<A>(
         // advance position in the position weight matrix
         for _ in 0..pssm.len() {
             // load sequence row and broadcast to f32
+            debug_assert_eq!(seqptr as usize & 0x1f, 0);
             let x = _mm256_load_si256(seqptr as *const __m256i);
             let x1 = _mm256_shuffle_epi8(x, m1);
             let x2 = _mm256_shuffle_epi8(x, m2);
-- 
GitLab