From 97bac0d7af13ca09de89221cf42924f14befb740 Mon Sep 17 00:00:00 2001
From: Martin Larralde <martin.larralde@embl.de>
Date: Fri, 28 Jul 2023 19:14:48 +0200
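Subject: [PATCH] Use streaming intrinsics for storing data in AVX2 and SSE2
 extensions

Write the score rows with non-temporal (streaming) stores instead of
regular stores:

- AVX2: `_mm256_store_ps` becomes `_mm256_stream_ps`, and the rows are
  addressed through a raw pointer that starts at `data[0]` and advances
  by `data.stride()` after each position.
- SSE2: `_mm_storeu_ps` becomes `_mm_stream_ps`.

Notes for review:

- Unlike `_mm_storeu_ps`, `_mm_stream_ps` requires a 16-byte aligned
  address, so the SSE2 path now relies on `row[offset..]` being
  aligned; this should be confirmed against the matrix allocation.
- Non-temporal stores are weakly ordered and no `_mm_sfence` is issued
  in the hunks below, so a fence may be needed before the scores are
  read back.
- `data[0]` is now indexed at the top of `score_avx2` rather than per
  position; confirm this is valid when there is nothing to score.
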
---
 lightmotif/src/pli/platform/avx2.rs | 11 ++++++-----
 lightmotif/src/pli/platform/sse2.rs |  8 ++++----
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/lightmotif/src/pli/platform/avx2.rs b/lightmotif/src/pli/platform/avx2.rs
index f9e76d6..d7ea10d 100644
--- a/lightmotif/src/pli/platform/avx2.rs
+++ b/lightmotif/src/pli/platform/avx2.rs
@@ -32,6 +32,7 @@ unsafe fn score_avx2(
     scores: &mut StripedScores<<Avx2 as Backend>::LANES>,
 ) {
     let data = scores.matrix_mut();
+    let mut rowptr = data[0].as_mut_ptr();
     // constant vector for comparing unknown bases
     let n = _mm256_set1_epi8(Nucleotide::N as i8);
     // mask vectors for broadcasting uint8x32_t to uint32x8_t to floatx8_t
@@ -107,11 +108,11 @@ unsafe fn score_avx2(
         let r3 = _mm256_permute2f128_ps(s1, s2, 0x31);
         let r4 = _mm256_permute2f128_ps(s3, s4, 0x31);
         // record the score for the current position
-        let row = &mut data[i];
-        _mm256_store_ps(row[0x00..].as_mut_ptr(), r1);
-        _mm256_store_ps(row[0x08..].as_mut_ptr(), r2);
-        _mm256_store_ps(row[0x10..].as_mut_ptr(), r3);
-        _mm256_store_ps(row[0x18..].as_mut_ptr(), r4);
+        _mm256_stream_ps(rowptr.add(0x00), r1);
+        _mm256_stream_ps(rowptr.add(0x08), r2);
+        _mm256_stream_ps(rowptr.add(0x10), r3);
+        _mm256_stream_ps(rowptr.add(0x18), r4);
+        rowptr = rowptr.add(data.stride());
     }
 }
 
diff --git a/lightmotif/src/pli/platform/sse2.rs b/lightmotif/src/pli/platform/sse2.rs
index 897a037..63643de 100644
--- a/lightmotif/src/pli/platform/sse2.rs
+++ b/lightmotif/src/pli/platform/sse2.rs
@@ -82,10 +82,10 @@ unsafe fn score_sse2<A, C>(
             }
             // record the score for the current position
             let row = &mut data[i];
-            _mm_storeu_ps(row[offset..].as_mut_ptr(), s1);
-            _mm_storeu_ps(row[offset + 4..].as_mut_ptr(), s2);
-            _mm_storeu_ps(row[offset + 8..].as_mut_ptr(), s3);
-            _mm_storeu_ps(row[offset + 12..].as_mut_ptr(), s4);
+            _mm_stream_ps(row[offset..].as_mut_ptr(), s1);
+            _mm_stream_ps(row[offset + 4..].as_mut_ptr(), s2);
+            _mm_stream_ps(row[offset + 8..].as_mut_ptr(), s3);
+            _mm_stream_ps(row[offset + 12..].as_mut_ptr(), s4);
         }
     }
 }
-- 
GitLab