From 97bac0d7af13ca09de89221cf42924f14befb740 Mon Sep 17 00:00:00 2001
From: Martin Larralde <martin.larralde@embl.de>
Date: Fri, 28 Jul 2023 19:14:48 +0200
Subject: [PATCH] Use streaming intrinsics for storing data in AVX2 and SSE2 extensions

---
 lightmotif/src/pli/platform/avx2.rs | 11 ++++++-----
 lightmotif/src/pli/platform/sse2.rs |  8 ++++----
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/lightmotif/src/pli/platform/avx2.rs b/lightmotif/src/pli/platform/avx2.rs
index f9e76d6..d7ea10d 100644
--- a/lightmotif/src/pli/platform/avx2.rs
+++ b/lightmotif/src/pli/platform/avx2.rs
@@ -32,6 +32,7 @@ unsafe fn score_avx2(
     scores: &mut StripedScores<<Avx2 as Backend>::LANES>,
 ) {
     let data = scores.matrix_mut();
+    let mut rowptr = data[0].as_mut_ptr();
     // constant vector for comparing unknown bases
     let n = _mm256_set1_epi8(Nucleotide::N as i8);
     // mask vectors for broadcasting uint8x32_t to uint32x8_t to floatx8_t
@@ -107,11 +108,11 @@ unsafe fn score_avx2(
         let r3 = _mm256_permute2f128_ps(s1, s2, 0x31);
         let r4 = _mm256_permute2f128_ps(s3, s4, 0x31);
         // record the score for the current position
-        let row = &mut data[i];
-        _mm256_store_ps(row[0x00..].as_mut_ptr(), r1);
-        _mm256_store_ps(row[0x08..].as_mut_ptr(), r2);
-        _mm256_store_ps(row[0x10..].as_mut_ptr(), r3);
-        _mm256_store_ps(row[0x18..].as_mut_ptr(), r4);
+        _mm256_stream_ps(rowptr.add(0x00), r1);
+        _mm256_stream_ps(rowptr.add(0x08), r2);
+        _mm256_stream_ps(rowptr.add(0x10), r3);
+        _mm256_stream_ps(rowptr.add(0x18), r4);
+        rowptr = rowptr.add(data.stride());
     }
 }
 
diff --git a/lightmotif/src/pli/platform/sse2.rs b/lightmotif/src/pli/platform/sse2.rs
index 897a037..63643de 100644
--- a/lightmotif/src/pli/platform/sse2.rs
+++ b/lightmotif/src/pli/platform/sse2.rs
@@ -82,10 +82,10 @@ unsafe fn score_sse2<A, C>(
             }
             // record the score for the current position
             let row = &mut data[i];
-            _mm_storeu_ps(row[offset..].as_mut_ptr(), s1);
-            _mm_storeu_ps(row[offset + 4..].as_mut_ptr(), s2);
-            _mm_storeu_ps(row[offset + 8..].as_mut_ptr(), s3);
-            _mm_storeu_ps(row[offset + 12..].as_mut_ptr(), s4);
+            _mm_stream_ps(row[offset..].as_mut_ptr(), s1);
+            _mm_stream_ps(row[offset + 4..].as_mut_ptr(), s2);
+            _mm_stream_ps(row[offset + 8..].as_mut_ptr(), s3);
+            _mm_stream_ps(row[offset + 12..].as_mut_ptr(), s4);
         }
     }
 }
-- 
GitLab