From 5e9cc7578f77dc78421b698a1fdd037ae9363c09 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Tue, 7 Jan 2025 16:07:10 +0000 Subject: [PATCH 01/10] Test avx2 --- app/Cargo.toml | 2 +- app/src/main.rs | 20 ++++++++--------- src/avx2/alpha_u8.rs | 52 +++++++++++++++----------------------------- 3 files changed, 28 insertions(+), 46 deletions(-) diff --git a/app/Cargo.toml b/app/Cargo.toml index 4b7e008..1c09c6d 100644 --- a/app/Cargo.toml +++ b/app/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" [dependencies] image = { version = "0.25.5", features = ["default"] } #image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] } -pic-scale = { path = "..", features = ["half", "nightly_i8mm"], default-features = true } +pic-scale = { path = "..", features = ["half"], default-features = true } fast_image_resize = { version = "5.0.0", features = [] } half = { version = "2.4.1", default-features = true } diff --git a/app/src/main.rs b/app/src/main.rs index 935f996..a5cd157 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -56,11 +56,11 @@ fn main() { // resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear); - let mut choke: Vec = bytes.iter().map(|&x| (x as u16) << 2).collect(); + // let mut choke: Vec = bytes.iter().map(|&x| (x as u16) << 2).collect(); // let store = - ImageStore::::from_slice(&choke, dimensions.0 as usize, dimensions.1 as usize) + ImageStore::::from_slice(&bytes, dimensions.0 as usize, dimensions.1 as usize) .unwrap(); let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4); @@ -75,7 +75,7 @@ fn main() { // ) // .unwrap(); - let mut dst_store = ImageStoreMut::::alloc_with_depth( + let mut dst_store = ImageStoreMut::::alloc_with_depth( dimensions.0 as usize / 4, dimensions.1 as usize / 4, 10, @@ -84,7 +84,7 @@ fn main() { // for i in 0..25 { let start_time = Instant::now(); scaler - .resize_rgba_u16(&store, &mut dst_store, true) + .resize_rgba(&store, &mut dst_store, true) .unwrap(); let elapsed_time = start_time.elapsed(); @@ -163,13 +163,13 @@ fn main() { // .map(|&x| (x * 255f32) as u8) // .collect(); - let dst: Vec = dst_store - .as_bytes() - .iter() - .map(|&x| (x >> 2) as u8) - .collect(); + // let dst: Vec = dst_store + // .as_bytes() + // .iter() + // .map(|&x| (x >> 2) as u8) + // .collect(); - // let dst = dst_store.as_bytes(); + let dst = dst_store.as_bytes(); // let dst = resized; // image::save_buffer( // "converted.png", diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs index a573f36..048c1d8 100644 --- a/src/avx2/alpha_u8.rs +++ b/src/avx2/alpha_u8.rs @@ -62,44 +62,29 @@ struct AssociateAlphaDefault {} impl AssociateAlphaDefault { #[inline(always)] unsafe fn associate_chunk(&self, dst: &mut [u8], src: &[u8]) { + let shuffle = _mm256_setr_epi8( + 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15, 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, + 11, 11, 15, 15, 15, 15, + ); let src_ptr = src.as_ptr(); let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); - let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); - let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); + let multiplicand = _mm256_shuffle_epi8(rgba0, shuffle); let zeros = _mm256_setzero_si256(); - let mut rrr_low = _mm256_unpacklo_epi8(rrr, zeros); - let mut rrr_high = _mm256_unpackhi_epi8(rrr, zeros); - - let mut ggg_low = _mm256_unpacklo_epi8(ggg, 
zeros); - let mut ggg_high = _mm256_unpackhi_epi8(ggg, zeros); - - let mut bbb_low = _mm256_unpacklo_epi8(bbb, zeros); - let mut bbb_high = _mm256_unpackhi_epi8(bbb, zeros); + let mut v_ll = _mm256_unpacklo_epi8(rgba0, zeros); + let mut v_hi = _mm256_unpackhi_epi8(rgba0, zeros); - let aaa_low = _mm256_unpacklo_epi8(aaa, zeros); - let aaa_high = _mm256_unpackhi_epi8(aaa, zeros); + let a_lo = _mm256_unpacklo_epi8(multiplicand, zeros); + let a_hi = _mm256_unpackhi_epi8(multiplicand, zeros); - rrr_low = avx2_div_by255(_mm256_mullo_epi16(rrr_low, aaa_low)); - rrr_high = avx2_div_by255(_mm256_mullo_epi16(rrr_high, aaa_high)); - ggg_low = avx2_div_by255(_mm256_mullo_epi16(ggg_low, aaa_low)); - ggg_high = avx2_div_by255(_mm256_mullo_epi16(ggg_high, aaa_high)); - bbb_low = avx2_div_by255(_mm256_mullo_epi16(bbb_low, aaa_low)); - bbb_high = avx2_div_by255(_mm256_mullo_epi16(bbb_high, aaa_high)); + v_ll = avx2_div_by255(_mm256_mullo_epi16(v_ll, a_lo)); + v_hi = avx2_div_by255(_mm256_mullo_epi16(v_hi, a_hi)); - let rrr = _mm256_packus_epi16(rrr_low, rrr_high); - let ggg = _mm256_packus_epi16(ggg_low, ggg_high); - let bbb = _mm256_packus_epi16(bbb_low, bbb_high); + let values = _mm256_packus_epi16(v_ll, v_hi); - let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); let dst_ptr = dst.as_mut_ptr(); - _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); - _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); - _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); + _mm256_storeu_si256(dst_ptr as *mut __m256i, values); } } @@ -109,18 +94,15 @@ impl AssociateAlpha for AssociateAlphaDefault { let mut rem = dst; let mut src_rem = src; - for (dst, src) in rem - .chunks_exact_mut(32 * 4) - .zip(src_rem.chunks_exact(32 * 4)) - { + for (dst, src) in rem.chunks_exact_mut(32).zip(src_rem.chunks_exact(32)) { self.associate_chunk(dst, src); } - rem = rem.chunks_exact_mut(32 * 4).into_remainder(); - src_rem = src_rem.chunks_exact(32 * 4).remainder(); + rem = rem.chunks_exact_mut(32).into_remainder(); + src_rem = src_rem.chunks_exact(32).remainder(); if !rem.is_empty() { - const PART_SIZE: usize = 32 * 4; + const PART_SIZE: usize = 32; assert!(src_rem.len() < PART_SIZE); assert!(rem.len() < PART_SIZE); assert_eq!(src_rem.len(), rem.len()); From 3e8e8cfa18faa6f707343ea5727c71d458c75503 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Tue, 7 Jan 2025 17:47:49 +0000 Subject: [PATCH 02/10] Test different alpha algorithm --- src/avx2/alpha_u8.rs | 90 ++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 58 deletions(-) diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs index 048c1d8..47a1ce9 100644 --- a/src/avx2/alpha_u8.rs +++ b/src/avx2/alpha_u8.rs @@ -28,7 +28,7 @@ */ use crate::avx2::utils::{ - _mm256_select_si256, avx2_deinterleave_rgba, avx2_div_by255, avx2_interleave_rgba, + _mm256_select_si256, avx2_div_by255, }; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; @@ -183,73 +183,47 @@ struct Avx2DisassociateAlpha {} impl Avx2DisassociateAlpha { #[inline(always)] - unsafe fn avx2_unpremultiply_row(&self, x: __m256i, a: __m256i) -> __m256i { - let zeros = _mm256_setzero_si256(); - let lo = _mm256_unpacklo_epi8(x, zeros); - let hi = _mm256_unpackhi_epi8(x, zeros); - - let is_zero_mask = _mm256_cmpeq_epi8(a, zeros); + unsafe fn avx2_unpremultiply_row(&self, x: __m256i) -> __m256i { - let scale_ps = 
_mm256_set1_ps(255f32); + let is_zero_mask = _mm256_set1_epi32(0xff000000u32 as i32); - let lo_lo = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(lo, zeros)), - scale_ps, - ); - let lo_hi = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(lo, zeros)), - scale_ps, - ); - let hi_lo = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(hi, zeros)), - scale_ps, + let shuffle_lo = _mm256_setr_epi8( + 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 4, 5, 4, 5, 4, 5, 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 4, 5, 4, + 5, 4, 5, ); - let hi_hi = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(hi, zeros)), - scale_ps, + + let shuffle_hi = _mm256_setr_epi8( + 8, 9, 8, 9, 8, 9, 8, 9, 12, 13, 12, 13, 12, 13, 12, 13, 8, 9, 8, 9, 8, 9, 8, 9, 12, 13, + 12, 13, 12, 13, 12, 13, ); - let a_lo = _mm256_unpacklo_epi8(a, zeros); - let a_hi = _mm256_unpackhi_epi8(a, zeros); - let a_lo_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_lo, zeros))); - let a_lo_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_lo, zeros))); - let a_hi_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_hi, zeros))); - let a_hi_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_hi, zeros))); - - let lo_lo = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_lo, a_lo_lo))); - let lo_hi = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_hi, a_lo_hi))); - let hi_lo = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(hi_lo, a_hi_lo))); - let hi_hi = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(hi_hi, a_hi_hi))); - - _mm256_select_si256( - is_zero_mask, - zeros, - _mm256_packus_epi16( - _mm256_packus_epi32(lo_lo, lo_hi), - _mm256_packus_epi32(hi_lo, hi_hi), - ), - ) + + let scale_ps = _mm256_set1_ps((255 * 257) as f32); + let alpha_cvt = _mm256_cvtepi32_ps(_mm256_srli_epi32::<24>(x)); + let numer = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_rcp_ps(alpha_cvt), scale_ps)); + + let px_lo = _mm256_unpacklo_epi8(x, x); + let px_hi = _mm256_unpackhi_epi8(x, x); + + let numer_lo = _mm256_shuffle_epi8(numer, shuffle_lo); + let numer_hi = _mm256_shuffle_epi8(numer, shuffle_hi); + + let v_lo = _mm256_mulhi_epu16(px_lo, numer_lo); + let v_hi = _mm256_mulhi_epu16(px_hi, numer_hi); + + // _mm256_select_si256(is_zero_mask, zeros, _mm256_packus_epi16(v_lo, v_hi)) + let alpha = _mm256_and_si256(x, is_zero_mask); + _mm256_blendv_epi8(_mm256_packus_epi16(v_lo, v_hi), alpha, is_zero_mask) } #[inline(always)] unsafe fn disassociate_chunk(&self, in_place: &mut [u8]) { let src_ptr = in_place.as_ptr(); let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); - let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); - let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); - let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); - let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - let rrr = self.avx2_unpremultiply_row(rrr, aaa); - let ggg = self.avx2_unpremultiply_row(ggg, aaa); - let bbb = self.avx2_unpremultiply_row(bbb, aaa); - - let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); + let rrr = self.avx2_unpremultiply_row(rgba0); let dst_ptr = in_place.as_mut_ptr(); - _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); - _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); - _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); - _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); + _mm256_storeu_si256(dst_ptr as *mut __m256i, rrr); } } @@ -258,14 +232,14 @@ 
impl DisassociateAlpha for Avx2DisassociateAlpha { unsafe fn disassociate(&self, in_place: &mut [u8]) { let mut rem = in_place; - for dst in rem.chunks_exact_mut(32 * 4) { + for dst in rem.chunks_exact_mut(32) { self.disassociate_chunk(dst); } - rem = rem.chunks_exact_mut(32 * 4).into_remainder(); + rem = rem.chunks_exact_mut(32).into_remainder(); if !rem.is_empty() { - const PART_SIZE: usize = 32 * 4; + const PART_SIZE: usize = 32; assert!(rem.len() < PART_SIZE); let mut buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; From 2b6f0bf5a7c065685a3115499eb56fe1557230c7 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Tue, 7 Jan 2025 17:51:48 +0000 Subject: [PATCH 03/10] Revert "Test different alpha algorithm" This reverts commit 3e8e8cfa18faa6f707343ea5727c71d458c75503. --- src/avx2/alpha_u8.rs | 90 ++++++++++++++++++++++++++++---------------- 1 file changed, 58 insertions(+), 32 deletions(-) diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs index 47a1ce9..048c1d8 100644 --- a/src/avx2/alpha_u8.rs +++ b/src/avx2/alpha_u8.rs @@ -28,7 +28,7 @@ */ use crate::avx2::utils::{ - _mm256_select_si256, avx2_div_by255, + _mm256_select_si256, avx2_deinterleave_rgba, avx2_div_by255, avx2_interleave_rgba, }; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; @@ -183,47 +183,73 @@ struct Avx2DisassociateAlpha {} impl Avx2DisassociateAlpha { #[inline(always)] - unsafe fn avx2_unpremultiply_row(&self, x: __m256i) -> __m256i { + unsafe fn avx2_unpremultiply_row(&self, x: __m256i, a: __m256i) -> __m256i { + let zeros = _mm256_setzero_si256(); + let lo = _mm256_unpacklo_epi8(x, zeros); + let hi = _mm256_unpackhi_epi8(x, zeros); - let is_zero_mask = _mm256_set1_epi32(0xff000000u32 as i32); + let is_zero_mask = _mm256_cmpeq_epi8(a, zeros); - let shuffle_lo = _mm256_setr_epi8( - 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 4, 5, 4, 5, 4, 5, 0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 4, 5, 4, - 5, 4, 5, - ); + let scale_ps = _mm256_set1_ps(255f32); - let shuffle_hi = _mm256_setr_epi8( - 8, 9, 8, 9, 8, 9, 8, 9, 12, 13, 12, 13, 12, 13, 12, 13, 8, 9, 8, 9, 8, 9, 8, 9, 12, 13, - 12, 13, 12, 13, 12, 13, + let lo_lo = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(lo, zeros)), + scale_ps, ); - - let scale_ps = _mm256_set1_ps((255 * 257) as f32); - let alpha_cvt = _mm256_cvtepi32_ps(_mm256_srli_epi32::<24>(x)); - let numer = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_rcp_ps(alpha_cvt), scale_ps)); - - let px_lo = _mm256_unpacklo_epi8(x, x); - let px_hi = _mm256_unpackhi_epi8(x, x); - - let numer_lo = _mm256_shuffle_epi8(numer, shuffle_lo); - let numer_hi = _mm256_shuffle_epi8(numer, shuffle_hi); - - let v_lo = _mm256_mulhi_epu16(px_lo, numer_lo); - let v_hi = _mm256_mulhi_epu16(px_hi, numer_hi); - - // _mm256_select_si256(is_zero_mask, zeros, _mm256_packus_epi16(v_lo, v_hi)) - let alpha = _mm256_and_si256(x, is_zero_mask); - _mm256_blendv_epi8(_mm256_packus_epi16(v_lo, v_hi), alpha, is_zero_mask) + let lo_hi = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(lo, zeros)), + scale_ps, + ); + let hi_lo = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(hi, zeros)), + scale_ps, + ); + let hi_hi = _mm256_mul_ps( + _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(hi, zeros)), + scale_ps, + ); + let a_lo = _mm256_unpacklo_epi8(a, zeros); + let a_hi = _mm256_unpackhi_epi8(a, zeros); + let a_lo_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_lo, zeros))); + let a_lo_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_lo, zeros))); + let a_hi_lo 
= _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_hi, zeros))); + let a_hi_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_hi, zeros))); + + let lo_lo = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_lo, a_lo_lo))); + let lo_hi = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_hi, a_lo_hi))); + let hi_lo = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(hi_lo, a_hi_lo))); + let hi_hi = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(hi_hi, a_hi_hi))); + + _mm256_select_si256( + is_zero_mask, + zeros, + _mm256_packus_epi16( + _mm256_packus_epi32(lo_lo, lo_hi), + _mm256_packus_epi32(hi_lo, hi_hi), + ), + ) } #[inline(always)] unsafe fn disassociate_chunk(&self, in_place: &mut [u8]) { let src_ptr = in_place.as_ptr(); let rgba0 = _mm256_loadu_si256(src_ptr as *const __m256i); + let rgba1 = _mm256_loadu_si256(src_ptr.add(32) as *const __m256i); + let rgba2 = _mm256_loadu_si256(src_ptr.add(64) as *const __m256i); + let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i); + let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - let rrr = self.avx2_unpremultiply_row(rgba0); + let rrr = self.avx2_unpremultiply_row(rrr, aaa); + let ggg = self.avx2_unpremultiply_row(ggg, aaa); + let bbb = self.avx2_unpremultiply_row(bbb, aaa); + + let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); let dst_ptr = in_place.as_mut_ptr(); - _mm256_storeu_si256(dst_ptr as *mut __m256i, rrr); + _mm256_storeu_si256(dst_ptr as *mut __m256i, rgba0); + _mm256_storeu_si256(dst_ptr.add(32) as *mut __m256i, rgba1); + _mm256_storeu_si256(dst_ptr.add(64) as *mut __m256i, rgba2); + _mm256_storeu_si256(dst_ptr.add(96) as *mut __m256i, rgba3); } } @@ -232,14 +258,14 @@ impl DisassociateAlpha for Avx2DisassociateAlpha { unsafe fn disassociate(&self, in_place: &mut [u8]) { let mut rem = in_place; - for dst in rem.chunks_exact_mut(32) { + for dst in rem.chunks_exact_mut(32 * 4) { self.disassociate_chunk(dst); } - rem = rem.chunks_exact_mut(32).into_remainder(); + rem = rem.chunks_exact_mut(32 * 4).into_remainder(); if !rem.is_empty() { - const PART_SIZE: usize = 32; + const PART_SIZE: usize = 32 * 4; assert!(rem.len() < PART_SIZE); let mut buffer: [u8; PART_SIZE] = [0u8; PART_SIZE]; From 67387215adde5f436a831a0241e32e406e854196 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Tue, 7 Jan 2025 17:52:11 +0000 Subject: [PATCH 04/10] Rolled back tests --- app/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/Cargo.toml b/app/Cargo.toml index 1c09c6d..71f0b9f 100644 --- a/app/Cargo.toml +++ b/app/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" [dependencies] image = { version = "0.25.5", features = ["default"] } #image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] } -pic-scale = { path = "..", features = ["half"], default-features = true } +pic-scale = { path = "..", features = ["half", "nightly_avx512"], default-features = true } fast_image_resize = { version = "5.0.0", features = [] } half = { version = "2.4.1", default-features = true } From f90291478c5d2e9449432b956ae1b10f7ff94391 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Tue, 7 Jan 2025 17:58:56 +0000 Subject: [PATCH 05/10] Testing new alpha un association --- src/avx2/alpha_u8.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs index 048c1d8..4a65325 100644 --- a/src/avx2/alpha_u8.rs +++ 
b/src/avx2/alpha_u8.rs @@ -208,12 +208,13 @@ impl Avx2DisassociateAlpha { _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(hi, zeros)), scale_ps, ); - let a_lo = _mm256_unpacklo_epi8(a, zeros); - let a_hi = _mm256_unpackhi_epi8(a, zeros); - let a_lo_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_lo, zeros))); - let a_lo_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_lo, zeros))); - let a_hi_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_hi, zeros))); - let a_hi_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_hi, zeros))); + + let alphas = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_srli_epi32::<24>(a))); + + let a_lo_lo = _mm256_permutevar8x32_ps(alphas, _mm256_setr_epi32(0, 0, 0, 0, 4, 4, 4, 4)); + let a_lo_hi = _mm256_permutevar8x32_ps(alphas, _mm256_setr_epi32(1, 1, 1, 1, 5, 5, 5, 5)); + let a_hi_lo = _mm256_permutevar8x32_ps(alphas, _mm256_setr_epi32(2, 2, 2, 2, 6, 6, 6, 6)); + let a_hi_hi = _mm256_permutevar8x32_ps(alphas, _mm256_setr_epi32(3, 3, 3, 3, 7, 7, 7, 7)); let lo_lo = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_lo, a_lo_lo))); let lo_hi = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_hi, a_lo_hi))); From a5a99addfd80f23f00b8231f6629a8882b4aaf0b Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Tue, 7 Jan 2025 18:08:23 +0000 Subject: [PATCH 06/10] Revert "Testing new alpha un association" This reverts commit f90291478c5d2e9449432b956ae1b10f7ff94391. --- src/avx2/alpha_u8.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs index 4a65325..048c1d8 100644 --- a/src/avx2/alpha_u8.rs +++ b/src/avx2/alpha_u8.rs @@ -208,13 +208,12 @@ impl Avx2DisassociateAlpha { _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(hi, zeros)), scale_ps, ); - - let alphas = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_srli_epi32::<24>(a))); - - let a_lo_lo = _mm256_permutevar8x32_ps(alphas, _mm256_setr_epi32(0, 0, 0, 0, 4, 4, 4, 4)); - let a_lo_hi = _mm256_permutevar8x32_ps(alphas, _mm256_setr_epi32(1, 1, 1, 1, 5, 5, 5, 5)); - let a_hi_lo = _mm256_permutevar8x32_ps(alphas, _mm256_setr_epi32(2, 2, 2, 2, 6, 6, 6, 6)); - let a_hi_hi = _mm256_permutevar8x32_ps(alphas, _mm256_setr_epi32(3, 3, 3, 3, 7, 7, 7, 7)); + let a_lo = _mm256_unpacklo_epi8(a, zeros); + let a_hi = _mm256_unpackhi_epi8(a, zeros); + let a_lo_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_lo, zeros))); + let a_lo_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_lo, zeros))); + let a_hi_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_hi, zeros))); + let a_hi_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_hi, zeros))); let lo_lo = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_lo, a_lo_lo))); let lo_hi = _mm256_cvtps_epi32(_mm256_round_ps::<0x00>(_mm256_mul_ps(lo_hi, a_lo_hi))); From c14af4c88da6dcef8a1e168639384a9647ca52d7 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Tue, 7 Jan 2025 18:17:19 +0000 Subject: [PATCH 07/10] Small improvements --- src/neon/rgb_u8.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/neon/rgb_u8.rs b/src/neon/rgb_u8.rs index af240f5..15ca20c 100644 --- a/src/neon/rgb_u8.rs +++ b/src/neon/rgb_u8.rs @@ -70,8 +70,7 @@ unsafe fn conv_horiz_rgba_2_u8( ) -> int32x4_t { const COMPONENTS: usize = 3; let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - let mut rgb_pixel = vdup_n_u32(0); - rgb_pixel = vset_lane_u32::<0>((src_ptr.as_ptr() as *const u32).read_unaligned(), 
rgb_pixel); + let mut rgb_pixel = vld1_lane_u32(src_ptr.as_ptr() as *const u32, vdup_n_u32(0)); rgb_pixel = vreinterpret_u32_u16(vset_lane_u16::<2>( (src_ptr.get_unchecked(4..).as_ptr() as *const u16).read_unaligned(), vreinterpret_u16_u32(rgb_pixel), From 7db394bd429ae39e63c3c47d9915286d72c7ceb9 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Wed, 8 Jan 2025 09:05:26 +0000 Subject: [PATCH 08/10] Refactor --- app/src/main.rs | 10 +- src/avx2/mod.rs | 4 + src/avx2/rgb_u8_dot_i8.rs | 201 +++++++------------------------------ src/avx2/rgba_u8_dot_lp.rs | 161 ++++++++--------------------- src/avx2/utils.rs | 4 + src/dispatch_group_u8.rs | 10 +- src/plane_u8.rs | 9 +- src/rgb_u8.rs | 16 ++- src/rgba_u8.rs | 19 +++- 9 files changed, 129 insertions(+), 305 deletions(-) diff --git a/app/src/main.rs b/app/src/main.rs index a5cd157..e1489a9 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -43,7 +43,7 @@ fn resize_plane( fn main() { // test_fast_image(); - let img = ImageReader::open("./assets/nasa-4928x3279-rgba.png") + let img = ImageReader::open("./assets/asset_4.png") .unwrap() .decode() .unwrap(); @@ -76,16 +76,14 @@ fn main() { // .unwrap(); let mut dst_store = ImageStoreMut::::alloc_with_depth( - dimensions.0 as usize / 4, - dimensions.1 as usize / 4, + dimensions.0 as usize / 2, + dimensions.1 as usize / 2, 10, ); // for i in 0..25 { let start_time = Instant::now(); - scaler - .resize_rgba(&store, &mut dst_store, true) - .unwrap(); + scaler.resize_rgba(&store, &mut dst_store, true).unwrap(); let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs index 37e75f4..1309080 100644 --- a/src/avx2/mod.rs +++ b/src/avx2/mod.rs @@ -34,10 +34,12 @@ mod alpha_u16; mod alpha_u8; mod check_alpha; mod rgb_u8; +#[cfg(feature = "nightly_avx512")] mod rgb_u8_dot_i8; #[cfg(feature = "half")] mod rgba_f16; mod rgba_f32; +#[cfg(feature = "nightly_avx512")] mod rgba_u8_dot_lp; mod rgba_u8_lb; pub(crate) mod utils; @@ -48,6 +50,7 @@ mod vertical_u16_lb; mod vertical_u8; mod vertical_u8_lp; +#[cfg(feature = "nightly_avx512")] pub(crate) use crate::avx2::rgba_u8_dot_lp::{ convolve_horizontal_rgba_row_dot, convolve_horizontal_rgba_rows_4_dot, }; @@ -62,6 +65,7 @@ pub(crate) use check_alpha::{ avx_has_non_constant_cap_alpha_rgba16, avx_has_non_constant_cap_alpha_rgba8, }; pub(crate) use rgb_u8::{convolve_horizontal_rgb_avx_row_one, convolve_horizontal_rgb_avx_rows_4}; +#[cfg(feature = "nightly_avx512")] pub(crate) use rgb_u8_dot_i8::{ convolve_horizontal_rgb_avx_row_i8_one, convolve_horizontal_rgb_avx_rows_4_i8, }; diff --git a/src/avx2/rgb_u8_dot_i8.rs b/src/avx2/rgb_u8_dot_i8.rs index 870fd67..e444098 100644 --- a/src/avx2/rgb_u8_dot_i8.rs +++ b/src/avx2/rgb_u8_dot_i8.rs @@ -54,18 +54,13 @@ pub(crate) fn convolve_horizontal_rgb_avx_rows_4_i8( filter_weights: &FilterWeights, ) { unsafe { - #[cfg(feature = "nightly_avx512")] - if std::arch::is_x86_feature_detected!("avxvnni") { - return convolve_horizontal_rgb_avx_rows_i8_4_dot( - src, - src_stride, - dst, - dst_stride, - filter_weights, - ); - } - - convolve_horizontal_rgb_avx_rows_i8_4_ubs(src, src_stride, dst, dst_stride, filter_weights); + convolve_horizontal_rgb_avx_rows_i8_4_impl( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); } } @@ -113,43 +108,8 @@ unsafe fn make_tuple_x8(pixel: __m128i, pixel2: __m128i, shuf: __m256i) -> __m25 ) } -#[cfg(feature = "nightly_avx512")] #[target_feature(enable = "avx2", enable = "avxvnni")] -unsafe fn 
convolve_horizontal_rgb_avx_rows_i8_4_dot( - src: &[u8], - src_stride: usize, - dst: &mut [u8], - dst_stride: usize, - filter_weights: &FilterWeights, -) { - convolve_horizontal_rgb_avx_rows_i8_4_impl::( - src, - src_stride, - dst, - dst_stride, - filter_weights, - ); -} - -#[target_feature(enable = "avx2")] -unsafe fn convolve_horizontal_rgb_avx_rows_i8_4_ubs( - src: &[u8], - src_stride: usize, - dst: &mut [u8], - dst_stride: usize, - filter_weights: &FilterWeights, -) { - convolve_horizontal_rgb_avx_rows_i8_4_impl::( - src, - src_stride, - dst, - dst_stride, - filter_weights, - ); -} - -#[inline(always)] -unsafe fn convolve_horizontal_rgb_avx_rows_i8_4_impl( +unsafe fn convolve_horizontal_rgb_avx_rows_i8_4_impl( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -160,6 +120,7 @@ unsafe fn convolve_horizontal_rgb_avx_rows_i8_4_impl( const PRECISION: i32 = 7; const ROUNDING_CONST: i32 = 1 << (PRECISION - 1); + const DOT: bool = true; let shuffle_v = _mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, -1, -1, -1, -1); @@ -176,52 +137,18 @@ unsafe fn convolve_horizontal_rgb_avx_rows_i8_4_impl( -1, -1, -1, -1, ); - let vld = if DOT { - _mm_set1_epi32(ROUNDING_CONST) - } else { - _mm_setr_epi16( - ROUNDING_CONST as i16, - 0, - ROUNDING_CONST as i16, - 0, - ROUNDING_CONST as i16, - 0, - 0, - 0, - ) - }; - - let vld_avx = if DOT { - _mm256_setr_epi32( - ROUNDING_CONST, - ROUNDING_CONST, - ROUNDING_CONST, - 0, - 0, - 0, - 0, - 0, - ) - } else { - _mm256_setr_epi16( - ROUNDING_CONST as i16, - 0, - ROUNDING_CONST as i16, - 0, - ROUNDING_CONST as i16, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - ) - }; + let vld = _mm_set1_epi32(ROUNDING_CONST); + + let vld_avx = _mm256_setr_epi32( + ROUNDING_CONST, + ROUNDING_CONST, + ROUNDING_CONST, + 0, + 0, + 0, + 0, + 0, + ); let (row0_ref, rest) = dst.split_at_mut(dst_stride); let (row1_ref, rest) = rest.split_at_mut(dst_stride); @@ -405,11 +332,7 @@ pub(crate) fn convolve_horizontal_rgb_avx_row_i8_one( filter_weights: &FilterWeights, ) { unsafe { - #[cfg(feature = "nightly_avx512")] - if std::arch::is_x86_feature_detected!("avxvnni") { - return convolve_horizontal_rgb_avx_row_i8_dot_one_impl(src, dst, filter_weights); - } - convolve_horizontal_rgb_avx_row_i8_ubs_one_impl(src, dst, filter_weights); + convolve_horizontal_rgb_avx_row_i8_one_impl(src, dst, filter_weights); } } @@ -428,32 +351,14 @@ unsafe fn add_one_weight( _mm_udot8_epi16::(store_0, lo, weight0) } -#[cfg(feature = "nightly_avx512")] #[target_feature(enable = "avx2", enable = "avxvnni")] -unsafe fn convolve_horizontal_rgb_avx_row_i8_dot_one_impl( - src: &[u8], - dst: &mut [u8], - filter_weights: &FilterWeights, -) { - convolve_horizontal_rgb_avx_row_i8_one_impl::(src, dst, filter_weights); -} - -#[target_feature(enable = "avx2")] -unsafe fn convolve_horizontal_rgb_avx_row_i8_ubs_one_impl( - src: &[u8], - dst: &mut [u8], - filter_weights: &FilterWeights, -) { - convolve_horizontal_rgb_avx_row_i8_one_impl::(src, dst, filter_weights); -} - -#[inline(always)] -unsafe fn convolve_horizontal_rgb_avx_row_i8_one_impl( +unsafe fn convolve_horizontal_rgb_avx_row_i8_one_impl( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, ) { const CHANNELS: usize = 3; + const DOT: bool = true; let shuffle_v = _mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, -1, -1, -1, -1); @@ -478,52 +383,18 @@ unsafe fn convolve_horizontal_rgb_avx_row_i8_one_impl( const PRECISION: i32 = 7; const ROUNDING_CONST: i32 = 1 << (PRECISION - 1); - let vld = if DOT { - _mm_set1_epi32(ROUNDING_CONST) - } else { - 
_mm_setr_epi16( - ROUNDING_CONST as i16, - 0, - ROUNDING_CONST as i16, - 0, - ROUNDING_CONST as i16, - 0, - 0, - 0, - ) - }; - - let vld_avx = if DOT { - _mm256_setr_epi32( - ROUNDING_CONST, - ROUNDING_CONST, - ROUNDING_CONST, - 0, - 0, - 0, - 0, - 0, - ) - } else { - _mm256_setr_epi16( - ROUNDING_CONST as i16, - 0, - ROUNDING_CONST as i16, - 0, - ROUNDING_CONST as i16, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - ) - }; + let vld = _mm_set1_epi32(ROUNDING_CONST); + + let vld_avx = _mm256_setr_epi32( + ROUNDING_CONST, + ROUNDING_CONST, + ROUNDING_CONST, + 0, + 0, + 0, + 0, + 0, + ); for ((dst, bounds), weights) in dst .chunks_exact_mut(CHANNELS) diff --git a/src/avx2/rgba_u8_dot_lp.rs b/src/avx2/rgba_u8_dot_lp.rs index 1b7e01f..bddae95 100644 --- a/src/avx2/rgba_u8_dot_lp.rs +++ b/src/avx2/rgba_u8_dot_lp.rs @@ -33,46 +33,23 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -/// Will use `avxvnni` if available, if not `maddubs16` +/// Will use `avxvnni` if available /// /// `avxvnni` feature has slightly lower precision and won't work really well on huge kernel which /// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. /// /// # Safety -/// - Check `avx2` availability before the call. +/// - Check `avxvnni` availability before the call. pub(crate) fn convolve_horizontal_rgba_row_dot( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, ) { unsafe { - #[cfg(feature = "nightly_avx512")] - if std::arch::is_x86_feature_detected!("avxvnni") { - return convolve_horizontal_rgba_vnni_row_dot_impl(src, dst, filter_weights); - } - convolve_horizontal_rgba_ubs_row_dot_impl(src, dst, filter_weights); + convolve_horizontal_rgba_row_dot_impl(src, dst, filter_weights); } } -#[cfg(feature = "nightly_avx512")] -#[target_feature(enable = "avxvnni", enable = "avx2")] -unsafe fn convolve_horizontal_rgba_vnni_row_dot_impl( - src: &[u8], - dst: &mut [u8], - filter_weights: &FilterWeights, -) { - convolve_horizontal_rgba_row_dot_impl::(src, dst, filter_weights); -} - -#[target_feature(enable = "avx2")] -unsafe fn convolve_horizontal_rgba_ubs_row_dot_impl( - src: &[u8], - dst: &mut [u8], - filter_weights: &FilterWeights, -) { - convolve_horizontal_rgba_row_dot_impl::(src, dst, filter_weights); -} - #[inline(always)] fn compress_i32(x: __m128i) -> __m128i { unsafe { @@ -85,14 +62,15 @@ fn compress_i32(x: __m128i) -> __m128i { } } -#[inline(always)] -unsafe fn convolve_horizontal_rgba_row_dot_impl( +#[target_feature(enable = "avxvnni", enable = "avx2")] +unsafe fn convolve_horizontal_rgba_row_dot_impl( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, ) { const ROUNDING: i16 = 1 << (7 - 1); const CHANNELS: usize = 4; + const DOT: bool = true; let shuffle_weights_table = _mm_setr_epi8(0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3); let shuffle_4_table = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); @@ -110,11 +88,7 @@ unsafe fn convolve_horizontal_rgba_row_dot_impl( { let bounds_size = bounds.size; let mut jx = 0usize; - let mut store = if DOT { - _mm_set1_epi32(ROUNDING as i32) - } else { - _mm_setr_epi16(ROUNDING, 0, ROUNDING, 0, ROUNDING, 0, ROUNDING, 0) - }; + let mut store = _mm_set1_epi32(ROUNDING as i32); if bounds_size > 8 { let shuffle_avx_weights = _mm256_setr_epi8( @@ -127,22 +101,16 @@ unsafe fn convolve_horizontal_rgba_row_dot_impl( 6, 10, 14, 3, 7, 11, 15, ); - let mut store_avx = if DOT { - _mm256_setr_epi32( - ROUNDING as i32, - ROUNDING as i32, - ROUNDING as i32, - 
ROUNDING as i32, - 0, - 0, - 0, - 0, - ) - } else { - _mm256_setr_epi16( - ROUNDING, 0, ROUNDING, 0, ROUNDING, 0, ROUNDING, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ) - }; + let mut store_avx = _mm256_setr_epi32( + ROUNDING as i32, + ROUNDING as i32, + ROUNDING as i32, + ROUNDING as i32, + 0, + 0, + 0, + 0, + ); while jx + 8 < bounds_size { let w_ptr = weights.get_unchecked(jx..(jx + 8)); @@ -229,13 +197,13 @@ unsafe fn convolve_horizontal_rgba_row_dot_impl( } } -/// Will use `avxvnni` if available, if not `maddubs16` +/// Will use `avxvnni` if available /// /// `avxvnni` feature has slightly lower precision and won't work really well on huge kernel which /// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. /// /// # Safety -/// - Check `avx2` availability before the call. +/// - Check `avxvnni` availability before the call. pub(crate) fn convolve_horizontal_rgba_rows_4_dot( src: &[u8], src_stride: usize, @@ -244,57 +212,18 @@ pub(crate) fn convolve_horizontal_rgba_rows_4_dot( filter_weights: &FilterWeights, ) { unsafe { - #[cfg(feature = "nightly_avx512")] - if std::arch::is_x86_feature_detected!("avxvnni") { - return convolve_horizontal_rgba_vnni_rows_4_dot( - src, - src_stride, - dst, - dst_stride, - filter_weights, - ); - } - convolve_horizontal_rgba_vnni_rows_4_ubs(src, src_stride, dst, dst_stride, filter_weights); + convolve_horizontal_rgba_vnni_rows_4_dot_impl( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); } } -#[cfg(feature = "nightly_avx512")] #[target_feature(enable = "avxvnni", enable = "avx2")] -unsafe fn convolve_horizontal_rgba_vnni_rows_4_dot( - src: &[u8], - src_stride: usize, - dst: &mut [u8], - dst_stride: usize, - filter_weights: &FilterWeights, -) { - convolve_horizontal_rgba_vnni_rows_4_dot_impl::( - src, - src_stride, - dst, - dst_stride, - filter_weights, - ); -} - -#[target_feature(enable = "avx2")] -unsafe fn convolve_horizontal_rgba_vnni_rows_4_ubs( - src: &[u8], - src_stride: usize, - dst: &mut [u8], - dst_stride: usize, - filter_weights: &FilterWeights, -) { - convolve_horizontal_rgba_vnni_rows_4_dot_impl::( - src, - src_stride, - dst, - dst_stride, - filter_weights, - ); -} - -#[inline(always)] -unsafe fn convolve_horizontal_rgba_vnni_rows_4_dot_impl( +unsafe fn convolve_horizontal_rgba_vnni_rows_4_dot_impl( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -305,28 +234,20 @@ unsafe fn convolve_horizontal_rgba_vnni_rows_4_dot_impl( const SCALE: i32 = 7; const ROUNDING: i16 = 1 << (SCALE - 1); - let init = if DOT { - _mm_set1_epi32(ROUNDING as i32) - } else { - _mm_setr_epi16(ROUNDING, 0, ROUNDING, 0, ROUNDING, 0, ROUNDING, 0) - }; - - let init_avx = if DOT { - _mm256_setr_epi32( - ROUNDING as i32, - ROUNDING as i32, - ROUNDING as i32, - ROUNDING as i32, - 0, - 0, - 0, - 0, - ) - } else { - _mm256_setr_epi16( - ROUNDING, 0, ROUNDING, 0, ROUNDING, 0, ROUNDING, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ) - }; + const DOT: bool = true; + + let init = _mm_set1_epi32(ROUNDING as i32); + + let init_avx = _mm256_setr_epi32( + ROUNDING as i32, + ROUNDING as i32, + ROUNDING as i32, + ROUNDING as i32, + 0, + 0, + 0, + 0, + ); let (row0_ref, rest) = dst.split_at_mut(dst_stride); let (row1_ref, rest) = rest.split_at_mut(dst_stride); diff --git a/src/avx2/utils.rs b/src/avx2/utils.rs index 216e0e9..e2b9ebc 100644 --- a/src/avx2/utils.rs +++ b/src/avx2/utils.rs @@ -353,6 +353,7 @@ pub(crate) unsafe fn _mm256_cvtepi64_epi32x(v: __m256i) -> __m128i { _mm_castps_si128(packed) } +#[allow(dead_code)] #[inline(always)] pub(crate) unsafe 
fn _mm256_dot16_avx_epi32( a: __m256i, @@ -373,6 +374,7 @@ pub(crate) unsafe fn _mm256_dot16_avx_epi32( } } +#[allow(dead_code)] #[inline(always)] pub(crate) unsafe fn _mm_udot8_epi16( a: __m128i, @@ -391,6 +393,7 @@ pub(crate) unsafe fn _mm_udot8_epi16( } } +#[allow(dead_code)] #[inline(always)] pub(crate) unsafe fn _mm256_udot8_epi16( a: __m256i, @@ -407,6 +410,7 @@ pub(crate) unsafe fn _mm256_udot8_epi16( _mm256_adds_epi16(a, _mm256_maddubs_epi16(b, c)) } +#[allow(dead_code)] #[inline(always)] pub(crate) unsafe fn _mm256_reduce_dot_epi16(a: __m256i) -> __m128i { #[cfg(feature = "nightly_avx512")] diff --git a/src/dispatch_group_u8.rs b/src/dispatch_group_u8.rs index 2eda3d6..32f52d4 100644 --- a/src/dispatch_group_u8.rs +++ b/src/dispatch_group_u8.rs @@ -28,7 +28,6 @@ */ use crate::filter_weights::{FilterBounds, FilterWeights, WeightsConverter}; use crate::image_store::ImageStoreMut; -use crate::support::PRECISION; use crate::ImageStore; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; @@ -100,12 +99,13 @@ pub(crate) fn convolve_horizontal_dispatch_u8( +pub(crate) fn convolve_vertical_dispatch_u8<'a, V: Copy + Send + Sync, const COMPONENTS: usize>( image_store: &ImageStore, filter_weights: FilterWeights, destination: &mut ImageStoreMut<'a, u8, COMPONENTS>, pool: &Option, - dispatcher: fn(usize, &FilterBounds, &[u8], &mut [u8], usize, &[i16]), + dispatcher: fn(usize, &FilterBounds, &[u8], &mut [u8], usize, &[V]), + weights_converter: impl WeightsConverter, ) { let src_stride = image_store.stride(); let dst_stride = destination.stride(); @@ -113,9 +113,9 @@ pub(crate) fn convolve_vertical_dispatch_u8<'a, const COMPONENTS: usize>( let dst_width = destination.width; if let Some(pool) = pool { + let approx = weights_converter.prepare_weights(&filter_weights); pool.install(|| { let destination_image = destination.buffer.borrow_mut(); - let approx = filter_weights.numerical_approximation_i16::(0); destination_image .par_chunks_exact_mut(dst_stride) .enumerate() @@ -136,7 +136,7 @@ pub(crate) fn convolve_vertical_dispatch_u8<'a, const COMPONENTS: usize>( }); } else { let destination_image = destination.buffer.borrow_mut(); - let approx = filter_weights.numerical_approximation_i16::(0); + let approx = weights_converter.prepare_weights(&filter_weights); destination_image .chunks_exact_mut(dst_stride) .enumerate() diff --git a/src/plane_u8.rs b/src/plane_u8.rs index dc57c41..1021cd4 100644 --- a/src/plane_u8.rs +++ b/src/plane_u8.rs @@ -132,6 +132,13 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 1> { { _dispatcher = wasm_vertical_neon_row; } - convolve_vertical_dispatch_u8(self, filter_weights, destination, pool, _dispatcher); + convolve_vertical_dispatch_u8( + self, + filter_weights, + destination, + pool, + _dispatcher, + DefaultWeightsConverter::default(), + ); } } diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs index ffe2362..43b81f3 100644 --- a/src/rgb_u8.rs +++ b/src/rgb_u8.rs @@ -84,10 +84,11 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 3> { return; } } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "nightly_avx512"))] { - let has_avx2 = std::arch::is_x86_feature_detected!("avx2"); - if _scale_factor < 5.1 && has_avx2 { + // Precision is too low without vnni + let has_vnni = std::arch::is_x86_feature_detected!("avxvnni"); + if _scale_factor < 5.1 && has_vnni { use crate::avx2::{ convolve_horizontal_rgb_avx_row_i8_one, 
convolve_horizontal_rgb_avx_rows_4_i8, }; @@ -195,6 +196,13 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 3> { { _dispatcher = wasm_vertical_neon_row; } - convolve_vertical_dispatch_u8(self, filter_weights, destination, pool, _dispatcher); + convolve_vertical_dispatch_u8( + self, + filter_weights, + destination, + pool, + _dispatcher, + DefaultWeightsConverter::default(), + ); } } diff --git a/src/rgba_u8.rs b/src/rgba_u8.rs index 2e9c384..e60f69e 100644 --- a/src/rgba_u8.rs +++ b/src/rgba_u8.rs @@ -90,10 +90,14 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 4> { return; } } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + feature = "nightly_avx512" + ))] { - let has_avx2 = std::arch::is_x86_feature_detected!("avx2"); - if _scale_factor < 5.1 && has_avx2 { + // Precision is too low without vnni + let has_vnni = std::arch::is_x86_feature_detected!("avxvnni"); + if _scale_factor < 5.1 && has_vnni { use crate::avx2::{ convolve_horizontal_rgba_row_dot, convolve_horizontal_rgba_rows_4_dot, }; @@ -216,6 +220,13 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 4> { { _dispatcher = wasm_vertical_neon_row; } - convolve_vertical_dispatch_u8(self, filter_weights, destination, pool, _dispatcher); + convolve_vertical_dispatch_u8( + self, + filter_weights, + destination, + pool, + _dispatcher, + DefaultWeightsConverter::default(), + ); } } From 9f477d9e859455b648cd01526a0254d9d3754131 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Wed, 8 Jan 2025 09:20:04 +0000 Subject: [PATCH 09/10] Refactor --- app/Cargo.toml | 2 +- src/neon/rgb_u8.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/app/Cargo.toml b/app/Cargo.toml index 71f0b9f..4b7e008 100644 --- a/app/Cargo.toml +++ b/app/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" [dependencies] image = { version = "0.25.5", features = ["default"] } #image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] } -pic-scale = { path = "..", features = ["half", "nightly_avx512"], default-features = true } +pic-scale = { path = "..", features = ["half", "nightly_i8mm"], default-features = true } fast_image_resize = { version = "5.0.0", features = [] } half = { version = "2.4.1", default-features = true } diff --git a/src/neon/rgb_u8.rs b/src/neon/rgb_u8.rs index 15ca20c..8a7fe7a 100644 --- a/src/neon/rgb_u8.rs +++ b/src/neon/rgb_u8.rs @@ -70,7 +70,7 @@ unsafe fn conv_horiz_rgba_2_u8( ) -> int32x4_t { const COMPONENTS: usize = 3; let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - let mut rgb_pixel = vld1_lane_u32(src_ptr.as_ptr() as *const u32, vdup_n_u32(0)); + let mut rgb_pixel = vld1_lane_u32::<0>(src_ptr.as_ptr() as *const u32, vdup_n_u32(0)); rgb_pixel = vreinterpret_u32_u16(vset_lane_u16::<2>( (src_ptr.get_unchecked(4..).as_ptr() as *const u16).read_unaligned(), vreinterpret_u16_u32(rgb_pixel), From cbed20911aaf4f7a0579a3bd56e25a3b426e85e3 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Wed, 8 Jan 2025 09:20:47 +0000 Subject: [PATCH 10/10] Refactor --- src/rgb_u8.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs index 43b81f3..92f9690 100644 --- a/src/rgb_u8.rs +++ b/src/rgb_u8.rs @@ -84,7 +84,10 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 3> { return; } } - #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), feature = "nightly_avx512"))] + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + 
feature = "nightly_avx512" + ))] { // Precision is too low without vnni let has_vnni = std::arch::is_x86_feature_detected!("avxvnni");