From d4d7ee411adf4008b5eb7c7ece23889bee3d0d4f Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk <radzivon.bartoshyk@proton.me>
Date: Fri, 10 Jan 2025 08:55:03 +0000
Subject: [PATCH 1/3] Drop i8mm and AVX-VNNI i8 dot-product RGB paths

---
 Cargo.toml                |   1 -
 app/Cargo.toml            |   2 +-
 app/src/main.rs           |  16 +-
 fuzz/Cargo.toml           |   1 -
 src/avx2/mod.rs           |   6 -
 src/avx2/rgb_u8_dot_i8.rs | 482 --------------------------------------
 src/filter_weights.rs     |  30 +--
 src/lib.rs                |   1 -
 src/neon/rgb_u8_dot.rs    | 326 --------------------------
 src/rgb_u8.rs             |  59 -----
 10 files changed, 21 insertions(+), 903 deletions(-)
 delete mode 100644 src/avx2/rgb_u8_dot_i8.rs
 delete mode 100644 src/neon/rgb_u8_dot.rs

diff --git a/Cargo.toml b/Cargo.toml
index 4ffeb59..94e9301 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -29,5 +29,4 @@ libc = "0.2.158"
 default = ["colorspaces"]
 colorspaces = ["dep:colorutils-rs"]
 nightly_avx512 = []
-nightly_i8mm = []
 nightly_avx512fp16 = ["nightly_avx512"]
\ No newline at end of file
diff --git a/app/Cargo.toml b/app/Cargo.toml
index 1c09c6d..71f0b9f 100644
--- a/app/Cargo.toml
+++ b/app/Cargo.toml
@@ -6,7 +6,7 @@ edition = "2021"
 [dependencies]
 image = { version = "0.25.5", features = ["default"] }
 #image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] }
-pic-scale = { path = "..", features = ["half"], default-features = true }
+pic-scale = { path = "..", features = ["half", "nightly_avx512"], default-features = true }
 fast_image_resize = { version = "5.0.0", features = [] }
 half = { version = "2.4.1", default-features = true }
 
diff --git a/app/src/main.rs b/app/src/main.rs
index a7f3e34..36397d2 100644
--- a/app/src/main.rs
+++ b/app/src/main.rs
@@ -48,7 +48,7 @@ fn main() {
         .decode()
         .unwrap();
     let dimensions = img.dimensions();
-    let transient = img.to_luma_alpha8();
+    let transient = img.to_rgb8();
     let mut bytes = Vec::from(transient.as_bytes());
 
     let mut scaler = Scaler::new(ResamplingFunction::Lanczos3);
@@ -60,7 +60,7 @@ fn main() {
 
     //
     let store =
-        ImageStore::<u8, 2>::from_slice(&bytes, dimensions.0 as usize, dimensions.1 as usize)
+        ImageStore::<u8, 3>::from_slice(&bytes, dimensions.0 as usize, dimensions.1 as usize)
             .unwrap();
 
     let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4);
@@ -75,15 +75,15 @@ fn main() {
     //     )
     //     .unwrap();
 
-    let mut dst_store = ImageStoreMut::<u8, 2>::alloc_with_depth(
-        dimensions.0 as usize / 3,
-        dimensions.1 as usize / 3,
+    let mut dst_store = ImageStoreMut::<u8, 3>::alloc_with_depth(
+        dimensions.0 as usize / 3 * 2,
+        dimensions.1 as usize / 3 * 2,
         10,
     );
 
     // for i in 0..25 {
     let start_time = Instant::now();
-    scaler.resize_cbcr8(&store, &mut dst_store).unwrap();
+    scaler.resize_rgb(&store, &mut dst_store).unwrap();
 
     let elapsed_time = start_time.elapsed();
     // Print the elapsed time in milliseconds
@@ -189,11 +189,11 @@ fn main() {
         .unwrap();
     } else {
         image::save_buffer(
-            "converted.webp",
+            "converted_o.png",
             &dst,
             dst_store.width as u32,
             dst_store.height as u32,
-            image::ColorType::La8,
+            image::ColorType::Rgb8,
         )
         .unwrap();
     }
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
index 9a60d02..973b015 100644
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -12,7 +12,6 @@ libfuzzer-sys = "0.4"
 pic-scale = { path = "../" }
 
 [features]
-nightly_i8mm = ["pic-scale/nightly_i8mm"]
 nightly_avx512 = ["pic-scale/nightly_avx512"]
 
 [[bin]]
diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs
index 30944b8..48aa472 100644
--- a/src/avx2/mod.rs
+++ b/src/avx2/mod.rs
@@ -34,8 +34,6 @@ mod alpha_u16;
 mod alpha_u8;
 mod check_alpha;
 mod rgb_u8;
-#[cfg(feature = "nightly_avx512")]
-mod rgb_u8_dot_i8;
 #[cfg(feature = "half")]
 mod rgba_f16;
 mod rgba_f32;
@@ -59,10 +57,6 @@ pub(crate) use check_alpha::{
     avx_has_non_constant_cap_alpha_rgba16, avx_has_non_constant_cap_alpha_rgba8,
 };
 pub(crate) use rgb_u8::{convolve_horizontal_rgb_avx_row_one, convolve_horizontal_rgb_avx_rows_4};
-#[cfg(feature = "nightly_avx512")]
-pub(crate) use rgb_u8_dot_i8::{
-    convolve_horizontal_rgb_avx_row_i8_one, convolve_horizontal_rgb_avx_rows_4_i8,
-};
 #[cfg(feature = "half")]
 pub(crate) use rgba_f16::{
     convolve_horizontal_rgba_avx_row_one_f16, convolve_horizontal_rgba_avx_rows_4_f16,
diff --git a/src/avx2/rgb_u8_dot_i8.rs b/src/avx2/rgb_u8_dot_i8.rs
deleted file mode 100644
index e444098..0000000
--- a/src/avx2/rgb_u8_dot_i8.rs
+++ /dev/null
@@ -1,482 +0,0 @@
-/*
- * Copyright (c) Radzivon Bartoshyk. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without modification,
- * are permitted provided that the following conditions are met:
- *
- * 1.  Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2.  Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3.  Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-use crate::avx2::utils::{_mm256_reduce_dot_epi16, _mm256_udot8_epi16, _mm_udot8_epi16};
-use crate::filter_weights::FilterWeights;
-#[cfg(target_arch = "x86")]
-use std::arch::x86::*;
-#[cfg(target_arch = "x86_64")]
-use std::arch::x86_64::*;
-
-#[inline(always)]
-fn compress_i32<const DOT: bool>(x: __m128i) -> __m128i {
-    unsafe {
-        if DOT {
-            let store_32 = _mm_srai_epi32::<7>(x);
-            _mm_packus_epi32(store_32, store_32)
-        } else {
-            _mm_srai_epi16::<7>(_mm_hadds_epi16(x, x))
-        }
-    }
-}
-
-pub(crate) fn convolve_horizontal_rgb_avx_rows_4_i8(
-    src: &[u8],
-    src_stride: usize,
-    dst: &mut [u8],
-    dst_stride: usize,
-    filter_weights: &FilterWeights<i8>,
-) {
-    unsafe {
-        convolve_horizontal_rgb_avx_rows_i8_4_impl(
-            src,
-            src_stride,
-            dst,
-            dst_stride,
-            filter_weights,
-        );
-    }
-}
-
-#[inline(always)]
-unsafe fn load_rgb_x2(src: &[u8]) -> __m128i {
-    let mut rgb_pixel = _mm_setzero_si128();
-    rgb_pixel = _mm_insert_epi32::<0>(rgb_pixel, (src.as_ptr() as *const i32).read_unaligned());
-    rgb_pixel = _mm_insert_epi16::<2>(
-        rgb_pixel,
-        (src.get_unchecked(4..).as_ptr() as *const i16).read_unaligned() as i32,
-    );
-    rgb_pixel
-}
-
-#[inline(always)]
-unsafe fn load_rgb_x4(src: &[u8]) -> __m128i {
-    let mut rgb_pixel = _mm_loadu_si64(src.as_ptr());
-    rgb_pixel = _mm_insert_epi32::<2>(
-        rgb_pixel,
-        (src.get_unchecked(8..).as_ptr() as *const i32).read_unaligned(),
-    );
-    rgb_pixel
-}
-
-#[inline(always)]
-unsafe fn load_distr_x8_rgb(src: &[u8], shuf: __m256i) -> __m256i {
-    let pixel_lo = _mm_loadu_si128(src.as_ptr() as *const _);
-    let pixel_hi = _mm_loadu_si64(src.get_unchecked(16..).as_ptr() as *const _);
-
-    make_tuple_x8(pixel_lo, pixel_hi, shuf)
-}
-
-#[inline(always)]
-unsafe fn make_tuple_x8(pixel: __m128i, pixel2: __m128i, shuf: __m256i) -> __m256i {
-    // Low part
-    // [R0, G0, B0] [R1, G1, B1] [R2 G2 B2] [R3 G3 B3] [R4 G4 B4] [R5]
-    // High part
-    // [G5, B5] [R6, G6, B6] [R7, G7, B7]
-
-    let hi_part = _mm_alignr_epi8::<12>(pixel2, pixel);
-
-    _mm256_shuffle_epi8(
-        _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(pixel), hi_part),
-        shuf,
-    )
-}
-
-#[target_feature(enable = "avx2", enable = "avxvnni")]
-unsafe fn convolve_horizontal_rgb_avx_rows_i8_4_impl(
-    src: &[u8],
-    src_stride: usize,
-    dst: &mut [u8],
-    dst_stride: usize,
-    filter_weights: &FilterWeights<i8>,
-) {
-    const CHANNELS: usize = 3;
-
-    const PRECISION: i32 = 7;
-    const ROUNDING_CONST: i32 = 1 << (PRECISION - 1);
-    const DOT: bool = true;
-
-    let shuffle_v = _mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, -1, -1, -1, -1);
-
-    let shuffle_weights = _mm_setr_epi8(0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3);
-
-    let weights_idx = _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1);
-
-    let shuffle_weights01 = _mm256_setr_epi8(
-        0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1,
-        2, 3,
-    );
-    let shuffle_pixels_4 = _mm256_setr_epi8(
-        0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, -1, -1, -1, -1, 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11,
-        -1, -1, -1, -1,
-    );
-
-    let vld = _mm_set1_epi32(ROUNDING_CONST);
-
-    let vld_avx = _mm256_setr_epi32(
-        ROUNDING_CONST,
-        ROUNDING_CONST,
-        ROUNDING_CONST,
-        0,
-        0,
-        0,
-        0,
-        0,
-    );
-
-    let (row0_ref, rest) = dst.split_at_mut(dst_stride);
-    let (row1_ref, rest) = rest.split_at_mut(dst_stride);
-    let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride);
-
-    let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS);
-    let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS);
-    let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS);
-    let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS);
-
-    for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0
-        .zip(iter_row1)
-        .zip(iter_row2)
-        .zip(iter_row3)
-        .zip(filter_weights.bounds.iter())
-        .zip(
-            filter_weights
-                .weights
-                .chunks_exact(filter_weights.aligned_size),
-        )
-    {
-        let mut jx = 0usize;
-        let mut store_0 = vld;
-        let mut store_1 = vld;
-        let mut store_2 = vld;
-        let mut store_3 = vld;
-
-        let src0 = src;
-        let src1 = src0.get_unchecked(src_stride..);
-        let src2 = src1.get_unchecked(src_stride..);
-        let src3 = src2.get_unchecked(src_stride..);
-
-        // [R0, G0, B0] [R1, G1, B1] [R2 G2 B2] [R3 G3 B3]
-
-        if bounds.size > 4 {
-            let mut store0 = vld_avx;
-            let mut store1 = vld_avx;
-            let mut store2 = vld_avx;
-            let mut store3 = vld_avx;
-
-            while jx + 8 < bounds.size {
-                let w_ptr = weights.get_unchecked(jx..(jx + 8));
-                let full_weights =
-                    _mm256_castsi128_si256(_mm_loadu_si64(w_ptr.as_ptr() as *const _));
-
-                let w0 = _mm256_shuffle_epi8(
-                    _mm256_permutevar8x32_epi32(full_weights, weights_idx),
-                    shuffle_weights01,
-                );
-
-                let bounds_start = (bounds.start + jx) * CHANNELS;
-
-                let rgb_pixel_0 =
-                    load_distr_x8_rgb(src0.get_unchecked(bounds_start..), shuffle_pixels_4);
-                let rgb_pixel_1 =
-                    load_distr_x8_rgb(src1.get_unchecked(bounds_start..), shuffle_pixels_4);
-                let rgb_pixel_2 =
-                    load_distr_x8_rgb(src2.get_unchecked(bounds_start..), shuffle_pixels_4);
-                let rgb_pixel_3 =
-                    load_distr_x8_rgb(src3.get_unchecked(bounds_start..), shuffle_pixels_4);
-
-                store0 = _mm256_udot8_epi16::<DOT>(store0, rgb_pixel_0, w0);
-                store1 = _mm256_udot8_epi16::<DOT>(store1, rgb_pixel_1, w0);
-                store2 = _mm256_udot8_epi16::<DOT>(store2, rgb_pixel_2, w0);
-                store3 = _mm256_udot8_epi16::<DOT>(store3, rgb_pixel_3, w0);
-
-                jx += 8;
-            }
-
-            store_0 = _mm256_reduce_dot_epi16::<DOT>(store0);
-            store_1 = _mm256_reduce_dot_epi16::<DOT>(store1);
-            store_2 = _mm256_reduce_dot_epi16::<DOT>(store2);
-            store_3 = _mm256_reduce_dot_epi16::<DOT>(store3);
-        }
-
-        while jx + 4 < bounds.size {
-            let w_ptr = weights.get_unchecked(jx..(jx + 2));
-
-            let weight0 =
-                _mm_shuffle_epi8(_mm_loadu_si32(w_ptr.as_ptr() as *const u8), shuffle_weights);
-            let bounds_start = (bounds.start + jx) * CHANNELS;
-
-            let rgb_pixel_0 = load_rgb_x4(src0.get_unchecked(bounds_start..));
-            let rgb_pixel_1 = load_rgb_x4(src1.get_unchecked(bounds_start..));
-            let rgb_pixel_2 = load_rgb_x4(src2.get_unchecked(bounds_start..));
-            let rgb_pixel_4 = load_rgb_x4(src3.get_unchecked(bounds_start..));
-
-            let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_v);
-            let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_v);
-            let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_v);
-            let lo_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_v);
-
-            store_0 = _mm_udot8_epi16::<DOT>(store_0, lo_0, weight0);
-            store_1 = _mm_udot8_epi16::<DOT>(store_1, lo_1, weight0);
-            store_2 = _mm_udot8_epi16::<DOT>(store_2, lo_2, weight0);
-            store_3 = _mm_udot8_epi16::<DOT>(store_3, lo_3, weight0);
-
-            jx += 4;
-        }
-
-        while jx + 2 < bounds.size {
-            let w_ptr = weights.get_unchecked(jx..(jx + 2));
-            let bounds_start = (bounds.start + jx) * CHANNELS;
-            let weight0 =
-                _mm_shuffle_epi8(_mm_loadu_si16(w_ptr.as_ptr() as *const u8), shuffle_weights);
-
-            let rgb_pixel_0 = load_rgb_x2(src0.get_unchecked(bounds_start..));
-            let rgb_pixel_1 = load_rgb_x2(src1.get_unchecked(bounds_start..));
-            let rgb_pixel_2 = load_rgb_x2(src2.get_unchecked(bounds_start..));
-            let rgb_pixel_4 = load_rgb_x2(src3.get_unchecked(bounds_start..));
-
-            let lo_0 = _mm_shuffle_epi8(rgb_pixel_0, shuffle_v);
-            let lo_1 = _mm_shuffle_epi8(rgb_pixel_1, shuffle_v);
-            let lo_2 = _mm_shuffle_epi8(rgb_pixel_2, shuffle_v);
-            let lo_3 = _mm_shuffle_epi8(rgb_pixel_4, shuffle_v);
-
-            store_0 = _mm_udot8_epi16::<DOT>(store_0, lo_0, weight0);
-            store_1 = _mm_udot8_epi16::<DOT>(store_1, lo_1, weight0);
-            store_2 = _mm_udot8_epi16::<DOT>(store_2, lo_2, weight0);
-            store_3 = _mm_udot8_epi16::<DOT>(store_3, lo_3, weight0);
-
-            jx += 2;
-        }
-
-        while jx < bounds.size {
-            let w_ptr = weights.get_unchecked(jx..(jx + 1));
-            let bounds_start = bounds.start + jx;
-
-            let weight0 = _mm_shuffle_epi8(
-                _mm_set1_epi8(w_ptr.as_ptr().read_unaligned()),
-                shuffle_weights,
-            );
-
-            store_0 = add_one_weight::<DOT>(bounds_start, src0, weight0, store_0);
-            store_1 = add_one_weight::<DOT>(bounds_start, src1, weight0, store_1);
-            store_2 = add_one_weight::<DOT>(bounds_start, src2, weight0, store_2);
-            store_3 = add_one_weight::<DOT>(bounds_start, src3, weight0, store_3);
-            jx += 1;
-        }
-
-        let store_0_8 = compress_i32::<DOT>(store_0);
-        let store_1_8 = compress_i32::<DOT>(store_1);
-        let store_2_8 = compress_i32::<DOT>(store_2);
-        let store_3_8 = compress_i32::<DOT>(store_3);
-
-        let store_0_8 = _mm_packus_epi16(store_0_8, store_0_8);
-        let store_1_8 = _mm_packus_epi16(store_1_8, store_1_8);
-        let store_2_8 = _mm_packus_epi16(store_2_8, store_2_8);
-        let store_3_8 = _mm_packus_epi16(store_3_8, store_3_8);
-
-        let element_0 = _mm_extract_epi32::<0>(store_0_8);
-        let element_1 = _mm_extract_epi32::<0>(store_1_8);
-        let element_2 = _mm_extract_epi32::<0>(store_2_8);
-        let element_3 = _mm_extract_epi32::<0>(store_3_8);
-
-        let bytes = element_0.to_le_bytes();
-        let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]);
-        (chunk0.as_mut_ptr() as *mut u16).write_unaligned(first_byte);
-        *chunk0.get_unchecked_mut(2) = bytes[2];
-
-        let bytes = element_1.to_le_bytes();
-        let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]);
-        (chunk1.as_mut_ptr() as *mut u16).write_unaligned(first_byte);
-        *chunk1.get_unchecked_mut(2) = bytes[2];
-
-        let bytes = element_2.to_le_bytes();
-        let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]);
-        (chunk2.as_mut_ptr() as *mut u16).write_unaligned(first_byte);
-        *chunk2.get_unchecked_mut(2) = bytes[2];
-
-        let bytes = element_3.to_le_bytes();
-        let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]);
-        (chunk3.as_mut_ptr() as *mut u16).write_unaligned(first_byte);
-        *chunk3.get_unchecked_mut(2) = bytes[2];
-    }
-}
-
-pub(crate) fn convolve_horizontal_rgb_avx_row_i8_one(
-    src: &[u8],
-    dst: &mut [u8],
-    filter_weights: &FilterWeights<i8>,
-) {
-    unsafe {
-        convolve_horizontal_rgb_avx_row_i8_one_impl(src, dst, filter_weights);
-    }
-}
-
-#[inline(always)]
-unsafe fn add_one_weight<const DOT: bool>(
-    start_x: usize,
-    src: &[u8],
-    weight0: __m128i,
-    store_0: __m128i,
-) -> __m128i {
-    const COMPONENTS: usize = 3;
-    let src_ptr = src.get_unchecked((start_x * COMPONENTS)..).as_ptr();
-    let base_pixel = _mm_loadu_si16(src.as_ptr());
-    let m_vl = _mm_insert_epi8::<2>(base_pixel, src_ptr.add(2).read_unaligned() as i32);
-    let lo = _mm_unpacklo_epi8(m_vl, _mm_setzero_si128());
-    _mm_udot8_epi16::<DOT>(store_0, lo, weight0)
-}
-
-#[target_feature(enable = "avx2", enable = "avxvnni")]
-unsafe fn convolve_horizontal_rgb_avx_row_i8_one_impl(
-    src: &[u8],
-    dst: &mut [u8],
-    filter_weights: &FilterWeights<i8>,
-) {
-    const CHANNELS: usize = 3;
-    const DOT: bool = true;
-
-    let shuffle_v = _mm_setr_epi8(0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, -1, -1, -1, -1);
-
-    let shuffle_weights = _mm_setr_epi8(0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3);
-
-    let weights_idx = _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1);
-
-    let shuffle_weights01 = _mm256_setr_epi8(
-        0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1,
-        2, 3,
-    );
-    let shuffle_pixels_4 = _mm256_setr_epi8(
-        0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, -1, -1, -1, -1, 0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11,
-        -1, -1, -1, -1,
-    );
-
-    // Low part
-    // [R0, G0, B0] [R1, G1, B1] [R2 G2 B2] [R3 G3 B3] [R4 G4 B4] [R5]
-    // High part
-    // [G5, B5] [R6, G6, B6] [R7, G7, B7]
-
-    const PRECISION: i32 = 7;
-    const ROUNDING_CONST: i32 = 1 << (PRECISION - 1);
-
-    let vld = _mm_set1_epi32(ROUNDING_CONST);
-
-    let vld_avx = _mm256_setr_epi32(
-        ROUNDING_CONST,
-        ROUNDING_CONST,
-        ROUNDING_CONST,
-        0,
-        0,
-        0,
-        0,
-        0,
-    );
-
-    for ((dst, bounds), weights) in dst
-        .chunks_exact_mut(CHANNELS)
-        .zip(filter_weights.bounds.iter())
-        .zip(
-            filter_weights
-                .weights
-                .chunks_exact(filter_weights.aligned_size),
-        )
-    {
-        let bounds_size = bounds.size;
-        let mut jx = 0usize;
-
-        let mut store = if bounds_size > 4 {
-            let mut store = vld_avx;
-            while jx + 8 < bounds.size {
-                let w_ptr = weights.get_unchecked(jx..(jx + 8));
-                let full_weights =
-                    _mm256_castsi128_si256(_mm_loadu_si64(w_ptr.as_ptr() as *const _));
-
-                let w0 = _mm256_shuffle_epi8(
-                    _mm256_permutevar8x32_epi32(full_weights, weights_idx),
-                    shuffle_weights01,
-                );
-
-                let bounds_start = bounds.start + jx;
-                let src_ptr_0 = src.get_unchecked((bounds_start * CHANNELS)..);
-
-                let pixel_lo = _mm_loadu_si128(src_ptr_0.as_ptr() as *const _);
-                let pixel_hi = _mm_loadu_si64(src_ptr_0.get_unchecked(16..).as_ptr() as *const _);
-
-                let px = make_tuple_x8(pixel_lo, pixel_hi, shuffle_pixels_4);
-
-                store = _mm256_udot8_epi16::<DOT>(store, px, w0);
-
-                jx += 8;
-            }
-
-            _mm256_reduce_dot_epi16::<DOT>(store)
-        } else {
-            vld
-        };
-
-        while jx + 4 < bounds.size {
-            let w_ptr = weights.get_unchecked(jx..(jx + 2));
-            let weight0 =
-                _mm_shuffle_epi8(_mm_loadu_si32(w_ptr.as_ptr() as *const u8), shuffle_weights);
-            let src_ptr = src.get_unchecked(((bounds.start + jx) * 3)..);
-            let rgb_pixel = load_rgb_x4(src_ptr);
-            let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_v);
-            store = _mm_udot8_epi16::<DOT>(store, lo, weight0);
-            jx += 4;
-        }
-
-        while jx + 2 < bounds.size {
-            let w_ptr = weights.get_unchecked(jx..(jx + 2));
-            let weight0 =
-                _mm_shuffle_epi8(_mm_loadu_si16(w_ptr.as_ptr() as *const u8), shuffle_weights);
-            let src_ptr = src.get_unchecked(((bounds.start + jx) * 3)..);
-            let rgb_pixel = load_rgb_x2(src_ptr);
-            let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_v);
-            store = _mm_udot8_epi16::<DOT>(store, lo, weight0);
-            jx += 2;
-        }
-
-        while jx < bounds_size {
-            let w_ptr = weights.get_unchecked(jx..(jx + 1));
-            let weight0 = _mm_shuffle_epi8(
-                _mm_set1_epi8(w_ptr.as_ptr().read_unaligned()),
-                shuffle_weights,
-            );
-            store = add_one_weight::<DOT>(bounds.start + jx, src, weight0, store);
-            jx += 1;
-        }
-
-        let store_16_8 = compress_i32::<DOT>(store);
-        let store_16_8 = _mm_packus_epi16(store_16_8, store_16_8);
-
-        let element = _mm_extract_epi32::<0>(store_16_8);
-        let bytes = element.to_le_bytes();
-        let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]);
-        (dst.as_mut_ptr() as *mut u16).write_unaligned(first_byte);
-        *dst.get_unchecked_mut(2) = bytes[2];
-    }
-}
diff --git a/src/filter_weights.rs b/src/filter_weights.rs
index 0d0701f..a665d96 100644
--- a/src/filter_weights.rs
+++ b/src/filter_weights.rs
@@ -26,7 +26,7 @@
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
-use num_traits::AsPrimitive;
+use num_traits::{AsPrimitive, Bounded};
 
 #[derive(Debug, Clone)]
 pub(crate) struct FilterWeights<T> {
@@ -79,7 +79,7 @@ impl FilterWeights<f32> {
     }
 
     pub(crate) fn numerical_approximation<
-        J: Clone + Default + Copy + 'static,
+        J: Clone + Default + Copy + 'static + Bounded + AsPrimitive<f32>,
         const PRECISION: i32,
     >(
         &self,
@@ -97,13 +97,20 @@ impl FilterWeights<f32> {
 
         let mut output_kernel = vec![J::default(); self.distinct_elements * align];
 
+        let lower_bound = J::min_value().as_();
+        let upper_bound = J::max_value().as_();
+
         for (chunk, kernel_chunk) in self
             .weights
             .chunks_exact(self.kernel_size)
             .zip(output_kernel.chunks_exact_mut(align))
         {
             for (&weight, kernel) in chunk.iter().zip(kernel_chunk) {
-                *kernel = (weight * precision_scale).round().as_();
+                *kernel = (weight * precision_scale)
+                    .round()
+                    .min(upper_bound)
+                    .max(lower_bound)
+                    .as_();
             }
         }
 
@@ -131,7 +138,8 @@ pub(crate) trait WeightsConverter<V> {
 #[derive(Default)]
 pub(crate) struct DefaultWeightsConverter {}
 
-impl<V: Default + Copy + 'static + Clone> WeightsConverter<V> for DefaultWeightsConverter
+impl<V: Default + Copy + 'static + Clone + Bounded + AsPrimitive<f32>> WeightsConverter<V>
+    for DefaultWeightsConverter
 where
     f32: AsPrimitive<V>,
 {
@@ -141,20 +149,6 @@ where
     }
 }
 
-#[derive(Default)]
-#[allow(dead_code)]
-pub(crate) struct WeightsConverterQ7 {}
-
-#[allow(dead_code)]
-impl<V: Default + Copy + 'static + Clone> WeightsConverter<V> for WeightsConverterQ7
-where
-    f32: AsPrimitive<V>,
-{
-    fn prepare_weights(&self, weights: &FilterWeights<f32>) -> FilterWeights<V> {
-        weights.numerical_approximation::<V, 7>(0)
-    }
-}
-
 #[derive(Default)]
 #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
 pub(crate) struct WeightFloat16Converter {}
diff --git a/src/lib.rs b/src/lib.rs
index c3c8b41..cdf959e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -32,7 +32,6 @@
 #![cfg_attr(feature = "nightly_avx512", feature(cfg_version))]
 #![cfg_attr(feature = "nightly_avx512", feature(avx512_target_feature))]
 #![cfg_attr(feature = "nightly_avx512", feature(stdarch_x86_avx512))]
-#![cfg_attr(feature = "nightly_i8mm", feature(stdarch_neon_i8mm))]
 #![cfg_attr(feature = "nightly_avx512fp16", feature(stdarch_x86_avx512_f16))]
 
 mod alpha_check;
diff --git a/src/neon/rgb_u8_dot.rs b/src/neon/rgb_u8_dot.rs
deleted file mode 100644
index 0ac1ba6..0000000
--- a/src/neon/rgb_u8_dot.rs
+++ /dev/null
@@ -1,326 +0,0 @@
-/*
- * Copyright (c) Radzivon Bartoshyk 01/2025. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without modification,
- * are permitted provided that the following conditions are met:
- *
- * 1.  Redistributions of source code must retain the above copyright notice, this
- * list of conditions and the following disclaimer.
- *
- * 2.  Redistributions in binary form must reproduce the above copyright notice,
- * this list of conditions and the following disclaimer in the documentation
- * and/or other materials provided with the distribution.
- *
- * 3.  Neither the name of the copyright holder nor the names of its
- * contributors may be used to endorse or promote products derived from
- * this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-use crate::filter_weights::FilterWeights;
-use crate::neon::utils::{load_12b_as_u8x16, load_3b_as_u8x16, load_6b_as_u8x16};
-use std::arch::aarch64::*;
-
-#[must_use]
-#[inline(always)]
-unsafe fn conv_horiz_rgba_8_u8(
-    start_x: usize,
-    src: &[u8],
-    weights0: int8x16_t,
-    weights1: int8x16_t,
-    shuffle: uint8x16_t,
-    store: int32x4_t,
-) -> int32x4_t {
-    const COMPONENTS: usize = 3;
-    let src_ptr = src.get_unchecked((start_x * COMPONENTS)..);
-    let pixel_lo = vld1q_u8(src_ptr.as_ptr());
-    let pixel_hi = vcombine_u8(vld1_u8(src_ptr.get_unchecked(16..).as_ptr()), vdup_n_u8(0));
-    let created_new = vextq_u8::<12>(pixel_lo, pixel_hi);
-    let pixel0 = vqtbl1q_u8(pixel_lo, shuffle);
-    let pixel1 = vqtbl1q_u8(created_new, shuffle);
-    let v0 = vusdotq_s32(store, pixel0, weights0);
-    vusdotq_s32(v0, pixel1, weights1)
-}
-
-#[must_use]
-#[inline(always)]
-unsafe fn conv_horiz_rgba_4_u8(
-    start_x: usize,
-    src: &[u8],
-    weights: int8x16_t,
-    shuffle: uint8x16_t,
-    store: int32x4_t,
-) -> int32x4_t {
-    const COMPONENTS: usize = 3;
-    let src_ptr = src.get_unchecked((start_x * COMPONENTS)..);
-    let pixel = vqtbl1q_u8(load_12b_as_u8x16(src_ptr.as_ptr()), shuffle);
-    vusdotq_s32(store, pixel, weights)
-}
-
-#[must_use]
-#[inline(always)]
-unsafe fn conv_horiz_rgba_2_u8(
-    start_x: usize,
-    src: &[u8],
-    weights: int8x16_t,
-    shuffle: uint8x16_t,
-    store: int32x4_t,
-) -> int32x4_t {
-    const COMPONENTS: usize = 3;
-    let src_ptr = src.get_unchecked((start_x * COMPONENTS)..);
-    let rgb_pixel = vqtbl1q_u8(load_6b_as_u8x16(src_ptr.as_ptr()), shuffle);
-    vusdotq_s32(store, rgb_pixel, weights)
-}
-
-#[must_use]
-#[inline(always)]
-unsafe fn conv_horiz_rgba_1_u8(
-    start_x: usize,
-    src: &[u8],
-    w0: int8x16_t,
-    shuf: uint8x16_t,
-    store: int32x4_t,
-) -> int32x4_t {
-    const COMPONENTS: usize = 3;
-    let src_ptr = src.get_unchecked((start_x * COMPONENTS)..);
-    let rgb_pixel = vqtbl1q_u8(load_3b_as_u8x16(src_ptr.as_ptr()), shuf);
-    vusdotq_s32(store, rgb_pixel, w0)
-}
-
-#[inline(always)]
-unsafe fn write_accumulator_u8(store: int32x4_t, dst: &mut [u8]) {
-    let store_16 = vqshrun_n_s32::<7>(store);
-    let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16));
-    vst1_lane_u16::<0>(
-        dst.as_mut_ptr() as *mut u16,
-        vreinterpret_u16_u8(store_16_8),
-    );
-    vst1_lane_u8::<2>(dst.as_mut_ptr().add(2), store_16_8);
-}
-
-pub(crate) fn convolve_horizontal_rgb_neon_dot_rows_4(
-    src: &[u8],
-    src_stride: usize,
-    dst: &mut [u8],
-    dst_stride: usize,
-    filter_weights: &FilterWeights<i8>,
-) {
-    unsafe {
-        convolve_horizontal_rgb_neon_dot_rows_4_impl(
-            src,
-            src_stride,
-            dst,
-            dst_stride,
-            filter_weights,
-        );
-    }
-}
-
-#[target_feature(enable = "i8mm")]
-unsafe fn convolve_horizontal_rgb_neon_dot_rows_4_impl(
-    src: &[u8],
-    src_stride: usize,
-    dst: &mut [u8],
-    dst_stride: usize,
-    filter_weights: &FilterWeights<i8>,
-) {
-    let shuffle_v_table: [u8; 16] = [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 255, 255, 255, 255];
-    let shuffle_v = vld1q_u8(shuffle_v_table.as_ptr());
-    let weights_shuffle_table: [u8; 16] = [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
-    let weights_shuffle = vld1q_u8(weights_shuffle_table.as_ptr());
-    let weights_shuffle_table1: [u8; 16] = [4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7];
-    let weights_shuffle1 = vld1q_u8(weights_shuffle_table1.as_ptr());
-
-    // (r0 g0 b0 r1) (g2 b2 r3 g3) (b3 r4 g4 b4) (r5 g5 b5 r6)
-
-    const CHANNELS: usize = 3;
-    const ROUNDING_CONST: i32 = 1 << 6;
-    let init = vdupq_n_s32(ROUNDING_CONST);
-    let (row0_ref, rest) = dst.split_at_mut(dst_stride);
-    let (row1_ref, rest) = rest.split_at_mut(dst_stride);
-    let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride);
-
-    let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS);
-    let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS);
-    let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS);
-    let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS);
-
-    for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0
-        .zip(iter_row1)
-        .zip(iter_row2)
-        .zip(iter_row3)
-        .zip(filter_weights.bounds.iter())
-        .zip(
-            filter_weights
-                .weights
-                .chunks_exact(filter_weights.aligned_size),
-        )
-    {
-        let mut jx = 0usize;
-        let mut store_0 = init;
-        let mut store_1 = init;
-        let mut store_2 = init;
-        let mut store_3 = init;
-
-        let src0 = src;
-        let src1 = src0.get_unchecked(src_stride..);
-        let src2 = src1.get_unchecked(src_stride..);
-        let src3 = src2.get_unchecked(src_stride..);
-
-        while jx + 8 < bounds.size {
-            let bounds_start = bounds.start + jx;
-            let w_ptr = weights.get_unchecked(jx..(jx + 4));
-            let weights = vreinterpretq_s8_s64(vld1q_dup_s64(w_ptr.as_ptr() as *const _));
-            let weights0 = vqtbl1q_s8(weights, weights_shuffle);
-            let weights1 = vqtbl1q_s8(weights, weights_shuffle1);
-
-            store_0 =
-                conv_horiz_rgba_8_u8(bounds_start, src0, weights0, weights1, shuffle_v, store_0);
-            store_1 =
-                conv_horiz_rgba_8_u8(bounds_start, src1, weights0, weights1, shuffle_v, store_1);
-            store_2 =
-                conv_horiz_rgba_8_u8(bounds_start, src2, weights0, weights1, shuffle_v, store_2);
-            store_3 =
-                conv_horiz_rgba_8_u8(bounds_start, src3, weights0, weights1, shuffle_v, store_3);
-            jx += 8;
-        }
-
-        while jx + 4 < bounds.size {
-            let bounds_start = bounds.start + jx;
-            let w_ptr = weights.get_unchecked(jx..(jx + 4));
-            let weights = vqtbl1q_s8(
-                vreinterpretq_s8_s32(vld1q_dup_s32(w_ptr.as_ptr() as *const _)),
-                weights_shuffle,
-            );
-            store_0 = conv_horiz_rgba_4_u8(bounds_start, src0, weights, shuffle_v, store_0);
-            store_1 = conv_horiz_rgba_4_u8(bounds_start, src1, weights, shuffle_v, store_1);
-            store_2 = conv_horiz_rgba_4_u8(bounds_start, src2, weights, shuffle_v, store_2);
-            store_3 = conv_horiz_rgba_4_u8(bounds_start, src3, weights, shuffle_v, store_3);
-            jx += 4;
-        }
-
-        while jx + 2 < bounds.size {
-            let w_ptr = weights.get_unchecked(jx..(jx + 2));
-            let bnds = bounds.start + jx;
-            let v_weight = vqtbl1q_s8(
-                vreinterpretq_s8_s16(vld1q_dup_s16(w_ptr.as_ptr() as *const _)),
-                weights_shuffle,
-            );
-            store_0 = conv_horiz_rgba_2_u8(bnds, src0, v_weight, shuffle_v, store_0);
-            store_1 = conv_horiz_rgba_2_u8(bnds, src1, v_weight, shuffle_v, store_1);
-            store_2 = conv_horiz_rgba_2_u8(bnds, src2, v_weight, shuffle_v, store_2);
-            store_3 = conv_horiz_rgba_2_u8(bnds, src3, v_weight, shuffle_v, store_3);
-            jx += 2;
-        }
-
-        while jx < bounds.size {
-            let w_ptr = weights.get_unchecked(jx..(jx + 1));
-            let bnds = bounds.start + jx;
-            let weight0 = vqtbl1q_s8(vld1q_dup_s8(w_ptr.as_ptr()), weights_shuffle);
-            store_0 = conv_horiz_rgba_1_u8(bnds, src0, weight0, shuffle_v, store_0);
-            store_1 = conv_horiz_rgba_1_u8(bnds, src1, weight0, shuffle_v, store_1);
-            store_2 = conv_horiz_rgba_1_u8(bnds, src2, weight0, shuffle_v, store_2);
-            store_3 = conv_horiz_rgba_1_u8(bnds, src3, weight0, shuffle_v, store_3);
-            jx += 1;
-        }
-
-        write_accumulator_u8(store_0, chunk0);
-        write_accumulator_u8(store_1, chunk1);
-        write_accumulator_u8(store_2, chunk2);
-        write_accumulator_u8(store_3, chunk3);
-    }
-}
-
-pub(crate) fn convolve_horizontal_rgb_neon_dot_row_one(
-    src: &[u8],
-    dst: &mut [u8],
-    filter_weights: &FilterWeights<i8>,
-) {
-    unsafe {
-        convolve_horizontal_rgb_neon_dot_row_one_impl(src, dst, filter_weights);
-    }
-}
-
-#[target_feature(enable = "i8mm")]
-unsafe fn convolve_horizontal_rgb_neon_dot_row_one_impl(
-    src: &[u8],
-    dst: &mut [u8],
-    filter_weights: &FilterWeights<i8>,
-) {
-    const CHANNELS: usize = 3;
-    const ROUNDING_CONST: i32 = 1 << 6;
-
-    let shuffle_v_table: [u8; 16] = [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11, 255, 255, 255, 255];
-    let shuffle_v = vld1q_u8(shuffle_v_table.as_ptr());
-    let weights_shuffle_table: [u8; 16] = [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3];
-    let weights_shuffle = vld1q_u8(weights_shuffle_table.as_ptr());
-    let weights_shuffle_table1: [u8; 16] = [4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7];
-    let weights_shuffle1 = vld1q_u8(weights_shuffle_table1.as_ptr());
-
-    for ((dst, bounds), weights) in dst
-        .chunks_exact_mut(CHANNELS)
-        .zip(filter_weights.bounds.iter())
-        .zip(
-            filter_weights
-                .weights
-                .chunks_exact(filter_weights.aligned_size),
-        )
-    {
-        let bounds_size = bounds.size;
-
-        let mut jx = 0usize;
-        let mut store = vdupq_n_s32(ROUNDING_CONST);
-
-        while jx + 8 < bounds_size {
-            let bounds_start = bounds.start + jx;
-            let w_ptr = weights.get_unchecked(jx..(jx + 4));
-            let weights = vreinterpretq_s8_s64(vld1q_dup_s64(w_ptr.as_ptr() as *const _));
-            let weights0 = vqtbl1q_s8(weights, weights_shuffle);
-            let weights1 = vqtbl1q_s8(weights, weights_shuffle1);
-            store = conv_horiz_rgba_8_u8(bounds_start, src, weights0, weights1, shuffle_v, store);
-            jx += 8;
-        }
-
-        while jx + 4 < bounds_size {
-            let bounds_start = bounds.start + jx;
-            let w_ptr = weights.get_unchecked(jx..(jx + 4));
-            let weights = vqtbl1q_s8(
-                vreinterpretq_s8_s32(vld1q_dup_s32(w_ptr.as_ptr() as *const _)),
-                weights_shuffle,
-            );
-            store = conv_horiz_rgba_4_u8(bounds_start, src, weights, shuffle_v, store);
-            jx += 4;
-        }
-
-        while jx + 2 < bounds_size {
-            let w_ptr = weights.get_unchecked(jx..(jx + 2));
-            let bounds_start = bounds.start + jx;
-            let v_weight = vqtbl1q_s8(
-                vreinterpretq_s8_s16(vld1q_dup_s16(w_ptr.as_ptr() as *const _)),
-                weights_shuffle,
-            );
-            store = conv_horiz_rgba_2_u8(bounds_start, src, v_weight, shuffle_v, store);
-            jx += 2;
-        }
-
-        while jx < bounds_size {
-            let w_ptr = weights.get_unchecked(jx..(jx + 1));
-            let weight0 = vqtbl1q_s8(vld1q_dup_s8(w_ptr.as_ptr()), weights_shuffle);
-            let bnds = bounds.start + jx;
-            store = conv_horiz_rgba_1_u8(bnds, src, weight0, shuffle_v, store);
-            jx += 1;
-        }
-
-        write_accumulator_u8(store, dst);
-    }
-}
diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs
index b6257b6..5f541fc 100644
--- a/src/rgb_u8.rs
+++ b/src/rgb_u8.rs
@@ -55,65 +55,6 @@ impl HorizontalConvolutionPass<u8, 3> for ImageStore<'_, u8, 3> {
         pool: &Option<ThreadPool>,
     ) {
         let _scale_factor = self.width as f32 / destination.width as f32;
-        #[cfg(all(
-            feature = "nightly_i8mm",
-            target_arch = "aarch64",
-            target_feature = "neon"
-        ))]
-        {
-            if _scale_factor <= 2. && std::arch::is_aarch64_feature_detected!("i8mm") {
-                use crate::filter_weights::WeightsConverterQ7;
-                use crate::neon::{
-                    convolve_horizontal_rgb_neon_dot_row_one,
-                    convolve_horizontal_rgb_neon_dot_rows_4,
-                };
-                let _dispatcher_4_rows: Option<
-                    fn(&[u8], usize, &mut [u8], usize, &FilterWeights<i8>),
-                > = Some(convolve_horizontal_rgb_neon_dot_rows_4);
-                let _dispatcher_1_row: fn(&[u8], &mut [u8], &FilterWeights<i8>) =
-                    convolve_horizontal_rgb_neon_dot_row_one;
-                convolve_horizontal_dispatch_u8(
-                    self,
-                    filter_weights,
-                    destination,
-                    pool,
-                    _dispatcher_4_rows,
-                    _dispatcher_1_row,
-                    WeightsConverterQ7::default(),
-                );
-                return;
-            }
-        }
-        #[cfg(all(
-            any(target_arch = "x86_64", target_arch = "x86"),
-            feature = "nightly_avx512"
-        ))]
-        {
-            // Precision is too low without vnni
-            let has_vnni = std::arch::is_x86_feature_detected!("avxvnni");
-            if _scale_factor <= 2. && has_vnni {
-                use crate::avx2::{
-                    convolve_horizontal_rgb_avx_row_i8_one, convolve_horizontal_rgb_avx_rows_4_i8,
-                };
-                use crate::filter_weights::WeightsConverterQ7;
-                let _dispatcher_4_rows: Option<
-                    fn(&[u8], usize, &mut [u8], usize, &FilterWeights<i8>),
-                > = Some(convolve_horizontal_rgb_avx_rows_4_i8);
-                let _dispatcher_1_row: fn(&[u8], &mut [u8], &FilterWeights<i8>) =
-                    convolve_horizontal_rgb_avx_row_i8_one;
-                convolve_horizontal_dispatch_u8(
-                    self,
-                    filter_weights,
-                    destination,
-                    pool,
-                    _dispatcher_4_rows,
-                    _dispatcher_1_row,
-                    WeightsConverterQ7::default(),
-                );
-                return;
-            }
-        }
-
         let mut _dispatcher_4_rows: Option<
             fn(&[u8], usize, &mut [u8], usize, &FilterWeights<i16>),
         > = Some(handle_fixed_rows_4_u8::<3>);

From f253e3a6ae62ff0ae64ad0bd789c3cb25d210cff Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk <radzivon.bartoshyk@proton.me>
Date: Fri, 10 Jan 2025 08:59:23 +0000
Subject: [PATCH 2/3] Dropped i8mm

---
 Cargo.toml     | 3 ++-
 app/Cargo.toml | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 94e9301..43dcf51 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -29,4 +29,5 @@ libc = "0.2.158"
 default = ["colorspaces"]
 colorspaces = ["dep:colorutils-rs"]
 nightly_avx512 = []
-nightly_avx512fp16 = ["nightly_avx512"]
\ No newline at end of file
+nightly_avx512fp16 = ["nightly_avx512"]
+nightly_i8mm = []
\ No newline at end of file
diff --git a/app/Cargo.toml b/app/Cargo.toml
index 71f0b9f..1c09c6d 100644
--- a/app/Cargo.toml
+++ b/app/Cargo.toml
@@ -6,7 +6,7 @@ edition = "2021"
 [dependencies]
 image = { version = "0.25.5", features = ["default"] }
 #image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] }
-pic-scale = { path = "..", features = ["half", "nightly_avx512"], default-features = true }
+pic-scale = { path = "..", features = ["half"], default-features = true }
 fast_image_resize = { version = "5.0.0", features = [] }
 half = { version = "2.4.1", default-features = true }
 

From d2028bc6540b592cdd35f1f260eb69ddfaa6c137 Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk <radzivon.bartoshyk@proton.me>
Date: Fri, 10 Jan 2025 08:59:44 +0000
Subject: [PATCH 3/3] Dropped i8mm

---
 .github/workflows/build_push.yml | 1 -
 Cargo.toml                       | 3 +--
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml
index 315e7b6..873aea2 100644
--- a/.github/workflows/build_push.yml
+++ b/.github/workflows/build_push.yml
@@ -25,7 +25,6 @@ jobs:
       - run: rustup target add aarch64-unknown-linux-gnu x86_64-unknown-linux-gnu i686-unknown-linux-gnu powerpc-unknown-linux-gnu riscv64gc-unknown-linux-gnu
       - run: RUSTFLAGS="-C target-feature=+neon,-fp16" cargo build --target aarch64-unknown-linux-gnu --features half
       - run: RUSTFLAGS="-C target-feature=+neon,+fp16" cargo build --target aarch64-unknown-linux-gnu --features half
-      - run: RUSTFLAGS="-C target-feature=+neon,+fp16" cargo build --target aarch64-unknown-linux-gnu --features half,nightly_i8mm
       - run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target i686-unknown-linux-gnu
       - run: cargo build --target powerpc-unknown-linux-gnu
       - run: cargo build --target riscv64gc-unknown-linux-gnu
diff --git a/Cargo.toml b/Cargo.toml
index 43dcf51..94e9301 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -29,5 +29,4 @@ libc = "0.2.158"
 default = ["colorspaces"]
 colorspaces = ["dep:colorutils-rs"]
 nightly_avx512 = []
-nightly_avx512fp16 = ["nightly_avx512"]
-nightly_i8mm = []
\ No newline at end of file
+nightly_avx512fp16 = ["nightly_avx512"]
\ No newline at end of file