From e7f4ccd3c357c2a2f552c27c019e58fc6e589992 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Thu, 14 Nov 2024 22:15:38 +0000 Subject: [PATCH 01/19] Improved RDM detection --- Cargo.toml | 2 +- src/cpu_features.rs | 19 +++++++++++++++++-- src/lib.rs | 2 -- src/plane_u8.rs | 2 +- src/rgb_u8.rs | 2 +- src/rgba_u8.rs | 4 ++-- 6 files changed, 22 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9c091c7..019bce5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ workspace = { members = ["app", "wasm"] } [package] name = "pic-scale" -version = "0.3.5" +version = "0.3.6" edition = "2021" description = "High performance image scaling" readme = "README.md" diff --git a/src/cpu_features.rs b/src/cpu_features.rs index dcd0497..94bf023 100644 --- a/src/cpu_features.rs +++ b/src/cpu_features.rs @@ -65,7 +65,7 @@ fn apple_has_cpu_feature(_feature_name: &str) -> bool { /// Test aarch64 cpu with *fp16* check, /// on *Apple* platform [libc](https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics) be used #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -pub fn is_aarch_f16_supported() -> bool { +pub(crate) fn is_aarch_f16_supported() -> bool { #[cfg(any(target_os = "macos", target_os = "ios"))] { apple_has_cpu_feature("hw.optional.arm.FEAT_FP16") @@ -81,7 +81,7 @@ pub fn is_aarch_f16_supported() -> bool { /// otherwise consider it is always available #[allow(clippy::too_long_first_doc_paragraph)] #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -pub fn is_aarch_f16c_supported() -> bool { +pub(crate) fn is_aarch_f16c_supported() -> bool { #[cfg(any(target_os = "macos", target_os = "ios"))] { apple_has_cpu_feature("hw.optional.AdvSIMD_HPFPCvt") @@ -91,3 +91,18 @@ pub fn is_aarch_f16c_supported() -> bool { true } } + +/// Test aarch64 cpu with *RDM* check +/// +/// on *Apple* platform [libc](https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics) be used +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +pub(crate) fn is_aarch_rdm_supported() -> bool { + #[cfg(any(target_os = "macos", target_os = "ios"))] + { + apple_has_cpu_feature("hw.optional.arm.FEAT_RDM") + } + #[cfg(not(any(target_os = "macos", target_os = "ios")))] + { + std::arch::is_aarch64_feature_detected!("rdm") + } +} diff --git a/src/lib.rs b/src/lib.rs index 1942a7d..7de7dc5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -91,8 +91,6 @@ mod wasm32; pub use colors::*; #[cfg(feature = "colorspaces")] pub use colorutils_rs::TransferFunction; -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -pub use cpu_features::{is_aarch_f16_supported, is_aarch_f16c_supported}; pub use image_size::ImageSize; pub use image_store::ImageStore; pub use math::*; diff --git a/src/plane_u8.rs b/src/plane_u8.rs index b448b41..0cd736e 100644 --- a/src/plane_u8.rs +++ b/src/plane_u8.rs @@ -96,7 +96,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 1> { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { // For more downscaling better to use more precise version - if _scale_factor < 8. && std::arch::is_aarch64_feature_detected!("rdm") { + if _scale_factor < 8. 
&& crate::cpu_features::is_aarch_rdm_supported() { _dispatcher = convolve_vertical_neon_i16_precision; } else { _dispatcher = convolve_vertical_neon_i32_precision; diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs index 1ec37e7..85ce1e5 100644 --- a/src/rgb_u8.rs +++ b/src/rgb_u8.rs @@ -96,7 +96,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 3> { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { // For more downscaling better to use more precise version - if _scale_factor < 8. && std::arch::is_aarch64_feature_detected!("rdm") { + if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { _dispatcher = convolve_vertical_neon_i16_precision; } else { _dispatcher = convolve_vertical_neon_i32_precision; diff --git a/src/rgba_u8.rs b/src/rgba_u8.rs index ac570d2..597ffe4 100644 --- a/src/rgba_u8.rs +++ b/src/rgba_u8.rs @@ -65,7 +65,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 4> { handle_fixed_row_u8::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - if _scale_factor < 8. && std::arch::is_aarch64_feature_detected!("rdm") { + if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4_u8_i16); _dispatcher_1_row = convolve_horizontal_rgba_neon_row_i16; } else { @@ -110,7 +110,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 4> { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { // For more downscaling better to use more precise version - if _scale_factor < 8. && std::arch::is_aarch64_feature_detected!("rdm") { + if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { _dispatcher = convolve_vertical_neon_i16_precision; } else { _dispatcher = convolve_vertical_neon_i32_precision; From 3a7191ab57890d9cf5b9377023921f6be9f3d78c Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Thu, 14 Nov 2024 22:30:16 +0000 Subject: [PATCH 02/19] Deny unused --- Cargo.lock | 2 +- src/alpha_handle_f16.rs | 4 +- src/alpha_handle_f32.rs | 4 +- src/alpha_handle_u16.rs | 12 +-- src/alpha_handle_u8.rs | 6 +- src/avx2/alpha_f16.rs | 4 +- src/avx2/alpha_f32.rs | 6 +- src/avx2/alpha_u16.rs | 10 +-- src/avx2/alpha_u8.rs | 4 +- src/avx2/mod.rs | 26 +++---- src/avx2/rgba_f16.rs | 4 +- src/avx2/rgba_f32.rs | 4 +- src/avx2/utils.rs | 51 ++++++++----- src/avx2/vertical_f16.rs | 4 +- src/avx2/vertical_f32.rs | 2 +- src/avx2/vertical_u8.rs | 2 +- src/avx2/vertical_u8_lp.rs | 2 +- src/color_group.rs | 153 +++++-------------------------------- src/convolution.rs | 4 +- src/filter_weights.rs | 10 +-- src/handler_provider.rs | 8 +- src/image_store.rs | 4 +- src/lib.rs | 1 + src/mixed_storage.rs | 2 +- src/mlaf.rs | 4 +- src/nearest_sampler.rs | 2 +- src/saturate_narrow.rs | 2 +- src/sse/alpha_f16.rs | 4 +- src/sse/alpha_f32.rs | 4 +- src/sse/alpha_u16.rs | 10 +-- src/sse/alpha_u8.rs | 14 ++-- src/sse/f16_utils.rs | 20 +++-- src/sse/mod.rs | 58 +++++++------- src/sse/plane_f32.rs | 4 +- src/sse/plane_u8.rs | 4 +- src/sse/rgb_f16.rs | 4 +- src/sse/rgb_f32.rs | 4 +- src/sse/rgb_u8.rs | 4 +- src/sse/rgba_f16.rs | 4 +- src/sse/rgba_f32.rs | 4 +- src/sse/rgba_u16.rs | 4 +- src/sse/rgba_u16_lb.rs | 4 +- src/sse/rgba_u8.rs | 4 +- src/sse/rgba_u8_lb.rs | 4 +- src/sse/u8_utils.rs | 2 +- src/sse/utils.rs | 24 +++--- src/sse/vertical_f16.rs | 6 +- src/sse/vertical_f32.rs | 2 +- src/sse/vertical_u16.rs | 2 +- src/sse/vertical_u16_lb.rs | 2 +- src/sse/vertical_u8.rs | 2 +- src/sse/vertical_u8_lp.rs | 2 +- src/support.rs | 4 +- src/unsafe_slice.rs | 12 +-- 54 files 
changed, 228 insertions(+), 321 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 50c2826..e0f7920 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -808,7 +808,7 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pic-scale" -version = "0.3.5" +version = "0.3.6" dependencies = [ "colorutils-rs", "half", diff --git a/src/alpha_handle_f16.rs b/src/alpha_handle_f16.rs index 15e3f4c..74ac366 100644 --- a/src/alpha_handle_f16.rs +++ b/src/alpha_handle_f16.rs @@ -123,7 +123,7 @@ fn unpremultiply_alpha_rgba_impl_f16( } } -pub fn premultiply_alpha_rgba_f16( +pub(crate) fn premultiply_alpha_rgba_f16( dst: &mut [half::f16], src: &[half::f16], width: usize, @@ -154,7 +154,7 @@ pub fn premultiply_alpha_rgba_f16( _dispatcher(dst, src, width, height, pool); } -pub fn unpremultiply_alpha_rgba_f16( +pub(crate) fn unpremultiply_alpha_rgba_f16( in_place: &mut [half::f16], width: usize, height: usize, diff --git a/src/alpha_handle_f32.rs b/src/alpha_handle_f32.rs index 132c53a..5a46f5e 100644 --- a/src/alpha_handle_f32.rs +++ b/src/alpha_handle_f32.rs @@ -119,7 +119,7 @@ fn unpremultiply_alpha_rgba_impl_f32( } } -pub fn premultiply_alpha_rgba_f32( +pub(crate) fn premultiply_alpha_rgba_f32( dst: &mut [f32], src: &[f32], width: usize, @@ -147,7 +147,7 @@ pub fn premultiply_alpha_rgba_f32( _dispatcher(dst, src, width, height, pool); } -pub fn unpremultiply_alpha_rgba_f32( +pub(crate) fn unpremultiply_alpha_rgba_f32( in_place: &mut [f32], width: usize, height: usize, diff --git a/src/alpha_handle_u16.rs b/src/alpha_handle_u16.rs index a7f89d4..a9ecce7 100644 --- a/src/alpha_handle_u16.rs +++ b/src/alpha_handle_u16.rs @@ -39,21 +39,21 @@ use rayon::slice::ParallelSliceMut; use rayon::ThreadPool; #[inline] -pub fn div_by_1023(v: u32) -> u16 { +pub(crate) fn div_by_1023(v: u32) -> u16 { let round = 1 << 9; let v = v + round; (((v >> 10) + v) >> 10) as u16 } #[inline] -pub fn div_by_4095(v: u32) -> u16 { +pub(crate) fn div_by_4095(v: u32) -> u16 { let round = 1 << 11; let v = v + round; (((v >> 12) + v) >> 12) as u16 } #[inline] -pub fn div_by_65535(v: u32) -> u16 { +pub(crate) fn div_by_65535(v: u32) -> u16 { let round = 1 << 15; let v_expand = v; let v = v_expand + round; @@ -101,7 +101,7 @@ pub(crate) fn premultiply_alpha_rgba_row(dst: &mut [u16], src: &[u16], max_color } } -pub fn unpremultiply_alpha_rgba_row(in_place: &mut [u16], max_colors: u32) { +pub(crate) fn unpremultiply_alpha_rgba_row(in_place: &mut [u16], max_colors: u32) { for dst in in_place.chunks_exact_mut(4) { let a = dst[3] as u32; if a != 0 { @@ -161,7 +161,7 @@ fn unpremultiply_alpha_rgba_impl( } } -pub fn premultiply_alpha_rgba_u16( +pub(crate) fn premultiply_alpha_rgba_u16( dst: &mut [u16], src: &[u16], width: usize, @@ -191,7 +191,7 @@ pub fn premultiply_alpha_rgba_u16( _dispatcher(dst, src, width, height, bit_depth, pool); } -pub fn unpremultiply_alpha_rgba_u16( +pub(crate) fn unpremultiply_alpha_rgba_u16( in_place: &mut [u16], width: usize, height: usize, diff --git a/src/alpha_handle_u8.rs b/src/alpha_handle_u8.rs index 162754a..a357e51 100644 --- a/src/alpha_handle_u8.rs +++ b/src/alpha_handle_u8.rs @@ -40,7 +40,7 @@ use rayon::slice::ParallelSliceMut; use rayon::ThreadPool; #[inline] -pub fn div_by_255(v: u16) -> u8 { +pub(crate) fn div_by_255(v: u16) -> u8 { ((((v + 0x80) >> 8) + v + 0x80) >> 8).min(255) as u8 } @@ -112,7 +112,7 @@ fn unpremultiply_alpha_rgba_impl( } } -pub fn premultiply_alpha_rgba( +pub(crate) fn premultiply_alpha_rgba( dst: &mut [u8], src: &[u8], width: usize, @@ 
-144,7 +144,7 @@ pub fn premultiply_alpha_rgba( _dispatcher(dst, src, width, height, pool); } -pub fn unpremultiply_alpha_rgba( +pub(crate) fn unpremultiply_alpha_rgba( in_place: &mut [u8], width: usize, height: usize, diff --git a/src/avx2/alpha_f16.rs b/src/avx2/alpha_f16.rs index 6d21ef8..ac2bf6e 100644 --- a/src/avx2/alpha_f16.rs +++ b/src/avx2/alpha_f16.rs @@ -37,7 +37,7 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -pub fn avx_premultiply_alpha_rgba_f16( +pub(crate) fn avx_premultiply_alpha_rgba_f16( dst: &mut [half::f16], src: &[half::f16], width: usize, @@ -138,7 +138,7 @@ unsafe fn avx_premultiply_alpha_rgba_f16_impl( } } -pub fn avx_unpremultiply_alpha_rgba_f16( +pub(crate) fn avx_unpremultiply_alpha_rgba_f16( in_place: &mut [half::f16], width: usize, height: usize, diff --git a/src/avx2/alpha_f32.rs b/src/avx2/alpha_f32.rs index b7c97b0..d24ca33 100644 --- a/src/avx2/alpha_f32.rs +++ b/src/avx2/alpha_f32.rs @@ -40,13 +40,13 @@ use rayon::slice::ParallelSliceMut; use rayon::ThreadPool; #[inline(always)] -pub unsafe fn avx_unpremultiply_row_f32(x: __m256, a: __m256) -> __m256 { +pub(crate) unsafe fn avx_unpremultiply_row_f32(x: __m256, a: __m256) -> __m256 { let is_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(a, _mm256_setzero_ps()); let rs = _mm256_div_ps(x, a); _mm256_blendv_ps(rs, _mm256_setzero_ps(), is_zero_mask) } -pub fn avx_unpremultiply_alpha_rgba_f32( +pub(crate) fn avx_unpremultiply_alpha_rgba_f32( in_place: &mut [f32], width: usize, height: usize, @@ -111,7 +111,7 @@ unsafe fn avx_unpremultiply_alpha_rgba_f32_impl( } } -pub fn avx_premultiply_alpha_rgba_f32( +pub(crate) fn avx_premultiply_alpha_rgba_f32( dst: &mut [f32], src: &[f32], width: usize, diff --git a/src/avx2/alpha_u16.rs b/src/avx2/alpha_u16.rs index 35f1a24..f190f68 100644 --- a/src/avx2/alpha_u16.rs +++ b/src/avx2/alpha_u16.rs @@ -51,7 +51,7 @@ unsafe fn _mm256_scale_by_alpha(px: __m256i, low_low_a: __m256, low_high_a: __m2 } #[inline(always)] -pub unsafe fn _mm256_div_by_1023_epi32(v: __m256i) -> __m256i { +pub(crate) unsafe fn _mm256_div_by_1023_epi32(v: __m256i) -> __m256i { const DIVIDING_BY: i32 = 10; let addition = _mm256_set1_epi32(1 << (DIVIDING_BY - 1)); let v = _mm256_add_epi32(v, addition); @@ -59,7 +59,7 @@ pub unsafe fn _mm256_div_by_1023_epi32(v: __m256i) -> __m256i { } #[inline(always)] -pub unsafe fn _mm256_div_by_4095_epi32(v: __m256i) -> __m256i { +pub(crate) unsafe fn _mm256_div_by_4095_epi32(v: __m256i) -> __m256i { const DIVIDING_BY: i32 = 12; let addition = _mm256_set1_epi32(1 << (DIVIDING_BY - 1)); let v = _mm256_add_epi32(v, addition); @@ -67,14 +67,14 @@ pub unsafe fn _mm256_div_by_4095_epi32(v: __m256i) -> __m256i { } #[inline(always)] -pub unsafe fn _mm256_div_by_65535_epi32(v: __m256i) -> __m256i { +pub(crate) unsafe fn _mm256_div_by_65535_epi32(v: __m256i) -> __m256i { const DIVIDING_BY: i32 = 16; let addition = _mm256_set1_epi32(1 << (DIVIDING_BY - 1)); let v = _mm256_add_epi32(v, addition); _mm256_srli_epi32::(_mm256_add_epi32(v, _mm256_srli_epi32::(v))) } -pub fn avx_premultiply_alpha_rgba_u16( +pub(crate) fn avx_premultiply_alpha_rgba_u16( dst: &mut [u16], src: &[u16], width: usize, @@ -339,7 +339,7 @@ unsafe fn avx_premultiply_alpha_rgba_u16_impl( } } -pub fn avx_unpremultiply_alpha_rgba_u16( +pub(crate) fn avx_unpremultiply_alpha_rgba_u16( in_place: &mut [u16], width: usize, height: usize, diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs index 934f7b8..291c4b4 100644 --- a/src/avx2/alpha_u8.rs +++ b/src/avx2/alpha_u8.rs @@ -99,7 
+99,7 @@ unsafe fn avx2_unpremultiply_row(x: __m256i, a: __m256i) -> __m256i { ) } -pub fn avx_premultiply_alpha_rgba( +pub(crate) fn avx_premultiply_alpha_rgba( dst: &mut [u8], src: &[u8], width: usize, @@ -239,7 +239,7 @@ unsafe fn avx_premultiply_alpha_rgba_impl( } } -pub fn avx_unpremultiply_alpha_rgba( +pub(crate) fn avx_unpremultiply_alpha_rgba( in_place: &mut [u8], width: usize, height: usize, diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs index 9affca8..8a1806a 100644 --- a/src/avx2/mod.rs +++ b/src/avx2/mod.rs @@ -35,7 +35,7 @@ mod alpha_u8; #[cfg(feature = "half")] mod rgba_f16; mod rgba_f32; -pub mod utils; +pub(crate) mod utils; #[cfg(feature = "half")] mod vertical_f16; mod vertical_f32; @@ -43,21 +43,21 @@ mod vertical_u8; mod vertical_u8_lp; #[cfg(feature = "half")] -pub use alpha_f16::{avx_premultiply_alpha_rgba_f16, avx_unpremultiply_alpha_rgba_f16}; -pub use alpha_f32::avx_premultiply_alpha_rgba_f32; -pub use alpha_f32::avx_unpremultiply_alpha_rgba_f32; -pub use alpha_u16::{avx_premultiply_alpha_rgba_u16, avx_unpremultiply_alpha_rgba_u16}; -pub use alpha_u8::avx_premultiply_alpha_rgba; -pub use alpha_u8::avx_unpremultiply_alpha_rgba; +pub(crate) use alpha_f16::{avx_premultiply_alpha_rgba_f16, avx_unpremultiply_alpha_rgba_f16}; +pub(crate) use alpha_f32::avx_premultiply_alpha_rgba_f32; +pub(crate) use alpha_f32::avx_unpremultiply_alpha_rgba_f32; +pub(crate) use alpha_u16::{avx_premultiply_alpha_rgba_u16, avx_unpremultiply_alpha_rgba_u16}; +pub(crate) use alpha_u8::avx_premultiply_alpha_rgba; +pub(crate) use alpha_u8::avx_unpremultiply_alpha_rgba; #[cfg(feature = "half")] -pub use rgba_f16::{ +pub(crate) use rgba_f16::{ convolve_horizontal_rgba_avx_row_one_f16, convolve_horizontal_rgba_avx_rows_4_f16, }; -pub use rgba_f32::{ +pub(crate) use rgba_f32::{ convolve_horizontal_rgba_avx_row_one_f32, convolve_horizontal_rgba_avx_rows_4_f32, }; #[cfg(feature = "half")] -pub use vertical_f16::convolve_vertical_avx_row_f16; -pub use vertical_f32::convolve_vertical_avx_row_f32; -pub use vertical_u8::convolve_vertical_avx_row; -pub use vertical_u8_lp::convolve_vertical_avx_row_lp; +pub(crate) use vertical_f16::convolve_vertical_avx_row_f16; +pub(crate) use vertical_f32::convolve_vertical_avx_row_f32; +pub(crate) use vertical_u8::convolve_vertical_avx_row; +pub(crate) use vertical_u8_lp::convolve_vertical_avx_row_lp; diff --git a/src/avx2/rgba_f16.rs b/src/avx2/rgba_f16.rs index f54af44..e9d645d 100644 --- a/src/avx2/rgba_f16.rs +++ b/src/avx2/rgba_f16.rs @@ -116,7 +116,7 @@ unsafe fn convolve_horizontal_parts_2_rgba_f16( acc } -pub fn convolve_horizontal_rgba_avx_row_one_f16( +pub(crate) fn convolve_horizontal_rgba_avx_row_one_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, @@ -271,7 +271,7 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_impl( } } -pub fn convolve_horizontal_rgba_avx_rows_4_f16( +pub(crate) fn convolve_horizontal_rgba_avx_rows_4_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, diff --git a/src/avx2/rgba_f32.rs b/src/avx2/rgba_f32.rs index f2a2726..f82923a 100644 --- a/src/avx2/rgba_f32.rs +++ b/src/avx2/rgba_f32.rs @@ -112,7 +112,7 @@ unsafe fn convolve_horizontal_parts_2_rgba_f32( _mm256_fma_ps::(store_0, rgb_pixel, weight0) } -pub fn convolve_horizontal_rgba_avx_rows_4_f32( +pub(crate) fn convolve_horizontal_rgba_avx_rows_4_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, @@ -396,7 +396,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl( } } -pub fn 
convolve_horizontal_rgba_avx_row_one_f32( +pub(crate) fn convolve_horizontal_rgba_avx_row_one_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, diff --git a/src/avx2/utils.rs b/src/avx2/utils.rs index c9cdf93..cd11c57 100644 --- a/src/avx2/utils.rs +++ b/src/avx2/utils.rs @@ -33,7 +33,7 @@ use std::arch::x86::*; use std::arch::x86_64::*; #[inline] -pub unsafe fn _mm256_fma_ps(a: __m256, b: __m256, c: __m256) -> __m256 { +pub(crate) unsafe fn _mm256_fma_ps(a: __m256, b: __m256, c: __m256) -> __m256 { if FMA { _mm256_fma_psx(a, b, c) } else { @@ -47,12 +47,12 @@ unsafe fn _mm256_fma_psx(a: __m256, b: __m256, c: __m256) -> __m256 { } #[inline(always)] -pub const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 { +pub(crate) const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 { ((z << 6) | (y << 4) | (x << 2) | w) as i32 } #[inline(always)] -pub unsafe fn _mm256_select_si256( +pub(crate) unsafe fn _mm256_select_si256( mask: __m256i, true_vals: __m256i, false_vals: __m256i, @@ -64,12 +64,16 @@ pub unsafe fn _mm256_select_si256( } #[inline(always)] -pub unsafe fn _mm256_selecti_ps(mask: __m256i, true_vals: __m256, false_vals: __m256) -> __m256 { +pub(crate) unsafe fn _mm256_selecti_ps( + mask: __m256i, + true_vals: __m256, + false_vals: __m256, +) -> __m256 { _mm256_blendv_ps(false_vals, true_vals, _mm256_castsi256_ps(mask)) } #[inline(always)] -pub unsafe fn avx2_div_by255(v: __m256i) -> __m256i { +pub(crate) unsafe fn avx2_div_by255(v: __m256i) -> __m256i { let addition = _mm256_set1_epi16(127); _mm256_srli_epi16::<8>(_mm256_add_epi16( _mm256_add_epi16(v, addition), @@ -78,7 +82,7 @@ pub unsafe fn avx2_div_by255(v: __m256i) -> __m256i { } #[inline(always)] -pub unsafe fn avx2_deinterleave_rgba( +pub(crate) unsafe fn avx2_deinterleave_rgba( rgba0: __m256i, rgba1: __m256i, rgba2: __m256i, @@ -118,7 +122,7 @@ pub unsafe fn avx2_deinterleave_rgba( } #[inline(always)] -pub unsafe fn avx_deinterleave_rgba_epi32( +pub(crate) unsafe fn avx_deinterleave_rgba_epi32( p0: __m256i, p1: __m256i, p2: __m256i, @@ -142,7 +146,7 @@ pub unsafe fn avx_deinterleave_rgba_epi32( } #[inline(always)] -pub unsafe fn avx_interleave_rgba_epi32( +pub(crate) unsafe fn avx_interleave_rgba_epi32( p0: __m256i, p1: __m256i, p2: __m256i, @@ -167,7 +171,7 @@ pub unsafe fn avx_interleave_rgba_epi32( } #[inline(always)] -pub unsafe fn avx_interleave_rgba_epi16( +pub(crate) unsafe fn avx_interleave_rgba_epi16( a: __m256i, b: __m256i, c: __m256i, @@ -191,7 +195,7 @@ pub unsafe fn avx_interleave_rgba_epi16( } #[inline(always)] -pub unsafe fn avx_deinterleave_rgba_epi16( +pub(crate) unsafe fn avx_deinterleave_rgba_epi16( a: __m256i, b: __m256i, c: __m256i, @@ -224,7 +228,7 @@ pub unsafe fn avx_deinterleave_rgba_epi16( } #[inline(always)] -pub unsafe fn avx_deinterleave_rgba_ps( +pub(crate) unsafe fn avx_deinterleave_rgba_ps( p0: __m256, p1: __m256, p2: __m256, @@ -245,7 +249,7 @@ pub unsafe fn avx_deinterleave_rgba_ps( } #[inline(always)] -pub unsafe fn avx_interleave_rgba_ps( +pub(crate) unsafe fn avx_interleave_rgba_ps( p0: __m256, p1: __m256, p2: __m256, @@ -266,7 +270,7 @@ pub unsafe fn avx_interleave_rgba_ps( } #[inline(always)] -pub unsafe fn avx2_interleave_rgba( +pub(crate) unsafe fn avx2_interleave_rgba( r: __m256i, g: __m256i, b: __m256i, @@ -290,7 +294,7 @@ pub unsafe fn avx2_interleave_rgba( } #[inline(always)] -pub unsafe fn avx2_pack_u16(s_1: __m256i, s_2: __m256i) -> __m256i { +pub(crate) unsafe fn avx2_pack_u16(s_1: __m256i, s_2: __m256i) -> __m256i { let packed = _mm256_packus_epi16(s_1, 
s_2); const MASK: i32 = shuffle(3, 1, 2, 0); _mm256_permute4x64_epi64::(packed) @@ -298,7 +302,12 @@ pub unsafe fn avx2_pack_u16(s_1: __m256i, s_2: __m256i) -> __m256i { #[inline] #[target_feature(enable = "avx2")] -pub unsafe fn _mm256_packus_four_epi32(a: __m256i, b: __m256i, c: __m256i, d: __m256i) -> __m256i { +pub(crate) unsafe fn _mm256_packus_four_epi32( + a: __m256i, + b: __m256i, + c: __m256i, + d: __m256i, +) -> __m256i { let ab = _mm256_packs_epi32(a, b); let cd = _mm256_packs_epi32(c, d); @@ -309,7 +318,7 @@ pub unsafe fn _mm256_packus_four_epi32(a: __m256i, b: __m256i, c: __m256i, d: __ } #[inline(always)] -pub unsafe fn avx2_pack_u32(s_1: __m256i, s_2: __m256i) -> __m256i { +pub(crate) unsafe fn avx2_pack_u32(s_1: __m256i, s_2: __m256i) -> __m256i { let packed = _mm256_packus_epi32(s_1, s_2); const MASK: i32 = shuffle(3, 1, 2, 0); _mm256_permute4x64_epi64::(packed) @@ -317,13 +326,13 @@ pub unsafe fn avx2_pack_u32(s_1: __m256i, s_2: __m256i) -> __m256i { #[inline(always)] #[allow(dead_code)] -pub unsafe fn avx_combine_ps(lo: __m128, hi: __m128) -> __m256 { +pub(crate) unsafe fn avx_combine_ps(lo: __m128, hi: __m128) -> __m256 { _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(lo), hi) } #[inline(always)] #[allow(dead_code)] -pub unsafe fn avx_combine_epi(lo: __m128i, hi: __m128i) -> __m256i { +pub(crate) unsafe fn avx_combine_epi(lo: __m128i, hi: __m128i) -> __m256i { _mm256_castps_si256(_mm256_insertf128_ps::<1>( _mm256_castps128_ps256(_mm_castsi128_ps(lo)), _mm_castsi128_ps(hi), @@ -332,7 +341,7 @@ pub unsafe fn avx_combine_epi(lo: __m128i, hi: __m128i) -> __m256i { #[inline] /// Arithmetic shift for i64, shifting with sign bits -pub unsafe fn _mm256_srai_epi64x(a: __m256i) -> __m256i { +pub(crate) unsafe fn _mm256_srai_epi64x(a: __m256i) -> __m256i { let m = _mm256_set1_epi64x(1 << (64 - 1)); let x = _mm256_srli_epi64::(a); _mm256_sub_epi64(_mm256_xor_si256(x, m), m) @@ -340,7 +349,7 @@ pub unsafe fn _mm256_srai_epi64x(a: __m256i) -> __m256i { #[inline] /// Pack 64bytes integers into 32 bytes using truncation -pub unsafe fn _mm256_packts_epi64(a: __m256i, b: __m256i) -> __m256i { +pub(crate) unsafe fn _mm256_packts_epi64(a: __m256i, b: __m256i) -> __m256i { const SHUFFLE_1: i32 = shuffle(2, 0, 2, 0); let combined = _mm256_shuffle_ps::(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)); const SHUFFLE_2: i32 = shuffle(3, 1, 2, 0); @@ -351,7 +360,7 @@ pub unsafe fn _mm256_packts_epi64(a: __m256i, b: __m256i) -> __m256i { #[inline] #[allow(dead_code)] /// Pack 64bytes integers into 32 bytes -pub unsafe fn _mm256_cvtepi64_epi32x(v: __m256i) -> __m128i { +pub(crate) unsafe fn _mm256_cvtepi64_epi32x(v: __m256i) -> __m128i { let vf = _mm256_castsi256_ps(v); let hi = _mm256_extractf128_ps::<1>(vf); let lo = _mm256_castps256_ps128(vf); diff --git a/src/avx2/vertical_f16.rs b/src/avx2/vertical_f16.rs index fc2b2e4..63b2871 100644 --- a/src/avx2/vertical_f16.rs +++ b/src/avx2/vertical_f16.rs @@ -201,7 +201,7 @@ unsafe fn convolve_vertical_part_avx_16_f16( _mm256_storeu_si256(dst_ptr as *mut __m256i, acc0); } -pub fn convolve_vertical_avx_row_f16( +pub(crate) fn convolve_vertical_avx_row_f16( width: usize, bounds: &FilterBounds, unsafe_source_ptr_0: *const half::f16, @@ -273,7 +273,7 @@ unsafe fn convolve_vertical_avx_row_f16_fma( } #[inline(always)] -pub fn convolve_vertical_avx_row_f16_impl( +pub(crate) fn convolve_vertical_avx_row_f16_impl( width: usize, bounds: &FilterBounds, unsafe_source_ptr_0: *const half::f16, diff --git a/src/avx2/vertical_f32.rs b/src/avx2/vertical_f32.rs index 
b480b36..ddd673a 100644 --- a/src/avx2/vertical_f32.rs +++ b/src/avx2/vertical_f32.rs @@ -171,7 +171,7 @@ pub(crate) unsafe fn convolve_vertical_part_avx_f32( } #[inline] -pub fn convolve_vertical_avx_row_f32( +pub(crate) fn convolve_vertical_avx_row_f32( width: usize, bounds: &FilterBounds, unsafe_source_ptr_0: *const f32, diff --git a/src/avx2/vertical_u8.rs b/src/avx2/vertical_u8.rs index 03feaad..b8e3ee8 100644 --- a/src/avx2/vertical_u8.rs +++ b/src/avx2/vertical_u8.rs @@ -520,7 +520,7 @@ unsafe fn convolve_vertical_part_avx( *dst_ptr = _mm256_extract_epi8::<0>(item) as u8; } -pub fn convolve_vertical_avx_row( +pub(crate) fn convolve_vertical_avx_row( dst_width: usize, bounds: &FilterBounds, src: &[u8], diff --git a/src/avx2/vertical_u8_lp.rs b/src/avx2/vertical_u8_lp.rs index 7ad792d..51b7134 100644 --- a/src/avx2/vertical_u8_lp.rs +++ b/src/avx2/vertical_u8_lp.rs @@ -33,7 +33,7 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -pub fn convolve_vertical_avx_row_lp( +pub(crate) fn convolve_vertical_avx_row_lp( dst_width: usize, bounds: &FilterBounds, src: &[u8], diff --git a/src/color_group.rs b/src/color_group.rs index abb7cd8..bcb60e1 100644 --- a/src/color_group.rs +++ b/src/color_group.rs @@ -26,10 +26,9 @@ * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#![allow(dead_code)] use crate::mlaf::mlaf; use crate::saturate_narrow::SaturateNarrow; -use num_traits::{AsPrimitive, FromPrimitive, MulAdd, Num}; +use num_traits::{AsPrimitive, FromPrimitive, MulAdd}; use std::ops::{Add, AddAssign, Mul, Shr, ShrAssign, Sub, SubAssign}; #[repr(C)] @@ -46,7 +45,7 @@ where J: Copy + Default, { #[inline(always)] - pub fn new() -> ColorGroup { + pub(crate) fn new() -> ColorGroup { ColorGroup { r: J::default(), g: J::default(), @@ -56,12 +55,12 @@ where } #[inline(always)] - pub fn from_components(r: J, g: J, b: J, a: J) -> ColorGroup { + pub(crate) fn from_components(r: J, g: J, b: J, a: J) -> ColorGroup { ColorGroup { r, g, b, a } } #[inline(always)] - pub fn dup(v: J) -> ColorGroup { + pub(crate) fn dup(v: J) -> ColorGroup { ColorGroup { r: v, g: v, @@ -76,47 +75,7 @@ where J: Copy + Default + 'static, { #[inline(always)] - pub fn from_slice(store: &[T], offset: usize) -> ColorGroup - where - T: AsPrimitive, - { - unsafe { - if COMPS == 1 { - ColorGroup { - r: (*store.get_unchecked(offset)).as_(), - g: J::default(), - b: J::default(), - a: J::default(), - } - } else if COMPS == 2 { - ColorGroup { - r: (*store.get_unchecked(offset)).as_(), - g: (*store.get_unchecked(offset + 1)).as_(), - b: J::default(), - a: J::default(), - } - } else if COMPS == 3 { - ColorGroup { - r: (*store.get_unchecked(offset)).as_(), - g: (*store.get_unchecked(offset + 1)).as_(), - b: (*store.get_unchecked(offset + 2)).as_(), - a: J::default(), - } - } else if COMPS == 4 { - ColorGroup { - r: (*store.get_unchecked(offset)).as_(), - g: (*store.get_unchecked(offset + 1)).as_(), - b: (*store.get_unchecked(offset + 2)).as_(), - a: (*store.get_unchecked(offset + 3)).as_(), - } - } else { - panic!("Not implemented.") - } - } - } - - #[inline(always)] - pub fn from_ptr(store: *const T, offset: usize) -> ColorGroup + pub(crate) fn from_ptr(store: *const T, offset: usize) -> ColorGroup where T: AsPrimitive, { @@ -151,30 +110,13 @@ where a: l_ptr.add(3).read_unaligned().as_(), } } else { - panic!("Not implemented.") + unimplemented!("Not implemented.") } } } #[inline(always)] - pub fn 
to_ptr(self, ptr: *mut J, offset: usize) { - unsafe { - let s_ptr = ptr.add(offset); - s_ptr.write_unaligned(self.r); - if COMPS > 1 { - s_ptr.add(1).write_unaligned(self.g); - } - if COMPS > 2 { - s_ptr.add(2).write_unaligned(self.b); - } - if COMPS == 4 { - s_ptr.add(3).write_unaligned(self.a); - } - } - } - - #[inline(always)] - pub fn as_ptr(self, ptr: *mut V, offset: usize) + pub(crate) fn as_ptr(self, ptr: *mut V, offset: usize) where J: Copy + AsPrimitive, { @@ -194,67 +136,6 @@ where } } -impl ColorGroup -where - J: Copy + Default + 'static + Num + Ord, -{ - #[inline(always)] - pub fn min_scalar(&self, other: J) -> ColorGroup { - if COMPS == 1 { - ColorGroup::from_components(self.r.min(other), J::default(), J::default(), J::default()) - } else if COMPS == 2 { - ColorGroup::from_components( - self.r.min(other), - self.g.min(other), - J::default(), - J::default(), - ) - } else if COMPS == 3 { - ColorGroup::from_components( - self.r.min(other), - self.g.min(other), - self.b.min(other), - J::default(), - ) - } else { - ColorGroup::from_components( - self.r.min(other), - self.g.min(other), - self.b.min(other), - self.a.min(other), - ) - } - } - - #[inline(always)] - pub(crate) fn max_scalar(&self, other: J) -> ColorGroup { - if COMPS == 1 { - ColorGroup::from_components(self.r.max(other), J::default(), J::default(), J::default()) - } else if COMPS == 2 { - ColorGroup::from_components( - self.r.max(other), - self.g.max(other), - J::default(), - J::default(), - ) - } else if COMPS == 3 { - ColorGroup::from_components( - self.r.max(other), - self.g.max(other), - self.b.max(other), - J::default(), - ) - } else { - ColorGroup::from_components( - self.r.max(other), - self.g.max(other), - self.b.max(other), - self.a.max(other), - ) - } - } -} - impl Mul for ColorGroup where J: Copy + Mul + Default + 'static, @@ -272,7 +153,7 @@ where } else if COMPS == 4 { ColorGroup::from_components(self.r * rhs, self.g * rhs, self.b * rhs, self.a * rhs) } else { - panic!("Not implemented."); + unimplemented!("Not implemented."); } } } @@ -282,7 +163,7 @@ where J: Copy + Default + 'static, { #[inline(always)] - pub fn saturate_narrow(&self, bit_depth: u32) -> ColorGroup + pub(crate) fn saturate_narrow(&self, bit_depth: u32) -> ColorGroup where V: Copy + Default, J: SaturateNarrow, @@ -341,7 +222,7 @@ where self.a * rhs.b, ) } else { - panic!("Not implemented."); + unimplemented!("Not implemented."); } } } @@ -363,7 +244,7 @@ where } else if COMPS == 4 { ColorGroup::from_components(self.r - rhs, self.g - rhs, self.b - rhs, self.a - rhs) } else { - panic!("Not implemented."); + unimplemented!("Not implemented."); } } } @@ -390,7 +271,7 @@ where self.a - rhs.a, ) } else { - panic!("Not implemented."); + unimplemented!("Not implemented."); } } } @@ -417,7 +298,7 @@ where self.a + rhs.a, ) } else { - panic!("Not implemented."); + unimplemented!("Not implemented."); } } } @@ -439,7 +320,7 @@ where } else if COMPS == 4 { ColorGroup::from_components(self.r + rhs, self.g + rhs, self.b + rhs, self.a + rhs) } else { - panic!("Not implemented."); + unimplemented!("Not implemented."); } } } @@ -461,7 +342,7 @@ where } else if COMPS == 4 { ColorGroup::from_components(self.r >> rhs, self.g >> rhs, self.b >> rhs, self.a >> rhs) } else { - panic!("Not implemented."); + unimplemented!("Not implemented."); } } } @@ -517,7 +398,7 @@ where mlaf(self.a, a.a, b), ) } else { - panic!("Not implemented."); + unimplemented!("Not implemented."); } } } @@ -611,7 +492,7 @@ macro_rules! 
fast_load_color_group { a: $store.get_unchecked(3).as_(), } } else { - panic!("Not implemented.") + unimplemented!("Not implemented.") } }}; } diff --git a/src/convolution.rs b/src/convolution.rs index c71c2a0..944b4ae 100644 --- a/src/convolution.rs +++ b/src/convolution.rs @@ -34,7 +34,7 @@ use std::fmt::Debug; use crate::filter_weights::FilterWeights; use crate::ImageStore; -pub trait HorizontalConvolutionPass +pub(crate) trait HorizontalConvolutionPass where T: FromPrimitive + Clone + Copy + Debug, { @@ -46,7 +46,7 @@ where ); } -pub trait VerticalConvolutionPass +pub(crate) trait VerticalConvolutionPass where T: FromPrimitive + Clone + Copy + Debug, { diff --git a/src/filter_weights.rs b/src/filter_weights.rs index b3491ae..6e2823d 100644 --- a/src/filter_weights.rs +++ b/src/filter_weights.rs @@ -28,7 +28,7 @@ */ #[derive(Debug, Clone)] -pub struct FilterWeights { +pub(crate) struct FilterWeights { pub weights: Vec, pub bounds: Vec, pub kernel_size: usize, @@ -38,19 +38,19 @@ pub struct FilterWeights { } #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] -pub struct FilterBounds { +pub(crate) struct FilterBounds { pub start: usize, pub size: usize, } impl FilterBounds { - pub fn new(start: usize, size: usize) -> FilterBounds { + pub(crate) fn new(start: usize, size: usize) -> FilterBounds { FilterBounds { start, size } } } impl FilterWeights { - pub fn new( + pub(crate) fn new( slice_ref: Vec, kernel_size: usize, aligned_size: usize, @@ -70,7 +70,7 @@ impl FilterWeights { } impl FilterWeights { - pub fn numerical_approximation_i16( + pub(crate) fn numerical_approximation_i16( &self, alignment: usize, ) -> FilterWeights { diff --git a/src/handler_provider.rs b/src/handler_provider.rs index c4f2d14..8ffbfe4 100644 --- a/src/handler_provider.rs +++ b/src/handler_provider.rs @@ -51,7 +51,7 @@ use crate::sse::{ use num_traits::{AsPrimitive, Float, MulAdd}; use std::ops::{Add, AddAssign, Mul}; -pub trait ColumnHandlerFloatingPoint +pub(crate) trait ColumnHandlerFloatingPoint where T: Copy + 'static + AsPrimitive + Default, J: Copy + 'static + AsPrimitive + MulAdd + Default + MixedStorage, @@ -153,7 +153,7 @@ impl ColumnHandlerFloatingPoint for u16 { default_floating_column_handler!(u8); default_floating_column_handler!(f32); -pub trait RowHandlerFloatingPoint +pub(crate) trait RowHandlerFloatingPoint where T: Copy + 'static + AsPrimitive + Default, J: Copy + 'static + AsPrimitive + MulAdd + Default + MixedStorage, @@ -263,7 +263,7 @@ impl RowHandlerFloatingPoint for u16 { } } -pub trait ColumnHandlerFixedPoint { +pub(crate) trait ColumnHandlerFixedPoint { fn handle_fixed_column( dst_width: usize, bounds: &FilterBounds, @@ -285,7 +285,7 @@ pub trait ColumnHandlerFixedPoint { i16: AsPrimitive; } -pub trait RowHandlerFixedPoint { +pub(crate) trait RowHandlerFixedPoint { fn handle_fixed_row_4( src: &[T], src_stride: usize, diff --git a/src/image_store.rs b/src/image_store.rs index a3a68ad..16bd36f 100644 --- a/src/image_store.rs +++ b/src/image_store.rs @@ -69,14 +69,14 @@ pub(crate) enum BufferStore<'a, T: Copy + Debug> { } impl BufferStore<'_, T> { - pub fn borrow(&self) -> &[T] { + pub(crate) fn borrow(&self) -> &[T] { match self { Self::Borrowed(p_ref) => p_ref, Self::Owned(vec) => vec, } } - pub fn borrow_mut(&mut self) -> &mut [T] { + pub(crate) fn borrow_mut(&mut self) -> &mut [T] { match self { Self::Borrowed(p_ref) => p_ref, Self::Owned(vec) => vec, diff --git a/src/lib.rs b/src/lib.rs index 7de7dc5..7e41574 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,6 +27,7 @@ * OF 
THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #![deny(deprecated)] +#![deny(unreachable_code, unused)] #![allow(clippy::too_many_arguments)] mod alpha_check; #[cfg(feature = "half")] diff --git a/src/mixed_storage.rs b/src/mixed_storage.rs index 3591604..594b8c4 100644 --- a/src/mixed_storage.rs +++ b/src/mixed_storage.rs @@ -27,7 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -pub trait MixedStorage { +pub(crate) trait MixedStorage { fn to_mixed(self, bit_depth: u32) -> T; } diff --git a/src/mlaf.rs b/src/mlaf.rs index f72d2bd..1cf3f2b 100644 --- a/src/mlaf.rs +++ b/src/mlaf.rs @@ -37,7 +37,7 @@ use std::ops::{Add, Mul}; all(target_arch = "aarch64", target_feature = "neon") ))] #[inline(always)] -pub fn mlaf + Add + MulAdd>( +pub(crate) fn mlaf + Add + MulAdd>( acc: T, a: T, b: T, @@ -53,7 +53,7 @@ pub fn mlaf + Add + MulAdd + Add + MulAdd>( +pub(crate) fn mlaf + Add + MulAdd>( acc: T, a: T, b: T, diff --git a/src/nearest_sampler.rs b/src/nearest_sampler.rs index b1aa650..945d5bb 100644 --- a/src/nearest_sampler.rs +++ b/src/nearest_sampler.rs @@ -31,7 +31,7 @@ use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::ParallelSliceMut; use rayon::ThreadPool; -pub fn resize_nearest( +pub(crate) fn resize_nearest( src: &[T], src_width: usize, src_height: usize, diff --git a/src/saturate_narrow.rs b/src/saturate_narrow.rs index c856cc1..74949d3 100644 --- a/src/saturate_narrow.rs +++ b/src/saturate_narrow.rs @@ -29,7 +29,7 @@ use crate::support::PRECISION; -pub trait SaturateNarrow { +pub(crate) trait SaturateNarrow { fn saturate_narrow(self, bit_depth: u32) -> J; } diff --git a/src/sse/alpha_f16.rs b/src/sse/alpha_f16.rs index ebe54eb..b5ed21d 100644 --- a/src/sse/alpha_f16.rs +++ b/src/sse/alpha_f16.rs @@ -38,7 +38,7 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -pub fn sse_premultiply_alpha_rgba_f16( +pub(crate) fn sse_premultiply_alpha_rgba_f16( dst: &mut [half::f16], src: &[half::f16], width: usize, @@ -156,7 +156,7 @@ unsafe fn sse_premultiply_alpha_rgba_f16_impl( } } -pub fn sse_unpremultiply_alpha_rgba_f16( +pub(crate) fn sse_unpremultiply_alpha_rgba_f16( in_place: &mut [half::f16], width: usize, height: usize, diff --git a/src/sse/alpha_f32.rs b/src/sse/alpha_f32.rs index 2a75274..def96a0 100644 --- a/src/sse/alpha_f32.rs +++ b/src/sse/alpha_f32.rs @@ -44,7 +44,7 @@ unsafe fn sse_unpremultiply_row_f32(x: __m128, a: __m128) -> __m128 { _mm_blendv_ps(rs, _mm_setzero_ps(), is_zero_mask) } -pub fn sse_unpremultiply_alpha_rgba_f32( +pub(crate) fn sse_unpremultiply_alpha_rgba_f32( in_place: &mut [f32], width: usize, height: usize, @@ -106,7 +106,7 @@ unsafe fn sse_unpremultiply_alpha_rgba_f32_impl( } } -pub fn sse_premultiply_alpha_rgba_f32( +pub(crate) fn sse_premultiply_alpha_rgba_f32( dst: &mut [f32], src: &[f32], width: usize, diff --git a/src/sse/alpha_u16.rs b/src/sse/alpha_u16.rs index b8d0836..9cde8aa 100644 --- a/src/sse/alpha_u16.rs +++ b/src/sse/alpha_u16.rs @@ -65,7 +65,7 @@ unsafe fn sse_unpremultiply_row_u16( } #[inline(always)] -pub unsafe fn _mm_div_by_1023_epi32(v: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_div_by_1023_epi32(v: __m128i) -> __m128i { const DIVIDING_BY: i32 = 10; let addition = _mm_set1_epi32(1 << (DIVIDING_BY - 1)); let v = _mm_add_epi32(v, addition); @@ -73,7 +73,7 @@ pub unsafe fn _mm_div_by_1023_epi32(v: __m128i) -> __m128i { } #[inline(always)] -pub unsafe fn _mm_div_by_4095_epi32(v: __m128i) -> __m128i { +pub(crate) unsafe fn 
_mm_div_by_4095_epi32(v: __m128i) -> __m128i { const DIVIDING_BY: i32 = 12; let addition = _mm_set1_epi32(1 << (DIVIDING_BY - 1)); let v = _mm_add_epi32(v, addition); @@ -81,14 +81,14 @@ pub unsafe fn _mm_div_by_4095_epi32(v: __m128i) -> __m128i { } #[inline(always)] -pub unsafe fn _mm_div_by_65535_epi32(v: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_div_by_65535_epi32(v: __m128i) -> __m128i { const DIVIDING_BY: i32 = 16; let addition = _mm_set1_epi32(1 << (DIVIDING_BY - 1)); let v = _mm_add_epi32(v, addition); _mm_srli_epi32::(_mm_add_epi32(v, _mm_srli_epi32::(v))) } -pub fn unpremultiply_alpha_sse_rgba_u16( +pub(crate) fn unpremultiply_alpha_sse_rgba_u16( in_place: &mut [u16], width: usize, height: usize, @@ -198,7 +198,7 @@ unsafe fn sse_premultiply_row_u16( _mm_packs_epi32(new_lo, new_hi) } -pub fn premultiply_alpha_sse_rgba_u16( +pub(crate) fn premultiply_alpha_sse_rgba_u16( dst: &mut [u16], src: &[u16], width: usize, diff --git a/src/sse/alpha_u8.rs b/src/sse/alpha_u8.rs index 8c3c3f6..f194299 100644 --- a/src/sse/alpha_u8.rs +++ b/src/sse/alpha_u8.rs @@ -38,7 +38,11 @@ use std::arch::x86::*; use std::arch::x86_64::*; #[inline(always)] -pub unsafe fn _mm_select_si128(mask: __m128i, true_vals: __m128i, false_vals: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_select_si128( + mask: __m128i, + true_vals: __m128i, + false_vals: __m128i, +) -> __m128i { _mm_or_si128( _mm_and_si128(mask, true_vals), _mm_andnot_si128(mask, false_vals), @@ -46,7 +50,7 @@ pub unsafe fn _mm_select_si128(mask: __m128i, true_vals: __m128i, false_vals: __ } #[inline(always)] -pub unsafe fn _mm_div_by_255_epi16(v: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_div_by_255_epi16(v: __m128i) -> __m128i { let addition = _mm_set1_epi16(127); _mm_srli_epi16::<8>(_mm_add_epi16( _mm_add_epi16(v, addition), @@ -55,7 +59,7 @@ pub unsafe fn _mm_div_by_255_epi16(v: __m128i) -> __m128i { } #[inline(always)] -pub unsafe fn sse_unpremultiply_row(x: __m128i, a: __m128i) -> __m128i { +pub(crate) unsafe fn sse_unpremultiply_row(x: __m128i, a: __m128i) -> __m128i { let zeros = _mm_setzero_si128(); let lo = _mm_cvtepu8_epi16(x); let hi = _mm_unpackhi_epi8(x, zeros); @@ -90,7 +94,7 @@ pub unsafe fn sse_unpremultiply_row(x: __m128i, a: __m128i) -> __m128i { _mm_select_si128(is_zero_mask, _mm_setzero_si128(), _mm_packus_epi16(lo, hi)) } -pub fn sse_premultiply_alpha_rgba( +pub(crate) fn sse_premultiply_alpha_rgba( dst: &mut [u8], src: &[u8], width: usize, @@ -185,7 +189,7 @@ unsafe fn sse_premultiply_alpha_rgba_impl( } } -pub fn sse_unpremultiply_alpha_rgba( +pub(crate) fn sse_unpremultiply_alpha_rgba( in_place: &mut [u8], width: usize, height: usize, diff --git a/src/sse/f16_utils.rs b/src/sse/f16_utils.rs index 6c1d905..7f7a8e1 100644 --- a/src/sse/f16_utils.rs +++ b/src/sse/f16_utils.rs @@ -34,13 +34,13 @@ use std::arch::x86_64::*; #[inline] #[cfg(target_feature = "avx2")] -pub unsafe fn _mm_srlv_epi32x(c: __m128i, n: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_srlv_epi32x(c: __m128i, n: __m128i) -> __m128i { _mm_srlv_epi32(c, n) } #[inline] #[cfg(not(target_feature = "avx2"))] -pub unsafe fn _mm_srlv_epi32x(c: __m128i, n: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_srlv_epi32x(c: __m128i, n: __m128i) -> __m128i { _mm_setr_epi32( _mm_extract_epi32::<0>(c).wrapping_shr(_mm_extract_epi32::<0>(n) as u32), _mm_extract_epi32::<1>(c).wrapping_shr(_mm_extract_epi32::<1>(n) as u32), @@ -51,13 +51,13 @@ pub unsafe fn _mm_srlv_epi32x(c: __m128i, n: __m128i) -> __m128i { #[inline] #[cfg(target_feature = "avx2")] -pub unsafe fn 
_mm_sllv_epi32x(c: __m128i, n: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_sllv_epi32x(c: __m128i, n: __m128i) -> __m128i { _mm_sllv_epi32(c, n) } #[inline] #[cfg(not(target_feature = "avx2"))] -pub unsafe fn _mm_sllv_epi32x(c: __m128i, n: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_sllv_epi32x(c: __m128i, n: __m128i) -> __m128i { _mm_setr_epi32( _mm_extract_epi32::<0>(c).wrapping_shl(_mm_extract_epi32::<0>(n) as u32), _mm_extract_epi32::<1>(c).wrapping_shl(_mm_extract_epi32::<1>(n) as u32), @@ -67,7 +67,7 @@ pub unsafe fn _mm_sllv_epi32x(c: __m128i, n: __m128i) -> __m128i { } #[inline(always)] -pub unsafe fn _mm_blendv_epi32(xmm0: __m128i, xmm1: __m128i, mask: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_blendv_epi32(xmm0: __m128i, xmm1: __m128i, mask: __m128i) -> __m128i { _mm_castps_si128(_mm_blendv_ps( _mm_castsi128_ps(xmm0), _mm_castsi128_ps(xmm1), @@ -77,7 +77,11 @@ pub unsafe fn _mm_blendv_epi32(xmm0: __m128i, xmm1: __m128i, mask: __m128i) -> _ #[inline(always)] /// If mask then `true_vals` otherwise `false_val` -pub unsafe fn _mm_select_epi32(mask: __m128i, true_vals: __m128i, false_vals: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_select_epi32( + mask: __m128i, + true_vals: __m128i, + false_vals: __m128i, +) -> __m128i { _mm_blendv_epi32(false_vals, true_vals, mask) } @@ -181,7 +185,7 @@ unsafe fn _mm_cvtps_phdx(x: __m128) -> __m128i { } #[inline] -pub unsafe fn _mm_cvtps_phx(x: __m128) -> __m128i { +pub(crate) unsafe fn _mm_cvtps_phx(x: __m128) -> __m128i { if F16C { _mm_cvtps_phdx(x) } else { @@ -196,7 +200,7 @@ unsafe fn _mm_cvtph_psdx(x: __m128i) -> __m128 { } #[inline] -pub unsafe fn _mm_cvtph_psx(x: __m128i) -> __m128 { +pub(crate) unsafe fn _mm_cvtph_psx(x: __m128i) -> __m128 { if F16C { _mm_cvtph_ps(x) } else { diff --git a/src/sse/mod.rs b/src/sse/mod.rs index ab0dcc0..6726e82 100644 --- a/src/sse/mod.rs +++ b/src/sse/mod.rs @@ -59,51 +59,57 @@ mod vertical_u8; mod vertical_u8_lp; #[cfg(feature = "half")] -pub use alpha_f16::{sse_premultiply_alpha_rgba_f16, sse_unpremultiply_alpha_rgba_f16}; -pub use alpha_f32::sse_premultiply_alpha_rgba_f32; -pub use alpha_f32::sse_unpremultiply_alpha_rgba_f32; -pub use alpha_u16::{premultiply_alpha_sse_rgba_u16, unpremultiply_alpha_sse_rgba_u16}; -pub use alpha_u8::{ +pub(crate) use alpha_f16::{sse_premultiply_alpha_rgba_f16, sse_unpremultiply_alpha_rgba_f16}; +pub(crate) use alpha_f32::sse_premultiply_alpha_rgba_f32; +pub(crate) use alpha_f32::sse_unpremultiply_alpha_rgba_f32; +pub(crate) use alpha_u16::{premultiply_alpha_sse_rgba_u16, unpremultiply_alpha_sse_rgba_u16}; +pub(crate) use alpha_u8::{ _mm_div_by_255_epi16, sse_premultiply_alpha_rgba, sse_unpremultiply_alpha_rgba, sse_unpremultiply_row, }; -pub use plane_f32::convolve_horizontal_plane_sse_row_one; -pub use plane_f32::convolve_horizontal_plane_sse_rows_4; -pub use plane_u8::{convolve_horizontal_plane_sse_row, convolve_horizontal_plane_sse_rows_4_u8}; +pub(crate) use plane_f32::convolve_horizontal_plane_sse_row_one; +pub(crate) use plane_f32::convolve_horizontal_plane_sse_rows_4; +pub(crate) use plane_u8::{ + convolve_horizontal_plane_sse_row, convolve_horizontal_plane_sse_rows_4_u8, +}; #[cfg(feature = "half")] -pub use rgb_f16::{ +pub(crate) use rgb_f16::{ convolve_horizontal_rgb_sse_row_one_f16, convolve_horizontal_rgb_sse_rows_4_f16, }; -pub use rgb_f32::{ +pub(crate) use rgb_f32::{ convolve_horizontal_rgb_sse_row_one_f32, convolve_horizontal_rgb_sse_rows_4_f32, }; -pub use rgb_u8::*; +pub(crate) use rgb_u8::*; #[cfg(feature = "half")] -pub use rgba_f16::{ 
+pub(crate) use rgba_f16::{ convolve_horizontal_rgba_sse_row_one_f16, convolve_horizontal_rgba_sse_rows_4_f16, }; -pub use rgba_f32::{ +pub(crate) use rgba_f32::{ convolve_horizontal_rgba_sse_row_one_f32, convolve_horizontal_rgba_sse_rows_4_f32, }; -pub use rgba_u16::{convolve_horizontal_rgba_sse_rows_4_u16, convolve_horizontal_rgba_sse_u16_row}; -pub use rgba_u16_lb::{ +pub(crate) use rgba_u16::{ + convolve_horizontal_rgba_sse_rows_4_u16, convolve_horizontal_rgba_sse_u16_row, +}; +pub(crate) use rgba_u16_lb::{ convolve_horizontal_rgba_sse_rows_4_lb_u8, convolve_horizontal_rgba_sse_u16_lb_row, }; -pub use rgba_u8::{convolve_horizontal_rgba_sse_rows_4, convolve_horizontal_rgba_sse_rows_one}; -pub use rgba_u8_lb::{ +pub(crate) use rgba_u8::{ + convolve_horizontal_rgba_sse_rows_4, convolve_horizontal_rgba_sse_rows_one, +}; +pub(crate) use rgba_u8_lb::{ convolve_horizontal_rgba_sse_rows_4_lb, convolve_horizontal_rgba_sse_rows_one_lb, }; pub(crate) use routines::{load_4_weights, load_4_weights_group_2_avx, load_8_weights_group_4_avx}; -pub use u8_utils::*; -pub use utils::*; +pub(crate) use u8_utils::*; +pub(crate) use utils::*; #[cfg(feature = "half")] -pub use vertical_f16::convolve_vertical_sse_row_f16; -pub use vertical_f32::convolve_vertical_rgb_sse_row_f32; -pub use vertical_u16::convolve_column_sse_u16; -pub use vertical_u16_lb::convolve_column_lb_sse_u16; -pub use vertical_u8::convolve_vertical_sse_row; -pub use vertical_u8_lp::convolve_vertical_sse_row_lp; +pub(crate) use vertical_f16::convolve_vertical_sse_row_f16; +pub(crate) use vertical_f32::convolve_vertical_rgb_sse_row_f32; +pub(crate) use vertical_u16::convolve_column_sse_u16; +pub(crate) use vertical_u16_lb::convolve_column_lb_sse_u16; +pub(crate) use vertical_u8::convolve_vertical_sse_row; +pub(crate) use vertical_u8_lp::convolve_vertical_sse_row_lp; -pub const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 { +pub(crate) const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 { ((z << 6) | (y << 4) | (x << 2) | w) as i32 } diff --git a/src/sse/plane_f32.rs b/src/sse/plane_f32.rs index 55f32c7..7b619eb 100644 --- a/src/sse/plane_f32.rs +++ b/src/sse/plane_f32.rs @@ -97,7 +97,7 @@ macro_rules! conv_horiz_plane_1_f32 { }}; } -pub fn convolve_horizontal_plane_sse_row_one( +pub(crate) fn convolve_horizontal_plane_sse_row_one( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, @@ -245,7 +245,7 @@ unsafe fn convolve_horizontal_plane_sse_row_one_impl( } } -pub fn convolve_horizontal_plane_sse_rows_4( +pub(crate) fn convolve_horizontal_plane_sse_rows_4( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, diff --git a/src/sse/plane_u8.rs b/src/sse/plane_u8.rs index 58fc4ea..6f275f2 100644 --- a/src/sse/plane_u8.rs +++ b/src/sse/plane_u8.rs @@ -67,7 +67,7 @@ macro_rules! 
s_accumulate_1_horiz { }}; } -pub fn convolve_horizontal_plane_sse_rows_4_u8( +pub(crate) fn convolve_horizontal_plane_sse_rows_4_u8( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -213,7 +213,7 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_u8_impl( } } -pub fn convolve_horizontal_plane_sse_row( +pub(crate) fn convolve_horizontal_plane_sse_row( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, diff --git a/src/sse/rgb_f16.rs b/src/sse/rgb_f16.rs index 1ebb02c..18ba209 100644 --- a/src/sse/rgb_f16.rs +++ b/src/sse/rgb_f16.rs @@ -146,7 +146,7 @@ unsafe fn convolve_horizontal_parts_one_rgb_f16( +pub(crate) fn convolve_horizontal_rgb_sse_row_one_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, @@ -317,7 +317,7 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f16_impl( +pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, diff --git a/src/sse/rgb_f32.rs b/src/sse/rgb_f32.rs index 53bf992..918daed 100644 --- a/src/sse/rgb_f32.rs +++ b/src/sse/rgb_f32.rs @@ -106,7 +106,7 @@ unsafe fn convolve_horizontal_parts_one_rgb_f32( _mm_prefer_fma_ps::(store_0, rgb_pixel, weight0) } -pub fn convolve_horizontal_rgb_sse_row_one_f32( +pub(crate) fn convolve_horizontal_rgb_sse_row_one_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, @@ -247,7 +247,7 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f32_impl( } } -pub fn convolve_horizontal_rgb_sse_rows_4_f32( +pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, diff --git a/src/sse/rgb_u8.rs b/src/sse/rgb_u8.rs index a79e7b5..ee6e357 100644 --- a/src/sse/rgb_u8.rs +++ b/src/sse/rgb_u8.rs @@ -36,7 +36,7 @@ use crate::filter_weights::FilterWeights; use crate::sse::{compress_i32, convolve_horizontal_parts_one_sse_rgb, shuffle}; use crate::support::ROUNDING_CONST; -pub fn convolve_horizontal_rgb_sse_rows_4( +pub(crate) fn convolve_horizontal_rgb_sse_rows_4( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -245,7 +245,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_impl( } } -pub fn convolve_horizontal_rgb_sse_row_one( +pub(crate) fn convolve_horizontal_rgb_sse_row_one( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, diff --git a/src/sse/rgba_f16.rs b/src/sse/rgba_f16.rs index f186de0..6b7b1e7 100644 --- a/src/sse/rgba_f16.rs +++ b/src/sse/rgba_f16.rs @@ -103,7 +103,7 @@ unsafe fn convolve_horizontal_parts_2_rgba_f16( +pub(crate) fn convolve_horizontal_rgba_sse_row_one_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, @@ -276,7 +276,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f16_impl( +pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, diff --git a/src/sse/rgba_f32.rs b/src/sse/rgba_f32.rs index 8a3ab2d..9b4b244 100644 --- a/src/sse/rgba_f32.rs +++ b/src/sse/rgba_f32.rs @@ -47,7 +47,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_f32( _mm_prefer_fma_ps::(store_0, rgb_pixel, weight0) } -pub fn convolve_horizontal_rgba_sse_row_one_f32( +pub(crate) fn convolve_horizontal_rgba_sse_row_one_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, @@ -229,7 +229,7 @@ unsafe fn convolve_horizontal_parts_2_rgba_f32( _mm_prefer_fma_ps::(acc, rgb_pixel_1, weight1) } -pub fn convolve_horizontal_rgba_sse_rows_4_f32( +pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, diff 
--git a/src/sse/rgba_u16.rs b/src/sse/rgba_u16.rs index 2e499e3..79ec664 100644 --- a/src/sse/rgba_u16.rs +++ b/src/sse/rgba_u16.rs @@ -175,7 +175,7 @@ unsafe fn conv_horiz_rgba_8_u16( acc } -pub fn convolve_horizontal_rgba_sse_rows_4_u16( +pub(crate) fn convolve_horizontal_rgba_sse_rows_4_u16( src: &[u16], src_stride: usize, dst: &mut [u16], @@ -392,7 +392,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_u16_impl( } } -pub fn convolve_horizontal_rgba_sse_u16_row( +pub(crate) fn convolve_horizontal_rgba_sse_u16_row( src: &[u16], dst: &mut [u16], filter_weights: &FilterWeights, diff --git a/src/sse/rgba_u16_lb.rs b/src/sse/rgba_u16_lb.rs index 6a5c715..f2d5974 100644 --- a/src/sse/rgba_u16_lb.rs +++ b/src/sse/rgba_u16_lb.rs @@ -162,7 +162,7 @@ unsafe fn conv_horiz_rgba_8_u16( acc } -pub fn convolve_horizontal_rgba_sse_rows_4_lb_u8( +pub(crate) fn convolve_horizontal_rgba_sse_rows_4_lb_u8( src: &[u16], src_stride: usize, dst: &mut [u16], @@ -320,7 +320,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_lb_u8_impl( } } -pub fn convolve_horizontal_rgba_sse_u16_lb_row( +pub(crate) fn convolve_horizontal_rgba_sse_u16_lb_row( src: &[u16], dst: &mut [u16], filter_weights: &FilterWeights, diff --git a/src/sse/rgba_u8.rs b/src/sse/rgba_u8.rs index c5c34ba..e41d35f 100644 --- a/src/sse/rgba_u8.rs +++ b/src/sse/rgba_u8.rs @@ -52,7 +52,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_sse( _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(lo), weight0)) } -pub fn convolve_horizontal_rgba_sse_rows_4( +pub(crate) fn convolve_horizontal_rgba_sse_rows_4( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -244,7 +244,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( } } -pub fn convolve_horizontal_rgba_sse_rows_one( +pub(crate) fn convolve_horizontal_rgba_sse_rows_one( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, diff --git a/src/sse/rgba_u8_lb.rs b/src/sse/rgba_u8_lb.rs index 2c339e7..1cef21a 100644 --- a/src/sse/rgba_u8_lb.rs +++ b/src/sse/rgba_u8_lb.rs @@ -50,7 +50,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_sse( _mm_add_epi16(store_0, _mm_mulhi_epi16(lo, weight0)) } -pub fn convolve_horizontal_rgba_sse_rows_4_lb( +pub(crate) fn convolve_horizontal_rgba_sse_rows_4_lb( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -385,7 +385,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( } } -pub fn convolve_horizontal_rgba_sse_rows_one_lb( +pub(crate) fn convolve_horizontal_rgba_sse_rows_one_lb( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, diff --git a/src/sse/u8_utils.rs b/src/sse/u8_utils.rs index 706776d..3605d2a 100644 --- a/src/sse/u8_utils.rs +++ b/src/sse/u8_utils.rs @@ -35,7 +35,7 @@ use std::arch::x86_64::*; use crate::support::PRECISION; #[inline(always)] -pub fn compress_i32(x: __m128i) -> __m128i { +pub(crate) fn compress_i32(x: __m128i) -> __m128i { let store_32 = unsafe { _mm_srai_epi32::(_mm_max_epi32(x, _mm_setzero_si128())) }; let store_16 = unsafe { _mm_packus_epi32(store_32, store_32) }; unsafe { _mm_packus_epi16(store_16, store_16) } diff --git a/src/sse/utils.rs b/src/sse/utils.rs index 023aa0b..e4f50c9 100644 --- a/src/sse/utils.rs +++ b/src/sse/utils.rs @@ -34,8 +34,7 @@ use std::arch::x86::*; use std::arch::x86_64::*; #[inline] -#[target_feature(enable = "sse4.1")] -pub unsafe fn _mm_prefer_fma_ps(a: __m128, b: __m128, c: __m128) -> __m128 { +pub(crate) unsafe fn _mm_prefer_fma_ps(a: __m128, b: __m128, c: __m128) -> __m128 { if FMA { _mm_fma_psx(a, b, c) } else { @@ -44,13 +43,12 @@ pub unsafe fn _mm_prefer_fma_ps(a: __m128, b: 
__m128, c: __m128 } #[inline] -#[target_feature(enable = "sse4.1,fma")] unsafe fn _mm_fma_psx(a: __m128, b: __m128, c: __m128) -> __m128 { _mm_fmadd_ps(b, c, a) } #[inline(always)] -pub unsafe fn sse_deinterleave_rgba_ps( +pub(crate) unsafe fn sse_deinterleave_rgba_ps( v0: __m128, v1: __m128, v2: __m128, @@ -68,7 +66,7 @@ pub unsafe fn sse_deinterleave_rgba_ps( } #[inline(always)] -pub unsafe fn sse_interleave_rgba_ps( +pub(crate) unsafe fn sse_interleave_rgba_ps( v0: __m128, v1: __m128, v2: __m128, @@ -87,7 +85,7 @@ pub unsafe fn sse_interleave_rgba_ps( } #[inline(always)] -pub unsafe fn sse_deinterleave_rgba( +pub(crate) unsafe fn sse_deinterleave_rgba( rgba0: __m128i, rgba1: __m128i, rgba2: __m128i, @@ -124,7 +122,7 @@ pub unsafe fn sse_deinterleave_rgba( } #[inline(always)] -pub unsafe fn sse_interleave_rgba( +pub(crate) unsafe fn sse_interleave_rgba( r: __m128i, g: __m128i, b: __m128i, @@ -144,7 +142,7 @@ pub unsafe fn sse_interleave_rgba( /// Sums all lanes in float32 #[inline(always)] -pub unsafe fn _mm_hsum_ps(v: __m128) -> f32 { +pub(crate) unsafe fn _mm_hsum_ps(v: __m128) -> f32 { let mut shuf = _mm_movehdup_ps(v); let mut sums = _mm_add_ps(v, shuf); shuf = _mm_movehl_ps(shuf, sums); @@ -154,7 +152,7 @@ pub unsafe fn _mm_hsum_ps(v: __m128) -> f32 { #[inline(always)] #[allow(dead_code)] -pub unsafe fn sse_deinterleave_rgba_epi16( +pub(crate) unsafe fn sse_deinterleave_rgba_epi16( rgba0: __m128i, rgba1: __m128i, rgba2: __m128i, @@ -179,7 +177,7 @@ pub unsafe fn sse_deinterleave_rgba_epi16( #[inline(always)] #[allow(dead_code)] -pub unsafe fn sse_interleave_rgba_epi16( +pub(crate) unsafe fn sse_interleave_rgba_epi16( a: __m128i, b: __m128i, c: __m128i, @@ -218,7 +216,7 @@ pub(crate) unsafe fn _mm_muladd_wide_epi16(a: __m128i, b: __m128i, c: __m128i) - #[inline] /// Arithmetic shift for i64, shifting with sign bits -pub unsafe fn _mm_srai_epi64x(a: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_srai_epi64x(a: __m128i) -> __m128i { let m = _mm_set1_epi64x(1 << (64 - 1)); let x = _mm_srli_epi64::(a); _mm_sub_epi64(_mm_xor_si128(x, m), m) @@ -235,7 +233,7 @@ pub(crate) unsafe fn _mm_packus_epi64(a: __m128i, b: __m128i) -> __m128i { #[inline(always)] /// Extracts i64 value -pub unsafe fn _mm_extract_epi64x(d: __m128i) -> i64 { +pub(crate) unsafe fn _mm_extract_epi64x(d: __m128i) -> i64 { #[cfg(target_arch = "x86_64")] { if IMM == 0 { @@ -259,7 +257,7 @@ pub unsafe fn _mm_extract_epi64x(d: __m128i) -> i64 { } #[inline] -pub unsafe fn _mm_store3_u16(ptr: *mut u16, a: __m128i) { +pub(crate) unsafe fn _mm_store3_u16(ptr: *mut u16, a: __m128i) { let low_pixel = _mm_extract_epi32::<0>(a); (ptr as *mut i32).write_unaligned(low_pixel); (ptr as *mut i16) diff --git a/src/sse/vertical_f16.rs b/src/sse/vertical_f16.rs index 9e400ef..50e0ede 100644 --- a/src/sse/vertical_f16.rs +++ b/src/sse/vertical_f16.rs @@ -189,7 +189,11 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8_f16( +pub(crate) fn convolve_vertical_sse_row_f16< + const CHANNELS: usize, + const F16C: bool, + const FMA: bool, +>( width: usize, bounds: &FilterBounds, unsafe_source_ptr_0: *const half::f16, diff --git a/src/sse/vertical_f32.rs b/src/sse/vertical_f32.rs index 3b513ab..d00dfda 100644 --- a/src/sse/vertical_f32.rs +++ b/src/sse/vertical_f32.rs @@ -219,7 +219,7 @@ pub(crate) unsafe fn convolve_vertical_part_sse_f32( (dst_ptr as *mut i32).write_unaligned(_mm_extract_ps::<0>(store_0)); } -pub fn convolve_vertical_rgb_sse_row_f32( +pub(crate) fn convolve_vertical_rgb_sse_row_f32( width: usize, bounds: &FilterBounds, 
unsafe_source_ptr_0: *const f32, diff --git a/src/sse/vertical_u16.rs b/src/sse/vertical_u16.rs index b9100b2..2fde19b 100644 --- a/src/sse/vertical_u16.rs +++ b/src/sse/vertical_u16.rs @@ -37,7 +37,7 @@ use std::arch::x86_64::*; const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; -pub fn convolve_column_sse_u16( +pub(crate) fn convolve_column_sse_u16( _: usize, bounds: &FilterBounds, src: &[u16], diff --git a/src/sse/vertical_u16_lb.rs b/src/sse/vertical_u16_lb.rs index a35f950..9715cd7 100644 --- a/src/sse/vertical_u16_lb.rs +++ b/src/sse/vertical_u16_lb.rs @@ -34,7 +34,7 @@ use std::arch::x86::*; use std::arch::x86_64::*; #[inline(always)] -pub fn convolve_column_lb_sse_u16( +pub(crate) fn convolve_column_lb_sse_u16( _: usize, bounds: &FilterBounds, src: &[u16], diff --git a/src/sse/vertical_u8.rs b/src/sse/vertical_u8.rs index d5726a0..5b66bf4 100644 --- a/src/sse/vertical_u8.rs +++ b/src/sse/vertical_u8.rs @@ -631,7 +631,7 @@ pub(crate) unsafe fn convolve_vertical_part_sse( *dst_ptr = _mm_extract_epi8::<0>(item) as u8; } -pub fn convolve_vertical_sse_row( +pub(crate) fn convolve_vertical_sse_row( dst_width: usize, bounds: &FilterBounds, src: &[u8], diff --git a/src/sse/vertical_u8_lp.rs b/src/sse/vertical_u8_lp.rs index ca8621f..d507857 100644 --- a/src/sse/vertical_u8_lp.rs +++ b/src/sse/vertical_u8_lp.rs @@ -32,7 +32,7 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -pub fn convolve_vertical_sse_row_lp( +pub(crate) fn convolve_vertical_sse_row_lp( dst_width: usize, bounds: &FilterBounds, src: &[u8], diff --git a/src/support.rs b/src/support.rs index 430544c..4735c62 100644 --- a/src/support.rs +++ b/src/support.rs @@ -27,8 +27,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #![forbid(unsafe_code)] -pub const PRECISION: i32 = 15; -pub const ROUNDING_CONST: i32 = 1 << (PRECISION - 1); +pub(crate) const PRECISION: i32 = 15; +pub(crate) const ROUNDING_CONST: i32 = 1 << (PRECISION - 1); pub(crate) fn check_image_size_overflow(width: usize, height: usize, chan: usize) -> bool { let (stride, is_overflowed) = width.overflowing_mul(chan); diff --git a/src/unsafe_slice.rs b/src/unsafe_slice.rs index 21339a7..52b4352 100644 --- a/src/unsafe_slice.rs +++ b/src/unsafe_slice.rs @@ -30,7 +30,7 @@ use std::cell::UnsafeCell; #[derive(Copy, Clone)] -pub struct UnsafeSlice<'a, T> { +pub(crate) struct UnsafeSlice<'a, T> { pub slice: &'a [UnsafeCell], } @@ -39,31 +39,31 @@ unsafe impl Send for UnsafeSlice<'_, T> {} unsafe impl Sync for UnsafeSlice<'_, T> {} impl<'a, T> UnsafeSlice<'a, T> { - pub fn new(slice: &'a mut [T]) -> Self { + pub(crate) fn new(slice: &'a mut [T]) -> Self { let ptr = slice as *mut [T] as *const [UnsafeCell]; Self { slice: unsafe { &*ptr }, } } - pub fn mut_ptr(&self) -> *mut T { + pub(crate) fn mut_ptr(&self) -> *mut T { self.slice.as_ptr() as *const T as *mut T } /// SAFETY: It is UB if two threads write to the same index without /// synchronization. 
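/// Callers must therefore partition the indices between threads themselves,
/// e.g. by assigning each worker a disjoint set of output rows.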
#[allow(dead_code)] - pub unsafe fn write(&self, i: usize, value: T) { + pub(crate) unsafe fn write(&self, i: usize, value: T) { let ptr = self.slice[i].get(); *ptr = value; } #[allow(dead_code)] - pub fn get(&self, i: usize) -> &T { + pub(crate) fn get(&self, i: usize) -> &T { let ptr = self.slice[i].get(); unsafe { &*ptr } } #[allow(dead_code)] - pub fn len(&self) -> usize { + pub(crate) fn len(&self) -> usize { self.slice.len() } } From c1bb3aa7cacbc402fe616ade3fb2b09c2283db33 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Thu, 14 Nov 2024 23:48:05 +0000 Subject: [PATCH 03/19] Deny unused --- src/cpu_features.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpu_features.rs b/src/cpu_features.rs index 94bf023..975fd4a 100644 --- a/src/cpu_features.rs +++ b/src/cpu_features.rs @@ -64,7 +64,7 @@ fn apple_has_cpu_feature(_feature_name: &str) -> bool { /// Test aarch64 cpu with *fp16* check, /// on *Apple* platform [libc](https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics) be used -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "half"))] pub(crate) fn is_aarch_f16_supported() -> bool { #[cfg(any(target_os = "macos", target_os = "ios"))] { @@ -80,7 +80,7 @@ pub(crate) fn is_aarch_f16_supported() -> bool { /// on *Apple* platform [libc](https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics) be used /// otherwise consider it is always available #[allow(clippy::too_long_first_doc_paragraph)] -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "half"))] pub(crate) fn is_aarch_f16c_supported() -> bool { #[cfg(any(target_os = "macos", target_os = "ios"))] { From 57fa55991b9c59a8a0635ebee45302b85b0f1aff Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Wed, 20 Nov 2024 22:21:34 +0000 Subject: [PATCH 04/19] Added AR30 scaling --- Cargo.lock | 8 + app/Cargo.toml | 1 + app/src/main.rs | 110 +++++++--- src/ar30.rs | 115 +++++++++++ src/color_group.rs | 73 ++++++- src/dispatch_group_ar30.rs | 129 ++++++++++++ src/fixed_point_horizontal.rs | 197 ++++++++---------- src/fixed_point_horizontal_ar30.rs | 295 +++++++++++++++++++++++++++ src/fixed_point_vertical_ar30.rs | 310 +++++++++++++++++++++++++++++ src/floating_point_horizontal.rs | 163 ++++++++------- src/floating_point_vertical.rs | 183 ++++++++--------- src/lib.rs | 8 +- src/resize_ar30.rs | 115 +++++++++++ src/scaler.rs | 60 ++++++ 14 files changed, 1434 insertions(+), 333 deletions(-) create mode 100644 src/ar30.rs create mode 100644 src/dispatch_group_ar30.rs create mode 100644 src/fixed_point_horizontal_ar30.rs create mode 100644 src/fixed_point_vertical_ar30.rs create mode 100644 src/resize_ar30.rs diff --git a/Cargo.lock b/Cargo.lock index e0f7920..11e13f9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -56,6 +56,7 @@ dependencies = [ "half", "image", "pic-scale", + "yuvutils-rs", ] [[package]] @@ -1519,6 +1520,13 @@ dependencies = [ "memchr", ] +[[package]] +name = "yuvutils-rs" +version = "0.5.3" +dependencies = [ + "num-traits", +] + [[package]] name = "zerocopy" version = "0.7.35" diff --git a/app/Cargo.toml b/app/Cargo.toml index 1c09c6d..baa46c0 100644 --- a/app/Cargo.toml +++ b/app/Cargo.toml @@ -9,6 +9,7 @@ image = { version = "0.25.5", features = ["default"] } pic-scale = { path = "..", features = ["half"], 
default-features = true } fast_image_resize = { version = "5.0.0", features = [] } half = { version = "2.4.1", default-features = true } +yuvutils-rs = {path = "../../../RustRoverProjects/yuvutils-rs"} [dev-dependencies] criterion = "0.5.1" diff --git a/app/src/main.rs b/app/src/main.rs index 0e2b15a..fb0dd1f 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -11,9 +11,10 @@ use fast_image_resize::{ }; use image::{EncodableLayout, GenericImageView, ImageReader}; use pic_scale::{ - ImageSize, ImageStore, LinearApproxScaler, ResamplingFunction, Scaler, Scaling, ScalingU16, - ThreadingPolicy, + Ar30ByteOrder, ImageSize, ImageStore, LinearApproxScaler, ResamplingFunction, Scaler, Scaling, + ScalingU16, ThreadingPolicy, }; +use yuvutils_rs::{ar30_to_rgba8, rgb8_to_ar30, rgba8_to_ar30, Rgb30ByteOrder}; fn main() { test_fast_image(); @@ -28,25 +29,61 @@ fn main() { let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); scaler.set_threading_policy(ThreadingPolicy::Single); - let mut choke: Vec = bytes.iter().map(|&x| (x as u16) << 2).collect(); + let mut ar30_src = vec![0u32; dimensions.0 as usize * dimensions.1 as usize]; + rgba8_to_ar30( + &mut ar30_src, + dimensions.0, + Rgb30ByteOrder::Host, + &bytes, + dimensions.0 * 4, + dimensions.0, + dimensions.1, + ) + .unwrap(); + + // let mut choke: Vec = bytes.iter().map(|&x| (x as u16) << 2).collect(); // - let store = - ImageStore::::from_slice(&mut choke, dimensions.0 as usize, dimensions.1 as usize) - .unwrap(); + // let store = + // ImageStore::::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize) + // .unwrap(); + + let dst_size = ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2); + let mut resized_ar = vec![0u32; dst_size.width * dst_size.height]; let start_time = Instant::now(); - let resized = scaler - .resize_rgba_u16( - ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), - store, - 10, - true, + scaler + .resize_ar30( + &ar30_src, + ImageSize::new(dimensions.0 as usize, dimensions.1 as usize), + &mut resized_ar, + dst_size, + Ar30ByteOrder::Host, ) .unwrap(); + // let resized = scaler + // .resize_rgba( + // ImageSize::new(dimensions.0 as usize / 8, dimensions.1 as usize / 8), + // store, + // false, + // ) + // .unwrap(); + let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds println!("Scaler: {:.2?}", elapsed_time); + let mut resized = vec![0u8; dst_size.width * dst_size.height * 4]; + ar30_to_rgba8( + &resized_ar, + dst_size.width as u32, + Rgb30ByteOrder::Host, + &mut resized, + dst_size.width as u32 * 4, + dst_size.width as u32, + dst_size.height as u32, + ) + .unwrap(); + // let dst: Vec = resized.as_bytes().iter().map(|&x| x).collect::>(); // println!("f1 {}, f2 {}, f3 {}, f4 {}", dst[0], dst[1], dst[2], dst[3]); // let dst: Vec = resized @@ -106,29 +143,38 @@ fn main() { // .map(|&x| (x * 255f32) as u8) // .collect(); - let dst: Vec = resized.as_bytes().iter().map(|&x| (x >> 2) as u8).collect(); + // let dst: Vec = resized.as_bytes().iter().map(|&x| (x >> 2) as u8).collect(); // // let dst = resized.as_bytes(); + let dst = resized; + image::save_buffer( + "converted.png", + &dst, + dst_size.width as u32, + dst_size.height as u32, + image::ColorType::Rgba8, + ) + .unwrap(); - if resized.channels == 4 { - image::save_buffer( - "converted.png", - &dst, - resized.width as u32, - resized.height as u32, - image::ColorType::Rgba8, - ) - .unwrap(); - } else { - image::save_buffer( - "converted.png", - &dst, - resized.width as u32, - resized.height as 
u32, - image::ColorType::Rgb8, - ) - .unwrap(); - } + // if resized.channels == 4 { + // image::save_buffer( + // "converted.png", + // &dst, + // resized.width as u32, + // resized.height as u32, + // image::ColorType::Rgba8, + // ) + // .unwrap(); + // } else { + // image::save_buffer( + // "converted.png", + // &dst, + // resized.width as u32, + // resized.height as u32, + // image::ColorType::Rgb8, + // ) + // .unwrap(); + // } // for i in 0..37 { // let mut scaler = Scaler::new(i.into()); diff --git a/src/ar30.rs b/src/ar30.rs new file mode 100644 index 0000000..cc4fcc6 --- /dev/null +++ b/src/ar30.rs @@ -0,0 +1,115 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub(crate) enum Rgb30 { + Ar30 = 0, + Ra30 = 1, +} + +impl From for Rgb30 { + fn from(value: usize) -> Self { + match value { + 0 => Rgb30::Ar30, + 1 => Rgb30::Ra30, + _ => { + unimplemented!("Rgb30 is not implemented for value {}", value) + } + } + } +} + +/// Converts a value from host byte order to network byte order. +#[inline] +const fn htonl(hostlong: u32) -> u32 { + hostlong.to_be() +} + +/// Converts a value from network byte order to host byte order. 
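+///
+/// On a little-endian host both helpers byte-swap, on a big-endian host both
+/// are the identity, so `ntohl(htonl(x)) == x` either way.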
+#[inline]
+const fn ntohl(netlong: u32) -> u32 {
+    u32::from_be(netlong)
+}
+
+impl Rgb30 {
+    #[inline]
+    pub(crate) const fn pack_w_a<const STORE: usize>(self, r: i32, g: i32, b: i32, a: i32) -> u32 {
+        let value: u32 = match self {
+            Rgb30::Ar30 => (a << 30 | (b << 20) | (g << 10) | r) as u32,
+            Rgb30::Ra30 => ((r << 22) | (g << 12) | (b << 2) | a) as u32,
+        };
+        if STORE == 0 {
+            value
+        } else {
+            htonl(value)
+        }
+    }
+
+    #[inline(always)]
+    pub(crate) const fn unpack<const STORE: usize>(self, value: u32) -> (u32, u32, u32, u32) {
+        let pixel = if STORE == 0 { value } else { ntohl(value) };
+        match self {
+            Rgb30::Ar30 => {
+                let r10 = pixel & 0x3ff;
+                let g10 = (pixel >> 10) & 0x3ff;
+                let b10 = (pixel >> 20) & 0x3ff;
+                let a10 = pixel >> 30;
+                (r10, g10, b10, a10)
+            }
+            Rgb30::Ra30 => {
+                let a2 = pixel & 0x3;
+                let r10 = (pixel >> 22) & 0x3ff;
+                let g10 = (pixel >> 12) & 0x3ff;
+                let b10 = (pixel >> 2) & 0x3ff;
+                (r10, g10, b10, a2)
+            }
+        }
+    }
+}
+
+#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
+/// Defines storage byte order for RGBA1010102 or RGBA2101010
+///
+/// Some systems require the bytes to be in network byte order instead of host byte order.
+pub enum Ar30ByteOrder {
+    Host = 0,
+    Network = 1,
+}
+
+impl From<usize> for Ar30ByteOrder {
+    fn from(value: usize) -> Self {
+        match value {
+            0 => Ar30ByteOrder::Host,
+            1 => Ar30ByteOrder::Network,
+            _ => {
+                unimplemented!("Rgb30ByteOrder is not implemented for value {}", value)
+            }
+        }
+    }
+}
diff --git a/src/color_group.rs b/src/color_group.rs
index bcb60e1..3f829e0 100644
--- a/src/color_group.rs
+++ b/src/color_group.rs
@@ -227,6 +227,24 @@ where
    }
}

+impl ColorGroup<4, i32> {
+    #[inline(always)]
+    pub(crate) fn saturate_ar30(&self) -> ColorGroup<4, i32> {
+        ColorGroup::from_components(
+            (self.r >> PRECISION).min(1023).max(0),
+            (self.g >> PRECISION).min(1023).max(0),
+            (self.b >> PRECISION).min(1023).max(0),
+            (self.a >> PRECISION).min(3).max(0),
+        )
+    }
+
+    #[inline(always)]
+    pub(crate) fn to_ar30<const AR30_TYPE: usize, const AR30_ORDER: usize>(&self) -> u32 {
+        let ar30_type: Rgb30 = AR30_TYPE.into();
+        ar30_type.pack_w_a::<AR30_ORDER>(self.r, self.g, self.b, self.a)
+    }
+}
+
impl Sub for ColorGroup
where
    J: Copy + Sub + Default + 'static,
@@ -461,7 +479,52 @@ where
    }
}

-macro_rules! fast_load_color_group {
+macro_rules! load_ar30 {
+    ($store: expr, $ar_type: expr, $ar_order: ty) => {{
+        let ar_type: crate::ar30::Rgb30 = $ar_type.into();
+        let unpacked = ar_type.unpack::<$ar_order>($store[0]);
+        ColorGroup::<4, i32> {
+            r: unpacked.0 as i32,
+            g: unpacked.1 as i32,
+            b: unpacked.2 as i32,
+            a: unpacked.3 as i32,
+        }
+    }};
+}
+
+pub(crate) use load_ar30;
+
+macro_rules! load_ar30_p {
+    ($store: expr, $ar_type: expr, $ar_order: ty) => {{
+        let ar_type: crate::ar30::Rgb30 = $ar_type.into();
+        let unpacked = ar_type.unpack::<$ar_order>(*$store);
+        ColorGroup::<4, i32> {
+            r: unpacked.0 as i32,
+            g: unpacked.1 as i32,
+            b: unpacked.2 as i32,
+            a: unpacked.3 as i32,
+        }
+    }};
+}
+
+pub(crate) use load_ar30_p;
+
+macro_rules! load_ar30_with_offset {
+    ($store: expr, $ar_type: expr, $ar_order: ty, $offset: expr) => {{
+        let ar_type: crate::ar30::Rgb30 = $ar_type.into();
+        let unpacked = ar_type.unpack::<$ar_order>($store[$offset]);
+        ColorGroup::<4, i32> {
+            r: unpacked.0 as i32,
+            g: unpacked.1 as i32,
+            b: unpacked.2 as i32,
+            a: unpacked.3 as i32,
+        }
+    }};
+}
+
+pub(crate) use load_ar30_with_offset;
+
+macro_rules! load_color_group {
    ($store: expr, $channels: expr, $vtype: ty) => {{
        if $channels == 1 {
            ColorGroup::<$channels, $vtype> {
@@ -497,9 +560,9 @@ macro_rules!
fast_load_color_group { }}; } -pub(crate) use fast_load_color_group; +pub(crate) use load_color_group; -macro_rules! fast_load_color_group_with_offset { +macro_rules! load_color_group_with_offset { ($store: expr, $channels: expr, $offset: expr, $vtype: ty) => {{ if $channels == 1 { ColorGroup::<$channels, $vtype> { @@ -535,7 +598,7 @@ macro_rules! fast_load_color_group_with_offset { }}; } -pub(crate) use fast_load_color_group_with_offset; +pub(crate) use load_color_group_with_offset; macro_rules! fast_store_color_group { ($color_group: expr, $store: expr, $components: expr) => {{ @@ -569,4 +632,6 @@ macro_rules! fast_mixed_store_color_group { }}; } +use crate::ar30::Rgb30; +use crate::support::PRECISION; pub(crate) use fast_mixed_store_color_group; diff --git a/src/dispatch_group_ar30.rs b/src/dispatch_group_ar30.rs new file mode 100644 index 0000000..4095c5c --- /dev/null +++ b/src/dispatch_group_ar30.rs @@ -0,0 +1,129 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+use crate::filter_weights::FilterWeights;
+use crate::fixed_point_horizontal_ar30::{
+    convolve_row_handler_fixed_point_4_ar30, convolve_row_handler_fixed_point_ar30,
+};
+use crate::fixed_point_vertical_ar30::column_handler_fixed_point_ar30;
+use crate::support::PRECISION;
+use rayon::iter::{IndexedParallelIterator, ParallelIterator};
+use rayon::prelude::{ParallelSlice, ParallelSliceMut};
+use rayon::ThreadPool;
+
+#[allow(clippy::type_complexity)]
+pub(crate) fn convolve_horizontal_dispatch_ar30<const AR30_TYPE: usize, const AR30_ORDER: usize>(
+    src: &[u32],
+    src_stride: usize,
+    filter_weights: FilterWeights,
+    dst: &mut [u32],
+    dst_stride: usize,
+    pool: &Option<ThreadPool>,
+) {
+    if let Some(pool) = pool {
+        pool.install(|| {
+            let approx = filter_weights.numerical_approximation_i16::<PRECISION>(0);
+            dst.par_chunks_exact_mut(dst_stride * 4)
+                .zip(src.par_chunks_exact(src_stride * 4))
+                .for_each(|(dst, src)| {
+                    convolve_row_handler_fixed_point_4_ar30::<AR30_TYPE, AR30_ORDER>(
+                        src, src_stride, dst, dst_stride, &approx,
+                    );
+                });
+
+            let remainder = dst.chunks_exact_mut(dst_stride * 4).into_remainder();
+            let src_remainder = src.chunks_exact(src_stride * 4).remainder();
+
+            remainder
+                .par_chunks_exact_mut(dst_stride)
+                .zip(src_remainder.par_chunks_exact(src_stride))
+                .for_each(|(dst, src)| {
+                    convolve_row_handler_fixed_point_ar30::<AR30_TYPE, AR30_ORDER>(
+                        src, dst, &approx,
+                    );
+                });
+        });
+    } else {
+        let approx = filter_weights.numerical_approximation_i16::<PRECISION>(0);
+        dst.chunks_exact_mut(dst_stride * 4)
+            .zip(src.chunks_exact(src_stride * 4))
+            .for_each(|(dst, src)| {
+                convolve_row_handler_fixed_point_4_ar30::<AR30_TYPE, AR30_ORDER>(
+                    src, src_stride, dst, dst_stride, &approx,
+                );
+            });
+
+        let remainder = dst.chunks_exact_mut(dst_stride * 4).into_remainder();
+        let src_remainder = src.chunks_exact(src_stride * 4).remainder();
+
+        remainder
+            .chunks_exact_mut(dst_stride)
+            .zip(src_remainder.chunks_exact(src_stride))
+            .for_each(|(dst, src)| {
+                convolve_row_handler_fixed_point_ar30::<AR30_TYPE, AR30_ORDER>(src, dst, &approx);
+            });
+    }
+}
+
+pub(crate) fn convolve_vertical_dispatch_ar30<const AR30_TYPE: usize, const AR30_ORDER: usize>(
+    src: &[u32],
+    src_stride: usize,
+    filter_weights: FilterWeights,
+    dst: &mut [u32],
+    dst_stride: usize,
+    pool: &Option<ThreadPool>,
+) {
+    if let Some(pool) = pool {
+        pool.install(|| {
+            let approx = filter_weights.numerical_approximation_i16::<PRECISION>(0);
+            dst.par_chunks_exact_mut(dst_stride)
+                .enumerate()
+                .for_each(|(y, row)| {
+                    let bounds = approx.bounds[y];
+                    let filter_offset = y * approx.aligned_size;
+                    let weights = &approx.weights[filter_offset..];
+                    column_handler_fixed_point_ar30::<AR30_TYPE, AR30_ORDER>(
+                        &bounds, src, row, src_stride, weights,
+                    );
+                });
+        });
+    } else {
+        let approx = filter_weights.numerical_approximation_i16::<PRECISION>(0);
+        dst.chunks_exact_mut(dst_stride)
+            .enumerate()
+            .for_each(|(y, row)| {
+                let bounds = approx.bounds[y];
+                let filter_offset = y * approx.aligned_size;
+                let weights = &approx.weights[filter_offset..];
+                column_handler_fixed_point_ar30::<AR30_TYPE, AR30_ORDER>(
+                    &bounds, src, row, src_stride, weights,
+                );
+            });
+    }
+}
diff --git a/src/fixed_point_horizontal.rs b/src/fixed_point_horizontal.rs
index 7baa2d3..06103e5 100644
--- a/src/fixed_point_horizontal.rs
+++ b/src/fixed_point_horizontal.rs
@@ -27,7 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ use crate::color_group::{ - fast_load_color_group, fast_load_color_group_with_offset, fast_store_color_group, ColorGroup, + fast_store_color_group, load_color_group, load_color_group_with_offset, ColorGroup, }; use crate::filter_weights::FilterWeights; use crate::saturate_narrow::SaturateNarrow; @@ -78,18 +78,17 @@ pub(crate) fn convolve_row_handler_fixed_point< let sliced_weights = &weights[0..2]; let weight0 = sliced_weights[0].as_(); let weight1 = sliced_weights[1].as_(); - sums += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1; + sums += load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1; } else if bounds_size == 3 { let src_ptr0 = &src[px..(px + 3 * CHANNELS)]; let sliced_weights = &weights[0..3]; let weight0 = sliced_weights[0].as_(); let weight1 = sliced_weights[1].as_(); let weight2 = sliced_weights[2].as_(); - sums += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight2; + sums += load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2; } else if bounds_size == 4 { let src_ptr0 = &src[px..(px + 4 * CHANNELS)]; let sliced_weights = &weights[0..4]; @@ -97,12 +96,10 @@ pub(crate) fn convolve_row_handler_fixed_point< let weight1 = sliced_weights[1].as_(); let weight2 = sliced_weights[2].as_(); let weight3 = sliced_weights[3].as_(); - sums += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) - * weight3; + sums += load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight3; } else if bounds_size == 6 { let src_ptr0 = &src[px..(px + 6 * CHANNELS)]; @@ -113,16 +110,12 @@ pub(crate) fn convolve_row_handler_fixed_point< let weight3 = sliced_weights[3].as_(); let weight4 = sliced_weights[4].as_(); let weight5 = sliced_weights[5].as_(); - sums += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) - * weight3 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J) - * weight4 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J) - * weight5; + sums += load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight3 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J) * weight4 + + 
load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J) * weight5; } else { let src_ptr0 = &src[px..(px + bounds_size * CHANNELS)]; for (&k_weight, src) in weights @@ -131,7 +124,7 @@ pub(crate) fn convolve_row_handler_fixed_point< .take(bounds.size) { let weight: J = k_weight.as_(); - let new_px = fast_load_color_group!(src, CHANNELS, J); + let new_px = load_color_group!(src, CHANNELS, J); sums += new_px * weight; } } @@ -205,14 +198,14 @@ pub(crate) fn convolve_row_handler_fixed_point_4< let sliced_weights = &weights[0..2]; let weight0 = sliced_weights[0].as_(); let weight1 = sliced_weights[1].as_(); - sums0 += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1; - sums1 += fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1; - sums2 += fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1; - sums3 += fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1; + sums0 += load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1; + sums1 += load_color_group!(src_ptr1, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1; + sums2 += load_color_group!(src_ptr2, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1; + sums3 += load_color_group!(src_ptr3, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1; } else if bounds_size == 3 { let src_ptr0 = &src[px..(px + 3 * CHANNELS)]; let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 3 * CHANNELS)]; @@ -223,22 +216,18 @@ pub(crate) fn convolve_row_handler_fixed_point_4< let weight0 = sliced_weights[0].as_(); let weight1 = sliced_weights[1].as_(); let weight2 = sliced_weights[2].as_(); - sums0 += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight2; - sums1 += fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J) - * weight2; - sums2 += fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J) - * weight2; - sums3 += fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) - * weight2; + sums0 += load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2; + sums1 += load_color_group!(src_ptr1, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J) * weight2; + sums2 += load_color_group!(src_ptr2, 
CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J) * weight2; + sums3 += load_color_group!(src_ptr3, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) * weight2; } else if bounds_size == 4 { let src_ptr0 = &src[px..(px + 4 * CHANNELS)]; let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 4 * CHANNELS)]; @@ -250,30 +239,22 @@ pub(crate) fn convolve_row_handler_fixed_point_4< let weight1 = sliced_weights[1].as_(); let weight2 = sliced_weights[2].as_(); let weight3 = sliced_weights[3].as_(); - sums0 += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) - * weight3; - sums1 += fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J) - * weight3; - sums2 += fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J) - * weight3; - sums3 += fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J) - * weight3; + sums0 += load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight3; + sums1 += load_color_group!(src_ptr1, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J) * weight3; + sums2 += load_color_group!(src_ptr2, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J) * weight3; + sums3 += load_color_group!(src_ptr3, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J) * weight3; } else if bounds_size == 6 { let src_ptr0 = &src[px..(px + 6 * CHANNELS)]; let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 6 * CHANNELS)]; @@ -287,46 +268,30 @@ pub(crate) fn convolve_row_handler_fixed_point_4< let weight3 = sliced_weights[3].as_(); let weight4 = sliced_weights[4].as_(); 
let weight5 = sliced_weights[5].as_(); - sums0 += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) - * weight3 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J) - * weight4 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J) - * weight5; - sums1 += fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J) - * weight3 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 4, J) - * weight4 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 5, J) - * weight5; - sums2 += fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J) - * weight3 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 4, J) - * weight4 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 5, J) - * weight5; - sums3 += fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J) - * weight3 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 4, J) - * weight4 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 5, J) - * weight5; + sums0 += load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight3 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J) * weight4 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J) * weight5; + sums1 += load_color_group!(src_ptr1, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J) * weight3 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 4, J) * weight4 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 5, J) * weight5; + sums2 += load_color_group!(src_ptr2, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J) * weight3 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 4, J) * weight4 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 5, J) * weight5; + sums3 += load_color_group!(src_ptr3, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr3, CHANNELS, 
CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J) * weight3 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 4, J) * weight4 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 5, J) * weight5; } else { let src_ptr0 = &src[px..(px + bounds_size * CHANNELS)]; let src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds_size * CHANNELS)]; @@ -345,10 +310,10 @@ pub(crate) fn convolve_row_handler_fixed_point_4< { let weight: J = k_weight.as_(); - let new_px0 = fast_load_color_group!(src0, CHANNELS, J); - let new_px1 = fast_load_color_group!(src1, CHANNELS, J); - let new_px2 = fast_load_color_group!(src2, CHANNELS, J); - let new_px3 = fast_load_color_group!(src3, CHANNELS, J); + let new_px0 = load_color_group!(src0, CHANNELS, J); + let new_px1 = load_color_group!(src1, CHANNELS, J); + let new_px2 = load_color_group!(src2, CHANNELS, J); + let new_px3 = load_color_group!(src3, CHANNELS, J); sums0 += new_px0 * weight; sums1 += new_px1 * weight; diff --git a/src/fixed_point_horizontal_ar30.rs b/src/fixed_point_horizontal_ar30.rs new file mode 100644 index 0000000..b46a7cb --- /dev/null +++ b/src/fixed_point_horizontal_ar30.rs @@ -0,0 +1,295 @@ +/* + * Copyright (c) Radzivon Bartoshyk, 10/2024. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+#![forbid(unsafe_code)]
+use crate::color_group::{load_ar30, load_ar30_p, load_ar30_with_offset, ColorGroup};
+use crate::filter_weights::FilterWeights;
+use crate::support::ROUNDING_CONST;
+use num_traits::AsPrimitive;
+
+#[inline(always)]
+pub(crate) fn convolve_row_handler_fixed_point_ar30<
+    const AR30_TYPE: usize,
+    const AR30_ORDER: usize,
+>(
+    src: &[u32],
+    dst: &mut [u32],
+    filter_weights: &FilterWeights<i16>,
+) {
+    for ((chunk, &bounds), weights) in dst.iter_mut().zip(filter_weights.bounds.iter()).zip(
+        filter_weights
+            .weights
+            .chunks_exact(filter_weights.aligned_size),
+    ) {
+        let mut sums = ColorGroup::<4, i32>::dup(ROUNDING_CONST.as_());
+
+        let start_x = bounds.start;
+        let bounds_size = bounds.size;
+
+        let px = start_x;
+
+        if bounds_size == 2 {
+            let src_ptr0 = &src[px..(px + 2)];
+            let sliced_weights = &weights[0..2];
+            let weight0 = sliced_weights[0] as i32;
+            let weight1 = sliced_weights[1] as i32;
+            sums += load_ar30!(src_ptr0, AR30_TYPE, AR30_ORDER) * weight0
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1;
+        } else if bounds_size == 3 {
+            let src_ptr0 = &src[px..(px + 3)];
+            let sliced_weights = &weights[0..3];
+            let weight0 = sliced_weights[0] as i32;
+            let weight1 = sliced_weights[1] as i32;
+            let weight2 = sliced_weights[2] as i32;
+            sums += load_ar30!(src_ptr0, AR30_TYPE, AR30_ORDER) * weight0
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 2) * weight2;
+        } else if bounds_size == 4 {
+            let src_ptr0 = &src[px..(px + 4)];
+            let sliced_weights = &weights[0..4];
+            let weight0 = sliced_weights[0] as i32;
+            let weight1 = sliced_weights[1] as i32;
+            let weight2 = sliced_weights[2] as i32;
+            let weight3 = sliced_weights[3] as i32;
+            sums += load_ar30!(src_ptr0, AR30_TYPE, AR30_ORDER) * weight0
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 2) * weight2
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 3) * weight3;
+        } else if bounds_size == 6 {
+            let src_ptr0 = &src[px..(px + 6)];
+
+            let sliced_weights = &weights[0..6];
+            let weight0 = sliced_weights[0] as i32;
+            let weight1 = sliced_weights[1] as i32;
+            let weight2 = sliced_weights[2] as i32;
+            let weight3 = sliced_weights[3] as i32;
+            let weight4 = sliced_weights[4] as i32;
+            let weight5 = sliced_weights[5] as i32;
+            sums += load_ar30!(src_ptr0, AR30_TYPE, AR30_ORDER) * weight0
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 2) * weight2
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 3) * weight3
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 4) * weight4
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 5) * weight5;
+        } else {
+            let src_ptr0 = &src[px..(px + bounds_size)];
+            for (&k_weight, src) in weights.iter().zip(src_ptr0.iter()).take(bounds.size) {
+                let weight: i32 = k_weight as i32;
+                let new_px = load_ar30_p!(src, AR30_TYPE, AR30_ORDER);
+                sums += new_px * weight;
+            }
+        }
+
+        let narrowed = sums.saturate_ar30();
+        *chunk = narrowed.to_ar30::<AR30_TYPE, AR30_ORDER>();
+    }
+}
+
+#[inline(always)]
+pub(crate) fn convolve_row_handler_fixed_point_4_ar30<
+    const AR30_TYPE: usize,
+    const AR30_ORDER: usize,
+>(
+    src: &[u32],
+    src_stride: usize,
+    dst: &mut [u32],
+    dst_stride: usize,
+    filter_weights: &FilterWeights<i16>,
+) {
+    let (row0_ref, rest) = dst.split_at_mut(dst_stride);
+    let (row1_ref, rest) =
rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.iter_mut(); + let iter_row1 = row1_ref.iter_mut(); + let iter_row2 = row2_ref.iter_mut(); + let iter_row3 = row3_ref.iter_mut(); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut sums0 = ColorGroup::<4, i32>::dup(ROUNDING_CONST.as_()); + let mut sums1 = ColorGroup::<4, i32>::dup(ROUNDING_CONST.as_()); + let mut sums2 = ColorGroup::<4, i32>::dup(ROUNDING_CONST.as_()); + let mut sums3 = ColorGroup::<4, i32>::dup(ROUNDING_CONST.as_()); + + let start_x = bounds.start; + + let px = start_x; + let bounds_size = bounds.size; + + if bounds_size == 2 { + let src_ptr0 = &src[px..(px + 2)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 2)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 2)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 2)]; + + let sliced_weights = &weights[0..2]; + let weight0 = sliced_weights[0] as i32; + let weight1 = sliced_weights[1] as i32; + sums0 += load_ar30!(src_ptr0, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1; + sums1 += load_ar30!(src_ptr1, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 1) * weight1; + sums2 += load_ar30!(src_ptr2, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 1) * weight1; + sums3 += load_ar30!(src_ptr3, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 1) * weight1; + } else if bounds_size == 3 { + let src_ptr0 = &src[px..(px + 3)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 3)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 3)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 3)]; + + let sliced_weights = &weights[0..3]; + let weight0 = sliced_weights[0] as i32; + let weight1 = sliced_weights[1] as i32; + let weight2 = sliced_weights[2] as i32; + sums0 += load_ar30!(src_ptr0, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 2) * weight2; + sums1 += load_ar30!(src_ptr1, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 2) * weight2; + sums2 += load_ar30!(src_ptr2, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 2) * weight2; + sums3 += load_ar30!(src_ptr3, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 2) * weight2; + } else if bounds_size == 4 { + let src_ptr0 = &src[px..(px + 4)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 4)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 4)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 4)]; + + let sliced_weights = &weights[0..4]; + let weight0 = sliced_weights[0] as i32; + let weight1 = sliced_weights[1] as i32; + let weight2 = sliced_weights[2] as 
i32; + let weight3 = sliced_weights[3] as i32; + sums0 += load_ar30!(src_ptr0, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 2) * weight2 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 3) * weight3; + sums1 += load_ar30!(src_ptr1, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 2) * weight2 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 3) * weight3; + sums2 += load_ar30!(src_ptr2, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 2) * weight2 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 3) * weight3; + sums3 += load_ar30!(src_ptr3, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 2) * weight2 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 3) * weight3; + } else if bounds_size == 6 { + let src_ptr0 = &src[px..(px + 6)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 6)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 6)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 6)]; + + let sliced_weights = &weights[0..6]; + let weight0 = sliced_weights[0] as i32; + let weight1 = sliced_weights[1] as i32; + let weight2 = sliced_weights[2] as i32; + let weight3 = sliced_weights[3] as i32; + let weight4 = sliced_weights[4] as i32; + let weight5 = sliced_weights[5] as i32; + sums0 += load_ar30!(src_ptr0, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 2) * weight2 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 3) * weight3 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 4) * weight4 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 5) * weight5; + sums1 += load_ar30!(src_ptr1, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 2) * weight2 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 3) * weight3 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 4) * weight4 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 5) * weight5; + sums2 += load_ar30!(src_ptr2, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 2) * weight2 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 3) * weight3 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 4) * weight4 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 5) * weight5; + sums3 += load_ar30!(src_ptr3, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 2) * weight2 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 3) * weight3 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 4) * weight4 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 5) * weight5; + } else { + let src_ptr0 = &src[px..(px + bounds_size)]; + let 
src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds_size)];
+            let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + bounds_size)];
+            let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + bounds_size)];
+
+            for ((((&k_weight, src0), src1), src2), src3) in weights
+                .iter()
+                .zip(src_ptr0.iter())
+                .zip(src_ptr1.iter())
+                .zip(src_ptr2.iter())
+                .zip(src_ptr3.iter())
+                .take(bounds.size)
+            {
+                let weight: i32 = k_weight as i32;
+
+                let new_px0 = load_ar30_p!(src0, AR30_TYPE, AR30_ORDER);
+                let new_px1 = load_ar30_p!(src1, AR30_TYPE, AR30_ORDER);
+                let new_px2 = load_ar30_p!(src2, AR30_TYPE, AR30_ORDER);
+                let new_px3 = load_ar30_p!(src3, AR30_TYPE, AR30_ORDER);
+
+                sums0 += new_px0 * weight;
+                sums1 += new_px1 * weight;
+                sums2 += new_px2 * weight;
+                sums3 += new_px3 * weight;
+            }
+        }
+
+        let narrowed0 = sums0.saturate_ar30();
+        let narrowed1 = sums1.saturate_ar30();
+        let narrowed2 = sums2.saturate_ar30();
+        let narrowed3 = sums3.saturate_ar30();
+
+        *chunk0 = narrowed0.to_ar30::<AR30_TYPE, AR30_ORDER>();
+        *chunk1 = narrowed1.to_ar30::<AR30_TYPE, AR30_ORDER>();
+        *chunk2 = narrowed2.to_ar30::<AR30_TYPE, AR30_ORDER>();
+        *chunk3 = narrowed3.to_ar30::<AR30_TYPE, AR30_ORDER>();
+    }
+}
diff --git a/src/fixed_point_vertical_ar30.rs b/src/fixed_point_vertical_ar30.rs
new file mode 100644
index 0000000..04386b7
--- /dev/null
+++ b/src/fixed_point_vertical_ar30.rs
@@ -0,0 +1,310 @@
+/*
+ * Copyright (c) Radzivon Bartoshyk, 10/2024. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ +use crate::color_group::{load_ar30_p, ColorGroup}; +use crate::filter_weights::FilterBounds; +use crate::support::ROUNDING_CONST; + +#[inline(always)] +/// # Generics +/// `T` - template buffer type +/// `J` - accumulator type +fn convolve_column_handler_fixed_point_direct_buffer< + const AR30_TYPE: usize, + const AR30_ORDER: usize, + const BUFFER_SIZE: usize, +>( + src: &[u32], + src_stride: usize, + dst: &mut [u32], + filter: &[i16], + bounds: &FilterBounds, + x: usize, +) { + if filter.is_empty() { + return; + } + let mut direct_store: [ColorGroup<4, i32>; BUFFER_SIZE] = + [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE]; + + let v_start_px = x; + + let py = bounds.start; + let weight = filter[0] as i32; + let offset = src_stride * py + v_start_px; + let src_ptr = &src[offset..(offset + BUFFER_SIZE)]; + + for (dst, src) in direct_store.iter_mut().zip(src_ptr) { + *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; + } + + for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() { + // Adding 1 is necessary because skip do not incrementing value on values that skipped + let py = bounds.start + j + 1; + let weight = k_weight as i32; + let offset = src_stride * py + v_start_px; + let src_ptr = &src[offset..(offset + BUFFER_SIZE)]; + + for (dst, src) in direct_store.iter_mut().zip(src_ptr.iter()) { + *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; + } + } + + let v_dst = &mut dst[v_start_px..(v_start_px + BUFFER_SIZE)]; + for (dst, src) in v_dst.iter_mut().zip(direct_store) { + let saturated = src.saturate_ar30().to_ar30::(); + *dst = saturated; + } +} + +#[inline(always)] +/// # Generics +/// `T` - template buffer type +/// `J` - accumulator type +fn convolve_column_handler_fixed_point_direct_buffer_double< + const AR30_TYPE: usize, + const AR30_ORDER: usize, + const BUFFER_SIZE: usize, +>( + src: &[u32], + src_stride: usize, + dst: &mut [u32], + filter: &[i16], + bounds: &FilterBounds, + x: usize, +) { + if filter.is_empty() { + return; + } + let mut direct_store0: [ColorGroup<4, i32>; BUFFER_SIZE] = + [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE]; + let mut direct_store1: [ColorGroup<4, i32>; BUFFER_SIZE] = + [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE]; + + let v_start_px = x; + + let py = bounds.start; + let weight = filter[0] as i32; + let offset = src_stride * py + v_start_px; + let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)]; + let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)]; + + for (dst, src) in direct_store0.iter_mut().zip(src_ptr0) { + *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; + } + + for (dst, src) in direct_store1.iter_mut().zip(src_ptr1) { + *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; + } + + for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() { + // Adding 1 is necessary because skip do not incrementing value on values that skipped + let py = bounds.start + j + 1; + let weight = k_weight as i32; + let offset = src_stride * py + v_start_px; + let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)]; + let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)]; + + for (dst, src) in direct_store0.iter_mut().zip(src_ptr0.iter()) { + *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; + } + for (dst, src) in direct_store1.iter_mut().zip(src_ptr1.iter()) { + *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; + } + } + + let v_dst0 = &mut dst[v_start_px..(v_start_px + BUFFER_SIZE)]; + for (dst, src) in 
v_dst0.iter_mut().zip(direct_store0) { + let saturated = src.saturate_ar30().to_ar30::(); + *dst = saturated; + } + + let v_dst1 = &mut dst[(v_start_px + BUFFER_SIZE)..(v_start_px + BUFFER_SIZE * 2)]; + for (dst, src) in v_dst1.iter_mut().zip(direct_store1) { + let saturated = src.saturate_ar30().to_ar30::(); + *dst = saturated; + } +} + +#[inline(always)] +/// # Generics +/// `T` - template buffer type +/// `J` - accumulator type +fn convolve_column_handler_fixed_point_direct_buffer_four< + const AR30_TYPE: usize, + const AR30_ORDER: usize, + const BUFFER_SIZE: usize, +>( + src: &[u32], + src_stride: usize, + dst: &mut [u32], + filter: &[i16], + bounds: &FilterBounds, + x: usize, +) { + if filter.is_empty() { + return; + } + let mut direct_store0: [ColorGroup<4, i32>; BUFFER_SIZE] = + [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE]; + let mut direct_store1: [ColorGroup<4, i32>; BUFFER_SIZE] = + [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE]; + let mut direct_store2: [ColorGroup<4, i32>; BUFFER_SIZE] = + [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE]; + let mut direct_store3: [ColorGroup<4, i32>; BUFFER_SIZE] = + [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE]; + + let v_start_px = x; + + let py = bounds.start; + let weight = filter[0] as i32; + let offset = src_stride * py + v_start_px; + let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)]; + let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)]; + let src_ptr2 = &src[(offset + BUFFER_SIZE * 2)..(offset + BUFFER_SIZE * 3)]; + let src_ptr3 = &src[(offset + BUFFER_SIZE * 3)..(offset + BUFFER_SIZE * 4)]; + + for (dst, src) in direct_store0.iter_mut().zip(src_ptr0) { + *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; + } + + for (dst, src) in direct_store1.iter_mut().zip(src_ptr1) { + *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; + } + + for (dst, src) in direct_store2.iter_mut().zip(src_ptr2) { + *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; + } + + for (dst, src) in direct_store3.iter_mut().zip(src_ptr3) { + *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; + } + + for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() { + // Adding 1 is necessary because skip do not incrementing value on values that skipped + let py = bounds.start + j + 1; + let weight = k_weight as i32; + let offset = src_stride * py + v_start_px; + let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)]; + let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)]; + let src_ptr2 = &src[(offset + BUFFER_SIZE * 2)..(offset + BUFFER_SIZE * 3)]; + let src_ptr3 = &src[(offset + BUFFER_SIZE * 3)..(offset + BUFFER_SIZE * 4)]; + + for (dst, src) in direct_store0.iter_mut().zip(src_ptr0.iter()) { + *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; + } + for (dst, src) in direct_store1.iter_mut().zip(src_ptr1.iter()) { + *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; + } + for (dst, src) in direct_store2.iter_mut().zip(src_ptr2.iter()) { + *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; + } + for (dst, src) in direct_store3.iter_mut().zip(src_ptr3.iter()) { + *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; + } + } + + let v_dst0 = &mut dst[v_start_px..(v_start_px + BUFFER_SIZE)]; + for (dst, src) in v_dst0.iter_mut().zip(direct_store0) { + let saturated = src.saturate_ar30().to_ar30::(); + *dst = saturated; + } + + let v_dst1 = &mut dst[(v_start_px + BUFFER_SIZE)..(v_start_px + BUFFER_SIZE * 2)]; + for (dst, 
+#[inline(always)]
+/// # Generics
+/// `AR30_TYPE` - RGB30 pixel layout (AR30 or RA30)
+/// `AR30_ORDER` - pixel byte order (host or network)
+fn convolve_column_handler_fixed_point_direct_buffer_four<
+    const AR30_TYPE: usize,
+    const AR30_ORDER: usize,
+    const BUFFER_SIZE: usize,
+>(
+    src: &[u32],
+    src_stride: usize,
+    dst: &mut [u32],
+    filter: &[i16],
+    bounds: &FilterBounds,
+    x: usize,
+) {
+    if filter.is_empty() {
+        return;
+    }
+    let mut direct_store0: [ColorGroup<4, i32>; BUFFER_SIZE] =
+        [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE];
+    let mut direct_store1: [ColorGroup<4, i32>; BUFFER_SIZE] =
+        [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE];
+    let mut direct_store2: [ColorGroup<4, i32>; BUFFER_SIZE] =
+        [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE];
+    let mut direct_store3: [ColorGroup<4, i32>; BUFFER_SIZE] =
+        [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE];
+
+    let v_start_px = x;
+
+    let py = bounds.start;
+    let weight = filter[0] as i32;
+    let offset = src_stride * py + v_start_px;
+    let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)];
+    let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)];
+    let src_ptr2 = &src[(offset + BUFFER_SIZE * 2)..(offset + BUFFER_SIZE * 3)];
+    let src_ptr3 = &src[(offset + BUFFER_SIZE * 3)..(offset + BUFFER_SIZE * 4)];
+
+    for (dst, src) in direct_store0.iter_mut().zip(src_ptr0) {
+        *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+    }
+
+    for (dst, src) in direct_store1.iter_mut().zip(src_ptr1) {
+        *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+    }
+
+    for (dst, src) in direct_store2.iter_mut().zip(src_ptr2) {
+        *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+    }
+
+    for (dst, src) in direct_store3.iter_mut().zip(src_ptr3) {
+        *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+    }
+
+    for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() {
+        // Add 1 because `enumerate` starts at 0 even though `skip(1)` already consumed the first weight.
+        let py = bounds.start + j + 1;
+        let weight = k_weight as i32;
+        let offset = src_stride * py + v_start_px;
+        let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)];
+        let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)];
+        let src_ptr2 = &src[(offset + BUFFER_SIZE * 2)..(offset + BUFFER_SIZE * 3)];
+        let src_ptr3 = &src[(offset + BUFFER_SIZE * 3)..(offset + BUFFER_SIZE * 4)];
+
+        for (dst, src) in direct_store0.iter_mut().zip(src_ptr0.iter()) {
+            *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+        }
+        for (dst, src) in direct_store1.iter_mut().zip(src_ptr1.iter()) {
+            *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+        }
+        for (dst, src) in direct_store2.iter_mut().zip(src_ptr2.iter()) {
+            *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+        }
+        for (dst, src) in direct_store3.iter_mut().zip(src_ptr3.iter()) {
+            *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+        }
+    }
+
+    let v_dst0 = &mut dst[v_start_px..(v_start_px + BUFFER_SIZE)];
+    for (dst, src) in v_dst0.iter_mut().zip(direct_store0) {
+        let saturated = src.saturate_ar30().to_ar30::<AR30_TYPE, AR30_ORDER>();
+        *dst = saturated;
+    }
+
+    let v_dst1 = &mut dst[(v_start_px + BUFFER_SIZE)..(v_start_px + BUFFER_SIZE * 2)];
+    for (dst, src) in v_dst1.iter_mut().zip(direct_store1) {
+        let saturated = src.saturate_ar30().to_ar30::<AR30_TYPE, AR30_ORDER>();
+        *dst = saturated;
+    }
+
+    let v_dst2 = &mut dst[(v_start_px + BUFFER_SIZE * 2)..(v_start_px + BUFFER_SIZE * 3)];
+    for (dst, src) in v_dst2.iter_mut().zip(direct_store2) {
+        let saturated = src.saturate_ar30().to_ar30::<AR30_TYPE, AR30_ORDER>();
+        *dst = saturated;
+    }
+
+    let v_dst3 = &mut dst[(v_start_px + BUFFER_SIZE * 3)..(v_start_px + BUFFER_SIZE * 4)];
+    for (dst, src) in v_dst3.iter_mut().zip(direct_store3) {
+        let saturated = src.saturate_ar30().to_ar30::<AR30_TYPE, AR30_ORDER>();
+        *dst = saturated;
+    }
+}
+
+/// # Generics
+/// `AR30_TYPE` - RGB30 pixel layout (AR30 or RA30)
+/// `AR30_ORDER` - pixel byte order (host or network)
+pub(crate) fn column_handler_fixed_point_ar30<const AR30_TYPE: usize, const AR30_ORDER: usize>(
+    bounds: &FilterBounds,
+    src: &[u32],
+    dst: &mut [u32],
+    src_stride: usize,
+    weight: &[i16],
+) {
+    let mut cx = 0usize;
+
+    let total_width = dst.len();
+
+    while cx + 64 < total_width {
+        convolve_column_handler_fixed_point_direct_buffer_four::<AR30_TYPE, AR30_ORDER, 16>(
+            src, src_stride, dst, weight, bounds, cx,
+        );
+
+        cx += 64;
+    }
+
+    while cx + 32 < total_width {
+        convolve_column_handler_fixed_point_direct_buffer_double::<AR30_TYPE, AR30_ORDER, 16>(
+            src, src_stride, dst, weight, bounds, cx,
+        );
+
+        cx += 32;
+    }
+
+    while cx + 16 < total_width {
+        convolve_column_handler_fixed_point_direct_buffer::<AR30_TYPE, AR30_ORDER, 16>(
+            src, src_stride, dst, weight, bounds, cx,
+        );
+
+        cx += 16;
+    }
+
+    while cx + 8 < total_width {
+        convolve_column_handler_fixed_point_direct_buffer::<AR30_TYPE, AR30_ORDER, 8>(
+            src, src_stride, dst, weight, bounds, cx,
+        );
+
+        cx += 8;
+    }
+
+    while cx < total_width {
+        convolve_column_handler_fixed_point_direct_buffer::<AR30_TYPE, AR30_ORDER, 1>(
+            src, src_stride, dst, weight, bounds, cx,
+        );
+
+        cx += 1;
+    }
+}
diff --git a/src/floating_point_horizontal.rs b/src/floating_point_horizontal.rs
index aa7fd99..9fc80c4 100644
--- a/src/floating_point_horizontal.rs
+++ b/src/floating_point_horizontal.rs
@@ -27,8 +27,7 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ use crate::color_group::{ - fast_load_color_group, fast_load_color_group_with_offset, fast_mixed_store_color_group, - ColorGroup, + fast_mixed_store_color_group, load_color_group, load_color_group_with_offset, ColorGroup, }; use crate::filter_weights::FilterWeights; use crate::mixed_storage::MixedStorage; @@ -83,8 +82,8 @@ pub(crate) fn convolve_row_handler_floating_point< let sliced_weights = &weights[0..2]; let weight0 = sliced_weights[0].as_(); let weight1 = sliced_weights[1].as_(); - sums = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0).mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + sums = (load_color_group!(src_ptr0, CHANNELS, J) * weight0).mul_add( + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), weight1, ); } else if bounds_size == 3 { @@ -94,13 +93,13 @@ pub(crate) fn convolve_row_handler_floating_point< let weight0 = sliced_weights[0].as_(); let weight1 = sliced_weights[1].as_(); let weight2 = sliced_weights[2].as_(); - sums = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + sums = (load_color_group!(src_ptr0, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), weight2, ); } else if bounds_size == 4 { @@ -111,17 +110,17 @@ pub(crate) fn convolve_row_handler_floating_point< let weight1 = sliced_weights[1].as_(); let weight2 = sliced_weights[2].as_(); let weight3 = sliced_weights[3].as_(); - sums = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + sums = (load_color_group!(src_ptr0, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), weight3, ); } else if bounds_size == 6 { @@ -134,25 +133,25 @@ pub(crate) fn convolve_row_handler_floating_point< let weight3 = sliced_weights[3].as_(); let weight4 = sliced_weights[4].as_(); let weight5 = sliced_weights[5].as_(); - sums = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + sums = (load_color_group!(src_ptr0, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), weight3, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J), weight4, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J), weight5, ); } else { @@ -163,7 +162,7 @@ pub(crate) fn convolve_row_handler_floating_point< 
.take(bounds.size) { let weight: J = k_weight.as_(); - let new_px = fast_load_color_group!(src, CHANNELS, J); + let new_px = load_color_group!(src, CHANNELS, J); sums = sums.mul_add(new_px, weight); } } @@ -239,20 +238,20 @@ pub(crate) fn convolve_row_handler_floating_point_4< let sliced_weights = &weights[0..2]; let weight0 = sliced_weights[0].as_(); let weight1 = sliced_weights[1].as_(); - sums0 = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0).mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + sums0 = (load_color_group!(src_ptr0, CHANNELS, J) * weight0).mul_add( + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), weight1, ); - sums1 = (fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0).mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + sums1 = (load_color_group!(src_ptr1, CHANNELS, J) * weight0).mul_add( + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), weight1, ); - sums2 = (fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0).mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + sums2 = (load_color_group!(src_ptr2, CHANNELS, J) * weight0).mul_add( + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), weight1, ); - sums3 = (fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0).mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + sums3 = (load_color_group!(src_ptr3, CHANNELS, J) * weight0).mul_add( + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), weight1, ); } else if bounds_size == 3 { @@ -265,40 +264,40 @@ pub(crate) fn convolve_row_handler_floating_point_4< let weight0 = sliced_weights[0].as_(); let weight1 = sliced_weights[1].as_(); let weight2 = sliced_weights[2].as_(); - sums0 = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + sums0 = (load_color_group!(src_ptr0, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), weight2, ); - sums1 = (fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0) + sums1 = (load_color_group!(src_ptr1, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), weight2, ); - sums2 = (fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0) + sums2 = (load_color_group!(src_ptr2, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), weight2, ); - sums3 = (fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0) + sums3 = (load_color_group!(src_ptr3, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + 
load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), weight2, ); } else if bounds_size == 4 { @@ -312,56 +311,56 @@ pub(crate) fn convolve_row_handler_floating_point_4< let weight1 = sliced_weights[1].as_(); let weight2 = sliced_weights[2].as_(); let weight3 = sliced_weights[3].as_(); - sums0 = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + sums0 = (load_color_group!(src_ptr0, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), weight3, ); - sums1 = (fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0) + sums1 = (load_color_group!(src_ptr1, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), weight3, ); - sums2 = (fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0) + sums2 = (load_color_group!(src_ptr2, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), weight3, ); - sums3 = (fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0) + sums3 = (load_color_group!(src_ptr3, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), weight3, ); } else if bounds_size == 6 { @@ -377,88 +376,88 @@ pub(crate) fn convolve_row_handler_floating_point_4< let weight3 = sliced_weights[3].as_(); let weight4 = sliced_weights[4].as_(); let weight5 = sliced_weights[5].as_(); - sums0 = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + sums0 = (load_color_group!(src_ptr0, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), + 
load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), weight3, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J), weight4, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J), weight5, ); - sums1 = (fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0) + sums1 = (load_color_group!(src_ptr1, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), weight3, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 4, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 4, J), weight4, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 5, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 5, J), weight5, ); - sums2 = (fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0) + sums2 = (load_color_group!(src_ptr2, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), weight3, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 4, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 4, J), weight4, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 5, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 5, J), weight5, ); - sums3 = (fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0) + sums3 = (load_color_group!(src_ptr3, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), weight3, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 4, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 4, J), weight4, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 5, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 5, J), weight5, ); } else { @@ -479,10 +478,10 @@ pub(crate) fn convolve_row_handler_floating_point_4< { let weight: J = k_weight.as_(); - let new_px0 = fast_load_color_group!(src0, CHANNELS, J); - let new_px1 = fast_load_color_group!(src1, CHANNELS, J); - let new_px2 = fast_load_color_group!(src2, 
CHANNELS, J); - let new_px3 = fast_load_color_group!(src3, CHANNELS, J); + let new_px0 = load_color_group!(src0, CHANNELS, J); + let new_px1 = load_color_group!(src1, CHANNELS, J); + let new_px2 = load_color_group!(src2, CHANNELS, J); + let new_px3 = load_color_group!(src3, CHANNELS, J); sums0 = sums0.mul_add(new_px0, weight); sums1 = sums1.mul_add(new_px1, weight); diff --git a/src/floating_point_vertical.rs b/src/floating_point_vertical.rs index 4d82044..9f38b38 100644 --- a/src/floating_point_vertical.rs +++ b/src/floating_point_vertical.rs @@ -27,8 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ use crate::color_group::{ - fast_load_color_group, fast_load_color_group_with_offset, fast_mixed_store_color_group, - ColorGroup, + fast_mixed_store_color_group, load_color_group, load_color_group_with_offset, ColorGroup, }; use crate::filter_weights::FilterBounds; use crate::mixed_storage::MixedStorage; @@ -83,26 +82,23 @@ pub(crate) fn convolve_column_handler_floating_point_4< let src_ptr0 = &src[offset0..(offset0 + CHANNELS * 4)]; let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; - sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + sums0 = (load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0).mul_add( + load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ); + sums1 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), weight1, ); - sums1 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) + sums2 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), weight1, ); - sums2 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight0) + sums3 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), - weight1, - ); - sums3 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) - * weight0) - .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), weight1, ); } else if bounds_size == 3 { @@ -117,45 +113,43 @@ pub(crate) fn convolve_column_handler_floating_point_4< let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; let src_ptr2 = &src[offset2..(offset2 + CHANNELS * 4)]; - sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + sums0 = (load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), weight2, ); - sums1 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) + sums1 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - 
fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), weight2, ); - sums2 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight0) + sums2 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), weight2, ); - sums3 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) - * weight0) + sums3 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), weight2, ); } else if bounds_size == 4 { @@ -173,61 +167,59 @@ pub(crate) fn convolve_column_handler_floating_point_4< let src_ptr2 = &src[offset2..(offset2 + CHANNELS * 4)]; let src_ptr3 = &src[offset3..(offset3 + CHANNELS * 4)]; - sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + sums0 = (load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), weight3, ); - sums1 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) + sums1 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), weight3, ); - sums2 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight0) + sums2 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), weight3, ); - sums3 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) - * weight0) + sums3 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight0) .mul_add( - 
fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), weight3, ); } else if bounds_size == 6 { @@ -251,93 +243,91 @@ pub(crate) fn convolve_column_handler_floating_point_4< let src_ptr4 = &src[offset4..(offset4 + CHANNELS * 4)]; let src_ptr5 = &src[offset5..(offset5 + CHANNELS * 4)]; - sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + sums0 = (load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), weight3, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr4, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr4, CHANNELS, 0, J), weight4, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr5, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr5, CHANNELS, 0, J), weight5, ); - sums1 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) + sums1 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), weight3, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr4, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr4, CHANNELS, CHANNELS, J), weight4, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr5, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr5, CHANNELS, CHANNELS, J), weight5, ); - sums2 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight0) + sums2 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), weight3, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr4, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr4, CHANNELS, CHANNELS * 2, J), weight4, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr5, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr5, CHANNELS, CHANNELS * 2, 
J), weight5, ); - sums3 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) - * weight0) + sums3 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), weight3, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr4, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr4, CHANNELS, CHANNELS * 3, J), weight4, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr5, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr5, CHANNELS, CHANNELS * 3, J), weight5, ); } else { @@ -347,12 +337,10 @@ pub(crate) fn convolve_column_handler_floating_point_4< let offset = src_stride * py + v_start_px; let src_ptr = &src[offset..(offset + CHANNELS * 4)]; - let new_px0 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, 0, J); - let new_px1 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS, J); - let new_px2 = - fast_load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS * 2, J); - let new_px3 = - fast_load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS * 3, J); + let new_px0 = load_color_group_with_offset!(src_ptr, CHANNELS, 0, J); + let new_px1 = load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS, J); + let new_px2 = load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS * 2, J); + let new_px3 = load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS * 3, J); sums0 = sums0.mul_add(new_px0, weight); sums1 = sums1.mul_add(new_px1, weight); @@ -435,11 +423,10 @@ pub(crate) fn convolve_column_handler_floating_point< let src_ptr0 = &src[offset0..(offset0 + CHANNELS)]; let src_ptr1 = &src[offset1..(offset1 + CHANNELS)]; - sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) - .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), - weight1, - ); + sums0 = (load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0).mul_add( + load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ); } else if bounds_size == 3 { let weights = &filter[0..3]; let weight0 = weights[0].as_(); @@ -452,13 +439,13 @@ pub(crate) fn convolve_column_handler_floating_point< let src_ptr1 = &src[offset1..(offset1 + CHANNELS)]; let src_ptr2 = &src[offset2..(offset2 + CHANNELS)]; - sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + sums0 = (load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), weight2, ); } else if bounds_size == 4 { @@ -476,17 +463,17 @@ pub(crate) fn convolve_column_handler_floating_point< let src_ptr2 = &src[offset2..(offset2 + CHANNELS)]; let src_ptr3 = &src[offset3..(offset3 + CHANNELS)]; - sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + sums0 = (load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * 
weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), weight3, ); } else if bounds_size == 6 { @@ -510,25 +497,25 @@ pub(crate) fn convolve_column_handler_floating_point< let src_ptr4 = &src[offset4..(offset4 + CHANNELS)]; let src_ptr5 = &src[offset5..(offset5 + CHANNELS)]; - sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + sums0 = (load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), weight3, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr4, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr4, CHANNELS, 0, J), weight4, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr5, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr5, CHANNELS, 0, J), weight5, ); } else { @@ -538,7 +525,7 @@ pub(crate) fn convolve_column_handler_floating_point< let offset = src_stride * py + v_start_px; let src_ptr = &src[offset..(offset + CHANNELS)]; - let new_px0 = fast_load_color_group!(src_ptr, CHANNELS, J); + let new_px0 = load_color_group!(src_ptr, CHANNELS, J); sums0 = sums0.mul_add(new_px0, weight); } diff --git a/src/lib.rs b/src/lib.rs index 7e41574..724fa48 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,7 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #![deny(deprecated)] -#![deny(unreachable_code, unused)] +// #![deny(unreachable_code, unused)] #![allow(clippy::too_many_arguments)] mod alpha_check; #[cfg(feature = "half")] @@ -35,6 +35,7 @@ mod alpha_handle_f16; mod alpha_handle_f32; mod alpha_handle_u16; mod alpha_handle_u8; +mod ar30; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] mod avx2; mod color_group; @@ -44,6 +45,7 @@ mod convolution; mod convolve_naive_f32; mod convolve_naive_u16; mod cpu_features; +mod dispatch_group_ar30; #[cfg(feature = "half")] mod dispatch_group_f16; mod dispatch_group_f32; @@ -53,7 +55,9 @@ mod dispatch_group_u8; mod f16; mod filter_weights; mod fixed_point_horizontal; +mod fixed_point_horizontal_ar30; mod fixed_point_vertical; +mod fixed_point_vertical_ar30; mod floating_point_horizontal; mod floating_point_vertical; mod handler_provider; @@ -69,6 +73,7 @@ mod pic_scale_error; mod plane_f32; mod plane_u16; mod plane_u8; +mod resize_ar30; mod rgb_f32; mod rgb_u16; mod rgb_u8; @@ -88,6 +93,7 @@ mod unsafe_slice; #[cfg(all(target_arch = "wasm32", target_feature = "simd128",))] mod wasm32; +pub use ar30::Ar30ByteOrder; #[cfg(feature = "colorspaces")] pub use colors::*; #[cfg(feature = "colorspaces")] diff --git a/src/resize_ar30.rs b/src/resize_ar30.rs new file mode 100644 index 0000000..cf4c36f --- /dev/null +++ b/src/resize_ar30.rs @@ -0,0 +1,115 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+use crate::dispatch_group_ar30::{
+    convolve_horizontal_dispatch_ar30, convolve_vertical_dispatch_ar30,
+};
+use crate::nearest_sampler::resize_nearest;
+use crate::pic_scale_error::PicScaleError;
+use crate::support::check_image_size_overflow;
+use crate::{ImageSize, ResamplingFunction, Scaler};
+
+pub(crate) fn resize_ar30_impl<const AR30_TYPE: usize, const AR30_ORDER: usize>(
+    src: &[u32],
+    src_size: ImageSize,
+    dst: &mut [u32],
+    dst_size: ImageSize,
+    scaler: &Scaler,
+) -> Result<(), PicScaleError> {
+    if src_size.width == 0 || src_size.height == 0 || dst_size.width == 0 || dst_size.height == 0 {
+        return Err(PicScaleError::ZeroImageDimensions);
+    }
+
+    if check_image_size_overflow(src_size.width, src_size.height, 1) {
+        return Err(PicScaleError::SourceImageIsTooLarge);
+    }
+
+    if check_image_size_overflow(dst_size.width, dst_size.height, 1) {
+        return Err(PicScaleError::DestinationImageIsTooLarge);
+    }
+
+    if src_size.width == dst_size.width && src_size.height == dst_size.height {
+        for (src, dst) in src.iter().zip(dst.iter_mut()) {
+            *dst = *src;
+        }
+        return Ok(());
+    }
+
+    let pool = scaler
+        .threading_policy
+        .get_pool(ImageSize::new(dst_size.width, dst_size.height));
+
+    if scaler.function == ResamplingFunction::Nearest {
+        resize_nearest::<u32, 1>(
+            src,
+            src_size.width,
+            src_size.height,
+            dst,
+            dst_size.width,
+            dst_size.height,
+            &pool,
+        );
+        return Ok(());
+    }
+
+    let should_do_horizontal = src_size.width != dst_size.width;
+    let should_do_vertical = src_size.height != dst_size.height;
+    assert!(should_do_horizontal || should_do_vertical);
+
+    let working_store = if should_do_vertical {
+        let mut target = vec![0u32; src_size.width * dst_size.height];
+
+        let vertical_filters = scaler.generate_weights(src_size.height, dst_size.height);
+        convolve_vertical_dispatch_ar30::<AR30_TYPE, AR30_ORDER>(
+            src,
+            src_size.width,
+            vertical_filters,
+            &mut target,
+            src_size.width,
+            &pool,
+        );
+
+        std::borrow::Cow::Owned(target)
+    } else {
+        std::borrow::Cow::Borrowed(src)
+    };
+
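+    // The horizontal pass consumes `working_store`: either the vertically
+    // convolved intermediate (Cow::Owned) or the untouched source (Cow::Borrowed).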
+    if should_do_horizontal {
+        let horizontal_filters = scaler.generate_weights(src_size.width, dst_size.width);
+        convolve_horizontal_dispatch_ar30::<AR30_TYPE, AR30_ORDER>(
+            working_store.as_ref(),
+            src_size.width,
+            horizontal_filters,
+            dst,
+            dst_size.width,
+            &pool,
+        );
+    }
+
+    Ok(())
+}
diff --git a/src/scaler.rs b/src/scaler.rs
index 2f57e69..30011b6 100644
--- a/src/scaler.rs
+++ b/src/scaler.rs
@@ -30,12 +30,14 @@ use crate::alpha_check::{
     has_non_constant_cap_alpha_rgba16, has_non_constant_cap_alpha_rgba8,
     has_non_constant_cap_alpha_rgba_f32,
 };
+use crate::ar30::{Ar30ByteOrder, Rgb30};
 use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass};
 use crate::filter_weights::{FilterBounds, FilterWeights};
 use crate::image_size::ImageSize;
 use crate::image_store::ImageStore;
 use crate::nearest_sampler::resize_nearest;
 use crate::pic_scale_error::PicScaleError;
+use crate::resize_ar30::resize_ar30_impl;
 use crate::support::check_image_size_overflow;
 use crate::threading_policy::ThreadingPolicy;
 use crate::{ConstPI, ConstSqrt2, Jinc, ResamplingFunction};
@@ -1148,3 +1150,61 @@ impl ScalingU16 for Scaler {
         Ok(src_store)
     }
 }
+
+impl Scaler {
+    /// Resizes an ARGB2101010 (AR30) image
+    ///
+    /// # Arguments
+    /// `src` - source slice
+    /// `src_size` - Source image size
+    /// `dst` - destination slice
+    /// `new_size` - New image size
+    ///
+    pub fn resize_ar30(
+        &self,
+        src: &[u32],
+        src_size: ImageSize,
+        dst: &mut [u32],
+        new_size: ImageSize,
+        order: Ar30ByteOrder,
+    ) -> Result<(), PicScaleError> {
+        match order {
+            Ar30ByteOrder::Host => resize_ar30_impl::<
+                { Rgb30::Ar30 as usize },
+                { Ar30ByteOrder::Host as usize },
+            >(src, src_size, dst, new_size, self),
+            Ar30ByteOrder::Network => resize_ar30_impl::<
+                { Rgb30::Ar30 as usize },
+                { Ar30ByteOrder::Network as usize },
+            >(src, src_size, dst, new_size, self),
+        }
+    }
+
+    /// Resizes an RGBA1010102 (RA30) image
+    ///
+    /// # Arguments
+    /// `src` - source slice
+    /// `src_size` - Source image size
+    /// `dst` - destination slice
+    /// `new_size` - New image size
+    ///
+    pub fn resize_ra30(
+        &self,
+        src: &[u32],
+        src_size: ImageSize,
+        dst: &mut [u32],
+        new_size: ImageSize,
+        order: Ar30ByteOrder,
+    ) -> Result<(), PicScaleError> {
+        match order {
+            Ar30ByteOrder::Host => resize_ar30_impl::<
+                { Rgb30::Ra30 as usize },
+                { Ar30ByteOrder::Host as usize },
+            >(src, src_size, dst, new_size, self),
+            Ar30ByteOrder::Network => resize_ar30_impl::<
+                { Rgb30::Ra30 as usize },
+                { Ar30ByteOrder::Network as usize },
+            >(src, src_size, dst, new_size, self),
+        }
+    }
+}

From 68b90c80df9920525e7555901d7f2c94d639fe8e Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk
Date: Thu, 21 Nov 2024 19:02:06 +0000
Subject: [PATCH 05/19] Added AR30/RA30

---
 Cargo.lock                       | 173 ++++++----------
 Cargo.toml                       |   2 +-
 README.md                        |   1 +
 app/src/main.rs                  |  14 +-
 src/dispatch_group_ar30.rs       |  54 +++--
 src/fixed_point_vertical_ar30.rs |  11 +-
 src/neon/ar30.rs                 | 332 +++++++++++++++++++++++++++++++
 src/neon/horizontal_ar30.rs      | 327 ++++++++++++++++++++++++++++++
 src/neon/mod.rs                  |   5 +
 src/neon/vertical_ar30.rs        | 176 ++++++++++++++++
 src/resize_ar30.rs               |  13 ++
 11 files changed, 969 insertions(+), 139 deletions(-)
 create mode 100644 src/neon/ar30.rs
 create mode 100644 src/neon/horizontal_ar30.rs
 create mode 100644 src/neon/vertical_ar30.rs

diff --git a/Cargo.lock b/Cargo.lock
index 11e13f9..1d04751 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,12 +2,6 @@
 # It is not intended for manual editing.
version = 4 -[[package]] -name = "adler" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" - [[package]] name = "adler2" version = "2.0.0" @@ -37,15 +31,15 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstyle" -version = "1.0.8" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" [[package]] name = "anyhow" -version = "1.0.90" +version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37bf3594c4c988a53154954629820791dde498571819ae4ca50ca811e060cc95" +checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" [[package]] name = "app" @@ -61,9 +55,9 @@ dependencies = [ [[package]] name = "arbitrary" -version = "1.3.2" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110" +checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223" [[package]] name = "arg_enum_proc_macro" @@ -125,9 +119,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitstream-io" -version = "2.5.3" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b81e1519b0d82120d2fd469d5bfb2919a9361c48b02d82d04befc1cdd2002452" +checksum = "6099cdc01846bc367c4e7dd630dc5966dccf36b652fae7a74e17b640411a91b2" [[package]] name = "built" @@ -143,9 +137,9 @@ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "bytemuck" -version = "1.19.0" +version = "1.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d" +checksum = "8b37c88a63ffd85d15b406896cc343916d7cf57838a847b3a6f2ca5d39a5695a" [[package]] name = "byteorder" @@ -167,9 +161,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.1.31" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2e7962b54006dcfcc61cb72735f4d89bb97061dd6a7ed882ec6b8ee53714c6f" +checksum = "fd9de9f2205d5ef3fd67e685b0df337994ddd4495e2a28d185500d0e1edfea47" dependencies = [ "jobserver", "libc", @@ -227,18 +221,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.20" +version = "4.5.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8" +checksum = "fb3b4b9e5a7c7514dfa52869339ee98b3156b0bfb4e8a77c4ff4babb64b1604f" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.20" +version = "4.5.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54" +checksum = "b17a95aa67cc7b5ebd32aa5370189aa0d79069ef1c64ce893bd30fb24bff20ec" dependencies = [ "anstyle", "clap_lex", @@ -246,9 +240,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" +checksum = 
"afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" [[package]] name = "color_quant" @@ -258,9 +252,9 @@ checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" [[package]] name = "colorutils-rs" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e1632390e4314e1ce4b4060fbbb36f0d576b994399088bfdfe4216eef0aa54b" +checksum = "31ca2cc8ed986672b15bfd3e416014e40cada05196bdfaa51168985f3c2e81f1" dependencies = [ "erydanos", "half", @@ -386,15 +380,14 @@ dependencies = [ [[package]] name = "exr" -version = "1.72.0" +version = "1.73.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "887d93f60543e9a9362ef8a21beedd0a833c5d9610e18c67abe15a5963dcb1a4" +checksum = "f83197f59927b46c04a183a619b7c29df34e63e63c7869320862268c0ef687e0" dependencies = [ "bit_field", - "flume", "half", "lebe", - "miniz_oxide 0.7.4", + "miniz_oxide", "rayon-core", "smallvec", "zune-inflate", @@ -414,30 +407,21 @@ dependencies = [ [[package]] name = "fdeflate" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8090f921a24b04994d9929e204f50b498a33ea6ba559ffaa05e04f7ee7fb5ab" +checksum = "07c6f4c64c1d33a3111c4466f7365ebdcc37c5bd1ea0d62aae2e3d722aacbedb" dependencies = [ "simd-adler32", ] [[package]] name = "flate2" -version = "1.0.34" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" +checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" dependencies = [ "crc32fast", - "miniz_oxide 0.8.0", -] - -[[package]] -name = "flume" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0e4dd2a88388a1f4ccc7c9ce104604dab68d9f408dc34cd45823d5a9069095" -dependencies = [ - "spin", + "miniz_oxide", ] [[package]] @@ -474,9 +458,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.0" +version = "0.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" +checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" [[package]] name = "heck" @@ -581,9 +565,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.11" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +checksum = "540654e97a3f4470a492cd30ff187bc95d89557a903a2bbf112e2fae98104ef2" [[package]] name = "jobserver" @@ -617,26 +601,25 @@ checksum = "03087c2bad5e1034e8cace5926dec053fb3790248370865f5117a7d0213354c8" [[package]] name = "libc" -version = "0.2.161" +version = "0.2.164" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" +checksum = "433bfe06b8c75da9b2e3fbea6e5329ff87748f0b144ef75306e674c3f6f7c13f" [[package]] name = "libfuzzer-sys" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7" +checksum = "9b9569d2f74e257076d8c6bfa73fb505b46b851e51ddaecc825944aa3bed17fa" dependencies = [ "arbitrary", "cc", - "once_cell", ] [[package]] name = "libm" -version = "0.2.8" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" [[package]] name = "litrs" @@ -644,16 +627,6 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4ce301924b7887e9d637144fdade93f9dfff9b60981d4ac161db09720d39aa5" -[[package]] -name = "lock_api" -version = "0.4.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" -dependencies = [ - "autocfg", - "scopeguard", -] - [[package]] name = "log" version = "0.4.22" @@ -697,15 +670,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" -[[package]] -name = "miniz_oxide" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" -dependencies = [ - "adler", -] - [[package]] name = "miniz_oxide" version = "0.8.0" @@ -809,7 +773,7 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pic-scale" -version = "0.3.6" +version = "0.3.7" dependencies = [ "colorutils-rs", "half", @@ -862,7 +826,7 @@ dependencies = [ "crc32fast", "fdeflate", "flate2", - "miniz_oxide 0.8.0", + "miniz_oxide", ] [[package]] @@ -876,9 +840,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.88" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c3a7fc5db1e57d5a779a352c8cdb57b29aa4c40cc69c3a68a7fedc815fbf2f9" +checksum = "307e3004becf10f5a6e0d59d20f3cd28231b0e0827a96cd3e0ce6d14bc1e4bb3" dependencies = [ "unicode-ident", ] @@ -1028,9 +992,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", @@ -1040,9 +1004,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ -1076,26 +1040,20 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "scopeguard" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" - [[package]] name = "serde" -version = "1.0.210" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" +checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.210" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" +checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" dependencies = [ "proc-macro2", "quote", @@ -1104,9 +1062,9 @@ dependencies = [ 
[[package]] name = "serde_json" -version = "1.0.132" +version = "1.0.133" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" +checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" dependencies = [ "itoa", "memchr", @@ -1150,20 +1108,11 @@ version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" -[[package]] -name = "spin" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" -dependencies = [ - "lock_api", -] - [[package]] name = "syn" -version = "2.0.81" +version = "2.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "198514704ca887dd5a1e408c6c6cdcba43672f9b4062e1b24aa34e74e6d7faae" +checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e" dependencies = [ "proc-macro2", "quote", @@ -1191,18 +1140,18 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "thiserror" -version = "1.0.64" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.64" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", @@ -1266,9 +1215,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" +checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" [[package]] name = "v_frame" @@ -1522,7 +1471,7 @@ dependencies = [ [[package]] name = "yuvutils-rs" -version = "0.5.3" +version = "0.5.5" dependencies = [ "num-traits", ] diff --git a/Cargo.toml b/Cargo.toml index 019bce5..5dc3d81 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ workspace = { members = ["app", "wasm"] } [package] name = "pic-scale" -version = "0.3.6" +version = "0.3.7" edition = "2021" description = "High performance image scaling" readme = "README.md" diff --git a/README.md b/README.md index 645438a..24ab676 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ Despite all implementation are fast, not all the paths are implemented using SIM | RGBA (f16) | x | x | x | - | | RGB (f16) | x | ~ | ~ | - | | Plane (f16) | ~ | ~ | ~ | - | +| AR30/RA30 | x | - | - | - | #### Features diff --git a/app/src/main.rs b/app/src/main.rs index fb0dd1f..2d5c918 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -14,10 +14,12 @@ use pic_scale::{ Ar30ByteOrder, ImageSize, ImageStore, LinearApproxScaler, ResamplingFunction, Scaler, Scaling, ScalingU16, ThreadingPolicy, }; -use yuvutils_rs::{ar30_to_rgba8, rgb8_to_ar30, rgba8_to_ar30, Rgb30ByteOrder}; +use yuvutils_rs::{ + ar30_to_rgba8, ra30_to_rgba8, rgb8_to_ar30, rgba8_to_ar30, rgba8_to_ra30, Rgb30ByteOrder, +}; fn main() { - test_fast_image(); + // test_fast_image(); let img = 
ImageReader::open("./assets/nasa-4928x3279-rgba.png")
         .unwrap()
         .decode()
@@ -26,11 +28,11 @@ fn main() {
     let transient = img.to_rgba8();
     let mut bytes = Vec::from(transient.as_bytes());
 
-    let mut scaler = Scaler::new(ResamplingFunction::Lanczos3);
+    let mut scaler = Scaler::new(ResamplingFunction::Bilinear);
     scaler.set_threading_policy(ThreadingPolicy::Single);
 
     let mut ar30_src = vec![0u32; dimensions.0 as usize * dimensions.1 as usize];
-    rgba8_to_ar30(
+    rgba8_to_ra30(
         &mut ar30_src,
         dimensions.0,
         Rgb30ByteOrder::Host,
@@ -51,7 +53,7 @@ fn main() {
     let mut resized_ar = vec![0u32; dst_size.width * dst_size.height];
     let start_time = Instant::now();
     scaler
-        .resize_ar30(
+        .resize_ra30(
             &ar30_src,
             ImageSize::new(dimensions.0 as usize, dimensions.1 as usize),
             &mut resized_ar,
@@ -73,7 +75,7 @@ fn main() {
     println!("Scaler: {:.2?}", elapsed_time);
 
     let mut resized = vec![0u8; dst_size.width * dst_size.height * 4];
-    ar30_to_rgba8(
+    ra30_to_rgba8(
         &resized_ar,
         dst_size.width as u32,
         Rgb30ByteOrder::Host,
diff --git a/src/dispatch_group_ar30.rs b/src/dispatch_group_ar30.rs
index 4095c5c..95de42b 100644
--- a/src/dispatch_group_ar30.rs
+++ b/src/dispatch_group_ar30.rs
@@ -27,11 +27,15 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-use crate::filter_weights::FilterWeights;
+use crate::filter_weights::{FilterBounds, FilterWeights};
 use crate::fixed_point_horizontal_ar30::{
     convolve_row_handler_fixed_point_4_ar30, convolve_row_handler_fixed_point_ar30,
 };
 use crate::fixed_point_vertical_ar30::column_handler_fixed_point_ar30;
+#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+use crate::neon::{
+    neon_column_handler_fixed_point_ar30, neon_convolve_horizontal_rgba_rows_4_ar30,
+};
 use crate::support::PRECISION;
 use rayon::iter::{IndexedParallelIterator, ParallelIterator};
 use rayon::prelude::{ParallelSlice, ParallelSliceMut};
@@ -46,15 +50,22 @@ pub(crate) fn convolve_horizontal_dispatch_ar30<const AR30_TYPE: usize, const A
     pool: &Option<ThreadPool>,
 ) {
+    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+    let is_rdm_available = std::arch::is_aarch64_feature_detected!("rdm");
     if let Some(pool) = pool {
         pool.install(|| {
             let approx = filter_weights.numerical_approximation_i16::<PRECISION>(0);
             dst.par_chunks_exact_mut(dst_stride * 4)
                 .zip(src.par_chunks_exact(src_stride * 4))
                 .for_each(|(dst, src)| {
-                    convolve_row_handler_fixed_point_4_ar30::<AR30_TYPE, AR30_ORDER>(
-                        src, src_stride, dst, dst_stride, &approx,
-                    );
+                    let mut _dispatch: fn(&[u32], usize, &mut [u32], usize, &FilterWeights<i16>) =
+                        convolve_row_handler_fixed_point_4_ar30::<AR30_TYPE, AR30_ORDER>;
+                    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+                    if is_rdm_available {
+                        _dispatch =
+                            neon_convolve_horizontal_rgba_rows_4_ar30::<AR30_TYPE, AR30_ORDER>;
+                    }
+                    _dispatch(src, src_stride, dst, dst_stride, &approx);
                 });
 
             let remainder = dst.chunks_exact_mut(dst_stride * 4).into_remainder();
@@ -74,9 +85,13 @@ pub(crate) fn convolve_horizontal_dispatch_ar30<const AR30_TYPE: usize, const A
                 .for_each(|(dst, src)| {
-                    convolve_row_handler_fixed_point_4_ar30::<AR30_TYPE, AR30_ORDER>(
-                        src, src_stride, dst, dst_stride, &approx,
-                    );
+                    let mut _dispatch: fn(&[u32], usize, &mut [u32], usize, &FilterWeights<i16>) =
+                        convolve_row_handler_fixed_point_4_ar30::<AR30_TYPE, AR30_ORDER>;
+                    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+                    if is_rdm_available {
+                        _dispatch = neon_convolve_horizontal_rgba_rows_4_ar30::<AR30_TYPE, AR30_ORDER>;
+                    }
+                    _dispatch(src, src_stride, dst, dst_stride, &approx);
                 });
 
             let remainder = dst.chunks_exact_mut(dst_stride * 4).into_remainder();
@@ -99,6 +114,8 @@ pub(crate) fn convolve_vertical_dispatch_ar30<const AR30_TYPE: usize, const AR3
     pool: &Option<ThreadPool>,
 ) {
+    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+    let is_rdm_available = std::arch::is_aarch64_feature_detected!("rdm");
     if let Some(pool) = pool {
         pool.install(|| {
             let approx = filter_weights.numerical_approximation_i16::<PRECISION>(0);
@@ -108,9 +125,14 @@ pub(crate) fn convolve_vertical_dispatch_ar30<const AR30_TYPE: usize, const AR3
-                    column_handler_fixed_point_ar30::<AR30_TYPE, AR30_ORDER>(
-                        &bounds, src, row, src_stride, weights,
-                    );
+                    let mut _dispatch: fn(&FilterBounds, &[u32], &mut [u32], usize, &[i16]) =
+                        column_handler_fixed_point_ar30::<AR30_TYPE, AR30_ORDER>;
+                    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+                    if is_rdm_available {
+                        _dispatch = neon_column_handler_fixed_point_ar30::<AR30_TYPE, AR30_ORDER>;
+                    }
+
+                    _dispatch(&bounds, src, row, src_stride, weights);
                 });
         });
     } else {
@@ -121,9 +143,15 @@ pub(crate) fn convolve_vertical_dispatch_ar30<const AR30_TYPE: usize, const AR3
-                column_handler_fixed_point_ar30::<AR30_TYPE, AR30_ORDER>(
-                    &bounds, src, row, src_stride, weights,
-                );
+
+                let mut _dispatch: fn(&FilterBounds, &[u32], &mut [u32], usize, &[i16]) =
+                    column_handler_fixed_point_ar30::<AR30_TYPE, AR30_ORDER>;
+                #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+                if is_rdm_available {
+                    _dispatch = neon_column_handler_fixed_point_ar30::<AR30_TYPE, AR30_ORDER>;
+                }
+
+                _dispatch(&bounds, src, row, src_stride, weights);
             });
     }
 }
diff --git a/src/fixed_point_vertical_ar30.rs b/src/fixed_point_vertical_ar30.rs
index 04386b7..86a84f7 100644
--- a/src/fixed_point_vertical_ar30.rs
+++ b/src/fixed_point_vertical_ar30.rs
@@ -34,7 +34,7 @@ use crate::support::ROUNDING_CONST;
 /// # Generics
 /// `AR30_TYPE` - RGB30 pixel layout (AR30 or RA30)
 /// `AR30_ORDER` - pixel byte order (host or network)
-fn convolve_column_handler_fixed_point_direct_buffer<
+pub(crate) fn convolve_column_handler_fip_db_ar30<
     const AR30_TYPE: usize,
     const AR30_ORDER: usize,
     const BUFFER_SIZE: usize,
@@ -254,9 +254,6 @@ fn convolve_column_handler_fixed_point_direct_buffer_four<
     }
 }
 
-/// # Generics
-/// `AR30_TYPE` - RGB30 pixel layout (AR30 or RA30)
-/// `AR30_ORDER` - pixel byte order (host or network)
 pub(crate) fn column_handler_fixed_point_ar30<const AR30_TYPE: usize, const AR30_ORDER: usize>(
     bounds: &FilterBounds,
@@ -285,7 +282,7 @@ pub(crate) fn column_handler_fixed_point_ar30<const AR30_TYPE: usize, const AR3
-        convolve_column_handler_fixed_point_direct_buffer::<AR30_TYPE, AR30_ORDER, 16>(
+        convolve_column_handler_fip_db_ar30::<AR30_TYPE, AR30_ORDER, 16>(
             src, src_stride, dst, weight, bounds, cx,
         );
@@ -293,7 +290,7 @@ pub(crate) fn column_handler_fixed_point_ar30<const AR30_TYPE: usize, const AR3
-        convolve_column_handler_fixed_point_direct_buffer::<AR30_TYPE, AR30_ORDER, 8>(
+        convolve_column_handler_fip_db_ar30::<AR30_TYPE, AR30_ORDER, 8>(
             src, src_stride, dst, weight, bounds, cx,
         );
@@ -301,7 +298,7 @@ pub(crate) fn column_handler_fixed_point_ar30<const AR30_TYPE: usize, const AR3
-        convolve_column_handler_fixed_point_direct_buffer::<AR30_TYPE, AR30_ORDER, 1>(
+        convolve_column_handler_fip_db_ar30::<AR30_TYPE, AR30_ORDER, 1>(
             src, src_stride, dst, weight, bounds, cx,
         );
diff --git a/src/neon/ar30.rs b/src/neon/ar30.rs
new file mode 100644
index 0000000..a9cdaf8
--- /dev/null
+++ b/src/neon/ar30.rs
@@ -0,0 +1,332 @@
+/*
+ * Copyright (c) Radzivon Bartoshyk. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +use crate::ar30::Rgb30; +use std::arch::aarch64::*; + +#[inline(always)] +pub(crate) unsafe fn vrev128_u32(v: uint32x4_t) -> uint32x4_t { + vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(v))) +} + +#[inline(always)] +pub(crate) unsafe fn vunzips_4_ar30( + v: uint32x4_t, +) -> int16x4x4_t { + let mask = vdupq_n_u32(0x3ff); + let ar_type: Rgb30 = AR30_TYPE.into(); + + let v = if AR30_ORDER == 0 { v } else { vrev128_u32(v) }; + + match ar_type { + Rgb30::Ar30 => { + let r = vmovn_u32(vandq_u32(v, mask)); + let g = vmovn_u32(vandq_u32(vshrq_n_u32::<10>(v), mask)); + let b = vmovn_u32(vandq_u32(vshrq_n_u32::<20>(v), mask)); + let va = vmovn_u32(vshrq_n_u32::<30>(v)); + let a = vorr_u16( + vorr_u16( + vorr_u16( + vorr_u16(vshl_n_u16::<8>(va), vshl_n_u16::<6>(va)), + vshl_n_u16::<4>(va), + ), + vshl_n_u16::<2>(va), + ), + va, + ); + int16x4x4_t( + vreinterpret_s16_u16(r), + vreinterpret_s16_u16(g), + vreinterpret_s16_u16(b), + vreinterpret_s16_u16(a), + ) + } + Rgb30::Ra30 => { + let a_mask = vdupq_n_u32(0x3); + let va = vmovn_u32(vandq_u32(v, a_mask)); + + let a = vorr_u16( + vorr_u16( + vorr_u16( + vorr_u16(vshl_n_u16::<8>(va), vshl_n_u16::<6>(va)), + vshl_n_u16::<4>(va), + ), + vshl_n_u16::<2>(va), + ), + va, + ); + + let r = vmovn_u32(vandq_u32(vshrq_n_u32::<22>(v), mask)); + let g = vmovn_u32(vandq_u32(vshrq_n_u32::<12>(v), mask)); + let b = vmovn_u32(vandq_u32(vshrq_n_u32::<2>(v), mask)); + int16x4x4_t( + vreinterpret_s16_u16(r), + vreinterpret_s16_u16(g), + vreinterpret_s16_u16(b), + vreinterpret_s16_u16(a), + ) + } + } +} + +#[inline(always)] +pub(crate) unsafe fn vunzip_4_ar30( + v: uint32x4x2_t, +) -> int16x8x4_t { + let mask = vdupq_n_u32(0x3ff); + let ar_type: Rgb30 = AR30_TYPE.into(); + + let v = if AR30_ORDER == 0 { + v + } else { + uint32x4x2_t(vrev128_u32(v.0), vrev128_u32(v.1)) + }; + + match ar_type { + Rgb30::Ar30 => { + let r = vcombine_u16( + vmovn_u32(vandq_u32(v.0, mask)), + vmovn_u32(vandq_u32(v.1, mask)), + ); + let g = vcombine_u16( + vmovn_u32(vandq_u32(vshrq_n_u32::<10>(v.0), mask)), + vmovn_u32(vandq_u32(vshrq_n_u32::<10>(v.1), mask)), + ); + let b = vcombine_u16( + vmovn_u32(vandq_u32(vshrq_n_u32::<20>(v.0), mask)), + vmovn_u32(vandq_u32(vshrq_n_u32::<20>(v.1), mask)), + ); + let va = vcombine_u16( + vmovn_u32(vshrq_n_u32::<30>(v.0)), + vmovn_u32(vshrq_n_u32::<30>(v.1)), + ); + let a = vorrq_u16( + vorrq_u16( + vorrq_u16( + vorrq_u16(vshlq_n_u16::<8>(va), vshlq_n_u16::<6>(va)), + vshlq_n_u16::<4>(va), + ), + vshlq_n_u16::<2>(va), + ), + va, + ); + int16x8x4_t( + vreinterpretq_s16_u16(r), + vreinterpretq_s16_u16(g), + vreinterpretq_s16_u16(b), + vreinterpretq_s16_u16(a), + ) + } + Rgb30::Ra30 => { + let a_mask = vdupq_n_u32(0x3); + let va = vcombine_u16( + vmovn_u32(vandq_u32(v.0, a_mask)), + vmovn_u32(vandq_u32(v.1, a_mask)), + ); + + let a = vorrq_u16( + vorrq_u16( + vorrq_u16( + vorrq_u16(vshlq_n_u16::<8>(va), vshlq_n_u16::<6>(va)), + vshlq_n_u16::<4>(va), + ), + vshlq_n_u16::<2>(va), + ), + va, + ); + + let r = 
vcombine_u16( + vmovn_u32(vandq_u32(vshrq_n_u32::<22>(v.0), mask)), + vmovn_u32(vandq_u32(vshrq_n_u32::<22>(v.1), mask)), + ); + let g = vcombine_u16( + vmovn_u32(vandq_u32(vshrq_n_u32::<12>(v.0), mask)), + vmovn_u32(vandq_u32(vshrq_n_u32::<12>(v.1), mask)), + ); + let b = vcombine_u16( + vmovn_u32(vandq_u32(vshrq_n_u32::<2>(v.0), mask)), + vmovn_u32(vandq_u32(vshrq_n_u32::<2>(v.1), mask)), + ); + int16x8x4_t( + vreinterpretq_s16_u16(r), + vreinterpretq_s16_u16(g), + vreinterpretq_s16_u16(b), + vreinterpretq_s16_u16(a), + ) + } + } +} + +#[inline(always)] +pub(crate) unsafe fn vunzip_4_ar30_separate( + v: uint32x4x2_t, +) -> int16x8x4_t { + let values = vunzip_4_ar30::(v); + let a0 = vtrnq_s16(values.0, values.1); + let a1 = vtrnq_s16(values.2, values.3); + let v1 = vtrnq_s32(vreinterpretq_s32_s16(a0.0), vreinterpretq_s32_s16(a1.0)); + let v2 = vtrnq_s32(vreinterpretq_s32_s16(a0.1), vreinterpretq_s32_s16(a1.1)); + let k0 = vreinterpretq_s16_s32(v1.0); + let k1 = vreinterpretq_s16_s32(v2.0); + let k2 = vreinterpretq_s16_s32(v1.1); + let k3 = vreinterpretq_s16_s32(v2.1); + let regi = int16x8x4_t(k0, k1, k2, k3); + regi +} + +#[inline(always)] +pub(crate) unsafe fn vunzips_4_ar30_separate( + v: uint32x4_t, +) -> int16x8x2_t { + let values = vunzips_4_ar30::(v); + let a0 = vtrn_s16(values.0, values.1); + let a1 = vtrn_s16(values.2, values.3); + let v1 = vtrn_s32(vreinterpret_s32_s16(a0.0), vreinterpret_s32_s16(a1.0)); + let v2 = vtrn_s32(vreinterpret_s32_s16(a0.1), vreinterpret_s32_s16(a1.1)); + let k0 = vreinterpret_s16_s32(v1.0); + let k1 = vreinterpret_s16_s32(v2.0); + let k2 = vreinterpret_s16_s32(v1.1); + let k3 = vreinterpret_s16_s32(v2.1); + let regi = int16x8x2_t(vcombine_s16(k0, k1), vcombine_s16(k2, k3)); + regi +} + +#[inline(always)] +pub(crate) unsafe fn vzip_4_ar30( + v: int16x8x4_t, +) -> uint32x4x2_t { + let ar_type: Rgb30 = AR30_TYPE.into(); + let a_max = vdupq_n_s16(3); + match ar_type { + Rgb30::Ar30 => { + let v3 = vminq_s16(vrshrq_n_s16::<8>(v.3), a_max); + let mut a0 = vshlq_n_u32::<30>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v3)))); + let mut a1 = vshlq_n_u32::<30>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v3)))); + + let r0 = vshlq_n_u32::<20>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v.2)))); + let r1 = vshlq_n_u32::<20>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v.2)))); + + a0 = vorrq_u32(a0, r0); + a1 = vorrq_u32(a1, r1); + + let g0 = vshlq_n_u32::<10>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v.1)))); + let g1 = vshlq_n_u32::<10>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v.1)))); + + a0 = vorrq_u32(a0, g0); + a1 = vorrq_u32(a1, g1); + + a0 = vorrq_u32(a0, vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v.0)))); + a1 = vorrq_u32(a1, vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v.0)))); + + if AR30_ORDER == 0 { + uint32x4x2_t(a0, a1) + } else { + uint32x4x2_t(vrev128_u32(a0), vrev128_u32(a1)) + } + } + Rgb30::Ra30 => { + let v3 = vminq_s16(vrshrq_n_s16::<8>(v.3), a_max); + let mut a0 = vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v3))); + let mut a1 = vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v3))); + + let r0 = vshlq_n_u32::<22>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v.0)))); + let r1 = vshlq_n_u32::<22>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v.0)))); + + a0 = vorrq_u32(a0, r0); + a1 = vorrq_u32(a1, r1); + + let g0 = vshlq_n_u32::<12>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v.1)))); + let g1 = vshlq_n_u32::<12>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v.1)))); + + a0 = vorrq_u32(a0, g0); + a1 = vorrq_u32(a1, g1); + + a0 = vorrq_u32( 
+ a0, + vshlq_n_u32::<2>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v.2)))), + ); + a1 = vorrq_u32( + a1, + vshlq_n_u32::<2>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v.2)))), + ); + + if AR30_ORDER == 0 { + uint32x4x2_t(a0, a1) + } else { + uint32x4x2_t(vrev128_u32(a0), vrev128_u32(a1)) + } + } + } +} + +#[inline(always)] +pub(crate) unsafe fn vld1_ar30_s16( + arr: &[u32], +) -> int16x4_t { + let item = *arr.get_unchecked(0); + let ar_type: Rgb30 = AR30_TYPE.into(); + let vl = ar_type.unpack::(item); + let a_rep = (vl.3 as i16) << 8; + let temp = [vl.0 as i16, vl.1 as i16, vl.2 as i16, a_rep]; + vld1_s16(temp.as_ptr()) +} + +#[inline(always)] +pub(crate) unsafe fn vextract_ar30( + v: uint16x4_t, +) -> u32 { + let v0 = vreinterpret_u64_u16(v); + let a_mask = vdup_n_u64(0x3); + let v_mask = vdup_n_u64(0x3ff); + let mut a = vand_u64(vshr_n_u64::<48>(v0), a_mask); + let r = vand_u64(v0, v_mask); + let g = vand_u64(vshr_n_u64::<16>(v0), v_mask); + let b = vand_u64(vshr_n_u64::<32>(v0), v_mask); + + let ar_type: Rgb30 = AR30_TYPE.into(); + + match ar_type { + Rgb30::Ar30 => { + a = vshl_n_u64::<30>(a); + a = vorr_u64(a, vshl_n_u64::<20>(b)); + a = vorr_u64(a, vshl_n_u64::<10>(g)); + a = vorr_u64(a, r); + } + Rgb30::Ra30 => { + a = vorr_u64(a, vshl_n_u64::<2>(b)); + a = vorr_u64(a, vshl_n_u64::<12>(g)); + a = vorr_u64(a, vshl_n_u64::<22>(r)); + } + } + + if AR30_ORDER == 1 { + a = vreinterpret_u64_u8(vrev32_u8(vreinterpret_u8_u64(a))); + } + let pairs = vreinterpret_u32_u64(a); + vget_lane_u32::<0>(pairs) +} diff --git a/src/neon/horizontal_ar30.rs b/src/neon/horizontal_ar30.rs new file mode 100644 index 0000000..37b9b33 --- /dev/null +++ b/src/neon/horizontal_ar30.rs @@ -0,0 +1,327 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +use crate::filter_weights::FilterWeights; +use crate::neon::ar30::{ + vextract_ar30, vld1_ar30_s16, vunzip_4_ar30_separate, vunzips_4_ar30_separate, +}; +use std::arch::aarch64::*; + +#[inline] +unsafe fn conv_horiz_rgba_1_u8_i16< + const SCALE: i32, + const AR_TYPE: usize, + const AR_ORDER: usize, +>( + start_x: usize, + src: &[u32], + w0: int16x4_t, + store: int16x4_t, +) -> int16x4_t { + let src_ptr = src.get_unchecked(start_x..); + let ld = vld1_ar30_s16::<AR_TYPE, AR_ORDER>(src_ptr); + let rgba_pixel = vshl_n_s16::<SCALE>(ld); + vqrdmlah_s16(store, rgba_pixel, w0) +} + +#[inline(always)] +unsafe fn conv_horiz_rgba_8_u8_i16< + const SCALE: i32, + const AR_TYPE: usize, + const AR_ORDER: usize, +>( + start_x: usize, + src: &[u32], + set1: (int16x4_t, int16x4_t, int16x4_t, int16x4_t), + set2: (int16x4_t, int16x4_t, int16x4_t, int16x4_t), + store: int16x4_t, +) -> int16x4_t { + let src_ptr = src.get_unchecked(start_x..); + + let rgba_pixel = vunzip_4_ar30_separate::<AR_TYPE, AR_ORDER>(vld1q_u32_x2(src_ptr.as_ptr())); + + let hi0 = vshlq_n_s16::<SCALE>(rgba_pixel.1); + let lo0 = vshlq_n_s16::<SCALE>(rgba_pixel.0); + let hi1 = vshlq_n_s16::<SCALE>(rgba_pixel.3); + let lo1 = vshlq_n_s16::<SCALE>(rgba_pixel.2); + + let hi_v = vqrdmulhq_s16(hi0, vcombine_s16(set1.2, set1.3)); + let mut product = vqrdmlahq_s16(hi_v, lo0, vcombine_s16(set1.0, set1.1)); + product = vqrdmlahq_s16(product, hi1, vcombine_s16(set2.2, set2.3)); + product = vqrdmlahq_s16(product, lo1, vcombine_s16(set2.0, set2.1)); + + vadd_s16( + vadd_s16(store, vget_low_s16(product)), + vget_high_s16(product), + ) +} + +#[inline] +unsafe fn conv_horiz_rgba_4_u8_i16< + const SCALE: i32, + const AR_TYPE: usize, + const AR_ORDER: usize, +>( + start_x: usize, + src: &[u32], + w0: int16x4_t, + w1: int16x4_t, + w2: int16x4_t, + w3: int16x4_t, + store: int16x4_t, +) -> int16x4_t { + let src_ptr = src.get_unchecked(start_x..); + + let rgba_pixel = vunzips_4_ar30_separate::<AR_TYPE, AR_ORDER>(vld1q_u32(src_ptr.as_ptr())); + + let hi = vshlq_n_s16::<SCALE>(rgba_pixel.1); + let lo = vshlq_n_s16::<SCALE>(rgba_pixel.0); + + let hi_v = vqrdmulhq_s16(hi, vcombine_s16(w2, w3)); + let product = vqrdmlahq_s16(hi_v, lo, vcombine_s16(w0, w1)); + + vadd_s16( + vadd_s16(store, vget_low_s16(product)), + vget_high_s16(product), + ) +} + +pub(crate) fn neon_convolve_horizontal_rgba_rows_4_ar30< + const AR_TYPE: usize, + const AR_ORDER: usize, +>( + src: &[u32], + src_stride: usize, + dst: &mut [u32], + dst_stride: usize, + filter_weights: &FilterWeights<i16>, +) { + unsafe { + const SCALE: i32 = 4; + const ROUNDING: i16 = 1 << (SCALE - 1); + let zeros = vdup_n_s16(0i16); + const ALPHA_ROUNDING: i16 = 1 << (SCALE as i16 + 7); + let init = vld1_s16([ROUNDING, ROUNDING, ROUNDING, ALPHA_ROUNDING].as_ptr()); + + let v_cut_off = vld1_s16([1023, 1023, 1023, 3].as_ptr()); + + let (row0_ref, rest) = dst.split_at_mut(dst_stride); + let (row1_ref, rest) = rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.iter_mut(); + let iter_row1 = row1_ref.iter_mut(); + let iter_row2 = row2_ref.iter_mut(); + let iter_row3 = row3_ref.iter_mut(); + + let v_shl_back = vld1_s16( + [ + -SCALE as i16, + -SCALE as i16, + -SCALE as i16, + -(SCALE as i16 + 8), + ] + .as_ptr(), + ); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut jx = 0usize; + + let bounds_size = bounds.size; + + let mut store_0 = init; +
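// The accumulators mix two fixed-point layouts: the r/g/b lanes hold 10-bit colour in Q(SCALE) + // seeded with the 1 << (SCALE - 1) rounding bias, while the alpha lane holds the 2-bit alpha widened + // into the high bits (replicated to 10 bits by the unzip helpers, or shifted left by 8 by vld1_ar30_s16), + // which is why it is seeded with ALPHA_ROUNDING and shifted back by SCALE + 8 in v_shl_back. +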
let mut store_1 = init; + let mut store_2 = init; + let mut store_3 = init; + + let src0 = src; + let src1 = src0.get_unchecked(src_stride..); + let src2 = src1.get_unchecked(src_stride..); + let src3 = src2.get_unchecked(src_stride..); + + while jx + 8 < bounds_size { + let bounds_start = bounds.start + jx; + let w_ptr = weights.get_unchecked(jx..(jx + 8)); + let weights_set = vld1q_s16(w_ptr.as_ptr()); + let w0 = vdup_laneq_s16::<0>(weights_set); + let w1 = vdup_laneq_s16::<1>(weights_set); + let w2 = vdup_laneq_s16::<2>(weights_set); + let w3 = vdup_laneq_s16::<3>(weights_set); + let w4 = vdup_laneq_s16::<4>(weights_set); + let w5 = vdup_laneq_s16::<5>(weights_set); + let w6 = vdup_laneq_s16::<6>(weights_set); + let w7 = vdup_laneq_s16::<7>(weights_set); + let set1 = (w0, w1, w2, w3); + let set2 = (w4, w5, w6, w7); + store_0 = conv_horiz_rgba_8_u8_i16::( + bounds_start, + src0, + set1, + set2, + store_0, + ); + store_1 = conv_horiz_rgba_8_u8_i16::( + bounds_start, + src1, + set1, + set2, + store_1, + ); + store_2 = conv_horiz_rgba_8_u8_i16::( + bounds_start, + src2, + set1, + set2, + store_2, + ); + store_3 = conv_horiz_rgba_8_u8_i16::( + bounds_start, + src3, + set1, + set2, + store_3, + ); + jx += 8; + } + + while jx + 4 < bounds_size { + let bounds_start = bounds.start + jx; + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = vld1_s16(w_ptr.as_ptr()); + let w0 = vdup_lane_s16::<0>(weights); + let w1 = vdup_lane_s16::<1>(weights); + let w2 = vdup_lane_s16::<2>(weights); + let w3 = vdup_lane_s16::<3>(weights); + store_0 = conv_horiz_rgba_4_u8_i16::( + bounds_start, + src0, + w0, + w1, + w2, + w3, + store_0, + ); + store_1 = conv_horiz_rgba_4_u8_i16::( + bounds_start, + src1, + w0, + w1, + w2, + w3, + store_1, + ); + store_2 = conv_horiz_rgba_4_u8_i16::( + bounds_start, + src2, + w0, + w1, + w2, + w3, + store_2, + ); + store_3 = conv_horiz_rgba_4_u8_i16::( + bounds_start, + src3, + w0, + w1, + w2, + w3, + store_3, + ); + jx += 4; + } + + while jx < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + let bounds_start = bounds.start + jx; + let weight0 = vld1_dup_s16(w_ptr.as_ptr()); + store_0 = conv_horiz_rgba_1_u8_i16::( + bounds_start, + src0, + weight0, + store_0, + ); + store_1 = conv_horiz_rgba_1_u8_i16::( + bounds_start, + src1, + weight0, + store_1, + ); + store_2 = conv_horiz_rgba_1_u8_i16::( + bounds_start, + src2, + weight0, + store_2, + ); + store_3 = conv_horiz_rgba_1_u8_i16::( + bounds_start, + src3, + weight0, + store_3, + ); + jx += 1; + } + + let store_16_0 = vreinterpret_u16_s16(vmin_s16( + vshl_s16(vmax_s16(store_0, zeros), v_shl_back), + v_cut_off, + )); + let store_16_1 = vreinterpret_u16_s16(vmin_s16( + vshl_s16(vmax_s16(store_1, zeros), v_shl_back), + v_cut_off, + )); + let store_16_2 = vreinterpret_u16_s16(vmin_s16( + vshl_s16(vmax_s16(store_2, zeros), v_shl_back), + v_cut_off, + )); + let store_16_3 = vreinterpret_u16_s16(vmin_s16( + vshl_s16(vmax_s16(store_3, zeros), v_shl_back), + v_cut_off, + )); + + let packed0 = vextract_ar30::(store_16_0); + *chunk0 = packed0; + let packed1 = vextract_ar30::(store_16_1); + *chunk1 = packed1; + let packed2 = vextract_ar30::(store_16_2); + *chunk2 = packed2; + let packed3 = vextract_ar30::(store_16_3); + *chunk3 = packed3; + } + } +} diff --git a/src/neon/mod.rs b/src/neon/mod.rs index a7ca013..1327b60 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -33,10 +33,12 @@ mod alpha_f16_full; mod alpha_f32; mod alpha_u16; mod alpha_u8; +mod ar30; #[cfg(feature = "half")] mod convolve_f16; 
#[cfg(feature = "half")] mod f16_utils; +mod horizontal_ar30; mod plane_f32; mod plane_u8; #[cfg(feature = "half")] @@ -53,6 +55,7 @@ mod rgba_f32; mod rgba_u16_lb; mod rgba_u8; mod utils; +mod vertical_ar30; #[cfg(feature = "half")] mod vertical_f16; #[cfg(feature = "half")] @@ -75,6 +78,7 @@ pub use alpha_u8::neon_premultiply_alpha_rgba; pub use alpha_u8::neon_unpremultiply_alpha_rgba; #[cfg(feature = "half")] pub use f16_utils::*; +pub(crate) use horizontal_ar30::neon_convolve_horizontal_rgba_rows_4_ar30; pub use plane_f32::convolve_horizontal_plane_neon_row_one; pub use plane_f32::convolve_horizontal_plane_neon_rows_4; pub use plane_u8::{convolve_horizontal_plane_neon_row, convolve_horizontal_plane_neon_rows_4_u8}; @@ -101,6 +105,7 @@ pub use rgba_u16_lb::{ convolve_horizontal_rgba_neon_rows_4_lb_u8, convolve_horizontal_rgba_neon_u16_lb_row, }; pub use rgba_u8::*; +pub(crate) use vertical_ar30::neon_column_handler_fixed_point_ar30; #[cfg(feature = "half")] pub use vertical_f16::convolve_vertical_rgb_neon_row_f16; #[cfg(feature = "half")] diff --git a/src/neon/vertical_ar30.rs b/src/neon/vertical_ar30.rs new file mode 100644 index 0000000..b6dae91 --- /dev/null +++ b/src/neon/vertical_ar30.rs @@ -0,0 +1,176 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +use crate::filter_weights::FilterBounds; +use crate::fixed_point_vertical_ar30::convolve_column_handler_fip_db_ar30; +use crate::neon::ar30::{vunzip_4_ar30, vzip_4_ar30}; +use std::arch::aarch64::{ + int16x8x4_t, vdupq_n_s16, vld1q_u32_x2, vmaxq_s16, vminq_s16, vqrdmlahq_s16, vqrdmulhq_s16, + vrshrq_n_s16, vshlq_n_s16, vst1q_u32_x2, +}; + +pub(crate) fn neon_column_handler_fixed_point_ar30< + const AR30_TYPE: usize, + const AR30_ORDER: usize, +>( + bounds: &FilterBounds, + src: &[u32], + dst: &mut [u32], + src_stride: usize, + weight: &[i16], +) { + let mut cx = 0usize; + + let total_width = dst.len(); + + const PREC: i32 = 5; + const BACK: i32 = 5; + + let bounds_size = bounds.size; + + while cx + 8 < total_width { + unsafe { + let v_max = vdupq_n_s16(1023); + let zeros = vdupq_n_s16(0); + let filter = weight; + let v_start_px = cx; + + let py = bounds.start; + let weight = vdupq_n_s16(filter[0]); + let offset = src_stride * py + v_start_px; + let src_ptr = src.get_unchecked(offset..(offset + 8)); + + let ps = vunzip_4_ar30::(vld1q_u32_x2(src_ptr.as_ptr())); + let mut v0 = vqrdmulhq_s16(vshlq_n_s16::(ps.0), weight); + let mut v1 = vqrdmulhq_s16(vshlq_n_s16::(ps.1), weight); + let mut v2 = vqrdmulhq_s16(vshlq_n_s16::(ps.2), weight); + let mut v3 = vqrdmulhq_s16(vshlq_n_s16::(ps.3), weight); + + if bounds_size == 2 { + let weights = filter.get_unchecked(0..2); + let py = bounds.start; + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_start_px)..); + + let v_weight1 = vdupq_n_s16(weights[1]); + + let ps1 = vunzip_4_ar30::(vld1q_u32_x2(src_ptr1.as_ptr())); + v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps1.0), v_weight1); + v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps1.1), v_weight1); + v2 = vqrdmlahq_s16(v2, vshlq_n_s16::(ps1.2), v_weight1); + v3 = vqrdmlahq_s16(v3, vshlq_n_s16::(ps1.3), v_weight1); + } else if bounds_size == 3 { + let weights = filter.get_unchecked(0..3); + let py = bounds.start; + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_start_px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_start_px)..); + + let v_weight1 = vdupq_n_s16(weights[1]); + let v_weight2 = vdupq_n_s16(weights[2]); + + let ps1 = vunzip_4_ar30::(vld1q_u32_x2(src_ptr1.as_ptr())); + v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps1.0), v_weight1); + v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps1.1), v_weight1); + v2 = vqrdmlahq_s16(v2, vshlq_n_s16::(ps1.2), v_weight1); + v3 = vqrdmlahq_s16(v3, vshlq_n_s16::(ps1.3), v_weight1); + let ps2 = vunzip_4_ar30::(vld1q_u32_x2(src_ptr2.as_ptr())); + v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps2.0), v_weight2); + v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps2.1), v_weight2); + v2 = vqrdmlahq_s16(v2, vshlq_n_s16::(ps2.2), v_weight2); + v3 = vqrdmlahq_s16(v3, vshlq_n_s16::(ps2.3), v_weight2); + } else if bounds_size == 4 { + let weights = filter.get_unchecked(0..4); + let py = bounds.start; + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_start_px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_start_px)..); + let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + v_start_px)..); + + let v_weight1 = vdupq_n_s16(weights[1]); + let v_weight2 = vdupq_n_s16(weights[2]); + let v_weight3 = vdupq_n_s16(weights[3]); + + let ps1 = vunzip_4_ar30::(vld1q_u32_x2(src_ptr1.as_ptr())); + v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps1.0), v_weight1); + v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps1.1), v_weight1); + v2 = vqrdmlahq_s16(v2, vshlq_n_s16::(ps1.2), v_weight1); + v3 = vqrdmlahq_s16(v3, vshlq_n_s16::(ps1.3), v_weight1); + let ps2 = 
vunzip_4_ar30::<AR30_TYPE, AR30_ORDER>(vld1q_u32_x2(src_ptr2.as_ptr())); + v0 = vqrdmlahq_s16(v0, vshlq_n_s16::<PREC>(ps2.0), v_weight2); + v1 = vqrdmlahq_s16(v1, vshlq_n_s16::<PREC>(ps2.1), v_weight2); + v2 = vqrdmlahq_s16(v2, vshlq_n_s16::<PREC>(ps2.2), v_weight2); + v3 = vqrdmlahq_s16(v3, vshlq_n_s16::<PREC>(ps2.3), v_weight2); + let ps3 = vunzip_4_ar30::<AR30_TYPE, AR30_ORDER>(vld1q_u32_x2(src_ptr3.as_ptr())); + v0 = vqrdmlahq_s16(v0, vshlq_n_s16::<PREC>(ps3.0), v_weight3); + v1 = vqrdmlahq_s16(v1, vshlq_n_s16::<PREC>(ps3.1), v_weight3); + v2 = vqrdmlahq_s16(v2, vshlq_n_s16::<PREC>(ps3.2), v_weight3); + v3 = vqrdmlahq_s16(v3, vshlq_n_s16::<PREC>(ps3.3), v_weight3); + } else { + for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() { + // Add 1 because `enumerate` restarts at 0 after `skip(1)`, while the filter row index must not + let py = bounds.start + j + 1; + let weight = vdupq_n_s16(k_weight); + let offset = src_stride * py + v_start_px; + let src_ptr = src.get_unchecked(offset..(offset + 8)); + + let ps = vunzip_4_ar30::<AR30_TYPE, AR30_ORDER>(vld1q_u32_x2(src_ptr.as_ptr())); + v0 = vqrdmlahq_s16(v0, vshlq_n_s16::<PREC>(ps.0), weight); + v1 = vqrdmlahq_s16(v1, vshlq_n_s16::<PREC>(ps.1), weight); + v2 = vqrdmlahq_s16(v2, vshlq_n_s16::<PREC>(ps.2), weight); + v3 = vqrdmlahq_s16(v3, vshlq_n_s16::<PREC>(ps.3), weight); + } + } + + let v_dst = dst.get_unchecked_mut(v_start_px..(v_start_px + 8)); + + v0 = vmaxq_s16(vminq_s16(vrshrq_n_s16::<BACK>(v0), v_max), zeros); + v1 = vmaxq_s16(vminq_s16(vrshrq_n_s16::<BACK>(v1), v_max), zeros); + v2 = vmaxq_s16(vminq_s16(vrshrq_n_s16::<BACK>(v2), v_max), zeros); + v3 = vmaxq_s16(vrshrq_n_s16::<BACK>(v3), zeros); + + let vals = vzip_4_ar30::<AR30_TYPE, AR30_ORDER>(int16x8x4_t(v0, v1, v2, v3)); + vst1q_u32_x2(v_dst.as_mut_ptr(), vals); + } + + cx += 8; + } + + while cx + 4 < total_width { + convolve_column_handler_fip_db_ar30::<AR30_TYPE, AR30_ORDER, 4>( + src, src_stride, dst, weight, bounds, cx, + ); + + cx += 4; + } + + while cx < total_width { + convolve_column_handler_fip_db_ar30::<AR30_TYPE, AR30_ORDER, 1>( + src, src_stride, dst, weight, bounds, cx, + ); + + cx += 1; + } +} diff --git a/src/resize_ar30.rs b/src/resize_ar30.rs index cf4c36f..bf36dd2 100644 --- a/src/resize_ar30.rs +++ b/src/resize_ar30.rs @@ -81,6 +81,19 @@ pub(crate) fn resize_ar30_impl<const AR30_TYPE: usize, const AR30_ORDER: usize>( let should_do_vertical = src_size.height != dst_size.height; assert!(should_do_horizontal || should_do_vertical); + if should_do_vertical && !should_do_horizontal { + let vertical_filters = scaler.generate_weights(src_size.height, dst_size.height); + convolve_vertical_dispatch_ar30::<AR30_TYPE, AR30_ORDER>( + src, + src_size.width, + vertical_filters, + dst, + src_size.width, + &pool, + ); + return Ok(()); + } + let working_store = if should_do_vertical { let mut target = vec![0u32; src_size.width * dst_size.height]; From cb400d36ee1d9ce5f0f978051e7fe048c8d12065 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Thu, 21 Nov 2024 19:03:43 +0000 Subject: [PATCH 06/19] Fix package, added workflows --- .github/workflows/build_push.yml | 6 +++--- .github/workflows/no-response.yml | 23 +++++++++++++++++++++++ app/Cargo.toml | 2 +- 3 files changed, 27 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/no-response.yml diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml index 156cf3b..0f1c7f2 100644 --- a/.github/workflows/build_push.yml +++ b/.github/workflows/build_push.yml @@ -7,14 +7,14 @@ concurrency: on: push: branches: - - '*' + - 'master' + - 'dev' - '!ci_test_*' tags-ignore: - '*' pull_request: branches: - - '*' - - '!ci_test_*' + - 'master' jobs: build: diff --git a/.github/workflows/no-response.yml b/.github/workflows/no-response.yml new file mode 100644 index 0000000..a4b4351 ---
/dev/null +++ b/.github/workflows/no-response.yml @@ -0,0 +1,23 @@ +name: no-response + +on: + schedule: + - cron: '0 0 * * *' # Runs daily at midnight + workflow_dispatch: + +jobs: + noResponse: + runs-on: ubuntu-latest + steps: + - uses: actions/stale@v9 + with: + repo-token: ${{ github.token }} + days-before-stale: -1 + days-before-close: 14 + only-labels: 'waiting for author' + stale-issue-label: 'waiting for author' + stale-pr-label: 'waiting for author' + remove-stale-when-updated: true + ignore-updates: false + close-issue-message: This issue has been automatically closed due to inactivity. We requested additional information but have not received a response from the original author. Without the requested details, we cannot proceed. If you have or find the information needed, please comment so we can reopen the issue. + close-pr-message: This pull request has been automatically closed due to inactivity. We requested additional information but have not received a response from the original author. Without the requested details, we cannot proceed. If you have the needed information or updates, please reopen the PR or comment so we can continue the review. \ No newline at end of file diff --git a/app/Cargo.toml b/app/Cargo.toml index baa46c0..aa48512 100644 --- a/app/Cargo.toml +++ b/app/Cargo.toml @@ -9,7 +9,7 @@ image = { version = "0.25.5", features = ["default"] } pic-scale = { path = "..", features = ["half"], default-features = true } fast_image_resize = { version = "5.0.0", features = [] } half = { version = "2.4.1", default-features = true } -yuvutils-rs = {path = "../../../RustRoverProjects/yuvutils-rs"} +yuvutils-rs = "0.5.5" [dev-dependencies] criterion = "0.5.1" From de5efdcae90f12ee11caf7998d207d947752662b Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Thu, 21 Nov 2024 19:06:03 +0000 Subject: [PATCH 07/19] Clippy --- Cargo.lock | 2 ++ src/color_group.rs | 3 ++- src/neon/ar30.rs | 6 ++---- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1d04751..99be909 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1472,6 +1472,8 @@ dependencies = [ [[package]] name = "yuvutils-rs" version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d0d13bb8e3921f9d76ef4bcb348108578df0402c6cf2695ef0690b4b64c7d9" dependencies = [ "num-traits", ] diff --git a/src/color_group.rs b/src/color_group.rs index 3f829e0..942deb5 100644 --- a/src/color_group.rs +++ b/src/color_group.rs @@ -229,6 +229,7 @@ where impl ColorGroup<4, i32> { #[inline(always)] + #[allow(clippy::manual_clamp)] pub(crate) fn saturate_ar30(&self) -> ColorGroup<4, i32> { ColorGroup::from_components( (self.r >> PRECISION).min(1023).max(0), @@ -239,7 +240,7 @@ impl ColorGroup<4, i32> { } #[inline(always)] - pub(crate) fn to_ar30(&self) -> u32 { + pub(crate) fn to_ar30(self) -> u32 { let ar30_type: Rgb30 = AR30_TYPE.into(); ar30_type.pack_w_a::(self.r, self.g, self.b, self.a) } diff --git a/src/neon/ar30.rs b/src/neon/ar30.rs index a9cdaf8..d846e11 100644 --- a/src/neon/ar30.rs +++ b/src/neon/ar30.rs @@ -195,8 +195,7 @@ pub(crate) unsafe fn vunzip_4_ar30_separate Date: Thu, 21 Nov 2024 19:08:55 +0000 Subject: [PATCH 08/19] Clippy --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 724fa48..bc9278d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -27,7 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #![deny(deprecated)] -// #![deny(unreachable_code, unused)] +#![deny(unreachable_code, unused)] #![allow(clippy::too_many_arguments)] mod alpha_check; #[cfg(feature = "half")] From dec05253bb4df31ded45af5a070a59591f6bdbdb Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Fri, 22 Nov 2024 00:49:36 +0000 Subject: [PATCH 09/19] AR30, rdm runtime dispatch --- app/src/main.rs | 2 +- src/lib.rs | 2 +- src/neon/horizontal_ar30.rs | 19 ++ src/neon/rgba_u8.rs | 419 +++++++++++++++++++----------------- src/neon/vertical_ar30.rs | 19 ++ src/neon/vertical_u8.rs | 7 +- 6 files changed, 265 insertions(+), 203 deletions(-) diff --git a/app/src/main.rs b/app/src/main.rs index 2d5c918..d412892 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -28,7 +28,7 @@ fn main() { let transient = img.to_rgba8(); let mut bytes = Vec::from(transient.as_bytes()); - let mut scaler = Scaler::new(ResamplingFunction::Bilinear); + let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); scaler.set_threading_policy(ThreadingPolicy::Single); let mut ar30_src = vec![0u32; dimensions.0 as usize * dimensions.1 as usize]; diff --git a/src/lib.rs b/src/lib.rs index bc9278d..929b771 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -67,7 +67,7 @@ mod math; mod mixed_storage; mod mlaf; mod nearest_sampler; -#[cfg(all(target_arch = "aarch64", target_feature = "neon",))] +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] mod neon; mod pic_scale_error; mod plane_f32; diff --git a/src/neon/horizontal_ar30.rs b/src/neon/horizontal_ar30.rs index 37b9b33..ea489ba 100644 --- a/src/neon/horizontal_ar30.rs +++ b/src/neon/horizontal_ar30.rs @@ -120,6 +120,25 @@ pub(crate) fn neon_convolve_horizontal_rgba_rows_4_ar30< dst: &mut [u32], dst_stride: usize, filter_weights: &FilterWeights, +) { + unsafe { + neon_convolve_horizontal_rgba_rows_4_impl::( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); + } +} + +#[target_feature(enable = "rdm")] +unsafe fn neon_convolve_horizontal_rgba_rows_4_impl( + src: &[u32], + src_stride: usize, + dst: &mut [u32], + dst_stride: usize, + filter_weights: &FilterWeights, ) { unsafe { const SCALE: i32 = 4; diff --git a/src/neon/rgba_u8.rs b/src/neon/rgba_u8.rs index 7f0b6ce..a134850 100644 --- a/src/neon/rgba_u8.rs +++ b/src/neon/rgba_u8.rs @@ -211,7 +211,6 @@ unsafe fn conv_horiz_rgba_1_u8_i16( vqrdmlah_s16(store, lo, w0) } -/// Slightly lower precision scale option pub fn convolve_horizontal_rgba_neon_rows_4_u8_i16( src: &[u8], src_stride: usize, @@ -220,139 +219,153 @@ pub fn convolve_horizontal_rgba_neon_rows_4_u8_i16( filter_weights: &FilterWeights, ) { unsafe { - const CHANNELS: usize = 4; - const SCALE: i32 = 6; - const ROUNDING: i16 = 1 << (SCALE - 1); - let zeros = vdup_n_s16(0i16); - let init = vdup_n_s16(ROUNDING); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - - let bounds_size = bounds.size; - - let mut store_0 = init; - let mut store_1 = init; - 
let mut store_2 = init; - let mut store_3 = init; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); + convolve_horizontal_rgba_neon_rows_4_u8_i16_impl( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); + } +} - while jx + 8 < bounds_size { - let bounds_start = bounds.start + jx; - let w_ptr = weights.get_unchecked(jx..(jx + 8)); - let weights_set = vld1q_s16(w_ptr.as_ptr()); - let w0 = vdup_laneq_s16::<0>(weights_set); - let w1 = vdup_laneq_s16::<1>(weights_set); - let w2 = vdup_laneq_s16::<2>(weights_set); - let w3 = vdup_laneq_s16::<3>(weights_set); - let w4 = vdup_laneq_s16::<4>(weights_set); - let w5 = vdup_laneq_s16::<5>(weights_set); - let w6 = vdup_laneq_s16::<6>(weights_set); - let w7 = vdup_laneq_s16::<7>(weights_set); - let set1 = (w0, w1, w2, w3); - let set2 = (w4, w5, w6, w7); - store_0 = - conv_horiz_rgba_8_u8_i16::(bounds_start, src0, set1, set2, store_0); - store_1 = - conv_horiz_rgba_8_u8_i16::(bounds_start, src1, set1, set2, store_1); - store_2 = - conv_horiz_rgba_8_u8_i16::(bounds_start, src2, set1, set2, store_2); - store_3 = - conv_horiz_rgba_8_u8_i16::(bounds_start, src3, set1, set2, store_3); - jx += 8; - } +/// Slightly lower precision scale option +#[target_feature(enable = "rdm")] +unsafe fn convolve_horizontal_rgba_neon_rows_4_u8_i16_impl( + src: &[u8], + src_stride: usize, + dst: &mut [u8], + dst_stride: usize, + filter_weights: &FilterWeights, +) { + const CHANNELS: usize = 4; + const SCALE: i32 = 6; + const ROUNDING: i16 = 1 << (SCALE - 1); + let zeros = vdup_n_s16(0i16); + let init = vdup_n_s16(ROUNDING); + + let (row0_ref, rest) = dst.split_at_mut(dst_stride); + let (row1_ref, rest) = rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); + let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); + let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); + let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut jx = 0usize; + + let bounds_size = bounds.size; + + let mut store_0 = init; + let mut store_1 = init; + let mut store_2 = init; + let mut store_3 = init; + + let src0 = src; + let src1 = src0.get_unchecked(src_stride..); + let src2 = src1.get_unchecked(src_stride..); + let src3 = src2.get_unchecked(src_stride..); + + while jx + 8 < bounds_size { + let bounds_start = bounds.start + jx; + let w_ptr = weights.get_unchecked(jx..(jx + 8)); + let weights_set = vld1q_s16(w_ptr.as_ptr()); + let w0 = vdup_laneq_s16::<0>(weights_set); + let w1 = vdup_laneq_s16::<1>(weights_set); + let w2 = vdup_laneq_s16::<2>(weights_set); + let w3 = vdup_laneq_s16::<3>(weights_set); + let w4 = vdup_laneq_s16::<4>(weights_set); + let w5 = vdup_laneq_s16::<5>(weights_set); + let w6 = vdup_laneq_s16::<6>(weights_set); + let w7 = vdup_laneq_s16::<7>(weights_set); + let set1 = (w0, w1, w2, w3); + let set2 = (w4, w5, w6, w7); + store_0 = conv_horiz_rgba_8_u8_i16::(bounds_start, src0, set1, set2, store_0); + store_1 = conv_horiz_rgba_8_u8_i16::(bounds_start, src1, set1, set2, store_1); + store_2 = conv_horiz_rgba_8_u8_i16::(bounds_start, src2, set1, set2, store_2); + store_3 = 
conv_horiz_rgba_8_u8_i16::(bounds_start, src3, set1, set2, store_3); + jx += 8; + } - while jx + 4 < bounds_size { - let bounds_start = bounds.start + jx; - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = vld1_s16(w_ptr.as_ptr()); - let w0 = vdup_lane_s16::<0>(weights); - let w1 = vdup_lane_s16::<1>(weights); - let w2 = vdup_lane_s16::<2>(weights); - let w3 = vdup_lane_s16::<3>(weights); - store_0 = - conv_horiz_rgba_4_u8_i16::(bounds_start, src0, w0, w1, w2, w3, store_0); - store_1 = - conv_horiz_rgba_4_u8_i16::(bounds_start, src1, w0, w1, w2, w3, store_1); - store_2 = - conv_horiz_rgba_4_u8_i16::(bounds_start, src2, w0, w1, w2, w3, store_2); - store_3 = - conv_horiz_rgba_4_u8_i16::(bounds_start, src3, w0, w1, w2, w3, store_3); - jx += 4; - } + while jx + 4 < bounds_size { + let bounds_start = bounds.start + jx; + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = vld1_s16(w_ptr.as_ptr()); + let w0 = vdup_lane_s16::<0>(weights); + let w1 = vdup_lane_s16::<1>(weights); + let w2 = vdup_lane_s16::<2>(weights); + let w3 = vdup_lane_s16::<3>(weights); + store_0 = + conv_horiz_rgba_4_u8_i16::(bounds_start, src0, w0, w1, w2, w3, store_0); + store_1 = + conv_horiz_rgba_4_u8_i16::(bounds_start, src1, w0, w1, w2, w3, store_1); + store_2 = + conv_horiz_rgba_4_u8_i16::(bounds_start, src2, w0, w1, w2, w3, store_2); + store_3 = + conv_horiz_rgba_4_u8_i16::(bounds_start, src3, w0, w1, w2, w3, store_3); + jx += 4; + } - while jx + 2 < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - let w0 = vld1_dup_s16(w_ptr.as_ptr()); - let w1 = vld1_dup_s16(w_ptr.get_unchecked(1..).as_ptr()); - store_0 = conv_horiz_rgba_2_u8_i16::(bounds_start, src0, w0, w1, store_0); - store_1 = conv_horiz_rgba_2_u8_i16::(bounds_start, src1, w0, w1, store_1); - store_2 = conv_horiz_rgba_2_u8_i16::(bounds_start, src2, w0, w1, store_2); - store_3 = conv_horiz_rgba_2_u8_i16::(bounds_start, src3, w0, w1, store_3); - jx += 2; - } + while jx + 2 < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = bounds.start + jx; + let w0 = vld1_dup_s16(w_ptr.as_ptr()); + let w1 = vld1_dup_s16(w_ptr.get_unchecked(1..).as_ptr()); + store_0 = conv_horiz_rgba_2_u8_i16::(bounds_start, src0, w0, w1, store_0); + store_1 = conv_horiz_rgba_2_u8_i16::(bounds_start, src1, w0, w1, store_1); + store_2 = conv_horiz_rgba_2_u8_i16::(bounds_start, src2, w0, w1, store_2); + store_3 = conv_horiz_rgba_2_u8_i16::(bounds_start, src3, w0, w1, store_3); + jx += 2; + } - while jx < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let bounds_start = bounds.start + jx; - let weight0 = vld1_dup_s16(w_ptr.as_ptr()); - store_0 = conv_horiz_rgba_1_u8_i16::(bounds_start, src0, weight0, store_0); - store_1 = conv_horiz_rgba_1_u8_i16::(bounds_start, src1, weight0, store_1); - store_2 = conv_horiz_rgba_1_u8_i16::(bounds_start, src2, weight0, store_2); - store_3 = conv_horiz_rgba_1_u8_i16::(bounds_start, src3, weight0, store_3); - jx += 1; - } + while jx < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + let bounds_start = bounds.start + jx; + let weight0 = vld1_dup_s16(w_ptr.as_ptr()); + store_0 = conv_horiz_rgba_1_u8_i16::(bounds_start, src0, weight0, store_0); + store_1 = conv_horiz_rgba_1_u8_i16::(bounds_start, src1, weight0, store_1); + store_2 = conv_horiz_rgba_1_u8_i16::(bounds_start, src2, weight0, store_2); + store_3 = conv_horiz_rgba_1_u8_i16::(bounds_start, src3, weight0, store_3); + jx += 1; + } - let store_16_0 = 
vreinterpret_u16_s16(vshr_n_s16::(vmax_s16(store_0, zeros))); - let store_16_1 = vreinterpret_u16_s16(vshr_n_s16::(vmax_s16(store_1, zeros))); - let store_16_2 = vreinterpret_u16_s16(vshr_n_s16::(vmax_s16(store_2, zeros))); - let store_16_3 = vreinterpret_u16_s16(vshr_n_s16::(vmax_s16(store_3, zeros))); + let store_16_0 = vreinterpret_u16_s16(vshr_n_s16::(vmax_s16(store_0, zeros))); + let store_16_1 = vreinterpret_u16_s16(vshr_n_s16::(vmax_s16(store_1, zeros))); + let store_16_2 = vreinterpret_u16_s16(vshr_n_s16::(vmax_s16(store_2, zeros))); + let store_16_3 = vreinterpret_u16_s16(vshr_n_s16::(vmax_s16(store_3, zeros))); - let store_16_8_0 = vqmovn_u16(vcombine_u16(store_16_0, store_16_0)); - let store_16_8_1 = vqmovn_u16(vcombine_u16(store_16_1, store_16_1)); - let store_16_8_2 = vqmovn_u16(vcombine_u16(store_16_2, store_16_2)); - let store_16_8 = vqmovn_u16(vcombine_u16(store_16_3, store_16_3)); + let store_16_8_0 = vqmovn_u16(vcombine_u16(store_16_0, store_16_0)); + let store_16_8_1 = vqmovn_u16(vcombine_u16(store_16_1, store_16_1)); + let store_16_8_2 = vqmovn_u16(vcombine_u16(store_16_2, store_16_2)); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16_3, store_16_3)); - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_0)); - let dest_ptr_32 = chunk0.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); + let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_0)); + let dest_ptr_32 = chunk0.as_mut_ptr() as *mut u32; + dest_ptr_32.write_unaligned(pixel); - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_1)); - let dest_ptr_32 = chunk1.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); + let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_1)); + let dest_ptr_32 = chunk1.as_mut_ptr() as *mut u32; + dest_ptr_32.write_unaligned(pixel); - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_2)); - let dest_ptr_32 = chunk2.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); + let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_2)); + let dest_ptr_32 = chunk2.as_mut_ptr() as *mut u32; + dest_ptr_32.write_unaligned(pixel); - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); - let dest_ptr_32 = chunk3.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); - } + let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); + let dest_ptr_32 = chunk3.as_mut_ptr() as *mut u32; + dest_ptr_32.write_unaligned(pixel); } } @@ -581,86 +594,94 @@ pub fn convolve_horizontal_rgba_neon_row_i16( filter_weights: &FilterWeights, ) { unsafe { - const SCALE: i32 = 6; - const ROUNDING: i16 = 1 << (SCALE - 1); - let zeros = vdup_n_s16(0i16); - const CHANNELS: usize = 4; - - for ((dst, bounds), weights) in dst - .chunks_exact_mut(CHANNELS) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let bounds_size = bounds.size; - let mut jx = 0usize; - let mut store = vdup_n_s16(ROUNDING); + convolve_horizontal_rgba_neon_row_i16_impl(src, dst, filter_weights); + } +} - while jx + 8 < bounds_size { - let bounds_start = bounds.start + jx; - let w_ptr = weights.get_unchecked(jx..(jx + 8)); - let weights_set = vld1q_s16(w_ptr.as_ptr()); - let w0 = vdup_laneq_s16::<0>(weights_set); - let w1 = vdup_laneq_s16::<1>(weights_set); - let w2 = vdup_laneq_s16::<2>(weights_set); - let w3 = vdup_laneq_s16::<3>(weights_set); - let w4 = vdup_laneq_s16::<4>(weights_set); - let w5 = vdup_laneq_s16::<5>(weights_set); - let w6 = 
vdup_laneq_s16::<6>(weights_set); - let w7 = vdup_laneq_s16::<7>(weights_set); - let set1 = (w0, w1, w2, w3); - let set2 = (w4, w5, w6, w7); - store = conv_horiz_rgba_8_u8_i16::(bounds_start, src, set1, set2, store); - jx += 8; - } +#[target_feature(enable = "rdm")] +unsafe fn convolve_horizontal_rgba_neon_row_i16_impl( + src: &[u8], + dst: &mut [u8], + filter_weights: &FilterWeights, +) { + const SCALE: i32 = 6; + const ROUNDING: i16 = 1 << (SCALE - 1); + let zeros = vdup_n_s16(0i16); + const CHANNELS: usize = 4; + + for ((dst, bounds), weights) in dst + .chunks_exact_mut(CHANNELS) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let bounds_size = bounds.size; + let mut jx = 0usize; + let mut store = vdup_n_s16(ROUNDING); + + while jx + 8 < bounds_size { + let bounds_start = bounds.start + jx; + let w_ptr = weights.get_unchecked(jx..(jx + 8)); + let weights_set = vld1q_s16(w_ptr.as_ptr()); + let w0 = vdup_laneq_s16::<0>(weights_set); + let w1 = vdup_laneq_s16::<1>(weights_set); + let w2 = vdup_laneq_s16::<2>(weights_set); + let w3 = vdup_laneq_s16::<3>(weights_set); + let w4 = vdup_laneq_s16::<4>(weights_set); + let w5 = vdup_laneq_s16::<5>(weights_set); + let w6 = vdup_laneq_s16::<6>(weights_set); + let w7 = vdup_laneq_s16::<7>(weights_set); + let set1 = (w0, w1, w2, w3); + let set2 = (w4, w5, w6, w7); + store = conv_horiz_rgba_8_u8_i16::(bounds_start, src, set1, set2, store); + jx += 8; + } - while jx + 4 < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = vld1_s16(w_ptr.as_ptr()); - let weight0 = vdup_lane_s16::<0>(weights); - let weight1 = vdup_lane_s16::<1>(weights); - let weight2 = vdup_lane_s16::<2>(weights); - let weight3 = vdup_lane_s16::<3>(weights); - let bounds_start = bounds.start + jx; - store = conv_horiz_rgba_4_u8_i16::( - bounds_start, - src, - weight0, - weight1, - weight2, - weight3, - store, - ); - jx += 4; - } + while jx + 4 < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = vld1_s16(w_ptr.as_ptr()); + let weight0 = vdup_lane_s16::<0>(weights); + let weight1 = vdup_lane_s16::<1>(weights); + let weight2 = vdup_lane_s16::<2>(weights); + let weight3 = vdup_lane_s16::<3>(weights); + let bounds_start = bounds.start + jx; + store = conv_horiz_rgba_4_u8_i16::( + bounds_start, + src, + weight0, + weight1, + weight2, + weight3, + store, + ); + jx += 4; + } - while jx + 2 < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - let weight0 = vld1_dup_s16(w_ptr.as_ptr()); - let weight1 = vld1_dup_s16(w_ptr.get_unchecked(1..).as_ptr()); - store = - conv_horiz_rgba_2_u8_i16::(bounds_start, src, weight0, weight1, store); - jx += 2; - } + while jx + 2 < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = bounds.start + jx; + let weight0 = vld1_dup_s16(w_ptr.as_ptr()); + let weight1 = vld1_dup_s16(w_ptr.get_unchecked(1..).as_ptr()); + store = conv_horiz_rgba_2_u8_i16::(bounds_start, src, weight0, weight1, store); + jx += 2; + } - while jx < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let weight0 = vld1_dup_s16(w_ptr.as_ptr()); - let bounds_start = bounds.start + jx; - store = conv_horiz_rgba_1_u8_i16::(bounds_start, src, weight0, store); - jx += 1; - } + while jx < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + let weight0 = vld1_dup_s16(w_ptr.as_ptr()); + let bounds_start = bounds.start + jx; + store = 
conv_horiz_rgba_1_u8_i16::(bounds_start, src, weight0, store); + jx += 1; + } - let store_16 = vreinterpret_u16_s16(vshr_n_s16::(vmax_s16(store, zeros))); - let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); + let store_16 = vreinterpret_u16_s16(vshr_n_s16::(vmax_s16(store, zeros))); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); - let value = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); - let dest_ptr_32 = dst.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(value); - } + let value = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); + let dest_ptr_32 = dst.as_mut_ptr() as *mut u32; + dest_ptr_32.write_unaligned(value); } } diff --git a/src/neon/vertical_ar30.rs b/src/neon/vertical_ar30.rs index b6dae91..b5c3f2f 100644 --- a/src/neon/vertical_ar30.rs +++ b/src/neon/vertical_ar30.rs @@ -34,6 +34,7 @@ use std::arch::aarch64::{ vrshrq_n_s16, vshlq_n_s16, vst1q_u32_x2, }; +#[inline(always)] pub(crate) fn neon_column_handler_fixed_point_ar30< const AR30_TYPE: usize, const AR30_ORDER: usize, @@ -43,6 +44,24 @@ pub(crate) fn neon_column_handler_fixed_point_ar30< dst: &mut [u32], src_stride: usize, weight: &[i16], +) { + unsafe { + neon_column_handler_fixed_point_ar30_impl::( + bounds, src, dst, src_stride, weight, + ); + } +} + +#[target_feature(enable = "rdm")] +unsafe fn neon_column_handler_fixed_point_ar30_impl< + const AR30_TYPE: usize, + const AR30_ORDER: usize, +>( + bounds: &FilterBounds, + src: &[u32], + dst: &mut [u32], + src_stride: usize, + weight: &[i16], ) { let mut cx = 0usize; diff --git a/src/neon/vertical_u8.rs b/src/neon/vertical_u8.rs index 32eccf5..392f9dc 100644 --- a/src/neon/vertical_u8.rs +++ b/src/neon/vertical_u8.rs @@ -67,7 +67,9 @@ pub fn convolve_vertical_neon_i16_precision( src_stride: usize, weight: &[i16], ) { - convolve_vertical_neon_row_upper(width, bounds, src, dst, src_stride, weight); + unsafe { + convolve_vertical_neon_row_upper(width, bounds, src, dst, src_stride, weight); + } } pub fn convolve_vertical_neon_i32_precision( @@ -95,7 +97,8 @@ unsafe fn vdot( (store0, store1) } -fn convolve_vertical_neon_row_upper( +#[target_feature(enable = "rdm")] +unsafe fn convolve_vertical_neon_row_upper( _: usize, bounds: &FilterBounds, src: &[u8], From 7b524f029a8f6e79a287014d1c95c6cd7eedc9d8 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Mon, 9 Dec 2024 16:25:37 +0000 Subject: [PATCH 10/19] Improvements in NEON --- .github/workflows/no-response.yml | 3 + Cargo.lock | 2 +- Cargo.toml | 2 +- README.md | 9 +- app/src/main.rs | 132 +++--- src/handler_provider.rs | 4 +- src/neon/mod.rs | 2 +- src/neon/plane_f32.rs | 25 +- src/neon/plane_u8.rs | 17 +- src/neon/rgb_f32.rs | 3 +- src/neon/rgb_u8.rs | 71 ++- src/neon/rgba_f32.rs | 7 +- src/neon/rgba_u16_lb.rs | 128 ++---- src/neon/rgba_u8.rs | 291 +++++-------- src/neon/utils.rs | 64 ++- src/neon/vertical_f32.rs | 13 +- src/neon/vertical_u16_lb.rs | 194 ++++----- src/neon/vertical_u8.rs | 702 ++++++++++++++++-------------- 18 files changed, 768 insertions(+), 901 deletions(-) diff --git a/.github/workflows/no-response.yml b/.github/workflows/no-response.yml index a4b4351..fc5cbc5 100644 --- a/.github/workflows/no-response.yml +++ b/.github/workflows/no-response.yml @@ -7,6 +7,9 @@ on: jobs: noResponse: + permissions: + issues: write + pull-requests: write runs-on: ubuntu-latest steps: - uses: actions/stale@v9 diff --git a/Cargo.lock b/Cargo.lock index 99be909..bdc39dd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -773,7 +773,7 @@ checksum = 
"57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pic-scale" -version = "0.3.7" +version = "0.3.6" dependencies = [ "colorutils-rs", "half", diff --git a/Cargo.toml b/Cargo.toml index 5dc3d81..019bce5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ workspace = { members = ["app", "wasm"] } [package] name = "pic-scale" -version = "0.3.7" +version = "0.3.6" edition = "2021" description = "High performance image scaling" readme = "README.md" diff --git a/README.md b/README.md index 24ab676..6b24ff1 100644 --- a/README.md +++ b/README.md @@ -86,16 +86,9 @@ Even when `half` feature activated but platform do not support or features not e Example comparison with `fast-image-resize` time for downscale RGB 4928x3279 image in 4 times. -| Lanczos3 | SSE | AVX | NEON | -|-----------|:-----:|:-----:|:-----:| -| pic-scale | 43.84 | 28.46 | 8.56 | -| fir | 45.36 | 32.07 | 32.77 | - -Example comparison with `fast-image-resize` time for downscale RGB 4928x3279 image in 4 times. - | Lanczos3 | AVX | NEON | |-----------|:-----:|:-----:| -| pic-scale | 16.67 | 10.88 | +| pic-scale | 16.67 | 8.54 | | fir | 22.83 | 24.97 | Example comparison time for downscale RGBA 4928x3279 image in two times with premultiplying alpha. diff --git a/app/src/main.rs b/app/src/main.rs index d412892..3fee90c 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -28,63 +28,51 @@ fn main() { let transient = img.to_rgba8(); let mut bytes = Vec::from(transient.as_bytes()); - let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + let mut scaler = Scaler::new(ResamplingFunction::Bilinear); scaler.set_threading_policy(ThreadingPolicy::Single); - let mut ar30_src = vec![0u32; dimensions.0 as usize * dimensions.1 as usize]; - rgba8_to_ra30( - &mut ar30_src, - dimensions.0, - Rgb30ByteOrder::Host, - &bytes, - dimensions.0 * 4, - dimensions.0, - dimensions.1, - ) - .unwrap(); - // let mut choke: Vec = bytes.iter().map(|&x| (x as u16) << 2).collect(); // - // let store = - // ImageStore::::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize) - // .unwrap(); + let store = + ImageStore::::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize) + .unwrap(); let dst_size = ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2); - let mut resized_ar = vec![0u32; dst_size.width * dst_size.height]; + // let mut resized_ar = vec![0u32; dst_size.width * dst_size.height]; let start_time = Instant::now(); - scaler - .resize_ra30( - &ar30_src, - ImageSize::new(dimensions.0 as usize, dimensions.1 as usize), - &mut resized_ar, - dst_size, - Ar30ByteOrder::Host, - ) - .unwrap(); - - // let resized = scaler - // .resize_rgba( - // ImageSize::new(dimensions.0 as usize / 8, dimensions.1 as usize / 8), - // store, - // false, + // scaler + // .resize_ra30( + // &ar30_src, + // ImageSize::new(dimensions.0 as usize, dimensions.1 as usize), + // &mut resized_ar, + // dst_size, + // Ar30ByteOrder::Host, // ) // .unwrap(); + let resized = scaler + .resize_rgba( + ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), + store, + false, + ) + .unwrap(); + let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds println!("Scaler: {:.2?}", elapsed_time); - let mut resized = vec![0u8; dst_size.width * dst_size.height * 4]; - ra30_to_rgba8( - &resized_ar, - dst_size.width as u32, - Rgb30ByteOrder::Host, - &mut resized, - dst_size.width as u32 * 4, - dst_size.width as u32, - dst_size.height as u32, - ) - .unwrap(); + // let mut resized = 
diff --git a/app/src/main.rs b/app/src/main.rs
index d412892..3fee90c 100644
--- a/app/src/main.rs
+++ b/app/src/main.rs
@@ -28,63 +28,51 @@ fn main() {
     let transient = img.to_rgba8();
     let mut bytes = Vec::from(transient.as_bytes());
 
-    let mut scaler = Scaler::new(ResamplingFunction::Lanczos3);
+    let mut scaler = Scaler::new(ResamplingFunction::Bilinear);
     scaler.set_threading_policy(ThreadingPolicy::Single);
-    let mut ar30_src = vec![0u32; dimensions.0 as usize * dimensions.1 as usize];
-    rgba8_to_ra30(
-        &mut ar30_src,
-        dimensions.0,
-        Rgb30ByteOrder::Host,
-        &bytes,
-        dimensions.0 * 4,
-        dimensions.0,
-        dimensions.1,
-    )
-    .unwrap();
-    // let mut choke: Vec<u16> = bytes.iter().map(|&x| (x as u16) << 2).collect();
     //
-    // let store =
-    //     ImageStore::<u8, 4>::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize)
-    //         .unwrap();
+    let store =
+        ImageStore::<u8, 4>::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize)
+            .unwrap();
 
     let dst_size = ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2);
-    let mut resized_ar = vec![0u32; dst_size.width * dst_size.height];
 
     let start_time = Instant::now();
 
-    scaler
-        .resize_ra30(
-            &ar30_src,
-            ImageSize::new(dimensions.0 as usize, dimensions.1 as usize),
-            &mut resized_ar,
-            dst_size,
-            Ar30ByteOrder::Host,
-        )
-        .unwrap();
-
-    // let resized = scaler
-    //     .resize_rgba(
-    //         ImageSize::new(dimensions.0 as usize / 8, dimensions.1 as usize / 8),
-    //         store,
-    //         false,
+    // scaler
+    //     .resize_ra30(
+    //         &ar30_src,
+    //         ImageSize::new(dimensions.0 as usize, dimensions.1 as usize),
+    //         &mut resized_ar,
+    //         dst_size,
+    //         Ar30ByteOrder::Host,
     //     )
     //     .unwrap();
 
+    let resized = scaler
+        .resize_rgba(
+            ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2),
+            store,
+            false,
+        )
+        .unwrap();
+
     let elapsed_time = start_time.elapsed();
     // Print the elapsed time in milliseconds
     println!("Scaler: {:.2?}", elapsed_time);
 
-    let mut resized = vec![0u8; dst_size.width * dst_size.height * 4];
-    ra30_to_rgba8(
-        &resized_ar,
-        dst_size.width as u32,
-        Rgb30ByteOrder::Host,
-        &mut resized,
-        dst_size.width as u32 * 4,
-        dst_size.width as u32,
-        dst_size.height as u32,
-    )
-    .unwrap();
+    // let mut resized = vec![0u8; dst_size.width * dst_size.height * 4];
+    // ra30_to_rgba8(
+    //     &resized_ar,
+    //     dst_size.width as u32,
+    //     Rgb30ByteOrder::Host,
+    //     &mut resized,
+    //     dst_size.width as u32 * 4,
+    //     dst_size.width as u32,
+    //     dst_size.height as u32,
+    // )
+    // .unwrap();
 
     // let dst: Vec<u8> = resized.as_bytes().iter().map(|&x| x).collect::<Vec<u8>>();
     // println!("f1 {}, f2 {}, f3 {}, f4 {}", dst[0], dst[1], dst[2], dst[3]);
@@ -147,36 +135,36 @@ fn main() {
     // let dst: Vec<u8> = resized.as_bytes().iter().map(|&x| (x >> 2) as u8).collect();
     //
-    // let dst = resized.as_bytes();
-    let dst = resized;
-    image::save_buffer(
-        "converted.png",
-        &dst,
-        dst_size.width as u32,
-        dst_size.height as u32,
-        image::ColorType::Rgba8,
-    )
-    .unwrap();
+    let dst = resized.as_bytes();
+    // let dst = resized;
+    // image::save_buffer(
+    //     "converted.png",
+    //     &dst,
+    //     dst_size.width as u32,
+    //     dst_size.height as u32,
+    //     image::ColorType::Rgba8,
+    // )
+    // .unwrap();
 
-    // if resized.channels == 4 {
-    //     image::save_buffer(
-    //         "converted.png",
-    //         &dst,
-    //         resized.width as u32,
-    //         resized.height as u32,
-    //         image::ColorType::Rgba8,
-    //     )
-    //     .unwrap();
-    // } else {
-    //     image::save_buffer(
-    //         "converted.png",
-    //         &dst,
-    //         resized.width as u32,
-    //         resized.height as u32,
-    //         image::ColorType::Rgb8,
-    //     )
-    //     .unwrap();
-    // }
+    if resized.channels == 4 {
+        image::save_buffer(
+            "converted.png",
+            &dst,
+            resized.width as u32,
+            resized.height as u32,
+            image::ColorType::Rgba8,
+        )
+        .unwrap();
+    } else {
+        image::save_buffer(
+            "converted.png",
+            &dst,
+            resized.width as u32,
+            resized.height as u32,
+            image::ColorType::Rgb8,
+        )
+        .unwrap();
+    }
 
     // for i in 0..37 {
     //     let mut scaler = Scaler::new(i.into());
diff --git a/src/handler_provider.rs b/src/handler_provider.rs
index 8ffbfe4..f725b70 100644
--- a/src/handler_provider.rs
+++ b/src/handler_provider.rs
@@ -38,7 +38,7 @@ use crate::floating_point_vertical::column_handler_floating_point;
 use crate::mixed_storage::MixedStorage;
 #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
 use crate::neon::{
-    convolve_column_lb_u16, convolve_column_u16, convolve_horizontal_rgba_neon_rows_4_lb_u8,
+    convolve_column_lb_u16, convolve_column_u16, convolve_horizontal_rgba_neon_rows_4_lb_u16,
     convolve_horizontal_rgba_neon_u16_lb_row,
 };
 use crate::saturate_narrow::SaturateNarrow;
@@ -382,7 +382,7 @@ impl RowHandlerFixedPoint for u16 {
     u16: AsPrimitive,
 {
     if COMPONENTS == 4 {
-        convolve_horizontal_rgba_neon_rows_4_lb_u8(
+        convolve_horizontal_rgba_neon_rows_4_lb_u16(
             src,
             src_stride,
             dst,
diff --git a/src/neon/mod.rs b/src/neon/mod.rs
index 1327b60..f937e95 100644
--- a/src/neon/mod.rs
+++ b/src/neon/mod.rs
@@ -102,7 +102,7 @@ pub use rgba_f16_full::{
 };
 pub use rgba_f32::*;
 pub use rgba_u16_lb::{
-    convolve_horizontal_rgba_neon_rows_4_lb_u8, convolve_horizontal_rgba_neon_u16_lb_row,
+    convolve_horizontal_rgba_neon_rows_4_lb_u16, convolve_horizontal_rgba_neon_u16_lb_row,
 };
 pub use rgba_u8::*;
 pub(crate) use vertical_ar30::neon_column_handler_fixed_point_ar30;
diff --git a/src/neon/plane_f32.rs b/src/neon/plane_f32.rs
index 576a8e0..e13e5b3 100644
--- a/src/neon/plane_f32.rs
+++ b/src/neon/plane_f32.rs
@@ -28,14 +28,14 @@
  */
 use crate::filter_weights::FilterWeights;
-use crate::neon::utils::prefer_vfmaq_f32;
+use crate::neon::utils::{prefer_vfmaq_f32, xvld1q_f32_x4};
 use std::arch::aarch64::*;
 
 macro_rules!
conv_horiz_plane_16_f32 { ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ let src_ptr = $src.add($start_x); - let rgb_pixel = vld1q_f32_x4(src_ptr); + let rgb_pixel = xvld1q_f32_x4(src_ptr); let mut acc = prefer_vfmaq_f32($store, rgb_pixel.0, $set.0); acc = prefer_vfmaq_f32(acc, rgb_pixel.1, $set.1); @@ -87,13 +87,6 @@ macro_rules! conv_horiz_plane_1_f32 { }}; } -macro_rules! vfullq_sum_f32 { - ($reg: expr) => {{ - let acc = vadd_f32(vget_low_f32($reg), vget_high_f32($reg)); - vpadds_f32(acc) - }}; -} - pub fn convolve_horizontal_plane_neon_row_one( dst_width: usize, _: usize, @@ -113,7 +106,7 @@ pub fn convolve_horizontal_plane_neon_row_one( while jx + 16 < bounds.size { let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = vld1q_f32_x4(ptr); + let read_weights = xvld1q_f32_x4(ptr); store = conv_horiz_plane_16_f32!( bounds_start, unsafe_source_ptr_0, @@ -165,7 +158,7 @@ pub fn convolve_horizontal_plane_neon_row_one( let px = x; let dest_ptr = unsafe_destination_ptr_0.add(px); - dest_ptr.write_unaligned(vfullq_sum_f32!(store)); + dest_ptr.write_unaligned(vaddvq_f32(store)); filter_offset += filter_weights.aligned_size; } @@ -196,7 +189,7 @@ pub fn convolve_horizontal_plane_neon_rows_4( while jx + 16 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = vld1q_f32_x4(ptr); + let read_weights = xvld1q_f32_x4(ptr); let bounds_start = bounds.start + jx; store_0 = conv_horiz_plane_16_f32!( bounds_start, @@ -303,16 +296,16 @@ pub fn convolve_horizontal_plane_neon_rows_4( let px = x; let dest_ptr = unsafe_destination_ptr_0.add(px); - dest_ptr.write_unaligned(vfullq_sum_f32!(store_0)); + dest_ptr.write_unaligned(vaddvq_f32(store_0)); let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); - dest_ptr.write_unaligned(vfullq_sum_f32!(store_1)); + dest_ptr.write_unaligned(vaddvq_f32(store_1)); let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); - dest_ptr.write_unaligned(vfullq_sum_f32!(store_2)); + dest_ptr.write_unaligned(vaddvq_f32(store_2)); let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); - dest_ptr.write_unaligned(vfullq_sum_f32!(store_3)); + dest_ptr.write_unaligned(vaddvq_f32(store_3)); filter_offset += filter_weights.aligned_size; } diff --git a/src/neon/plane_u8.rs b/src/neon/plane_u8.rs index 427494b..ee3d581 100644 --- a/src/neon/plane_u8.rs +++ b/src/neon/plane_u8.rs @@ -30,13 +30,6 @@ use crate::filter_weights::FilterWeights; use crate::support::{PRECISION, ROUNDING_CONST}; use std::arch::aarch64::*; -macro_rules! vfullq_sum_s32 { - ($reg: expr) => {{ - let acc = vadd_s32(vget_low_s32($reg), vget_high_s32($reg)); - vget_lane_s32::<0>(vpadd_s32(acc, acc)) - }}; -} - macro_rules! 
accumulate_16_horiz { ($store: expr, $ptr: expr, $weights: expr) => {{ let pixel_colors = vld1q_u8($ptr); @@ -209,22 +202,22 @@ pub fn convolve_horizontal_plane_neon_rows_4_u8( jx += 1; } - let sums = vfullq_sum_s32!(store0).max(0); + let sums = vaddvq_s32(store0).max(0); let shifted = sums >> PRECISION; let value = shifted.min(255) as u8; *chunk0 = value; - let sums = vfullq_sum_s32!(store1).max(0); + let sums = vaddvq_s32(store1).max(0); let shifted = sums >> PRECISION; let value = shifted.min(255) as u8; *chunk1 = value; - let sums = vfullq_sum_s32!(store2).max(0); + let sums = vaddvq_s32(store2).max(0); let shifted = sums >> PRECISION; let value = shifted.min(255) as u8; *chunk2 = value; - let sums = vfullq_sum_s32!(store3).max(0); + let sums = vaddvq_s32(store3).max(0); let shifted = sums >> PRECISION; let value = shifted.min(255) as u8; *chunk3 = value; @@ -291,7 +284,7 @@ pub fn convolve_horizontal_plane_neon_row( jx += 1; } - let sums = vfullq_sum_s32!(store).max(0); + let sums = vaddvq_s32(store).max(0); let shifted = sums >> PRECISION; let value = shifted.min(255) as u8; *dst = value; diff --git a/src/neon/rgb_f32.rs b/src/neon/rgb_f32.rs index 6df1b0c..6934bbb 100644 --- a/src/neon/rgb_f32.rs +++ b/src/neon/rgb_f32.rs @@ -30,6 +30,7 @@ use std::arch::aarch64::*; use crate::filter_weights::FilterWeights; +use crate::neon::utils::xvld1q_f32_x4; use crate::neon::utils::{prefer_vfmaq_f32, vsplit_rgb_5}; macro_rules! write_rgb_f32 { @@ -46,7 +47,7 @@ macro_rules! conv_horiz_5_rgb_f32 { const COMPONENTS: usize = 3; let src_ptr = $src.add($start_x * COMPONENTS); - let full_pixel = vld1q_f32_x4(src_ptr); + let full_pixel = xvld1q_f32_x4(src_ptr); let splat = vsplit_rgb_5(full_pixel); let mut acc = prefer_vfmaq_f32($store, splat.0, $set.0); diff --git a/src/neon/rgb_u8.rs b/src/neon/rgb_u8.rs index 7bd7fcd..8192ec2 100644 --- a/src/neon/rgb_u8.rs +++ b/src/neon/rgb_u8.rs @@ -32,14 +32,11 @@ use crate::neon::utils::load_3b_as_u16x4; use crate::support::{PRECISION, ROUNDING_CONST}; use std::arch::aarch64::*; -#[inline] +#[inline(always)] unsafe fn conv_horiz_rgba_4_u8( start_x: usize, src: &[u8], - w0: int16x4_t, - w1: int16x8_t, - w2: int16x4_t, - w3: int16x8_t, + weights: int16x4_t, store: int32x4_t, shuffle: uint8x16_t, ) -> int32x4_t { @@ -57,18 +54,17 @@ unsafe fn conv_horiz_rgba_4_u8( let hi = vreinterpretq_s16_u16(vmovl_high_u8(rgb_pixel)); let lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgb_pixel))); - let acc = vmlal_high_s16(store, hi, w3); - let acc = vmlal_s16(acc, vget_low_s16(hi), w2); - let acc = vmlal_high_s16(acc, lo, w1); - vmlal_s16(acc, vget_low_s16(lo), w0) + let acc = vmlal_high_lane_s16::<3>(store, hi, weights); + let acc = vmlal_lane_s16::<2>(acc, vget_low_s16(hi), weights); + let acc = vmlal_high_lane_s16::<1>(acc, lo, weights); + vmlal_lane_s16::<0>(acc, vget_low_s16(lo), weights) } -#[inline] +#[inline(always)] unsafe fn conv_horiz_rgba_2_u8( start_x: usize, src: &[u8], - w0: int16x4_t, - w1: int16x8_t, + weights: int16x4_t, store: int32x4_t, shuffle: uint8x8_t, ) -> int32x4_t { @@ -84,11 +80,11 @@ unsafe fn conv_horiz_rgba_2_u8( let wide = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(rgb_pixel))); - let acc = vmlal_high_s16(store, wide, w1); - vmlal_s16(acc, vget_low_s16(wide), w0) + let acc = vmlal_high_lane_s16::<1>(store, wide, weights); + vmlal_lane_s16::<0>(acc, vget_low_s16(wide), weights) } -#[inline] +#[inline(always)] unsafe fn conv_horiz_rgba_1_u8( start_x: usize, src: &[u8], @@ -102,10 +98,9 @@ unsafe fn conv_horiz_rgba_1_u8( 
 vmlal_s16(store, lo, w0)
 }
 
-#[inline]
+#[inline(always)]
 unsafe fn write_accumulator_u8(store: int32x4_t, dst: &mut [u8]) {
-    let zeros = vdupq_n_s32(0i32);
-    let store_16 = vqshrun_n_s32::<PRECISION>(vmaxq_s32(store, zeros));
+    let store_16 = vqshrun_n_s32::<PRECISION>(store);
     let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16));
     let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8));
     let bytes = pixel.to_le_bytes();
@@ -167,30 +162,22 @@ pub fn convolve_horizontal_rgb_neon_rows_4(
                 let bounds_start = bounds.start + jx;
                 let w_ptr = weights.get_unchecked(jx..(jx + 4));
                 let weights = vld1_s16(w_ptr.as_ptr());
-                let w0 = vdup_lane_s16::<0>(weights);
-                let w1 = vdupq_lane_s16::<1>(weights);
-                let w2 = vdup_lane_s16::<2>(weights);
-                let w3 = vdupq_lane_s16::<3>(weights);
-                store_0 =
-                    conv_horiz_rgba_4_u8(bounds_start, src0, w0, w1, w2, w3, store_0, shuffle);
-                store_1 =
-                    conv_horiz_rgba_4_u8(bounds_start, src1, w0, w1, w2, w3, store_1, shuffle);
-                store_2 =
-                    conv_horiz_rgba_4_u8(bounds_start, src2, w0, w1, w2, w3, store_2, shuffle);
-                store_3 =
-                    conv_horiz_rgba_4_u8(bounds_start, src3, w0, w1, w2, w3, store_3, shuffle);
+                store_0 = conv_horiz_rgba_4_u8(bounds_start, src0, weights, store_0, shuffle);
+                store_1 = conv_horiz_rgba_4_u8(bounds_start, src1, weights, store_1, shuffle);
+                store_2 = conv_horiz_rgba_4_u8(bounds_start, src2, weights, store_2, shuffle);
+                store_3 = conv_horiz_rgba_4_u8(bounds_start, src3, weights, store_3, shuffle);
                 jx += 4;
             }
 
             while jx + 2 < bounds.size {
                 let w_ptr = weights.get_unchecked(jx..(jx + 2));
                 let bnds = bounds.start + jx;
-                let w0 = vld1_dup_s16(w_ptr.as_ptr());
-                let w1 = vld1q_dup_s16(w_ptr.get_unchecked(1..).as_ptr());
-                store_0 = conv_horiz_rgba_2_u8(bnds, src0, w0, w1, store_0, shuffle_1);
-                store_1 = conv_horiz_rgba_2_u8(bnds, src1, w0, w1, store_1, shuffle_1);
-                store_2 = conv_horiz_rgba_2_u8(bnds, src2, w0, w1, store_2, shuffle_1);
-                store_3 = conv_horiz_rgba_2_u8(bnds, src3, w0, w1, store_3, shuffle_1);
+                let mut v_weight = vld1_dup_s16(w_ptr.as_ptr());
+                v_weight = vld1_lane_s16::<1>(w_ptr.as_ptr().add(1), v_weight);
+                store_0 = conv_horiz_rgba_2_u8(bnds, src0, v_weight, store_0, shuffle_1);
+                store_1 = conv_horiz_rgba_2_u8(bnds, src1, v_weight, store_1, shuffle_1);
+                store_2 = conv_horiz_rgba_2_u8(bnds, src2, v_weight, store_2, shuffle_1);
+                store_3 = conv_horiz_rgba_2_u8(bnds, src3, v_weight, store_3, shuffle_1);
                 jx += 2;
             }
 
@@ -245,20 +232,16 @@ pub fn convolve_horizontal_rgb_neon_row_one(
             let bounds_start = bounds.start + jx;
             let w_ptr = weights.get_unchecked(jx..(jx + 4));
             let weights = vld1_s16(w_ptr.as_ptr());
-            let w0 = vdup_lane_s16::<0>(weights);
-            let w1 = vdupq_lane_s16::<1>(weights);
-            let w2 = vdup_lane_s16::<2>(weights);
-            let w3 = vdupq_lane_s16::<3>(weights);
-            store = conv_horiz_rgba_4_u8(bounds_start, src, w0, w1, w2, w3, store, shuffle);
+            store = conv_horiz_rgba_4_u8(bounds_start, src, weights, store, shuffle);
             jx += 4;
         }
 
         while jx + 2 < bounds_size {
            let w_ptr = weights.get_unchecked(jx..(jx + 2));
            let bounds_start = bounds.start + jx;
-           let weight0 = vld1_dup_s16(w_ptr.as_ptr());
-           let weight1 = vld1q_dup_s16(w_ptr.get_unchecked(1..).as_ptr());
-           store = conv_horiz_rgba_2_u8(bounds_start, src, weight0, weight1, store, shuffle_1);
+           let mut v_weight = vld1_dup_s16(w_ptr.as_ptr());
+           v_weight = vld1_lane_s16::<1>(w_ptr.as_ptr().add(1), v_weight);
+           store = conv_horiz_rgba_2_u8(bounds_start, src, v_weight, store, shuffle_1);
            jx += 2;
        }
diff --git a/src/neon/rgba_f32.rs b/src/neon/rgba_f32.rs
index d087703..0f60df9 100644
---
a/src/neon/rgba_f32.rs +++ b/src/neon/rgba_f32.rs @@ -29,6 +29,7 @@ use crate::filter_weights::FilterWeights; use crate::neon::utils::prefer_vfmaq_f32; +use crate::neon::utils::xvld1q_f32_x4; use std::arch::aarch64::*; macro_rules! conv_horiz_rgba_8_f32 { @@ -36,8 +37,8 @@ macro_rules! conv_horiz_rgba_8_f32 { const COMPONENTS: usize = 4; let src_ptr = $src.add($start_x * COMPONENTS); - let rgb_pixel0 = vld1q_f32_x4(src_ptr); - let rgb_pixel1 = vld1q_f32_x4(src_ptr.add(16)); + let rgb_pixel0 = xvld1q_f32_x4(src_ptr); + let rgb_pixel1 = xvld1q_f32_x4(src_ptr.add(16)); let mut acc = prefer_vfmaq_f32($store, rgb_pixel0.0, $set1.0); acc = prefer_vfmaq_f32(acc, rgb_pixel0.1, $set1.1); @@ -56,7 +57,7 @@ macro_rules! conv_horiz_rgba_4_f32 { const COMPONENTS: usize = 4; let src_ptr = $src.add($start_x * COMPONENTS); - let rgb_pixel = vld1q_f32_x4(src_ptr); + let rgb_pixel = xvld1q_f32_x4(src_ptr); let acc = prefer_vfmaq_f32($store, rgb_pixel.0, $set1.0); let acc = prefer_vfmaq_f32(acc, rgb_pixel.1, $set1.1); diff --git a/src/neon/rgba_u16_lb.rs b/src/neon/rgba_u16_lb.rs index ba8285d..36f2d91 100644 --- a/src/neon/rgba_u16_lb.rs +++ b/src/neon/rgba_u16_lb.rs @@ -27,10 +27,11 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ use crate::filter_weights::FilterWeights; +use crate::neon::utils::{xvld1q_u16_x2, xvld1q_u16_x4}; use crate::support::{PRECISION, ROUNDING_CONST}; use std::arch::aarch64::*; -#[inline] +#[inline(always)] unsafe fn conv_horiz_rgba_1_u16( start_x: usize, src: &[u16], @@ -44,7 +45,7 @@ unsafe fn conv_horiz_rgba_1_u16( vmlal_s16(store, lo, w0) } -#[inline] +#[inline(always)] unsafe fn conv_horiz_rgba_2_u16( start_x: usize, src: &[u16], @@ -62,61 +63,57 @@ unsafe fn conv_horiz_rgba_2_u16( vmlal_s16(acc, vget_low_s16(wide), w0) } -#[inline] +#[inline(always)] unsafe fn conv_horiz_rgba_4_u16( start_x: usize, src: &[u16], - w0: int16x4_t, - w1: int16x8_t, - w2: int16x4_t, - w3: int16x8_t, + weights: int16x4_t, store: int32x4_t, ) -> int32x4_t { const COMPONENTS: usize = 4; let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - let rgba_pixel = vld1q_u16_x2(src_ptr.as_ptr()); + let rgba_pixel = xvld1q_u16_x2(src_ptr.as_ptr()); let hi = vreinterpretq_s16_u16(rgba_pixel.1); let lo = vreinterpretq_s16_u16(rgba_pixel.0); - let acc = vmlal_high_s16(store, hi, w3); - let acc = vmlal_s16(acc, vget_low_s16(hi), w2); - let acc = vmlal_high_s16(acc, lo, w1); - vmlal_s16(acc, vget_low_s16(lo), w0) + let acc = vmlal_high_lane_s16::<3>(store, hi, weights); + let acc = vmlal_lane_s16::<2>(acc, vget_low_s16(hi), weights); + let acc = vmlal_high_lane_s16::<1>(acc, lo, weights); + vmlal_lane_s16::<0>(acc, vget_low_s16(lo), weights) } #[inline(always)] unsafe fn conv_horiz_rgba_8_u16( start_x: usize, src: &[u16], - set1: (int16x8_t, int16x8_t, int16x8_t, int16x8_t), - set2: (int16x8_t, int16x8_t, int16x8_t, int16x8_t), + weights: int16x8_t, store: int32x4_t, ) -> int32x4_t { const COMPONENTS: usize = 4; let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - let rgba_pixel = vld1q_u16_x4(src_ptr.as_ptr()); + let rgba_pixel = xvld1q_u16_x4(src_ptr.as_ptr()); let hi0 = vreinterpretq_s16_u16(rgba_pixel.1); let lo0 = vreinterpretq_s16_u16(rgba_pixel.0); let hi1 = vreinterpretq_s16_u16(rgba_pixel.3); let lo1 = vreinterpretq_s16_u16(rgba_pixel.2); - let mut acc = vmlal_high_s16(store, hi0, set1.3); - acc = vmlal_s16(acc, vget_low_s16(hi0), vget_low_s16(set1.2)); - acc = vmlal_high_s16(acc, lo0, set1.1); - acc = vmlal_s16(acc, vget_low_s16(lo0), vget_low_s16(set1.0)); + let 
+    let mut acc = vmlal_high_laneq_s16::<3>(store, hi0, weights);
+    acc = vmlal_laneq_s16::<2>(acc, vget_low_s16(hi0), weights);
+    acc = vmlal_high_laneq_s16::<1>(acc, lo0, weights);
+    acc = vmlal_laneq_s16::<0>(acc, vget_low_s16(lo0), weights);
 
-    acc = vmlal_high_s16(acc, hi1, set2.3);
-    acc = vmlal_s16(acc, vget_low_s16(hi1), vget_low_s16(set2.2));
-    acc = vmlal_high_s16(acc, lo1, set2.1);
-    acc = vmlal_s16(acc, vget_low_s16(lo1), vget_low_s16(set2.0));
+    acc = vmlal_high_laneq_s16::<7>(acc, hi1, weights);
+    acc = vmlal_laneq_s16::<6>(acc, vget_low_s16(hi1), weights);
+    acc = vmlal_high_laneq_s16::<5>(acc, lo1, weights);
+    acc = vmlal_laneq_s16::<4>(acc, vget_low_s16(lo1), weights);
 
     acc
 }
 
-pub fn convolve_horizontal_rgba_neon_rows_4_lb_u8(
+pub fn convolve_horizontal_rgba_neon_rows_4_lb_u16(
     src: &[u16],
     src_stride: usize,
     dst: &mut [u16],
@@ -126,7 +123,6 @@
 ) {
     unsafe {
         const CHANNELS: usize = 4;
-        let zeros = vdupq_n_s32(0i32);
         let init = vdupq_n_s32(ROUNDING_CONST);
 
         let v_max_colors = vdup_n_u16((1 << bit_depth) - 1);
@@ -168,20 +164,10 @@
                 let bounds_start = bounds.start + jx;
                 let w_ptr = weights.get_unchecked(jx..(jx + 8));
                 let weights_set = vld1q_s16(w_ptr.as_ptr());
-                let w0 = vdupq_laneq_s16::<0>(weights_set);
-                let w1 = vdupq_laneq_s16::<1>(weights_set);
-                let w2 = vdupq_laneq_s16::<2>(weights_set);
-                let w3 = vdupq_laneq_s16::<3>(weights_set);
-                let w4 = vdupq_laneq_s16::<4>(weights_set);
-                let w5 = vdupq_laneq_s16::<5>(weights_set);
-                let w6 = vdupq_laneq_s16::<6>(weights_set);
-                let w7 = vdupq_laneq_s16::<7>(weights_set);
-                let set1 = (w0, w1, w2, w3);
-                let set2 = (w4, w5, w6, w7);
-                store_0 = conv_horiz_rgba_8_u16(bounds_start, src0, set1, set2, store_0);
-                store_1 = conv_horiz_rgba_8_u16(bounds_start, src1, set1, set2, store_1);
-                store_2 = conv_horiz_rgba_8_u16(bounds_start, src2, set1, set2, store_2);
-                store_3 = conv_horiz_rgba_8_u16(bounds_start, src3, set1, set2, store_3);
+                store_0 = conv_horiz_rgba_8_u16(bounds_start, src0, weights_set, store_0);
+                store_1 = conv_horiz_rgba_8_u16(bounds_start, src1, weights_set, store_1);
+                store_2 = conv_horiz_rgba_8_u16(bounds_start, src2, weights_set, store_2);
+                store_3 = conv_horiz_rgba_8_u16(bounds_start, src3, weights_set, store_3);
                 jx += 8;
             }
 
@@ -189,14 +175,10 @@
                 let bounds_start = bounds.start + jx;
                 let w_ptr = weights.get_unchecked(jx..(jx + 4));
                 let weights = vld1_s16(w_ptr.as_ptr());
-                let w0 = vdup_lane_s16::<0>(weights);
-                let w1 = vdupq_lane_s16::<1>(weights);
-                let w2 = vdup_lane_s16::<2>(weights);
-                let w3 = vdupq_lane_s16::<3>(weights);
-                store_0 = conv_horiz_rgba_4_u16(bounds_start, src0, w0, w1, w2, w3, store_0);
-                store_1 = conv_horiz_rgba_4_u16(bounds_start, src1, w0, w1, w2, w3, store_1);
-                store_2 = conv_horiz_rgba_4_u16(bounds_start, src2, w0, w1, w2, w3, store_2);
-                store_3 = conv_horiz_rgba_4_u16(bounds_start, src3, w0, w1, w2, w3, store_3);
+                store_0 = conv_horiz_rgba_4_u16(bounds_start, src0, weights, store_0);
+                store_1 = conv_horiz_rgba_4_u16(bounds_start, src1, weights, store_1);
+                store_2 = conv_horiz_rgba_4_u16(bounds_start, src2, weights, store_2);
+                store_3 = conv_horiz_rgba_4_u16(bounds_start, src3, weights, store_3);
                 jx += 4;
             }
 
@@ -223,22 +205,10 @@
                 jx += 1;
             }
 
-            let store_16_0 = vmin_u16(
-                vqshrun_n_s32::<PRECISION>(vmaxq_s32(store_0, zeros)),
-                v_max_colors,
-            );
-            let store_16_1 = vmin_u16(
-                vqshrun_n_s32::<PRECISION>(vmaxq_s32(store_1, zeros)),
-                v_max_colors,
-            );
-            let store_16_2 = vmin_u16(
-                vqshrun_n_s32::<PRECISION>(vmaxq_s32(store_2, zeros)),
-                v_max_colors,
-            );
-            let store_16_3 = vmin_u16(
-                vqshrun_n_s32::<PRECISION>(vmaxq_s32(store_3, zeros)),
-                v_max_colors,
-            );
+            let store_16_0 = vmin_u16(vqshrun_n_s32::<PRECISION>(store_0), v_max_colors);
+            let store_16_1 = vmin_u16(vqshrun_n_s32::<PRECISION>(store_1), v_max_colors);
+            let store_16_2 = vmin_u16(vqshrun_n_s32::<PRECISION>(store_2), v_max_colors);
+            let store_16_3 = vmin_u16(vqshrun_n_s32::<PRECISION>(store_3), v_max_colors);
 
             vst1_u16(chunk0.as_mut_ptr(), store_16_0);
             vst1_u16(chunk1.as_mut_ptr(), store_16_1);
@@ -257,7 +227,6 @@ pub fn convolve_horizontal_rgba_neon_u16_lb_row(
     unsafe {
         const CHANNELS: usize = 4;
 
-        let zeros = vdupq_n_s32(0i32);
         let v_max_colors = vdup_n_u16((1 << bit_depth) - 1);
 
         for ((dst, bounds), weights) in dst
@@ -277,37 +246,15 @@
                 let bounds_start = bounds.start + jx;
                 let w_ptr = weights.get_unchecked(jx..(jx + 8));
                 let weights_set = vld1q_s16(w_ptr.as_ptr());
-                let w0 = vdupq_laneq_s16::<0>(weights_set);
-                let w1 = vdupq_laneq_s16::<1>(weights_set);
-                let w2 = vdupq_laneq_s16::<2>(weights_set);
-                let w3 = vdupq_laneq_s16::<3>(weights_set);
-                let w4 = vdupq_laneq_s16::<4>(weights_set);
-                let w5 = vdupq_laneq_s16::<5>(weights_set);
-                let w6 = vdupq_laneq_s16::<6>(weights_set);
-                let w7 = vdupq_laneq_s16::<7>(weights_set);
-                let set1 = (w0, w1, w2, w3);
-                let set2 = (w4, w5, w6, w7);
-                store = conv_horiz_rgba_8_u16(bounds_start, src, set1, set2, store);
+                store = conv_horiz_rgba_8_u16(bounds_start, src, weights_set, store);
                 jx += 8;
             }
 
             while jx + 4 < bounds_size {
                 let w_ptr = weights.get_unchecked(jx..(jx + 4));
                 let weights = vld1_s16(w_ptr.as_ptr());
-                let weight0 = vdup_lane_s16::<0>(weights);
-                let weight1 = vdupq_lane_s16::<1>(weights);
-                let weight2 = vdup_lane_s16::<2>(weights);
-                let weight3 = vdupq_lane_s16::<3>(weights);
                 let bounds_start = bounds.start + jx;
-                store = conv_horiz_rgba_4_u16(
-                    bounds_start,
-                    src,
-                    weight0,
-                    weight1,
-                    weight2,
-                    weight3,
-                    store,
-                );
+                store = conv_horiz_rgba_4_u16(bounds_start, src, weights, store);
                 jx += 4;
             }
 
@@ -328,10 +275,7 @@
                 jx += 1;
             }
 
-            let store_16_0 = vmin_u16(
-                vqshrun_n_s32::<PRECISION>(vmaxq_s32(store, zeros)),
-                v_max_colors,
-            );
+            let store_16_0 = vmin_u16(vqshrun_n_s32::<PRECISION>(store), v_max_colors);
 
             vst1_u16(dst.as_mut_ptr(), store_16_0);
         }
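The shape of the rgba_u16_lb.rs change above repeats in the rgba_u8.rs hunks that follow: instead of broadcasting every tap into its own register (vdup_lane_s16 / vdupq_laneq_s16) and threading set1/set2 tuples through the helpers, the whole tap set stays in one vector and each multiply-accumulate selects its tap by lane index. A minimal sketch of the 4-tap form (an illustrative helper under assumed names, not code from the patch; pixels are taken as already widened to i16):

    use std::arch::aarch64::*;

    /// Accumulates four RGBA taps (16 i16 values at `px`) against four i16
    /// weights at `w`; one d-register holds all taps, no per-tap vdup needed.
    #[cfg(target_arch = "aarch64")]
    unsafe fn four_tap_rgba(px: *const i16, w: *const i16, acc: int32x4_t) -> int32x4_t {
        let weights = vld1_s16(w); // taps 0..3 in a single register
        let lo = vld1q_s16(px); // pixels 0 and 1
        let hi = vld1q_s16(px.add(8)); // pixels 2 and 3
        let acc = vmlal_lane_s16::<0>(acc, vget_low_s16(lo), weights);
        let acc = vmlal_high_lane_s16::<1>(acc, lo, weights);
        let acc = vmlal_lane_s16::<2>(acc, vget_low_s16(hi), weights);
        vmlal_high_lane_s16::<3>(acc, hi, weights)
    }

The 8-tap variants do the same with a q-register of weights and vmlal_laneq_s16 lanes 0..7, so up to eight broadcasts plus two tuples disappear per step, and the freed registers benefit the four-row kernels.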
diff --git a/src/neon/rgba_u8.rs b/src/neon/rgba_u8.rs
index a134850..62373f0 100644
--- a/src/neon/rgba_u8.rs
+++ b/src/neon/rgba_u8.rs
@@ -28,7 +28,7 @@
  */
 use crate::filter_weights::FilterWeights;
-use crate::neon::utils::load_4b_as_u16x4;
+use crate::neon::utils::{load_4b_as_u16x4, xvld1q_u8_x2};
 use crate::support::PRECISION;
 use crate::support::ROUNDING_CONST;
 use std::arch::aarch64::*;
@@ -37,29 +37,28 @@ unsafe fn conv_horiz_rgba_8_u8(
     start_x: usize,
     src: &[u8],
-    set1: (int16x8_t, int16x8_t, int16x8_t, int16x8_t),
-    set2: (int16x8_t, int16x8_t, int16x8_t, int16x8_t),
+    weights: int16x8_t,
     store: int32x4_t,
 ) -> int32x4_t {
     const COMPONENTS: usize = 4;
     let src_ptr = src.get_unchecked((start_x * COMPONENTS)..);
-    let rgba_pixel = vld1q_u8_x2(src_ptr.as_ptr());
+    let rgba_pixel = xvld1q_u8_x2(src_ptr.as_ptr());
 
     let hi0 = vreinterpretq_s16_u16(vmovl_high_u8(rgba_pixel.0));
     let lo0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgba_pixel.0)));
     let hi1 = vreinterpretq_s16_u16(vmovl_high_u8(rgba_pixel.1));
     let lo1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgba_pixel.1)));
 
-    let mut acc = vmlal_high_s16(store, hi0, set1.3);
-    acc = vmlal_s16(acc, vget_low_s16(hi0), vget_low_s16(set1.2));
-    acc = vmlal_high_s16(acc, lo0, set1.1);
-    acc = vmlal_s16(acc, vget_low_s16(lo0), vget_low_s16(set1.0));
+    let mut acc = vmlal_high_laneq_s16::<3>(store, hi0, weights);
+    acc = vmlal_laneq_s16::<2>(acc, vget_low_s16(hi0), weights);
+    acc = vmlal_high_laneq_s16::<1>(acc, lo0, weights);
+    acc = vmlal_laneq_s16::<0>(acc, vget_low_s16(lo0), weights);
 
-    acc = vmlal_high_s16(acc, hi1, set2.3);
-    acc = vmlal_s16(acc, vget_low_s16(hi1), vget_low_s16(set2.2));
-    acc = vmlal_high_s16(acc, lo1, set2.1);
-    acc = vmlal_s16(acc, vget_low_s16(lo1), vget_low_s16(set2.0));
+    acc = vmlal_high_laneq_s16::<7>(acc, hi1, weights);
+    acc = vmlal_laneq_s16::<6>(acc, vget_low_s16(hi1), weights);
+    acc = vmlal_high_laneq_s16::<5>(acc, lo1, weights);
+    acc = vmlal_laneq_s16::<4>(acc, vget_low_s16(lo1), weights);
     acc
 }
@@ -67,37 +66,35 @@ unsafe fn conv_horiz_rgba_8_u8_i16<const SCALE: i32>(
     start_x: usize,
     src: &[u8],
-    set1: (int16x4_t, int16x4_t, int16x4_t, int16x4_t),
-    set2: (int16x4_t, int16x4_t, int16x4_t, int16x4_t),
+    weights: int16x8_t,
     store: int16x4_t,
 ) -> int16x4_t {
     const COMPONENTS: usize = 4;
     let src_ptr = src.get_unchecked((start_x * COMPONENTS)..);
-    let rgba_pixel = vld1q_u8_x2(src_ptr.as_ptr());
+    let rgba_pixel = xvld1q_u8_x2(src_ptr.as_ptr());
 
     let hi0 = vshlq_n_s16::<SCALE>(vreinterpretq_s16_u16(vmovl_high_u8(rgba_pixel.0)));
     let lo0 = vshlq_n_s16::<SCALE>(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgba_pixel.0))));
     let hi1 = vshlq_n_s16::<SCALE>(vreinterpretq_s16_u16(vmovl_high_u8(rgba_pixel.1)));
     let lo1 = vshlq_n_s16::<SCALE>(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgba_pixel.1))));
 
-    let hi_v = vqrdmulhq_s16(hi0, vcombine_s16(set1.2, set1.3));
-    let mut product = vqrdmlahq_s16(hi_v, lo0, vcombine_s16(set1.0, set1.1));
-    product = vqrdmlahq_s16(product, hi1, vcombine_s16(set2.2, set2.3));
-    product = vqrdmlahq_s16(product, lo1, vcombine_s16(set2.0, set2.1));
-
-    vadd_s16(
-        vadd_s16(store, vget_low_s16(product)),
-        vget_high_s16(product),
-    )
+    let mut product = vqrdmlah_laneq_s16::<3>(store, vget_high_s16(hi0), weights);
+    product = vqrdmlah_laneq_s16::<2>(product, vget_low_s16(hi0), weights);
+    product = vqrdmlah_laneq_s16::<1>(product, vget_high_s16(lo0), weights);
+    product = vqrdmlah_laneq_s16::<0>(product, vget_low_s16(lo0), weights);
+    product = vqrdmlah_laneq_s16::<7>(product, vget_high_s16(hi1), weights);
+    product = vqrdmlah_laneq_s16::<6>(product, vget_low_s16(hi1), weights);
+    product = vqrdmlah_laneq_s16::<5>(product, vget_high_s16(lo1), weights);
+    product = vqrdmlah_laneq_s16::<4>(product, vget_low_s16(lo1), weights);
+    product
 }
 
-#[inline]
+#[inline(always)]
 unsafe fn conv_horiz_rgba_2_u8(
     start_x: usize,
     src: &[u8],
-    w0: int16x4_t,
-    w1: int16x8_t,
+    weights: int16x4_t,
     store: int32x4_t,
 ) -> int32x4_t {
     const COMPONENTS: usize = 4;
@@ -106,16 +103,15 @@ unsafe fn conv_horiz_rgba_2_u8(
     let rgb_pixel = vld1_u8(src_ptr.as_ptr());
     let wide = vreinterpretq_s16_u16(vmovl_u8(rgb_pixel));
 
-    let acc = vmlal_high_s16(store, wide, w1);
-    vmlal_s16(acc, vget_low_s16(wide), w0)
+    let acc = vmlal_high_lane_s16::<1>(store, wide, weights);
+    vmlal_lane_s16::<0>(acc, vget_low_s16(wide), weights)
 }
 
-#[inline]
+#[inline(always)]
 unsafe fn conv_horiz_rgba_2_u8_i16<const SCALE: i32>(
     start_x: usize,
     src: &[u8],
-    w0: int16x4_t,
-    w1: int16x4_t,
+    weights: int16x4_t,
     store: int16x4_t,
 ) -> int16x4_t {
     const COMPONENTS: usize = 4;
@@ -124,22 +120,15 @@ unsafe fn conv_horiz_rgba_2_u8_i16<const SCALE: i32>(
     let rgb_pixel = vld1_u8(src_ptr.as_ptr());
     let wide = vshlq_n_s16::<SCALE>(vreinterpretq_s16_u16(vmovl_u8(rgb_pixel)));
 
-    let product = vqrdmulhq_s16(wide, vcombine_s16(w0, w1));
-
-    vadd_s16(
-        vadd_s16(store, vget_low_s16(product)),
-        vget_high_s16(product),
-    )
+    let product = vqrdmlah_lane_s16::<0>(store, vget_low_s16(wide), weights);
+    vqrdmlah_lane_s16::<1>(product, vget_high_s16(wide), weights)
 }
 
-#[inline]
+#[inline(always)]
 unsafe fn conv_horiz_rgba_4_u8(
     start_x: usize,
     src: &[u8],
-    w0: int16x4_t,
-    w1: int16x8_t,
-    w2: int16x4_t,
-    w3: int16x8_t,
+    weights: int16x4_t,
     store: int32x4_t,
 ) -> int32x4_t {
     const COMPONENTS: usize = 4;
@@ -150,20 +139,17 @@ unsafe fn conv_horiz_rgba_4_u8(
     let hi = vreinterpretq_s16_u16(vmovl_high_u8(rgba_pixel));
     let lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgba_pixel)));
 
-    let acc = vmlal_high_s16(store, hi, w3);
-    let acc = vmlal_s16(acc, vget_low_s16(hi), w2);
-    let acc = vmlal_high_s16(acc, lo, w1);
-    vmlal_s16(acc, vget_low_s16(lo), w0)
+    let acc = vmlal_high_lane_s16::<3>(store, hi, weights);
+    let acc = vmlal_lane_s16::<2>(acc, vget_low_s16(hi), weights);
+    let acc = vmlal_high_lane_s16::<1>(acc, lo, weights);
+    vmlal_lane_s16::<0>(acc, vget_low_s16(lo), weights)
 }
 
-#[inline]
+#[inline(always)]
 unsafe fn conv_horiz_rgba_4_u8_i16<const SCALE: i32>(
     start_x: usize,
     src: &[u8],
-    w0: int16x4_t,
-    w1: int16x4_t,
-    w2: int16x4_t,
-    w3: int16x4_t,
+    weights: int16x4_t,
     store: int16x4_t,
 ) -> int16x4_t {
     const COMPONENTS: usize = 4;
@@ -174,16 +160,14 @@ unsafe fn conv_horiz_rgba_4_u8_i16<const SCALE: i32>(
     let hi = vshlq_n_s16::<SCALE>(vreinterpretq_s16_u16(vmovl_high_u8(rgba_pixel)));
     let lo = vshlq_n_s16::<SCALE>(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgba_pixel))));
 
-    let hi_v = vqrdmulhq_s16(hi, vcombine_s16(w2, w3));
-    let product = vqrdmlahq_s16(hi_v, lo, vcombine_s16(w0, w1));
-
-    vadd_s16(
-        vadd_s16(store, vget_low_s16(product)),
-        vget_high_s16(product),
-    )
+    let mut product = vqrdmlah_lane_s16::<3>(store, vget_high_s16(hi), weights);
+    product = vqrdmlah_lane_s16::<2>(product, vget_low_s16(hi), weights);
+    product = vqrdmlah_lane_s16::<1>(product, vget_high_s16(lo), weights);
+    product = vqrdmlah_lane_s16::<0>(product, vget_low_s16(lo), weights);
+    product
 }
 
-#[inline]
+#[inline(always)]
 unsafe fn conv_horiz_rgba_1_u8(
     start_x: usize,
     src: &[u8],
@@ -197,7 +181,7 @@ unsafe fn conv_horiz_rgba_1_u8(
     vmlal_s16(store, lo, w0)
 }
 
-#[inline]
+#[inline(always)]
 unsafe fn conv_horiz_rgba_1_u8_i16<const SCALE: i32>(
     start_x: usize,
     src: &[u8],
@@ -241,7 +225,6 @@ unsafe fn convolve_horizontal_rgba_neon_rows_4_u8_i16_impl(
     const CHANNELS: usize = 4;
     const SCALE: i32 = 6;
     const ROUNDING: i16 = 1 << (SCALE - 1);
-    let zeros = vdup_n_s16(0i16);
     let init = vdup_n_s16(ROUNDING);
 
     let (row0_ref, rest) = dst.split_at_mut(dst_stride);
@@ -282,20 +265,10 @@
                 let bounds_start = bounds.start + jx;
                 let w_ptr = weights.get_unchecked(jx..(jx + 8));
                 let weights_set = vld1q_s16(w_ptr.as_ptr());
-                let w0 = vdup_laneq_s16::<0>(weights_set);
-                let w1 = vdup_laneq_s16::<1>(weights_set);
-                let w2 = vdup_laneq_s16::<2>(weights_set);
-                let w3 = vdup_laneq_s16::<3>(weights_set);
-                let w4 = vdup_laneq_s16::<4>(weights_set);
-                let w5 = vdup_laneq_s16::<5>(weights_set);
-                let w6 = vdup_laneq_s16::<6>(weights_set);
-                let w7 = vdup_laneq_s16::<7>(weights_set);
-                let set1 = (w0, w1, w2, w3);
-                let set2 = (w4, w5, w6, w7);
-                store_0 = conv_horiz_rgba_8_u8_i16::<SCALE>(bounds_start, src0, set1, set2, store_0);
-                store_1 = conv_horiz_rgba_8_u8_i16::<SCALE>(bounds_start, src1, set1, set2, store_1);
-                store_2 = conv_horiz_rgba_8_u8_i16::<SCALE>(bounds_start, src2, set1, set2, store_2);
-                store_3 = conv_horiz_rgba_8_u8_i16::<SCALE>(bounds_start, src3, set1, set2, store_3);
+                store_0 = conv_horiz_rgba_8_u8_i16::<SCALE>(bounds_start, src0, weights_set, store_0);
+                store_1 = conv_horiz_rgba_8_u8_i16::<SCALE>(bounds_start, src1, weights_set, store_1);
+                store_2 = conv_horiz_rgba_8_u8_i16::<SCALE>(bounds_start, src2, weights_set, store_2);
+                store_3 = conv_horiz_rgba_8_u8_i16::<SCALE>(bounds_start, src3, weights_set, store_3);
                 jx += 8;
             }
 
@@ -303,30 +276,22 @@
                 let bounds_start = bounds.start + jx;
                 let w_ptr = weights.get_unchecked(jx..(jx + 4));
                 let weights = vld1_s16(w_ptr.as_ptr());
-                let w0 = vdup_lane_s16::<0>(weights);
-                let w1 = vdup_lane_s16::<1>(weights);
-                let w2 = vdup_lane_s16::<2>(weights);
-                let w3 = vdup_lane_s16::<3>(weights);
-                store_0 =
-                    conv_horiz_rgba_4_u8_i16::<SCALE>(bounds_start, src0, w0, w1, w2, w3, store_0);
-                store_1 =
-                    conv_horiz_rgba_4_u8_i16::<SCALE>(bounds_start, src1, w0, w1, w2, w3, store_1);
-                store_2 =
-                    conv_horiz_rgba_4_u8_i16::<SCALE>(bounds_start, src2, w0, w1, w2, w3, store_2);
-                store_3 =
-                    conv_horiz_rgba_4_u8_i16::<SCALE>(bounds_start, src3, w0, w1, w2, w3, store_3);
+                store_0 = conv_horiz_rgba_4_u8_i16::<SCALE>(bounds_start, src0, weights, store_0);
+                store_1 = conv_horiz_rgba_4_u8_i16::<SCALE>(bounds_start, src1, weights, store_1);
+                store_2 = conv_horiz_rgba_4_u8_i16::<SCALE>(bounds_start, src2, weights, store_2);
+                store_3 = conv_horiz_rgba_4_u8_i16::<SCALE>(bounds_start, src3, weights, store_3);
                 jx += 4;
             }
 
             while jx + 2 < bounds_size {
                 let w_ptr = weights.get_unchecked(jx..(jx + 2));
                 let bounds_start = bounds.start + jx;
-                let w0 = vld1_dup_s16(w_ptr.as_ptr());
-                let w1 = vld1_dup_s16(w_ptr.get_unchecked(1..).as_ptr());
-                store_0 = conv_horiz_rgba_2_u8_i16::<SCALE>(bounds_start, src0, w0, w1, store_0);
-                store_1 = conv_horiz_rgba_2_u8_i16::<SCALE>(bounds_start, src1, w0, w1, store_1);
-                store_2 = conv_horiz_rgba_2_u8_i16::<SCALE>(bounds_start, src2, w0, w1, store_2);
-                store_3 = conv_horiz_rgba_2_u8_i16::<SCALE>(bounds_start, src3, w0, w1, store_3);
+                let mut v_weight = vld1_dup_s16(w_ptr.as_ptr());
+                v_weight = vld1_lane_s16::<1>(w_ptr.as_ptr().add(1), v_weight);
+                store_0 = conv_horiz_rgba_2_u8_i16::<SCALE>(bounds_start, src0, v_weight, store_0);
+                store_1 = conv_horiz_rgba_2_u8_i16::<SCALE>(bounds_start, src1, v_weight, store_1);
+                store_2 = conv_horiz_rgba_2_u8_i16::<SCALE>(bounds_start, src2, v_weight, store_2);
+                store_3 = conv_horiz_rgba_2_u8_i16::<SCALE>(bounds_start, src3, v_weight, store_3);
                 jx += 2;
             }
 
@@ -341,15 +306,15 @@
                 jx += 1;
             }
 
-            let store_16_0 = vreinterpret_u16_s16(vshr_n_s16::<SCALE>(vmax_s16(store_0, zeros)));
-            let store_16_1 = vreinterpret_u16_s16(vshr_n_s16::<SCALE>(vmax_s16(store_1, zeros)));
-            let store_16_2 = vreinterpret_u16_s16(vshr_n_s16::<SCALE>(vmax_s16(store_2, zeros)));
-            let store_16_3 = vreinterpret_u16_s16(vshr_n_s16::<SCALE>(vmax_s16(store_3, zeros)));
+            let store_16_0 = vshr_n_s16::<SCALE>(store_0);
+            let store_16_1 = vshr_n_s16::<SCALE>(store_1);
+            let store_16_2 = vshr_n_s16::<SCALE>(store_2);
+            let store_16_3 = vshr_n_s16::<SCALE>(store_3);
 
-            let store_16_8_0 = vqmovn_u16(vcombine_u16(store_16_0, store_16_0));
-            let store_16_8_1 = vqmovn_u16(vcombine_u16(store_16_1, store_16_1));
-            let store_16_8_2 = vqmovn_u16(vcombine_u16(store_16_2, store_16_2));
-            let store_16_8 = vqmovn_u16(vcombine_u16(store_16_3, store_16_3));
+            let store_16_8_0 = vqmovun_s16(vcombine_s16(store_16_0, store_16_0));
+            let store_16_8_1 =
vqmovun_s16(vcombine_s16(store_16_1, store_16_1)); + let store_16_8_2 = vqmovun_s16(vcombine_s16(store_16_2, store_16_2)); + let store_16_8 = vqmovun_s16(vcombine_s16(store_16_3, store_16_3)); let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_0)); let dest_ptr_32 = chunk0.as_mut_ptr() as *mut u32; @@ -378,7 +343,6 @@ pub fn convolve_horizontal_rgba_neon_rows_4_u8( ) { unsafe { const CHANNELS: usize = 4; - let zeros = vdupq_n_s32(0i32); let init = vdupq_n_s32(ROUNDING_CONST); let (row0_ref, rest) = dst.split_at_mut(dst_stride); @@ -418,20 +382,10 @@ pub fn convolve_horizontal_rgba_neon_rows_4_u8( let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 8)); let weights_set = vld1q_s16(w_ptr.as_ptr()); - let w0 = vdupq_laneq_s16::<0>(weights_set); - let w1 = vdupq_laneq_s16::<1>(weights_set); - let w2 = vdupq_laneq_s16::<2>(weights_set); - let w3 = vdupq_laneq_s16::<3>(weights_set); - let w4 = vdupq_laneq_s16::<4>(weights_set); - let w5 = vdupq_laneq_s16::<5>(weights_set); - let w6 = vdupq_laneq_s16::<6>(weights_set); - let w7 = vdupq_laneq_s16::<7>(weights_set); - let set1 = (w0, w1, w2, w3); - let set2 = (w4, w5, w6, w7); - store_0 = conv_horiz_rgba_8_u8(bounds_start, src0, set1, set2, store_0); - store_1 = conv_horiz_rgba_8_u8(bounds_start, src1, set1, set2, store_1); - store_2 = conv_horiz_rgba_8_u8(bounds_start, src2, set1, set2, store_2); - store_3 = conv_horiz_rgba_8_u8(bounds_start, src3, set1, set2, store_3); + store_0 = conv_horiz_rgba_8_u8(bounds_start, src0, weights_set, store_0); + store_1 = conv_horiz_rgba_8_u8(bounds_start, src1, weights_set, store_1); + store_2 = conv_horiz_rgba_8_u8(bounds_start, src2, weights_set, store_2); + store_3 = conv_horiz_rgba_8_u8(bounds_start, src3, weights_set, store_3); jx += 8; } @@ -439,26 +393,22 @@ pub fn convolve_horizontal_rgba_neon_rows_4_u8( let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 4)); let weights = vld1_s16(w_ptr.as_ptr()); - let w0 = vdup_lane_s16::<0>(weights); - let w1 = vdupq_lane_s16::<1>(weights); - let w2 = vdup_lane_s16::<2>(weights); - let w3 = vdupq_lane_s16::<3>(weights); - store_0 = conv_horiz_rgba_4_u8(bounds_start, src0, w0, w1, w2, w3, store_0); - store_1 = conv_horiz_rgba_4_u8(bounds_start, src1, w0, w1, w2, w3, store_1); - store_2 = conv_horiz_rgba_4_u8(bounds_start, src2, w0, w1, w2, w3, store_2); - store_3 = conv_horiz_rgba_4_u8(bounds_start, src3, w0, w1, w2, w3, store_3); + store_0 = conv_horiz_rgba_4_u8(bounds_start, src0, weights, store_0); + store_1 = conv_horiz_rgba_4_u8(bounds_start, src1, weights, store_1); + store_2 = conv_horiz_rgba_4_u8(bounds_start, src2, weights, store_2); + store_3 = conv_horiz_rgba_4_u8(bounds_start, src3, weights, store_3); jx += 4; } while jx + 2 < bounds_size { let w_ptr = weights.get_unchecked(jx..(jx + 2)); let bounds_start = bounds.start + jx; - let w0 = vld1_dup_s16(w_ptr.as_ptr()); - let w1 = vld1q_dup_s16(w_ptr.get_unchecked(1..).as_ptr()); - store_0 = conv_horiz_rgba_2_u8(bounds_start, src0, w0, w1, store_0); - store_1 = conv_horiz_rgba_2_u8(bounds_start, src1, w0, w1, store_1); - store_2 = conv_horiz_rgba_2_u8(bounds_start, src2, w0, w1, store_2); - store_3 = conv_horiz_rgba_2_u8(bounds_start, src3, w0, w1, store_3); + let mut v_weight = vld1_dup_s16(w_ptr.as_ptr()); + v_weight = vld1_lane_s16::<1>(w_ptr.as_ptr().add(1), v_weight); + store_0 = conv_horiz_rgba_2_u8(bounds_start, src0, v_weight, store_0); + store_1 = conv_horiz_rgba_2_u8(bounds_start, src1, v_weight, store_1); + store_2 = 
conv_horiz_rgba_2_u8(bounds_start, src2, v_weight, store_2);
+                store_3 = conv_horiz_rgba_2_u8(bounds_start, src3, v_weight, store_3);
                 jx += 2;
             }
 
@@ -473,10 +423,10 @@
                 jx += 1;
             }
 
-            let store_16_0 = vqshrun_n_s32::<PRECISION>(vmaxq_s32(store_0, zeros));
-            let store_16_1 = vqshrun_n_s32::<PRECISION>(vmaxq_s32(store_1, zeros));
-            let store_16_2 = vqshrun_n_s32::<PRECISION>(vmaxq_s32(store_2, zeros));
-            let store_16_3 = vqshrun_n_s32::<PRECISION>(vmaxq_s32(store_3, zeros));
+            let store_16_0 = vqshrun_n_s32::<PRECISION>(store_0);
+            let store_16_1 = vqshrun_n_s32::<PRECISION>(store_1);
+            let store_16_2 = vqshrun_n_s32::<PRECISION>(store_2);
+            let store_16_3 = vqshrun_n_s32::<PRECISION>(store_3);
 
             let store_16_8_0 = vqmovn_u16(vcombine_u16(store_16_0, store_16_0));
             let store_16_8_1 = vqmovn_u16(vcombine_u16(store_16_1, store_16_1));
@@ -527,46 +477,24 @@ pub fn convolve_horizontal_rgba_neon_row(
             let bounds_start = bounds.start + jx;
             let w_ptr = weights.get_unchecked(jx..(jx + 8));
             let weights_set = vld1q_s16(w_ptr.as_ptr());
-            let w0 = vdupq_laneq_s16::<0>(weights_set);
-            let w1 = vdupq_laneq_s16::<1>(weights_set);
-            let w2 = vdupq_laneq_s16::<2>(weights_set);
-            let w3 = vdupq_laneq_s16::<3>(weights_set);
-            let w4 = vdupq_laneq_s16::<4>(weights_set);
-            let w5 = vdupq_laneq_s16::<5>(weights_set);
-            let w6 = vdupq_laneq_s16::<6>(weights_set);
-            let w7 = vdupq_laneq_s16::<7>(weights_set);
-            let set1 = (w0, w1, w2, w3);
-            let set2 = (w4, w5, w6, w7);
-            store = conv_horiz_rgba_8_u8(bounds_start, src, set1, set2, store);
+            store = conv_horiz_rgba_8_u8(bounds_start, src, weights_set, store);
             jx += 8;
         }
 
         while jx + 4 < bounds_size {
             let w_ptr = weights.get_unchecked(jx..(jx + 4));
             let weights = vld1_s16(w_ptr.as_ptr());
-            let weight0 = vdup_lane_s16::<0>(weights);
-            let weight1 = vdupq_lane_s16::<1>(weights);
-            let weight2 = vdup_lane_s16::<2>(weights);
-            let weight3 = vdupq_lane_s16::<3>(weights);
             let bounds_start = bounds.start + jx;
-            store = conv_horiz_rgba_4_u8(
-                bounds_start,
-                src,
-                weight0,
-                weight1,
-                weight2,
-                weight3,
-                store,
-            );
+            store = conv_horiz_rgba_4_u8(bounds_start, src, weights, store);
             jx += 4;
         }
 
         while jx + 2 < bounds_size {
             let w_ptr = weights.get_unchecked(jx..(jx + 2));
             let bounds_start = bounds.start + jx;
-            let weight0 = vld1_dup_s16(w_ptr.as_ptr());
-            let weight1 = vld1q_dup_s16(w_ptr.get_unchecked(1..).as_ptr());
-            store = conv_horiz_rgba_2_u8(bounds_start, src, weight0, weight1, store);
+            let mut v_weight = vld1_dup_s16(w_ptr.as_ptr());
+            v_weight = vld1_lane_s16::<1>(w_ptr.as_ptr().add(1), v_weight);
+            store = conv_horiz_rgba_2_u8(bounds_start, src, v_weight, store);
             jx += 2;
         }
 
@@ -578,7 +506,7 @@
             jx += 1;
         }
 
-        let store_16 = vqshrun_n_s32::<PRECISION>(vmaxq_s32(store, vdupq_n_s32(0i32)));
+        let store_16 = vqshrun_n_s32::<PRECISION>(store);
         let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16));
 
         let value = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8));
@@ -606,7 +534,6 @@ unsafe fn convolve_horizontal_rgba_neon_row_i16_impl(
 ) {
     const SCALE: i32 = 6;
     const ROUNDING: i16 = 1 << (SCALE - 1);
-    let zeros = vdup_n_s16(0i16);
     const CHANNELS: usize = 4;
 
     for ((dst, bounds), weights) in dst
@@ -626,46 +553,24 @@
             let bounds_start = bounds.start + jx;
             let w_ptr = weights.get_unchecked(jx..(jx + 8));
             let weights_set = vld1q_s16(w_ptr.as_ptr());
-            let w0 = vdup_laneq_s16::<0>(weights_set);
-            let w1 = vdup_laneq_s16::<1>(weights_set);
-            let w2 = vdup_laneq_s16::<2>(weights_set);
-            let w3 = vdup_laneq_s16::<3>(weights_set);
-            let w4 = vdup_laneq_s16::<4>(weights_set);
-            let w5 = vdup_laneq_s16::<5>(weights_set);
-            let w6 = vdup_laneq_s16::<6>(weights_set);
-            let w7 = vdup_laneq_s16::<7>(weights_set);
-            let set1 = (w0, w1, w2, w3);
-            let set2 = (w4, w5, w6, w7);
-            store = conv_horiz_rgba_8_u8_i16::<SCALE>(bounds_start, src, set1, set2, store);
+            store = conv_horiz_rgba_8_u8_i16::<SCALE>(bounds_start, src, weights_set, store);
             jx += 8;
         }
 
        while jx + 4 < bounds_size {
            let w_ptr = weights.get_unchecked(jx..(jx + 4));
            let weights = vld1_s16(w_ptr.as_ptr());
-           let weight0 = vdup_lane_s16::<0>(weights);
-           let weight1 = vdup_lane_s16::<1>(weights);
-           let weight2 = vdup_lane_s16::<2>(weights);
-           let weight3 = vdup_lane_s16::<3>(weights);
            let bounds_start = bounds.start + jx;
-           store = conv_horiz_rgba_4_u8_i16::<SCALE>(
-               bounds_start,
-               src,
-               weight0,
-               weight1,
-               weight2,
-               weight3,
-               store,
-           );
+           store = conv_horiz_rgba_4_u8_i16::<SCALE>(bounds_start, src, weights, store);
            jx += 4;
        }
 
        while jx + 2 < bounds_size {
            let w_ptr = weights.get_unchecked(jx..(jx + 2));
            let bounds_start = bounds.start + jx;
-           let weight0 = vld1_dup_s16(w_ptr.as_ptr());
-           let weight1 = vld1_dup_s16(w_ptr.get_unchecked(1..).as_ptr());
-           store = conv_horiz_rgba_2_u8_i16::<SCALE>(bounds_start, src, weight0, weight1, store);
+           let mut v_weight = vld1_dup_s16(w_ptr.as_ptr());
+           v_weight = vld1_lane_s16::<1>(w_ptr.as_ptr().add(1), v_weight);
+           store = conv_horiz_rgba_2_u8_i16::<SCALE>(bounds_start, src, v_weight, store);
            jx += 2;
        }
 
@@ -677,8 +582,8 @@
            jx += 1;
        }
 
-       let store_16 = vreinterpret_u16_s16(vshr_n_s16::<SCALE>(vmax_s16(store, zeros)));
-       let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16));
+       let store_16 = vshr_n_s16::<SCALE>(store);
+       let store_16_8 = vqmovun_s16(vcombine_s16(store_16, store_16));
 
        let value = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8));
        let dest_ptr_32 = dst.as_mut_ptr() as *mut u32;
diff --git a/src/neon/utils.rs b/src/neon/utils.rs
index fe2e131..e21a1d8 100644
--- a/src/neon/utils.rs
+++ b/src/neon/utils.rs
@@ -29,6 +29,60 @@
 use std::arch::aarch64::*;
 
+#[inline(always)]
+pub(crate) unsafe fn xvld1q_u8_x2(ptr: *const u8) -> uint8x16x2_t {
+    uint8x16x2_t(vld1q_u8(ptr), vld1q_u8(ptr.add(16)))
+}
+
+#[inline(always)]
+pub(crate) unsafe fn xvld1q_u8_x4(ptr: *const u8) -> uint8x16x4_t {
+    uint8x16x4_t(
+        vld1q_u8(ptr),
+        vld1q_u8(ptr.add(16)),
+        vld1q_u8(ptr.add(32)),
+        vld1q_u8(ptr.add(48)),
+    )
+}
+
+#[inline(always)]
+pub(crate) unsafe fn xvld1q_u16_x4(a: *const u16) -> uint16x8x4_t {
+    uint16x8x4_t(
+        vld1q_u16(a),
+        vld1q_u16(a.add(8)),
+        vld1q_u16(a.add(16)),
+        vld1q_u16(a.add(24)),
+    )
+}
+
+#[inline(always)]
+pub(crate) unsafe fn xvld1q_u16_x2(a: *const u16) -> uint16x8x2_t {
+    uint16x8x2_t(vld1q_u16(a), vld1q_u16(a.add(8)))
+}
+
+#[inline(always)]
+pub(crate) unsafe fn xvld1q_f32_x4(a: *const f32) -> float32x4x4_t {
+    float32x4x4_t(
+        vld1q_f32(a),
+        vld1q_f32(a.add(4)),
+        vld1q_f32(a.add(8)),
+        vld1q_f32(a.add(12)),
+    )
+}
+
+#[inline(always)]
+pub(crate) unsafe fn xvst1q_u8_x2(ptr: *mut u8, b: uint8x16x2_t) {
+    vst1q_u8(ptr, b.0);
+    vst1q_u8(ptr.add(16), b.1);
+}
+
+#[inline(always)]
+pub(crate) unsafe fn xvst1q_u8_x4(ptr: *mut u8, b: uint8x16x4_t) {
+    vst1q_u8(ptr, b.0);
+    vst1q_u8(ptr.add(16), b.1);
+    vst1q_u8(ptr.add(32), b.2);
+    vst1q_u8(ptr.add(48), b.3);
+}
+
 #[inline(always)]
 pub(crate) unsafe fn prefer_vfmaq_f32(
     a: float32x4_t,
@@ -74,9 +128,9 @@ pub(crate) unsafe fn vsplit_rgb_5(px: float32x4x4_t) -> Float32x5T {
 }
 
 pub(crate) struct Float32x5T(
-    pub float32x4_t,
-    pub
float32x4_t, - pub float32x4_t, - pub float32x4_t, + pub(crate) float32x4_t, + pub(crate) float32x4_t, + pub(crate) float32x4_t, + pub(crate) float32x4_t, + pub(crate) float32x4_t, ); diff --git a/src/neon/vertical_f32.rs b/src/neon/vertical_f32.rs index 915c0b5..d1a241f 100644 --- a/src/neon/vertical_f32.rs +++ b/src/neon/vertical_f32.rs @@ -28,6 +28,7 @@ */ use crate::filter_weights::FilterBounds; use crate::neon::utils::prefer_vfmaq_f32; +use crate::neon::utils::xvld1q_f32_x4; use std::arch::aarch64::*; macro_rules! conv_vertical_part_neon_16_f32 { @@ -46,7 +47,7 @@ macro_rules! conv_vertical_part_neon_16_f32 { let src_ptr = $src.add($src_stride * py); let s_ptr = src_ptr.add(px); - let item_row = vld1q_f32_x4(s_ptr); + let item_row = xvld1q_f32_x4(s_ptr); store_0 = prefer_vfmaq_f32(store_0, item_row.0, v_weight); store_1 = prefer_vfmaq_f32(store_1, item_row.1, v_weight); @@ -81,8 +82,8 @@ macro_rules! conv_vertical_part_neon_32_f32 { let src_ptr = $src.add($src_stride * py); let s_ptr = src_ptr.add(px); - let item_row_0 = vld1q_f32_x4(s_ptr); - let item_row_1 = vld1q_f32_x4(s_ptr.add(16)); + let item_row_0 = xvld1q_f32_x4(s_ptr); + let item_row_1 = xvld1q_f32_x4(s_ptr.add(16)); store_0 = prefer_vfmaq_f32(store_0, item_row_0.0, v_weight); store_1 = prefer_vfmaq_f32(store_1, item_row_0.1, v_weight); @@ -131,9 +132,9 @@ macro_rules! conv_vertical_part_neon_48_f32 { let src_ptr = $src.add($src_stride * py); let s_ptr = src_ptr.add(px); - let item_row_0 = vld1q_f32_x4(s_ptr); - let item_row_1 = vld1q_f32_x4(s_ptr.add(16)); - let item_row_2 = vld1q_f32_x4(s_ptr.add(32)); + let item_row_0 = xvld1q_f32_x4(s_ptr); + let item_row_1 = xvld1q_f32_x4(s_ptr.add(16)); + let item_row_2 = xvld1q_f32_x4(s_ptr.add(32)); store_0 = prefer_vfmaq_f32(store_0, item_row_0.0, v_weight); store_1 = prefer_vfmaq_f32(store_1, item_row_0.1, v_weight); diff --git a/src/neon/vertical_u16_lb.rs b/src/neon/vertical_u16_lb.rs index 602e5dd..7dc925d 100644 --- a/src/neon/vertical_u16_lb.rs +++ b/src/neon/vertical_u16_lb.rs @@ -46,7 +46,6 @@ pub fn convolve_column_lb_u16( let bounds_size = bounds.size; - let zeros = vdupq_n_s32(0); let initial_store = vdupq_n_s32(ROUNDING_CONST); let v_max_colors = vdupq_n_u16(max_colors); @@ -65,9 +64,8 @@ pub fn convolve_column_lb_u16( if bounds_size == 2 { let weights = weight.get_unchecked(0..2); - - let v_weight0 = vdupq_n_s16(weights[0]); - let v_weight1 = vdupq_n_s16(weights[1]); + let mut v_weight = vld1_dup_s16(weights.as_ptr()); + v_weight = vld1_lane_s16::<1>(weights.as_ptr().add(1), v_weight); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -76,24 +74,23 @@ pub fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr().add(8))); - store0 = vmlal_s16(store0, vget_low_s16(item_row0), vget_low_s16(v_weight0)); - store1 = vmlal_high_s16(store1, item_row0, v_weight0); - store2 = vmlal_s16(store2, vget_low_s16(item_row1), vget_low_s16(v_weight0)); - store3 = vmlal_high_s16(store3, item_row1, v_weight0); + store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); + store2 = vmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), v_weight); + store3 = vmlal_high_lane_s16::<0>(store3, item_row1, v_weight); let item_row10 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); let item_row11 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr().add(8))); - store0 = 
vmlal_s16(store0, vget_low_s16(item_row10), vget_low_s16(v_weight1)); - store1 = vmlal_high_s16(store1, item_row10, v_weight1); - store2 = vmlal_s16(store2, vget_low_s16(item_row11), vget_low_s16(v_weight1)); - store3 = vmlal_high_s16(store3, item_row11, v_weight1); + store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row10), v_weight); + store1 = vmlal_high_lane_s16::<1>(store1, item_row10, v_weight); + store2 = vmlal_lane_s16::<1>(store2, vget_low_s16(item_row11), v_weight); + store3 = vmlal_high_lane_s16::<1>(store3, item_row11, v_weight); } else if bounds_size == 3 { let weights = weight.get_unchecked(0..3); - - let v_weight0 = vdupq_n_s16(weights[0]); - let v_weight1 = vdupq_n_s16(weights[1]); - let v_weight2 = vdupq_n_s16(weights[2]); + let mut v_weight = vld1_dup_s16(weights.as_ptr()); + v_weight = vld1_lane_s16::<1>(weights.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weights.as_ptr().add(2), v_weight); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -103,33 +100,30 @@ pub fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr().add(8))); - store0 = vmlal_s16(store0, vget_low_s16(item_row0), vget_low_s16(v_weight0)); - store1 = vmlal_high_s16(store1, item_row0, v_weight0); - store2 = vmlal_s16(store2, vget_low_s16(item_row1), vget_low_s16(v_weight0)); - store3 = vmlal_high_s16(store3, item_row1, v_weight0); + store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); + store2 = vmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), v_weight); + store3 = vmlal_high_lane_s16::<0>(store3, item_row1, v_weight); let item_row10 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); let item_row11 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr().add(8))); - store0 = vmlal_s16(store0, vget_low_s16(item_row10), vget_low_s16(v_weight1)); - store1 = vmlal_high_s16(store1, item_row10, v_weight1); - store2 = vmlal_s16(store2, vget_low_s16(item_row11), vget_low_s16(v_weight1)); - store3 = vmlal_high_s16(store3, item_row11, v_weight1); + store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row10), v_weight); + store1 = vmlal_high_lane_s16::<1>(store1, item_row10, v_weight); + store2 = vmlal_lane_s16::<1>(store2, vget_low_s16(item_row11), v_weight); + store3 = vmlal_high_lane_s16::<1>(store3, item_row11, v_weight); let item_row20 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr())); let item_row21 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr().add(8))); - store0 = vmlal_s16(store0, vget_low_s16(item_row20), vget_low_s16(v_weight2)); - store1 = vmlal_high_s16(store1, item_row20, v_weight2); - store2 = vmlal_s16(store2, vget_low_s16(item_row21), vget_low_s16(v_weight2)); - store3 = vmlal_high_s16(store3, item_row21, v_weight2); + store0 = vmlal_lane_s16::<2>(store0, vget_low_s16(item_row20), v_weight); + store1 = vmlal_high_lane_s16::<2>(store1, item_row20, v_weight); + store2 = vmlal_lane_s16::<2>(store2, vget_low_s16(item_row21), v_weight); + store3 = vmlal_high_lane_s16::<2>(store3, item_row21, v_weight); } else if bounds_size == 4 { let weights = weight.get_unchecked(0..4); - let v_weight0 = vdupq_n_s16(weights[0]); - let v_weight1 = vdupq_n_s16(weights[1]); - let v_weight2 = vdupq_n_s16(weights[2]); - let v_weight3 = vdupq_n_s16(weights[3]); + let v_weight = vld1_s16(weights.as_ptr()); let py = bounds.start; let src_ptr0 = 
src.get_unchecked((src_stride * py + v_dx)..);
@@ -140,34 +134,34 @@
                     let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr()));
                     let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr().add(8)));
 
-                    store0 = vmlal_s16(store0, vget_low_s16(item_row0), vget_low_s16(v_weight0));
-                    store1 = vmlal_high_s16(store1, item_row0, v_weight0);
-                    store2 = vmlal_s16(store2, vget_low_s16(item_row1), vget_low_s16(v_weight0));
-                    store3 = vmlal_high_s16(store3, item_row1, v_weight0);
+                    store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight);
+                    store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight);
+                    store2 = vmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), v_weight);
+                    store3 = vmlal_high_lane_s16::<0>(store3, item_row1, v_weight);
 
                     let item_row10 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr()));
                     let item_row11 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr().add(8)));
 
-                    store0 = vmlal_s16(store0, vget_low_s16(item_row10), vget_low_s16(v_weight1));
-                    store1 = vmlal_high_s16(store1, item_row10, v_weight1);
-                    store2 = vmlal_s16(store2, vget_low_s16(item_row11), vget_low_s16(v_weight1));
-                    store3 = vmlal_high_s16(store3, item_row11, v_weight1);
+                    store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row10), v_weight);
+                    store1 = vmlal_high_lane_s16::<1>(store1, item_row10, v_weight);
+                    store2 = vmlal_lane_s16::<1>(store2, vget_low_s16(item_row11), v_weight);
+                    store3 = vmlal_high_lane_s16::<1>(store3, item_row11, v_weight);
 
                     let item_row20 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr()));
                     let item_row21 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr().add(8)));
 
-                    store0 = vmlal_s16(store0, vget_low_s16(item_row20), vget_low_s16(v_weight2));
-                    store1 = vmlal_high_s16(store1, item_row20, v_weight2);
-                    store2 = vmlal_s16(store2, vget_low_s16(item_row21), vget_low_s16(v_weight2));
-                    store3 = vmlal_high_s16(store3, item_row21, v_weight2);
+                    store0 = vmlal_lane_s16::<2>(store0, vget_low_s16(item_row20), v_weight);
+                    store1 = vmlal_high_lane_s16::<2>(store1, item_row20, v_weight);
+                    store2 = vmlal_lane_s16::<2>(store2, vget_low_s16(item_row21), v_weight);
+                    store3 = vmlal_high_lane_s16::<2>(store3, item_row21, v_weight);
 
                     let item_row30 = vreinterpretq_s16_u16(vld1q_u16(src_ptr3.as_ptr()));
                     let item_row31 = vreinterpretq_s16_u16(vld1q_u16(src_ptr3.as_ptr().add(8)));
 
-                    store0 = vmlal_s16(store0, vget_low_s16(item_row30), vget_low_s16(v_weight3));
-                    store1 = vmlal_high_s16(store1, item_row30, v_weight3);
-                    store2 = vmlal_s16(store2, vget_low_s16(item_row31), vget_low_s16(v_weight3));
-                    store3 = vmlal_high_s16(store3, item_row31, v_weight3);
+                    store0 = vmlal_lane_s16::<3>(store0, vget_low_s16(item_row30), v_weight);
+                    store1 = vmlal_high_lane_s16::<3>(store1, item_row30, v_weight);
+                    store2 = vmlal_lane_s16::<3>(store2, vget_low_s16(item_row31), v_weight);
+                    store3 = vmlal_high_lane_s16::<3>(store3, item_row31, v_weight);
                 } else {
                     for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() {
                         let py = bounds.start + j;
@@ -187,15 +181,15 @@
                 let item0 = vminq_u16(
                     vcombine_u16(
-                        vqshrun_n_s32::<PRECISION>(vmaxq_s32(store0, zeros)),
-                        vqshrun_n_s32::<PRECISION>(vmaxq_s32(store1, zeros)),
+                        vqshrun_n_s32::<PRECISION>(store0),
+                        vqshrun_n_s32::<PRECISION>(store1),
                     ),
                     v_max_colors,
                 );
                 let item1 = vminq_u16(
                     vcombine_u16(
-                        vqshrun_n_s32::<PRECISION>(vmaxq_s32(store2, zeros)),
-                        vqshrun_n_s32::<PRECISION>(vmaxq_s32(store3, zeros)),
+                        vqshrun_n_s32::<PRECISION>(store2),
+                        vqshrun_n_s32::<PRECISION>(store3),
                     ),
                     v_max_colors,
                 );
@@ -219,9 +213,8 @@
                 if
bounds_size == 2 { let weights = weight.get_unchecked(0..2); - - let v_weight0 = vdupq_n_s16(weights[0]); - let v_weight1 = vdupq_n_s16(weights[1]); + let mut v_weight = vld1_dup_s16(weights.as_ptr()); + v_weight = vld1_lane_s16::<1>(weights.as_ptr().add(1), v_weight); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -229,19 +222,18 @@ pub fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); - store0 = vmlal_s16(store0, vget_low_s16(item_row0), vget_low_s16(v_weight0)); - store1 = vmlal_high_s16(store1, item_row0, v_weight0); + store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); - store0 = vmlal_s16(store0, vget_low_s16(item_row1), vget_low_s16(v_weight1)); - store1 = vmlal_high_s16(store1, item_row1, v_weight1); + store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row1), v_weight); + store1 = vmlal_high_lane_s16::<1>(store1, item_row1, v_weight); } else if bounds_size == 3 { let weights = weight.get_unchecked(0..3); - - let v_weight0 = vdupq_n_s16(weights[0]); - let v_weight1 = vdupq_n_s16(weights[1]); - let v_weight2 = vdupq_n_s16(weights[2]); + let mut v_weight = vld1_dup_s16(weights.as_ptr()); + v_weight = vld1_lane_s16::<1>(weights.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weights.as_ptr().add(2), v_weight); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -250,25 +242,21 @@ pub fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); - store0 = vmlal_s16(store0, vget_low_s16(item_row0), vget_low_s16(v_weight0)); - store1 = vmlal_high_s16(store1, item_row0, v_weight0); + store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); - store0 = vmlal_s16(store0, vget_low_s16(item_row1), vget_low_s16(v_weight1)); - store1 = vmlal_high_s16(store1, item_row1, v_weight1); + store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row1), v_weight); + store1 = vmlal_high_lane_s16::<1>(store1, item_row1, v_weight); let item_row2 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr())); - store0 = vmlal_s16(store0, vget_low_s16(item_row2), vget_low_s16(v_weight2)); - store1 = vmlal_high_s16(store1, item_row2, v_weight2); + store0 = vmlal_lane_s16::<2>(store0, vget_low_s16(item_row2), v_weight); + store1 = vmlal_high_lane_s16::<2>(store1, item_row2, v_weight); } else if bounds_size == 4 { let weights = weight.get_unchecked(0..4); - - let v_weight0 = vdupq_n_s16(weights[0]); - let v_weight1 = vdupq_n_s16(weights[1]); - let v_weight2 = vdupq_n_s16(weights[2]); - let v_weight3 = vdupq_n_s16(weights[3]); + let v_weight = vld1_s16(weights.as_ptr()); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -278,23 +266,23 @@ pub fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); - store0 = vmlal_s16(store0, vget_low_s16(item_row0), vget_low_s16(v_weight0)); - store1 = vmlal_high_s16(store1, item_row0, v_weight0); + store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); - store0 = vmlal_s16(store0, 
vget_low_s16(item_row1), vget_low_s16(v_weight1)); - store1 = vmlal_high_s16(store1, item_row1, v_weight1); + store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row1), v_weight); + store1 = vmlal_high_lane_s16::<1>(store1, item_row1, v_weight); let item_row2 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr())); - store0 = vmlal_s16(store0, vget_low_s16(item_row2), vget_low_s16(v_weight2)); - store1 = vmlal_high_s16(store1, item_row2, v_weight2); + store0 = vmlal_lane_s16::<2>(store0, vget_low_s16(item_row2), v_weight); + store1 = vmlal_high_lane_s16::<2>(store1, item_row2, v_weight); let item_row3 = vreinterpretq_s16_u16(vld1q_u16(src_ptr3.as_ptr())); - store0 = vmlal_s16(store0, vget_low_s16(item_row3), vget_low_s16(v_weight3)); - store1 = vmlal_high_s16(store1, item_row3, v_weight3); + store0 = vmlal_lane_s16::<3>(store0, vget_low_s16(item_row3), v_weight); + store1 = vmlal_high_lane_s16::<3>(store1, item_row3, v_weight); } else { for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { let py = bounds.start + j; @@ -311,8 +299,8 @@ pub fn convolve_column_lb_u16( let item = vminq_u16( vcombine_u16( - vqshrun_n_s32::(vmaxq_s32(store0, zeros)), - vqshrun_n_s32::(vmaxq_s32(store1, zeros)), + vqshrun_n_s32::(store0), + vqshrun_n_s32::(store1), ), v_max_colors, ); @@ -333,25 +321,23 @@ pub fn convolve_column_lb_u16( if bounds_size == 2 { let weights = weight.get_unchecked(0..2); - - let v_weight0 = vdup_n_s16(weights[0]); - let v_weight1 = vdup_n_s16(weights[1]); + let mut v_weight = vld1_dup_s16(weights.as_ptr()); + v_weight = vld1_lane_s16::<1>(weights.as_ptr().add(1), v_weight); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); let item_row0 = vreinterpret_s16_u16(vld1_u16(src_ptr0.as_ptr())); - store0 = vmlal_s16(store0, item_row0, v_weight0); + store0 = vmlal_lane_s16::<0>(store0, item_row0, v_weight); let item_row1 = vreinterpret_s16_u16(vld1_u16(src_ptr1.as_ptr())); - store0 = vmlal_s16(store0, item_row1, v_weight1); + store0 = vmlal_lane_s16::<1>(store0, item_row1, v_weight); } else if bounds_size == 3 { let weights = weight.get_unchecked(0..3); - - let v_weight0 = vdup_n_s16(weights[0]); - let v_weight1 = vdup_n_s16(weights[1]); - let v_weight2 = vdup_n_s16(weights[2]); + let mut v_weight = vld1_dup_s16(weights.as_ptr()); + v_weight = vld1_lane_s16::<1>(weights.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weights.as_ptr().add(2), v_weight); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -359,20 +345,16 @@ pub fn convolve_column_lb_u16( let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_dx)..); let item_row0 = vreinterpret_s16_u16(vld1_u16(src_ptr0.as_ptr())); - store0 = vmlal_s16(store0, item_row0, v_weight0); + store0 = vmlal_lane_s16::<0>(store0, item_row0, v_weight); let item_row1 = vreinterpret_s16_u16(vld1_u16(src_ptr1.as_ptr())); - store0 = vmlal_s16(store0, item_row1, v_weight1); + store0 = vmlal_lane_s16::<1>(store0, item_row1, v_weight); let item_row2 = vreinterpret_s16_u16(vld1_u16(src_ptr2.as_ptr())); - store0 = vmlal_s16(store0, item_row2, v_weight2); + store0 = vmlal_lane_s16::<2>(store0, item_row2, v_weight); } else if bounds_size == 4 { let weights = weight.get_unchecked(0..4); - - let v_weight0 = vdup_n_s16(weights[0]); - let v_weight1 = vdup_n_s16(weights[1]); - let v_weight2 = vdup_n_s16(weights[2]); - let v_weight3 = vdup_n_s16(weights[3]); + let v_weight = vld1_s16(weights.as_ptr()); 
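// [Editor's note] A minimal, self-contained sketch of the register-packing idea
// this patch applies throughout: rather than broadcasting every filter tap into
// its own register with vdupq_n_s16/vdup_n_s16, the taps are packed into a
// single int16x4_t and selected per multiply through a const lane index. The
// helper below is illustrative only and not part of the patch. Note that
// vld1_s16 always loads four lanes, which is why the 2- and 3-tap branches
// build the vector with vld1_dup_s16 + vld1_lane_s16 instead of reading past
// the end of the weights slice.
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
mod lane_mla_sketch {
    use std::arch::aarch64::*;

    /// Widening multiply-accumulate of four rows against one packed weight vector.
    pub unsafe fn weighted_sum_4_taps(rows: [int16x4_t; 4], weights: &[i16]) -> int32x4_t {
        debug_assert!(weights.len() >= 4);
        let w = vld1_s16(weights.as_ptr()); // lanes hold [w0, w1, w2, w3]
        let mut acc = vdupq_n_s32(0);
        acc = vmlal_lane_s16::<0>(acc, rows[0], w); // acc += rows[0] * w0
        acc = vmlal_lane_s16::<1>(acc, rows[1], w); // acc += rows[1] * w1
        acc = vmlal_lane_s16::<2>(acc, rows[2], w); // acc += rows[2] * w2
        acc = vmlal_lane_s16::<3>(acc, rows[3], w); // acc += rows[3] * w3
        acc
    }
}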
let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -381,16 +363,16 @@ pub fn convolve_column_lb_u16( let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + v_dx)..); let item_row0 = vreinterpret_s16_u16(vld1_u16(src_ptr0.as_ptr())); - store0 = vmlal_s16(store0, item_row0, v_weight0); + store0 = vmlal_lane_s16::<0>(store0, item_row0, v_weight); let item_row1 = vreinterpret_s16_u16(vld1_u16(src_ptr1.as_ptr())); - store0 = vmlal_s16(store0, item_row1, v_weight1); + store0 = vmlal_lane_s16::<1>(store0, item_row1, v_weight); let item_row2 = vreinterpret_s16_u16(vld1_u16(src_ptr2.as_ptr())); - store0 = vmlal_s16(store0, item_row2, v_weight2); + store0 = vmlal_lane_s16::<2>(store0, item_row2, v_weight); let item_row3 = vreinterpret_s16_u16(vld1_u16(src_ptr3.as_ptr())); - store0 = vmlal_s16(store0, item_row3, v_weight3); + store0 = vmlal_lane_s16::<3>(store0, item_row3, v_weight); } else { for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { let py = bounds.start + j; @@ -405,7 +387,7 @@ pub fn convolve_column_lb_u16( } let u_store0 = vmin_u16( - vqshrun_n_s32::(vmaxq_s32(store0, zeros)), + vqshrun_n_s32::(store0), vget_low_u16(v_max_colors), ); vst1_u16(dst.as_mut_ptr(), u_store0); diff --git a/src/neon/vertical_u8.rs b/src/neon/vertical_u8.rs index 392f9dc..667fb39 100644 --- a/src/neon/vertical_u8.rs +++ b/src/neon/vertical_u8.rs @@ -27,23 +27,21 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ use crate::filter_weights::FilterBounds; +use crate::neon::utils::{xvld1q_u8_x2, xvld1q_u8_x4, xvst1q_u8_x2, xvst1q_u8_x4}; use crate::support::{PRECISION, ROUNDING_CONST}; use std::arch::aarch64::*; macro_rules! pack_weights { ($store_0: expr, $store_1: expr, $store_2: expr, $store_3: expr) => {{ - let zeros = vdupq_n_s16(0); - let low_s16 = vcombine_s16( - vqshrn_n_s32::($store_0), - vqshrn_n_s32::($store_1), + let low_u16 = vcombine_u16( + vqshrun_n_s32::($store_0), + vqshrun_n_s32::($store_1), ); - let high_s16 = vcombine_s16( - vqshrn_n_s32::($store_2), - vqshrn_n_s32::($store_3), + let high_u16 = vcombine_u16( + vqshrun_n_s32::($store_2), + vqshrun_n_s32::($store_3), ); - let low_16 = vreinterpretq_u16_s16(vmaxq_s16(low_s16, zeros)); - let high_16 = vreinterpretq_u16_s16(vmaxq_s16(high_s16, zeros)); - vcombine_u8(vqmovn_u16(low_16), vqmovn_u16(high_16)) + vcombine_u8(vqmovn_u16(low_u16), vqmovn_u16(high_u16)) }}; } @@ -59,6 +57,18 @@ macro_rules! accumulate_4_into { }}; } +macro_rules! 
accumulate_4_into_lane { + ($item: expr,$store_0: expr, $store_1: expr, $store_2: expr, $store_3: expr, $weight: expr, $weight_pos: expr) => {{ + let low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8($item))); + let high = vreinterpretq_s16_u16(vmovl_high_u8($item)); + + $store_0 = vmlal_lane_s16::<$weight_pos>($store_0, vget_low_s16(low), $weight); + $store_1 = vmlal_high_lane_s16::<$weight_pos>($store_1, low, $weight); + $store_2 = vmlal_lane_s16::<$weight_pos>($store_2, vget_low_s16(high), $weight); + $store_3 = vmlal_high_lane_s16::<$weight_pos>($store_3, high, $weight); + }}; +} + pub fn convolve_vertical_neon_i16_precision( width: usize, bounds: &FilterBounds, @@ -97,6 +107,20 @@ unsafe fn vdot( (store0, store1) } +#[inline(always)] +unsafe fn vdot_lane( + store0: int16x8_t, + store1: int16x8_t, + row: uint8x16_t, + weight: int16x4_t, +) -> (int16x8_t, int16x8_t) { + let lo0 = vreinterpretq_s16_u16(vshll_n_u8::(vget_low_u8(row))); + let store0 = vqrdmlahq_lane_s16::(store0, lo0, weight); + let hi0 = vreinterpretq_s16_u16(vshll_high_n_u8::(row)); + let store1 = vqrdmlahq_lane_s16::(store1, hi0, weight); + (store0, store1) +} + #[target_feature(enable = "rdm")] unsafe fn convolve_vertical_neon_row_upper( _: usize, @@ -135,100 +159,97 @@ unsafe fn convolve_vertical_neon_row_upper( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let items0 = vld1q_u8_x4(src_ptr0.as_ptr()); + let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items0.0, v_weight0); - (store_2, store_3) = vdot::(store_2, store_3, items0.1, v_weight0); - (store_4, store_5) = vdot::(store_4, store_5, items0.2, v_weight0); - (store_6, store_7) = vdot::(store_6, store_7, items0.3, v_weight0); + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items0.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items0.3, v_weight); - let items1 = vld1q_u8_x4(src_ptr1.as_ptr()); + let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items1.0, v_weight1); - (store_2, store_3) = vdot::(store_2, store_3, items1.1, v_weight1); - (store_4, store_5) = vdot::(store_4, store_5, items1.2, v_weight1); - (store_6, store_7) = vdot::(store_6, store_7, items1.3, v_weight1); + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items1.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items1.3, v_weight); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); let src_ptr0 = 
src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let items0 = vld1q_u8_x4(src_ptr0.as_ptr()); + let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items0.0, v_weight0); - (store_2, store_3) = vdot::(store_2, store_3, items0.1, v_weight0); - (store_4, store_5) = vdot::(store_4, store_5, items0.2, v_weight0); - (store_6, store_7) = vdot::(store_6, store_7, items0.3, v_weight0); + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items0.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items0.3, v_weight); - let items1 = vld1q_u8_x4(src_ptr1.as_ptr()); + let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items1.0, v_weight1); - (store_2, store_3) = vdot::(store_2, store_3, items1.1, v_weight1); - (store_4, store_5) = vdot::(store_4, store_5, items1.2, v_weight1); - (store_6, store_7) = vdot::(store_6, store_7, items1.3, v_weight1); + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items1.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items1.3, v_weight); - let items2 = vld1q_u8_x4(src_ptr2.as_ptr()); + let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items2.0, v_weight2); - (store_2, store_3) = vdot::(store_2, store_3, items2.1, v_weight2); - (store_4, store_5) = vdot::(store_4, store_5, items2.2, v_weight2); - (store_6, store_7) = vdot::(store_6, store_7, items2.3, v_weight2); + (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items2.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items2.3, v_weight); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); - let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3)); + let v_weight = vld1_s16(weight.as_ptr()); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - let items0 = vld1q_u8_x4(src_ptr0.as_ptr()); + let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items0.0, v_weight0); - (store_2, store_3) = vdot::(store_2, store_3, items0.1, v_weight0); - (store_4, store_5) = vdot::(store_4, store_5, items0.2, v_weight0); - (store_6, store_7) = vdot::(store_6, store_7, items0.3, v_weight0); + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items0.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items0.3, v_weight); - let items1 = vld1q_u8_x4(src_ptr1.as_ptr()); + let items1 = 
xvld1q_u8_x4(src_ptr1.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items1.0, v_weight1); - (store_2, store_3) = vdot::(store_2, store_3, items1.1, v_weight1); - (store_4, store_5) = vdot::(store_4, store_5, items1.2, v_weight1); - (store_6, store_7) = vdot::(store_6, store_7, items1.3, v_weight1); + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items1.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items1.3, v_weight); - let items2 = vld1q_u8_x4(src_ptr2.as_ptr()); + let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items2.0, v_weight2); - (store_2, store_3) = vdot::(store_2, store_3, items2.1, v_weight2); - (store_4, store_5) = vdot::(store_4, store_5, items2.2, v_weight2); - (store_6, store_7) = vdot::(store_6, store_7, items2.3, v_weight2); + (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items2.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items2.3, v_weight); - let items3 = vld1q_u8_x4(src_ptr3.as_ptr()); + let items3 = xvld1q_u8_x4(src_ptr3.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items3.0, v_weight3); - (store_2, store_3) = vdot::(store_2, store_3, items3.1, v_weight3); - (store_4, store_5) = vdot::(store_4, store_5, items3.2, v_weight3); - (store_6, store_7) = vdot::(store_6, store_7, items3.3, v_weight3); + (store_0, store_1) = vdot_lane::(store_0, store_1, items3.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items3.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items3.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items3.3, v_weight); } else { for j in 0..bounds_size { let py = bounds.start + j; let weight = weight.get_unchecked(j..); let v_weight = vld1q_dup_s16(weight.as_ptr()); let src_ptr = src.get_unchecked((src_stride * py + px)..); - let items = vld1q_u8_x4(src_ptr.as_ptr()); + let items = xvld1q_u8_x4(src_ptr.as_ptr()); (store_0, store_1) = vdot::(store_0, store_1, items.0, v_weight); (store_2, store_3) = vdot::(store_2, store_3, items.1, v_weight); @@ -260,7 +281,7 @@ unsafe fn convolve_vertical_neon_row_upper( let item3 = vcombine_u8(item30, item31); let dst_items = uint8x16x4_t(item0, item1, item2, item3); - vst1q_u8_x4(dst.as_mut_ptr(), dst_items); + xvst1q_u8_x4(dst.as_mut_ptr(), dst_items); cx += 64; } @@ -280,82 +301,79 @@ unsafe fn convolve_vertical_neon_row_upper( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let items0 = vld1q_u8_x2(src_ptr0.as_ptr()); + let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items0.0, v_weight0); - (store_2, store_3) = vdot::(store_2, store_3, items0.1, v_weight0); + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); - let items1 = 
vld1q_u8_x2(src_ptr1.as_ptr()); + let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items1.0, v_weight1); - (store_2, store_3) = vdot::(store_2, store_3, items1.1, v_weight1); + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let items0 = vld1q_u8_x2(src_ptr0.as_ptr()); + let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items0.0, v_weight0); - (store_2, store_3) = vdot::(store_2, store_3, items0.1, v_weight0); + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); - let items1 = vld1q_u8_x2(src_ptr1.as_ptr()); + let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items1.0, v_weight1); - (store_2, store_3) = vdot::(store_2, store_3, items1.1, v_weight1); + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); - let items2 = vld1q_u8_x2(src_ptr2.as_ptr()); + let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items2.0, v_weight2); - (store_2, store_3) = vdot::(store_2, store_3, items2.1, v_weight2); + (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); - let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3)); + let v_weight = vld1_s16(weight.as_ptr()); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - let items0 = vld1q_u8_x2(src_ptr0.as_ptr()); + let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items0.0, v_weight0); - (store_2, store_3) = vdot::(store_2, store_3, items0.1, v_weight0); + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); - let items1 = vld1q_u8_x2(src_ptr1.as_ptr()); + let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items1.0, v_weight1); - (store_2, store_3) = vdot::(store_2, store_3, items1.1, v_weight1); + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); - let items2 = 
vld1q_u8_x2(src_ptr2.as_ptr()); + let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items2.0, v_weight2); - (store_2, store_3) = vdot::(store_2, store_3, items2.1, v_weight2); + (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); - let items3 = vld1q_u8_x2(src_ptr3.as_ptr()); + let items3 = xvld1q_u8_x2(src_ptr3.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items3.0, v_weight3); - (store_2, store_3) = vdot::(store_2, store_3, items3.1, v_weight3); + (store_0, store_1) = vdot_lane::(store_0, store_1, items3.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items3.1, v_weight); } else { for j in 0..bounds.size { let py = bounds.start + j; let weight = weight.get_unchecked(j..); let v_weight = vld1q_dup_s16(weight.as_ptr()); let src_ptr = src.get_unchecked((src_stride * py + px)..); - let items = vld1q_u8_x2(src_ptr.as_ptr()); + let items = xvld1q_u8_x2(src_ptr.as_ptr()); (store_0, store_1) = vdot::(store_0, store_1, items.0, v_weight); (store_2, store_3) = vdot::(store_2, store_3, items.1, v_weight); @@ -375,7 +393,7 @@ unsafe fn convolve_vertical_neon_row_upper( let item1 = vcombine_u8(item10, item11); let dst_items = uint8x16x2_t(item0, item1); - vst1q_u8_x2(dst.as_mut_ptr(), dst_items); + xvst1q_u8_x2(dst.as_mut_ptr(), dst_items); cx += 32; } @@ -393,62 +411,58 @@ unsafe fn convolve_vertical_neon_row_upper( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let item0 = vld1q_u8(src_ptr0.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, item0, v_weight0); + (store_0, store_1) = vdot_lane::(store_0, store_1, item0, v_weight); let item1 = vld1q_u8(src_ptr1.as_ptr()); - - (store_0, store_1) = vdot::(store_0, store_1, item1, v_weight1); + (store_0, store_1) = vdot_lane::(store_0, store_1, item1, v_weight); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let item0 = vld1q_u8(src_ptr0.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, item0, v_weight0); + (store_0, store_1) = vdot_lane::(store_0, store_1, item0, v_weight); let item1 = vld1q_u8(src_ptr1.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, item1, v_weight1); + (store_0, store_1) = vdot_lane::(store_0, store_1, item1, v_weight); let item2 = vld1q_u8(src_ptr2.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, item2, v_weight2); + (store_0, store_1) = vdot_lane::(store_0, store_1, item2, v_weight); } else if bounds_size == 4 { let py = bounds.start; let weight = 
weight.get_unchecked(0..4); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); - let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3)); + let v_weight = vld1_s16(weight.as_ptr()); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); let item0 = vld1q_u8(src_ptr0.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, item0, v_weight0); + (store_0, store_1) = vdot_lane::(store_0, store_1, item0, v_weight); let item1 = vld1q_u8(src_ptr1.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, item1, v_weight1); + (store_0, store_1) = vdot_lane::(store_0, store_1, item1, v_weight); let item2 = vld1q_u8(src_ptr2.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, item2, v_weight2); + (store_0, store_1) = vdot_lane::(store_0, store_1, item2, v_weight); let item3 = vld1q_u8(src_ptr3.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, item3, v_weight3); + (store_0, store_1) = vdot_lane::(store_0, store_1, item3, v_weight); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -484,46 +498,43 @@ unsafe fn convolve_vertical_neon_row_upper( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let item0 = vld1_u8(src_ptr0.as_ptr()); let low0 = vreinterpretq_s16_u16(vshll_n_u8::(item0)); - store_0 = vqrdmlahq_s16(store_0, low0, v_weight0); + store_0 = vqrdmlahq_lane_s16::<0>(store_0, low0, v_weight); let item1 = vld1_u8(src_ptr1.as_ptr()); let low1 = vreinterpretq_s16_u16(vshll_n_u8::(item1)); - store_0 = vqrdmlahq_s16(store_0, low1, v_weight1); + store_0 = vqrdmlahq_lane_s16::<1>(store_0, low1, v_weight); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let item0 = vld1_u8(src_ptr0.as_ptr()); let low0 = vreinterpretq_s16_u16(vshll_n_u8::(item0)); - store_0 = vqrdmlahq_s16(store_0, low0, v_weight0); + store_0 = vqrdmlahq_lane_s16::<0>(store_0, low0, v_weight); let item1 = vld1_u8(src_ptr1.as_ptr()); let low1 = vreinterpretq_s16_u16(vshll_n_u8::(item1)); - store_0 = vqrdmlahq_s16(store_0, low1, v_weight1); + store_0 = vqrdmlahq_lane_s16::<1>(store_0, low1, v_weight); let item2 = vld1_u8(src_ptr2.as_ptr()); let low2 = vreinterpretq_s16_u16(vshll_n_u8::(item2)); - store_0 = vqrdmlahq_s16(store_0, low2, v_weight2); + store_0 = vqrdmlahq_lane_s16::<2>(store_0, low2, v_weight); } else if bounds_size == 4 { let py 
= bounds.start; let weight = weight.get_unchecked(0..4); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); - let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3)); + let v_weight = vld1_s16(weight.as_ptr()); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); @@ -531,19 +542,19 @@ unsafe fn convolve_vertical_neon_row_upper( let item0 = vld1_u8(src_ptr0.as_ptr()); let low0 = vreinterpretq_s16_u16(vshll_n_u8::(item0)); - store_0 = vqrdmlahq_s16(store_0, low0, v_weight0); + store_0 = vqrdmlahq_lane_s16::<0>(store_0, low0, v_weight); let item1 = vld1_u8(src_ptr1.as_ptr()); let low1 = vreinterpretq_s16_u16(vshll_n_u8::(item1)); - store_0 = vqrdmlahq_s16(store_0, low1, v_weight1); + store_0 = vqrdmlahq_lane_s16::<1>(store_0, low1, v_weight); let item2 = vld1_u8(src_ptr2.as_ptr()); let low2 = vreinterpretq_s16_u16(vshll_n_u8::(item2)); - store_0 = vqrdmlahq_s16(store_0, low2, v_weight2); + store_0 = vqrdmlahq_lane_s16::<2>(store_0, low2, v_weight); let item3 = vld1_u8(src_ptr3.as_ptr()); let low3 = vreinterpretq_s16_u16(vshll_n_u8::(item3)); - store_0 = vqrdmlahq_s16(store_0, low3, v_weight3); + store_0 = vqrdmlahq_lane_s16::<3>(store_0, low3, v_weight); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -577,46 +588,43 @@ unsafe fn convolve_vertical_neon_row_upper( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let items0 = vld1_dup_u8(src_ptr0.as_ptr()); let low0 = vreinterpretq_s16_u16(vshll_n_u8::(items0)); - store = vqrdmlahq_s16(store, low0, v_weight0); + store = vqrdmlahq_lane_s16::<0>(store, low0, v_weight); let items1 = vld1_dup_u8(src_ptr1.as_ptr()); let low1 = vreinterpretq_s16_u16(vshll_n_u8::(items1)); - store = vqrdmlahq_s16(store, low1, v_weight1); + store = vqrdmlahq_lane_s16::<1>(store, low1, v_weight); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let items0 = vld1_dup_u8(src_ptr0.as_ptr()); let low0 = vreinterpretq_s16_u16(vshll_n_u8::(items0)); - store = vqrdmlahq_s16(store, low0, v_weight0); + store = vqrdmlahq_lane_s16::<0>(store, low0, v_weight); let items1 = vld1_dup_u8(src_ptr1.as_ptr()); let low1 = vreinterpretq_s16_u16(vshll_n_u8::(items1)); - store = vqrdmlahq_s16(store, low1, v_weight1); + store = vqrdmlahq_lane_s16::<1>(store, low1, v_weight); let items2 = vld1_dup_u8(src_ptr2.as_ptr()); let low2 = vreinterpretq_s16_u16(vshll_n_u8::(items2)); 
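// [Editor's note] vqrdmlahq_lane_s16 lowers to SQRDMLAH, which exists only with
// FEAT_RDM; that is why this "upper" path sits behind
// #[target_feature(enable = "rdm")] and the runtime dispatch check. Per the Arm
// pseudocode the instruction is a saturating rounding doubling
// multiply-accumulate: the accumulator is widened by 16 bits, 2*x*w plus a
// rounding bias is added, and the result is narrowed back with saturation.
// A scalar model of one i16 lane, assumed faithful to the spec and given for
// illustration only:
fn sqrdmlah_s16_model(acc: i16, x: i16, w: i16) -> i16 {
    // (acc << 16) + 2*x*w + (1 << 15), then keep bits [31:16] with saturation.
    let wide = ((acc as i64) << 16) + 2 * (x as i64) * (w as i64) + (1 << 15);
    (wide >> 16).clamp(i16::MIN as i64, i16::MAX as i64) as i16
}
// The u8 inputs here are pre-widened with a left shift (vshll_n_u8) so that the
// high 16 bits retained by this operation still carry the weighted sum.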
- store = vqrdmlahq_s16(store, low2, v_weight2); + store = vqrdmlahq_lane_s16::<2>(store, low2, v_weight); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); - let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3)); + let v_weight = vld1_s16(weight.as_ptr()); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); @@ -624,19 +632,19 @@ unsafe fn convolve_vertical_neon_row_upper( let items0 = vld1_dup_u8(src_ptr0.as_ptr()); let low0 = vreinterpretq_s16_u16(vshll_n_u8::(items0)); - store = vqrdmlahq_s16(store, low0, v_weight0); + store = vqrdmlahq_lane_s16::<0>(store, low0, v_weight); let items1 = vld1_dup_u8(src_ptr1.as_ptr()); let low1 = vreinterpretq_s16_u16(vshll_n_u8::(items1)); - store = vqrdmlahq_s16(store, low1, v_weight1); + store = vqrdmlahq_lane_s16::<1>(store, low1, v_weight); let items2 = vld1_dup_u8(src_ptr2.as_ptr()); let low2 = vreinterpretq_s16_u16(vshll_n_u8::(items2)); - store = vqrdmlahq_s16(store, low2, v_weight2); + store = vqrdmlahq_lane_s16::<2>(store, low2, v_weight); let items3 = vld1_dup_u8(src_ptr3.as_ptr()); let low3 = vreinterpretq_s16_u16(vshll_n_u8::(items3)); - store = vqrdmlahq_s16(store, low3, v_weight3); + store = vqrdmlahq_lane_s16::<3>(store, low3, v_weight); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -702,100 +710,133 @@ fn convolve_vertical_neon_row_full( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let items0 = vld1q_u8_x4(src_ptr0.as_ptr()); - - accumulate_4_into!(items0.0, store_0, store_1, store_2, store_3, v_weight0); - accumulate_4_into!(items0.1, store_4, store_5, store_6, store_7, v_weight0); - accumulate_4_into!(items0.2, store_8, store_9, store_10, store_11, v_weight0); - accumulate_4_into!(items0.3, store_12, store_13, store_14, store_15, v_weight0); - - let items1 = vld1q_u8_x4(src_ptr1.as_ptr()); - - accumulate_4_into!(items1.0, store_0, store_1, store_2, store_3, v_weight1); - accumulate_4_into!(items1.1, store_4, store_5, store_6, store_7, v_weight1); - accumulate_4_into!(items1.2, store_8, store_9, store_10, store_11, v_weight1); - accumulate_4_into!(items1.3, store_12, store_13, store_14, store_15, v_weight1); + let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); + + accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); + accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); + accumulate_4_into_lane!( + items0.2, store_8, store_9, store_10, store_11, v_weight, 0 + ); + accumulate_4_into_lane!( + items0.3, store_12, store_13, store_14, store_15, v_weight, 0 + ); + + let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); + + accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); + accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); + accumulate_4_into_lane!( + items1.2, store_8, store_9, 
store_10, store_11, v_weight, 1 + ); + accumulate_4_into_lane!( + items1.3, store_12, store_13, store_14, store_15, v_weight, 1 + ); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let items0 = vld1q_u8_x4(src_ptr0.as_ptr()); - - accumulate_4_into!(items0.0, store_0, store_1, store_2, store_3, v_weight0); - accumulate_4_into!(items0.1, store_4, store_5, store_6, store_7, v_weight0); - accumulate_4_into!(items0.2, store_8, store_9, store_10, store_11, v_weight0); - accumulate_4_into!(items0.3, store_12, store_13, store_14, store_15, v_weight0); - - let items1 = vld1q_u8_x4(src_ptr1.as_ptr()); - - accumulate_4_into!(items1.0, store_0, store_1, store_2, store_3, v_weight1); - accumulate_4_into!(items1.1, store_4, store_5, store_6, store_7, v_weight1); - accumulate_4_into!(items1.2, store_8, store_9, store_10, store_11, v_weight1); - accumulate_4_into!(items1.3, store_12, store_13, store_14, store_15, v_weight1); - - let items2 = vld1q_u8_x4(src_ptr2.as_ptr()); - - accumulate_4_into!(items2.0, store_0, store_1, store_2, store_3, v_weight2); - accumulate_4_into!(items2.1, store_4, store_5, store_6, store_7, v_weight2); - accumulate_4_into!(items2.2, store_8, store_9, store_10, store_11, v_weight2); - accumulate_4_into!(items2.3, store_12, store_13, store_14, store_15, v_weight2); + let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); + + accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); + accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); + accumulate_4_into_lane!( + items0.2, store_8, store_9, store_10, store_11, v_weight, 0 + ); + accumulate_4_into_lane!( + items0.3, store_12, store_13, store_14, store_15, v_weight, 0 + ); + + let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); + + accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); + accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); + accumulate_4_into_lane!( + items1.2, store_8, store_9, store_10, store_11, v_weight, 1 + ); + accumulate_4_into_lane!( + items1.3, store_12, store_13, store_14, store_15, v_weight, 1 + ); + + let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); + + accumulate_4_into_lane!(items2.0, store_0, store_1, store_2, store_3, v_weight, 2); + accumulate_4_into_lane!(items2.1, store_4, store_5, store_6, store_7, v_weight, 2); + accumulate_4_into_lane!( + items2.2, store_8, store_9, store_10, store_11, v_weight, 2 + ); + accumulate_4_into_lane!( + items2.3, store_12, store_13, store_14, store_15, v_weight, 2 + ); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); - let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3)); + let v_weight = vld1_s16(weight.as_ptr()); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let 
src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - let items0 = vld1q_u8_x4(src_ptr0.as_ptr()); - - accumulate_4_into!(items0.0, store_0, store_1, store_2, store_3, v_weight0); - accumulate_4_into!(items0.1, store_4, store_5, store_6, store_7, v_weight0); - accumulate_4_into!(items0.2, store_8, store_9, store_10, store_11, v_weight0); - accumulate_4_into!(items0.3, store_12, store_13, store_14, store_15, v_weight0); - - let items1 = vld1q_u8_x4(src_ptr1.as_ptr()); - - accumulate_4_into!(items1.0, store_0, store_1, store_2, store_3, v_weight1); - accumulate_4_into!(items1.1, store_4, store_5, store_6, store_7, v_weight1); - accumulate_4_into!(items1.2, store_8, store_9, store_10, store_11, v_weight1); - accumulate_4_into!(items1.3, store_12, store_13, store_14, store_15, v_weight1); - - let items2 = vld1q_u8_x4(src_ptr2.as_ptr()); - - accumulate_4_into!(items2.0, store_0, store_1, store_2, store_3, v_weight2); - accumulate_4_into!(items2.1, store_4, store_5, store_6, store_7, v_weight2); - accumulate_4_into!(items2.2, store_8, store_9, store_10, store_11, v_weight2); - accumulate_4_into!(items2.3, store_12, store_13, store_14, store_15, v_weight2); - - let items3 = vld1q_u8_x4(src_ptr3.as_ptr()); - - accumulate_4_into!(items3.0, store_0, store_1, store_2, store_3, v_weight3); - accumulate_4_into!(items3.1, store_4, store_5, store_6, store_7, v_weight3); - accumulate_4_into!(items3.2, store_8, store_9, store_10, store_11, v_weight3); - accumulate_4_into!(items3.3, store_12, store_13, store_14, store_15, v_weight3); + let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); + + accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); + accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); + accumulate_4_into_lane!( + items0.2, store_8, store_9, store_10, store_11, v_weight, 0 + ); + accumulate_4_into_lane!( + items0.3, store_12, store_13, store_14, store_15, v_weight, 0 + ); + + let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); + + accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); + accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); + accumulate_4_into_lane!( + items1.2, store_8, store_9, store_10, store_11, v_weight, 1 + ); + accumulate_4_into_lane!( + items1.3, store_12, store_13, store_14, store_15, v_weight, 1 + ); + + let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); + + accumulate_4_into_lane!(items2.0, store_0, store_1, store_2, store_3, v_weight, 2); + accumulate_4_into_lane!(items2.1, store_4, store_5, store_6, store_7, v_weight, 2); + accumulate_4_into_lane!( + items2.2, store_8, store_9, store_10, store_11, v_weight, 2 + ); + accumulate_4_into_lane!( + items2.3, store_12, store_13, store_14, store_15, v_weight, 2 + ); + + let items3 = xvld1q_u8_x4(src_ptr3.as_ptr()); + + accumulate_4_into_lane!(items3.0, store_0, store_1, store_2, store_3, v_weight, 3); + accumulate_4_into_lane!(items3.1, store_4, store_5, store_6, store_7, v_weight, 3); + accumulate_4_into_lane!( + items3.2, store_8, store_9, store_10, store_11, v_weight, 3 + ); + accumulate_4_into_lane!( + items3.3, store_12, store_13, store_14, store_15, v_weight, 3 + ); } else { for j in 0..bounds_size { let py = bounds.start + j; let weight = weight.get_unchecked(j..); let v_weight = vld1q_dup_s16(weight.as_ptr()); let src_ptr = src.get_unchecked((src_stride * py + px)..); - let 
items = vld1q_u8_x4(src_ptr.as_ptr()); + let items = xvld1q_u8_x4(src_ptr.as_ptr()); accumulate_4_into!(items.0, store_0, store_1, store_2, store_3, v_weight); accumulate_4_into!(items.1, store_4, store_5, store_6, store_7, v_weight); @@ -810,7 +851,7 @@ fn convolve_vertical_neon_row_full( let item_3 = pack_weights!(store_12, store_13, store_14, store_15); let dst_items = uint8x16x4_t(item_0, item_1, item_2, item_3); - vst1q_u8_x4(dst.as_mut_ptr(), dst_items); + xvst1q_u8_x4(dst.as_mut_ptr(), dst_items); cx += 64; } @@ -834,79 +875,76 @@ fn convolve_vertical_neon_row_full( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let items0 = vld1q_u8_x2(src_ptr0.as_ptr()); + let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - accumulate_4_into!(items0.0, store_0, store_1, store_2, store_3, v_weight0); - accumulate_4_into!(items0.1, store_4, store_5, store_6, store_7, v_weight0); + accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); + accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); - let items1 = vld1q_u8_x2(src_ptr1.as_ptr()); + let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - accumulate_4_into!(items1.0, store_0, store_1, store_2, store_3, v_weight1); - accumulate_4_into!(items1.1, store_4, store_5, store_6, store_7, v_weight1); + accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); + accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let items0 = vld1q_u8_x2(src_ptr0.as_ptr()); + let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - accumulate_4_into!(items0.0, store_0, store_1, store_2, store_3, v_weight0); - accumulate_4_into!(items0.1, store_4, store_5, store_6, store_7, v_weight0); + accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); + accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); - let items1 = vld1q_u8_x2(src_ptr1.as_ptr()); + let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - accumulate_4_into!(items1.0, store_0, store_1, store_2, store_3, v_weight1); - accumulate_4_into!(items1.1, store_4, store_5, store_6, store_7, v_weight1); + accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); + accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); - let items2 = vld1q_u8_x2(src_ptr2.as_ptr()); + let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); - accumulate_4_into!(items2.0, store_0, store_1, store_2, store_3, v_weight2); - 
accumulate_4_into!(items2.1, store_4, store_5, store_6, store_7, v_weight2); + accumulate_4_into_lane!(items2.0, store_0, store_1, store_2, store_3, v_weight, 2); + accumulate_4_into_lane!(items2.1, store_4, store_5, store_6, store_7, v_weight, 2); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); - let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3)); + let v_weight = vld1_s16(weight.as_ptr()); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - let items0 = vld1q_u8_x2(src_ptr0.as_ptr()); + let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - accumulate_4_into!(items0.0, store_0, store_1, store_2, store_3, v_weight0); - accumulate_4_into!(items0.1, store_4, store_5, store_6, store_7, v_weight0); + accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); + accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); - let items1 = vld1q_u8_x2(src_ptr1.as_ptr()); + let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - accumulate_4_into!(items1.0, store_0, store_1, store_2, store_3, v_weight1); - accumulate_4_into!(items1.1, store_4, store_5, store_6, store_7, v_weight1); + accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); + accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); - let items2 = vld1q_u8_x2(src_ptr2.as_ptr()); + let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); - accumulate_4_into!(items2.0, store_0, store_1, store_2, store_3, v_weight2); - accumulate_4_into!(items2.1, store_4, store_5, store_6, store_7, v_weight2); + accumulate_4_into_lane!(items2.0, store_0, store_1, store_2, store_3, v_weight, 2); + accumulate_4_into_lane!(items2.1, store_4, store_5, store_6, store_7, v_weight, 2); - let items3 = vld1q_u8_x2(src_ptr3.as_ptr()); + let items3 = xvld1q_u8_x2(src_ptr3.as_ptr()); - accumulate_4_into!(items3.0, store_0, store_1, store_2, store_3, v_weight3); - accumulate_4_into!(items3.1, store_4, store_5, store_6, store_7, v_weight3); + accumulate_4_into_lane!(items3.0, store_0, store_1, store_2, store_3, v_weight, 3); + accumulate_4_into_lane!(items3.1, store_4, store_5, store_6, store_7, v_weight, 3); } else { for j in 0..bounds.size { let py = bounds.start + j; let weight = weight.get_unchecked(j..); let v_weight = vld1q_dup_s16(weight.as_ptr()); let src_ptr = src.get_unchecked((src_stride * py + px)..); - let items = vld1q_u8_x2(src_ptr.as_ptr()); + let items = xvld1q_u8_x2(src_ptr.as_ptr()); accumulate_4_into!(items.0, store_0, store_1, store_2, store_3, v_weight); accumulate_4_into!(items.1, store_4, store_5, store_6, store_7, v_weight); @@ -917,7 +955,7 @@ fn convolve_vertical_neon_row_full( let item_1 = pack_weights!(store_4, store_5, store_6, store_7); let dst_items = uint8x16x2_t(item_0, item_1); - vst1q_u8_x2(dst.as_mut_ptr(), dst_items); + xvst1q_u8_x2(dst.as_mut_ptr(), dst_items); cx += 32; } @@ -937,36 +975,33 @@ fn convolve_vertical_neon_row_full( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); + let mut v_weight = 
vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let item_row0 = vld1q_u8(src_ptr0.as_ptr()); let item_row1 = vld1q_u8(src_ptr1.as_ptr()); - accumulate_4_into!(item_row0, store_0, store_1, store_2, store_3, v_weight0); - accumulate_4_into!(item_row1, store_0, store_1, store_2, store_3, v_weight1); + accumulate_4_into_lane!(item_row0, store_0, store_1, store_2, store_3, v_weight, 0); + accumulate_4_into_lane!(item_row1, store_0, store_1, store_2, store_3, v_weight, 1); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let item_row0 = vld1q_u8(src_ptr0.as_ptr()); let item_row1 = vld1q_u8(src_ptr1.as_ptr()); let item_row2 = vld1q_u8(src_ptr2.as_ptr()); - accumulate_4_into!(item_row0, store_0, store_1, store_2, store_3, v_weight0); - accumulate_4_into!(item_row1, store_0, store_1, store_2, store_3, v_weight1); - accumulate_4_into!(item_row2, store_0, store_1, store_2, store_3, v_weight2); + accumulate_4_into_lane!(item_row0, store_0, store_1, store_2, store_3, v_weight, 0); + accumulate_4_into_lane!(item_row1, store_0, store_1, store_2, store_3, v_weight, 1); + accumulate_4_into_lane!(item_row2, store_0, store_1, store_2, store_3, v_weight, 2); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); - let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3)); + let v_weight = vld1_s16(weight.as_ptr()); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); @@ -975,10 +1010,10 @@ fn convolve_vertical_neon_row_full( let item_row1 = vld1q_u8(src_ptr1.as_ptr()); let item_row2 = vld1q_u8(src_ptr2.as_ptr()); let item_row3 = vld1q_u8(src_ptr3.as_ptr()); - accumulate_4_into!(item_row0, store_0, store_1, store_2, store_3, v_weight0); - accumulate_4_into!(item_row1, store_0, store_1, store_2, store_3, v_weight1); - accumulate_4_into!(item_row2, store_0, store_1, store_2, store_3, v_weight2); - accumulate_4_into!(item_row3, store_0, store_1, store_2, store_3, v_weight3); + accumulate_4_into_lane!(item_row0, store_0, store_1, store_2, store_3, v_weight, 0); + accumulate_4_into_lane!(item_row1, store_0, store_1, store_2, store_3, v_weight, 1); + accumulate_4_into_lane!(item_row2, store_0, store_1, store_2, store_3, v_weight, 2); + accumulate_4_into_lane!(item_row3, store_0, store_1, store_2, store_3, v_weight, 3); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -1010,8 +1045,8 @@ fn convolve_vertical_neon_row_full( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = 
vld1q_dup_s16(weight.as_ptr());
- let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1));
+ let mut v_weight = vld1_dup_s16(weight.as_ptr());
+ v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight);
let src_ptr0 = src.get_unchecked((src_stride * py + px)..);
let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..);
let item_row0 = vld1_u8(src_ptr0.as_ptr());
@@ -1019,16 +1054,16 @@ fn convolve_vertical_neon_row_full(
let low0 = vreinterpretq_s16_u16(vmovl_u8(item_row0));
let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1));
- store_0 = vmlal_s16(store_0, vget_low_s16(low0), vget_low_s16(v_weight0));
- store_1 = vmlal_high_s16(store_1, low0, v_weight0);
- store_0 = vmlal_s16(store_0, vget_low_s16(low1), vget_low_s16(v_weight1));
- store_1 = vmlal_high_s16(store_1, low1, v_weight1);
+ store_0 = vmlal_lane_s16::<0>(store_0, vget_low_s16(low0), v_weight);
+ store_1 = vmlal_high_lane_s16::<0>(store_1, low0, v_weight);
+ store_0 = vmlal_lane_s16::<1>(store_0, vget_low_s16(low1), v_weight);
+ store_1 = vmlal_high_lane_s16::<1>(store_1, low1, v_weight);
} else if bounds_size == 3 {
let py = bounds.start;
let weight = weight.get_unchecked(0..3);
- let v_weight0 = vld1q_dup_s16(weight.as_ptr());
- let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1));
- let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2));
+ let mut v_weight = vld1_dup_s16(weight.as_ptr());
+ v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight);
+ v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight);
let src_ptr0 = src.get_unchecked((src_stride * py + px)..);
let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..);
let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..);
@@ -1039,19 +1074,16 @@ fn convolve_vertical_neon_row_full(
let low0 = vreinterpretq_s16_u16(vmovl_u8(item_row0));
let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1));
let low2 = vreinterpretq_s16_u16(vmovl_u8(item_row2));
- store_0 = vmlal_s16(store_0, vget_low_s16(low0), vget_low_s16(v_weight0));
- store_1 = vmlal_high_s16(store_1, low0, v_weight0);
- store_0 = vmlal_s16(store_0, vget_low_s16(low1), vget_low_s16(v_weight1));
- store_1 = vmlal_high_s16(store_1, low1, v_weight1);
- store_0 = vmlal_s16(store_0, vget_low_s16(low2), vget_low_s16(v_weight2));
- store_1 = vmlal_high_s16(store_1, low2, v_weight2);
+ store_0 = vmlal_lane_s16::<0>(store_0, vget_low_s16(low0), v_weight);
+ store_1 = vmlal_high_lane_s16::<0>(store_1, low0, v_weight);
+ store_0 = vmlal_lane_s16::<1>(store_0, vget_low_s16(low1), v_weight);
+ store_1 = vmlal_high_lane_s16::<1>(store_1, low1, v_weight);
+ store_0 = vmlal_lane_s16::<2>(store_0, vget_low_s16(low2), v_weight);
+ store_1 = vmlal_high_lane_s16::<2>(store_1, low2, v_weight);
} else if bounds_size == 4 {
let py = bounds.start;
let weight = weight.get_unchecked(0..4);
- let v_weight0 = vld1q_dup_s16(weight.as_ptr());
- let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1));
- let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2));
- let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3));
+ let v_weight = vld1_s16(weight.as_ptr());
let src_ptr0 = src.get_unchecked((src_stride * py + px)..);
let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..);
let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..);
@@ -1065,14 +1097,14 @@ fn convolve_vertical_neon_row_full(
let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1));
let low2 = vreinterpretq_s16_u16(vmovl_u8(item_row2));
let low3 = vreinterpretq_s16_u16(vmovl_u8(item_row3));
- store_0 = vmlal_s16(store_0,
vget_low_s16(low0), vget_low_s16(v_weight0)); - store_1 = vmlal_high_s16(store_1, low0, v_weight0); - store_0 = vmlal_s16(store_0, vget_low_s16(low1), vget_low_s16(v_weight1)); - store_1 = vmlal_high_s16(store_1, low1, v_weight1); - store_0 = vmlal_s16(store_0, vget_low_s16(low2), vget_low_s16(v_weight2)); - store_1 = vmlal_high_s16(store_1, low2, v_weight2); - store_0 = vmlal_s16(store_0, vget_low_s16(low3), vget_low_s16(v_weight3)); - store_1 = vmlal_high_s16(store_1, low3, v_weight3); + store_0 = vmlal_lane_s16::<0>(store_0, vget_low_s16(low0), v_weight); + store_1 = vmlal_high_lane_s16::<0>(store_1, low0, v_weight); + store_0 = vmlal_lane_s16::<1>(store_0, vget_low_s16(low1), v_weight); + store_1 = vmlal_high_lane_s16::<1>(store_1, low1, v_weight); + store_0 = vmlal_lane_s16::<2>(store_0, vget_low_s16(low2), v_weight); + store_1 = vmlal_high_lane_s16::<2>(store_1, low2, v_weight); + store_0 = vmlal_lane_s16::<3>(store_0, vget_low_s16(low3), v_weight); + store_1 = vmlal_high_lane_s16::<3>(store_1, low3, v_weight); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -1087,15 +1119,12 @@ fn convolve_vertical_neon_row_full( } } - let zeros = vdupq_n_s16(0); - - let low_s16 = vcombine_s16( - vqshrn_n_s32::(store_0), - vqshrn_n_s32::(store_1), + let low_u16 = vcombine_u16( + vqshrun_n_s32::(store_0), + vqshrun_n_s32::(store_1), ); - let low_16 = vreinterpretq_u16_s16(vmaxq_s16(low_s16, zeros)); - let item = vqmovn_u16(low_16); + let item = vqmovn_u16(low_u16); vst1_u8(dst.as_mut_ptr(), item); @@ -1114,8 +1143,8 @@ fn convolve_vertical_neon_row_full( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let item_row0 = vld1_dup_u8(src_ptr0.as_ptr()); @@ -1123,14 +1152,14 @@ fn convolve_vertical_neon_row_full( let low0 = vreinterpretq_s16_u16(vmovl_u8(item_row0)); let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); - store = vmlal_s16(store, vget_low_s16(low0), vget_low_s16(v_weight0)); - store = vmlal_s16(store, vget_low_s16(low1), vget_low_s16(v_weight1)); + store = vmlal_lane_s16::<0>(store, vget_low_s16(low0), v_weight); + store = vmlal_lane_s16::<1>(store, vget_low_s16(low1), v_weight); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); @@ -1141,16 +1170,13 @@ fn convolve_vertical_neon_row_full( let low0 = vreinterpretq_s16_u16(vmovl_u8(item_row0)); let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); let low2 = vreinterpretq_s16_u16(vmovl_u8(item_row2)); - store = vmlal_s16(store, vget_low_s16(low0), vget_low_s16(v_weight0)); - store = vmlal_s16(store, vget_low_s16(low1), vget_low_s16(v_weight1)); - store = 
+        store = vmlal_lane_s16::<0>(store, vget_low_s16(low0), v_weight);
+        store = vmlal_lane_s16::<1>(store, vget_low_s16(low1), v_weight);
+        store = vmlal_lane_s16::<2>(store, vget_low_s16(low2), v_weight);
     } else if bounds_size == 4 {
         let py = bounds.start;
         let weight = weight.get_unchecked(0..4);
-        let v_weight0 = vld1q_dup_s16(weight.as_ptr());
-        let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1));
-        let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2));
-        let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3));
+        let v_weight = vld1_s16(weight.as_ptr());
         let src_ptr0 = src.get_unchecked((src_stride * py + px)..);
         let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..);
         let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..);
@@ -1164,10 +1190,10 @@ fn convolve_vertical_neon_row_full(
         let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1));
         let low2 = vreinterpretq_s16_u16(vmovl_u8(item_row2));
         let low3 = vreinterpretq_s16_u16(vmovl_u8(item_row3));
-        store = vmlal_s16(store, vget_low_s16(low0), vget_low_s16(v_weight0));
-        store = vmlal_s16(store, vget_low_s16(low1), vget_low_s16(v_weight1));
-        store = vmlal_s16(store, vget_low_s16(low2), vget_low_s16(v_weight2));
-        store = vmlal_s16(store, vget_low_s16(low3), vget_low_s16(v_weight3));
+        store = vmlal_lane_s16::<0>(store, vget_low_s16(low0), v_weight);
+        store = vmlal_lane_s16::<1>(store, vget_low_s16(low1), v_weight);
+        store = vmlal_lane_s16::<2>(store, vget_low_s16(low2), v_weight);
+        store = vmlal_lane_s16::<3>(store, vget_low_s16(low3), v_weight);
     } else {
         for j in 0..bounds_size {
             let py = bounds.start + j;

From d335ebb8a303ef831d1b8275e6ef86a3ae213fdc Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk
Date: Mon, 9 Dec 2024 18:18:13 +0000
Subject: [PATCH 11/19] Adding fuzzing, rgba8

---
 .github/workflows/build_push.yml | 11 +++++-
 Cargo.lock | 8 +++++
 Cargo.toml | 2 +-
 fuzz/resize_rgb/resize_rgb.rs | 42 ++++++++++++++++++++
 fuzz/resize_rgb_u16/resize_rgb_u16.rs | 47 +++++++++++++++++++++++
 fuzz/resize_rgba_u16/resize_rgba_u16.rs | 47 +++++++++++++++++++++++
 6 files changed, 155 insertions(+), 2 deletions(-)
 create mode 100644 fuzz/resize_rgb/resize_rgb.rs
 create mode 100644 fuzz/resize_rgb_u16/resize_rgb_u16.rs
 create mode 100644 fuzz/resize_rgba_u16/resize_rgba_u16.rs

diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml
index 0f1c7f2..6765ab9 100644
--- a/.github/workflows/build_push.yml
+++ b/.github/workflows/build_push.yml
@@ -34,4 +34,13 @@
       - run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target x86_64-unknown-linux-gnu
       - run: RUSTFLAGS="-C target-feature=+sse4.1,+f16c" cargo build --features half --target x86_64-unknown-linux-gnu
       - run: RUSTFLAGS="-C target-feature=+avx2,+f16c" cargo build --features half --target x86_64-unknown-linux-gnu
-      - run: RUSTFLAGS="-C target-feature=+avx2" cargo build --target x86_64-unknown-linux-gnu
\ No newline at end of file
+      - run: RUSTFLAGS="-C target-feature=+avx2" cargo build --target x86_64-unknown-linux-gnu
+
+  fuzz_rgba_8bit:
+    name: Fuzzing RGBA8
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: dtolnay/rust-toolchain@nightly
+      - run: cargo install cargo-fuzz
+      - run: cargo fuzz run resize_rgba -- -max_total_time=30
\ No newline at end of file
diff --git a/Cargo.lock b/Cargo.lock
index bdc39dd..9eda492 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -53,6 +53,14 @@ dependencies = [
 "yuvutils-rs",
 ]

+[[package]]
+name = "app-fuzz"
+version = "0.0.0"
+dependencies = [
+ "libfuzzer-sys",
+ "pic-scale",
+]
+
 [[package]]
 name = "arbitrary"
 version = "1.4.1"
diff --git a/Cargo.toml b/Cargo.toml
index 019bce5..70878ec 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,4 +1,4 @@
-workspace = { members = ["app", "wasm"] }
+workspace = { members = ["app", "wasm", "fuzz"] }

 [package]
 name = "pic-scale"
diff --git a/fuzz/resize_rgb/resize_rgb.rs b/fuzz/resize_rgb/resize_rgb.rs
new file mode 100644
index 0000000..ee424f7
--- /dev/null
+++ b/fuzz/resize_rgb/resize_rgb.rs
@@ -0,0 +1,42 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling};
+
+fuzz_target!(|data: (u16, u16, u16, u16)| {
+    resize_rgb(
+        data.0 as usize,
+        data.1 as usize,
+        data.2 as usize,
+        data.3 as usize,
+        ResamplingFunction::Bilinear,
+    )
+});
+
+fn resize_rgb(
+    src_width: usize,
+    src_height: usize,
+    dst_width: usize,
+    dst_height: usize,
+    sampler: ResamplingFunction,
+) {
+    if src_width == 0
+        || src_width > 2000
+        || src_height == 0
+        || src_height > 2000
+        || dst_width == 0
+        || dst_width > 512
+        || dst_height == 0
+        || dst_height > 512
+    {
+        return;
+    }
+
+    let mut src_data = vec![0u8; src_width * src_height * 3];
+
+    let store = ImageStore::<u8, 3>::from_slice(&mut src_data, src_width, src_height).unwrap();
+    let scaler = Scaler::new(sampler);
+    _ = scaler
+        .resize_rgb(ImageSize::new(dst_width, dst_height), store)
+        .unwrap();
+}
diff --git a/fuzz/resize_rgb_u16/resize_rgb_u16.rs b/fuzz/resize_rgb_u16/resize_rgb_u16.rs
new file mode 100644
index 0000000..3cfea98
--- /dev/null
+++ b/fuzz/resize_rgb_u16/resize_rgb_u16.rs
@@ -0,0 +1,47 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ScalingU16};
+
+fuzz_target!(|data: (u16, u16, u16, u16)| {
+    resize_rgba(
+        data.0 as usize,
+        data.1 as usize,
+        data.2 as usize,
+        data.3 as usize,
+        ResamplingFunction::Lanczos3,
+    )
+});
+
+fn resize_rgb(
+    src_width: usize,
+    src_height: usize,
+    dst_width: usize,
+    dst_height: usize,
+    sampler: ResamplingFunction,
+) {
+    if src_width == 0
+        || src_width > 2000
+        || src_height == 0
+        || src_height > 2000
+        || dst_width == 0
+        || dst_width > 512
+        || dst_height == 0
+        || dst_height > 512
+    {
+        return;
+    }
+
+    let mut src_data = vec![1u16; src_width * src_height * 4];
+
+    let store = ImageStore::<u16, 4>::from_slice(&mut src_data, src_width, src_height).unwrap();
+    let scaler = Scaler::new(sampler);
+    _ = scaler
+        .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 10, false)
+        .unwrap();
+
+    let store = ImageStore::<u16, 4>::from_slice(&mut src_data, src_width, src_height).unwrap();
+    _ = scaler
+        .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 16, false)
+        .unwrap();
+}
diff --git a/fuzz/resize_rgba_u16/resize_rgba_u16.rs b/fuzz/resize_rgba_u16/resize_rgba_u16.rs
new file mode 100644
index 0000000..19d9824
--- /dev/null
+++ b/fuzz/resize_rgba_u16/resize_rgba_u16.rs
@@ -0,0 +1,47 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ScalingU16};
+
+fuzz_target!(|data: (u16, u16, u16, u16)| {
+    resize_rgba(
+        data.0 as usize,
+        data.1 as usize,
+        data.2 as usize,
+        data.3 as usize,
+        ResamplingFunction::Lanczos3,
+    )
+});
+
+fn resize_rgba(
+    src_width: usize,
+    src_height: usize,
+    dst_width: usize,
+    dst_height: usize,
+    sampler: ResamplingFunction,
+) {
+    if src_width == 0
+        || src_width > 2000
+        || src_height == 0
+        || src_height > 2000
+        || dst_width == 0
+        || dst_width > 512
+        || dst_height == 0
+        || dst_height > 512
+    {
+        return;
+    }
+
+    let mut src_data = vec![1u16; src_width * src_height * 4];
+
+    let store = ImageStore::<u16, 4>::from_slice(&mut src_data, src_width, src_height).unwrap();
+    let scaler = Scaler::new(sampler);
+    _ = scaler
+        .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 10, false)
+        .unwrap();
+
+    let store = ImageStore::<u16, 4>::from_slice(&mut src_data, src_width, src_height).unwrap();
+    _ = scaler
+        .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 16, false)
+        .unwrap();
+}

From 05c79b5f214f1962aaf26dd804df588cc1ef992f Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk
Date: Mon, 9 Dec 2024 18:19:14 +0000
Subject: [PATCH 12/19] Adding fuzzing, rgba8

---
 fuzz/.gitignore | 4 +++
 fuzz/Cargo.toml | 40 ++++++++++++++++++++++++++++
 fuzz/resize_rgba/resize_rgba.rs | 46 +++++++++++++++++++++++++++
 3 files changed, 90 insertions(+)
 create mode 100644 fuzz/.gitignore
 create mode 100644 fuzz/Cargo.toml
 create mode 100644 fuzz/resize_rgba/resize_rgba.rs

diff --git a/fuzz/.gitignore b/fuzz/.gitignore
new file mode 100644
index 0000000..1a45eee
--- /dev/null
+++ b/fuzz/.gitignore
@@ -0,0 +1,4 @@
+target
+corpus
+artifacts
+coverage
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
new file mode 100644
index 0000000..7e0166c
--- /dev/null
+++ b/fuzz/Cargo.toml
@@ -0,0 +1,40 @@
+[package]
+name = "app-fuzz"
+version = "0.0.0"
+publish = false
+edition = "2021"
+
+[package.metadata]
+cargo-fuzz = true
+
+[dependencies]
+libfuzzer-sys = "0.4"
+pic-scale = {path = "../"}
+
+[[bin]]
+name = "resize_rgba"
+path = "resize_rgba/resize_rgba.rs"
+test = false
+doc = false
+bench = false
+
+[[bin]]
+name = "resize_rgb"
+path = "resize_rgb/resize_rgb.rs"
+test = false
+doc = false
+bench = false
+
+[[bin]]
+name = "resize_rgb_u16"
+path = "resize_rgb_u16/resize_rgb_u16.rs"
+test = false
+doc = false
+bench = false
+
+[[bin]]
+name = "resize_rgba_u16"
+path = "resize_rgba_u16/resize_rgba_u16.rs"
+test = false
+doc = false
+bench = false
\ No newline at end of file
diff --git a/fuzz/resize_rgba/resize_rgba.rs b/fuzz/resize_rgba/resize_rgba.rs
new file mode 100644
index 0000000..00f158c
--- /dev/null
+++ b/fuzz/resize_rgba/resize_rgba.rs
@@ -0,0 +1,46 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling};
+
+fuzz_target!(|data: (u16, u16, u16, u16)| {
+    resize_rgba(
+        data.0 as usize,
+        data.1 as usize,
+        data.2 as usize,
+        data.3 as usize,
+        ResamplingFunction::Bilinear,
+    )
+});
+
+fn resize_rgba(
+    src_width: usize,
+    src_height: usize,
+    dst_width: usize,
+    dst_height: usize,
+    sampler: ResamplingFunction,
+) {
+    if src_width == 0
+        || src_width > 2000
+        || src_height == 0
+        || src_height > 2000
+        || dst_width == 0
+        || dst_width > 512
+        || dst_height == 0
+        || dst_height > 512
+    {
+        return;
+    }
+
+    let mut src_data = vec![0u8; src_width * src_height * 4];
+
+    let store = ImageStore::<u8, 4>::from_slice(&mut src_data, src_width, src_height).unwrap();
+    let scaler = Scaler::new(sampler);
+    _ = scaler
+        .resize_rgba(ImageSize::new(dst_width, dst_height), store, false)
+        .unwrap();
+    let store = ImageStore::<u8, 4>::from_slice(&mut src_data, src_width, src_height).unwrap();
+    _ = scaler
+        .resize_rgba(ImageSize::new(dst_width, dst_height), store, true)
+        .unwrap();
+}

From 0833dc468cc78ebbadee63792c86635f609aeedb Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk
Date: Mon, 9 Dec 2024 18:22:48 +0000
Subject: [PATCH 13/19] Adding fuzzing, rgb8

---
 .github/workflows/build_push.yml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml
index 6765ab9..49796f7 100644
--- a/.github/workflows/build_push.yml
+++ b/.github/workflows/build_push.yml
@@ -37,10 +37,11 @@
       - run: RUSTFLAGS="-C target-feature=+avx2" cargo build --target x86_64-unknown-linux-gnu

   fuzz_rgba_8bit:
-    name: Fuzzing RGBA8
+    name: Fuzzing 8bit
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v4
       - uses: dtolnay/rust-toolchain@nightly
       - run: cargo install cargo-fuzz
-      - run: cargo fuzz run resize_rgba -- -max_total_time=30
\ No newline at end of file
+      - run: cargo fuzz run resize_rgba -- -max_total_time=30
+      - run: cargo fuzz run resize_rgb -- -max_total_time=30
\ No newline at end of file

From 528b72d26630fced383dc267a97817378221eff1 Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk
Date: Mon, 9 Dec 2024 18:29:11 +0000
Subject: [PATCH 14/19] Fuzzing high bit depth

---
 .github/workflows/build_push.yml | 12 +++++++++++-
 fuzz/Cargo.toml | 10 +++++-----
 fuzz/resize_rgb_u16/resize_rgb_u16.rs | 10 +++++-----
 fuzz/resize_rgba_u16/resize_rgba_u16.rs | 11 ++++++++++-
 src/scaler.rs | 4 ++++
 5 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml
index 49796f7..a36d92c 100644
--- a/.github/workflows/build_push.yml
+++ b/.github/workflows/build_push.yml
@@ -44,4 +44,14 @@
       - uses: dtolnay/rust-toolchain@nightly
       - run: cargo install cargo-fuzz
       - run: cargo fuzz run resize_rgba -- -max_total_time=30
-      - run: cargo fuzz run resize_rgb -- -max_total_time=30
\ No newline at end of file
+      - run: cargo fuzz run resize_rgb -- -max_total_time=30
+
+  fuzz_rgba_high_bit:
+    name: Fuzzing High bit-depth
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: dtolnay/rust-toolchain@nightly
+      - run: cargo install cargo-fuzz
+      - run: cargo fuzz run resize_rgba_u16 -- -max_total_time=30
+      - run: cargo fuzz run resize_rgb_u16 -- -max_total_time=30
\ No newline at end of file
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
index 7e0166c..25130d5 100644
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -26,15 +26,15 @@ doc = false
 bench = false

 [[bin]]
-name = "resize_rgb_u16"
-path = "resize_rgb_u16/resize_rgb_u16.rs"
+name = "resize_rgba_u16"
+path = "resize_rgba_u16/resize_rgba_u16.rs"
 test = false
 doc = false
 bench = false

 [[bin]]
-name = "resize_rgba_u16"
-path = "resize_rgba_u16/resize_rgba_u16.rs"
+name = "resize_rgb_u16"
+path = "resize_rgb_u16/resize_rgb_u16.rs"
 test = false
 doc = false
-bench = false
\ No newline at end of file
+bench = false
diff --git a/fuzz/resize_rgb_u16/resize_rgb_u16.rs b/fuzz/resize_rgb_u16/resize_rgb_u16.rs
index 3cfea98..a636854 100644
--- a/fuzz/resize_rgb_u16/resize_rgb_u16.rs
+++ b/fuzz/resize_rgb_u16/resize_rgb_u16.rs
@@ -1,10 +1,10 @@
 #![no_main]

 use libfuzzer_sys::fuzz_target;
-use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ScalingU16};
+use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingU16};

 fuzz_target!(|data: (u16, u16, u16, u16)| {
-    resize_rgba(
+    resize_rgb(
         data.0 as usize,
         data.1 as usize,
         data.2 as usize,
         data.3 as usize,
@@ -32,16 +32,16 @@ fn resize_rgb(
         return;
     }

-    let mut src_data = vec![1u16; src_width * src_height * 4];
+    let mut src_data = vec![1u16; src_width * src_height * 3];

     let store = ImageStore::<u16, 3>::from_slice(&mut src_data, src_width, src_height).unwrap();
     let scaler = Scaler::new(sampler);
     _ = scaler
-        .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 10, false)
+        .resize_rgb_u16(ImageSize::new(dst_width, dst_height), store, 10)
         .unwrap();

     let store = ImageStore::<u16, 3>::from_slice(&mut src_data, src_width, src_height).unwrap();
     _ = scaler
-        .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 16, false)
+        .resize_rgb_u16(ImageSize::new(dst_width, dst_height), store, 16)
         .unwrap();
 }
diff --git a/fuzz/resize_rgba_u16/resize_rgba_u16.rs b/fuzz/resize_rgba_u16/resize_rgba_u16.rs
index 19d9824..c69a3e9 100644
--- a/fuzz/resize_rgba_u16/resize_rgba_u16.rs
+++ b/fuzz/resize_rgba_u16/resize_rgba_u16.rs
@@ -1,7 +1,7 @@
 #![no_main]

 use libfuzzer_sys::fuzz_target;
-use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ScalingU16};
+use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingU16};

 fuzz_target!(|data: (u16, u16, u16, u16)| {
     resize_rgba(
@@ -39,9 +39,18 @@ fn resize_rgba(
     _ = scaler
         .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 10, false)
         .unwrap();
+    let store = ImageStore::<u16, 4>::from_slice(&mut src_data, src_width, src_height).unwrap();
+    _ = scaler
+        .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 10, true)
+        .unwrap();

     let store = ImageStore::<u16, 4>::from_slice(&mut src_data, src_width, src_height).unwrap();
     _ = scaler
         .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 16, false)
         .unwrap();
+
+    let store = ImageStore::<u16, 4>::from_slice(&mut src_data, src_width, src_height).unwrap();
+    _ = scaler
+        .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 16, true)
+        .unwrap();
 }
diff --git a/src/scaler.rs b/src/scaler.rs
index 30011b6..513183a 100644
--- a/src/scaler.rs
+++ b/src/scaler.rs
@@ -909,6 +909,10 @@ impl ScalingU16 for Scaler {
             return Err(PicScaleError::UnsupportedBitDepth(bit_depth));
         }

+        if store.width == new_size.width && store.height == new_size.height {
+            return Ok(store.copied());
+        }
+
         let should_do_horizontal = store.width != new_size.width;
         let should_do_vertical = store.height != new_size.height;
         assert!(should_do_horizontal || should_do_vertical);

From 3b5f9977e54e9d6c8da986027869c41ed49c7f87 Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk
Date: Mon, 9 Dec 2024 18:37:24 +0000
Subject: [PATCH 15/19] Matrices, fuzzing planar images

---
 .github/workflows/build_push.yml | 14 +++++--
 app/src/main.rs | 31 +++++++++++++++
 fuzz/Cargo.toml | 14 +++++++
 fuzz/resize_plane/resize_plane.rs | 42 ++++++++++++++++++++
 fuzz/resize_plane_u16/resize_plane_u16.rs | 47 +++++++++++++++++++++++
 src/neon/plane_u8.rs | 2 +-
 6 files changed, 146 insertions(+), 4 deletions(-)
 create mode 100644 fuzz/resize_plane/resize_plane.rs
 create mode 100644 fuzz/resize_plane_u16/resize_plane_u16.rs

diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml
index a36d92c..bd7ad57 100644
--- a/.github/workflows/build_push.yml
+++ b/.github/workflows/build_push.yml
@@ -38,20 +38,28 @@

   fuzz_rgba_8bit:
     name: Fuzzing 8bit
-    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        os: [ ubuntu-latest, macos-latest ]
+    runs-on: ${{ matrix.os }}
     steps:
       - uses: actions/checkout@v4
       - uses: dtolnay/rust-toolchain@nightly
       - run: cargo install cargo-fuzz
       - run: cargo fuzz run resize_rgba -- -max_total_time=30
       - run: cargo fuzz run resize_rgb -- -max_total_time=30
+      - run: cargo fuzz run resize_plane -- -max_total_time=30

   fuzz_rgba_high_bit:
     name: Fuzzing High bit-depth
-    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        os: [ ubuntu-latest, macos-latest ]
+    runs-on: ${{ matrix.os }}
     steps:
      - uses: actions/checkout@v4
      - uses: dtolnay/rust-toolchain@nightly
      - run: cargo install cargo-fuzz
      - run: cargo fuzz run resize_rgba_u16 -- -max_total_time=30
-      - run: cargo fuzz run resize_rgb_u16 -- -max_total_time=30
\ No newline at end of file
+      - run: cargo fuzz run resize_rgb_u16 -- -max_total_time=30
+      - run: cargo fuzz run resize_plane_u16 -- -max_total_time=30
\ No newline at end of file
diff --git a/app/src/main.rs b/app/src/main.rs
index 3fee90c..3b4cb82 100644
--- a/app/src/main.rs
+++ b/app/src/main.rs
@@ -18,6 +18,35 @@ use yuvutils_rs::{
     ar30_to_rgba8, ra30_to_rgba8, rgb8_to_ar30, rgba8_to_ar30, rgba8_to_ra30, Rgb30ByteOrder,
 };

+fn resize_plane(
+    src_width: usize,
+    src_height: usize,
+    dst_width: usize,
+    dst_height: usize,
+    sampler: ResamplingFunction,
+) {
+    if src_width == 0
+        || src_width > 2000
+        || src_height == 0
+        || src_height > 2000
+        || dst_width == 0
+        || dst_width > 512
+        || dst_height == 0
+        || dst_height > 512
+    {
+        return;
+    }
+
+    let mut src_data = vec![15u8; src_width * src_height * 1];
+
+    let store = ImageStore::<u8, 1>::from_slice(&mut src_data, src_width, src_height).unwrap();
+    let scaler = Scaler::new(sampler);
+    _ = scaler
+        .resize_plane(ImageSize::new(dst_width, dst_height), store)
+        .unwrap();
+}
+
+
 fn main() {
     // test_fast_image();
     let img = ImageReader::open("./assets/nasa-4928x3279-rgba.png")
@@ -31,6 +60,8 @@ fn main() {
     let mut scaler = Scaler::new(ResamplingFunction::Bilinear);
     scaler.set_threading_policy(ThreadingPolicy::Single);

+    resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear);
+
     // let mut choke: Vec<u16> = bytes.iter().map(|&x| (x as u16) << 2).collect();

     // let store =
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
index 25130d5..476e60e 100644
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -25,6 +25,13 @@ test = false
 doc = false
 bench = false

+[[bin]]
+name = "resize_plane"
+path = "resize_plane/resize_plane.rs"
+test = false
+doc = false
+bench = false
+
 [[bin]]
 name = "resize_rgba_u16"
 path = "resize_rgba_u16/resize_rgba_u16.rs"
@@ -38,3 +45,10 @@ path = "resize_rgb_u16/resize_rgb_u16.rs"
 test = false
 doc = false
 bench = false
+
+[[bin]]
+name = "resize_plane_u16"
+path = "resize_plane_u16/resize_plane_u16.rs"
+test = false
+doc = false
+bench = false
\ No newline at end of file
diff --git a/fuzz/resize_plane/resize_plane.rs b/fuzz/resize_plane/resize_plane.rs
new file mode 100644
index 0000000..29b1844
--- /dev/null
+++ b/fuzz/resize_plane/resize_plane.rs
@@ -0,0 +1,42 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler};
+
+fuzz_target!(|data: (u16, u16, u16, u16)| {
+    resize_plane(
+        data.0 as usize,
+        data.1 as usize,
+        data.2 as usize,
+        data.3 as usize,
+        ResamplingFunction::Bilinear,
+    )
+});
+
+fn resize_plane(
+    src_width: usize,
+    src_height: usize,
+    dst_width: usize,
+    dst_height: usize,
+    sampler: ResamplingFunction,
+) {
+    if src_width == 0
+        || src_width > 2000
+        || src_height == 0
+        || src_height > 2000
+        || dst_width == 0
+        || dst_width > 512
+        || dst_height == 0
+        || dst_height > 512
+    {
+        return;
+    }
+
+    let mut src_data = vec![15u8; src_width * src_height];
+
+    let store = ImageStore::<u8, 1>::from_slice(&mut src_data, src_width, src_height).unwrap();
+    let scaler = Scaler::new(sampler);
+    _ = scaler
+        .resize_plane(ImageSize::new(dst_width, dst_height), store)
+        .unwrap();
+}
diff --git a/fuzz/resize_plane_u16/resize_plane_u16.rs b/fuzz/resize_plane_u16/resize_plane_u16.rs
new file mode 100644
index 0000000..5c87ac7
--- /dev/null
+++ b/fuzz/resize_plane_u16/resize_plane_u16.rs
@@ -0,0 +1,47 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingU16};
+
+fuzz_target!(|data: (u16, u16, u16, u16)| {
+    resize_rgb(
+        data.0 as usize,
+        data.1 as usize,
+        data.2 as usize,
+        data.3 as usize,
+        ResamplingFunction::Lanczos3,
+    )
+});
+
+fn resize_rgb(
+    src_width: usize,
+    src_height: usize,
+    dst_width: usize,
+    dst_height: usize,
+    sampler: ResamplingFunction,
+) {
+    if src_width == 0
+        || src_width > 2000
+        || src_height == 0
+        || src_height > 2000
+        || dst_width == 0
+        || dst_width > 512
+        || dst_height == 0
+        || dst_height > 512
+    {
+        return;
+    }
+
+    let mut src_data = vec![1u16; src_width * src_height];
+
+    let store = ImageStore::<u16, 1>::from_slice(&mut src_data, src_width, src_height).unwrap();
+    let scaler = Scaler::new(sampler);
+    _ = scaler
+        .resize_plane_u16(ImageSize::new(dst_width, dst_height), store, 10)
+        .unwrap();
+
+    let store = ImageStore::<u16, 1>::from_slice(&mut src_data, src_width, src_height).unwrap();
+    _ = scaler
+        .resize_plane_u16(ImageSize::new(dst_width, dst_height), store, 16)
+        .unwrap();
+}
diff --git a/src/neon/plane_u8.rs b/src/neon/plane_u8.rs
index ee3d581..724b4b0 100644
--- a/src/neon/plane_u8.rs
+++ b/src/neon/plane_u8.rs
@@ -183,7 +183,7 @@ pub fn convolve_horizontal_plane_neon_rows_4_u8(
     }

     while jx < bounds.size {
-        let w_ptr = weights.get_unchecked(jx..(jx + 4));
+        let w_ptr = weights.get_unchecked(jx..(jx + 1));
         let weight = vld1_lane_s16::<0>(w_ptr.as_ptr(), vdup_n_s16(0));
         let bounds_start = bounds.start + jx;

From 50160074b7d12dc14cd09bf455082caa022e53a0 Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk
Date: Mon, 9 Dec 2024 18:45:15 +0000
Subject: [PATCH 16/19] Fuzzing f32 images

---
 .github/workflows/build_push.yml | 16 +++++++-
 fuzz/Cargo.toml | 23 +++++++++++-
 fuzz/resize_plane_f32/resize_plane_f32.rs | 46 +++++++++++++++++++++++
 fuzz/resize_rgb_f32/resize_rgb_f32.rs | 46 +++++++++++++++++++++++
 fuzz/resize_rgba_f32/resize_rgba_f32.rs | 46 +++++++++++++++++++++++
 5 files changed, 175 insertions(+), 2 deletions(-)
 create mode 100644 fuzz/resize_plane_f32/resize_plane_f32.rs
 create mode 100644 fuzz/resize_rgb_f32/resize_rgb_f32.rs
 create mode 100644 fuzz/resize_rgba_f32/resize_rgba_f32.rs

diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml
index bd7ad57..1648287 100644
--- a/.github/workflows/build_push.yml
+++ b/.github/workflows/build_push.yml
@@ -62,4 +62,18 @@
       - run: cargo install cargo-fuzz
       - run: cargo fuzz run resize_rgba_u16 -- -max_total_time=30
       - run: cargo fuzz run resize_rgb_u16 -- -max_total_time=30
-      - run: cargo fuzz run resize_plane_u16 -- -max_total_time=30
\ No newline at end of file
+      - run: cargo fuzz run resize_plane_u16 -- -max_total_time=30
+
+  fuzz_rgba_f32:
+    name: Fuzzing floating point
+    strategy:
+      matrix:
+        os: [ ubuntu-latest, macos-latest ]
+    runs-on: ${{ matrix.os }}
+    steps:
+      - uses: actions/checkout@v4
+      - uses: dtolnay/rust-toolchain@nightly
+      - run: cargo install cargo-fuzz
+      - run: cargo fuzz run resize_rgba_f32 -- -max_total_time=30
+      - run: cargo fuzz run resize_rgb_f32 -- -max_total_time=30
+      - run: cargo fuzz run resize_plane_f32 -- -max_total_time=30
\ No newline at end of file
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
index 476e60e..3db86e5 100644
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -51,4 +51,25 @@
 name = "resize_plane_u16"
 path = "resize_plane_u16/resize_plane_u16.rs"
 test = false
 doc = false
-bench = false
\ No newline at end of file
+bench = false
+
+[[bin]]
+name = "resize_rgba_f32"
+path = "resize_rgba_f32/resize_rgba_f32.rs"
+test = false
+doc = false
+bench = false
+
+[[bin]]
+name = "resize_rgb_f32"
+path = "resize_rgb_f32/resize_rgb_f32.rs"
+test = false
+doc = false
+bench = false
+
+[[bin]]
+name = "resize_plane_f32"
+path = "resize_plane_f32/resize_plane_f32.rs"
+test = false
+doc = false
+bench = false
diff --git a/fuzz/resize_plane_f32/resize_plane_f32.rs b/fuzz/resize_plane_f32/resize_plane_f32.rs
new file mode 100644
index 0000000..62752d9
--- /dev/null
+++ b/fuzz/resize_plane_f32/resize_plane_f32.rs
@@ -0,0 +1,46 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler};
+
+fuzz_target!(|data: (u16, u16, u16, u16)| {
+    resize_plane(
+        data.0 as usize,
+        data.1 as usize,
+        data.2 as usize,
+        data.3 as usize,
+        ResamplingFunction::Bilinear,
+    )
+});
+
+fn resize_plane(
+    src_width: usize,
+    src_height: usize,
+    dst_width: usize,
+    dst_height: usize,
+    sampler: ResamplingFunction,
+) {
+    if src_width == 0
+        || src_width > 2000
+        || src_height == 0
+        || src_height > 2000
+        || dst_width == 0
+        || dst_width > 512
+        || dst_height == 0
+        || dst_height > 512
+    {
+        return;
+    }
+
+    let mut src_data = vec![0f32; src_width * src_height];
+
+    let store = ImageStore::<f32, 1>::from_slice(&mut src_data, src_width, src_height).unwrap();
+    let scaler = Scaler::new(sampler);
+    _ = scaler
+        .resize_plane_f32(ImageSize::new(dst_width, dst_height), store)
+        .unwrap();
+    let store = ImageStore::<f32, 1>::from_slice(&mut src_data, src_width, src_height).unwrap();
+    _ = scaler
+        .resize_plane_f32(ImageSize::new(dst_width, dst_height), store)
+        .unwrap();
+}
diff --git a/fuzz/resize_rgb_f32/resize_rgb_f32.rs b/fuzz/resize_rgb_f32/resize_rgb_f32.rs
new file mode 100644
index 0000000..1a3226e
--- /dev/null
+++ b/fuzz/resize_rgb_f32/resize_rgb_f32.rs
@@ -0,0 +1,46 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingF32};
+
+fuzz_target!(|data: (u16, u16, u16, u16)| {
+    resize_rgb(
+        data.0 as usize,
+        data.1 as usize,
+        data.2 as usize,
+        data.3 as usize,
+        ResamplingFunction::Bilinear,
+    )
+});
+
+fn resize_rgb(
+    src_width: usize,
+    src_height: usize,
+    dst_width: usize,
+    dst_height: usize,
+    sampler: ResamplingFunction,
+) {
+    if src_width == 0
+        || src_width > 2000
+        || src_height == 0
+        || src_height > 2000
+        || dst_width == 0
+        || dst_width > 512
+        || dst_height == 0
+        || dst_height > 512
+    {
+        return;
+    }
+
+    let mut src_data = vec![0f32; src_width * src_height * 3];
+
+    let store = ImageStore::<f32, 3>::from_slice(&mut src_data, src_width, src_height).unwrap();
+    let scaler = Scaler::new(sampler);
+    _ = scaler
+        .resize_rgb_f32(ImageSize::new(dst_width, dst_height), store)
+        .unwrap();
+    let store = ImageStore::<f32, 3>::from_slice(&mut src_data, src_width, src_height).unwrap();
+    _ = scaler
+        .resize_rgb_f32(ImageSize::new(dst_width, dst_height), store)
+        .unwrap();
+}
diff --git a/fuzz/resize_rgba_f32/resize_rgba_f32.rs b/fuzz/resize_rgba_f32/resize_rgba_f32.rs
new file mode 100644
index 0000000..14b4bfb
--- /dev/null
+++ b/fuzz/resize_rgba_f32/resize_rgba_f32.rs
@@ -0,0 +1,46 @@
+#![no_main]
+
+use libfuzzer_sys::fuzz_target;
+use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingF32};
+
+fuzz_target!(|data: (u16, u16, u16, u16)| {
+    resize_rgba(
+        data.0 as usize,
+        data.1 as usize,
+        data.2 as usize,
+        data.3 as usize,
+        ResamplingFunction::Bilinear,
+    )
+});
+
+fn resize_rgba(
+    src_width: usize,
+    src_height: usize,
+    dst_width: usize,
+    dst_height: usize,
+    sampler: ResamplingFunction,
+) {
+    if src_width == 0
+        || src_width > 2000
+        || src_height == 0
+        || src_height > 2000
+        || dst_width == 0
+        || dst_width > 512
+        || dst_height == 0
+        || dst_height > 512
+    {
+        return;
+    }
+
+    let mut src_data = vec![0f32; src_width * src_height * 4];
+
+    let store = ImageStore::<f32, 4>::from_slice(&mut src_data, src_width, src_height).unwrap();
+    let scaler = Scaler::new(sampler);
+    _ = scaler
+        .resize_rgba_f32(ImageSize::new(dst_width, dst_height), store, false)
+        .unwrap();
+    let store = ImageStore::<f32, 4>::from_slice(&mut src_data, src_width, src_height).unwrap();
+    _ = scaler
+        .resize_rgba_f32(ImageSize::new(dst_width, dst_height), store, true)
+        .unwrap();
+}

From 59ed259143f5c15eb397e13f2501802edaab6a9a Mon Sep 17 00:00:00 2001
From: Radzivon Bartoshyk
Date: Mon, 9 Dec 2024 19:17:53 +0000
Subject: [PATCH 17/19] Copyrights in fuzzer

---
 fuzz/resize_plane/resize_plane.rs | 29 +++++++++++++++++++
 fuzz/resize_plane_f32/resize_plane_f32.rs | 29 +++++++++++++++++++
 fuzz/resize_plane_u16/resize_plane_u16.rs | 29 +++++++++++++++++++
 fuzz/resize_rgb/resize_rgb.rs | 29 +++++++++++++++++++
 fuzz/resize_rgb_f32/resize_rgb_f32.rs | 29 +++++++++++++++++++
 fuzz/resize_rgb_u16/resize_rgb_u16.rs | 29 +++++++++++++++++++
 fuzz/resize_rgba/resize_rgba.rs | 29 +++++++++++++++++++
 fuzz/resize_rgba_f32/resize_rgba_f32.rs | 29 +++++++++++++++++++
 fuzz/resize_rgba_u16/resize_rgba_u16.rs | 29 +++++++++++++++++++
 src/threading_policy.rs | 10 ++++----
 10 files changed, 266 insertions(+), 5 deletions(-)

diff --git a/fuzz/resize_plane/resize_plane.rs b/fuzz/resize_plane/resize_plane.rs
index 29b1844..829cca4 100644
--- a/fuzz/resize_plane/resize_plane.rs
+++ b/fuzz/resize_plane/resize_plane.rs
@@ -1,3 +1,32 @@
+/*
+ * Copyright (c) Radzivon Bartoshyk. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + #![no_main] use libfuzzer_sys::fuzz_target; diff --git a/fuzz/resize_plane_f32/resize_plane_f32.rs b/fuzz/resize_plane_f32/resize_plane_f32.rs index 62752d9..bb128e0 100644 --- a/fuzz/resize_plane_f32/resize_plane_f32.rs +++ b/fuzz/resize_plane_f32/resize_plane_f32.rs @@ -1,3 +1,32 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + #![no_main] use libfuzzer_sys::fuzz_target; diff --git a/fuzz/resize_plane_u16/resize_plane_u16.rs b/fuzz/resize_plane_u16/resize_plane_u16.rs index 5c87ac7..8a59c96 100644 --- a/fuzz/resize_plane_u16/resize_plane_u16.rs +++ b/fuzz/resize_plane_u16/resize_plane_u16.rs @@ -1,3 +1,32 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + #![no_main] use libfuzzer_sys::fuzz_target; diff --git a/fuzz/resize_rgb/resize_rgb.rs b/fuzz/resize_rgb/resize_rgb.rs index ee424f7..ecc74d3 100644 --- a/fuzz/resize_rgb/resize_rgb.rs +++ b/fuzz/resize_rgb/resize_rgb.rs @@ -1,3 +1,32 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + #![no_main] use libfuzzer_sys::fuzz_target; diff --git a/fuzz/resize_rgb_f32/resize_rgb_f32.rs b/fuzz/resize_rgb_f32/resize_rgb_f32.rs index 1a3226e..f2d4773 100644 --- a/fuzz/resize_rgb_f32/resize_rgb_f32.rs +++ b/fuzz/resize_rgb_f32/resize_rgb_f32.rs @@ -1,3 +1,32 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + #![no_main] use libfuzzer_sys::fuzz_target; diff --git a/fuzz/resize_rgb_u16/resize_rgb_u16.rs b/fuzz/resize_rgb_u16/resize_rgb_u16.rs index a636854..47e48fd 100644 --- a/fuzz/resize_rgb_u16/resize_rgb_u16.rs +++ b/fuzz/resize_rgb_u16/resize_rgb_u16.rs @@ -1,3 +1,32 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + #![no_main] use libfuzzer_sys::fuzz_target; diff --git a/fuzz/resize_rgba/resize_rgba.rs b/fuzz/resize_rgba/resize_rgba.rs index 00f158c..dab34f5 100644 --- a/fuzz/resize_rgba/resize_rgba.rs +++ b/fuzz/resize_rgba/resize_rgba.rs @@ -1,3 +1,32 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + #![no_main] use libfuzzer_sys::fuzz_target; diff --git a/fuzz/resize_rgba_f32/resize_rgba_f32.rs b/fuzz/resize_rgba_f32/resize_rgba_f32.rs index 14b4bfb..8c08146 100644 --- a/fuzz/resize_rgba_f32/resize_rgba_f32.rs +++ b/fuzz/resize_rgba_f32/resize_rgba_f32.rs @@ -1,3 +1,32 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + #![no_main] use libfuzzer_sys::fuzz_target; diff --git a/fuzz/resize_rgba_u16/resize_rgba_u16.rs b/fuzz/resize_rgba_u16/resize_rgba_u16.rs index c69a3e9..eb519ec 100644 --- a/fuzz/resize_rgba_u16/resize_rgba_u16.rs +++ b/fuzz/resize_rgba_u16/resize_rgba_u16.rs @@ -1,3 +1,32 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + #![no_main] use libfuzzer_sys::fuzz_target; diff --git a/src/threading_policy.rs b/src/threading_policy.rs index ba70111..2f3b9a4 100644 --- a/src/threading_policy.rs +++ b/src/threading_policy.rs @@ -47,20 +47,20 @@ pub enum ThreadingPolicy { impl ThreadingPolicy { #[cfg(not(target_arch = "wasm32"))] - pub fn get_threads_count(&self, for_size: ImageSize) -> usize { + pub fn thread_count(&self, for_size: ImageSize) -> usize { match self { ThreadingPolicy::Single => 1, ThreadingPolicy::Fixed(thread_count) => (*thread_count).max(1), ThreadingPolicy::Adaptive => { let box_size = 256 * 256; let new_box_size = for_size.height * for_size.width; - (new_box_size / box_size).clamp(1, 16) + (new_box_size / box_size).clamp(1, 12) } } } #[cfg(target_arch = "wasm32")] - pub fn get_threads_count(&self, _: ImageSize) -> usize { + pub fn thread_count(&self, _: ImageSize) -> usize { 1 } } @@ -71,9 +71,9 @@ impl ThreadingPolicy { if *self == ThreadingPolicy::Single { return None; } - let threads_count = self.get_threads_count(for_size); + let thread_count = self.thread_count(for_size); match rayon::ThreadPoolBuilder::new() - .num_threads(threads_count) + .num_threads(thread_count) .build() { Ok(pool) => Some(pool), From 625d1bec86faeb83e4341a630c08ded7476979b5 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Tue, 10 Dec 2024 10:44:28 +0000 Subject: [PATCH 18/19] Clippy --- .github/workflows/build_push.yml | 13 +++++++++++-- fuzz/resize_rgba_u16/resize_rgba_u16.rs | 12 +++++++++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml index 1648287..3e8fb85 100644 --- a/.github/workflows/build_push.yml +++ b/.github/workflows/build_push.yml @@ -29,13 +29,22 @@ jobs: - run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target i686-unknown-linux-gnu - run: cargo build --target powerpc-unknown-linux-gnu - run: cargo build --target riscv64gc-unknown-linux-gnu - - run: cargo clippy - - run: cargo clippy --target aarch64-unknown-linux-gnu - run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target x86_64-unknown-linux-gnu - run: RUSTFLAGS="-C target-feature=+sse4.1,+f16c" cargo build --features half --target x86_64-unknown-linux-gnu - run: RUSTFLAGS="-C target-feature=+avx2,+f16c" cargo build --features half --target 
x86_64-unknown-linux-gnu - run: RUSTFLAGS="-C target-feature=+avx2" cargo build --target x86_64-unknown-linux-gnu + clippy: + name: Clippy + strategy: + matrix: + os: [ ubuntu-latest, macos-latest ] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - run: cargo clippy + fuzz_rgba_8bit: name: Fuzzing 8bit strategy: diff --git a/fuzz/resize_rgba_u16/resize_rgba_u16.rs b/fuzz/resize_rgba_u16/resize_rgba_u16.rs index eb519ec..494da64 100644 --- a/fuzz/resize_rgba_u16/resize_rgba_u16.rs +++ b/fuzz/resize_rgba_u16/resize_rgba_u16.rs @@ -30,7 +30,7 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingU16}; +use pic_scale::{Ar30ByteOrder, ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingU16}; fuzz_target!(|data: (u16, u16, u16, u16)| { resize_rgba( @@ -82,4 +82,14 @@ fn resize_rgba( _ = scaler .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 16, true) .unwrap(); + + let src_data_ar30 = vec![1u32; src_width * src_height]; + let mut dst_data_ar30 = vec![1u32; dst_width * dst_height]; + _ = scaler.resize_ar30( + &src_data_ar30, + ImageSize::new(src_width, src_height), + &mut dst_data_ar30, + ImageSize::new(dst_width, dst_height), + Ar30ByteOrder::Host, + ); } From 5f61298a2b19262b211e73195dbae753781c9635 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Tue, 10 Dec 2024 12:27:23 +0000 Subject: [PATCH 19/19] MSRV declared --- Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.toml b/Cargo.toml index 70878ec..68371ce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,7 @@ categories = ["multimedia::images", "multimedia::video"] homepage = "https://github.com/awxkee/pic-scale" repository = "https://github.com/awxkee/pic-scale" exclude = ["*.jpg", "/assets", "*.png", "*.sh", "/assets/*"] +rust-version = "1.73.0" [dependencies] colorutils-rs = {version = "0.7.0", optional = true}
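
A note on the vmlal_lane rewrite in PATCH 10/19 above: instead of broadcasting every i16 tap into its own register with vld1q_dup_s16, the taps are packed into a single int16x4_t and each multiply-accumulate selects its tap through a const lane index, which saves both the registers and the per-tap duplicating loads. The sketch below is a minimal, self-contained illustration of that pattern, not code from the crate; the function name and shapes are invented, and it assumes aarch64 with NEON and rows holding at least 8 bytes.

    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
    use std::arch::aarch64::*;

    // Hypothetical standalone helper, not part of pic-scale: one 8-pixel
    // column group of a 4-tap vertical pass in the packed-weight style.
    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
    unsafe fn vertical_4_taps_8px(rows: [&[u8]; 4], weights: &[i16; 4]) -> (int32x4_t, int32x4_t) {
        // All four taps live in one int16x4_t instead of four duplicated vectors.
        let v_weight = vld1_s16(weights.as_ptr());
        let mut store_0 = vdupq_n_s32(0); // accumulators for pixels 0..4
        let mut store_1 = vdupq_n_s32(0); // accumulators for pixels 4..8
        // Widen 8 u8 pixels of each row to i16 (each row must hold >= 8 bytes).
        let low0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(rows[0].as_ptr())));
        let low1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(rows[1].as_ptr())));
        let low2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(rows[2].as_ptr())));
        let low3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(rows[3].as_ptr())));
        // Multiply-accumulate each row against its own lane of v_weight;
        // the const lane index has to match the row the pixels came from.
        store_0 = vmlal_lane_s16::<0>(store_0, vget_low_s16(low0), v_weight);
        store_1 = vmlal_high_lane_s16::<0>(store_1, low0, v_weight);
        store_0 = vmlal_lane_s16::<1>(store_0, vget_low_s16(low1), v_weight);
        store_1 = vmlal_high_lane_s16::<1>(store_1, low1, v_weight);
        store_0 = vmlal_lane_s16::<2>(store_0, vget_low_s16(low2), v_weight);
        store_1 = vmlal_high_lane_s16::<2>(store_1, low2, v_weight);
        store_0 = vmlal_lane_s16::<3>(store_0, vget_low_s16(low3), v_weight);
        store_1 = vmlal_high_lane_s16::<3>(store_1, low3, v_weight);
        (store_0, store_1)
    }

Because the lane index is a const generic, a mismatched index (for example lane 3 against the pixels of row 2) compiles cleanly but silently applies the wrong tap; the fuzz targets introduced later in this series are exactly the kind of harness that flushes out such slips.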