From 898c096dfb37ae9dfeeb4b1b61b18efeab968865 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Fri, 20 Dec 2024 15:25:07 +0000 Subject: [PATCH 1/9] x86 improvements --- src/avx2/rgba_f32.rs | 167 +++++++++++++--------------- src/avx2/vertical_f32.rs | 130 +++++++++------------- src/color_group.rs | 68 +----------- src/convolve_naive_f32.rs | 223 ++++---------------------------------- src/dispatch_group_f32.rs | 216 ++++++++++++++++++------------------ src/lib.rs | 1 - src/neon/plane_f32.rs | 82 ++++++-------- src/neon/rgb_f32.rs | 53 ++++----- src/neon/rgba_f32.rs | 67 +++++------- src/neon/vertical_f32.rs | 108 +++++++----------- src/plane_f32.rs | 12 +- src/rgb_f32.rs | 63 +++-------- src/rgba_f32.rs | 22 ++-- src/sse/plane_f32.rs | 181 +++++++++++++------------------ src/sse/rgb_f32.rs | 137 +++++++++++------------ src/sse/rgba_f32.rs | 138 +++++++++++------------ src/sse/vertical_f32.rs | 138 ++++++++++------------- src/unsafe_slice.rs | 69 ------------ 18 files changed, 674 insertions(+), 1201 deletions(-) delete mode 100644 src/unsafe_slice.rs diff --git a/src/avx2/rgba_f32.rs b/src/avx2/rgba_f32.rs index f82923a..0aba5bd 100644 --- a/src/avx2/rgba_f32.rs +++ b/src/avx2/rgba_f32.rs @@ -39,13 +39,13 @@ use crate::sse::{load_4_weights_group_2_avx, load_8_weights_group_4_avx, shuffle #[inline(always)] unsafe fn convolve_horizontal_parts_one_rgba_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m256, store_0: __m256, ) -> __m256 { const COMPONENTS: usize = 4; - let src_ptr = src.add(start_x * COMPONENTS); - let rgb_pixel = _mm_loadu_ps(src_ptr); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..); + let rgb_pixel = _mm_loadu_ps(src_ptr.as_ptr()); _mm256_fma_ps::( store_0, avx_combine_ps(rgb_pixel, _mm_setzero_ps()), @@ -56,13 +56,13 @@ unsafe fn convolve_horizontal_parts_one_rgba_f32( #[inline(always)] unsafe fn convolve_horizontal_parts_4_rgba_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m256, weight1: __m256, store_0: __m256, ) -> __m256 { const COMPONENTS: usize = 4; - let src_ptr = src.add(start_x * COMPONENTS); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); let rgb_pixel_0 = _mm256_loadu_ps(src_ptr); let rgb_pixel_1 = _mm256_loadu_ps(src_ptr.add(8)); @@ -75,7 +75,7 @@ unsafe fn convolve_horizontal_parts_4_rgba_f32( #[inline(always)] unsafe fn convolve_horizontal_parts_8_rgba_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m256, weight1: __m256, weight2: __m256, @@ -83,7 +83,7 @@ unsafe fn convolve_horizontal_parts_8_rgba_f32( store_0: __m256, ) -> __m256 { const COMPONENTS: usize = 4; - let src_ptr = src.add(start_x * COMPONENTS); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); let rgb_pixel_0 = _mm256_loadu_ps(src_ptr); let rgb_pixel_1 = _mm256_loadu_ps(src_ptr.add(8)); @@ -100,14 +100,14 @@ unsafe fn convolve_horizontal_parts_8_rgba_f32( #[inline(always)] unsafe fn convolve_horizontal_parts_2_rgba_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m256, store_0: __m256, ) -> __m256 { const COMPONENTS: usize = 4; - let src_ptr = src.add(start_x * COMPONENTS); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..); - let rgb_pixel = _mm256_loadu_ps(src_ptr); + let rgb_pixel = _mm256_loadu_ps(src_ptr.as_ptr()); _mm256_fma_ps::(store_0, rgb_pixel, weight0) } @@ -116,9 +116,9 @@ pub(crate) fn convolve_horizontal_rgba_avx_rows_4_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: 
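// Reviewer note — illustrative sketch only, not part of the patch. The change repeated across
// these AVX2 helpers is replacing raw `*const f32` parameters with `&[f32]`, taking the pixel
// offset with `get_unchecked(..)` and only converting to a pointer at the SIMD load itself.
// The helper name below is hypothetical; bounds remain the caller's responsibility exactly as
// in the patched code.
use std::arch::x86_64::*;

#[inline(always)]
unsafe fn load_rgba_pixel(src: &[f32], start_x: usize) -> __m128 {
    const COMPONENTS: usize = 4; // RGBA
    // Offset inside the slice, then take a pointer only for the unaligned load.
    let chunk = src.get_unchecked(start_x * COMPONENTS..);
    _mm_loadu_ps(chunk.as_ptr())
}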
usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { unsafe { @@ -127,9 +127,9 @@ pub(crate) fn convolve_horizontal_rgba_avx_rows_4_f32( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } else { @@ -137,9 +137,9 @@ pub(crate) fn convolve_horizontal_rgba_avx_rows_4_f32( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -151,18 +151,18 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { convolve_horizontal_rgba_avx_rows_4_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -172,18 +172,18 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { convolve_horizontal_rgba_avx_rows_4_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -193,9 +193,9 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { const CHANNELS: usize = 4; @@ -218,7 +218,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl( store_0 = convolve_horizontal_parts_8_rgba_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, weight2, @@ -227,7 +227,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl( ); store_1 = convolve_horizontal_parts_8_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight0, weight1, weight2, @@ -236,7 +236,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl( ); store_2 = convolve_horizontal_parts_8_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight0, weight1, weight2, @@ -245,7 +245,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl( ); store_3 = convolve_horizontal_parts_8_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight0, weight1, weight2, @@ -262,28 +262,28 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl( store_0 = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, store_0, ); store_1 = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight0, weight1, store_1, ); store_2 = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight0, weight1, store_2, ); store_3 = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight0, weight1, store_3, @@ -297,27 +297,23 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl( let weight1 = 
_mm_set1_ps(ptr.add(1).read_unaligned()); let weight = avx_combine_ps(weight0, weight1); let filter_start = jx + bounds.start; - store_0 = convolve_horizontal_parts_2_rgba_f32::( - filter_start, - unsafe_source_ptr_0, - weight, - store_0, - ); + store_0 = + convolve_horizontal_parts_2_rgba_f32::(filter_start, src, weight, store_0); store_1 = convolve_horizontal_parts_2_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight, store_1, ); store_2 = convolve_horizontal_parts_2_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight, store_2, ); store_3 = convolve_horizontal_parts_2_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight, store_3, ); @@ -328,27 +324,23 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl( let ptr = weights_ptr.add(jx + filter_offset); let filter_start = jx + bounds.start; let weight0 = _mm256_set1_ps(ptr.read_unaligned()); - store_0 = convolve_horizontal_parts_one_rgba_f32::( - filter_start, - unsafe_source_ptr_0, - weight0, - store_0, - ); + store_0 = + convolve_horizontal_parts_one_rgba_f32::(filter_start, src, weight0, store_0); store_1 = convolve_horizontal_parts_one_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight0, store_1, ); store_2 = convolve_horizontal_parts_one_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight0, store_2, ); store_3 = convolve_horizontal_parts_one_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight0, store_3, ); @@ -356,36 +348,36 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl( } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..); _mm_storeu_ps( - dest_ptr, + dest_ptr.as_mut_ptr(), _mm_add_ps( _mm256_castps256_ps128(store_0), _mm256_extractf128_ps::<1>(store_0), ), ); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride..); _mm_storeu_ps( - dest_ptr, + dest_ptr.as_mut_ptr(), _mm_add_ps( _mm256_castps256_ps128(store_1), _mm256_extractf128_ps::<1>(store_1), ), ); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..); _mm_storeu_ps( - dest_ptr, + dest_ptr.as_mut_ptr(), _mm_add_ps( _mm256_castps256_ps128(store_2), _mm256_extractf128_ps::<1>(store_2), ), ); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..); _mm_storeu_ps( - dest_ptr, + dest_ptr.as_mut_ptr(), _mm_add_ps( _mm256_castps256_ps128(store_3), _mm256_extractf128_ps::<1>(store_3), @@ -400,8 +392,8 @@ pub(crate) fn convolve_horizontal_rgba_avx_row_one_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { unsafe { if FMA { @@ -409,16 +401,16 @@ pub(crate) fn convolve_horizontal_rgba_avx_row_one_f32( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } else { convolve_horizontal_rgba_avx_row_one_f32_regular( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } } @@ -429,15 +421,15 @@ unsafe fn 
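// Reviewer note — sketch, not part of the patch. The row-one / rows-4 pairs in this file follow
// one dispatch shape: a const-generic entry point selects an FMA or non-FMA monomorphization,
// and both thin wrappers forward to a shared #[inline(always)] implementation. All names below
// are illustrative, and the target_feature attributes are an assumption about how such wrappers
// are typically declared.
use std::arch::x86_64::*;

#[inline(always)]
unsafe fn mla8_impl<const FMA: bool>(dst: &mut [f32], src: &[f32], weight: f32) {
    let acc = _mm256_loadu_ps(dst.as_ptr());
    let row = _mm256_loadu_ps(src.as_ptr());
    let w = _mm256_set1_ps(weight);
    let r = if FMA {
        _mm256_fmadd_ps(row, w, acc) // fused multiply-add
    } else {
        _mm256_add_ps(acc, _mm256_mul_ps(row, w)) // separate mul + add
    };
    _mm256_storeu_ps(dst.as_mut_ptr(), r);
}

#[target_feature(enable = "avx2,fma")]
unsafe fn mla8_fma(dst: &mut [f32], src: &[f32], weight: f32) {
    mla8_impl::<true>(dst, src, weight)
}

#[target_feature(enable = "avx2")]
unsafe fn mla8_regular(dst: &mut [f32], src: &[f32], weight: f32) {
    mla8_impl::<false>(dst, src, weight)
}

pub fn mla8<const FMA: bool>(dst: &mut [f32], src: &[f32], weight: f32) {
    assert!(dst.len() >= 8 && src.len() >= 8);
    unsafe {
        if FMA {
            mla8_fma(dst, src, weight)
        } else {
            mla8_regular(dst, src, weight)
        }
    }
}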
convolve_horizontal_rgba_avx_row_one_f32_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { convolve_horizontal_rgba_avx_row_one_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -446,15 +438,15 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f32_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { convolve_horizontal_rgba_avx_row_one_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -463,8 +455,8 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f32_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { const CHANNELS: usize = 4; let mut filter_offset = 0usize; @@ -482,7 +474,7 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f32_impl( store = convolve_horizontal_parts_8_rgba_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, weight2, @@ -498,7 +490,7 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f32_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, store, @@ -512,12 +504,7 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f32_impl( let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned()); let weight = avx_combine_ps(weight0, weight1); let filter_start = jx + bounds.start; - store = convolve_horizontal_parts_2_rgba_f32::( - filter_start, - unsafe_source_ptr_0, - weight, - store, - ); + store = convolve_horizontal_parts_2_rgba_f32::(filter_start, src, weight, store); jx += 2 } @@ -525,19 +512,15 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f32_impl( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = _mm256_set1_ps(ptr.read_unaligned()); let filter_start = jx + bounds.start; - store = convolve_horizontal_parts_one_rgba_f32::( - filter_start, - unsafe_source_ptr_0, - weight0, - store, - ); + store = + convolve_horizontal_parts_one_rgba_f32::(filter_start, src, weight0, store); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..); _mm_storeu_ps( - dest_ptr, + dest_ptr.as_mut_ptr(), _mm_add_ps( _mm256_castps256_ps128(store), _mm256_extractf128_ps::<1>(store), diff --git a/src/avx2/vertical_f32.rs b/src/avx2/vertical_f32.rs index ddd673a..6e88617 100644 --- a/src/avx2/vertical_f32.rs +++ b/src/avx2/vertical_f32.rs @@ -38,9 +38,9 @@ use std::arch::x86_64::*; pub(crate) unsafe fn convolve_vertical_part_avx_32_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -55,13 +55,11 @@ pub(crate) unsafe fn convolve_vertical_part_avx_32_f32( let py = start_y + j; let weight = *filter.get_unchecked(j); let v_weight = _mm256_set1_ps(weight); - let src_ptr = src.add(src_stride * py); - - let s_ptr = src_ptr.add(px); - let item_row_0 = _mm256_loadu_ps(s_ptr); - let item_row_1 = _mm256_loadu_ps(s_ptr.add(8)); - let item_row_2 = _mm256_loadu_ps(s_ptr.add(16)); - let item_row_3 = _mm256_loadu_ps(s_ptr.add(24)); + let src_ptr = src.get_unchecked(src_stride 
* py + px..).as_ptr(); + let item_row_0 = _mm256_loadu_ps(src_ptr); + let item_row_1 = _mm256_loadu_ps(src_ptr.add(8)); + let item_row_2 = _mm256_loadu_ps(src_ptr.add(16)); + let item_row_3 = _mm256_loadu_ps(src_ptr.add(24)); store_0 = _mm256_fma_ps::(store_0, item_row_0, v_weight); store_1 = _mm256_fma_ps::(store_1, item_row_1, v_weight); @@ -69,7 +67,7 @@ pub(crate) unsafe fn convolve_vertical_part_avx_32_f32( store_3 = _mm256_fma_ps::(store_3, item_row_3, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); _mm256_storeu_ps(dst_ptr, store_0); _mm256_storeu_ps(dst_ptr.add(8), store_1); _mm256_storeu_ps(dst_ptr.add(16), store_2); @@ -80,9 +78,9 @@ pub(crate) unsafe fn convolve_vertical_part_avx_32_f32( pub(crate) unsafe fn convolve_vertical_part_avx_16_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -95,17 +93,16 @@ pub(crate) unsafe fn convolve_vertical_part_avx_16_f32( let py = start_y + j; let weight = *filter.get_unchecked(j); let v_weight = _mm256_set1_ps(weight); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py + px..).as_ptr(); - let s_ptr = src_ptr.add(px); - let item_row_0 = _mm256_loadu_ps(s_ptr); - let item_row_1 = _mm256_loadu_ps(s_ptr.add(8)); + let item_row_0 = _mm256_loadu_ps(src_ptr); + let item_row_1 = _mm256_loadu_ps(src_ptr.add(8)); store_0 = _mm256_fma_ps::(store_0, item_row_0, v_weight); store_1 = _mm256_fma_ps::(store_1, item_row_1, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); _mm256_storeu_ps(dst_ptr, store_0); _mm256_storeu_ps(dst_ptr.add(8), store_1); } @@ -114,9 +111,9 @@ pub(crate) unsafe fn convolve_vertical_part_avx_16_f32( pub(crate) unsafe fn convolve_vertical_part_avx_8_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -128,15 +125,13 @@ pub(crate) unsafe fn convolve_vertical_part_avx_8_f32( let py = start_y + j; let weight = *filter.get_unchecked(j); let v_weight = _mm256_set1_ps(weight); - let src_ptr = src.add(src_stride * py); - - let s_ptr = src_ptr.add(px); - let item_row_0 = _mm256_loadu_ps(s_ptr); + let src_ptr = src.get_unchecked(src_stride * py + px..).as_ptr(); + let item_row_0 = _mm256_loadu_ps(src_ptr); store_0 = _mm256_fma_ps::(store_0, item_row_0, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); _mm256_storeu_ps(dst_ptr, store_0); } @@ -144,9 +139,9 @@ pub(crate) unsafe fn convolve_vertical_part_avx_8_f32( pub(crate) unsafe fn convolve_vertical_part_avx_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -158,15 +153,14 @@ pub(crate) unsafe fn convolve_vertical_part_avx_f32( let py = start_y + j; let weight = *filter.get_unchecked(j); let v_weight = _mm256_set1_ps(weight); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py + px..).as_ptr(); - let s_ptr = src_ptr.add(px); - let item_row_0 = _mm256_set1_ps(s_ptr.read_unaligned()); + let item_row_0 = _mm256_set1_ps(src_ptr.read_unaligned()); store_0 = _mm256_fma_ps::(store_0, item_row_0, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); (dst_ptr as *mut 
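// Reviewer note — sketch, not part of the patch. Every vertical kernel above computes the same
// per-column sum, dst[x] = sum_j filter[j] * src[(start_y + j) * src_stride + x]; the 32/16/8/1
// variants only differ in how many columns they advance per iteration. A scalar reference for
// that contract:
fn convolve_vertical_scalar(
    dst: &mut [f32],
    src: &[f32],
    src_stride: usize,
    start_y: usize,
    filter: &[f32],
) {
    for (x, out) in dst.iter_mut().enumerate() {
        let mut sum = 0f32;
        for (j, &weight) in filter.iter().enumerate() {
            sum += weight * src[(start_y + j) * src_stride + x];
        }
        *out = sum;
    }
}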
i32).write_unaligned(_mm256_extract_epi32::<0>(_mm256_castps_si256(store_0))); } @@ -174,29 +168,19 @@ pub(crate) unsafe fn convolve_vertical_part_avx_f32( pub(crate) fn convolve_vertical_avx_row_f32( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight_ptr: &[f32], ) { unsafe { if FMA { convolve_vertical_avx_row_f32_fma::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } else { convolve_vertical_avx_row_f32_regular::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } } @@ -206,18 +190,13 @@ pub(crate) fn convolve_vertical_avx_row_f32( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight_ptr: &[f32], ) { convolve_vertical_avx_row_f32_impl::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } @@ -225,18 +204,13 @@ unsafe fn convolve_vertical_avx_row_f32_regular( unsafe fn convolve_vertical_avx_row_f32_fma( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight_ptr: &[f32], ) { convolve_vertical_avx_row_f32_impl::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } @@ -244,8 +218,8 @@ unsafe fn convolve_vertical_avx_row_f32_fma( unsafe fn convolve_vertical_avx_row_f32_impl( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight_ptr: &[f32], ) { @@ -256,9 +230,9 @@ unsafe fn convolve_vertical_avx_row_f32_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -267,17 +241,15 @@ unsafe fn convolve_vertical_avx_row_f32_impl( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - ); - } + convolve_vertical_part_avx_16_f32::( + bounds.start, + cx, + src, + src_stride, + dst, + weight_ptr, + bounds, + ); cx += 16; } @@ -286,9 +258,9 @@ unsafe fn convolve_vertical_avx_row_f32_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -300,9 +272,9 @@ unsafe fn convolve_vertical_avx_row_f32_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); diff --git a/src/color_group.rs b/src/color_group.rs index 942deb5..3c8982b 100644 --- a/src/color_group.rs +++ b/src/color_group.rs @@ -28,7 +28,7 @@ */ use crate::mlaf::mlaf; use crate::saturate_narrow::SaturateNarrow; -use num_traits::{AsPrimitive, FromPrimitive, MulAdd}; +use num_traits::{FromPrimitive, MulAdd}; use std::ops::{Add, AddAssign, Mul, Shr, ShrAssign, Sub, SubAssign}; #[repr(C)] @@ -70,72 +70,6 @@ where } } -impl ColorGroup -where - J: Copy + Default + 'static, -{ - #[inline(always)] - pub(crate) fn from_ptr(store: *const T, offset: usize) -> ColorGroup - where - T: AsPrimitive, - { - unsafe { - let l_ptr = 
store.add(offset); - if COMPS == 1 { - ColorGroup { - r: l_ptr.read_unaligned().as_(), - g: J::default(), - b: J::default(), - a: J::default(), - } - } else if COMPS == 2 { - ColorGroup { - r: l_ptr.read_unaligned().as_(), - g: l_ptr.add(1).read_unaligned().as_(), - b: J::default(), - a: J::default(), - } - } else if COMPS == 3 { - ColorGroup { - r: l_ptr.read_unaligned().as_(), - g: l_ptr.add(1).read_unaligned().as_(), - b: l_ptr.add(2).read_unaligned().as_(), - a: J::default(), - } - } else if COMPS == 4 { - ColorGroup { - r: l_ptr.read_unaligned().as_(), - g: l_ptr.add(1).read_unaligned().as_(), - b: l_ptr.add(2).read_unaligned().as_(), - a: l_ptr.add(3).read_unaligned().as_(), - } - } else { - unimplemented!("Not implemented.") - } - } - } - - #[inline(always)] - pub(crate) fn as_ptr(self, ptr: *mut V, offset: usize) - where - J: Copy + AsPrimitive, - { - unsafe { - let s_ptr = ptr.add(offset); - s_ptr.write_unaligned(self.r.as_()); - if COMPS > 1 { - s_ptr.add(1).write_unaligned(self.g.as_()); - } - if COMPS > 2 { - s_ptr.add(2).write_unaligned(self.b.as_()); - } - if COMPS == 4 { - s_ptr.add(3).write_unaligned(self.a.as_()); - } - } - } -} - impl Mul for ColorGroup where J: Copy + Mul + Default + 'static, diff --git a/src/convolve_naive_f32.rs b/src/convolve_naive_f32.rs index 4952b8e..9bfe8f1 100644 --- a/src/convolve_naive_f32.rs +++ b/src/convolve_naive_f32.rs @@ -26,214 +26,37 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use crate::color_group::ColorGroup; -use crate::filter_weights::{FilterBounds, FilterWeights}; -use num_traits::{AsPrimitive, MulAdd}; -use std::ops::{Add, Mul}; - -pub(crate) unsafe fn convolve_vertical_part_f32< - T: Copy + 'static + AsPrimitive, - I: Copy - + 'static - + AsPrimitive - + Default - + MulAdd - + Mul - + Add, - const CHANNELS: usize, ->( - start_y: usize, - start_x: usize, - src: *const T, - src_stride: usize, - dst: *mut T, - filter: &[f32], - bounds: &FilterBounds, -) where - f32: AsPrimitive, -{ - let mut sums0 = ColorGroup::::dup(I::default()); - - let v_start_px = start_x * CHANNELS; - - for j in 0..bounds.size { - let py = start_y + j; - let weight: I = filter.get_unchecked(j).as_(); - let src_ptr = src.add(src_stride * py); - - let new_px0 = ColorGroup::::from_ptr(src_ptr, v_start_px); - - sums0 = sums0.mul_add(new_px0, weight); - } - - sums0.as_ptr(dst, v_start_px); -} - -pub(crate) unsafe fn convolve_vertical_part_4_f32< - T: Copy + 'static + AsPrimitive, - I: Copy - + 'static - + AsPrimitive - + Default - + MulAdd - + Mul - + Add, - const CHANNELS: usize, ->( - start_y: usize, - start_x: usize, - src: *const T, - src_stride: usize, - dst: *mut T, - filter: &[f32], - bounds: &FilterBounds, -) where - f32: AsPrimitive, -{ - let mut sums0 = ColorGroup::::dup(I::default()); - let mut sums1 = ColorGroup::::dup(I::default()); - let mut sums2 = ColorGroup::::dup(I::default()); - let mut sums3 = ColorGroup::::dup(I::default()); - - let v_start_px = start_x * CHANNELS; - - for j in 0..bounds.size { - let py = start_y + j; - let weight: I = filter.get_unchecked(j).as_(); - let src_ptr = src.add(src_stride * py); - - let new_px0 = ColorGroup::::from_ptr(src_ptr, v_start_px); - let new_px1 = ColorGroup::::from_ptr(src_ptr, v_start_px + CHANNELS); - let new_px2 = ColorGroup::::from_ptr(src_ptr, v_start_px + CHANNELS * 2); - let new_px3 = ColorGroup::::from_ptr(src_ptr, v_start_px + CHANNELS * 3); - - sums0 = sums0.mul_add(new_px0, weight); 
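// Reviewer note — sketch, not part of the patch. ColorGroup::from_ptr/as_ptr and the
// pointer-based vertical parts in this hunk are deleted; a slice-based equivalent of the pixel
// load/store they provided would look roughly like the following. The type and method names are
// illustrative and are not the crate's actual API.
#[derive(Copy, Clone, Default)]
struct Rgba {
    r: f32,
    g: f32,
    b: f32,
    a: f32,
}

impl Rgba {
    #[inline(always)]
    fn from_slice(store: &[f32], offset: usize) -> Self {
        // One interleaved RGBA pixel; the caller guarantees offset + 4 <= store.len().
        Rgba {
            r: store[offset],
            g: store[offset + 1],
            b: store[offset + 2],
            a: store[offset + 3],
        }
    }

    #[inline(always)]
    fn write_to(self, store: &mut [f32], offset: usize) {
        store[offset] = self.r;
        store[offset + 1] = self.g;
        store[offset + 2] = self.b;
        store[offset + 3] = self.a;
    }
}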
- sums1 = sums1.mul_add(new_px1, weight); - sums2 = sums2.mul_add(new_px2, weight); - sums3 = sums3.mul_add(new_px3, weight); - } - - sums0.as_ptr(dst, v_start_px); - sums0.as_ptr(dst, v_start_px + CHANNELS); - sums0.as_ptr(dst, v_start_px + CHANNELS * 2); - sums0.as_ptr(dst, v_start_px + CHANNELS * 3); -} +use crate::filter_weights::FilterWeights; +use crate::floating_point_horizontal::{ + convolve_row_handler_floating_point, convolve_row_handler_floating_point_4, +}; #[inline] -pub(crate) fn convolve_horizontal_rgb_native_row< - T: Copy + 'static + AsPrimitive, - I: Copy - + 'static - + Default - + MulAdd - + AsPrimitive - + Mul - + Add, - const CHANNELS: usize, ->( - dst_width: usize, +pub(crate) fn convolve_horizontal_rgb_native_row( + _: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const T, - unsafe_destination_ptr_0: *mut T, -) where - f32: AsPrimitive + AsPrimitive, -{ - unsafe { - let weights_ptr = &filter_weights.weights; - let mut filter_offset = 0usize; - - for x in 0..dst_width { - let mut sums = ColorGroup::::dup(0f32.as_()); - - let bounds = filter_weights.bounds.get_unchecked(x); - let start_x = bounds.start; - for j in 0..bounds.size { - let px = (start_x + j) * CHANNELS; - let weight = *weights_ptr.get_unchecked(j + filter_offset); - - let new_px = ColorGroup::::from_ptr(unsafe_source_ptr_0, px); - - sums = sums.mul_add(new_px, weight.as_()); - } - - let px = x * CHANNELS; - - sums.as_ptr(unsafe_destination_ptr_0, px); - - filter_offset += filter_weights.aligned_size; - } - } + src: &[f32], + dst: &mut [f32], +) { + convolve_row_handler_floating_point::(src, dst, filter_weights, 8) } -pub(crate) fn convolve_horizontal_rgba_4_row_f32< - T: Copy + 'static + AsPrimitive, - I: Copy - + 'static - + Default - + MulAdd - + AsPrimitive - + Mul - + Add, - const CHANNELS: usize, ->( - dst_width: usize, +pub(crate) fn convolve_horizontal_rgba_4_row_f32( + _: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const T, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut T, + dst: &mut [f32], dst_stride: usize, -) where - f32: AsPrimitive + AsPrimitive, -{ - unsafe { - let mut filter_offset = 0usize; - let weights = &filter_weights.weights; - - let src_row0 = unsafe_source_ptr_0; - let src_row1 = unsafe_source_ptr_0.add(src_stride); - let src_row2 = unsafe_source_ptr_0.add(src_stride * 2); - let src_row3 = unsafe_source_ptr_0.add(src_stride * 3); - - let dst_row0 = unsafe_destination_ptr_0; - let dst_row1 = unsafe_destination_ptr_0.add(dst_stride); - let dst_row2 = unsafe_destination_ptr_0.add(dst_stride * 2); - let dst_row3 = unsafe_destination_ptr_0.add(dst_stride * 3); - - for x in 0..dst_width { - let mut sums0 = ColorGroup::::dup(0f32.as_()); - let mut sums1 = ColorGroup::::dup(0f32.as_()); - let mut sums2 = ColorGroup::::dup(0f32.as_()); - let mut sums3 = ColorGroup::::dup(0f32.as_()); - - let bounds = filter_weights.bounds.get_unchecked(x); - let start_x = bounds.start; - for j in 0..bounds.size { - let px = (start_x + j) * CHANNELS; - let weight = *weights.get_unchecked(j + filter_offset); - - let new_px0 = ColorGroup::::from_ptr(src_row0, px); - sums0 = sums0.mul_add(new_px0, weight.as_()); - - let new_px1 = ColorGroup::::from_ptr(src_row1, px); - sums1 = sums1.mul_add(new_px1, weight.as_()); - - let new_px2 = ColorGroup::::from_ptr(src_row2, px); - sums2 = sums2.mul_add(new_px2, weight.as_()); - - let new_px3 = ColorGroup::::from_ptr(src_row3, px); - sums3 = sums3.mul_add(new_px3, weight.as_()); - } - - let px = 
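// Reviewer note — sketch, not part of the patch. The generic pointer-based rows in this hunk
// are replaced by the shared handlers in floating_point_horizontal; per output pixel they
// perform the sum sketched here in scalar form. FilterWeights is flattened into plain slices
// for the example, so the parameter names are illustrative.
fn convolve_row_scalar<const CHANNELS: usize>(
    src: &[f32],
    dst: &mut [f32],
    bounds_start: &[usize],
    bounds_size: &[usize],
    weights: &[f32],
    aligned_size: usize,
) {
    for (x, out_px) in dst.chunks_exact_mut(CHANNELS).enumerate() {
        let mut sums = [0f32; CHANNELS];
        let start_x = bounds_start[x];
        for j in 0..bounds_size[x] {
            let weight = weights[x * aligned_size + j];
            let px = (start_x + j) * CHANNELS;
            for (c, sum) in sums.iter_mut().enumerate() {
                *sum += weight * src[px + c];
            }
        }
        out_px.copy_from_slice(&sums);
    }
}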
x * CHANNELS; - - sums0.as_ptr(dst_row0, px); - sums1.as_ptr(dst_row1, px); - sums2.as_ptr(dst_row2, px); - sums3.as_ptr(dst_row3, px); - - filter_offset += filter_weights.aligned_size; - } - } +) { + convolve_row_handler_floating_point_4::( + src, + src_stride, + dst, + dst_stride, + filter_weights, + 8, + ) } diff --git a/src/dispatch_group_f32.rs b/src/dispatch_group_f32.rs index ce639d4..f4b87e9 100644 --- a/src/dispatch_group_f32.rs +++ b/src/dispatch_group_f32.rs @@ -28,70 +28,51 @@ */ use crate::filter_weights::{FilterBounds, FilterWeights}; -use crate::unsafe_slice::UnsafeSlice; use crate::ImageStore; +use rayon::iter::{IndexedParallelIterator, ParallelIterator}; +use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; -use std::sync::Arc; pub(crate) fn convolve_vertical_dispatch_f32( image_store: &ImageStore, filter_weights: FilterWeights, destination: &mut ImageStore, pool: &Option, - dispatcher: fn(usize, &FilterBounds, *const f32, *mut f32, usize, &[f32]), + dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]), ) { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let src_stride = image_store.width * image_store.channels; - - let mut filter_offset = 0usize; - let dst_stride = destination.width * image_store.channels; + let dst_width = destination.width; if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let bounds = unsafe { weights.bounds.get_unchecked(y) }; - let weight_ptr = - unsafe { weights.weights.get_unchecked((weights.aligned_size * y)..) }; - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - dispatcher( - dst_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); + pool.install(|| { + destination + .buffer + .borrow_mut() + .par_chunks_exact_mut(dst_stride) + .enumerate() + .for_each(|(y, row)| { + let bounds = filter_weights.bounds[y]; + let filter_offset = y * filter_weights.aligned_size; + let weights = &filter_weights.weights[filter_offset..]; + let source_buffer = image_store.buffer.borrow(); + dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); }); - } }); } else { - for y in 0..destination.height { - let bounds = unsafe { filter_weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { filter_weights.weights.get_unchecked(filter_offset..) 
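// Reviewer note — sketch, not part of the patch. The UnsafeSlice + per-row scope spawning is
// replaced by splitting the destination into disjoint row chunks that rayon hands out safely;
// in the patch this runs under pool.install(..) so it uses the scaler's own thread pool.
// A minimal standalone version of that shape, with the dispatcher signature simplified:
use rayon::prelude::*;

fn convolve_rows_parallel(
    dst: &mut [f32],
    dst_stride: usize,
    weights: &[f32],
    aligned_size: usize,
    dispatcher: fn(&mut [f32], &[f32]),
) {
    dst.par_chunks_exact_mut(dst_stride)
        .enumerate()
        .for_each(|(y, row)| {
            // Each closure owns a disjoint &mut row, so no shared-mutability wrapper is needed.
            let row_weights = &weights[y * aligned_size..];
            dispatcher(row, row_weights);
        });
}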
}; - - dispatcher( - dst_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - - filter_offset += filter_weights.aligned_size; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } + destination + .buffer + .borrow_mut() + .chunks_exact_mut(dst_stride) + .enumerate() + .for_each(|(y, row)| { + let bounds = filter_weights.bounds[y]; + let filter_offset = y * filter_weights.aligned_size; + let weights = &filter_weights.weights[filter_offset..]; + let source_buffer = image_store.buffer.borrow(); + dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); + }); } } @@ -102,95 +83,120 @@ pub(crate) fn convolve_horizontal_dispatch_f32( destination: &mut ImageStore, pool: &Option, dispatcher_4_rows: Option< - fn(usize, usize, &FilterWeights, *const f32, usize, *mut f32, usize), + fn(usize, usize, &FilterWeights, &[f32], usize, &mut [f32], usize), >, - dispatcher_row: fn(usize, usize, &FilterWeights, *const f32, *mut f32), + dispatcher_row: fn(usize, usize, &FilterWeights, &[f32], &mut [f32]), ) { - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let src_stride = image_store.width * image_store.channels; let dst_stride = destination.width * image_store.channels; let dst_width = destination.width; let src_width = image_store.width; if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - let mut yy = 0usize; + pool.install(|| { + let mut processed_4 = false; + if let Some(dispatcher) = dispatcher_4_rows { - for y in (0..destination.height.saturating_sub(4)).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; + image_store + .buffer + .borrow() + .par_chunks_exact(src_stride * 4) + .zip( + destination + .buffer + .borrow_mut() + .par_chunks_exact_mut(dst_stride * 4), + ) + .for_each(|(src, dst)| { dispatcher( dst_width, src_width, - &weights, - unsafe_source_ptr_0, + &filter_weights, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); }); - yy = y; - } + processed_4 = true; } - for y in yy..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - dispatcher_row( - dst_width, - src_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); + + let left_src_rows = if processed_4 { + image_store + .buffer + .borrow() + .chunks_exact(src_stride * 4) + .remainder() + } else { + image_store.buffer.borrow() + }; + let left_dst_rows = if processed_4 { + destination + .buffer + .borrow_mut() + .chunks_exact_mut(dst_stride * 4) + .into_remainder() + } else { + destination.buffer.borrow_mut() + }; + + left_src_rows + .par_chunks_exact(src_stride) + .zip(left_dst_rows.par_chunks_exact_mut(dst_stride)) + .for_each(|(src, dst)| { + dispatcher_row(dst_width, src_width, &filter_weights, src, dst); }); - } }); } else { - let mut yy = 0usize; - + let mut processed_4 = false; if let 
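// Reviewer note — sketch, not part of the patch. The horizontal path processes the image in
// blocks of four rows and then finishes leftover rows individually; chunks_exact keeps the
// block loop and its remainder explicitly separated. Standalone shape of that control flow,
// with hypothetical kernel names:
fn process_rows(src: &[f32], dst: &mut [f32], src_stride: usize, dst_stride: usize) {
    // Main loop: four source rows paired with four destination rows.
    for (src4, dst4) in src
        .chunks_exact(src_stride * 4)
        .zip(dst.chunks_exact_mut(dst_stride * 4))
    {
        process_four_rows(src4, dst4);
    }
    // Remainder: rows that did not fill a complete 4-row block.
    let src_rem = src.chunks_exact(src_stride * 4).remainder();
    let dst_rem = dst.chunks_exact_mut(dst_stride * 4).into_remainder();
    for (src_row, dst_row) in src_rem
        .chunks_exact(src_stride)
        .zip(dst_rem.chunks_exact_mut(dst_stride))
    {
        process_one_row(src_row, dst_row);
    }
}

fn process_four_rows(_src4: &[f32], _dst4: &mut [f32]) { /* 4-row kernel (hypothetical) */ }
fn process_one_row(_src: &[f32], _dst: &mut [f32]) { /* single-row kernel (hypothetical) */ }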
Some(dispatcher) = dispatcher_4_rows { - while yy + 4 < destination.height { + for (src, dst) in image_store + .buffer + .borrow() + .chunks_exact(src_stride * 4) + .zip( + destination + .buffer + .borrow_mut() + .chunks_exact_mut(dst_stride * 4), + ) + { dispatcher( dst_width, src_width, &filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; - - yy += 4; } + processed_4 = true; } - for _ in yy..destination.height { - dispatcher_row( - dst_width, - src_width, - &filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; + let left_src_rows = if processed_4 { + image_store + .buffer + .borrow() + .chunks_exact(src_stride * 4) + .remainder() + } else { + image_store.buffer.borrow() + }; + let left_dst_rows = if processed_4 { + destination + .buffer + .borrow_mut() + .chunks_exact_mut(dst_stride * 4) + .into_remainder() + } else { + destination.buffer.borrow_mut() + }; + for (src, dst) in left_src_rows + .chunks_exact(src_stride) + .zip(left_dst_rows.chunks_exact_mut(dst_stride)) + { + dispatcher_row(dst_width, src_width, &filter_weights, src, dst); } } } diff --git a/src/lib.rs b/src/lib.rs index de47fb5..290b8d4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -88,7 +88,6 @@ mod scaler_f16; mod sse; mod support; mod threading_policy; -mod unsafe_slice; #[cfg(all(target_arch = "wasm32", target_feature = "simd128",))] mod wasm32; diff --git a/src/neon/plane_f32.rs b/src/neon/plane_f32.rs index 3db9629..cb8c65f 100644 --- a/src/neon/plane_f32.rs +++ b/src/neon/plane_f32.rs @@ -91,8 +91,8 @@ pub(crate) fn convolve_horizontal_plane_neon_row_one( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { unsafe { let mut filter_offset = 0usize; @@ -109,7 +109,7 @@ pub(crate) fn convolve_horizontal_plane_neon_row_one( let read_weights = xvld1q_f32_x4(ptr); store = conv_horiz_plane_16_f32!( bounds_start, - unsafe_source_ptr_0, + src.as_ptr(), read_weights, store ); @@ -122,7 +122,7 @@ pub(crate) fn convolve_horizontal_plane_neon_row_one( let read_weights = xvld1q_f32_x2(ptr); store = conv_horiz_plane_8_f32!( bounds_start, - unsafe_source_ptr_0, + src.as_ptr(), read_weights.0, read_weights.1, store @@ -135,7 +135,7 @@ pub(crate) fn convolve_horizontal_plane_neon_row_one( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); store = - conv_horiz_plane_4_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store); + conv_horiz_plane_4_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 4; } @@ -144,7 +144,7 @@ pub(crate) fn convolve_horizontal_plane_neon_row_one( let ptr = weights_ptr.add(jx + filter_offset); let weights0 = vld1_f32(ptr); let weights = vcombine_f32(weights0, vdup_n_f32(0.)); - store = conv_horiz_plane_2_f32!(bounds_start, unsafe_source_ptr_0, weights, store); + store = conv_horiz_plane_2_f32!(bounds_start, src.as_ptr(), weights, store); jx += 2; } @@ -152,12 +152,12 @@ pub(crate) fn convolve_horizontal_plane_neon_row_one( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); - store = 
conv_horiz_plane_1_f32!(bounds_start, unsafe_source_ptr_0, weight0, store); + store = conv_horiz_plane_1_f32!(bounds_start, src.as_ptr(), weight0, store); jx += 1; } let px = x; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); dest_ptr.write_unaligned(vaddvq_f32(store)); filter_offset += filter_weights.aligned_size; @@ -169,9 +169,9 @@ pub(crate) fn convolve_horizontal_plane_neon_rows_4( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { unsafe { @@ -191,17 +191,13 @@ pub(crate) fn convolve_horizontal_plane_neon_rows_4( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = xvld1q_f32_x4(ptr); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_16_f32!( - bounds_start, - unsafe_source_ptr_0, - read_weights, - store_0 - ); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = + conv_horiz_plane_16_f32!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_plane_16_f32!(bounds_start, s_ptr_1, read_weights, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_plane_16_f32!(bounds_start, s_ptr2, read_weights, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_plane_16_f32!(bounds_start, s_ptr3, read_weights, store_3); jx += 16; } @@ -212,12 +208,12 @@ pub(crate) fn convolve_horizontal_plane_neon_rows_4( let bounds_start = bounds.start + jx; store_0 = conv_horiz_plane_8_f32!( bounds_start, - unsafe_source_ptr_0, + src.as_ptr(), read_weights.0, read_weights.1, store_0 ); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_plane_8_f32!( bounds_start, s_ptr_1, @@ -225,7 +221,7 @@ pub(crate) fn convolve_horizontal_plane_neon_rows_4( read_weights.1, store_1 ); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_plane_8_f32!( bounds_start, s_ptr2, @@ -233,7 +229,7 @@ pub(crate) fn convolve_horizontal_plane_neon_rows_4( read_weights.1, store_2 ); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_plane_8_f32!( bounds_start, s_ptr3, @@ -248,17 +244,13 @@ pub(crate) fn convolve_horizontal_plane_neon_rows_4( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_4_f32!( - bounds_start, - unsafe_source_ptr_0, - read_weights, - store_0 - ); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = + conv_horiz_plane_4_f32!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_plane_4_f32!(bounds_start, s_ptr_1, read_weights, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_plane_4_f32!(bounds_start, s_ptr2, read_weights, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = 
conv_horiz_plane_4_f32!(bounds_start, s_ptr3, read_weights, store_3); jx += 4; } @@ -268,13 +260,12 @@ pub(crate) fn convolve_horizontal_plane_neon_rows_4( let weights0 = vld1_f32(ptr); let weights = vcombine_f32(weights0, vdup_n_f32(0.)); let bounds_start = bounds.start + jx; - store_0 = - conv_horiz_plane_2_f32!(bounds_start, unsafe_source_ptr_0, weights, store_0); - let ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_plane_2_f32!(bounds_start, src.as_ptr(), weights, store_0); + let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_plane_2_f32!(bounds_start, ptr_1, weights, store_1); - let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_plane_2_f32!(bounds_start, ptr_2, weights, store_2); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_plane_2_f32!(bounds_start, ptr_3, weights, store_3); jx += 2; } @@ -283,28 +274,27 @@ pub(crate) fn convolve_horizontal_plane_neon_rows_4( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); let bounds_start = bounds.start + jx; - store_0 = - conv_horiz_plane_1_f32!(bounds_start, unsafe_source_ptr_0, weight0, store_0); - let ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_plane_1_f32!(bounds_start, src.as_ptr(), weight0, store_0); + let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_plane_1_f32!(bounds_start, ptr_1, weight0, store_1); - let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_plane_1_f32!(bounds_start, ptr_2, weight0, store_2); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_plane_1_f32!(bounds_start, ptr_3, weight0, store_3); jx += 1; } let px = x; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); dest_ptr.write_unaligned(vaddvq_f32(store_0)); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); dest_ptr.write_unaligned(vaddvq_f32(store_1)); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); dest_ptr.write_unaligned(vaddvq_f32(store_2)); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); dest_ptr.write_unaligned(vaddvq_f32(store_3)); filter_offset += filter_weights.aligned_size; diff --git a/src/neon/rgb_f32.rs b/src/neon/rgb_f32.rs index 473f745..f1a5325 100644 --- a/src/neon/rgb_f32.rs +++ b/src/neon/rgb_f32.rs @@ -113,9 +113,9 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { unsafe { @@ -138,13 +138,12 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f32( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - store_0 = - conv_horiz_4_rgb_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store_0); - let s_ptr1 = unsafe_source_ptr_0.add(src_stride); + store_0 = 
conv_horiz_4_rgb_f32!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_4_rgb_f32!(bounds_start, s_ptr1, read_weights, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_4_rgb_f32!(bounds_start, s_ptr2, read_weights, store_2); - let s_ptr = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_4_rgb_f32!(bounds_start, s_ptr, read_weights, store_3); jx += 4; } @@ -153,13 +152,12 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f32( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - store_0 = - conv_horiz_2_rgb_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_2_rgb_f32!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_2_rgb_f32!(bounds_start, s_ptr_1, read_weights, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_2_rgb_f32!(bounds_start, s_ptr2, read_weights, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_2_rgb_f32!(bounds_start, s_ptr3, read_weights, store_3); jx += 2; } @@ -168,28 +166,27 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f32( let ptr = weights_ptr.add(jx + filter_offset); let bounds_start = bounds.start + jx; let weight0 = vld1q_dup_f32(ptr); - store_0 = - conv_horiz_1_rgb_f32!(bounds_start, unsafe_source_ptr_0, weight0, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_1_rgb_f32!(bounds_start, src.as_ptr(), weight0, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_1_rgb_f32!(bounds_start, s_ptr_1, weight0, store_1); - let s_ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_1_rgb_f32!(bounds_start, s_ptr_2, weight0, store_2); - let s_ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_1_rgb_f32!(bounds_start, s_ptr_3, weight0, store_3); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); write_rgb_f32!(store_0, dest_ptr); - let dest_ptr_1 = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr_1 = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); write_rgb_f32!(store_1, dest_ptr_1); - let dest_ptr_2 = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr_2 = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); write_rgb_f32!(store_2, dest_ptr_2); - let dest_ptr_3 = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr_3 = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); write_rgb_f32!(store_3, dest_ptr_3); filter_offset += filter_weights.aligned_size; @@ -201,8 +198,8 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { unsafe { const 
CHANNELS: usize = 3; @@ -218,8 +215,7 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one_f32( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - store = - conv_horiz_4_rgb_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store); + store = conv_horiz_4_rgb_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 4; } @@ -227,8 +223,7 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one_f32( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - store = - conv_horiz_2_rgb_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store); + store = conv_horiz_2_rgb_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 2; } @@ -236,12 +231,12 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one_f32( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); let bounds_start = bounds.start + jx; - store = conv_horiz_1_rgb_f32!(bounds_start, unsafe_source_ptr_0, weight0, store); + store = conv_horiz_1_rgb_f32!(bounds_start, src.as_ptr(), weight0, store); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); write_rgb_f32!(store, dest_ptr); filter_offset += filter_weights.aligned_size; diff --git a/src/neon/rgba_f32.rs b/src/neon/rgba_f32.rs index 3db6b0e..1e44dd3 100644 --- a/src/neon/rgba_f32.rs +++ b/src/neon/rgba_f32.rs @@ -94,8 +94,8 @@ pub(crate) fn convolve_horizontal_rgba_neon_row_one( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { unsafe { const CHANNELS: usize = 4; @@ -112,7 +112,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row_one( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); store = - conv_horiz_rgba_4_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store); + conv_horiz_rgba_4_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 4; } @@ -121,7 +121,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row_one( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); store = - conv_horiz_rgba_2_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store); + conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 2; } @@ -129,12 +129,12 @@ pub(crate) fn convolve_horizontal_rgba_neon_row_one( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); - store = conv_horiz_rgba_1_f32!(bounds_start, unsafe_source_ptr_0, weight0, store); + store = conv_horiz_rgba_1_f32!(bounds_start, src.as_ptr(), weight0, store); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); vst1q_f32(dest_ptr, store); filter_offset += filter_weights.aligned_size; @@ -146,9 +146,9 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { unsafe { @@ -171,12 +171,12 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4( let bounds_start = bounds.start + jx; store_0 = conv_horiz_rgba_8_f32!( bounds_start, - unsafe_source_ptr_0, + src.as_ptr(), read_weights.0, read_weights.1, store_0 ); - let 
s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_rgba_8_f32!( bounds_start, s_ptr_1, @@ -184,7 +184,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4( read_weights.1, store_1 ); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_rgba_8_f32!( bounds_start, s_ptr2, @@ -192,7 +192,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4( read_weights.1, store_2 ); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_rgba_8_f32!( bounds_start, s_ptr3, @@ -207,17 +207,12 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_4_f32!( - bounds_start, - unsafe_source_ptr_0, - read_weights, - store_0 - ); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_rgba_4_f32!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_rgba_4_f32!(bounds_start, s_ptr_1, read_weights, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_rgba_4_f32!(bounds_start, s_ptr2, read_weights, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_rgba_4_f32!(bounds_start, s_ptr3, read_weights, store_3); jx += 4; } @@ -226,17 +221,12 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_2_f32!( - bounds_start, - unsafe_source_ptr_0, - read_weights, - store_0 - ); - let ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store_0); + let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_rgba_2_f32!(bounds_start, ptr_1, read_weights, store_1); - let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_rgba_2_f32!(bounds_start, ptr_2, read_weights, store_2); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_rgba_2_f32!(bounds_start, ptr_3, read_weights, store_3); jx += 2; } @@ -245,28 +235,27 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); let bounds_start = bounds.start + jx; - store_0 = - conv_horiz_rgba_1_f32!(bounds_start, unsafe_source_ptr_0, weight0, store_0); - let ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_rgba_1_f32!(bounds_start, src.as_ptr(), weight0, store_0); + let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_rgba_1_f32!(bounds_start, ptr_1, weight0, store_1); - let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_rgba_1_f32!(bounds_start, ptr_2, weight0, store_2); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_rgba_1_f32!(bounds_start, 
ptr_3, weight0, store_3); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); vst1q_f32(dest_ptr, store_0); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); vst1q_f32(dest_ptr, store_1); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); vst1q_f32(dest_ptr, store_2); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); vst1q_f32(dest_ptr, store_3); filter_offset += filter_weights.aligned_size; diff --git a/src/neon/vertical_f32.rs b/src/neon/vertical_f32.rs index dbb1b6d..454892c 100644 --- a/src/neon/vertical_f32.rs +++ b/src/neon/vertical_f32.rs @@ -44,10 +44,9 @@ macro_rules! conv_vertical_part_neon_16_f32 { for j in 0..$bounds.size { let py = $start_y + j; let v_weight = vld1q_dup_f32($filter.get_unchecked(j..).as_ptr()); - let src_ptr = $src.add($src_stride * py); + let src_ptr = $src.get_unchecked($src_stride * py + px..); - let s_ptr = src_ptr.add(px); - let item_row = xvld1q_f32_x4(s_ptr); + let item_row = xvld1q_f32_x4(src_ptr.as_ptr()); store_0 = prefer_vfmaq_f32(store_0, item_row.0, v_weight); store_1 = prefer_vfmaq_f32(store_1, item_row.1, v_weight); @@ -55,7 +54,7 @@ macro_rules! conv_vertical_part_neon_16_f32 { store_3 = prefer_vfmaq_f32(store_3, item_row.3, v_weight); } - let dst_ptr = $dst.add(px); + let dst_ptr = $dst.get_unchecked_mut(px..).as_mut_ptr(); let f_set = float32x4x4_t(store_0, store_1, store_2, store_3); vst1q_f32_x4(dst_ptr, f_set); } @@ -79,11 +78,10 @@ macro_rules! conv_vertical_part_neon_32_f32 { for j in 0..$bounds.size { let py = $start_y + j; let v_weight = vld1q_dup_f32($filter.get_unchecked(j..).as_ptr()); - let src_ptr = $src.add($src_stride * py); + let src_ptr = $src.get_unchecked($src_stride * py + px..).as_ptr(); - let s_ptr = src_ptr.add(px); - let item_row_0 = xvld1q_f32_x4(s_ptr); - let item_row_1 = xvld1q_f32_x4(s_ptr.add(16)); + let item_row_0 = xvld1q_f32_x4(src_ptr); + let item_row_1 = xvld1q_f32_x4(src_ptr.add(16)); store_0 = prefer_vfmaq_f32(store_0, item_row_0.0, v_weight); store_1 = prefer_vfmaq_f32(store_1, item_row_0.1, v_weight); @@ -96,7 +94,7 @@ macro_rules! conv_vertical_part_neon_32_f32 { store_7 = prefer_vfmaq_f32(store_7, item_row_1.3, v_weight); } - let dst_ptr = $dst.add(px); + let dst_ptr = $dst.get_unchecked_mut(px..).as_mut_ptr(); let f_set = float32x4x4_t(store_0, store_1, store_2, store_3); vst1q_f32_x4(dst_ptr, f_set); @@ -129,12 +127,11 @@ macro_rules! conv_vertical_part_neon_48_f32 { for j in 0..$bounds.size { let py = $start_y + j; let v_weight = vld1q_dup_f32($filter.get_unchecked(j..).as_ptr()); - let src_ptr = $src.add($src_stride * py); + let src_ptr = $src.get_unchecked($src_stride * py + px..).as_ptr(); - let s_ptr = src_ptr.add(px); - let item_row_0 = xvld1q_f32_x4(s_ptr); - let item_row_1 = xvld1q_f32_x4(s_ptr.add(16)); - let item_row_2 = xvld1q_f32_x4(s_ptr.add(32)); + let item_row_0 = xvld1q_f32_x4(src_ptr); + let item_row_1 = xvld1q_f32_x4(src_ptr.add(16)); + let item_row_2 = xvld1q_f32_x4(src_ptr.add(32)); store_0 = prefer_vfmaq_f32(store_0, item_row_0.0, v_weight); store_1 = prefer_vfmaq_f32(store_1, item_row_0.1, v_weight); @@ -152,7 +149,7 @@ macro_rules! 
conv_vertical_part_neon_48_f32 { store_11 = prefer_vfmaq_f32(store_11, item_row_2.3, v_weight); } - let dst_ptr = $dst.add(px); + let dst_ptr = $dst.get_unchecked_mut(px..).as_mut_ptr(); let f_set = float32x4x4_t(store_0, store_1, store_2, store_3); vst1q_f32_x4(dst_ptr, f_set); @@ -169,9 +166,9 @@ macro_rules! conv_vertical_part_neon_48_f32 { unsafe fn convolve_vertical_part_neon_8_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -184,10 +181,8 @@ unsafe fn convolve_vertical_part_neon_8_f32( let py = start_y + j; let weight = filter.get_unchecked(j..); let v_weight = vld1q_dup_f32(weight.as_ptr()); - let src_ptr = src.add(src_stride * py); - - let s_ptr = src_ptr.add(px); - let item_row = xvld1q_f32_x2(s_ptr); + let src_ptr = src.get_unchecked(src_stride * py + px..); + let item_row = xvld1q_f32_x2(src_ptr.as_ptr()); store_0 = prefer_vfmaq_f32(store_0, item_row.0, v_weight); store_1 = prefer_vfmaq_f32(store_1, item_row.1, v_weight); @@ -195,7 +190,7 @@ unsafe fn convolve_vertical_part_neon_8_f32( let item = float32x4x2_t(store_0, store_1); - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); vst1q_f32_x2(dst_ptr, item); } @@ -203,9 +198,9 @@ unsafe fn convolve_vertical_part_neon_8_f32( unsafe fn convolve_vertical_part_neon_4_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -217,15 +212,14 @@ unsafe fn convolve_vertical_part_neon_4_f32( let py = start_y + j; let weight = filter.get_unchecked(j..); let v_weight = vld1q_dup_f32(weight.as_ptr()); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py + px..); - let s_ptr = src_ptr.add(px); - let item_row = xvld1q_f32_x2(s_ptr); + let item_row = xvld1q_f32_x2(src_ptr.as_ptr()); store_0 = prefer_vfmaq_f32(store_0, item_row.0, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); vst1q_f32(dst_ptr, store_0); } @@ -233,9 +227,9 @@ unsafe fn convolve_vertical_part_neon_4_f32( unsafe fn convolve_vertical_part_neon_1_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -247,23 +241,21 @@ unsafe fn convolve_vertical_part_neon_1_f32( let py = start_y + j; let weight = filter.get_unchecked(j..); let v_weight = vld1q_dup_f32(weight.as_ptr()); - let src_ptr = src.add(src_stride * py); - - let s_ptr = src_ptr.add(px); - let item_row = vld1q_dup_f32(s_ptr); + let src_ptr = src.get_unchecked(src_stride * py + px..); + let item_row = vld1q_dup_f32(src_ptr.as_ptr()); store_0 = prefer_vfmaq_f32(store_0, item_row, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); dst_ptr.write_unaligned(vgetq_lane_f32::<0>(store_0)); } pub(crate) fn convolve_vertical_rgb_neon_row_f32( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight_ptr: &[f32], ) { @@ -271,43 +263,19 @@ pub(crate) fn convolve_vertical_rgb_neon_row_f32( let dst_width = width * CHANNELS; while cx + 48 < dst_width { - conv_vertical_part_neon_48_f32!( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds - ); + 
conv_vertical_part_neon_48_f32!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 48; } while cx + 32 < dst_width { - conv_vertical_part_neon_32_f32!( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds - ); + conv_vertical_part_neon_32_f32!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 32; } while cx + 16 < dst_width { - conv_vertical_part_neon_16_f32!( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds - ); + conv_vertical_part_neon_16_f32!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 16; } @@ -317,9 +285,9 @@ pub(crate) fn convolve_vertical_rgb_neon_row_f32( convolve_vertical_part_neon_8_f32( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -333,9 +301,9 @@ pub(crate) fn convolve_vertical_rgb_neon_row_f32( convolve_vertical_part_neon_4_f32( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -349,9 +317,9 @@ pub(crate) fn convolve_vertical_rgb_neon_row_f32( convolve_vertical_part_neon_1_f32( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); diff --git a/src/plane_f32.rs b/src/plane_f32.rs index 742d024..61f533a 100644 --- a/src/plane_f32.rs +++ b/src/plane_f32.rs @@ -58,10 +58,10 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 1> { pool: &Option, ) { let mut _dispatcher_4_rows: Option< - fn(usize, usize, &FilterWeights, *const f32, usize, *mut f32, usize), - > = Some(convolve_horizontal_rgba_4_row_f32::); - let mut _dispatcher_row: fn(usize, usize, &FilterWeights, *const f32, *mut f32) = - convolve_horizontal_rgb_native_row::; + fn(usize, usize, &FilterWeights, &[f32], usize, &mut [f32], usize), + > = Some(convolve_horizontal_rgba_4_row_f32::<1>); + let mut _dispatcher_row: fn(usize, usize, &FilterWeights, &[f32], &mut [f32]) = + convolve_horizontal_rgb_native_row::<1>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { _dispatcher_4_rows = Some(convolve_horizontal_plane_neon_rows_4); @@ -96,8 +96,8 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 1> { destination: &mut ImageStore, pool: &Option, ) { - let mut _dispatcher: fn(usize, &FilterBounds, *const f32, *mut f32, usize, &[f32]) = - convolve_vertical_rgb_native_row_f32::; + let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = + convolve_vertical_rgb_native_row_f32::<1>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { _dispatcher = convolve_vertical_rgb_neon_row_f32::<1>; diff --git a/src/rgb_f32.rs b/src/rgb_f32.rs index f06005d..357d8ec 100644 --- a/src/rgb_f32.rs +++ b/src/rgb_f32.rs @@ -32,60 +32,23 @@ use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::convolve_naive_f32::*; use crate::dispatch_group_f32::{convolve_horizontal_dispatch_f32, convolve_vertical_dispatch_f32}; use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::floating_point_vertical::column_handler_floating_point; use crate::image_store::ImageStore; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::neon::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::sse::*; -use num_traits::AsPrimitive; use rayon::ThreadPool; -pub(crate) fn convolve_vertical_rgb_native_row_f32< - T: Copy + 'static + AsPrimitive, - const 
COMPONENTS: usize, ->( - dst_width: usize, +pub(crate) fn convolve_vertical_rgb_native_row_f32( + _: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const T, - unsafe_destination_ptr_0: *mut T, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight: &[f32], -) where - f32: AsPrimitive, -{ - let mut cx = 0usize; - - while cx + 4 < dst_width { - unsafe { - convolve_vertical_part_4_f32::( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight, - bounds, - ); - } - - cx += 4; - } - - while cx < dst_width { - unsafe { - convolve_vertical_part_f32::( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight, - bounds, - ); - } - - cx += 1; - } +) { + column_handler_floating_point::(bounds, src, dst, src_stride, weight, 8); } impl HorizontalConvolutionPass for ImageStore<'_, f32, 3> { @@ -97,10 +60,10 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 3> { pool: &Option, ) { let mut _dispatcher_4_rows: Option< - fn(usize, usize, &FilterWeights, *const f32, usize, *mut f32, usize), - > = Some(convolve_horizontal_rgba_4_row_f32::); - let mut _dispatcher_row: fn(usize, usize, &FilterWeights, *const f32, *mut f32) = - convolve_horizontal_rgb_native_row::; + fn(usize, usize, &FilterWeights, &[f32], usize, &mut [f32], usize), + > = Some(convolve_horizontal_rgba_4_row_f32::<3>); + let mut _dispatcher_row: fn(usize, usize, &FilterWeights, &[f32], &mut [f32]) = + convolve_horizontal_rgb_native_row::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { _dispatcher_4_rows = Some(convolve_horizontal_rgb_neon_rows_4_f32); @@ -135,8 +98,8 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 3> { destination: &mut ImageStore, pool: &Option, ) { - let mut _dispatcher: fn(usize, &FilterBounds, *const f32, *mut f32, usize, &[f32]) = - convolve_vertical_rgb_native_row_f32::; + let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = + convolve_vertical_rgb_native_row_f32::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { _dispatcher = convolve_vertical_rgb_neon_row_f32::<3>; diff --git a/src/rgba_f32.rs b/src/rgba_f32.rs index 09fed44..f2081a4 100644 --- a/src/rgba_f32.rs +++ b/src/rgba_f32.rs @@ -54,10 +54,10 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 4> { pool: &Option, ) { let mut _dispatcher_4_rows: Option< - fn(usize, usize, &FilterWeights, *const f32, usize, *mut f32, usize), - > = Some(convolve_horizontal_rgba_4_row_f32::); - let mut _dispatcher_row: fn(usize, usize, &FilterWeights, *const f32, *mut f32) = - convolve_horizontal_rgb_native_row::; + fn(usize, usize, &FilterWeights, &[f32], usize, &mut [f32], usize), + > = Some(convolve_horizontal_rgba_4_row_f32::<4>); + let mut _dispatcher_row: fn(usize, usize, &FilterWeights, &[f32], &mut [f32]) = + convolve_horizontal_rgb_native_row::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4); @@ -65,7 +65,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 4> { } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { - if is_x86_feature_detected!("sse4.1") { + if std::is_x86_feature_detected!("sse4.1") { _dispatcher_4_rows = Some(convolve_horizontal_rgba_sse_rows_4_f32::); _dispatcher_row = convolve_horizontal_rgba_sse_row_one_f32::; if is_x86_feature_detected!("fma") { @@ -73,7 +73,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 4> { _dispatcher_row = 
convolve_horizontal_rgba_sse_row_one_f32::; } } - if is_x86_feature_detected!("avx2") { + if std::is_x86_feature_detected!("avx2") { _dispatcher_4_rows = Some(convolve_horizontal_rgba_avx_rows_4_f32::); _dispatcher_row = convolve_horizontal_rgba_avx_row_one_f32::; if is_x86_feature_detected!("fma") { @@ -100,23 +100,23 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 4> { destination: &mut ImageStore, pool: &Option, ) { - let mut _dispatcher: fn(usize, &FilterBounds, *const f32, *mut f32, usize, &[f32]) = - convolve_vertical_rgb_native_row_f32::; + let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = + convolve_vertical_rgb_native_row_f32::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { _dispatcher = convolve_vertical_rgb_neon_row_f32::<4>; } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { - let has_fma = is_x86_feature_detected!("fma"); - if is_x86_feature_detected!("sse4.1") { + let has_fma = std::is_x86_feature_detected!("fma"); + if std::is_x86_feature_detected!("sse4.1") { if has_fma { _dispatcher = convolve_vertical_rgb_sse_row_f32::<4, true>; } else { _dispatcher = convolve_vertical_rgb_sse_row_f32::<4, false>; } } - if is_x86_feature_detected!("avx2") { + if std::is_x86_feature_detected!("avx2") { _dispatcher = convolve_vertical_avx_row_f32::<4, false>; if has_fma { _dispatcher = convolve_vertical_avx_row_f32::<4, true>; diff --git a/src/sse/plane_f32.rs b/src/sse/plane_f32.rs index 7b619eb..8852f17 100644 --- a/src/sse/plane_f32.rs +++ b/src/sse/plane_f32.rs @@ -36,7 +36,7 @@ use std::arch::x86_64::*; macro_rules! conv_horiz_plane_16_f32 { ($start_x: expr, $src: expr, $set: expr, $store: expr, $fma: expr) => {{ - let src_ptr = $src.add($start_x); + let src_ptr = $src.get_unchecked($start_x..).as_ptr(); let rgb_pixel0 = _mm_loadu_ps(src_ptr); let rgb_pixel1 = _mm_loadu_ps(src_ptr.add(4)); @@ -53,7 +53,7 @@ macro_rules! conv_horiz_plane_16_f32 { macro_rules! conv_horiz_plane_8_f32 { ($start_x: expr, $src: expr, $set1: expr, $set2: expr, $store: expr, $fma: expr) => {{ - let src_ptr = $src.add($start_x); + let src_ptr = $src.get_unchecked($start_x..).as_ptr(); let rgb_pixel0 = _mm_loadu_ps(src_ptr); let rgb_pixel1 = _mm_loadu_ps(src_ptr.add(4)); @@ -66,7 +66,7 @@ macro_rules! conv_horiz_plane_8_f32 { macro_rules! conv_horiz_plane_4_f32 { ($start_x: expr, $src: expr, $set1: expr, $store: expr, $fma: expr) => {{ - let src_ptr = $src.add($start_x); + let src_ptr = $src.get_unchecked($start_x..).as_ptr(); let rgb_pixel = _mm_loadu_ps(src_ptr); @@ -76,7 +76,7 @@ macro_rules! conv_horiz_plane_4_f32 { macro_rules! conv_horiz_plane_2_f32 { ($start_x: expr, $src: expr, $set: expr, $store: expr, $fma: expr) => {{ - let src_ptr = $src.add($start_x); + let src_ptr = $src.get_unchecked($start_x..).as_ptr(); let rgb_pixel = _mm_setr_ps( src_ptr.read_unaligned(), @@ -91,7 +91,7 @@ macro_rules! conv_horiz_plane_2_f32 { macro_rules! 
conv_horiz_plane_1_f32 { ($start_x: expr, $src: expr, $set: expr, $store: expr, $fma: expr) => {{ - let src_ptr = $src.add($start_x); + let src_ptr = $src.get_unchecked($start_x..).as_ptr(); let rgb_pixel = _mm_setr_ps(src_ptr.read_unaligned(), 0., 0., 0.); _mm_prefer_fma_ps::<$fma>($store, rgb_pixel, $set) }}; @@ -101,8 +101,8 @@ pub(crate) fn convolve_horizontal_plane_sse_row_one( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { unsafe { if FMA { @@ -110,64 +110,62 @@ pub(crate) fn convolve_horizontal_plane_sse_row_one( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } else { convolve_horizontal_plane_sse_row_one_regular( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } } } -#[inline] #[target_feature(enable = "sse4.1")] unsafe fn convolve_horizontal_plane_sse_row_one_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { convolve_horizontal_plane_sse_row_one_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } -#[inline] #[target_feature(enable = "sse4.1,fma")] unsafe fn convolve_horizontal_plane_sse_row_one_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { convolve_horizontal_plane_sse_row_one_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } -#[inline] +#[inline(always)] unsafe fn convolve_horizontal_plane_sse_row_one_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { let mut filter_offset = 0usize; let weights_ptr = filter_weights.weights.as_ptr(); @@ -185,8 +183,7 @@ unsafe fn convolve_horizontal_plane_sse_row_one_impl( let read_weights2 = _mm_loadu_ps(ptr.add(8)); let read_weights3 = _mm_loadu_ps(ptr.add(12)); let weights = (read_weights0, read_weights1, read_weights2, read_weights3); - store = - conv_horiz_plane_16_f32!(bounds_start, unsafe_source_ptr_0, weights, store, FMA); + store = conv_horiz_plane_16_f32!(bounds_start, src, weights, store, FMA); jx += 8; } @@ -198,7 +195,7 @@ unsafe fn convolve_horizontal_plane_sse_row_one_impl( let read_weights = (read_weights0, read_weights1); store = conv_horiz_plane_8_f32!( bounds_start, - unsafe_source_ptr_0, + src, read_weights.0, read_weights.1, store, @@ -211,13 +208,7 @@ unsafe fn convolve_horizontal_plane_sse_row_one_impl( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = _mm_loadu_ps(ptr); - store = conv_horiz_plane_4_f32!( - bounds_start, - unsafe_source_ptr_0, - read_weights, - store, - FMA - ); + store = conv_horiz_plane_4_f32!(bounds_start, src, read_weights, store, FMA); jx += 4; } @@ -225,7 +216,7 @@ unsafe fn convolve_horizontal_plane_sse_row_one_impl( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let weights = _mm_setr_ps(ptr.read_unaligned(), ptr.add(1).read_unaligned(), 0., 0.); - store = conv_horiz_plane_2_f32!(bounds_start, unsafe_source_ptr_0, weights, store, FMA); + store 
= conv_horiz_plane_2_f32!(bounds_start, src, weights, store, FMA); jx += 2; } @@ -233,13 +224,13 @@ unsafe fn convolve_horizontal_plane_sse_row_one_impl( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let weight0 = _mm_load1_ps(ptr); - store = conv_horiz_plane_1_f32!(bounds_start, unsafe_source_ptr_0, weight0, store, FMA); + store = conv_horiz_plane_1_f32!(bounds_start, src, weight0, store, FMA); jx += 1; } let px = x; - let dest_ptr = unsafe_destination_ptr_0.add(px); - dest_ptr.write_unaligned(_mm_hsum_ps(store)); + let dest_ptr = dst.get_unchecked_mut(px); + *dest_ptr = _mm_hsum_ps(store); filter_offset += filter_weights.aligned_size; } @@ -249,9 +240,9 @@ pub(crate) fn convolve_horizontal_plane_sse_rows_4( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { unsafe { @@ -260,9 +251,9 @@ pub(crate) fn convolve_horizontal_plane_sse_rows_4( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } else { @@ -270,55 +261,53 @@ pub(crate) fn convolve_horizontal_plane_sse_rows_4( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } } } -#[inline] #[target_feature(enable = "sse4.1")] unsafe fn convolve_horizontal_plane_sse_rows_4_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { convolve_horizontal_plane_sse_rows_4_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } -#[inline] #[target_feature(enable = "sse4.1,fma")] unsafe fn convolve_horizontal_plane_sse_rows_4_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { convolve_horizontal_plane_sse_rows_4_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -328,9 +317,9 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { unsafe { @@ -354,18 +343,12 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( let read_weights3 = _mm_loadu_ps(ptr.add(12)); let weights = (read_weights0, read_weights1, read_weights2, read_weights3); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_16_f32!( - bounds_start, - unsafe_source_ptr_0, - weights, - store_0, - FMA - ); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_plane_16_f32!(bounds_start, src, weights, store_0, FMA); + let s_ptr_1 = src.get_unchecked(src_stride..); store_1 = conv_horiz_plane_16_f32!(bounds_start, s_ptr_1, weights, store_1, FMA); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..); store_2 = conv_horiz_plane_16_f32!(bounds_start, s_ptr2, weights, store_2, FMA); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + let 
s_ptr3 = src.get_unchecked(src_stride * 3..); store_3 = conv_horiz_plane_16_f32!(bounds_start, s_ptr3, weights, store_3, FMA); jx += 16; } @@ -378,13 +361,13 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( let bounds_start = bounds.start + jx; store_0 = conv_horiz_plane_8_f32!( bounds_start, - unsafe_source_ptr_0, + src, read_weights.0, read_weights.1, store_0, FMA ); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + let s_ptr_1 = src.get_unchecked(src_stride..); store_1 = conv_horiz_plane_8_f32!( bounds_start, s_ptr_1, @@ -393,7 +376,7 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( store_1, FMA ); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..); store_2 = conv_horiz_plane_8_f32!( bounds_start, s_ptr2, @@ -402,7 +385,7 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( store_2, FMA ); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr3 = src.get_unchecked(src_stride * 3..); store_3 = conv_horiz_plane_8_f32!( bounds_start, s_ptr3, @@ -418,19 +401,13 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = _mm_loadu_ps(ptr); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_4_f32!( - bounds_start, - unsafe_source_ptr_0, - read_weights, - store_0, - FMA - ); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_plane_4_f32!(bounds_start, src, read_weights, store_0, FMA); + let s_ptr_1 = src.get_unchecked(src_stride..); store_1 = conv_horiz_plane_4_f32!(bounds_start, s_ptr_1, read_weights, store_1, FMA); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..); store_2 = conv_horiz_plane_4_f32!(bounds_start, s_ptr2, read_weights, store_2, FMA); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr3 = src.get_unchecked(src_stride * 3..); store_3 = conv_horiz_plane_4_f32!(bounds_start, s_ptr3, read_weights, store_3, FMA); jx += 4; } @@ -440,18 +417,12 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( let weights = _mm_setr_ps(ptr.read_unaligned(), ptr.add(1).read_unaligned(), 0., 0.); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_2_f32!( - bounds_start, - unsafe_source_ptr_0, - weights, - store_0, - FMA - ); - let ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_plane_2_f32!(bounds_start, src, weights, store_0, FMA); + let ptr_1 = src.get_unchecked(src_stride..); store_1 = conv_horiz_plane_2_f32!(bounds_start, ptr_1, weights, store_1, FMA); - let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let ptr_2 = src.get_unchecked(src_stride * 2..); store_2 = conv_horiz_plane_2_f32!(bounds_start, ptr_2, weights, store_2, FMA); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let ptr_3 = src.get_unchecked(src_stride * 3..); store_3 = conv_horiz_plane_2_f32!(bounds_start, ptr_3, weights, store_3, FMA); jx += 2; } @@ -460,34 +431,28 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = _mm_set1_ps(ptr.read_unaligned()); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_1_f32!( - bounds_start, - unsafe_source_ptr_0, - weight0, - store_0, - FMA - ); - let ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_plane_1_f32!(bounds_start, src, weight0, store_0, FMA); + let ptr_1 = src.get_unchecked(src_stride..); store_1 = conv_horiz_plane_1_f32!(bounds_start, ptr_1, weight0, store_1, FMA); - let 
ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let ptr_2 = src.get_unchecked(src_stride * 2..); store_2 = conv_horiz_plane_1_f32!(bounds_start, ptr_2, weight0, store_2, FMA); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let ptr_3 = src.get_unchecked(src_stride * 3..); store_3 = conv_horiz_plane_1_f32!(bounds_start, ptr_3, weight0, store_3, FMA); jx += 1; } let px = x; - let dest_ptr = unsafe_destination_ptr_0.add(px); - dest_ptr.write_unaligned(_mm_hsum_ps(store_0)); + let dest_ptr = dst.get_unchecked_mut(px); + *dest_ptr = _mm_hsum_ps(store_0); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); - dest_ptr.write_unaligned(_mm_hsum_ps(store_1)); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride); + *dest_ptr = _mm_hsum_ps(store_1); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); - dest_ptr.write_unaligned(_mm_hsum_ps(store_2)); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2); + *dest_ptr = _mm_hsum_ps(store_2); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); - dest_ptr.write_unaligned(_mm_hsum_ps(store_3)); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3); + *dest_ptr = _mm_hsum_ps(store_3); filter_offset += filter_weights.aligned_size; } diff --git a/src/sse/rgb_f32.rs b/src/sse/rgb_f32.rs index 918daed..26641f4 100644 --- a/src/sse/rgb_f32.rs +++ b/src/sse/rgb_f32.rs @@ -37,7 +37,7 @@ use std::arch::x86_64::*; #[inline(always)] unsafe fn convolve_horizontal_parts_4_rgb_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m128, weight1: __m128, weight2: __m128, @@ -45,7 +45,7 @@ unsafe fn convolve_horizontal_parts_4_rgb_f32( store_0: __m128, ) -> __m128 { const COMPONENTS: usize = 3; - let src_ptr = src.add(start_x * COMPONENTS); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); let rgb_pixel_0 = _mm_loadu_ps(src_ptr); let rgb_pixel_1 = _mm_loadu_ps(src_ptr.add(3)); @@ -66,13 +66,13 @@ unsafe fn convolve_horizontal_parts_4_rgb_f32( #[inline(always)] unsafe fn convolve_horizontal_parts_2_rgb_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m128, weight1: __m128, store_0: __m128, ) -> __m128 { const COMPONENTS: usize = 3; - let src_ptr = src.add(start_x * COMPONENTS); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); let orig1 = _mm_loadu_ps(src_ptr); let rgb_pixel_0 = orig1; @@ -91,12 +91,12 @@ unsafe fn convolve_horizontal_parts_2_rgb_f32( #[inline(always)] unsafe fn convolve_horizontal_parts_one_rgb_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m128, store_0: __m128, ) -> __m128 { const COMPONENTS: usize = 3; - let src_ptr = src.add(start_x * COMPONENTS); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); let rgb_pixel = _mm_setr_ps( src_ptr.add(0).read_unaligned(), src_ptr.add(1).read_unaligned(), @@ -110,8 +110,8 @@ pub(crate) fn convolve_horizontal_rgb_sse_row_one_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { unsafe { if FMA { @@ -119,54 +119,52 @@ pub(crate) fn convolve_horizontal_rgb_sse_row_one_f32( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } else { convolve_horizontal_rgb_sse_row_one_f32_regular( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } } } -#[inline] #[target_feature(enable = "sse4.1")] unsafe fn 
convolve_horizontal_rgb_sse_row_one_f32_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { convolve_horizontal_rgb_sse_row_one_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } -#[inline] -#[target_feature(enable = "sse4.1,fma")] +#[target_feature(enable = "sse4.1", enable = "fma")] unsafe fn convolve_horizontal_rgb_sse_row_one_f32_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { convolve_horizontal_rgb_sse_row_one_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -175,8 +173,8 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f32_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { const CHANNELS: usize = 3; let mut filter_offset = 0usize; @@ -193,7 +191,7 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f32_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_4_rgb_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, weight2, @@ -215,7 +213,7 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f32_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_2_rgb_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, store, @@ -227,17 +225,12 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f32_impl( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = _mm_load1_ps(ptr); let filter_start = jx + bounds.start; - store = convolve_horizontal_parts_one_rgb_f32::( - filter_start, - unsafe_source_ptr_0, - weight0, - store, - ); + store = convolve_horizontal_parts_one_rgb_f32::(filter_start, src, weight0, store); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store))); (dest_ptr as *mut i32) .add(2) @@ -251,9 +244,9 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { unsafe { @@ -262,9 +255,9 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } else { @@ -272,67 +265,65 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } } } -#[inline] #[target_feature(enable = "sse4.1")] unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { convolve_horizontal_rgb_sse_rows_4_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } -#[inline] 
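// Every SSE f32 routine touched by this patch follows the same dispatch shape: a safe entry
// point picks between two thin #[target_feature] wrappers, and both wrappers forward to one
// #[inline(always)] body parameterized by `const FMA: bool`, so only the wrappers carry ISA
// attributes. A minimal, self-contained sketch of that shape (x86/x86_64 only; `dispatch_row`,
// `row_fma`, `row_regular` and `row_impl` are illustrative names, not crate API):

fn dispatch_row(src: &[f32], dst: &mut [f32]) {
    // In the crate the sse4.1/fma checks happen once, where the row dispatcher function
    // pointer is selected; they are inlined here only to keep the sketch self-contained.
    if std::is_x86_feature_detected!("sse4.1") {
        unsafe {
            if std::is_x86_feature_detected!("fma") {
                row_fma(src, dst);
            } else {
                row_regular(src, dst);
            }
        }
    } else {
        // A scalar fallback path would go here.
        for (d, s) in dst.iter_mut().zip(src.iter()) {
            *d = *s;
        }
    }
}

#[target_feature(enable = "sse4.1", enable = "fma")]
unsafe fn row_fma(src: &[f32], dst: &mut [f32]) {
    row_impl::<true>(src, dst);
}

#[target_feature(enable = "sse4.1")]
unsafe fn row_regular(src: &[f32], dst: &mut [f32]) {
    row_impl::<false>(src, dst);
}

#[inline(always)]
unsafe fn row_impl<const FMA: bool>(src: &[f32], dst: &mut [f32]) {
    // The real bodies select _mm_fmadd_ps when FMA is true and mul+add otherwise;
    // a plain copy stands in for the convolution here.
    let _ = FMA;
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d = *s;
    }
}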
-#[target_feature(enable = "sse4.1,fma")] +#[target_feature(enable = "sse4.1", enable = "fma")] unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { convolve_horizontal_rgb_sse_rows_4_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } -#[inline] +#[inline(always)] unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { const CHANNELS: usize = 3; @@ -354,7 +345,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_impl( let filter_start = jx + bounds.start; store_0 = convolve_horizontal_parts_4_rgb_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, weight2, @@ -363,7 +354,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_impl( ); store_1 = convolve_horizontal_parts_4_rgb_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight0, weight1, weight2, @@ -372,7 +363,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_impl( ); store_2 = convolve_horizontal_parts_4_rgb_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight0, weight1, weight2, @@ -381,7 +372,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_impl( ); store_3 = convolve_horizontal_parts_4_rgb_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight0, weight1, weight2, @@ -403,28 +394,28 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_impl( let filter_start = jx + bounds.start; store_0 = convolve_horizontal_parts_2_rgb_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, store_0, ); store_1 = convolve_horizontal_parts_2_rgb_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight0, weight1, store_1, ); store_2 = convolve_horizontal_parts_2_rgb_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight0, weight1, store_2, ); store_3 = convolve_horizontal_parts_2_rgb_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight0, weight1, store_3, @@ -436,27 +427,23 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_impl( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = _mm_load1_ps(ptr); let filter_start = jx + bounds.start; - store_0 = convolve_horizontal_parts_one_rgb_f32::( - filter_start, - unsafe_source_ptr_0, - weight0, - store_0, - ); + store_0 = + convolve_horizontal_parts_one_rgb_f32::(filter_start, src, weight0, store_0); store_1 = convolve_horizontal_parts_one_rgb_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight0, store_1, ); store_2 = convolve_horizontal_parts_one_rgb_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight0, store_2, ); store_3 = convolve_horizontal_parts_one_rgb_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight0, store_3, ); @@ -464,25 +451,25 @@ unsafe fn 
convolve_horizontal_rgb_sse_rows_4_f32_impl( } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store_0))); (dest_ptr as *mut i32) .add(2) .write_unaligned(_mm_extract_ps::<2>(store_0)); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store_1))); (dest_ptr as *mut i32) .add(2) .write_unaligned(_mm_extract_ps::<2>(store_1)); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store_2))); (dest_ptr as *mut i32) .add(2) .write_unaligned(_mm_extract_ps::<2>(store_2)); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store_3))); (dest_ptr as *mut i32) .add(2) diff --git a/src/sse/rgba_f32.rs b/src/sse/rgba_f32.rs index 9b4b244..1ba151f 100644 --- a/src/sse/rgba_f32.rs +++ b/src/sse/rgba_f32.rs @@ -37,12 +37,12 @@ use std::arch::x86_64::*; #[inline(always)] unsafe fn convolve_horizontal_parts_one_rgba_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m128, store_0: __m128, ) -> __m128 { const COMPONENTS: usize = 4; - let src_ptr = src.add(start_x * COMPONENTS); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); let rgb_pixel = _mm_loadu_ps(src_ptr); _mm_prefer_fma_ps::(store_0, rgb_pixel, weight0) } @@ -51,8 +51,8 @@ pub(crate) fn convolve_horizontal_rgba_sse_row_one_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { unsafe { if FMA { @@ -60,54 +60,52 @@ pub(crate) fn convolve_horizontal_rgba_sse_row_one_f32( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } else { convolve_horizontal_rgba_sse_row_one_f32_regular( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } } } -#[inline] #[target_feature(enable = "sse4.1,fma")] unsafe fn convolve_horizontal_rgba_sse_row_one_f32_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { convolve_horizontal_rgba_sse_row_one_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } -#[inline] #[target_feature(enable = "sse4.1")] unsafe fn convolve_horizontal_rgba_sse_row_one_f32_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { convolve_horizontal_rgba_sse_row_one_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -116,8 +114,8 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f32_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: 
&[f32], + dst: &mut [f32], ) { unsafe { const CHANNELS: usize = 4; @@ -135,7 +133,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f32_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, weight2, @@ -157,7 +155,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f32_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_2_rgba_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, store, @@ -171,7 +169,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f32_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_one_rgba_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, store, ); @@ -179,8 +177,8 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f32_impl( } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); - _mm_storeu_ps(dest_ptr, store); + let dest_ptr = dst.get_unchecked_mut(px..); + _mm_storeu_ps(dest_ptr.as_mut_ptr(), store); filter_offset += filter_weights.aligned_size; } @@ -190,7 +188,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f32_impl( #[inline(always)] unsafe fn convolve_horizontal_parts_4_rgba_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m128, weight1: __m128, weight2: __m128, @@ -198,7 +196,7 @@ unsafe fn convolve_horizontal_parts_4_rgba_f32( store_0: __m128, ) -> __m128 { const COMPONENTS: usize = 4; - let src_ptr = src.add(start_x * COMPONENTS); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); let rgb_pixel_0 = _mm_loadu_ps(src_ptr); let rgb_pixel_1 = _mm_loadu_ps(src_ptr.add(4)); @@ -214,13 +212,13 @@ unsafe fn convolve_horizontal_parts_4_rgba_f32( #[inline(always)] unsafe fn convolve_horizontal_parts_2_rgba_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m128, weight1: __m128, store_0: __m128, ) -> __m128 { const COMPONENTS: usize = 4; - let src_ptr = src.add(start_x * COMPONENTS); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); let rgb_pixel_0 = _mm_loadu_ps(src_ptr); let rgb_pixel_1 = _mm_loadu_ps(src_ptr.add(4)); @@ -233,9 +231,9 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { unsafe { @@ -244,9 +242,9 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } else { @@ -254,67 +252,65 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } } } -#[inline] #[target_feature(enable = "sse4.1,fma")] unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { convolve_horizontal_rgba_sse_rows_4_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } -#[inline] #[target_feature(enable = "sse4.1")] unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_regular( dst_width: usize, src_width: usize, filter_weights: 
&FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { convolve_horizontal_rgba_sse_rows_4_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } -#[inline] +#[inline(always)] unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { const CHANNELS: usize = 4; @@ -336,7 +332,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_impl( store_0 = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, weight2, @@ -345,7 +341,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_impl( ); store_1 = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight0, weight1, weight2, @@ -354,7 +350,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_impl( ); store_2 = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight0, weight1, weight2, @@ -363,7 +359,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_impl( ); store_3 = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight0, weight1, weight2, @@ -385,28 +381,28 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_impl( let filter_start = jx + bounds.start; store_0 = convolve_horizontal_parts_2_rgba_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, store_0, ); store_1 = convolve_horizontal_parts_2_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight0, weight1, store_1, ); store_2 = convolve_horizontal_parts_2_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight0, weight1, store_2, ); store_3 = convolve_horizontal_parts_2_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight0, weight1, store_3, @@ -418,27 +414,23 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_impl( let ptr = weights_ptr.add(jx + filter_offset); let filter_start = jx + bounds.start; let weight0 = _mm_load1_ps(ptr); - store_0 = convolve_horizontal_parts_one_rgba_f32::( - filter_start, - unsafe_source_ptr_0, - weight0, - store_0, - ); + store_0 = + convolve_horizontal_parts_one_rgba_f32::(filter_start, src, weight0, store_0); store_1 = convolve_horizontal_parts_one_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight0, store_1, ); store_2 = convolve_horizontal_parts_one_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight0, store_2, ); store_3 = convolve_horizontal_parts_one_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight0, store_3, ); @@ -446,17 +438,17 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_impl( } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); - _mm_storeu_ps(dest_ptr, store_0); + let dest_ptr = dst.get_unchecked_mut(px..); + _mm_storeu_ps(dest_ptr.as_mut_ptr(), 
store_0); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); - _mm_storeu_ps(dest_ptr, store_1); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride..); + _mm_storeu_ps(dest_ptr.as_mut_ptr(), store_1); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); - _mm_storeu_ps(dest_ptr, store_2); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..); + _mm_storeu_ps(dest_ptr.as_mut_ptr(), store_2); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); - _mm_storeu_ps(dest_ptr, store_3); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..); + _mm_storeu_ps(dest_ptr.as_mut_ptr(), store_3); filter_offset += filter_weights.aligned_size; } diff --git a/src/sse/vertical_f32.rs b/src/sse/vertical_f32.rs index d00dfda..df0ccba 100644 --- a/src/sse/vertical_f32.rs +++ b/src/sse/vertical_f32.rs @@ -37,9 +37,9 @@ use std::arch::x86_64::*; unsafe fn convolve_vertical_part_sse_24_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -56,15 +56,14 @@ unsafe fn convolve_vertical_part_sse_24_f32( let py = start_y + j; let weight = filter.get_unchecked(j..); let v_weight = _mm_load1_ps(weight.as_ptr()); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py + px..).as_ptr(); - let s_ptr = src_ptr.add(px); - let item_row_0 = _mm_loadu_ps(s_ptr); - let item_row_1 = _mm_loadu_ps(s_ptr.add(4)); - let item_row_2 = _mm_loadu_ps(s_ptr.add(8)); - let item_row_3 = _mm_loadu_ps(s_ptr.add(12)); - let item_row_4 = _mm_loadu_ps(s_ptr.add(16)); - let item_row_5 = _mm_loadu_ps(s_ptr.add(20)); + let item_row_0 = _mm_loadu_ps(src_ptr); + let item_row_1 = _mm_loadu_ps(src_ptr.add(4)); + let item_row_2 = _mm_loadu_ps(src_ptr.add(8)); + let item_row_3 = _mm_loadu_ps(src_ptr.add(12)); + let item_row_4 = _mm_loadu_ps(src_ptr.add(16)); + let item_row_5 = _mm_loadu_ps(src_ptr.add(20)); store_0 = _mm_prefer_fma_ps::(store_0, item_row_0, v_weight); store_1 = _mm_prefer_fma_ps::(store_1, item_row_1, v_weight); @@ -74,7 +73,7 @@ unsafe fn convolve_vertical_part_sse_24_f32( store_5 = _mm_prefer_fma_ps::(store_5, item_row_5, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); _mm_storeu_ps(dst_ptr, store_0); _mm_storeu_ps(dst_ptr.add(4), store_1); _mm_storeu_ps(dst_ptr.add(8), store_2); @@ -87,9 +86,9 @@ unsafe fn convolve_vertical_part_sse_24_f32( unsafe fn convolve_vertical_part_sse_16_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -104,13 +103,12 @@ unsafe fn convolve_vertical_part_sse_16_f32( let py = start_y + j; let weight = filter.get_unchecked(j..); let v_weight = _mm_load1_ps(weight.as_ptr()); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py + px..).as_ptr(); - let s_ptr = src_ptr.add(px); - let item_row_0 = _mm_loadu_ps(s_ptr); - let item_row_1 = _mm_loadu_ps(s_ptr.add(4)); - let item_row_2 = _mm_loadu_ps(s_ptr.add(8)); - let item_row_3 = _mm_loadu_ps(s_ptr.add(12)); + let item_row_0 = _mm_loadu_ps(src_ptr); + let item_row_1 = _mm_loadu_ps(src_ptr.add(4)); + let item_row_2 = _mm_loadu_ps(src_ptr.add(8)); + let item_row_3 = _mm_loadu_ps(src_ptr.add(12)); store_0 = _mm_prefer_fma_ps::(store_0, item_row_0, v_weight); store_1 = _mm_prefer_fma_ps::(store_1, item_row_1, v_weight); @@ -118,7 +116,7 @@ unsafe fn 
convolve_vertical_part_sse_16_f32( store_3 = _mm_prefer_fma_ps::(store_3, item_row_3, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); _mm_storeu_ps(dst_ptr, store_0); _mm_storeu_ps(dst_ptr.add(4), store_1); _mm_storeu_ps(dst_ptr.add(8), store_2); @@ -129,9 +127,9 @@ unsafe fn convolve_vertical_part_sse_16_f32( unsafe fn convolve_vertical_part_sse_8_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -144,17 +142,16 @@ unsafe fn convolve_vertical_part_sse_8_f32( let py = start_y + j; let weight = filter.get_unchecked(j..); let v_weight = _mm_load1_ps(weight.as_ptr()); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py + px..).as_ptr(); - let s_ptr = src_ptr.add(px); - let item_row_0 = _mm_loadu_ps(s_ptr); - let item_row_1 = _mm_loadu_ps(s_ptr.add(4)); + let item_row_0 = _mm_loadu_ps(src_ptr); + let item_row_1 = _mm_loadu_ps(src_ptr.add(4)); store_0 = _mm_prefer_fma_ps::(store_0, item_row_0, v_weight); store_1 = _mm_prefer_fma_ps::(store_1, item_row_1, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); _mm_storeu_ps(dst_ptr, store_0); _mm_storeu_ps(dst_ptr.add(4), store_1); } @@ -163,9 +160,9 @@ unsafe fn convolve_vertical_part_sse_8_f32( unsafe fn convolve_vertical_part_sse_4_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -177,15 +174,15 @@ unsafe fn convolve_vertical_part_sse_4_f32( let py = start_y + j; let weight = filter.get_unchecked(j..); let v_weight = _mm_load1_ps(weight.as_ptr()); - let src_ptr = src.add(src_stride * py); - let s_ptr = src_ptr.add(px); - let item_row_0 = _mm_loadu_ps(s_ptr); + let src_ptr = src.get_unchecked(src_stride * py + px..).as_ptr(); + + let item_row_0 = _mm_loadu_ps(src_ptr); store_0 = _mm_prefer_fma_ps::(store_0, item_row_0, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); _mm_storeu_ps(dst_ptr, store_0); } @@ -193,9 +190,9 @@ unsafe fn convolve_vertical_part_sse_4_f32( pub(crate) unsafe fn convolve_vertical_part_sse_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -207,44 +204,33 @@ pub(crate) unsafe fn convolve_vertical_part_sse_f32( let py = start_y + j; let weight = filter.get_unchecked(j..); let v_weight = _mm_load1_ps(weight.as_ptr()); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py + px..); - let s_ptr = src_ptr.add(px); - let item_row_0 = _mm_set1_ps(s_ptr.read_unaligned()); + let item_row_0 = _mm_set1_ps(src_ptr.as_ptr().read_unaligned()); store_0 = _mm_prefer_fma_ps::(store_0, item_row_0, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); (dst_ptr as *mut i32).write_unaligned(_mm_extract_ps::<0>(store_0)); } pub(crate) fn convolve_vertical_rgb_sse_row_f32( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight_ptr: &[f32], ) { unsafe { if FMA { convolve_vertical_rgb_sse_row_f32_fma::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, 
bounds, src, dst, src_stride, weight_ptr, ); } else { convolve_vertical_rgb_sse_row_f32_regular::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } } @@ -255,18 +241,13 @@ pub(crate) fn convolve_vertical_rgb_sse_row_f32( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight_ptr: &[f32], ) { convolve_vertical_rgb_sse_row_f32_impl::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } @@ -275,18 +256,13 @@ unsafe fn convolve_vertical_rgb_sse_row_f32_regular( unsafe fn convolve_vertical_rgb_sse_row_f32_fma( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight_ptr: &[f32], ) { convolve_vertical_rgb_sse_row_f32_impl::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } @@ -294,8 +270,8 @@ unsafe fn convolve_vertical_rgb_sse_row_f32_fma( unsafe fn convolve_vertical_rgb_sse_row_f32_impl( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight_ptr: &[f32], ) { @@ -306,9 +282,9 @@ unsafe fn convolve_vertical_rgb_sse_row_f32_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -320,9 +296,9 @@ unsafe fn convolve_vertical_rgb_sse_row_f32_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -334,9 +310,9 @@ unsafe fn convolve_vertical_rgb_sse_row_f32_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -348,9 +324,9 @@ unsafe fn convolve_vertical_rgb_sse_row_f32_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -362,9 +338,9 @@ unsafe fn convolve_vertical_rgb_sse_row_f32_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); diff --git a/src/unsafe_slice.rs b/src/unsafe_slice.rs deleted file mode 100644 index 52b4352..0000000 --- a/src/unsafe_slice.rs +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) Radzivon Bartoshyk. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -use std::cell::UnsafeCell; - -#[derive(Copy, Clone)] -pub(crate) struct UnsafeSlice<'a, T> { - pub slice: &'a [UnsafeCell], -} - -unsafe impl Send for UnsafeSlice<'_, T> {} - -unsafe impl Sync for UnsafeSlice<'_, T> {} - -impl<'a, T> UnsafeSlice<'a, T> { - pub(crate) fn new(slice: &'a mut [T]) -> Self { - let ptr = slice as *mut [T] as *const [UnsafeCell]; - Self { - slice: unsafe { &*ptr }, - } - } - - pub(crate) fn mut_ptr(&self) -> *mut T { - self.slice.as_ptr() as *const T as *mut T - } - - /// SAFETY: It is UB if two threads write to the same index without - /// synchronization. - #[allow(dead_code)] - pub(crate) unsafe fn write(&self, i: usize, value: T) { - let ptr = self.slice[i].get(); - *ptr = value; - } - #[allow(dead_code)] - pub(crate) fn get(&self, i: usize) -> &T { - let ptr = self.slice[i].get(); - unsafe { &*ptr } - } - #[allow(dead_code)] - pub(crate) fn len(&self) -> usize { - self.slice.len() - } -} From b98f995c5a94519ffd320118b404df1e43ee17f1 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Fri, 20 Dec 2024 15:34:13 +0000 Subject: [PATCH 2/9] x86 improvements --- src/avx2/vertical_u8_lp.rs | 31 ++++++++++--------------------- src/neon/plane_f32.rs | 10 ++-------- src/neon/rgba_f32.rs | 6 ++---- 3 files changed, 14 insertions(+), 33 deletions(-) diff --git a/src/avx2/vertical_u8_lp.rs b/src/avx2/vertical_u8_lp.rs index 51b7134..af4ebc3 100644 --- a/src/avx2/vertical_u8_lp.rs +++ b/src/avx2/vertical_u8_lp.rs @@ -77,7 +77,6 @@ unsafe fn convolve_vertical_avx2_row_impl( weight: &[i16], ) { let zeros = _mm_setzero_si128(); - let zeros256 = _mm256_setzero_si256(); let bounds_size = bounds.size; const SCALE: i32 = 6; @@ -212,17 +211,14 @@ unsafe fn convolve_vertical_avx2_row_impl( } } - store0 = _mm256_max_epi16(store0, zeros256); - store1 = _mm256_max_epi16(store1, zeros256); - store2 = _mm256_max_epi16(store2, zeros256); - store3 = _mm256_max_epi16(store3, zeros256); + let rebased0 = _mm256_srai_epi16::(store0); + let rebased1 = _mm256_srai_epi16::(store1); + let rebased2 = _mm256_srai_epi16::(store2); + let rebased3 = _mm256_srai_epi16::(store3); - let rebased0 = _mm256_srli_epi16::(store0); - let rebased1 = _mm256_srli_epi16::(store1); - let rebased2 = _mm256_srli_epi16::(store2); - let rebased3 = _mm256_srli_epi16::(store3); let shrank0 = avx2_pack_u16(rebased0, rebased1); let shrank1 = avx2_pack_u16(rebased2, rebased3); + _mm256_storeu_si256(dst.as_mut_ptr() as *mut __m256i, shrank0); _mm256_storeu_si256( dst.get_unchecked_mut(32..).as_mut_ptr() as *mut __m256i, @@ -317,11 +313,9 @@ unsafe fn convolve_vertical_avx2_row_impl( } } - store0 = _mm256_max_epi16(store0, zeros256); - store1 = _mm256_max_epi16(store1, zeros256); + let rebased0 = _mm256_srai_epi16::(store0); + 
let rebased1 = _mm256_srai_epi16::(store1); - let rebased0 = _mm256_srli_epi16::(store0); - let rebased1 = _mm256_srli_epi16::(store1); let shrank0 = avx2_pack_u16(rebased0, rebased1); _mm256_storeu_si256(dst.as_mut_ptr() as *mut __m256i, shrank0); @@ -351,8 +345,7 @@ unsafe fn convolve_vertical_avx2_row_impl( ); } - store0 = _mm256_max_epi16(store0, zeros256); - store0 = _mm256_srli_epi16::(store0); + store0 = _mm256_srai_epi16::(store0); let packed = avx2_pack_u16(store0, store0); @@ -474,9 +467,7 @@ unsafe fn convolve_vertical_avx2_row_impl( } } - store = _mm_max_epi16(store, zeros); - - let rebased = _mm_srli_epi16::(store); + let rebased = _mm_srai_epi16::(store); let shrank = _mm_packus_epi16(rebased, rebased); std::ptr::copy_nonoverlapping(&shrank as *const _ as *const u8, dst.as_mut_ptr(), 8); @@ -600,9 +591,7 @@ unsafe fn convolve_vertical_avx2_row_impl( } } - store = _mm_max_epi16(store, zeros); - - let rebased = _mm_srli_epi16::(store); + let rebased = _mm_srai_epi16::(store); let value = _mm_extract_epi8::<0>(_mm_packus_epi16(rebased, rebased)); *dst = value as u8; diff --git a/src/neon/plane_f32.rs b/src/neon/plane_f32.rs index cb8c65f..24e4ca2 100644 --- a/src/neon/plane_f32.rs +++ b/src/neon/plane_f32.rs @@ -107,12 +107,7 @@ pub(crate) fn convolve_horizontal_plane_neon_row_one( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = xvld1q_f32_x4(ptr); - store = conv_horiz_plane_16_f32!( - bounds_start, - src.as_ptr(), - read_weights, - store - ); + store = conv_horiz_plane_16_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 8; } @@ -134,8 +129,7 @@ pub(crate) fn convolve_horizontal_plane_neon_row_one( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - store = - conv_horiz_plane_4_f32!(bounds_start, src.as_ptr(), read_weights, store); + store = conv_horiz_plane_4_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 4; } diff --git a/src/neon/rgba_f32.rs b/src/neon/rgba_f32.rs index 1e44dd3..2a2b088 100644 --- a/src/neon/rgba_f32.rs +++ b/src/neon/rgba_f32.rs @@ -111,8 +111,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row_one( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - store = - conv_horiz_rgba_4_f32!(bounds_start, src.as_ptr(), read_weights, store); + store = conv_horiz_rgba_4_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 4; } @@ -120,8 +119,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row_one( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - store = - conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store); + store = conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 2; } From 4d669f78d9f3de529c37600e421c630f9f256250 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Fri, 27 Dec 2024 11:46:16 +0000 Subject: [PATCH 3/9] Improvements --- app/src/main.rs | 4 +- src/avx2/alpha_u16.rs | 102 +++++---- src/avx2/alpha_u8.rs | 72 +++--- src/avx2/rgba_f16.rs | 34 +-- src/avx2/rgba_f32.rs | 4 +- src/avx2/utils.rs | 24 -- src/avx2/vertical_f16.rs | 2 +- src/avx2/vertical_f32.rs | 5 +- src/avx2/vertical_u8.rs | 285 +++++------------------ src/avx2/vertical_u8_lp.rs | 13 +- src/neon/rgb_f32.rs | 6 +- src/neon/rgb_u8.rs | 10 +- src/neon/rgba_u8.rs | 38 +-- src/neon/vertical_u8.rs | 4 +- src/sse/alpha_u16.rs | 17 +- 
src/sse/alpha_u8.rs | 20 +- src/sse/plane_u8.rs | 2 +- src/sse/rgb_f32.rs | 12 +- src/sse/rgba_f16.rs | 34 +-- src/sse/rgba_u16.rs | 31 +-- src/sse/rgba_u16_lb.rs | 43 +--- src/sse/rgba_u8.rs | 42 ++-- src/sse/rgba_u8_lb.rs | 56 +++-- src/sse/u8_utils.rs | 9 +- src/sse/vertical_f16.rs | 2 +- src/sse/vertical_u16.rs | 459 ++----------------------------------- src/sse/vertical_u16_lb.rs | 403 ++------------------------------ src/sse/vertical_u8.rs | 322 ++++++-------------------- src/sse/vertical_u8_lp.rs | 335 +++------------------------ 29 files changed, 435 insertions(+), 1955 deletions(-) diff --git a/app/src/main.rs b/app/src/main.rs index 6d8dcec..cb713ce 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -45,7 +45,7 @@ fn resize_plane( fn main() { // test_fast_image(); - let img = ImageReader::open("./assets/nasa-4928x3279-rgba.png") + let img = ImageReader::open("./assets/test_1.jpg") .unwrap() .decode() .unwrap(); @@ -53,7 +53,7 @@ fn main() { let transient = img.to_rgba8(); let mut bytes = Vec::from(transient.as_bytes()); - let mut scaler = LinearScaler::new(ResamplingFunction::Bilinear); + let mut scaler = Scaler::new(ResamplingFunction::Bilinear); scaler.set_threading_policy(ThreadingPolicy::Single); // resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear); diff --git a/src/avx2/alpha_u16.rs b/src/avx2/alpha_u16.rs index f190f68..27203d7 100644 --- a/src/avx2/alpha_u16.rs +++ b/src/avx2/alpha_u16.rs @@ -29,7 +29,7 @@ use crate::alpha_handle_u16::{premultiply_alpha_rgba_row, unpremultiply_alpha_rgba_row}; use crate::avx2::utils::{ - _mm256_select_si256, avx2_pack_u32, avx_deinterleave_rgba_epi16, avx_interleave_rgba_epi16, + _mm256_select_si256, avx_deinterleave_rgba_epi16, avx_interleave_rgba_epi16, }; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; @@ -39,15 +39,16 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -#[inline] +#[inline(always)] unsafe fn _mm256_scale_by_alpha(px: __m256i, low_low_a: __m256, low_high_a: __m256) -> __m256i { - let low_px = _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_castsi256_si128(px))); - let high_px = _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(px))); + let zeros = _mm256_setzero_si256(); + let low_px = _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(px, zeros)); + let high_px = _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(px, zeros)); let new_ll = _mm256_cvtps_epi32(_mm256_round_ps::<0x02>(_mm256_mul_ps(low_px, low_low_a))); let new_lh = _mm256_cvtps_epi32(_mm256_round_ps::<0x02>(_mm256_mul_ps(high_px, low_high_a))); - avx2_pack_u32(new_ll, new_lh) + _mm256_packus_epi32(new_ll, new_lh) } #[inline(always)] @@ -108,36 +109,37 @@ unsafe fn avx_premultiply_alpha_rgba_u16_row(dst: &mut [u16], src: &[u16], bit_d let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - let low_alpha = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.3)); - let high_alpha = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.3)); + let zeros = _mm256_setzero_si256(); + let low_alpha = _mm256_unpacklo_epi16(pixel.3, zeros); + let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); - let new_rrr = avx2_pack_u32( + let new_rrr = _mm256_packus_epi32( _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.0)), + _mm256_unpacklo_epi16(pixel.0, zeros), low_alpha, )), _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.0)), + 
_mm256_unpackhi_epi16(pixel.0, zeros), high_alpha, )), ); - let new_ggg = avx2_pack_u32( + let new_ggg = _mm256_packus_epi32( _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.1)), + _mm256_unpacklo_epi16(pixel.1, zeros), low_alpha, )), _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.1)), + _mm256_unpackhi_epi16(pixel.1, zeros), high_alpha, )), ); - let new_bbb = avx2_pack_u32( + let new_bbb = _mm256_packus_epi32( _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.2)), + _mm256_unpacklo_epi16(pixel.2, zeros), low_alpha, )), _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.2)), + _mm256_unpackhi_epi16(pixel.2, zeros), high_alpha, )), ); @@ -165,36 +167,37 @@ unsafe fn avx_premultiply_alpha_rgba_u16_row(dst: &mut [u16], src: &[u16], bit_d let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - let low_alpha = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.3)); - let high_alpha = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.3)); + let zeros = _mm256_setzero_si256(); + let low_alpha = _mm256_unpacklo_epi16(pixel.3, zeros); + let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); - let new_rrr = avx2_pack_u32( + let new_rrr = _mm256_packus_epi32( _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.0)), + _mm256_unpacklo_epi16(pixel.0, zeros), low_alpha, )), _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.0)), + _mm256_unpackhi_epi16(pixel.0, zeros), high_alpha, )), ); - let new_ggg = avx2_pack_u32( + let new_ggg = _mm256_packus_epi32( _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.1)), + _mm256_unpacklo_epi16(pixel.1, zeros), low_alpha, )), _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.1)), + _mm256_unpackhi_epi16(pixel.1, zeros), high_alpha, )), ); - let new_bbb = avx2_pack_u32( + let new_bbb = _mm256_packus_epi32( _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.2)), + _mm256_unpacklo_epi16(pixel.2, zeros), low_alpha, )), _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.2)), + _mm256_unpackhi_epi16(pixel.2, zeros), high_alpha, )), ); @@ -222,36 +225,37 @@ unsafe fn avx_premultiply_alpha_rgba_u16_row(dst: &mut [u16], src: &[u16], bit_d let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - let low_alpha = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.3)); - let high_alpha = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.3)); + let zeros = _mm256_setzero_si256(); + let low_alpha = _mm256_unpacklo_epi16(pixel.3, zeros); + let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); - let new_rrr = avx2_pack_u32( + let new_rrr = _mm256_packus_epi32( _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.0)), + _mm256_unpacklo_epi16(pixel.0, zeros), low_alpha, )), _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.0)), + _mm256_unpackhi_epi16(pixel.0, zeros), high_alpha, )), ); - let new_ggg = avx2_pack_u32( + let new_ggg = _mm256_packus_epi32( _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - 
_mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.1)), + _mm256_unpacklo_epi16(pixel.1, zeros), low_alpha, )), _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.1)), + _mm256_unpackhi_epi16(pixel.1, zeros), high_alpha, )), ); - let new_bbb = avx2_pack_u32( + let new_bbb = _mm256_packus_epi32( _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.2)), + _mm256_unpacklo_epi16(pixel.2, zeros), low_alpha, )), _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.2)), + _mm256_unpackhi_epi16(pixel.2, zeros), high_alpha, )), ); @@ -280,14 +284,14 @@ unsafe fn avx_premultiply_alpha_rgba_u16_row(dst: &mut [u16], src: &[u16], bit_d let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); + let zeros = _mm256_setzero_si256(); + let low_alpha = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.3))), + _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(pixel.3, zeros)), v_scale_colors, ); let high_alpha = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>( - pixel.3, - ))), + _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(pixel.3, zeros)), v_scale_colors, ); @@ -368,17 +372,17 @@ unsafe fn avx_unpremultiply_alpha_rgba_u16_row(in_place: &mut [u16], bit_depth: let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - let is_zero_alpha_mask = _mm256_cmpeq_epi16(pixel.3, _mm256_setzero_si256()); + let zeros = _mm256_setzero_si256(); + + let is_zero_alpha_mask = _mm256_cmpeq_epi16(pixel.3, zeros); - let mut low_alpha = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32( - _mm256_castsi256_si128(pixel.3), - ))); + let mut low_alpha = + _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(pixel.3, zeros))); low_alpha = _mm256_mul_ps(low_alpha, v_scale_colors); - let mut high_alpha = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32( - _mm256_extracti128_si256::<1>(pixel.3), - ))); + let mut high_alpha = + _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(pixel.3, zeros))); high_alpha = _mm256_mul_ps(high_alpha, v_scale_colors); diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs index 291c4b4..83200dc 100644 --- a/src/avx2/alpha_u8.rs +++ b/src/avx2/alpha_u8.rs @@ -29,8 +29,7 @@ use crate::alpha_handle_u8::{premultiply_alpha_rgba_row_impl, unpremultiply_alpha_rgba_row_impl}; use crate::avx2::utils::{ - _mm256_packus_four_epi32, _mm256_select_si256, avx2_deinterleave_rgba, avx2_div_by255, - avx2_interleave_rgba, avx2_pack_u16, + _mm256_select_si256, avx2_deinterleave_rgba, avx2_div_by255, avx2_interleave_rgba, }; use crate::sse::{ _mm_div_by_255_epi16, sse_deinterleave_rgba, sse_interleave_rgba, sse_unpremultiply_row, @@ -46,8 +45,8 @@ use std::arch::x86_64::*; #[inline(always)] unsafe fn avx2_unpremultiply_row(x: __m256i, a: __m256i) -> __m256i { let zeros = _mm256_setzero_si256(); - let lo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(x)); - let hi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(x)); + let lo = _mm256_unpacklo_epi8(x, zeros); + let hi = _mm256_unpackhi_epi8(x, zeros); let scale = _mm256_set1_epi16(255); @@ -57,35 +56,27 @@ unsafe fn avx2_unpremultiply_row(x: __m256i, a: __m256i) -> __m256i { let scale_ps = _mm256_set1_ps(255f32); let lo_lo = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(lo))), + _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(lo, zeros)), scale_ps, ); let lo_hi = _mm256_mul_ps( - 
_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(lo))),
+        _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(lo, zeros)),
         scale_ps,
     );
     let hi_lo = _mm256_mul_ps(
-        _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(hi))),
+        _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(hi, zeros)),
         scale_ps,
     );
     let hi_hi = _mm256_mul_ps(
-        _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(hi))),
+        _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(hi, zeros)),
         scale_ps,
     );
 
-    let a_lo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a));
-    let a_hi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(a));
-    let a_lo_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(
-        _mm256_castsi256_si128(a_lo),
-    )));
-    let a_lo_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(
-        _mm256_extracti128_si256::<1>(a_lo),
-    )));
-    let a_hi_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(
-        _mm256_castsi256_si128(a_hi),
-    )));
-    let a_hi_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(
-        _mm256_extracti128_si256::<1>(a_hi),
-    )));
+    let a_lo = _mm256_unpacklo_epi8(a, zeros);
+    let a_hi = _mm256_unpackhi_epi8(a, zeros);
+    let a_lo_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_lo, zeros)));
+    let a_lo_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_lo, zeros)));
+    let a_hi_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_hi, zeros)));
+    let a_hi_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_hi, zeros)));
 
     let lo_lo = _mm256_cvtps_epi32(_mm256_mul_ps(lo_lo, a_lo_lo));
     let lo_hi = _mm256_cvtps_epi32(_mm256_mul_ps(lo_hi, a_lo_hi));
@@ -95,7 +86,10 @@ unsafe fn avx2_unpremultiply_row(x: __m256i, a: __m256i) -> __m256i {
     _mm256_select_si256(
         is_zero_mask,
         zeros,
-        _mm256_packus_four_epi32(lo_lo, lo_hi, hi_lo, hi_hi),
+        _mm256_packus_epi16(
+            _mm256_packus_epi32(lo_lo, lo_hi),
+            _mm256_packus_epi32(hi_lo, hi_hi),
+        ),
     )
 }
 
@@ -128,17 +122,19 @@ unsafe fn avx_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) {
             let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i);
             let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3);
 
-            let mut rrr_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(rrr));
-            let mut rrr_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(rrr));
+            let zeros = _mm256_setzero_si256();
 
-            let mut ggg_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(ggg));
-            let mut ggg_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(ggg));
+            let mut rrr_low = _mm256_unpacklo_epi8(rrr, zeros);
+            let mut rrr_high = _mm256_unpackhi_epi8(rrr, zeros);
 
-            let mut bbb_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(bbb));
-            let mut bbb_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(bbb));
+            let mut ggg_low = _mm256_unpacklo_epi8(ggg, zeros);
+            let mut ggg_high = _mm256_unpackhi_epi8(ggg, zeros);
 
-            let aaa_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(aaa));
-            let aaa_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(aaa));
+            let mut bbb_low = _mm256_unpacklo_epi8(bbb, zeros);
+            let mut bbb_high = _mm256_unpackhi_epi8(bbb, zeros);
+
+            let aaa_low = _mm256_unpacklo_epi8(aaa, zeros);
+            let aaa_high = _mm256_unpackhi_epi8(aaa, zeros);
 
             rrr_low = avx2_div_by255(_mm256_mullo_epi16(rrr_low, aaa_low));
             rrr_high = avx2_div_by255(_mm256_mullo_epi16(rrr_high, aaa_high));
@@ -147,9 +143,9 @@ unsafe fn avx_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) {
             bbb_low = avx2_div_by255(_mm256_mullo_epi16(bbb_low, aaa_low));
             bbb_high =
avx2_div_by255(_mm256_mullo_epi16(bbb_high, aaa_high)); - let rrr = avx2_pack_u16(rrr_low, rrr_high); - let ggg = avx2_pack_u16(ggg_low, ggg_high); - let bbb = avx2_pack_u16(bbb_low, bbb_high); + let rrr = _mm256_packus_epi16(rrr_low, rrr_high); + let ggg = _mm256_packus_epi16(ggg_low, ggg_high); + let bbb = _mm256_packus_epi16(bbb_low, bbb_high); let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); let dst_ptr = dst.as_mut_ptr(); @@ -174,16 +170,16 @@ unsafe fn avx_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - let mut rrr_low = _mm_cvtepu8_epi16(rrr); + let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); - let mut ggg_low = _mm_cvtepu8_epi16(ggg); + let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); - let mut bbb_low = _mm_cvtepu8_epi16(bbb); + let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); - let aaa_low = _mm_cvtepu8_epi16(aaa); + let aaa_low = _mm_unpacklo_epi8(aaa, zeros); let aaa_high = _mm_unpackhi_epi8(aaa, zeros); rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); diff --git a/src/avx2/rgba_f16.rs b/src/avx2/rgba_f16.rs index 85207bb..f3f6b22 100644 --- a/src/avx2/rgba_f16.rs +++ b/src/avx2/rgba_f16.rs @@ -261,11 +261,7 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_impl( _mm256_castps256_ps128(store), _mm256_extractf128_ps::<1>(store), )); - std::ptr::copy_nonoverlapping( - &converted_f16 as *const _ as *const u8, - dest_ptr as *mut u8, - 8, - ); + _mm_storeu_si64(dest_ptr as *mut u8, converted_f16); filter_offset += filter_weights.aligned_size; } @@ -454,8 +450,8 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( while jx + 2 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); - let weight0 = _mm_set1_ps(ptr.read_unaligned()); - let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned()); + let weight0 = _mm_load1_ps(ptr); + let weight1 = _mm_load1_ps(ptr.add(1)); let weight = avx_combine_ps(weight0, weight1); let filter_start = jx + bounds.start; store_0 = convolve_horizontal_parts_2_rgba_f16::( @@ -522,44 +518,28 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( _mm256_castps256_ps128(store_0), _mm256_extractf128_ps::<1>(store_0), )); - std::ptr::copy_nonoverlapping( - &converted_f16_0 as *const _ as *const u8, - dest_ptr as *mut u8, - 8, - ); + _mm_storeu_si64(dest_ptr as *mut u8, converted_f16_0); let dest_ptr = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); let converted_f16_1 = _mm_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(_mm_add_ps( _mm256_castps256_ps128(store_1), _mm256_extractf128_ps::<1>(store_1), )); - std::ptr::copy_nonoverlapping( - &converted_f16_1 as *const _ as *const u8, - dest_ptr as *mut u8, - 8, - ); + _mm_storeu_si64(dest_ptr as *mut u8, converted_f16_1); let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); let converted_f16_2 = _mm_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(_mm_add_ps( _mm256_castps256_ps128(store_2), _mm256_extractf128_ps::<1>(store_2), )); - std::ptr::copy_nonoverlapping( - &converted_f16_2 as *const _ as *const u8, - dest_ptr as *mut u8, - 8, - ); + _mm_storeu_si64(dest_ptr as *mut u8, converted_f16_2); let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); let converted_f16_3 = 
_mm_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(_mm_add_ps( _mm256_castps256_ps128(store_3), _mm256_extractf128_ps::<1>(store_3), )); - std::ptr::copy_nonoverlapping( - &converted_f16_3 as *const _ as *const u8, - dest_ptr as *mut u8, - 8, - ); + _mm_storeu_si64(dest_ptr as *mut u8, converted_f16_3); filter_offset += filter_weights.aligned_size; } diff --git a/src/avx2/rgba_f32.rs b/src/avx2/rgba_f32.rs index 0aba5bd..370abb1 100644 --- a/src/avx2/rgba_f32.rs +++ b/src/avx2/rgba_f32.rs @@ -500,8 +500,8 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f32_impl( while jx + 2 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); - let weight0 = _mm_set1_ps(ptr.read_unaligned()); - let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned()); + let weight0 = _mm_load1_ps(ptr); + let weight1 = _mm_load1_ps(ptr.add(1)); let weight = avx_combine_ps(weight0, weight1); let filter_start = jx + bounds.start; store = convolve_horizontal_parts_2_rgba_f32::(filter_start, src, weight, store); diff --git a/src/avx2/utils.rs b/src/avx2/utils.rs index cd11c57..599468e 100644 --- a/src/avx2/utils.rs +++ b/src/avx2/utils.rs @@ -300,30 +300,6 @@ pub(crate) unsafe fn avx2_pack_u16(s_1: __m256i, s_2: __m256i) -> __m256i { _mm256_permute4x64_epi64::(packed) } -#[inline] -#[target_feature(enable = "avx2")] -pub(crate) unsafe fn _mm256_packus_four_epi32( - a: __m256i, - b: __m256i, - c: __m256i, - d: __m256i, -) -> __m256i { - let ab = _mm256_packs_epi32(a, b); - let cd = _mm256_packs_epi32(c, d); - - const MASK: i32 = shuffle(3, 1, 2, 0); - - let abcd = _mm256_permute4x64_epi64::(_mm256_packus_epi16(ab, cd)); - _mm256_shuffle_epi32::(abcd) -} - -#[inline(always)] -pub(crate) unsafe fn avx2_pack_u32(s_1: __m256i, s_2: __m256i) -> __m256i { - let packed = _mm256_packus_epi32(s_1, s_2); - const MASK: i32 = shuffle(3, 1, 2, 0); - _mm256_permute4x64_epi64::(packed) -} - #[inline(always)] #[allow(dead_code)] pub(crate) unsafe fn avx_combine_ps(lo: __m128, hi: __m128) -> __m256 { diff --git a/src/avx2/vertical_f16.rs b/src/avx2/vertical_f16.rs index bfe1436..8c915d4 100644 --- a/src/avx2/vertical_f16.rs +++ b/src/avx2/vertical_f16.rs @@ -102,7 +102,7 @@ unsafe fn convolve_vertical_part_avx_4_f16( let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); let acc = _mm256_cvtps_ph::(store_0); - std::ptr::copy_nonoverlapping(&acc as *const _ as *const u8, dst_ptr as *mut u8, 8); + _mm_storeu_si64(dst_ptr as *mut u8, acc); } #[inline(always)] diff --git a/src/avx2/vertical_f32.rs b/src/avx2/vertical_f32.rs index 6e88617..28248e8 100644 --- a/src/avx2/vertical_f32.rs +++ b/src/avx2/vertical_f32.rs @@ -161,7 +161,10 @@ pub(crate) unsafe fn convolve_vertical_part_avx_f32( } let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); - (dst_ptr as *mut i32).write_unaligned(_mm256_extract_epi32::<0>(_mm256_castps_si256(store_0))); + _mm_storeu_si32( + dst_ptr as *mut u8, + _mm256_castsi256_si128(_mm256_castps_si256(store_0)), + ); } #[inline] diff --git a/src/avx2/vertical_u8.rs b/src/avx2/vertical_u8.rs index b8e3ee8..e578206 100644 --- a/src/avx2/vertical_u8.rs +++ b/src/avx2/vertical_u8.rs @@ -83,176 +83,64 @@ unsafe fn convolve_vertical_part_avx_64( let bounds_size = bounds.size; - if bounds_size == 2 { - let py = start_y; - let weight = filter.get_unchecked(0..2); - let v_weight0 = _mm256_set1_epi32(weight[0] as i32); - let v_weight1 = _mm256_set1_epi32(weight[1] as i32); - - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let mut jj = 0usize; - let 
item_row_0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); - let item_row_1 = - _mm256_loadu_si256(src_ptr0.get_unchecked(32..).as_ptr() as *const __m256i); - let item_row_10 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); - let item_row_11 = - _mm256_loadu_si256(src_ptr1.get_unchecked(32..).as_ptr() as *const __m256i); + while jj < bounds_size.saturating_sub(2) { + let py = start_y + jj; + let f_ptr = filter.get_unchecked(jj..).as_ptr() as *const i32; + let v_weight_2 = _mm256_set1_epi32(f_ptr.read_unaligned()); + let src_ptr = src.get_unchecked((src_stride * py + px)..); + let s_ptr_next = src_ptr.get_unchecked(src_stride..); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight0); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight0); + let item_row_0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i); + let item_row_1 = _mm256_loadu_si256(s_ptr_next.as_ptr() as *const __m256i); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_10, v_weight1); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_11, v_weight1); - } else if bounds_size == 3 { - let py = start_y; - let weight = filter.get_unchecked(0..3); - let v_weight0 = _mm256_set1_epi32(weight[0] as i32); - let v_weight1 = _mm256_set1_epi32(weight[1] as i32); - let v_weight2 = _mm256_set1_epi32(weight[2] as i32); + let interleaved = _mm256_unpacklo_epi8(item_row_0, item_row_1); + let pix = _mm256_unpacklo_epi8(interleaved, zeros); + store_0 = _mm256_add_epi32(store_0, _mm256_madd_epi16(pix, v_weight_2)); + let pix = _mm256_unpackhi_epi8(interleaved, zeros); + store_1 = _mm256_add_epi32(store_1, _mm256_madd_epi16(pix, v_weight_2)); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + let interleaved = _mm256_unpackhi_epi8(item_row_0, item_row_1); + let pix = _mm256_unpacklo_epi8(interleaved, zeros); + store_2 = _mm256_add_epi32(store_2, _mm256_madd_epi16(pix, v_weight_2)); + let pix = _mm256_unpackhi_epi8(interleaved, zeros); + store_3 = _mm256_add_epi32(store_3, _mm256_madd_epi16(pix, v_weight_2)); - let item_row_0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); + let item_row_0 = + _mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i); let item_row_1 = - _mm256_loadu_si256(src_ptr0.get_unchecked(32..).as_ptr() as *const __m256i); + _mm256_loadu_si256(s_ptr_next.get_unchecked(32..).as_ptr() as *const __m256i); - let item_row_10 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); - let item_row_11 = - _mm256_loadu_si256(src_ptr1.get_unchecked(32..).as_ptr() as *const __m256i); + let interleaved = _mm256_unpacklo_epi8(item_row_0, item_row_1); + let pix = _mm256_unpacklo_epi8(interleaved, zeros); + store_4 = _mm256_add_epi32(store_4, _mm256_madd_epi16(pix, v_weight_2)); + let pix = _mm256_unpackhi_epi8(interleaved, zeros); + store_5 = _mm256_add_epi32(store_5, _mm256_madd_epi16(pix, v_weight_2)); - let item_row_20 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i); - let item_row_21 = - _mm256_loadu_si256(src_ptr2.get_unchecked(32..).as_ptr() as *const __m256i); + let interleaved = _mm256_unpackhi_epi8(item_row_0, item_row_1); + let pix = _mm256_unpacklo_epi8(interleaved, zeros); + store_6 = _mm256_add_epi32(store_6, 
_mm256_madd_epi16(pix, v_weight_2)); + let pix = _mm256_unpackhi_epi8(interleaved, zeros); + store_7 = _mm256_add_epi32(store_7, _mm256_madd_epi16(pix, v_weight_2)); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight0); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight0); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_10, v_weight1); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_11, v_weight1); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_20, v_weight2); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_21, v_weight2); - } else if bounds_size == 4 { - let py = start_y; - let weight = filter.get_unchecked(0..4); - let v_weight0 = _mm256_set1_epi32(weight[0] as i32); - let v_weight1 = _mm256_set1_epi32(weight[1] as i32); - let v_weight2 = _mm256_set1_epi32(weight[2] as i32); - let v_weight3 = _mm256_set1_epi32(weight[3] as i32); + jj += 2; + } - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); + for j in jj..bounds_size { + let py = start_y + j; + let weight = *filter.get_unchecked(j); + let v_weight = _mm256_set1_epi32(weight as i32); + let src_ptr = src.get_unchecked((src_stride * py + px)..); - let item_row_0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); + let item_row_0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i); let item_row_1 = - _mm256_loadu_si256(src_ptr0.get_unchecked(32..).as_ptr() as *const __m256i); - - let item_row_10 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); - let item_row_11 = - _mm256_loadu_si256(src_ptr1.get_unchecked(32..).as_ptr() as *const __m256i); - - let item_row_20 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i); - let item_row_21 = - _mm256_loadu_si256(src_ptr2.get_unchecked(32..).as_ptr() as *const __m256i); - - let item_row_30 = _mm256_loadu_si256(src_ptr3.as_ptr() as *const __m256i); - let item_row_31 = - _mm256_loadu_si256(src_ptr3.get_unchecked(32..).as_ptr() as *const __m256i); + _mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i); (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight0); + dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight); (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight0); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_10, v_weight1); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_11, v_weight1); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_20, v_weight2); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_21, v_weight2); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_30, v_weight3); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_31, v_weight3); - } else { - let mut jj = 0usize; - - while jj < 
bounds_size.saturating_sub(2) { - let py = start_y + jj; - let f_ptr = filter.get_unchecked(jj..).as_ptr() as *const i32; - let v_weight_2 = _mm256_set1_epi32(f_ptr.read_unaligned()); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - let s_ptr_next = src_ptr.get_unchecked(src_stride..); - - let item_row_0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i); - let item_row_1 = _mm256_loadu_si256(s_ptr_next.as_ptr() as *const __m256i); - - let interleaved = _mm256_unpacklo_epi8(item_row_0, item_row_1); - let pix = _mm256_unpacklo_epi8(interleaved, zeros); - store_0 = _mm256_add_epi32(store_0, _mm256_madd_epi16(pix, v_weight_2)); - let pix = _mm256_unpackhi_epi8(interleaved, zeros); - store_1 = _mm256_add_epi32(store_1, _mm256_madd_epi16(pix, v_weight_2)); - - let interleaved = _mm256_unpackhi_epi8(item_row_0, item_row_1); - let pix = _mm256_unpacklo_epi8(interleaved, zeros); - store_2 = _mm256_add_epi32(store_2, _mm256_madd_epi16(pix, v_weight_2)); - let pix = _mm256_unpackhi_epi8(interleaved, zeros); - store_3 = _mm256_add_epi32(store_3, _mm256_madd_epi16(pix, v_weight_2)); - - let item_row_0 = - _mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i); - let item_row_1 = - _mm256_loadu_si256(s_ptr_next.get_unchecked(32..).as_ptr() as *const __m256i); - - let interleaved = _mm256_unpacklo_epi8(item_row_0, item_row_1); - let pix = _mm256_unpacklo_epi8(interleaved, zeros); - store_4 = _mm256_add_epi32(store_4, _mm256_madd_epi16(pix, v_weight_2)); - let pix = _mm256_unpackhi_epi8(interleaved, zeros); - store_5 = _mm256_add_epi32(store_5, _mm256_madd_epi16(pix, v_weight_2)); - - let interleaved = _mm256_unpackhi_epi8(item_row_0, item_row_1); - let pix = _mm256_unpacklo_epi8(interleaved, zeros); - store_6 = _mm256_add_epi32(store_6, _mm256_madd_epi16(pix, v_weight_2)); - let pix = _mm256_unpackhi_epi8(interleaved, zeros); - store_7 = _mm256_add_epi32(store_7, _mm256_madd_epi16(pix, v_weight_2)); - - jj += 2; - } - - for j in jj..bounds_size { - let py = start_y + j; - let weight = *filter.get_unchecked(j); - let v_weight = _mm256_set1_epi32(weight as i32); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - - let item_row_0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i); - let item_row_1 = - _mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight); - } + dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight); } store_0 = _mm256_srai_epi32::(store_0); @@ -299,78 +187,16 @@ unsafe fn convolve_vertical_part_avx_32( let bounds_size = bounds.size; - if bounds_size == 2 { - let py = start_y; - let weight = filter.get_unchecked(0..2); - let v_weight0 = _mm256_set1_epi32(weight[0] as i32); - let v_weight1 = _mm256_set1_epi32(weight[1] as i32); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - - let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); - let item_row1 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row0, v_weight0); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row1, v_weight1); - } else if bounds_size == 3 { - let py = start_y; - let 
weight = filter.get_unchecked(0..3); - let v_weight0 = _mm256_set1_epi32(weight[0] as i32); - let v_weight1 = _mm256_set1_epi32(weight[1] as i32); - let v_weight2 = _mm256_set1_epi32(weight[2] as i32); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - - let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); - let item_row1 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); - let item_row2 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row0, v_weight0); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row1, v_weight1); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row2, v_weight2); - } else if bounds_size == 4 { - let py = start_y; - let weight = filter.get_unchecked(0..4); - let v_weight0 = _mm256_set1_epi32(weight[0] as i32); - let v_weight1 = _mm256_set1_epi32(weight[1] as i32); - let v_weight2 = _mm256_set1_epi32(weight[2] as i32); - let v_weight3 = _mm256_set1_epi32(weight[3] as i32); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); + for j in 0..bounds_size { + let py = start_y + j; + let weight = *filter.get_unchecked(j); + let v_weight = _mm256_set1_epi32(weight as i32); + let src_ptr = src.get_unchecked((src_stride * py + px)..); - let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); - let item_row1 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); - let item_row2 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i); - let item_row3 = _mm256_loadu_si256(src_ptr3.as_ptr() as *const __m256i); + let item_row = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i); (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row0, v_weight0); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row1, v_weight1); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row2, v_weight2); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row3, v_weight3); - } else { - for j in 0..bounds_size { - let py = start_y + j; - let weight = *filter.get_unchecked(j); - let v_weight = _mm256_set1_epi32(weight as i32); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - - let item_row = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row, v_weight); - } + dot_prod(store_0, store_1, store_2, store_3, item_row, v_weight); } store_0 = _mm256_srai_epi32::(store_0); @@ -410,13 +236,14 @@ unsafe fn convolve_vertical_part_8_avx( let weight = *filter.get_unchecked(j); let v_weight = _mm256_set1_epi32(weight as i32); let src_ptr = src.get_unchecked((src_stride * py + px)..); - let item_row = _mm256_cvtepu16_epi32(_mm_cvtepu8_epi16(_mm_loadu_si64(src_ptr.as_ptr()))); + let item_row = _mm256_cvtepu16_epi32(_mm_unpacklo_epi8( + _mm_loadu_si64(src_ptr.as_ptr()), + _mm_setzero_si128(), + )); store_0 = _mm256_add_epi32(store_0, 
_mm256_mullo_epi32(item_row, v_weight)); } - store_0 = _mm256_max_epi32(store_0, zeros); - const MASK: i32 = shuffle(3, 1, 2, 0); let low_16 = _mm256_permute4x64_epi64::(_mm256_packus_epi32( @@ -428,7 +255,7 @@ unsafe fn convolve_vertical_part_8_avx( let item_sse = _mm256_castsi256_si128(item); let dst_ptr = dst.get_unchecked_mut(px..); - std::ptr::copy_nonoverlapping(&item_sse as *const _ as *const u8, dst_ptr.as_mut_ptr(), 8); + _mm_storeu_si64(dst_ptr.as_mut_ptr(), item_sse); } #[inline(always)] @@ -510,8 +337,6 @@ unsafe fn convolve_vertical_part_avx( } } - store_0 = _mm256_max_epi32(store_0, zeros); - let low_16 = _mm256_packus_epi32(_mm256_srai_epi32::(store_0), zeros); let item = _mm256_packus_epi16(low_16, low_16); diff --git a/src/avx2/vertical_u8_lp.rs b/src/avx2/vertical_u8_lp.rs index af4ebc3..48ea85e 100644 --- a/src/avx2/vertical_u8_lp.rs +++ b/src/avx2/vertical_u8_lp.rs @@ -53,8 +53,9 @@ unsafe fn m256dot( row: __m256i, weight: __m256i, ) -> (__m256i, __m256i) { - let lo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(row)); - let hi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(row)); + let zeros = _mm256_setzero_si256(); + let lo = _mm256_unpacklo_epi8(row, zeros); + let hi = _mm256_unpackhi_epi8(row, zeros); let store0 = _mm256_add_epi16( store0, @@ -216,8 +217,8 @@ unsafe fn convolve_vertical_avx2_row_impl( let rebased2 = _mm256_srai_epi16::(store2); let rebased3 = _mm256_srai_epi16::(store3); - let shrank0 = avx2_pack_u16(rebased0, rebased1); - let shrank1 = avx2_pack_u16(rebased2, rebased3); + let shrank0 = _mm256_packus_epi16(rebased0, rebased1); + let shrank1 = _mm256_packus_epi16(rebased2, rebased3); _mm256_storeu_si256(dst.as_mut_ptr() as *mut __m256i, shrank0); _mm256_storeu_si256( @@ -316,7 +317,7 @@ unsafe fn convolve_vertical_avx2_row_impl( let rebased0 = _mm256_srai_epi16::(store0); let rebased1 = _mm256_srai_epi16::(store1); - let shrank0 = avx2_pack_u16(rebased0, rebased1); + let shrank0 = _mm256_packus_epi16(rebased0, rebased1); _mm256_storeu_si256(dst.as_mut_ptr() as *mut __m256i, shrank0); cx += 32; @@ -469,7 +470,7 @@ unsafe fn convolve_vertical_avx2_row_impl( let rebased = _mm_srai_epi16::(store); let shrank = _mm_packus_epi16(rebased, rebased); - std::ptr::copy_nonoverlapping(&shrank as *const _ as *const u8, dst.as_mut_ptr(), 8); + _mm_storeu_si64(dst.as_mut_ptr(), shrank); cx += 8; } diff --git a/src/neon/rgb_f32.rs b/src/neon/rgb_f32.rs index f1a5325..d0f22a2 100644 --- a/src/neon/rgb_f32.rs +++ b/src/neon/rgb_f32.rs @@ -35,10 +35,8 @@ use crate::neon::utils::{prefer_vfmaq_f32, prefer_vfmaq_lane_f32}; macro_rules! 
write_rgb_f32 { ($store: expr, $dest_ptr: expr) => {{ - let l1 = vgetq_lane_u64::<0>(vreinterpretq_u64_f32($store)); - let l3 = vgetq_lane_f32::<2>($store); - ($dest_ptr as *mut u64).write_unaligned(l1); - $dest_ptr.add(2).write_unaligned(l3); + vst1_f32($dest_ptr, vget_low_f32($store)); + vst1q_lane_f32::<2>($dest_ptr.add(2), $store); }}; } diff --git a/src/neon/rgb_u8.rs b/src/neon/rgb_u8.rs index 44c0862..af240f5 100644 --- a/src/neon/rgb_u8.rs +++ b/src/neon/rgb_u8.rs @@ -102,11 +102,11 @@ unsafe fn conv_horiz_rgba_1_u8( unsafe fn write_accumulator_u8(store: int32x4_t, dst: &mut [u8]) { let store_16 = vqshrun_n_s32::(store); let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); - let bytes = pixel.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (dst.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *dst.get_unchecked_mut(2) = bytes[2]; + vst1_lane_u16::<0>( + dst.as_mut_ptr() as *mut u16, + vreinterpret_u16_u8(store_16_8), + ); + vst1_lane_u8::<2>(dst.as_mut_ptr().add(2), store_16_8); } pub(crate) fn convolve_horizontal_rgb_neon_rows_4( diff --git a/src/neon/rgba_u8.rs b/src/neon/rgba_u8.rs index d4fa251..b2a4fb9 100644 --- a/src/neon/rgba_u8.rs +++ b/src/neon/rgba_u8.rs @@ -433,21 +433,22 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8( let store_16_8_2 = vqmovn_u16(vcombine_u16(store_16_2, store_16_2)); let store_16_8 = vqmovn_u16(vcombine_u16(store_16_3, store_16_3)); - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_0)); - let dest_ptr_32 = chunk0.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); - - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_1)); - let dest_ptr_32 = chunk1.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); - - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_2)); - let dest_ptr_32 = chunk2.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); - - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); - let dest_ptr_32 = chunk3.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); + vst1_lane_u32::<0>( + chunk0.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8_0), + ); + vst1_lane_u32::<0>( + chunk1.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8_1), + ); + vst1_lane_u32::<0>( + chunk2.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8_2), + ); + vst1_lane_u32::<0>( + chunk3.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8), + ); } } } @@ -585,8 +586,9 @@ unsafe fn convolve_horizontal_rgba_neon_row_i16_impl( let store_16 = vshr_n_s16::(store); let store_16_8 = vqmovun_s16(vcombine_s16(store_16, store_16)); - let value = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); - let dest_ptr_32 = dst.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(value); + vst1_lane_u32::<0>( + dst.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8), + ); } } diff --git a/src/neon/vertical_u8.rs b/src/neon/vertical_u8.rs index 1e01da3..2a0f44a 100644 --- a/src/neon/vertical_u8.rs +++ b/src/neon/vertical_u8.rs @@ -1216,9 +1216,7 @@ fn convolve_vertical_neon_row_full( let low_16 = vcombine_u16(shrinked_store, shrinked_store); let item = vqmovn_u16(low_16); - - let value = vget_lane_u8::<0>(item); - *dst = value; + vst1_lane_u8::<0>(dst, item); cx += 1; } } diff --git a/src/sse/alpha_u16.rs b/src/sse/alpha_u16.rs index 9cde8aa..38e79e4 100644 --- a/src/sse/alpha_u16.rs +++ b/src/sse/alpha_u16.rs @@ -46,7 +46,7 @@ 
unsafe fn sse_unpremultiply_row_u16( a_hi_f: __m128, ) -> __m128i { let zeros = _mm_setzero_si128(); - let lo = _mm_cvtepu16_epi32(x); + let lo = _mm_unpacklo_epi16(x, zeros); let hi = _mm_unpackhi_epi16(x, zeros); const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; @@ -119,7 +119,10 @@ unsafe fn unpremultiply_alpha_sse_rgba_u16_row_impl(in_place: &mut [u16], bit_de let is_zero_mask = _mm_cmpeq_epi16(aaaa, _mm_setzero_si128()); let a_lo_f = _mm_mul_ps( - _mm_rcp_ps(_mm_cvtepi32_ps(_mm_cvtepu16_epi32(aaaa))), + _mm_rcp_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16( + aaaa, + _mm_setzero_si128(), + ))), v_max_colors, ); let a_hi_f = _mm_mul_ps( @@ -183,7 +186,7 @@ unsafe fn sse_premultiply_row_u16( v_max_colors_scale: __m128, ) -> __m128i { let zeros = _mm_setzero_si128(); - let lo = _mm_cvtepu16_epi32(x); + let lo = _mm_unpacklo_epi16(x, zeros); let hi = _mm_unpackhi_epi16(x, zeros); let new_lo = _mm_cvtps_epi32(_mm_mul_ps( @@ -241,7 +244,7 @@ unsafe fn premultiply_alpha_sse_rgba_u16_row_impl(dst: &mut [u16], src: &[u16], let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - let a_lo_f = _mm_cvtepu16_epi32(aaaa); + let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); let new_rrrr = _mm_packus_epi32( @@ -276,7 +279,7 @@ unsafe fn premultiply_alpha_sse_rgba_u16_row_impl(dst: &mut [u16], src: &[u16], let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - let a_lo_f = _mm_cvtepu16_epi32(aaaa); + let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); let new_rrrr = _mm_packus_epi32( @@ -311,7 +314,7 @@ unsafe fn premultiply_alpha_sse_rgba_u16_row_impl(dst: &mut [u16], src: &[u16], let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - let a_lo_f = _mm_cvtepu16_epi32(aaaa); + let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); let new_rrrr = _mm_packus_epi32( @@ -365,7 +368,7 @@ unsafe fn premultiply_alpha_sse_rgba_u16_row_impl(dst: &mut [u16], src: &[u16], let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - let a_lo_f = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(aaaa)); + let a_lo_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(aaaa, _mm_setzero_si128())); let a_hi_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(aaaa, _mm_setzero_si128())); let new_rrrr = sse_premultiply_row_u16(rrrr, a_lo_f, a_hi_f, v_max_colors_scale); diff --git a/src/sse/alpha_u8.rs b/src/sse/alpha_u8.rs index f194299..32f01cd 100644 --- a/src/sse/alpha_u8.rs +++ b/src/sse/alpha_u8.rs @@ -61,7 +61,7 @@ pub(crate) unsafe fn _mm_div_by_255_epi16(v: __m128i) -> __m128i { #[inline(always)] pub(crate) unsafe fn sse_unpremultiply_row(x: __m128i, a: __m128i) -> __m128i { let zeros = _mm_setzero_si128(); - let lo = _mm_cvtepu8_epi16(x); + let lo = _mm_unpacklo_epi8(x, zeros); let hi = _mm_unpackhi_epi8(x, zeros); let scale = _mm_set1_epi16(255); @@ -71,15 +71,15 @@ pub(crate) unsafe fn sse_unpremultiply_row(x: __m128i, a: __m128i) -> __m128i { let scale_ps = _mm_set1_ps(255f32); - let lo_lo = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(lo)), scale_ps); + let lo_lo = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(lo, zeros)), scale_ps); let lo_hi = 
_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(lo, zeros)), scale_ps); - let hi_lo = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(hi)), scale_ps); + let hi_lo = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(hi, zeros)), scale_ps); let hi_hi = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(hi, zeros)), scale_ps); - let a_lo = _mm_cvtepu8_epi16(a); + let a_lo = _mm_unpacklo_epi8(a, zeros); let a_hi = _mm_unpackhi_epi8(a, zeros); - let a_lo_lo = _mm_rcp_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_lo))); + let a_lo_lo = _mm_rcp_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(a_lo, zeros))); let a_lo_hi = _mm_rcp_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(a_lo, zeros))); - let a_hi_lo = _mm_rcp_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_hi))); + let a_hi_lo = _mm_rcp_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(a_hi, zeros))); let a_hi_hi = _mm_rcp_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(a_hi, zeros))); const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; @@ -124,16 +124,16 @@ unsafe fn sse_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - let mut rrr_low = _mm_cvtepu8_epi16(rrr); + let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); - let mut ggg_low = _mm_cvtepu8_epi16(ggg); + let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); - let mut bbb_low = _mm_cvtepu8_epi16(bbb); + let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); - let aaa_low = _mm_cvtepu8_epi16(aaa); + let aaa_low = _mm_unpacklo_epi8(aaa, zeros); let aaa_high = _mm_unpackhi_epi8(aaa, zeros); rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); diff --git a/src/sse/plane_u8.rs b/src/sse/plane_u8.rs index 6f275f2..039f653 100644 --- a/src/sse/plane_u8.rs +++ b/src/sse/plane_u8.rs @@ -39,7 +39,7 @@ use crate::support::{PRECISION, ROUNDING_CONST}; macro_rules! 
s_accumulate_8_horiz { ($store: expr, $ptr: expr, $weights: expr) => {{ let pixel_colors = _mm_loadu_si64($ptr); - let px_16 = _mm_cvtepu8_epi16(pixel_colors); + let px_16 = _mm_unpacklo_epi8(pixel_colors, _mm_setzero_si128()); let px_lo = _mm_unpacklo_epi16(px_16, _mm_setzero_si128()); let px_hi = _mm_unpackhi_epi16(px_16, _mm_setzero_si128()); diff --git a/src/sse/rgb_f32.rs b/src/sse/rgb_f32.rs index 26641f4..bca2a31 100644 --- a/src/sse/rgb_f32.rs +++ b/src/sse/rgb_f32.rs @@ -28,7 +28,7 @@ */ use crate::filter_weights::FilterWeights; -use crate::sse::{_mm_extract_epi64x, _mm_prefer_fma_ps, load_4_weights, shuffle}; +use crate::sse::{_mm_prefer_fma_ps, load_4_weights, shuffle}; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] @@ -231,7 +231,7 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f32_impl( let px = x * CHANNELS; let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); - (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store))); + _mm_storeu_si64(dest_ptr as *mut u8, _mm_castps_si128(store)); (dest_ptr as *mut i32) .add(2) .write_unaligned(_mm_extract_ps::<2>(store)); @@ -452,25 +452,25 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_impl( let px = x * CHANNELS; let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); - (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store_0))); + _mm_storeu_si64(dest_ptr as *mut u8, _mm_castps_si128(store_0)); (dest_ptr as *mut i32) .add(2) .write_unaligned(_mm_extract_ps::<2>(store_0)); let dest_ptr = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); - (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store_1))); + _mm_storeu_si64(dest_ptr as *mut u8, _mm_castps_si128(store_1)); (dest_ptr as *mut i32) .add(2) .write_unaligned(_mm_extract_ps::<2>(store_1)); let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); - (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store_2))); + _mm_storeu_si64(dest_ptr as *mut u8, _mm_castps_si128(store_2)); (dest_ptr as *mut i32) .add(2) .write_unaligned(_mm_extract_ps::<2>(store_2)); let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); - (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store_3))); + _mm_storeu_si64(dest_ptr as *mut u8, _mm_castps_si128(store_3)); (dest_ptr as *mut i32) .add(2) .write_unaligned(_mm_extract_ps::<2>(store_3)); diff --git a/src/sse/rgba_f16.rs b/src/sse/rgba_f16.rs index f29b541..d032a21 100644 --- a/src/sse/rgba_f16.rs +++ b/src/sse/rgba_f16.rs @@ -265,11 +265,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f16_impl(store); - std::ptr::copy_nonoverlapping( - &converted_f16 as *const _ as *const u8, - dest_ptr as *mut u8, - 8, - ); + _mm_storeu_si64(dest_ptr as *mut u8, converted_f16); filter_offset += filter_weights.aligned_size; } @@ -536,29 +532,11 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_impl(store_1); let converted_f16_2 = _mm_cvtps_phx::(store_2); let converted_f16_3 = _mm_cvtps_phx::(store_3); - std::ptr::copy_nonoverlapping( - &converted_f16_0 as *const _ as *const u8, - dest_ptr0 as *mut u8, - 8, - ); - - std::ptr::copy_nonoverlapping( - &converted_f16_1 as *const _ as *const u8, - dest_ptr1 as *mut u8, - 8, - ); - - std::ptr::copy_nonoverlapping( - &converted_f16_2 as *const _ as *const u8, - dest_ptr2 as *mut u8, - 8, - ); - - std::ptr::copy_nonoverlapping( - &converted_f16_3 as *const _ as *const u8, - dest_ptr3 as *mut 
u8, - 8, - ); + + _mm_storeu_si64(dest_ptr0 as *mut u8, converted_f16_0); + _mm_storeu_si64(dest_ptr1 as *mut u8, converted_f16_1); + _mm_storeu_si64(dest_ptr2 as *mut u8, converted_f16_2); + _mm_storeu_si64(dest_ptr3 as *mut u8, converted_f16_3); filter_offset += filter_weights.aligned_size; } diff --git a/src/sse/rgba_u16.rs b/src/sse/rgba_u16.rs index 79ec664..969e85e 100644 --- a/src/sse/rgba_u16.rs +++ b/src/sse/rgba_u16.rs @@ -369,26 +369,10 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_u16_impl( let store_16_2 = _mm_packus_epi32(v_st2, v_st2); let store_16_3 = _mm_packus_epi32(v_st3, v_st3); - std::ptr::copy_nonoverlapping( - &store_16_0 as *const _ as *const u8, - chunk0.as_mut_ptr() as *mut u8, - 8, - ); - std::ptr::copy_nonoverlapping( - &store_16_1 as *const _ as *const u8, - chunk1.as_mut_ptr() as *mut u8, - 8, - ); - std::ptr::copy_nonoverlapping( - &store_16_2 as *const _ as *const u8, - chunk2.as_mut_ptr() as *mut u8, - 8, - ); - std::ptr::copy_nonoverlapping( - &store_16_3 as *const _ as *const u8, - chunk3.as_mut_ptr() as *mut u8, - 8, - ); + _mm_storeu_si64(chunk0.as_mut_ptr() as *mut u8, store_16_0); + _mm_storeu_si64(chunk1.as_mut_ptr() as *mut u8, store_16_1); + _mm_storeu_si64(chunk2.as_mut_ptr() as *mut u8, store_16_2); + _mm_storeu_si64(chunk3.as_mut_ptr() as *mut u8, store_16_3); } } @@ -502,11 +486,6 @@ unsafe fn convolve_horizontal_rgba_sse_u16_row_impl( ); let store_16_0 = _mm_packus_epi32(v_st, v_st); - - std::ptr::copy_nonoverlapping( - &store_16_0 as *const _ as *const u8, - dst.as_mut_ptr() as *mut u8, - 8, - ); + _mm_storeu_si64(dst.as_mut_ptr() as *mut u8, store_16_0); } } diff --git a/src/sse/rgba_u16_lb.rs b/src/sse/rgba_u16_lb.rs index f2d5974..d295c05 100644 --- a/src/sse/rgba_u16_lb.rs +++ b/src/sse/rgba_u16_lb.rs @@ -192,7 +192,6 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_lb_u8_impl( bit_depth: u32, ) { const CHANNELS: usize = 4; - let zeros = _mm_setzero_si128(); let init = _mm_set1_epi32(ROUNDING_CONST); let v_max_colors = _mm_set1_epi16((1 << bit_depth) - 1); @@ -287,36 +286,20 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_lb_u8_impl( jx += 1; } - let v_st0 = _mm_srai_epi32::(_mm_max_epi32(store_0, zeros)); - let v_st1 = _mm_srai_epi32::(_mm_max_epi32(store_1, zeros)); - let v_st2 = _mm_srai_epi32::(_mm_max_epi32(store_2, zeros)); - let v_st3 = _mm_srai_epi32::(_mm_max_epi32(store_3, zeros)); + let v_st0 = _mm_srai_epi32::(store_0); + let v_st1 = _mm_srai_epi32::(store_1); + let v_st2 = _mm_srai_epi32::(store_2); + let v_st3 = _mm_srai_epi32::(store_3); let store_16_0 = _mm_min_epi16(_mm_packus_epi32(v_st0, v_st0), v_max_colors); let store_16_1 = _mm_min_epi16(_mm_packus_epi32(v_st1, v_st1), v_max_colors); let store_16_2 = _mm_min_epi16(_mm_packus_epi32(v_st2, v_st2), v_max_colors); let store_16_3 = _mm_min_epi16(_mm_packus_epi32(v_st3, v_st3), v_max_colors); - std::ptr::copy_nonoverlapping( - &store_16_0 as *const _ as *const u8, - chunk0.as_mut_ptr() as *mut u8, - 8, - ); - std::ptr::copy_nonoverlapping( - &store_16_1 as *const _ as *const u8, - chunk1.as_mut_ptr() as *mut u8, - 8, - ); - std::ptr::copy_nonoverlapping( - &store_16_2 as *const _ as *const u8, - chunk2.as_mut_ptr() as *mut u8, - 8, - ); - std::ptr::copy_nonoverlapping( - &store_16_3 as *const _ as *const u8, - chunk3.as_mut_ptr() as *mut u8, - 8, - ); + _mm_storeu_si64(chunk0.as_mut_ptr() as *mut u8, store_16_0); + _mm_storeu_si64(chunk1.as_mut_ptr() as *mut u8, store_16_1); + _mm_storeu_si64(chunk2.as_mut_ptr() as *mut u8, store_16_2); + 
_mm_storeu_si64(chunk3.as_mut_ptr() as *mut u8, store_16_3); } } @@ -340,7 +323,6 @@ unsafe fn convolve_horizontal_rgba_sse_u16_lb_row_impl( ) { const CHANNELS: usize = 4; - let zeros = _mm_setzero_si128(); let v_max_colors = _mm_set1_epi16((1 << bit_depth) - 1); for ((dst, bounds), weights) in dst @@ -401,14 +383,9 @@ unsafe fn convolve_horizontal_rgba_sse_u16_lb_row_impl( jx += 1; } - let v_st = _mm_srai_epi32::(_mm_max_epi32(store, zeros)); + let v_st = _mm_srai_epi32::(store); let store_16_0 = _mm_min_epi16(_mm_packus_epi32(v_st, v_st), v_max_colors); - - std::ptr::copy_nonoverlapping( - &store_16_0 as *const _ as *const u8, - dst.as_mut_ptr() as *mut u8, - 8, - ); + _mm_storeu_si64(dst.as_mut_ptr() as *mut u8, store_16_0); } } diff --git a/src/sse/rgba_u8.rs b/src/sse/rgba_u8.rs index e41d35f..c746c33 100644 --- a/src/sse/rgba_u8.rs +++ b/src/sse/rgba_u8.rs @@ -47,7 +47,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_sse( let src_ptr_32 = src_ptr.as_ptr() as *const i32; let rgba_pixel = _mm_cvtsi32_si128(src_ptr_32.read_unaligned()); - let lo = _mm_cvtepu8_epi16(rgba_pixel); + let lo = _mm_unpacklo_epi8(rgba_pixel, _mm_setzero_si128()); _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(lo), weight0)) } @@ -224,22 +224,22 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( let store_16_8_2 = compress_i32(store_2); let store_16_8_3 = compress_i32(store_3); - let pixel_0 = _mm_extract_epi32::<0>(store_16_8_0); - let pixel_1 = _mm_extract_epi32::<0>(store_16_8_1); - let pixel_2 = _mm_extract_epi32::<0>(store_16_8_2); - let pixel_3 = _mm_extract_epi32::<0>(store_16_8_3); - - let dest_ptr = chunk0.as_mut_ptr() as *mut i32; - dest_ptr.write_unaligned(pixel_0); - - let dest_ptr = chunk1.as_mut_ptr() as *mut i32; - dest_ptr.write_unaligned(pixel_1); - - let dest_ptr = chunk2.as_mut_ptr() as *mut i32; - dest_ptr.write_unaligned(pixel_2); - - let dest_ptr = chunk3.as_mut_ptr() as *mut i32; - dest_ptr.write_unaligned(pixel_3); + _mm_storeu_si32( + chunk0.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_0, store_16_8_0), + ); + _mm_storeu_si32( + chunk1.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_1, store_16_8_1), + ); + _mm_storeu_si32( + chunk2.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_2, store_16_8_2), + ); + _mm_storeu_si32( + chunk3.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_3, store_16_8_3), + ); } } } @@ -342,9 +342,9 @@ unsafe fn convolve_horizontal_rgba_sse_rows_one_impl( } let store_16_8 = compress_i32(store); - let pixel = _mm_extract_epi32::<0>(store_16_8); - - let dest_ptr_32 = dst.as_mut_ptr() as *mut i32; - dest_ptr_32.write_unaligned(pixel); + _mm_storeu_si32( + dst.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8, store_16_8), + ); } } diff --git a/src/sse/rgba_u8_lb.rs b/src/sse/rgba_u8_lb.rs index 1cef21a..9f7ffe9 100644 --- a/src/sse/rgba_u8_lb.rs +++ b/src/sse/rgba_u8_lb.rs @@ -45,7 +45,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_sse( let src_ptr_32 = src_ptr.as_ptr() as *const i32; let rgba_pixel = _mm_cvtsi32_si128(src_ptr_32.read_unaligned()); - let lo = _mm_slli_epi16::(_mm_cvtepu8_epi16(rgba_pixel)); + let lo = _mm_slli_epi16::(_mm_unpacklo_epi8(rgba_pixel, _mm_setzero_si128())); _mm_add_epi16(store_0, _mm_mulhi_epi16(lo, weight0)) } @@ -128,8 +128,6 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( const ROUNDING: i16 = 1 << (SCALE - 1); const V_SHR: i32 = SCALE - 1; - let zeros = _mm_setzero_si128(); - let vld = _mm_set1_epi16(ROUNDING); let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 
3, 2, 3, 2, 3); @@ -360,27 +358,27 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( jx += 1; } - let store_16_8_0 = _mm_srai_epi16::(_mm_max_epi16(store_0, zeros)); - let store_16_8_1 = _mm_srai_epi16::(_mm_max_epi16(store_1, zeros)); - let store_16_8_2 = _mm_srai_epi16::(_mm_max_epi16(store_2, zeros)); - let store_16_8_3 = _mm_srai_epi16::(_mm_max_epi16(store_3, zeros)); - - let pixel_0 = _mm_extract_epi32::<0>(_mm_packus_epi16(store_16_8_0, store_16_8_0)); - let pixel_1 = _mm_extract_epi32::<0>(_mm_packus_epi16(store_16_8_1, store_16_8_1)); - let pixel_2 = _mm_extract_epi32::<0>(_mm_packus_epi16(store_16_8_2, store_16_8_2)); - let pixel_3 = _mm_extract_epi32::<0>(_mm_packus_epi16(store_16_8_3, store_16_8_3)); - - let dest_ptr = chunk0.as_mut_ptr() as *mut i32; - dest_ptr.write_unaligned(pixel_0); + let store_16_8_0 = _mm_srai_epi16::(store_0); + let store_16_8_1 = _mm_srai_epi16::(store_1); + let store_16_8_2 = _mm_srai_epi16::(store_2); + let store_16_8_3 = _mm_srai_epi16::(store_3); - let dest_ptr = chunk1.as_mut_ptr() as *mut i32; - dest_ptr.write_unaligned(pixel_1); - - let dest_ptr = chunk2.as_mut_ptr() as *mut i32; - dest_ptr.write_unaligned(pixel_2); - - let dest_ptr = chunk3.as_mut_ptr() as *mut i32; - dest_ptr.write_unaligned(pixel_3); + _mm_storeu_si32( + chunk0.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_0, store_16_8_0), + ); + _mm_storeu_si32( + chunk1.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_1, store_16_8_1), + ); + _mm_storeu_si32( + chunk2.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_2, store_16_8_2), + ); + _mm_storeu_si32( + chunk3.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_3, store_16_8_3), + ); } } } @@ -405,8 +403,6 @@ unsafe fn convolve_horizontal_rgba_sse_rows_one_impl( let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3); - let zeros = _mm_setzero_si128(); - const SCALE: i32 = 6; const ROUNDING: i16 = 1 << (SCALE - 1); const V_SHR: i32 = SCALE - 1; @@ -520,10 +516,10 @@ unsafe fn convolve_horizontal_rgba_sse_rows_one_impl( jx += 1; } - let store_16_8 = _mm_srai_epi16::(_mm_max_epi16(store, zeros)); - let pixel = _mm_extract_epi32::<0>(_mm_packus_epi16(store_16_8, store_16_8)); - - let dest_ptr_32 = dst.as_mut_ptr() as *mut i32; - dest_ptr_32.write_unaligned(pixel); + let store_16_8 = _mm_srai_epi16::(store); + _mm_storeu_si32( + dst.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8, store_16_8), + ); } } diff --git a/src/sse/u8_utils.rs b/src/sse/u8_utils.rs index 3605d2a..7b6e5ec 100644 --- a/src/sse/u8_utils.rs +++ b/src/sse/u8_utils.rs @@ -36,7 +36,7 @@ use crate::support::PRECISION; #[inline(always)] pub(crate) fn compress_i32(x: __m128i) -> __m128i { - let store_32 = unsafe { _mm_srai_epi32::(_mm_max_epi32(x, _mm_setzero_si128())) }; + let store_32 = unsafe { _mm_srai_epi32::(x) }; let store_16 = unsafe { _mm_packus_epi32(store_32, store_32) }; unsafe { _mm_packus_epi16(store_16, store_16) } } @@ -57,6 +57,9 @@ pub(crate) unsafe fn convolve_horizontal_parts_one_sse_rgb( 0, ]); let m_vl = _mm_cvtsi32_si128(vl); - let lo = _mm_cvtepu8_epi16(m_vl); - _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(lo), weight0)) + let lo = _mm_unpacklo_epi8(m_vl, _mm_setzero_si128()); + _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(lo, _mm_setzero_si128()), weight0), + ) } diff --git a/src/sse/vertical_f16.rs b/src/sse/vertical_f16.rs index 6d7ca93..f6a616b 100644 --- a/src/sse/vertical_f16.rs +++ b/src/sse/vertical_f16.rs @@ -95,7 +95,7 @@ pub(crate) unsafe fn 
convolve_vertical_part_sse_4_f16(store_0); - std::ptr::copy_nonoverlapping(&acc as *const _ as *const u8, dst_ptr as *mut u8, 8); + _mm_storeu_si64(dst_ptr as *mut u8, acc); } #[inline(always)] diff --git a/src/sse/vertical_u16.rs b/src/sse/vertical_u16.rs index 2fde19b..731bbad 100644 --- a/src/sse/vertical_u16.rs +++ b/src/sse/vertical_u16.rs @@ -110,295 +110,35 @@ unsafe fn convolve_column_lb_u16_impl( let v_dx = v_px + x * 16; - if bounds_size == 2 { - let weights = weight.get_unchecked(0..2); - let weight0 = weights[0]; - let weight1 = weights[1]; - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - - let v_weight0 = _mm_set1_ps(weight0); - let v_weight1 = _mm_set1_ps(weight1); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr0.as_ptr().add(8) as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row0, zeros)), - v_weight0, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row0, zeros)), - v_weight0, - ); - store2 = _mm_prefer_fma_ps::( - store2, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row1, zeros)), - v_weight0, - ); - store3 = _mm_prefer_fma_ps::( - store3, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row1, zeros)), - v_weight0, - ); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = _mm_loadu_si128(src_ptr1.as_ptr().add(8) as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row10, zeros)), - v_weight1, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row10, zeros)), - v_weight1, - ); - store2 = _mm_prefer_fma_ps::( - store2, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row11, zeros)), - v_weight1, - ); - store3 = _mm_prefer_fma_ps::( - store3, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row11, zeros)), - v_weight1, - ); - } else if bounds_size == 3 { - let weights = weight.get_unchecked(0..3); - let weight0 = weights[0]; - let weight1 = weights[1]; - let weight2 = weights[2]; + for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { + let py = bounds.start + j; + let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_dx)..); - - let v_weight0 = _mm_set1_ps(weight0); - let v_weight1 = _mm_set1_ps(weight1); - let v_weight2 = _mm_set1_ps(weight2); + let v_weight = _mm_set1_ps(k_weight); - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr0.as_ptr().add(8) as *const __m128i); + let item_row0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); + let item_row1 = _mm_loadu_si128(src_ptr.as_ptr().add(8) as *const __m128i); store0 = _mm_prefer_fma_ps::( store0, _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row0, zeros)), - v_weight0, + v_weight, ); store1 = _mm_prefer_fma_ps::( store1, _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row0, zeros)), - v_weight0, + v_weight, ); store2 = _mm_prefer_fma_ps::( store2, _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row1, zeros)), - v_weight0, + v_weight, ); store3 = _mm_prefer_fma_ps::( store3, _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row1, zeros)), - v_weight0, - ); - - let 
item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = _mm_loadu_si128(src_ptr1.as_ptr().add(8) as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row10, zeros)), - v_weight1, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row10, zeros)), - v_weight1, - ); - store2 = _mm_prefer_fma_ps::( - store2, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row11, zeros)), - v_weight1, - ); - store3 = _mm_prefer_fma_ps::( - store3, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row11, zeros)), - v_weight1, - ); - - let item_row20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row21 = _mm_loadu_si128(src_ptr2.as_ptr().add(8) as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row20, zeros)), - v_weight2, + v_weight, ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row20, zeros)), - v_weight2, - ); - store2 = _mm_prefer_fma_ps::( - store2, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row21, zeros)), - v_weight2, - ); - store3 = _mm_prefer_fma_ps::( - store3, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row21, zeros)), - v_weight2, - ); - } else if bounds_size == 4 { - let weights = weight.get_unchecked(0..4); - let weight0 = weights[0]; - let weight1 = weights[1]; - let weight2 = weights[2]; - let weight3 = weights[3]; - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_dx)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + v_dx)..); - - let v_weight0 = _mm_set1_ps(weight0); - let v_weight1 = _mm_set1_ps(weight1); - let v_weight2 = _mm_set1_ps(weight2); - let v_weight3 = _mm_set1_ps(weight3); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr0.as_ptr().add(8) as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row0, zeros)), - v_weight0, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row0, zeros)), - v_weight0, - ); - store2 = _mm_prefer_fma_ps::( - store2, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row1, zeros)), - v_weight0, - ); - store3 = _mm_prefer_fma_ps::( - store3, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row1, zeros)), - v_weight0, - ); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = _mm_loadu_si128(src_ptr1.as_ptr().add(8) as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row10, zeros)), - v_weight1, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row10, zeros)), - v_weight1, - ); - store2 = _mm_prefer_fma_ps::( - store2, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row11, zeros)), - v_weight1, - ); - store3 = _mm_prefer_fma_ps::( - store3, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row11, zeros)), - v_weight1, - ); - - let item_row20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row21 = _mm_loadu_si128(src_ptr2.as_ptr().add(8) as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row20, zeros)), - v_weight2, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row20, zeros)), - v_weight2, - ); 
- store2 = _mm_prefer_fma_ps::( - store2, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row21, zeros)), - v_weight2, - ); - store3 = _mm_prefer_fma_ps::( - store3, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row21, zeros)), - v_weight2, - ); - - let item_row30 = _mm_loadu_si128(src_ptr3.as_ptr() as *const __m128i); - let item_row31 = _mm_loadu_si128(src_ptr3.as_ptr().add(8) as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row30, zeros)), - v_weight3, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row30, zeros)), - v_weight3, - ); - store2 = _mm_prefer_fma_ps::( - store2, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row31, zeros)), - v_weight3, - ); - store3 = _mm_prefer_fma_ps::( - store3, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row31, zeros)), - v_weight3, - ); - } else { - for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { - let py = bounds.start + j; - let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - - let v_weight = _mm_set1_ps(k_weight); - - let item_row0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr.as_ptr().add(8) as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row0, zeros)), - v_weight, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row0, zeros)), - v_weight, - ); - store2 = _mm_prefer_fma_ps::( - store2, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row1, zeros)), - v_weight, - ); - store3 = _mm_prefer_fma_ps::( - store3, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row1, zeros)), - v_weight, - ); - } } let v_st0 = _mm_min_epi32( @@ -438,179 +178,24 @@ unsafe fn convolve_column_lb_u16_impl( let v_dx = v_px + x * 8; - if bounds_size == 2 { - let weights = weight.get_unchecked(0..2); - let weight0 = weights[0]; - let weight1 = weights[1]; - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - - let v_weight0 = _mm_set1_ps(weight0); - let v_weight1 = _mm_set1_ps(weight1); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row0, zeros)), - v_weight0, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row0, zeros)), - v_weight0, - ); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row1, zeros)), - v_weight1, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row1, zeros)), - v_weight1, - ); - } else if bounds_size == 3 { - let weights = weight.get_unchecked(0..3); - let weight0 = weights[0]; - let weight1 = weights[1]; - let weight2 = weights[2]; - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_dx)..); - - let v_weight0 = _mm_set1_ps(weight0); - let v_weight1 = _mm_set1_ps(weight1); - let v_weight2 = _mm_set1_ps(weight2); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row2 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - - store0 = 
_mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row0, zeros)), - v_weight0, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row0, zeros)), - v_weight0, - ); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row1, zeros)), - v_weight1, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row1, zeros)), - v_weight1, - ); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row2, zeros)), - v_weight2, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row2, zeros)), - v_weight2, - ); - } else if bounds_size == 4 { - let weights = weight.get_unchecked(0..4); - let weight0 = weights[0]; - let weight1 = weights[1]; - let weight2 = weights[2]; - let weight3 = weights[3]; - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_dx)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + v_dx)..); + for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { + let py = bounds.start + j; + let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - let v_weight0 = _mm_set1_ps(weight0); - let v_weight1 = _mm_set1_ps(weight1); - let v_weight2 = _mm_set1_ps(weight2); - let v_weight3 = _mm_set1_ps(weight3); + let v_weight = _mm_set1_ps(k_weight); - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row2 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row3 = _mm_loadu_si128(src_ptr3.as_ptr() as *const __m128i); + let item_row = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); store0 = _mm_prefer_fma_ps::( store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row0, zeros)), - v_weight0, + _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row, zeros)), + v_weight, ); store1 = _mm_prefer_fma_ps::( store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row0, zeros)), - v_weight0, + _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row, zeros)), + v_weight, ); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row1, zeros)), - v_weight1, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row1, zeros)), - v_weight1, - ); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row2, zeros)), - v_weight2, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row2, zeros)), - v_weight2, - ); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row3, zeros)), - v_weight3, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row3, zeros)), - v_weight3, - ); - } else { - for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { - let py = bounds.start + j; - let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - - let v_weight = _mm_set1_ps(k_weight); - - let item_row = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row, zeros)), - v_weight, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row, zeros)), - v_weight, - ); - } } let v_st0 = _mm_min_epi32( @@ -769,11 +354,7 @@ unsafe fn 
convolve_column_lb_u16_impl( ); let u_store0 = _mm_packus_epi32(v_st, v_st); - std::ptr::copy_nonoverlapping( - &u_store0 as *const _ as *const u8, - dst.as_mut_ptr() as *mut u8, - 8, - ); + _mm_storeu_si64(dst.as_mut_ptr() as *mut u8, u_store0); cx = v_dx; } diff --git a/src/sse/vertical_u16_lb.rs b/src/sse/vertical_u16_lb.rs index 9715cd7..bd7a053 100644 --- a/src/sse/vertical_u16_lb.rs +++ b/src/sse/vertical_u16_lb.rs @@ -80,252 +80,37 @@ unsafe fn convolve_column_lb_u16_impl( let v_dx = v_px + x * 16; - if bounds_size == 2 { - let weights = weight.get_unchecked(0..2); - - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr0.as_ptr().add(8) as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row0, zeros), v_weight0), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row0, zeros), v_weight0), - ); - store2 = _mm_add_epi32( - store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row1, zeros), v_weight0), - ); - store3 = _mm_add_epi32( - store3, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row1, zeros), v_weight0), - ); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = _mm_loadu_si128(src_ptr1.as_ptr().add(8) as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row10, zeros), v_weight1), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row10, zeros), v_weight1), - ); - store2 = _mm_add_epi32( - store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row11, zeros), v_weight1), - ); - store3 = _mm_add_epi32( - store3, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row11, zeros), v_weight1), - ); - } else if bounds_size == 3 { - let weights = weight.get_unchecked(0..3); - - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - let v_weight2 = _mm_set1_epi32(weights[2] as i32); - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_dx)..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr0.as_ptr().add(8) as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row0, zeros), v_weight0), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row0, zeros), v_weight0), - ); - store2 = _mm_add_epi32( - store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row1, zeros), v_weight0), - ); - store3 = _mm_add_epi32( - store3, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row1, zeros), v_weight0), - ); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = _mm_loadu_si128(src_ptr1.as_ptr().add(8) as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row10, zeros), v_weight1), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row10, zeros), v_weight1), - ); - store2 = _mm_add_epi32( - store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row11, zeros), 
v_weight1), - ); - store3 = _mm_add_epi32( - store3, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row11, zeros), v_weight1), - ); - - let item_row20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row21 = _mm_loadu_si128(src_ptr2.as_ptr().add(8) as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row20, zeros), v_weight2), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row20, zeros), v_weight2), - ); - store2 = _mm_add_epi32( - store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row21, zeros), v_weight2), - ); - store3 = _mm_add_epi32( - store3, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row21, zeros), v_weight2), - ); - } else if bounds_size == 4 { - let weights = weight.get_unchecked(0..4); - - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - let v_weight2 = _mm_set1_epi32(weights[2] as i32); - let v_weight3 = _mm_set1_epi32(weights[3] as i32); - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_dx)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + v_dx)..); + for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { + let py = bounds.start + j; + let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr0.as_ptr().add(8) as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row0, zeros), v_weight0), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row0, zeros), v_weight0), - ); - store2 = _mm_add_epi32( - store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row1, zeros), v_weight0), - ); - store3 = _mm_add_epi32( - store3, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row1, zeros), v_weight0), - ); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = _mm_loadu_si128(src_ptr1.as_ptr().add(8) as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row10, zeros), v_weight1), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row10, zeros), v_weight1), - ); - store2 = _mm_add_epi32( - store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row11, zeros), v_weight1), - ); - store3 = _mm_add_epi32( - store3, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row11, zeros), v_weight1), - ); - - let item_row20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row21 = _mm_loadu_si128(src_ptr2.as_ptr().add(8) as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row20, zeros), v_weight2), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row20, zeros), v_weight2), - ); - store2 = _mm_add_epi32( - store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row21, zeros), v_weight2), - ); - store3 = _mm_add_epi32( - store3, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row21, zeros), v_weight2), - ); + let v_weight = _mm_set1_epi32(k_weight as i32); - let item_row30 = _mm_loadu_si128(src_ptr3.as_ptr() as *const __m128i); - let item_row31 = _mm_loadu_si128(src_ptr3.as_ptr().add(8) as *const __m128i); + let item_row0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); + let item_row1 = 
_mm_loadu_si128(src_ptr.as_ptr().add(8) as *const __m128i); store0 = _mm_add_epi32( store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row30, zeros), v_weight3), + _mm_madd_epi16(_mm_unpacklo_epi16(item_row0, zeros), v_weight), ); store1 = _mm_add_epi32( store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row30, zeros), v_weight3), + _mm_madd_epi16(_mm_unpackhi_epi16(item_row0, zeros), v_weight), ); store2 = _mm_add_epi32( store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row31, zeros), v_weight3), + _mm_madd_epi16(_mm_unpacklo_epi16(item_row1, zeros), v_weight), ); store3 = _mm_add_epi32( store3, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row31, zeros), v_weight3), + _mm_madd_epi16(_mm_unpackhi_epi16(item_row1, zeros), v_weight), ); - } else { - for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { - let py = bounds.start + j; - let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - - let v_weight = _mm_set1_epi32(k_weight as i32); - - let item_row0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr.as_ptr().add(8) as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row0, zeros), v_weight), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row0, zeros), v_weight), - ); - store2 = _mm_add_epi32( - store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row1, zeros), v_weight), - ); - store3 = _mm_add_epi32( - store3, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row1, zeros), v_weight), - ); - } } - let v_st0 = _mm_srai_epi32::(_mm_max_epi32(store0, zeros)); - let v_st1 = _mm_srai_epi32::(_mm_max_epi32(store1, zeros)); - let v_st2 = _mm_srai_epi32::(_mm_max_epi32(store2, zeros)); - let v_st3 = _mm_srai_epi32::(_mm_max_epi32(store3, zeros)); + let v_st0 = _mm_srai_epi32::(store0); + let v_st1 = _mm_srai_epi32::(store1); + let v_st2 = _mm_srai_epi32::(store2); + let v_st3 = _mm_srai_epi32::(store3); let item0 = _mm_min_epi16(_mm_packus_epi32(v_st0, v_st1), v_max_colors); let item1 = _mm_min_epi16(_mm_packus_epi32(v_st2, v_st3), v_max_colors); @@ -347,160 +132,26 @@ unsafe fn convolve_column_lb_u16_impl( let v_dx = v_px + x * 8; - if bounds_size == 2 { - let weights = weight.get_unchecked(0..2); - - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row0, zeros), v_weight0), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row0, zeros), v_weight0), - ); - - let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row1, zeros), v_weight1), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row1, zeros), v_weight1), - ); - } else if bounds_size == 3 { - let weights = weight.get_unchecked(0..3); - - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - let v_weight2 = _mm_set1_epi32(weights[2] as i32); - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - let src_ptr2 = src.get_unchecked((src_stride 
* (py + 2) + v_dx)..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row0, zeros), v_weight0), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row0, zeros), v_weight0), - ); - - let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); + for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { + let py = bounds.start + j; + let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row1, zeros), v_weight1), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row1, zeros), v_weight1), - ); - - let item_row2 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row2, zeros), v_weight2), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row2, zeros), v_weight2), - ); - } else if bounds_size == 4 { - let weights = weight.get_unchecked(0..4); - - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - let v_weight2 = _mm_set1_epi32(weights[2] as i32); - let v_weight3 = _mm_set1_epi32(weights[3] as i32); - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_dx)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + v_dx)..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row0, zeros), v_weight0), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row0, zeros), v_weight0), - ); - - let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row1, zeros), v_weight1), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row1, zeros), v_weight1), - ); + let v_weight = _mm_set1_epi16(k_weight); - let item_row2 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); + let item_row = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); store0 = _mm_add_epi32( store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row2, zeros), v_weight2), + _mm_madd_epi16(_mm_unpacklo_epi16(item_row, zeros), v_weight), ); store1 = _mm_add_epi32( store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row2, zeros), v_weight2), + _mm_madd_epi16(_mm_unpackhi_epi16(item_row, zeros), v_weight), ); - - let item_row3 = _mm_loadu_si128(src_ptr3.as_ptr() as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row3, zeros), v_weight3), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row3, zeros), v_weight3), - ); - } else { - for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { - let py = bounds.start + j; - let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - - let v_weight = _mm_set1_epi16(k_weight); - - let item_row = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row, zeros), v_weight), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row, zeros), v_weight), - ); - } } 
- let v_st0 = _mm_srai_epi32::(_mm_max_epi32(store0, zeros)); - let v_st1 = _mm_srai_epi32::(_mm_max_epi32(store1, zeros)); + let v_st0 = _mm_srai_epi32::(store0); + let v_st1 = _mm_srai_epi32::(store1); let item = _mm_min_epi16(_mm_packus_epi32(v_st0, v_st1), v_max_colors); _mm_storeu_si128(dst.as_mut_ptr() as *mut __m128i, item); @@ -621,14 +272,10 @@ unsafe fn convolve_column_lb_u16_impl( } } - let v_st = _mm_srai_epi32::(_mm_max_epi32(store0, zeros)); + let v_st = _mm_srai_epi32::(store0); let u_store0 = _mm_min_epi16(_mm_packus_epi32(v_st, v_st), v_max_colors); - std::ptr::copy_nonoverlapping( - &u_store0 as *const _ as *const u8, - dst.as_mut_ptr() as *mut u8, - 8, - ); + _mm_storeu_si64(dst.as_mut_ptr() as *mut u8, u_store0); cx = v_dx; } diff --git a/src/sse/vertical_u8.rs b/src/sse/vertical_u8.rs index 5b66bf4..ab567d7 100644 --- a/src/sse/vertical_u8.rs +++ b/src/sse/vertical_u8.rs @@ -84,172 +84,62 @@ pub(crate) unsafe fn convolve_vertical_part_sse_32( let bounds_size = bounds.size; - if bounds_size == 2 { - let py = start_y; - let weight = filter.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let item_row_0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row_1 = _mm_loadu_si128(src_ptr0.as_ptr().add(16) as *const __m128i); - - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let item_row_10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row_11 = _mm_loadu_si128(src_ptr1.as_ptr().add(16) as *const __m128i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight0); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight0); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_10, v_weight1); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_11, v_weight1); - } else if bounds_size == 3 { - let py = start_y; - let weight = filter.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - let v_weight2 = _mm_set1_epi32(weight[2] as i32); - - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let item_row_0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row_1 = _mm_loadu_si128(src_ptr0.as_ptr().add(16) as *const __m128i); - - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let item_row_10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row_11 = _mm_loadu_si128(src_ptr1.as_ptr().add(16) as *const __m128i); - - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let item_row_20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row_21 = _mm_loadu_si128(src_ptr2.as_ptr().add(16) as *const __m128i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight0); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight0); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_10, v_weight1); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_11, v_weight1); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, 
store_3, item_row_20, v_weight2); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_21, v_weight2); - } else if bounds_size == 4 { - let py = start_y; - let weight = filter.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - let v_weight2 = _mm_set1_epi32(weight[2] as i32); - let v_weight3 = _mm_set1_epi32(weight[3] as i32); - - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let item_row_0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row_1 = _mm_loadu_si128(src_ptr0.as_ptr().add(16) as *const __m128i); - - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let item_row_10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row_11 = _mm_loadu_si128(src_ptr1.as_ptr().add(16) as *const __m128i); - - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let item_row_20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row_21 = _mm_loadu_si128(src_ptr2.as_ptr().add(16) as *const __m128i); - - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - let item_row_30 = _mm_loadu_si128(src_ptr3.as_ptr() as *const __m128i); - let item_row_31 = _mm_loadu_si128(src_ptr3.as_ptr().add(16) as *const __m128i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight0); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight0); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_10, v_weight1); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_11, v_weight1); + let mut jj = 0usize; + + while jj < bounds_size.saturating_sub(2) { + let py = start_y + jj; + let f_ptr = filter.get_unchecked(jj..).as_ptr() as *const i32; + let v_weight_2 = _mm_set1_epi32(f_ptr.read_unaligned()); + let src_ptr = src.get_unchecked((src_stride * py + px)..); + let s_ptr_next = src_ptr.as_ptr().add(src_stride); + + let item_row_0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); + let item_row_1 = _mm_loadu_si128(s_ptr_next as *const __m128i); + + let interleaved = _mm_unpacklo_epi8(item_row_0, item_row_1); + let pix = _mm_unpacklo_epi8(interleaved, zeros); + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(pix, v_weight_2)); + let pix = _mm_unpackhi_epi8(interleaved, zeros); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(pix, v_weight_2)); + + let interleaved = _mm_unpackhi_epi8(item_row_0, item_row_1); + let pix = _mm_unpacklo_epi8(interleaved, zeros); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(pix, v_weight_2)); + let pix = _mm_unpackhi_epi8(interleaved, zeros); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(pix, v_weight_2)); + + let item_row_0 = _mm_loadu_si128(src_ptr.as_ptr().add(16) as *const __m128i); + let item_row_1 = _mm_loadu_si128(s_ptr_next.add(16) as *const __m128i); + + let interleaved = _mm_unpacklo_epi8(item_row_0, item_row_1); + let pix = _mm_unpacklo_epi8(interleaved, zeros); + store_4 = _mm_add_epi32(store_4, _mm_madd_epi16(pix, v_weight_2)); + let pix = _mm_unpackhi_epi8(interleaved, zeros); + store_5 = _mm_add_epi32(store_5, _mm_madd_epi16(pix, v_weight_2)); + + let interleaved = _mm_unpackhi_epi8(item_row_0, item_row_1); + let pix = _mm_unpacklo_epi8(interleaved, zeros); + store_6 = _mm_add_epi32(store_6, _mm_madd_epi16(pix, v_weight_2)); + let pix = 
_mm_unpackhi_epi8(interleaved, zeros); + store_7 = _mm_add_epi32(store_7, _mm_madd_epi16(pix, v_weight_2)); + + jj += 2; + } - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_20, v_weight2); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_21, v_weight2); + for j in jj..bounds_size { + let py = start_y + j; + let weight = *filter.get_unchecked(j); + let v_weight = _mm_set1_epi32(weight as i32); + let src_ptr = src.get_unchecked((src_stride * py + px)..); + let item_row_0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); + let item_row_1 = _mm_loadu_si128(src_ptr.as_ptr().add(16) as *const __m128i); (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_30, v_weight3); + dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight); (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_31, v_weight3); - } else { - let mut jj = 0usize; - - while jj < bounds_size.saturating_sub(2) { - let py = start_y + jj; - let f_ptr = filter.get_unchecked(jj..).as_ptr() as *const i32; - let v_weight_2 = _mm_set1_epi32(f_ptr.read_unaligned()); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - let s_ptr_next = src_ptr.as_ptr().add(src_stride); - - let item_row_0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - let item_row_1 = _mm_loadu_si128(s_ptr_next as *const __m128i); - - let interleaved = _mm_unpacklo_epi8(item_row_0, item_row_1); - let pix = _mm_unpacklo_epi8(interleaved, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(pix, v_weight_2)); - let pix = _mm_unpackhi_epi8(interleaved, zeros); - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(pix, v_weight_2)); - - let interleaved = _mm_unpackhi_epi8(item_row_0, item_row_1); - let pix = _mm_unpacklo_epi8(interleaved, zeros); - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(pix, v_weight_2)); - let pix = _mm_unpackhi_epi8(interleaved, zeros); - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(pix, v_weight_2)); - - let item_row_0 = _mm_loadu_si128(src_ptr.as_ptr().add(16) as *const __m128i); - let item_row_1 = _mm_loadu_si128(s_ptr_next.add(16) as *const __m128i); - - let interleaved = _mm_unpacklo_epi8(item_row_0, item_row_1); - let pix = _mm_unpacklo_epi8(interleaved, zeros); - store_4 = _mm_add_epi32(store_4, _mm_madd_epi16(pix, v_weight_2)); - let pix = _mm_unpackhi_epi8(interleaved, zeros); - store_5 = _mm_add_epi32(store_5, _mm_madd_epi16(pix, v_weight_2)); - - let interleaved = _mm_unpackhi_epi8(item_row_0, item_row_1); - let pix = _mm_unpacklo_epi8(interleaved, zeros); - store_6 = _mm_add_epi32(store_6, _mm_madd_epi16(pix, v_weight_2)); - let pix = _mm_unpackhi_epi8(interleaved, zeros); - store_7 = _mm_add_epi32(store_7, _mm_madd_epi16(pix, v_weight_2)); - - jj += 2; - } - - for j in jj..bounds_size { - let py = start_y + j; - let weight = *filter.get_unchecked(j); - let v_weight = _mm_set1_epi32(weight as i32); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - let item_row_0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - let item_row_1 = _mm_loadu_si128(src_ptr.as_ptr().add(16) as *const __m128i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight); - } + dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight); } - store_0 = 
_mm_srai_epi32::(store_0); - store_1 = _mm_srai_epi32::(store_1); - store_2 = _mm_srai_epi32::(store_2); - store_3 = _mm_srai_epi32::(store_3); - store_4 = _mm_srai_epi32::(store_4); - store_5 = _mm_srai_epi32::(store_5); - store_6 = _mm_srai_epi32::(store_6); - store_7 = _mm_srai_epi32::(store_7); - let rgb0 = _mm_packs_epi32(store_0, store_1); let rgb2 = _mm_packs_epi32(store_2, store_3); let rgb = _mm_packus_epi16(rgb0, rgb2); @@ -283,88 +173,19 @@ pub(crate) unsafe fn convolve_vertical_part_sse_16( let px = start_x; - let zeros = _mm_setzero_si128(); - let bounds_size = bounds.size; - if bounds_size == 2 { - let py = start_y; - let weight = filter.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row0, v_weight0); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row1, v_weight1); - } else if bounds_size == 3 { - let py = start_y; - let weight = filter.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - let v_weight2 = _mm_set1_epi32(weight[2] as i32); - - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row2 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); + for j in 0..bounds_size { + let py = start_y + j; + let weight = *filter.get_unchecked(j); + let v_weight = _mm_set1_epi32(weight as i32); + let src_ptr = src.get_unchecked((src_stride * py + px)..); + let item_row = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row0, v_weight0); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row1, v_weight1); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row2, v_weight2); - } else if bounds_size == 4 { - let py = start_y; - let weight = filter.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - let v_weight2 = _mm_set1_epi32(weight[2] as i32); - let v_weight3 = _mm_set1_epi32(weight[3] as i32); - - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row2 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row3 = _mm_loadu_si128(src_ptr3.as_ptr() as *const __m128i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row0, v_weight0); - (store_0, store_1, store_2, 
store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row1, v_weight1); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row2, v_weight2); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row3, v_weight3); - } else { - for j in 0..bounds_size { - let py = start_y + j; - let weight = *filter.get_unchecked(j); - let v_weight = _mm_set1_epi32(weight as i32); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - let item_row = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row, v_weight); - } + dot_prod(store_0, store_1, store_2, store_3, item_row, v_weight); } - store_0 = _mm_max_epi32(store_0, zeros); - store_1 = _mm_max_epi32(store_1, zeros); - store_2 = _mm_max_epi32(store_2, zeros); - store_3 = _mm_max_epi32(store_3, zeros); - let low_16 = _mm_packs_epi32( _mm_srai_epi32::(store_0), _mm_srai_epi32::(store_1), @@ -410,14 +231,14 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( let item_row0 = _mm_loadu_si64(src_ptr0.as_ptr()); let item_row1 = _mm_loadu_si64(src_ptr1.as_ptr()); - let low0 = _mm_cvtepu8_epi16(item_row0); + let low0 = _mm_unpacklo_epi8(item_row0, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low0), v_weight0)); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low0, zeros), v_weight0), ); - let low1 = _mm_cvtepu8_epi16(item_row1); + let low1 = _mm_unpacklo_epi8(item_row1, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low1), v_weight1)); store_1 = _mm_add_epi32( store_1, @@ -436,21 +257,21 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( let item_row1 = _mm_loadu_si64(src_ptr1.as_ptr()); let item_row2 = _mm_loadu_si64(src_ptr2.as_ptr()); - let low0 = _mm_cvtepu8_epi16(item_row0); + let low0 = _mm_unpacklo_epi8(item_row0, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low0), v_weight0)); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low0, zeros), v_weight0), ); - let low1 = _mm_cvtepu8_epi16(item_row1); + let low1 = _mm_unpacklo_epi8(item_row1, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low1), v_weight1)); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low1, zeros), v_weight1), ); - let low2 = _mm_cvtepu8_epi16(item_row2); + let low2 = _mm_unpacklo_epi8(item_row2, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low2), v_weight2)); store_1 = _mm_add_epi32( store_1, @@ -472,28 +293,28 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( let item_row2 = _mm_loadu_si64(src_ptr2.as_ptr()); let item_row3 = _mm_loadu_si64(src_ptr3.as_ptr()); - let low0 = _mm_cvtepu8_epi16(item_row0); + let low0 = _mm_unpacklo_epi8(item_row0, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low0), v_weight0)); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low0, zeros), v_weight0), ); - let low1 = _mm_cvtepu8_epi16(item_row1); + let low1 = _mm_unpacklo_epi8(item_row1, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low1), v_weight1)); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low1, zeros), v_weight1), ); - let low2 = _mm_cvtepu8_epi16(item_row2); + let low2 = _mm_unpacklo_epi8(item_row2, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low2), v_weight2)); store_1 
= _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low2, zeros), v_weight2), ); - let low3 = _mm_cvtepu8_epi16(item_row3); + let low3 = _mm_unpacklo_epi8(item_row3, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low3), v_weight3)); store_1 = _mm_add_epi32( store_1, @@ -507,7 +328,7 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( let src_ptr = src.get_unchecked((src_stride * py + px)..); let item_row = _mm_loadu_si64(src_ptr.as_ptr()); - let low = _mm_cvtepu8_epi16(item_row); + let low = _mm_unpacklo_epi8(item_row, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low), v_weight)); store_1 = _mm_add_epi32( store_1, @@ -516,9 +337,6 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( } } - store_0 = _mm_max_epi32(store_0, zeros); - store_1 = _mm_max_epi32(store_1, zeros); - let low_16 = _mm_packus_epi32( _mm_srai_epi32::(store_0), _mm_srai_epi32::(store_1), @@ -527,7 +345,7 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( let item = _mm_packus_epi16(low_16, low_16); let dst_ptr = dst.get_unchecked_mut(px..); - std::ptr::copy_nonoverlapping(&item as *const _ as *const u8, dst_ptr.as_mut_ptr(), 8); + _mm_storeu_si64(dst_ptr.as_mut_ptr(), item); } #[inline(always)] @@ -543,8 +361,6 @@ pub(crate) unsafe fn convolve_vertical_part_sse( let vld = _mm_set1_epi32(ROUNDING_CONST); let mut store = vld; - let zeros = _mm_setzero_si128(); - let px = start_x; let bounds_size = bounds.size; @@ -619,8 +435,6 @@ pub(crate) unsafe fn convolve_vertical_part_sse( } } - store = _mm_max_epi32(store, zeros); - let vegi = _mm_srai_epi32::(store); let low_16 = _mm_packus_epi32(vegi, vegi); diff --git a/src/sse/vertical_u8_lp.rs b/src/sse/vertical_u8_lp.rs index d507857..7b1a651 100644 --- a/src/sse/vertical_u8_lp.rs +++ b/src/sse/vertical_u8_lp.rs @@ -94,190 +94,23 @@ unsafe fn convolve_vertical_sse_row_impl( let px = cx; - if bounds_size == 2 { - let py = bounds.start; - let weights = weight.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = - _mm_loadu_si128(src_ptr0.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row2 = - _mm_loadu_si128(src_ptr0.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row3 = - _mm_loadu_si128(src_ptr0.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row0, v_weight0); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight0); - (store4, store5) = mdot::(store4, store5, item_row2, v_weight0); - (store6, store7) = mdot::(store6, store7, item_row3, v_weight0); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = - _mm_loadu_si128(src_ptr1.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row12 = - _mm_loadu_si128(src_ptr1.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row13 = - _mm_loadu_si128(src_ptr1.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row10, v_weight1); - (store2, store3) = mdot::(store2, store3, item_row11, v_weight1); - (store4, store5) = mdot::(store4, store5, item_row12, v_weight1); - (store6, store7) = mdot::(store6, store7, item_row13, v_weight1); - } else if bounds_size 
== 3 { - let py = bounds.start; - let weights = weight.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = - _mm_loadu_si128(src_ptr0.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row2 = - _mm_loadu_si128(src_ptr0.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row3 = - _mm_loadu_si128(src_ptr0.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row0, v_weight0); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight0); - (store4, store5) = mdot::(store4, store5, item_row2, v_weight0); - (store6, store7) = mdot::(store6, store7, item_row3, v_weight0); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = - _mm_loadu_si128(src_ptr1.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row12 = - _mm_loadu_si128(src_ptr1.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row13 = - _mm_loadu_si128(src_ptr1.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row10, v_weight1); - (store2, store3) = mdot::(store2, store3, item_row11, v_weight1); - (store4, store5) = mdot::(store4, store5, item_row12, v_weight1); - (store6, store7) = mdot::(store6, store7, item_row13, v_weight1); - - let item_row20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row21 = - _mm_loadu_si128(src_ptr2.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row22 = - _mm_loadu_si128(src_ptr2.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row23 = - _mm_loadu_si128(src_ptr2.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row20, v_weight2); - (store2, store3) = mdot::(store2, store3, item_row21, v_weight2); - (store4, store5) = mdot::(store4, store5, item_row22, v_weight2); - (store6, store7) = mdot::(store6, store7, item_row23, v_weight2); - } else if bounds_size == 4 { - let py = bounds.start; - let weights = weight.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_weight3 = _mm_set1_epi16(weights[3]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..); - let v_offset3 = src_stride * (py + 3) + px; - let src_ptr3 = src.get_unchecked(v_offset3..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = - _mm_loadu_si128(src_ptr0.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row2 = - _mm_loadu_si128(src_ptr0.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row3 = - _mm_loadu_si128(src_ptr0.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row0, v_weight0); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight0); - (store4, 
store5) = mdot::(store4, store5, item_row2, v_weight0); - (store6, store7) = mdot::(store6, store7, item_row3, v_weight0); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = - _mm_loadu_si128(src_ptr1.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row12 = - _mm_loadu_si128(src_ptr1.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row13 = - _mm_loadu_si128(src_ptr1.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row10, v_weight1); - (store2, store3) = mdot::(store2, store3, item_row11, v_weight1); - (store4, store5) = mdot::(store4, store5, item_row12, v_weight1); - (store6, store7) = mdot::(store6, store7, item_row13, v_weight1); - - let item_row20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row21 = - _mm_loadu_si128(src_ptr2.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row22 = - _mm_loadu_si128(src_ptr2.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row23 = - _mm_loadu_si128(src_ptr2.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row20, v_weight2); - (store2, store3) = mdot::(store2, store3, item_row21, v_weight2); - (store4, store5) = mdot::(store4, store5, item_row22, v_weight2); - (store6, store7) = mdot::(store6, store7, item_row23, v_weight2); - - let item_row30 = _mm_loadu_si128(src_ptr3.as_ptr() as *const __m128i); - let item_row31 = - _mm_loadu_si128(src_ptr3.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row32 = - _mm_loadu_si128(src_ptr3.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row33 = - _mm_loadu_si128(src_ptr3.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row30, v_weight3); - (store2, store3) = mdot::(store2, store3, item_row31, v_weight3); - (store4, store5) = mdot::(store4, store5, item_row32, v_weight3); - (store6, store7) = mdot::(store6, store7, item_row33, v_weight3); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..(j + 1)); - let v_weight = _mm_set1_epi16(weight[0]); - let v_offset = src_stride * py + px; - let src_ptr = src.get_unchecked(v_offset..); - let item_row0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - let item_row1 = - _mm_loadu_si128(src_ptr.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row2 = - _mm_loadu_si128(src_ptr.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row3 = - _mm_loadu_si128(src_ptr.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row0, v_weight); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight); - (store4, store5) = mdot::(store4, store5, item_row2, v_weight); - (store6, store7) = mdot::(store6, store7, item_row3, v_weight); - } + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..(j + 1)); + let v_weight = _mm_set1_epi16(weight[0]); + let v_offset = src_stride * py + px; + let src_ptr = src.get_unchecked(v_offset..); + let item_row0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); + let item_row1 = _mm_loadu_si128(src_ptr.get_unchecked(16..).as_ptr() as *const __m128i); + let item_row2 = _mm_loadu_si128(src_ptr.get_unchecked(32..).as_ptr() as *const __m128i); + let item_row3 = _mm_loadu_si128(src_ptr.get_unchecked(48..).as_ptr() as *const __m128i); + + (store0, store1) = mdot::(store0, store1, item_row0, v_weight); + (store2, 
store3) = mdot::(store2, store3, item_row1, v_weight); + (store4, store5) = mdot::(store4, store5, item_row2, v_weight); + (store6, store7) = mdot::(store6, store7, item_row3, v_weight); } - store0 = _mm_max_epi16(store0, zeros); - store1 = _mm_max_epi16(store1, zeros); - store2 = _mm_max_epi16(store2, zeros); - store3 = _mm_max_epi16(store3, zeros); - store4 = _mm_max_epi16(store4, zeros); - store5 = _mm_max_epi16(store5, zeros); - store6 = _mm_max_epi16(store6, zeros); - store7 = _mm_max_epi16(store7, zeros); - let rebased0 = _mm_srli_epi16::(store0); let rebased1 = _mm_srli_epi16::(store1); let rebased2 = _mm_srli_epi16::(store2); @@ -318,126 +151,19 @@ unsafe fn convolve_vertical_sse_row_impl( let px = cx; - if bounds_size == 2 { - let py = bounds.start; - let weights = weight.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = - _mm_loadu_si128(src_ptr0.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row0, v_weight0); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight0); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = - _mm_loadu_si128(src_ptr1.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row10, v_weight1); - (store2, store3) = mdot::(store2, store3, item_row11, v_weight1); - } else if bounds_size == 3 { - let py = bounds.start; - let weights = weight.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = - _mm_loadu_si128(src_ptr0.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row0, v_weight0); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight0); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = - _mm_loadu_si128(src_ptr1.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row10, v_weight1); - (store2, store3) = mdot::(store2, store3, item_row11, v_weight1); - - let item_row20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row21 = - _mm_loadu_si128(src_ptr2.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row20, v_weight2); - (store2, store3) = mdot::(store2, store3, item_row21, v_weight2); - } else if bounds_size == 4 { - let py = bounds.start; - let weights = weight.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_weight3 = _mm_set1_epi16(weights[3]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + 
px; - let src_ptr1 = src.get_unchecked(v_offset1..); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..); - let v_offset3 = src_stride * (py + 3) + px; - let src_ptr3 = src.get_unchecked(v_offset3..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = - _mm_loadu_si128(src_ptr0.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row0, v_weight0); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight0); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = - _mm_loadu_si128(src_ptr1.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row10, v_weight1); - (store2, store3) = mdot::(store2, store3, item_row11, v_weight1); - - let item_row20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row21 = - _mm_loadu_si128(src_ptr2.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row20, v_weight2); - (store2, store3) = mdot::(store2, store3, item_row21, v_weight2); - - let item_row30 = _mm_loadu_si128(src_ptr3.as_ptr() as *const __m128i); - let item_row31 = - _mm_loadu_si128(src_ptr3.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row30, v_weight3); - (store2, store3) = mdot::(store2, store3, item_row31, v_weight3); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..(j + 1)); - let v_weight = _mm_set1_epi16(weight[0]); - let v_offset = src_stride * py + px; - let src_ptr = src.get_unchecked(v_offset..); - let item_row0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - let item_row1 = - _mm_loadu_si128(src_ptr.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row0, v_weight); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight); - } + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..(j + 1)); + let v_weight = _mm_set1_epi16(weight[0]); + let v_offset = src_stride * py + px; + let src_ptr = src.get_unchecked(v_offset..); + let item_row0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); + let item_row1 = _mm_loadu_si128(src_ptr.get_unchecked(16..).as_ptr() as *const __m128i); + + (store0, store1) = mdot::(store0, store1, item_row0, v_weight); + (store2, store3) = mdot::(store2, store3, item_row1, v_weight); } - store0 = _mm_max_epi16(store0, zeros); - store1 = _mm_max_epi16(store1, zeros); - store2 = _mm_max_epi16(store2, zeros); - store3 = _mm_max_epi16(store3, zeros); - let rebased0 = _mm_srli_epi16::(store0); let rebased1 = _mm_srli_epi16::(store1); let rebased2 = _mm_srli_epi16::(store2); @@ -532,9 +258,6 @@ unsafe fn convolve_vertical_sse_row_impl( } } - store0 = _mm_max_epi16(store0, zeros); - store1 = _mm_max_epi16(store1, zeros); - let rebased0 = _mm_srli_epi16::(store0); let rebased1 = _mm_srli_epi16::(store1); let shrank = _mm_packus_epi16(rebased0, rebased1); @@ -655,11 +378,9 @@ unsafe fn convolve_vertical_sse_row_impl( } } - store = _mm_max_epi16(store, zeros); - let rebased = _mm_srli_epi16::(store); let shrank = _mm_packus_epi16(rebased, rebased); - std::ptr::copy_nonoverlapping(&shrank as *const _ as *const u8, dst.as_mut_ptr(), 8); + _mm_storeu_si64(dst.as_mut_ptr(), shrank); cx += 8; } @@ -781,8 +502,6 @@ unsafe fn convolve_vertical_sse_row_impl( } } - store = 
_mm_max_epi16(store, zeros); - let rebased = _mm_srli_epi16::(store); let value = _mm_extract_epi8::<0>(_mm_packus_epi16(rebased, rebased)); *dst = value as u8; From fa1f2ec6e0f4ab74a60824694d829459bd0c179a Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Sat, 28 Dec 2024 10:38:53 +0000 Subject: [PATCH 4/9] Refactor and improvements --- .github/workflows/build_push.yml | 1 + Cargo.lock | 2 +- Cargo.toml | 4 +- app/src/main.rs | 36 +- fuzz/Cargo.toml | 7 + fuzz/colorspaces/colorspaces.rs | 95 +++ src/avx2/vertical_u8.rs | 6 +- src/colors/jzazbz_scaler.rs | 117 ++-- src/colors/lab_scaler.rs | 119 ++-- src/colors/lch_scaler.rs | 116 +-- src/colors/linear_precise_scaler.rs | 159 +++-- src/colors/linear_scaler.rs | 170 +++-- src/colors/luv_scaler.rs | 163 +++-- src/colors/oklab_scaler.rs | 159 +++-- src/colors/sigmoidal_scaler.rs | 155 ++-- src/colors/xyz_scaler.rs | 165 +++-- src/convolution.rs | 6 +- src/dispatch_group_f16.rs | 21 +- src/dispatch_group_f32.rs | 21 +- src/dispatch_group_u16.rs | 15 +- src/dispatch_group_u8.rs | 11 +- src/f16.rs | 13 +- src/image_store.rs | 311 ++++++-- src/lib.rs | 2 +- src/neon/utils.rs | 14 + src/neon/vertical_f32.rs | 125 ++-- src/plane_f32.rs | 5 +- src/plane_u16.rs | 5 +- src/plane_u8.rs | 5 +- src/rgb_f32.rs | 6 +- src/rgb_u16.rs | 5 +- src/rgb_u8.rs | 6 +- src/rgba_f32.rs | 5 +- src/rgba_u16.rs | 5 +- src/rgba_u8.rs | 5 +- src/scaler.rs | 1014 +++++++++------------------ src/scaler_f16.rs | 260 +------ 37 files changed, 1626 insertions(+), 1708 deletions(-) create mode 100644 fuzz/colorspaces/colorspaces.rs diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml index e10a6e8..a72a34b 100644 --- a/.github/workflows/build_push.yml +++ b/.github/workflows/build_push.yml @@ -57,6 +57,7 @@ jobs: - run: cargo fuzz run resize_rgba -- -max_total_time=30 - run: cargo fuzz run resize_rgb -- -max_total_time=30 - run: cargo fuzz run resize_plane -- -max_total_time=30 + - run: cargo fuzz run colorspaces -- -max_total_time=10 fuzz_rgba_high_bit: name: Fuzzing High bit-depth diff --git a/Cargo.lock b/Cargo.lock index 65a8fe9..f1d0691 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -780,7 +780,7 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pic-scale" -version = "0.3.6" +version = "0.4.0" dependencies = [ "colorutils-rs", "half", diff --git a/Cargo.toml b/Cargo.toml index 68371ce..8dd615e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ workspace = { members = ["app", "wasm", "fuzz"] } [package] name = "pic-scale" -version = "0.3.6" +version = "0.4.0" edition = "2021" description = "High performance image scaling" readme = "README.md" @@ -14,7 +14,7 @@ categories = ["multimedia::images", "multimedia::video"] homepage = "https://github.com/awxkee/pic-scale" repository = "https://github.com/awxkee/pic-scale" exclude = ["*.jpg", "/assets", "*.png", "*.sh", "/assets/*"] -rust-version = "1.73.0" +rust-version = "1.82.0" [dependencies] colorutils-rs = {version = "0.7.0", optional = true} diff --git a/app/src/main.rs b/app/src/main.rs index cb713ce..8166b23 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -11,8 +11,8 @@ use fast_image_resize::{ }; use image::{EncodableLayout, GenericImageView, ImageReader}; use pic_scale::{ - Ar30ByteOrder, ImageSize, ImageStore, LinearApproxScaler, LinearScaler, ResamplingFunction, - Scaler, Scaling, ScalingU16, ThreadingPolicy, + Ar30ByteOrder, ImageSize, ImageStore, ImageStoreMut, LinearScaler, ResamplingFunction, Scaler, + Scaling, ScalingU16, 
ThreadingPolicy, }; fn resize_plane( @@ -37,15 +37,14 @@ fn resize_plane( let mut src_data = vec![15u8; src_width * src_height * 1]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut dst_store = ImageStoreMut::::alloc(src_width / 2, src_height / 2); let scaler = Scaler::new(sampler); - _ = scaler - .resize_plane(ImageSize::new(dst_width, dst_height), store) - .unwrap(); + _ = scaler.resize_plane(&store, &mut dst_store).unwrap(); } fn main() { // test_fast_image(); - let img = ImageReader::open("./assets/test_1.jpg") + let img = ImageReader::open("./assets/nasa-4928x3279-rgba.png") .unwrap() .decode() .unwrap(); @@ -53,7 +52,7 @@ fn main() { let transient = img.to_rgba8(); let mut bytes = Vec::from(transient.as_bytes()); - let mut scaler = Scaler::new(ResamplingFunction::Bilinear); + let mut scaler = LinearScaler::new(ResamplingFunction::Bilinear); scaler.set_threading_policy(ThreadingPolicy::Single); // resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear); @@ -77,13 +76,10 @@ fn main() { // ) // .unwrap(); - let resized = scaler - .resize_rgba( - ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), - store, - false, - ) - .unwrap(); + let mut dst_store = + ImageStoreMut::::alloc(dimensions.0 as usize / 2, dimensions.1 as usize / 2); + + scaler.resize_rgba(&store, &mut dst_store, false).unwrap(); let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds @@ -162,7 +158,7 @@ fn main() { // let dst: Vec = resized.as_bytes().iter().map(|&x| (x >> 2) as u8).collect(); // - let dst = resized.as_bytes(); + let dst = dst_store.as_bytes(); // let dst = resized; // image::save_buffer( // "converted.png", @@ -173,12 +169,12 @@ fn main() { // ) // .unwrap(); - if resized.channels == 4 { + if dst_store.channels == 4 { image::save_buffer( "converted.png", &dst, - resized.width as u32, - resized.height as u32, + dst_store.width as u32, + dst_store.height as u32, image::ColorType::Rgba8, ) .unwrap(); @@ -186,8 +182,8 @@ fn main() { image::save_buffer( "converted.png", &dst, - resized.width as u32, - resized.height as u32, + dst_store.width as u32, + dst_store.height as u32, image::ColorType::Rgb8, ) .unwrap(); diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 3db86e5..079bc34 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -73,3 +73,10 @@ path = "resize_plane_f32/resize_plane_f32.rs" test = false doc = false bench = false + +[[bin]] +name = "colorspaces" +path = "colorspaces/colorspaces.rs" +test = false +doc = false +bench = false diff --git a/fuzz/colorspaces/colorspaces.rs b/fuzz/colorspaces/colorspaces.rs new file mode 100644 index 0000000..2451bf0 --- /dev/null +++ b/fuzz/colorspaces/colorspaces.rs @@ -0,0 +1,95 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use pic_scale::{ + ImageStore, ImageStoreMut, JzazbzScaler, LChScaler, LabScaler, LinearApproxScaler, + LinearScaler, LuvScaler, OklabScaler, ResamplingFunction, Scaling, SigmoidalScaler, + TransferFunction, XYZScaler, +}; + +fuzz_target!(|data: (u16, u16, u16, u16)| { + resize_plane( + data.0 as usize, + data.1 as usize, + data.2 as usize, + data.3 as usize, + ResamplingFunction::Bilinear, + ) +}); + +fn resize_plane( + src_width: usize, + src_height: usize, + dst_width: usize, + dst_height: usize, + sampler: ResamplingFunction, +) { + if src_width == 0 + || src_width > 2000 + || src_height == 0 + || src_height > 2000 + || dst_width == 0 + || dst_width > 512 + || dst_height == 0 + || dst_height > 512 + { + return; + } + + let scalers: Vec> = vec![ + Box::new(JzazbzScaler::new(sampler, 203f32, TransferFunction::Srgb)), + Box::new(LabScaler::new(sampler)), + Box::new(LChScaler::new(sampler)), + Box::new(LinearScaler::new(sampler)), + Box::new(LinearApproxScaler::new(sampler)), + Box::new(LuvScaler::new(sampler)), + Box::new(OklabScaler::new(sampler, TransferFunction::Srgb)), + Box::new(SigmoidalScaler::new(sampler)), + Box::new(XYZScaler::new(sampler)), + ]; + + for scaler in scalers { + let mut src_data_rgb = vec![15u8; src_width * src_height * 3]; + let store = + ImageStore::::from_slice(&mut src_data_rgb, src_width, src_height).unwrap(); + let mut target_store = ImageStoreMut::alloc(dst_width, dst_height); + scaler.resize_rgb(&store, &mut target_store).unwrap(); + + let mut src_data_rgba = vec![15u8; src_width * src_height * 4]; + let store_rgba = + ImageStore::::from_slice(&mut src_data_rgba, src_width, src_height).unwrap(); + let mut target_store_rgba = ImageStoreMut::alloc(dst_width, dst_height); + scaler + .resize_rgba(&store_rgba, &mut target_store_rgba, false) + .unwrap(); + } +} diff --git a/src/avx2/vertical_u8.rs b/src/avx2/vertical_u8.rs index e578206..0e86cf6 100644 --- a/src/avx2/vertical_u8.rs +++ b/src/avx2/vertical_u8.rs @@ -107,8 +107,7 @@ unsafe fn convolve_vertical_part_avx_64( let pix = _mm256_unpackhi_epi8(interleaved, zeros); store_3 = _mm256_add_epi32(store_3, _mm256_madd_epi16(pix, v_weight_2)); - let item_row_0 = - _mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i); + let item_row_0 = _mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i); let item_row_1 = _mm256_loadu_si256(s_ptr_next.get_unchecked(32..).as_ptr() as *const __m256i); @@ -134,8 +133,7 @@ unsafe fn convolve_vertical_part_avx_64( let src_ptr = src.get_unchecked((src_stride * py + px)..); let item_row_0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i); - let item_row_1 = - 
_mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i); + let item_row_1 = _mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i); (store_0, store_1, store_2, store_3) = dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight); diff --git a/src/colors/jzazbz_scaler.rs b/src/colors/jzazbz_scaler.rs index f939632..845d321 100644 --- a/src/colors/jzazbz_scaler.rs +++ b/src/colors/jzazbz_scaler.rs @@ -30,11 +30,10 @@ use colorutils_rs::{ jzazbz_to_rgb, jzazbz_to_rgba, rgb_to_jzazbz, rgba_to_jzazbz, TransferFunction, }; -use crate::alpha_check::has_non_constant_cap_alpha_rgba8; use crate::pic_scale_error::PicScaleError; use crate::scaler::ScalingF32; use crate::support::check_image_size_overflow; -use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; +use crate::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; #[derive(Debug, Copy, Clone)] /// Converts image to *Jzazbz* components scales it and convert back @@ -60,35 +59,44 @@ impl JzazbzScaler { } } - fn rgba_to_laba<'a>(&self, store: ImageStore<'a, u8, 4>) -> ImageStore<'a, f32, 4> { - let mut new_store = ImageStore::::alloc(store.width, store.height); + fn rgba_to_laba<'a>(&self, store: &ImageStore<'a, u8, 4>) -> ImageStore<'a, f32, 4> { + let mut source_slice = vec![f32::default(); 4 * store.width * store.height]; let lab_stride = store.width as u32 * 4u32 * std::mem::size_of::() as u32; rgba_to_jzazbz( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * 4u32, - new_store.buffer.borrow_mut(), + &mut source_slice, lab_stride, store.width as u32, store.height as u32, self.display_luminance, self.transfer_function, ); + let new_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(source_slice), + channels: 4, + width: store.width, + height: store.height, + bit_depth: store.bit_depth, + }; new_store } - fn laba_to_srgba<'a>(&self, store: ImageStore<'a, f32, 4>) -> ImageStore<'a, u8, 4> { - let mut new_store = ImageStore::::alloc(store.width, store.height); + fn laba_to_srgba<'a>( + &self, + store: &ImageStoreMut<'a, f32, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, + ) { jzazbz_to_rgba( store.buffer.borrow(), store.width as u32 * 4u32 * std::mem::size_of::() as u32, - new_store.buffer.borrow_mut(), + into.buffer.borrow_mut(), store.width as u32 * 4u32, store.width as u32, store.height as u32, self.display_luminance, self.transfer_function, ); - new_store } } @@ -97,11 +105,12 @@ impl Scaling for JzazbzScaler { self.scaler.threading_policy = threading_policy; } - fn resize_rgb( - &self, - new_size: ImageSize, - store: ImageStore, - ) -> Result, PicScaleError> { + fn resize_rgb<'a>( + &'a self, + store: &ImageStore, + into: &mut ImageStoreMut, + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -115,15 +124,22 @@ impl Scaling for JzazbzScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } const COMPONENTS: usize = 3; - let mut lab_store = ImageStore::::alloc(store.width, store.height); + + let mut target = vec![f32::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = + ImageStoreMut::::from_slice(&mut target, store.width, store.height)?; + lab_store.bit_depth = into.bit_depth; + let lab_stride = lab_store.width as u32 * 
COMPONENTS as u32 * std::mem::size_of::() as u32; rgb_to_jzazbz( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * COMPONENTS as u32, lab_store.buffer.borrow_mut(), lab_stride, @@ -132,29 +148,42 @@ impl Scaling for JzazbzScaler { self.display_luminance, self.transfer_function, ); - let new_store = self.scaler.resize_rgb_f32(new_size, lab_store)?; - let mut new_u8_store = ImageStore::::alloc(new_size.width, new_size.height); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgb_f32(&new_immutable_store, &mut new_store)?; + let new_lab_stride = new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + jzazbz_to_rgb( new_store.buffer.borrow(), new_lab_stride, - new_u8_store.buffer.borrow_mut(), - new_u8_store.width as u32 * COMPONENTS as u32, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, new_store.width as u32, new_store.height as u32, self.display_luminance, self.transfer_function, ); - Ok(new_u8_store) + Ok(()) } fn resize_rgba<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -168,36 +197,16 @@ impl Scaling for JzazbzScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - let mut src_store = store; + let lab_store = self.rgba_to_laba(store); + let mut new_target_store = ImageStoreMut::alloc(new_size.width, new_size.height); - let pool = self - .scaler - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - let mut has_alpha_premultiplied = false; - - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } - let lab_store = self.rgba_to_laba(src_store); - let new_store = self - .scaler - .resize_rgba_f32_impl(new_size, lab_store, false, &pool)?; - let mut rgba_store = self.laba_to_srgba(new_store); - if premultiply_alpha && has_alpha_premultiplied { - rgba_store.unpremultiply_alpha(&pool); - } - Ok(rgba_store) + self.scaler + .resize_rgba_f32(&lab_store, &mut new_target_store, premultiply_alpha)?; + self.laba_to_srgba(&new_target_store, into); + Ok(()) } } diff --git a/src/colors/lab_scaler.rs b/src/colors/lab_scaler.rs index 0ea7d70..0878451 100644 --- a/src/colors/lab_scaler.rs +++ b/src/colors/lab_scaler.rs @@ -27,17 +27,15 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +use crate::pic_scale_error::PicScaleError; +use crate::scaler::{Scaling, ScalingF32}; +use crate::support::check_image_size_overflow; +use crate::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, ThreadingPolicy}; use colorutils_rs::{ lab_to_srgb, lab_with_alpha_to_rgba, rgb_to_lab, rgba_to_lab_with_alpha, TransferFunction, SRGB_TO_XYZ_D65, XYZ_TO_SRGB_D65, }; -use crate::alpha_check::has_non_constant_cap_alpha_rgba8; -use crate::pic_scale_error::PicScaleError; -use crate::scaler::{Scaling, ScalingF32}; -use crate::support::check_image_size_overflow; -use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, ThreadingPolicy}; - #[derive(Debug, Copy, Clone)] /// Converts image to *CIE LAB* components scales it and convert back pub struct LabScaler { @@ -51,35 +49,40 @@ impl LabScaler { } } - fn rgba_to_laba(store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); + fn rgba_to_laba<'a>(store: &ImageStore<'a, u8, 4>) -> ImageStore<'a, f32, 4> { + let mut source_slice = vec![f32::default(); 4 * store.width * store.height]; let lab_stride = store.width as u32 * 4u32 * std::mem::size_of::() as u32; rgba_to_lab_with_alpha( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * 4u32, - new_store.buffer.borrow_mut(), + &mut source_slice, lab_stride, store.width as u32, store.height as u32, &SRGB_TO_XYZ_D65, TransferFunction::Srgb, ); + let new_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(source_slice), + channels: 4, + width: store.width, + height: store.height, + bit_depth: store.bit_depth, + }; new_store } - fn laba_to_srgba(store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); + fn laba_to_srgba<'a>(store: &ImageStoreMut<'a, f32, 4>, into: &mut ImageStoreMut<'a, u8, 4>) { lab_with_alpha_to_rgba( store.buffer.borrow(), store.width as u32 * 4u32 * std::mem::size_of::() as u32, - new_store.buffer.borrow_mut(), + into.buffer.borrow_mut(), store.width as u32 * 4u32, store.width as u32, store.height as u32, &XYZ_TO_SRGB_D65, TransferFunction::Srgb, ); - new_store } } @@ -88,11 +91,12 @@ impl Scaling for LabScaler { self.scaler.threading_policy = threading_policy; } - fn resize_rgb( + fn resize_rgb<'a>( &self, - new_size: ImageSize, - store: ImageStore, - ) -> Result, PicScaleError> { + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -106,14 +110,23 @@ impl Scaling for LabScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } + const COMPONENTS: usize = 3; - let mut lab_store = ImageStore::::alloc(store.width, store.height); + + let mut target = vec![f32::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = + ImageStoreMut::::from_slice(&mut target, store.width, store.height)?; + lab_store.bit_depth = into.bit_depth; + let lab_stride = lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + rgb_to_lab( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * COMPONENTS as u32, lab_store.buffer.borrow_mut(), lab_stride, @@ -122,27 +135,39 @@ impl Scaling for LabScaler { &SRGB_TO_XYZ_D65, TransferFunction::Srgb, ); - let new_store = self.scaler.resize_rgb_f32(new_size, 
lab_store)?; - let mut new_u8_store = ImageStore::::alloc(new_size.width, new_size.height); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgb_f32(&new_immutable_store, &mut new_store)?; + let new_lab_stride = new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; lab_to_srgb( new_store.buffer.borrow(), new_lab_stride, - new_u8_store.buffer.borrow_mut(), - new_u8_store.width as u32 * COMPONENTS as u32, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, new_store.width as u32, new_store.height as u32, ); - Ok(new_u8_store) + Ok(()) } fn resize_rgba<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -156,36 +181,16 @@ impl Scaling for LabScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - let mut src_store = store; - - let pool = self - .scaler - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - let mut has_alpha_premultiplied = false; + let lab_store = Self::rgba_to_laba(store); + let mut new_target_store = ImageStoreMut::alloc(new_size.width, new_size.height); - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } - let lab_store = Self::rgba_to_laba(src_store); - let new_store = self - .scaler - .resize_rgba_f32_impl(new_size, lab_store, false, &pool)?; - let mut rgba_store = Self::laba_to_srgba(new_store); - if premultiply_alpha && has_alpha_premultiplied { - rgba_store.unpremultiply_alpha(&pool); - } - Ok(rgba_store) + self.scaler + .resize_rgba_f32(&lab_store, &mut new_target_store, premultiply_alpha)?; + Self::laba_to_srgba(&new_target_store, into); + Ok(()) } } diff --git a/src/colors/lch_scaler.rs b/src/colors/lch_scaler.rs index 96fc290..00d9812 100644 --- a/src/colors/lch_scaler.rs +++ b/src/colors/lch_scaler.rs @@ -32,11 +32,10 @@ use colorutils_rs::{ SRGB_TO_XYZ_D65, XYZ_TO_SRGB_D65, }; -use crate::alpha_check::has_non_constant_cap_alpha_rgba8; use crate::pic_scale_error::PicScaleError; use crate::scaler::ScalingF32; use crate::support::check_image_size_overflow; -use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; +use crate::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; #[derive(Debug, Copy, Clone)] /// Converts image to *CIE LCH(uv)* components scales it and convert back @@ -51,35 +50,40 @@ impl LChScaler { } } - fn rgba_to_lcha(store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); + fn rgba_to_lcha<'a>(store: &ImageStore<'a, u8, 4>) -> ImageStore<'a, 
f32, 4> { + let mut source_slice = vec![f32::default(); 4 * store.width * store.height]; let lab_stride = store.width as u32 * 4u32 * std::mem::size_of::() as u32; rgba_to_lch_with_alpha( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * 4u32, - new_store.buffer.borrow_mut(), + &mut source_slice, lab_stride, store.width as u32, store.height as u32, &SRGB_TO_XYZ_D65, TransferFunction::Srgb, ); + let new_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(source_slice), + channels: 4, + width: store.width, + height: store.height, + bit_depth: store.bit_depth, + }; new_store } - fn lcha_to_srgba(store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); + fn lcha_to_srgba<'a>(store: &ImageStoreMut<'a, f32, 4>, into: &mut ImageStoreMut<'a, u8, 4>) { lch_with_alpha_to_rgba( store.buffer.borrow(), store.width as u32 * 4u32 * std::mem::size_of::() as u32, - new_store.buffer.borrow_mut(), + into.buffer.borrow_mut(), store.width as u32 * 4u32, store.width as u32, store.height as u32, &XYZ_TO_SRGB_D65, TransferFunction::Srgb, ); - new_store } } @@ -88,11 +92,12 @@ impl Scaling for LChScaler { self.scaler.set_threading_policy(threading_policy) } - fn resize_rgb( - &self, - new_size: ImageSize, - store: ImageStore, - ) -> Result, PicScaleError> { + fn resize_rgb<'a>( + &'a self, + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -106,15 +111,25 @@ impl Scaling for LChScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } const COMPONENTS: usize = 3; - let mut lab_store = ImageStore::::alloc(store.width, store.height); + + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; + let lab_stride = lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; rgb_to_lch( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * COMPONENTS as u32, lab_store.buffer.borrow_mut(), lab_stride, @@ -123,29 +138,42 @@ impl Scaling for LChScaler { &SRGB_TO_XYZ_D65, TransferFunction::Srgb, ); - let new_store = self.scaler.resize_rgb_f32(new_size, lab_store)?; - let mut new_u8_store = ImageStore::::alloc(new_size.width, new_size.height); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgb_f32(&new_immutable_store, &mut new_store)?; + let new_lab_stride = new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + lch_to_rgb( new_store.buffer.borrow(), new_lab_stride, - new_u8_store.buffer.borrow_mut(), - new_u8_store.width as u32 * COMPONENTS as u32, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, new_store.width as u32, new_store.height as u32, &XYZ_TO_SRGB_D65, TransferFunction::Srgb, ); - Ok(new_u8_store) + Ok(()) } fn resize_rgba<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: 
&ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -159,36 +187,16 @@ impl Scaling for LChScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - let mut src_store = store; - - let pool = self - .scaler - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); + let lab_store = Self::rgba_to_lcha(store); + let mut new_target_store = ImageStoreMut::alloc(new_size.width, new_size.height); - let mut has_alpha_premultiplied = false; - - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } - let lab_store = Self::rgba_to_lcha(src_store); - let new_store = self - .scaler - .resize_rgba_f32_impl(new_size, lab_store, false, &pool)?; - let mut rgba_store = Self::lcha_to_srgba(new_store); - if premultiply_alpha && has_alpha_premultiplied { - rgba_store.unpremultiply_alpha(&pool); - } - Ok(rgba_store) + self.scaler + .resize_rgba_f32(&lab_store, &mut new_target_store, premultiply_alpha)?; + Self::lcha_to_srgba(&new_target_store, into); + Ok(()) } } diff --git a/src/colors/linear_precise_scaler.rs b/src/colors/linear_precise_scaler.rs index 2ab3256..03f11d8 100644 --- a/src/colors/linear_precise_scaler.rs +++ b/src/colors/linear_precise_scaler.rs @@ -27,11 +27,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -use crate::alpha_check::has_non_constant_cap_alpha_rgba8; use crate::pic_scale_error::PicScaleError; use crate::scaler::{Scaling, ScalingF32}; use crate::support::check_image_size_overflow; -use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, ThreadingPolicy}; +use crate::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, ThreadingPolicy}; use colorutils_rs::{ linear_to_rgb, linear_to_rgba, rgb_to_linear, rgba_to_linear, TransferFunction, }; @@ -63,35 +62,6 @@ impl LinearScaler { transfer_function, } } - - fn rgba_to_linear(&self, store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); - let lab_stride = store.width as u32 * 4u32 * std::mem::size_of::() as u32; - rgba_to_linear( - store.buffer.borrow(), - store.width as u32 * 4u32, - new_store.buffer.borrow_mut(), - lab_stride, - store.width as u32, - store.height as u32, - self.transfer_function, - ); - new_store - } - - fn linear_to_rgba(&self, store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); - linear_to_rgba( - store.buffer.borrow(), - store.width as u32 * 4u32 * std::mem::size_of::() as u32, - new_store.buffer.borrow_mut(), - store.width as u32 * 4u32, - store.width as u32, - store.height as u32, - self.transfer_function, - ); - new_store - } } impl Scaling for LinearScaler { @@ -99,11 +69,12 @@ impl Scaling for LinearScaler { self.scaler.threading_policy = threading_policy; } - fn resize_rgb( - &self, - new_size: ImageSize, - store: ImageStore, - ) -> Result, PicScaleError> { + fn resize_rgb<'a>( + &'a self, + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -117,15 +88,26 @@ impl Scaling for LinearScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } const COMPONENTS: usize = 3; - let mut lab_store = ImageStore::::alloc(store.width, store.height); + + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; + let lab_stride = lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + rgb_to_linear( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * COMPONENTS as u32, lab_store.buffer.borrow_mut(), lab_stride, @@ -133,28 +115,40 @@ impl Scaling for LinearScaler { lab_store.height as u32, self.transfer_function, ); - let new_store = self.scaler.resize_rgb_f32(new_size, lab_store)?; - let mut new_u8_store = ImageStore::::alloc(new_size.width, new_size.height); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + + self.scaler + .resize_rgb_f32(&new_immutable_store, &mut new_store)?; let new_lab_stride = new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; linear_to_rgb( new_store.buffer.borrow(), new_lab_stride, - new_u8_store.buffer.borrow_mut(), - new_u8_store.width as u32 * COMPONENTS as u32, + into.buffer.borrow_mut(), 
+ into.width as u32 * COMPONENTS as u32, new_store.width as u32, new_store.height as u32, self.transfer_function, ); - Ok(new_u8_store) + Ok(()) } fn resize_rgba<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -168,36 +162,55 @@ impl Scaling for LinearScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - let mut src_store = store; + const COMPONENTS: usize = 4; - let pool = self - .scaler - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); + let mut target = vec![f32::default(); store.width * store.height * COMPONENTS]; - let mut has_alpha_premultiplied = false; + let mut lab_store = + ImageStoreMut::::from_slice(&mut target, store.width, store.height)?; + lab_store.bit_depth = into.bit_depth; - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } - let lab_store = self.rgba_to_linear(src_store); - let new_store = self - .scaler - .resize_rgba_f32_impl(new_size, lab_store, false, &pool)?; - let mut rgba_store = self.linear_to_rgba(new_store); - if premultiply_alpha && has_alpha_premultiplied { - rgba_store.unpremultiply_alpha(&pool); - } - Ok(rgba_store) + let lab_stride = + lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + rgba_to_linear( + store.buffer.as_ref(), + store.width as u32 * COMPONENTS as u32, + lab_store.buffer.borrow_mut(), + lab_stride, + lab_store.width as u32, + lab_store.height as u32, + TransferFunction::Srgb, + ); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgba_f32(&new_immutable_store, &mut new_store, premultiply_alpha)?; + + let new_lab_stride = + new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + + linear_to_rgba( + new_store.buffer.borrow(), + new_lab_stride, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, + new_store.width as u32, + new_store.height as u32, + TransferFunction::Srgb, + ); + + Ok(()) } } diff --git a/src/colors/linear_scaler.rs b/src/colors/linear_scaler.rs index 230cda6..427cc38 100644 --- a/src/colors/linear_scaler.rs +++ b/src/colors/linear_scaler.rs @@ -31,11 +31,10 @@ use colorutils_rs::{ linear_u8_to_rgb, linear_u8_to_rgba, rgb_to_linear_u8, rgba_to_linear_u8, TransferFunction, }; -use crate::alpha_check::has_non_constant_cap_alpha_rgba8; use crate::pic_scale_error::PicScaleError; use crate::scaler::Scaling; use crate::support::check_image_size_overflow; -use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, ThreadingPolicy}; +use crate::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, 
ThreadingPolicy}; #[derive(Debug, Copy, Clone)] /// Linearize image into u8, scale and then convert it back. It's much faster than scale in f32, however involves some precision loss @@ -70,11 +69,12 @@ impl Scaling for LinearApproxScaler { self.scaler.threading_policy = threading_policy; } - fn resize_rgb( + fn resize_rgb<'a>( &self, - new_size: ImageSize, - store: ImageStore, - ) -> Result, PicScaleError> { + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -88,42 +88,67 @@ impl Scaling for LinearApproxScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - const CHANNELS: usize = 3; - let mut linear_store = ImageStore::::alloc(store.width, store.height); + const COMPONENTS: usize = 3; + + let mut target_vertical = vec![u8::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; + + let lab_stride = + lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + rgb_to_linear_u8( - store.buffer.borrow(), - store.width as u32 * CHANNELS as u32, - linear_store.buffer.borrow_mut(), - linear_store.width as u32 * CHANNELS as u32, - linear_store.width as u32, - linear_store.height as u32, + store.buffer.as_ref(), + store.width as u32 * COMPONENTS as u32, + lab_store.buffer.borrow_mut(), + lab_stride, + lab_store.width as u32, + lab_store.height as u32, self.transfer_function, ); - let new_store = self.scaler.resize_rgb(new_size, linear_store)?; - let mut gamma_store = ImageStore::::alloc(new_store.width, new_store.height); - let src = new_store.buffer.borrow(); - let gamma_buffer = gamma_store.buffer.borrow_mut(); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + + self.scaler + .resize_rgb(&new_immutable_store, &mut new_store)?; + let new_lab_stride = + new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; linear_u8_to_rgb( - src, - new_store.width as u32 * CHANNELS as u32, - gamma_buffer, - gamma_store.width as u32 * CHANNELS as u32, - gamma_store.width as u32, - gamma_store.height as u32, + new_store.buffer.borrow(), + new_lab_stride, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, + new_store.width as u32, + new_store.height as u32, self.transfer_function, ); - Ok(gamma_store) + Ok(()) } fn resize_rgba<'a>( &self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -137,58 +162,57 @@ impl Scaling for LinearApproxScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - const CHANNELS: usize = 4; - 
let mut src_store = store; - - let pool = self - .scaler - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - let mut has_alpha_premultiplied = false; - - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } + const COMPONENTS: usize = 4; + + let mut target_vertical = vec![u8::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; + + let lab_stride = + lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; - let mut linear_store = ImageStore::::alloc(src_store.width, src_store.height); rgba_to_linear_u8( - src_store.buffer.borrow(), - src_store.width as u32 * CHANNELS as u32, - linear_store.buffer.borrow_mut(), - linear_store.width as u32 * CHANNELS as u32, - linear_store.width as u32, - linear_store.height as u32, + store.buffer.as_ref(), + store.width as u32 * COMPONENTS as u32, + lab_store.buffer.borrow_mut(), + lab_stride, + lab_store.width as u32, + lab_store.height as u32, self.transfer_function, ); - let new_store = self - .scaler - .resize_rgba_impl(new_size, linear_store, false, &pool)?; - let mut gamma_store = ImageStore::::alloc(new_store.width, new_store.height); - let src = new_store.buffer.borrow(); - let gamma_buffer = gamma_store.buffer.borrow_mut(); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + + self.scaler + .resize_rgba(&new_immutable_store, &mut new_store, premultiply_alpha)?; + let new_lab_stride = + new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; linear_u8_to_rgba( - src, - new_store.width as u32 * CHANNELS as u32, - gamma_buffer, - gamma_store.width as u32 * CHANNELS as u32, - gamma_store.width as u32, - gamma_store.height as u32, + new_store.buffer.borrow(), + new_lab_stride, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, + new_store.width as u32, + new_store.height as u32, self.transfer_function, ); - if premultiply_alpha && has_alpha_premultiplied { - gamma_store.unpremultiply_alpha(&pool); - } - Ok(gamma_store) + Ok(()) } } diff --git a/src/colors/luv_scaler.rs b/src/colors/luv_scaler.rs index 3470c5c..4528a71 100644 --- a/src/colors/luv_scaler.rs +++ b/src/colors/luv_scaler.rs @@ -32,11 +32,10 @@ use colorutils_rs::{ SRGB_TO_XYZ_D65, XYZ_TO_SRGB_D65, }; -use crate::alpha_check::has_non_constant_cap_alpha_rgba8; use crate::pic_scale_error::PicScaleError; use crate::scaler::ScalingF32; use crate::support::check_image_size_overflow; -use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; +use crate::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; #[derive(Debug, Copy, Clone)] /// Converts image to *CIE LUV* components scales it and convert back @@ -50,37 +49,6 @@ impl LuvScaler { scaler: Scaler::new(filter), } } - - fn rgba_to_laba(store: ImageStore) -> ImageStore { - let mut new_store 
= ImageStore::::alloc(store.width, store.height); - let lab_stride = store.width as u32 * 4u32 * std::mem::size_of::() as u32; - rgba_to_luv_with_alpha( - store.buffer.borrow(), - store.width as u32 * 4u32, - new_store.buffer.borrow_mut(), - lab_stride, - store.width as u32, - store.height as u32, - &SRGB_TO_XYZ_D65, - TransferFunction::Srgb, - ); - new_store - } - - fn laba_to_srgba(store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); - luv_with_alpha_to_rgba( - store.buffer.borrow(), - store.width as u32 * 4u32 * std::mem::size_of::() as u32, - new_store.buffer.borrow_mut(), - store.width as u32 * 4u32, - store.width as u32, - store.height as u32, - &XYZ_TO_SRGB_D65, - TransferFunction::Srgb, - ); - new_store - } } impl Scaling for LuvScaler { @@ -88,11 +56,12 @@ impl Scaling for LuvScaler { self.scaler.set_threading_policy(threading_policy) } - fn resize_rgb( + fn resize_rgb<'a>( &self, - new_size: ImageSize, - store: ImageStore, - ) -> Result, PicScaleError> { + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -106,15 +75,25 @@ impl Scaling for LuvScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } const COMPONENTS: usize = 3; - let mut lab_store = ImageStore::::alloc(store.width, store.height); + + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; + let lab_stride = lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; rgb_to_luv( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * COMPONENTS as u32, lab_store.buffer.borrow_mut(), lab_stride, @@ -123,29 +102,42 @@ impl Scaling for LuvScaler { &SRGB_TO_XYZ_D65, TransferFunction::Srgb, ); - let new_store = self.scaler.resize_rgb_f32(new_size, lab_store)?; - let mut new_u8_store = ImageStore::::alloc(new_size.width, new_size.height); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgb_f32(&new_immutable_store, &mut new_store)?; + let new_lab_stride = new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + luv_to_rgb( new_store.buffer.borrow(), new_lab_stride, - new_u8_store.buffer.borrow_mut(), - new_u8_store.width as u32 * COMPONENTS as u32, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, new_store.width as u32, new_store.height as u32, &XYZ_TO_SRGB_D65, TransferFunction::Srgb, ); - Ok(new_u8_store) + Ok(()) } fn resize_rgba<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -159,36 
+151,59 @@ impl Scaling for LuvScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - let mut src_store = store; + const COMPONENTS: usize = 4; - let pool = self - .scaler - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; - let mut has_alpha_premultiplied = false; + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } - let lab_store = Self::rgba_to_laba(src_store); - let new_store = self - .scaler - .resize_rgba_f32_impl(new_size, lab_store, false, &pool)?; - let mut rgba_store = Self::laba_to_srgba(new_store); - if premultiply_alpha && has_alpha_premultiplied { - rgba_store.unpremultiply_alpha(&pool); - } - Ok(rgba_store) + let lab_stride = + lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + rgba_to_luv_with_alpha( + store.buffer.as_ref(), + store.width as u32 * COMPONENTS as u32, + lab_store.buffer.borrow_mut(), + lab_stride, + lab_store.width as u32, + lab_store.height as u32, + &SRGB_TO_XYZ_D65, + TransferFunction::Srgb, + ); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgba_f32(&new_immutable_store, &mut new_store, premultiply_alpha)?; + + let new_lab_stride = + new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + + luv_with_alpha_to_rgba( + new_store.buffer.borrow(), + new_lab_stride, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, + new_store.width as u32, + new_store.height as u32, + &XYZ_TO_SRGB_D65, + TransferFunction::Srgb, + ); + Ok(()) } } diff --git a/src/colors/oklab_scaler.rs b/src/colors/oklab_scaler.rs index d37bfdf..1a49697 100644 --- a/src/colors/oklab_scaler.rs +++ b/src/colors/oklab_scaler.rs @@ -28,11 +28,10 @@ */ use colorutils_rs::{oklab_to_rgb, oklab_to_rgba, rgb_to_oklab, rgba_to_oklab, TransferFunction}; -use crate::alpha_check::has_non_constant_cap_alpha_rgba8; use crate::pic_scale_error::PicScaleError; use crate::scaler::ScalingF32; use crate::support::check_image_size_overflow; -use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; +use crate::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; #[derive(Debug, Copy, Clone)] /// Converts image to *Oklab* components scales it and convert back @@ -50,35 +49,6 @@ impl OklabScaler { transfer_function, } } - - fn rgba_to_laba(&self, store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); - let lab_stride = store.width as u32 * 4u32 * std::mem::size_of::() as u32; - rgba_to_oklab( - store.buffer.borrow(), - store.width as u32 * 4u32, - new_store.buffer.borrow_mut(), - 
lab_stride, - store.width as u32, - store.height as u32, - self.transfer_function, - ); - new_store - } - - fn laba_to_srgba(&self, store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); - oklab_to_rgba( - store.buffer.borrow(), - store.width as u32 * 4u32 * std::mem::size_of::() as u32, - new_store.buffer.borrow_mut(), - store.width as u32 * 4u32, - store.width as u32, - store.height as u32, - self.transfer_function, - ); - new_store - } } impl Scaling for OklabScaler { @@ -86,11 +56,12 @@ impl Scaling for OklabScaler { self.scaler.threading_policy = threading_policy; } - fn resize_rgb( + fn resize_rgb<'a>( &self, - new_size: ImageSize, - store: ImageStore, - ) -> Result, PicScaleError> { + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -104,15 +75,25 @@ impl Scaling for OklabScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } const COMPONENTS: usize = 3; - let mut lab_store = ImageStore::::alloc(store.width, store.height); + + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; + let lab_stride = lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; rgb_to_oklab( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * COMPONENTS as u32, lab_store.buffer.borrow_mut(), lab_stride, @@ -120,28 +101,41 @@ impl Scaling for OklabScaler { lab_store.height as u32, self.transfer_function, ); - let new_store = self.scaler.resize_rgb_f32(new_size, lab_store)?; - let mut new_u8_store = ImageStore::::alloc(new_size.width, new_size.height); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgb_f32(&new_immutable_store, &mut new_store)?; + let new_lab_stride = new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + oklab_to_rgb( new_store.buffer.borrow(), new_lab_stride, - new_u8_store.buffer.borrow_mut(), - new_u8_store.width as u32 * COMPONENTS as u32, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, new_store.width as u32, new_store.height as u32, self.transfer_function, ); - Ok(new_u8_store) + Ok(()) } fn resize_rgba<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -155,36 +149,57 @@ impl Scaling for OklabScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - let mut src_store = store; + const COMPONENTS: usize = 4; - let pool = self - .scaler - 
.threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; - let mut has_alpha_premultiplied = false; + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } - let lab_store = self.rgba_to_laba(src_store); - let new_store = self - .scaler - .resize_rgba_f32_impl(new_size, lab_store, false, &pool)?; - let mut rgba_store = self.laba_to_srgba(new_store); - if premultiply_alpha && has_alpha_premultiplied { - rgba_store.unpremultiply_alpha(&pool); - } - Ok(rgba_store) + let lab_stride = + lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + rgba_to_oklab( + store.buffer.as_ref(), + store.width as u32 * COMPONENTS as u32, + lab_store.buffer.borrow_mut(), + lab_stride, + lab_store.width as u32, + lab_store.height as u32, + self.transfer_function, + ); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgba_f32(&new_immutable_store, &mut new_store, premultiply_alpha)?; + + let new_lab_stride = + new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + + oklab_to_rgba( + new_store.buffer.borrow(), + new_lab_stride, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, + new_store.width as u32, + new_store.height as u32, + self.transfer_function, + ); + Ok(()) } } diff --git a/src/colors/sigmoidal_scaler.rs b/src/colors/sigmoidal_scaler.rs index 69cae09..e1bf688 100644 --- a/src/colors/sigmoidal_scaler.rs +++ b/src/colors/sigmoidal_scaler.rs @@ -27,11 +27,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -use crate::alpha_check::has_non_constant_cap_alpha_rgba8; use crate::pic_scale_error::PicScaleError; use crate::scaler::ScalingF32; use crate::support::check_image_size_overflow; -use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; +use crate::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; use colorutils_rs::{rgb_to_sigmoidal, rgba_to_sigmoidal, sigmoidal_to_rgb, sigmoidal_to_rgba}; #[derive(Debug, Copy, Clone)] @@ -46,33 +45,6 @@ impl SigmoidalScaler { scaler: Scaler::new(filter), } } - - fn rgba_to_sigmoidal(store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); - let lab_stride = store.width as u32 * 4u32 * std::mem::size_of::() as u32; - rgba_to_sigmoidal( - store.buffer.borrow(), - store.width as u32 * 4u32, - new_store.buffer.borrow_mut(), - lab_stride, - store.width as u32, - store.height as u32, - ); - new_store - } - - fn sigmoidal_to_rgba(store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); - sigmoidal_to_rgba( - store.buffer.borrow(), - store.width as u32 * 4u32 * std::mem::size_of::() as u32, - new_store.buffer.borrow_mut(), - store.width as u32 * 4u32, - store.width as u32, - store.height as u32, - ); - new_store - } } impl Scaling for SigmoidalScaler { @@ -80,11 +52,12 @@ impl Scaling for SigmoidalScaler { self.scaler.set_threading_policy(threading_policy) } - fn resize_rgb( + fn resize_rgb<'a>( &self, - new_size: ImageSize, - store: ImageStore, - ) -> Result, PicScaleError> { + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -98,42 +71,65 @@ impl Scaling for SigmoidalScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } const COMPONENTS: usize = 3; - let mut lab_store = ImageStore::::alloc(store.width, store.height); + + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; + let lab_stride = lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; rgb_to_sigmoidal( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * COMPONENTS as u32, lab_store.buffer.borrow_mut(), lab_stride, lab_store.width as u32, lab_store.height as u32, ); - let new_store = self.scaler.resize_rgb_f32(new_size, lab_store)?; - let mut new_u8_store = ImageStore::::alloc(new_size.width, new_size.height); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgb_f32(&new_immutable_store, &mut new_store)?; + let new_lab_stride = new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + sigmoidal_to_rgb( new_store.buffer.borrow(), new_lab_stride, - new_u8_store.buffer.borrow_mut(), - new_u8_store.width as u32 * COMPONENTS as u32, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, new_store.width as u32, 
new_store.height as u32, ); - Ok(new_u8_store) + Ok(()) } fn resize_rgba<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -147,36 +143,55 @@ impl Scaling for SigmoidalScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - let mut src_store = store; + const COMPONENTS: usize = 4; - let pool = self - .scaler - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; - let mut has_alpha_premultiplied = false; + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } - let lab_store = Self::rgba_to_sigmoidal(src_store); - let new_store = self - .scaler - .resize_rgba_f32_impl(new_size, lab_store, false, &pool)?; - let mut rgba_store = Self::sigmoidal_to_rgba(new_store); - if premultiply_alpha && has_alpha_premultiplied { - rgba_store.unpremultiply_alpha(&pool); - } - Ok(rgba_store) + let lab_stride = + lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + rgba_to_sigmoidal( + store.buffer.as_ref(), + store.width as u32 * COMPONENTS as u32, + lab_store.buffer.borrow_mut(), + lab_stride, + lab_store.width as u32, + lab_store.height as u32, + ); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgba_f32(&new_immutable_store, &mut new_store, premultiply_alpha)?; + + let new_lab_stride = + new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + + sigmoidal_to_rgba( + new_store.buffer.borrow(), + new_lab_stride, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, + new_store.width as u32, + new_store.height as u32, + ); + Ok(()) } } diff --git a/src/colors/xyz_scaler.rs b/src/colors/xyz_scaler.rs index 75f5321..328eb48 100644 --- a/src/colors/xyz_scaler.rs +++ b/src/colors/xyz_scaler.rs @@ -32,11 +32,10 @@ use colorutils_rs::{ SRGB_TO_XYZ_D65, XYZ_TO_SRGB_D65, }; -use crate::alpha_check::has_non_constant_cap_alpha_rgba8; use crate::pic_scale_error::PicScaleError; use crate::scaler::{Scaling, ScalingF32}; use crate::support::check_image_size_overflow; -use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, ThreadingPolicy}; +use crate::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, ThreadingPolicy}; #[derive(Debug, Copy, Clone)] /// Converts image to CIE XYZ components scales it and convert back @@ -50,37 +49,6 @@ impl XYZScaler { 
scaler: Scaler::new(filter), } } - - fn rgba_to_xyz(store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); - let lab_stride = store.width as u32 * 4u32 * std::mem::size_of::() as u32; - rgba_to_xyz_with_alpha( - store.buffer.borrow(), - store.width as u32 * 4u32, - new_store.buffer.borrow_mut(), - lab_stride, - store.width as u32, - store.height as u32, - &SRGB_TO_XYZ_D65, - TransferFunction::Srgb, - ); - new_store - } - - fn xyz_to_srgba(store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); - xyz_with_alpha_to_rgba( - store.buffer.borrow(), - store.width as u32 * 4u32 * std::mem::size_of::() as u32, - new_store.buffer.borrow_mut(), - store.width as u32 * 4u32, - store.width as u32, - store.height as u32, - &XYZ_TO_SRGB_D65, - TransferFunction::Srgb, - ); - new_store - } } impl Scaling for XYZScaler { @@ -88,11 +56,12 @@ impl Scaling for XYZScaler { self.scaler.threading_policy = threading_policy; } - fn resize_rgb( - &self, - new_size: ImageSize, - store: ImageStore, - ) -> Result, PicScaleError> { + fn resize_rgb<'a>( + &'a self, + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -106,42 +75,65 @@ impl Scaling for XYZScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } const COMPONENTS: usize = 3; - let mut lab_store = ImageStore::::alloc(store.width, store.height); + + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; + let lab_stride = lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + srgb_to_xyz( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * COMPONENTS as u32, lab_store.buffer.borrow_mut(), lab_stride, lab_store.width as u32, lab_store.height as u32, ); - let new_store = self.scaler.resize_rgb_f32(new_size, lab_store)?; - let mut new_u8_store = ImageStore::::alloc(new_size.width, new_size.height); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + + self.scaler + .resize_rgb_f32(&new_immutable_store, &mut new_store)?; let new_lab_stride = new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; xyz_to_srgb( new_store.buffer.borrow(), new_lab_stride, - new_u8_store.buffer.borrow_mut(), - new_u8_store.width as u32 * COMPONENTS as u32, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, new_store.width as u32, new_store.height as u32, ); - Ok(new_u8_store) + Ok(()) } fn resize_rgba<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return 
Err(PicScaleError::ZeroImageDimensions); } @@ -155,36 +147,59 @@ impl Scaling for XYZScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - let mut src_store = store; + const COMPONENTS: usize = 4; - let pool = self - .scaler - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; - let mut has_alpha_premultiplied = false; + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } - let lab_store = Self::rgba_to_xyz(src_store); - let new_store = self - .scaler - .resize_rgba_f32_impl(new_size, lab_store, false, &pool)?; - let mut rgba_store = Self::xyz_to_srgba(new_store); - if premultiply_alpha && has_alpha_premultiplied { - rgba_store.unpremultiply_alpha(&pool); - } - Ok(rgba_store) + let lab_stride = + lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + + rgba_to_xyz_with_alpha( + store.buffer.as_ref(), + store.width as u32 * COMPONENTS as u32, + lab_store.buffer.borrow_mut(), + lab_stride, + lab_store.width as u32, + lab_store.height as u32, + &SRGB_TO_XYZ_D65, + TransferFunction::Srgb, + ); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + + self.scaler + .resize_rgba_f32(&new_immutable_store, &mut new_store, premultiply_alpha)?; + let new_lab_stride = + new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + xyz_with_alpha_to_rgba( + new_store.buffer.borrow(), + new_lab_stride, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, + new_store.width as u32, + new_store.height as u32, + &XYZ_TO_SRGB_D65, + TransferFunction::Srgb, + ); + Ok(()) } } diff --git a/src/convolution.rs b/src/convolution.rs index 944b4ae..8c67463 100644 --- a/src/convolution.rs +++ b/src/convolution.rs @@ -32,7 +32,7 @@ use rayon::ThreadPool; use std::fmt::Debug; use crate::filter_weights::FilterWeights; -use crate::ImageStore; +use crate::image_store::ImageStoreMut; pub(crate) trait HorizontalConvolutionPass where @@ -41,7 +41,7 @@ where fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ); } @@ -53,7 +53,7 @@ where fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ); } diff --git a/src/dispatch_group_f16.rs b/src/dispatch_group_f16.rs index dd811b4..03bd2d2 100644 --- a/src/dispatch_group_f16.rs +++ b/src/dispatch_group_f16.rs @@ -28,6 +28,7 @@ */ use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::image_store::ImageStoreMut; use crate::ImageStore; use half::f16; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; @@ 
-37,7 +38,7 @@ use rayon::ThreadPool; pub(crate) fn convolve_vertical_dispatch_f16( image_store: &ImageStore, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]), ) { @@ -57,7 +58,7 @@ pub(crate) fn convolve_vertical_dispatch_f16( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &filter_weights.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); }); }); @@ -71,7 +72,7 @@ pub(crate) fn convolve_vertical_dispatch_f16( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &filter_weights.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); }); } @@ -80,7 +81,7 @@ pub(crate) fn convolve_vertical_dispatch_f16( pub(crate) fn convolve_horizontal_dispatch_f16( image_store: &ImageStore, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), @@ -99,7 +100,7 @@ pub(crate) fn convolve_horizontal_dispatch_f16( if let Some(dispatcher) = dispatcher_4_rows { image_store .buffer - .borrow() + .as_ref() .par_chunks_exact(src_stride * 4) .zip( destination @@ -124,11 +125,11 @@ pub(crate) fn convolve_horizontal_dispatch_f16( let left_src_rows = if processed_4 { image_store .buffer - .borrow() + .as_ref() .chunks_exact(src_stride * 4) .remainder() } else { - image_store.buffer.borrow() + image_store.buffer.as_ref() }; let left_dst_rows = if processed_4 { destination @@ -152,7 +153,7 @@ pub(crate) fn convolve_horizontal_dispatch_f16( if let Some(dispatcher) = dispatcher_4_rows { for (src, dst) in image_store .buffer - .borrow() + .as_ref() .chunks_exact(src_stride * 4) .zip( destination @@ -177,11 +178,11 @@ pub(crate) fn convolve_horizontal_dispatch_f16( let left_src_rows = if processed_4 { image_store .buffer - .borrow() + .as_ref() .chunks_exact(src_stride * 4) .remainder() } else { - image_store.buffer.borrow() + image_store.buffer.as_ref() }; let left_dst_rows = if processed_4 { destination diff --git a/src/dispatch_group_f32.rs b/src/dispatch_group_f32.rs index f4b87e9..418afba 100644 --- a/src/dispatch_group_f32.rs +++ b/src/dispatch_group_f32.rs @@ -28,6 +28,7 @@ */ use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::image_store::ImageStoreMut; use crate::ImageStore; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; @@ -36,7 +37,7 @@ use rayon::ThreadPool; pub(crate) fn convolve_vertical_dispatch_f32( image_store: &ImageStore, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]), ) { @@ -56,7 +57,7 @@ pub(crate) fn convolve_vertical_dispatch_f32( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &filter_weights.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); 
dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); }); }); @@ -70,7 +71,7 @@ pub(crate) fn convolve_vertical_dispatch_f32( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &filter_weights.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); }); } @@ -80,7 +81,7 @@ pub(crate) fn convolve_vertical_dispatch_f32( pub(crate) fn convolve_horizontal_dispatch_f32( image_store: &ImageStore, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, &[f32], usize, &mut [f32], usize), @@ -99,7 +100,7 @@ pub(crate) fn convolve_horizontal_dispatch_f32( if let Some(dispatcher) = dispatcher_4_rows { image_store .buffer - .borrow() + .as_ref() .par_chunks_exact(src_stride * 4) .zip( destination @@ -124,11 +125,11 @@ pub(crate) fn convolve_horizontal_dispatch_f32( let left_src_rows = if processed_4 { image_store .buffer - .borrow() + .as_ref() .chunks_exact(src_stride * 4) .remainder() } else { - image_store.buffer.borrow() + image_store.buffer.as_ref() }; let left_dst_rows = if processed_4 { destination @@ -152,7 +153,7 @@ pub(crate) fn convolve_horizontal_dispatch_f32( if let Some(dispatcher) = dispatcher_4_rows { for (src, dst) in image_store .buffer - .borrow() + .as_ref() .chunks_exact(src_stride * 4) .zip( destination @@ -177,11 +178,11 @@ pub(crate) fn convolve_horizontal_dispatch_f32( let left_src_rows = if processed_4 { image_store .buffer - .borrow() + .as_ref() .chunks_exact(src_stride * 4) .remainder() } else { - image_store.buffer.borrow() + image_store.buffer.as_ref() }; let left_dst_rows = if processed_4 { destination diff --git a/src/dispatch_group_u16.rs b/src/dispatch_group_u16.rs index 0e16934..3ff1c37 100644 --- a/src/dispatch_group_u16.rs +++ b/src/dispatch_group_u16.rs @@ -32,6 +32,7 @@ use crate::handler_provider::{ ColumnHandlerFixedPoint, ColumnHandlerFloatingPoint, RowHandlerFixedPoint, RowHandlerFloatingPoint, }; +use crate::image_store::ImageStoreMut; use crate::support::PRECISION; use crate::ImageStore; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; @@ -42,10 +43,10 @@ use rayon::ThreadPool; pub(crate) fn convolve_horizontal_dispatch_u16( image_store: &ImageStore, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { - let src = image_store.buffer.borrow(); + let src = image_store.buffer.as_ref(); let dst = destination.buffer.borrow_mut(); let src_stride = image_store.width * image_store.channels; @@ -156,7 +157,7 @@ pub(crate) fn convolve_horizontal_dispatch_u16( pub(crate) fn convolve_vertical_dispatch_u16( image_store: &ImageStore, filter_weights: FilterWeights, - destination: &mut ImageStore<'_, u16, COMPONENTS>, + destination: &mut ImageStoreMut<'_, u16, COMPONENTS>, pool: &Option, ) { let src_stride = image_store.width * image_store.channels; @@ -176,7 +177,7 @@ pub(crate) fn convolve_vertical_dispatch_u16( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &filter_weights.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); u16::handle_floating_column( dst_width, &bounds, @@ -196,7 +197,7 @@ pub(crate) fn 
convolve_vertical_dispatch_u16( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &approx.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); u16::handle_fixed_column::( dst_width, &bounds, @@ -218,7 +219,7 @@ pub(crate) fn convolve_vertical_dispatch_u16( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &filter_weights.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); u16::handle_floating_column( dst_width, &bounds, @@ -239,7 +240,7 @@ pub(crate) fn convolve_vertical_dispatch_u16( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &approx.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); u16::handle_fixed_column::( dst_width, &bounds, diff --git a/src/dispatch_group_u8.rs b/src/dispatch_group_u8.rs index f78b6c4..89d9eaa 100644 --- a/src/dispatch_group_u8.rs +++ b/src/dispatch_group_u8.rs @@ -28,6 +28,7 @@ */ use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::image_store::ImageStoreMut; use crate::support::PRECISION; use crate::ImageStore; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; @@ -39,14 +40,14 @@ use std::sync::Arc; pub(crate) fn convolve_horizontal_dispatch_u8( image_store: &ImageStore, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, dispatcher_4_rows: Option)>, dispatcher_1_row: fn(&[u8], &mut [u8], &FilterWeights), ) { let approx_weights = filter_weights.numerical_approximation_i16::(0); - let src = image_store.buffer.borrow(); + let src = image_store.buffer.as_ref(); let dst = destination.buffer.borrow_mut(); let src_stride = image_store.width * image_store.channels; @@ -100,7 +101,7 @@ pub(crate) fn convolve_horizontal_dispatch_u8( pub(crate) fn convolve_vertical_dispatch_u8<'a, const COMPONENTS: usize>( image_store: &ImageStore, filter_weights: FilterWeights, - destination: &mut ImageStore<'a, u8, COMPONENTS>, + destination: &mut ImageStoreMut<'a, u8, COMPONENTS>, pool: &Option, dispatcher: fn(usize, &FilterBounds, &[u8], &mut [u8], usize, &[i16]), ) { @@ -120,7 +121,7 @@ pub(crate) fn convolve_vertical_dispatch_u8<'a, const COMPONENTS: usize>( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &approx.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); }); }); @@ -134,7 +135,7 @@ pub(crate) fn convolve_vertical_dispatch_u8<'a, const COMPONENTS: usize>( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &approx.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); }); } diff --git a/src/f16.rs b/src/f16.rs index 3e5c23c..a445a9e 100644 --- a/src/f16.rs +++ b/src/f16.rs @@ -43,6 +43,7 @@ use crate::floating_point_horizontal::{ convolve_row_handler_floating_point, convolve_row_handler_floating_point_4, }; use crate::floating_point_vertical::column_handler_floating_point; +use 
crate::image_store::ImageStoreMut; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::neon::{ convolve_horizontal_rgb_neon_row_one_f16, convolve_horizontal_rgb_neon_rows_4_f16, @@ -98,7 +99,7 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 4> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher_4_rows: Option< @@ -170,7 +171,7 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 4> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = @@ -214,7 +215,7 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 3> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher_4_rows: Option< @@ -266,7 +267,7 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 3> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = @@ -310,7 +311,7 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 1> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let _dispatcher_4_rows: Option< @@ -333,7 +334,7 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 1> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = diff --git a/src/image_store.rs b/src/image_store.rs index 16bd36f..2fc67c2 100644 --- a/src/image_store.rs +++ b/src/image_store.rs @@ -26,6 +26,10 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +use crate::alpha_check::{ + has_non_constant_cap_alpha_rgba16, has_non_constant_cap_alpha_rgba8, + has_non_constant_cap_alpha_rgba_f32, +}; #[cfg(feature = "half")] use crate::alpha_handle_f16::{premultiply_alpha_rgba_f16, unpremultiply_alpha_rgba_f16}; use crate::alpha_handle_f32::{premultiply_alpha_rgba_f32, unpremultiply_alpha_rgba_f32}; @@ -35,9 +39,10 @@ use crate::pic_scale_error::{PicScaleBufferMismatch, PicScaleError}; use crate::ImageSize; use num_traits::FromPrimitive; use rayon::ThreadPool; +use std::borrow::Cow; use std::fmt::Debug; -#[derive(Debug)] +#[derive(Debug, Clone)] /// Holds an image /// /// # Arguments @@ -51,7 +56,7 @@ pub struct ImageStore<'a, T, const N: usize> where T: FromPrimitive + Clone + Copy + Debug, { - pub(crate) buffer: BufferStore<'a, T>, + pub(crate) buffer: std::borrow::Cow<'a, [T]>, /// Channels in the image pub channels: usize, /// Image width @@ -62,6 +67,35 @@ where pub(crate) bit_depth: usize, } +#[derive(Debug)] +/// Holds an image +/// +/// # Arguments +/// `N` - count of channels +/// +/// # Examples +/// ImageStore - represents RGBA +/// ImageStore - represents RGB +/// ImageStore - represents RGB in f32 and etc +pub struct ImageStoreMut<'a, T, const N: usize> +where + T: FromPrimitive + Clone + Copy + Debug, +{ + pub(crate) buffer: BufferStore<'a, T>, + /// Channels in the image + pub channels: usize, + /// Image width + pub width: usize, + /// Image height + pub height: usize, + /// Required for `u16` images + pub bit_depth: usize, +} + +pub(crate) trait CheckStoreDensity { + fn should_have_bit_depth(&self) -> bool; +} + #[derive(Debug)] pub(crate) enum BufferStore<'a, T: Copy + Debug> { Borrowed(&'a mut [T]), @@ -84,7 +118,7 @@ impl BufferStore<'_, T> { } } -impl ImageStore<'static, T, N> +impl<'a, T, const N: usize> ImageStore<'a, T, N> where T: FromPrimitive + Clone + Copy + Debug + Default, { @@ -92,7 +126,7 @@ where slice_ref: Vec, width: usize, height: usize, - ) -> Result, PicScaleError> { + ) -> Result, PicScaleError> { let expected_size = width * height * N; if slice_ref.len() != width * height * N { return Err(PicScaleError::BufferMismatch(PicScaleBufferMismatch { @@ -104,7 +138,7 @@ where })); } Ok(ImageStore:: { - buffer: BufferStore::Owned(slice_ref), + buffer: std::borrow::Cow::Owned(slice_ref), channels: N, width, height, @@ -112,9 +146,93 @@ where }) } - pub fn alloc(width: usize, height: usize) -> ImageStore<'static, T, N> { + pub fn alloc(width: usize, height: usize) -> ImageStore<'a, T, N> { let vc = vec![T::from_u32(0).unwrap_or_default(); width * N * height]; ImageStore:: { + buffer: std::borrow::Cow::Owned(vc), + channels: N, + width, + height, + bit_depth: 0, + } + } +} + +impl CheckStoreDensity for ImageStoreMut<'_, u8, N> { + fn should_have_bit_depth(&self) -> bool { + false + } +} + +impl CheckStoreDensity for ImageStoreMut<'_, f32, N> { + fn should_have_bit_depth(&self) -> bool { + false + } +} + +#[cfg(feature = "half")] +impl CheckStoreDensity for ImageStoreMut<'_, half::f16, N> { + fn should_have_bit_depth(&self) -> bool { + false + } +} + +impl CheckStoreDensity for ImageStoreMut<'_, u16, N> { + fn should_have_bit_depth(&self) -> bool { + true + } +} + +impl ImageStoreMut<'_, T, N> +where + T: FromPrimitive + Clone + Copy + Debug + Default, +{ + pub(crate) fn validate(&self) -> Result<(), PicScaleError> { + let expected_size = self.width * self.height * N; + if self.buffer.borrow().len() != self.width * self.height * N { + return Err(PicScaleError::BufferMismatch(PicScaleBufferMismatch { + expected: 
expected_size, + width: self.width, + height: self.height, + channels: N, + slice_len: self.buffer.borrow().len(), + })); + } + Ok(()) + } +} + +impl<'a, T, const N: usize> ImageStoreMut<'a, T, N> +where + T: FromPrimitive + Clone + Copy + Debug + Default, +{ + pub fn new( + slice_ref: Vec, + width: usize, + height: usize, + ) -> Result, PicScaleError> { + let expected_size = width * height * N; + if slice_ref.len() != width * height * N { + return Err(PicScaleError::BufferMismatch(PicScaleBufferMismatch { + expected: expected_size, + width, + height, + channels: N, + slice_len: slice_ref.len(), + })); + } + Ok(ImageStoreMut:: { + buffer: BufferStore::Owned(slice_ref), + channels: N, + width, + height, + bit_depth: 0, + }) + } + + pub fn alloc(width: usize, height: usize) -> ImageStoreMut<'a, T, N> { + let vc = vec![T::from_u32(0).unwrap_or_default(); width * N * height]; + ImageStoreMut:: { buffer: BufferStore::Owned(vc), channels: N, width, @@ -122,6 +240,21 @@ where bit_depth: 0, } } + + pub fn alloc_with_depth( + width: usize, + height: usize, + bit_depth: usize, + ) -> ImageStoreMut<'a, T, N> { + let vc = vec![T::from_u32(0).unwrap_or_default(); width * N * height]; + ImageStoreMut:: { + buffer: BufferStore::Owned(vc), + channels: N, + width, + height, + bit_depth, + } + } } impl<'a, T, const N: usize> ImageStore<'a, T, N> @@ -134,13 +267,13 @@ where pub fn as_bytes(&self) -> &[T] { match &self.buffer { - BufferStore::Borrowed(p) => p, - BufferStore::Owned(v) => v, + Cow::Borrowed(br) => br.as_ref(), + Cow::Owned(v) => v.as_ref(), } } pub fn from_slice( - slice_ref: &'a mut [T], + slice_ref: &'a [T], width: usize, height: usize, ) -> Result, PicScaleError> { @@ -155,7 +288,7 @@ where })); } Ok(ImageStore:: { - buffer: BufferStore::Borrowed(slice_ref), + buffer: std::borrow::Cow::Borrowed(slice_ref), channels: N, width, height, @@ -165,68 +298,168 @@ where pub fn copied<'b>(&self) -> ImageStore<'b, T, N> { ImageStore:: { - buffer: BufferStore::Owned(self.buffer.borrow().to_vec()), + buffer: std::borrow::Cow::Owned(self.buffer.as_ref().to_vec()), channels: N, width: self.width, height: self.height, bit_depth: self.bit_depth, } } + + pub fn copied_to_mut<'b>(&self, into: &mut ImageStoreMut<'b, T, N>) { + for (&src, dst) in self.buffer.as_ref().iter().zip(into.buffer.borrow_mut()) { + *dst = src; + } + } } -impl ImageStore<'_, u8, 4> { - pub fn unpremultiply_alpha(&mut self, pool: &Option) { - let dst = self.buffer.borrow_mut(); - unpremultiply_alpha_rgba(dst, self.width, self.height, pool); +impl<'a, T, const N: usize> ImageStoreMut<'a, T, N> +where + T: FromPrimitive + Clone + Copy + Debug, +{ + pub fn get_size(&self) -> ImageSize { + ImageSize::new(self.width, self.height) + } + + pub fn as_bytes(&self) -> &[T] { + match &self.buffer { + BufferStore::Borrowed(p) => p, + BufferStore::Owned(v) => v, + } + } + + pub fn from_slice( + slice_ref: &'a mut [T], + width: usize, + height: usize, + ) -> Result, PicScaleError> { + let expected_size = width * height * N; + if slice_ref.len() != width * height * N { + return Err(PicScaleError::BufferMismatch(PicScaleBufferMismatch { + expected: expected_size, + width, + height, + channels: N, + slice_len: slice_ref.len(), + })); + } + Ok(ImageStoreMut:: { + buffer: BufferStore::Borrowed(slice_ref), + channels: N, + width, + height, + bit_depth: 0, + }) + } + + pub fn copied<'b>(&self) -> ImageStoreMut<'b, T, N> { + ImageStoreMut:: { + buffer: BufferStore::Owned(self.buffer.borrow().to_vec()), + channels: N, + width: self.width, + height: 
self.height, + bit_depth: self.bit_depth, + } } - pub fn premultiply_alpha(&self, into: &mut ImageStore<'_, u8, 4>, pool: &Option) { + pub fn to_immutable(&self) -> ImageStore<'_, T, N> { + ImageStore:: { + buffer: std::borrow::Cow::Owned(self.buffer.borrow().to_owned()), + channels: N, + width: self.width, + height: self.height, + bit_depth: self.bit_depth, + } + } +} + +pub(crate) trait AssociateAlpha { + fn premultiply_alpha(&self, into: &mut ImageStoreMut<'_, T, N>, pool: &Option); + fn is_alpha_premultiplication_needed(&self) -> bool; +} + +pub(crate) trait UnassociateAlpha { + fn unpremultiply_alpha(&mut self, pool: &Option); +} + +impl AssociateAlpha for ImageStore<'_, u8, 4> { + fn premultiply_alpha(&self, into: &mut ImageStoreMut<'_, u8, 4>, pool: &Option) { let dst = into.buffer.borrow_mut(); - let src = self.buffer.borrow(); + let src = self.buffer.as_ref(); premultiply_alpha_rgba(dst, src, self.width, self.height, pool); } + + fn is_alpha_premultiplication_needed(&self) -> bool { + has_non_constant_cap_alpha_rgba8(self.buffer.as_ref(), self.width) + } } -impl ImageStore<'_, u16, 4> { - pub fn unpremultiply_alpha(&mut self, pool: &Option) { - let in_place = self.buffer.borrow_mut(); - unpremultiply_alpha_rgba_u16(in_place, self.width, self.height, self.bit_depth, pool); +impl UnassociateAlpha for ImageStoreMut<'_, u8, 4> { + fn unpremultiply_alpha(&mut self, pool: &Option) { + let dst = self.buffer.borrow_mut(); + unpremultiply_alpha_rgba(dst, self.width, self.height, pool); } +} - pub fn premultiply_alpha(&self, into: &mut ImageStore<'_, u16, 4>, pool: &Option) { +impl AssociateAlpha for ImageStore<'_, u16, 4> { + fn premultiply_alpha(&self, into: &mut ImageStoreMut<'_, u16, 4>, pool: &Option) { let dst = into.buffer.borrow_mut(); - let src = self.buffer.borrow(); + let src = self.buffer.as_ref(); premultiply_alpha_rgba_u16(dst, src, self.width, self.height, self.bit_depth, pool); } -} -impl ImageStore<'_, f32, 4> { - pub fn unpremultiply_alpha(&mut self, pool: &Option) { - let dst = self.buffer.borrow_mut(); - unpremultiply_alpha_rgba_f32(dst, self.width, self.height, pool); + fn is_alpha_premultiplication_needed(&self) -> bool { + has_non_constant_cap_alpha_rgba16(self.buffer.as_ref(), self.width) } +} - pub fn premultiply_alpha(&self, into: &mut ImageStore<'_, f32, 4>, pool: &Option) { +impl AssociateAlpha for ImageStore<'_, f32, 4> { + fn premultiply_alpha(&self, into: &mut ImageStoreMut<'_, f32, 4>, pool: &Option) { let dst = into.buffer.borrow_mut(); - let src = self.buffer.borrow(); + let src = self.buffer.as_ref(); premultiply_alpha_rgba_f32(dst, src, self.width, self.height, pool); } -} -#[cfg(feature = "half")] -impl<'a> ImageStore<'a, half::f16, 4> { - pub fn unpremultiply_alpha(&mut self, pool: &Option) { - let dst = self.buffer.borrow_mut(); - unpremultiply_alpha_rgba_f16(dst, self.width, self.height, pool); + fn is_alpha_premultiplication_needed(&self) -> bool { + has_non_constant_cap_alpha_rgba_f32(self.buffer.as_ref(), self.width) } +} - pub fn premultiply_alpha( +#[cfg(feature = "half")] +impl AssociateAlpha for ImageStore<'_, half::f16, 4> { + fn premultiply_alpha( &self, - into: &mut ImageStore<'_, half::f16, 4>, + into: &mut ImageStoreMut<'_, half::f16, 4>, pool: &Option, ) { let dst = into.buffer.borrow_mut(); - let src = self.buffer.borrow(); + let src = self.buffer.as_ref(); premultiply_alpha_rgba_f16(dst, src, self.width, self.height, pool); } + + fn is_alpha_premultiplication_needed(&self) -> bool { + true + } +} + +impl UnassociateAlpha for 
ImageStoreMut<'_, u16, 4> { + fn unpremultiply_alpha(&mut self, pool: &Option) { + let in_place = self.buffer.borrow_mut(); + unpremultiply_alpha_rgba_u16(in_place, self.width, self.height, self.bit_depth, pool); + } +} + +impl UnassociateAlpha for ImageStoreMut<'_, f32, 4> { + fn unpremultiply_alpha(&mut self, pool: &Option) { + let dst = self.buffer.borrow_mut(); + unpremultiply_alpha_rgba_f32(dst, self.width, self.height, pool); + } +} + +#[cfg(feature = "half")] +impl UnassociateAlpha for ImageStoreMut<'_, half::f16, 4> { + fn unpremultiply_alpha(&mut self, pool: &Option) { + let dst = self.buffer.borrow_mut(); + unpremultiply_alpha_rgba_f16(dst, self.width, self.height, pool); + } } diff --git a/src/lib.rs b/src/lib.rs index 290b8d4..6591d36 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -97,7 +97,7 @@ pub use colors::*; #[cfg(feature = "colorspaces")] pub use colorutils_rs::TransferFunction; pub use image_size::ImageSize; -pub use image_store::ImageStore; +pub use image_store::{ImageStore, ImageStoreMut}; pub use math::*; pub use sampler::*; pub use scaler::Scaler; diff --git a/src/neon/utils.rs b/src/neon/utils.rs index a17f401..76c2381 100644 --- a/src/neon/utils.rs +++ b/src/neon/utils.rs @@ -104,6 +104,20 @@ pub(crate) unsafe fn prefer_vfmaq_f32( } } +#[inline(always)] +pub unsafe fn xvst1q_f32_x4(a: *mut f32, b: float32x4x4_t) { + vst1q_f32(a, b.0); + vst1q_f32(a.add(4), b.1); + vst1q_f32(a.add(8), b.2); + vst1q_f32(a.add(12), b.3); +} + +#[inline(always)] +pub unsafe fn xvst1q_f32_x2(a: *mut f32, b: float32x4x2_t) { + vst1q_f32(a, b.0); + vst1q_f32(a.add(4), b.1); +} + #[inline(always)] pub(crate) unsafe fn prefer_vfmaq_laneq_f32( a: float32x4_t, diff --git a/src/neon/vertical_f32.rs b/src/neon/vertical_f32.rs index 454892c..71226ef 100644 --- a/src/neon/vertical_f32.rs +++ b/src/neon/vertical_f32.rs @@ -27,8 +27,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ use crate::filter_weights::FilterBounds; -use crate::neon::utils::xvld1q_f32_x4; use crate::neon::utils::{prefer_vfmaq_f32, xvld1q_f32_x2}; +use crate::neon::utils::{xvld1q_f32_x4, xvst1q_f32_x2, xvst1q_f32_x4}; use std::arch::aarch64::*; macro_rules! conv_vertical_part_neon_16_f32 { @@ -56,7 +56,7 @@ macro_rules! conv_vertical_part_neon_16_f32 { let dst_ptr = $dst.get_unchecked_mut(px..).as_mut_ptr(); let f_set = float32x4x4_t(store_0, store_1, store_2, store_3); - vst1q_f32_x4(dst_ptr, f_set); + xvst1q_f32_x4(dst_ptr, f_set); } }}; } @@ -96,70 +96,77 @@ macro_rules! conv_vertical_part_neon_32_f32 { let dst_ptr = $dst.get_unchecked_mut(px..).as_mut_ptr(); let f_set = float32x4x4_t(store_0, store_1, store_2, store_3); - vst1q_f32_x4(dst_ptr, f_set); + xvst1q_f32_x4(dst_ptr, f_set); let f_set_1 = float32x4x4_t(store_4, store_5, store_6, store_7); - vst1q_f32_x4(dst_ptr.add(16), f_set_1); + xvst1q_f32_x4(dst_ptr.add(16), f_set_1); } }}; } -macro_rules! 
conv_vertical_part_neon_48_f32 { - ($start_y: expr, $start_x: expr, $src: expr, $src_stride: expr, $dst: expr, $filter: expr, $bounds: expr) => {{ - unsafe { - let mut store_0 = vdupq_n_f32(0.); - let mut store_1 = vdupq_n_f32(0.); - let mut store_2 = vdupq_n_f32(0.); - let mut store_3 = vdupq_n_f32(0.); - - let mut store_4 = vdupq_n_f32(0.); - let mut store_5 = vdupq_n_f32(0.); - let mut store_6 = vdupq_n_f32(0.); - let mut store_7 = vdupq_n_f32(0.); - - let mut store_8 = vdupq_n_f32(0.); - let mut store_9 = vdupq_n_f32(0.); - let mut store_10 = vdupq_n_f32(0.); - let mut store_11 = vdupq_n_f32(0.); - - let px = $start_x; - - for j in 0..$bounds.size { - let py = $start_y + j; - let v_weight = vld1q_dup_f32($filter.get_unchecked(j..).as_ptr()); - let src_ptr = $src.get_unchecked($src_stride * py + px..).as_ptr(); - - let item_row_0 = xvld1q_f32_x4(src_ptr); - let item_row_1 = xvld1q_f32_x4(src_ptr.add(16)); - let item_row_2 = xvld1q_f32_x4(src_ptr.add(32)); - - store_0 = prefer_vfmaq_f32(store_0, item_row_0.0, v_weight); - store_1 = prefer_vfmaq_f32(store_1, item_row_0.1, v_weight); - store_2 = prefer_vfmaq_f32(store_2, item_row_0.2, v_weight); - store_3 = prefer_vfmaq_f32(store_3, item_row_0.3, v_weight); - - store_4 = prefer_vfmaq_f32(store_4, item_row_1.0, v_weight); - store_5 = prefer_vfmaq_f32(store_5, item_row_1.1, v_weight); - store_6 = prefer_vfmaq_f32(store_6, item_row_1.2, v_weight); - store_7 = prefer_vfmaq_f32(store_7, item_row_1.3, v_weight); - - store_8 = prefer_vfmaq_f32(store_8, item_row_2.0, v_weight); - store_9 = prefer_vfmaq_f32(store_9, item_row_2.1, v_weight); - store_10 = prefer_vfmaq_f32(store_10, item_row_2.2, v_weight); - store_11 = prefer_vfmaq_f32(store_11, item_row_2.3, v_weight); - } +#[inline(always)] +fn conv_vertical_part_neon_48_f32( + start_y: usize, + start_x: usize, + src: &[f32], + src_stride: usize, + dst: &mut [f32], + filter: &[f32], + bounds: &FilterBounds, +) { + unsafe { + let mut store_0 = vdupq_n_f32(0.); + let mut store_1 = vdupq_n_f32(0.); + let mut store_2 = vdupq_n_f32(0.); + let mut store_3 = vdupq_n_f32(0.); + + let mut store_4 = vdupq_n_f32(0.); + let mut store_5 = vdupq_n_f32(0.); + let mut store_6 = vdupq_n_f32(0.); + let mut store_7 = vdupq_n_f32(0.); + + let mut store_8 = vdupq_n_f32(0.); + let mut store_9 = vdupq_n_f32(0.); + let mut store_10 = vdupq_n_f32(0.); + let mut store_11 = vdupq_n_f32(0.); + + let px = start_x; + + for j in 0..bounds.size { + let py = start_y + j; + let v_weight = vld1q_dup_f32(filter.get_unchecked(j..).as_ptr()); + let src_ptr = src.get_unchecked(src_stride * py + px..).as_ptr(); + + let item_row_0 = xvld1q_f32_x4(src_ptr); + let item_row_1 = xvld1q_f32_x4(src_ptr.add(16)); + let item_row_2 = xvld1q_f32_x4(src_ptr.add(32)); + + store_0 = prefer_vfmaq_f32(store_0, item_row_0.0, v_weight); + store_1 = prefer_vfmaq_f32(store_1, item_row_0.1, v_weight); + store_2 = prefer_vfmaq_f32(store_2, item_row_0.2, v_weight); + store_3 = prefer_vfmaq_f32(store_3, item_row_0.3, v_weight); + + store_4 = prefer_vfmaq_f32(store_4, item_row_1.0, v_weight); + store_5 = prefer_vfmaq_f32(store_5, item_row_1.1, v_weight); + store_6 = prefer_vfmaq_f32(store_6, item_row_1.2, v_weight); + store_7 = prefer_vfmaq_f32(store_7, item_row_1.3, v_weight); + + store_8 = prefer_vfmaq_f32(store_8, item_row_2.0, v_weight); + store_9 = prefer_vfmaq_f32(store_9, item_row_2.1, v_weight); + store_10 = prefer_vfmaq_f32(store_10, item_row_2.2, v_weight); + store_11 = prefer_vfmaq_f32(store_11, item_row_2.3, v_weight); + } - let dst_ptr = 
$dst.get_unchecked_mut(px..).as_mut_ptr(); - let f_set = float32x4x4_t(store_0, store_1, store_2, store_3); - vst1q_f32_x4(dst_ptr, f_set); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); + let f_set = float32x4x4_t(store_0, store_1, store_2, store_3); + xvst1q_f32_x4(dst_ptr, f_set); - let f_set_1 = float32x4x4_t(store_4, store_5, store_6, store_7); - vst1q_f32_x4(dst_ptr.add(16), f_set_1); + let f_set_1 = float32x4x4_t(store_4, store_5, store_6, store_7); + xvst1q_f32_x4(dst_ptr.add(16), f_set_1); - let f_set_2 = float32x4x4_t(store_8, store_9, store_10, store_11); - vst1q_f32_x4(dst_ptr.add(32), f_set_2); - } - }}; + let f_set_2 = float32x4x4_t(store_8, store_9, store_10, store_11); + xvst1q_f32_x4(dst_ptr.add(32), f_set_2); + } } #[inline(always)] @@ -191,7 +198,7 @@ unsafe fn convolve_vertical_part_neon_8_f32( let item = float32x4x2_t(store_0, store_1); let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); - vst1q_f32_x2(dst_ptr, item); + xvst1q_f32_x2(dst_ptr, item); } #[inline(always)] @@ -263,7 +270,7 @@ pub(crate) fn convolve_vertical_rgb_neon_row_f32( let dst_width = width * CHANNELS; while cx + 48 < dst_width { - conv_vertical_part_neon_48_f32!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); + conv_vertical_part_neon_48_f32(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 48; } diff --git a/src/plane_f32.rs b/src/plane_f32.rs index 61f533a..14df280 100644 --- a/src/plane_f32.rs +++ b/src/plane_f32.rs @@ -34,6 +34,7 @@ use crate::convolve_naive_f32::{ }; use crate::dispatch_group_f32::{convolve_horizontal_dispatch_f32, convolve_vertical_dispatch_f32}; use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::image_store::ImageStoreMut; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::neon::{ convolve_horizontal_plane_neon_row_one, convolve_horizontal_plane_neon_rows_4, @@ -54,7 +55,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 1> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher_4_rows: Option< @@ -93,7 +94,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 1> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = diff --git a/src/plane_u16.rs b/src/plane_u16.rs index 4d6fa7d..bb86038 100644 --- a/src/plane_u16.rs +++ b/src/plane_u16.rs @@ -30,6 +30,7 @@ use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u16::{convolve_horizontal_dispatch_u16, convolve_vertical_dispatch_u16}; use crate::filter_weights::FilterWeights; +use crate::image_store::ImageStoreMut; use crate::ImageStore; use rayon::ThreadPool; @@ -38,7 +39,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u16, 1> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, _pool: &Option, ) { convolve_horizontal_dispatch_u16(self, filter_weights, destination, _pool); @@ -49,7 +50,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u16, 1> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { convolve_vertical_dispatch_u16(self, filter_weights, destination, pool); diff --git a/src/plane_u8.rs b/src/plane_u8.rs 
index 0cd736e..b2d43b8 100644 --- a/src/plane_u8.rs +++ b/src/plane_u8.rs @@ -32,6 +32,7 @@ use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::handler_provider::{handle_fixed_column_u8, handle_fixed_row_u8}; +use crate::image_store::ImageStoreMut; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::neon::{convolve_horizontal_plane_neon_row, convolve_horizontal_plane_neon_rows_4_u8}; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] @@ -51,7 +52,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 1> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, _pool: &Option, ) { let mut _dispatcher_4_rows: Option< @@ -86,7 +87,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 1> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let _scale_factor = self.height as f32 / destination.height as f32; diff --git a/src/rgb_f32.rs b/src/rgb_f32.rs index 357d8ec..a3c896c 100644 --- a/src/rgb_f32.rs +++ b/src/rgb_f32.rs @@ -33,7 +33,7 @@ use crate::convolve_naive_f32::*; use crate::dispatch_group_f32::{convolve_horizontal_dispatch_f32, convolve_vertical_dispatch_f32}; use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::floating_point_vertical::column_handler_floating_point; -use crate::image_store::ImageStore; +use crate::image_store::{ImageStore, ImageStoreMut}; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::neon::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -56,7 +56,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 3> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher_4_rows: Option< @@ -95,7 +95,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 3> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = diff --git a/src/rgb_u16.rs b/src/rgb_u16.rs index ac33c17..d420454 100644 --- a/src/rgb_u16.rs +++ b/src/rgb_u16.rs @@ -30,6 +30,7 @@ use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u16::{convolve_horizontal_dispatch_u16, convolve_vertical_dispatch_u16}; use crate::filter_weights::FilterWeights; +use crate::image_store::ImageStoreMut; use crate::ImageStore; use rayon::ThreadPool; @@ -38,7 +39,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u16, 3> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, _pool: &Option, ) { convolve_horizontal_dispatch_u16(self, filter_weights, destination, _pool); @@ -49,7 +50,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u16, 3> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { convolve_vertical_dispatch_u16(self, filter_weights, destination, pool); diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs index 85ce1e5..8aff370 100644 --- a/src/rgb_u8.rs +++ b/src/rgb_u8.rs 
@@ -34,7 +34,7 @@ use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::handler_provider::{ handle_fixed_column_u8, handle_fixed_row_u8, handle_fixed_rows_4_u8, }; -use crate::image_store::ImageStore; +use crate::image_store::{ImageStore, ImageStoreMut}; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] use crate::neon::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -51,7 +51,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 3> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher_4_rows: Option< @@ -86,7 +86,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 3> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let _scale_factor = self.height as f32 / destination.height as f32; diff --git a/src/rgba_f32.rs b/src/rgba_f32.rs index f2081a4..66c60cc 100644 --- a/src/rgba_f32.rs +++ b/src/rgba_f32.rs @@ -37,6 +37,7 @@ use crate::convolve_naive_f32::{ }; use crate::dispatch_group_f32::{convolve_horizontal_dispatch_f32, convolve_vertical_dispatch_f32}; use crate::filter_weights::*; +use crate::image_store::ImageStoreMut; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] use crate::neon::*; use crate::rgb_f32::convolve_vertical_rgb_native_row_f32; @@ -50,7 +51,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 4> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher_4_rows: Option< @@ -97,7 +98,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 4> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = diff --git a/src/rgba_u16.rs b/src/rgba_u16.rs index 38532b3..613bc19 100644 --- a/src/rgba_u16.rs +++ b/src/rgba_u16.rs @@ -30,6 +30,7 @@ use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u16::{convolve_horizontal_dispatch_u16, convolve_vertical_dispatch_u16}; use crate::filter_weights::FilterWeights; +use crate::image_store::ImageStoreMut; use crate::ImageStore; use rayon::ThreadPool; @@ -38,7 +39,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u16, 4> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, _pool: &Option, ) { convolve_horizontal_dispatch_u16(self, filter_weights, destination, _pool); @@ -49,7 +50,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u16, 4> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { convolve_vertical_dispatch_u16(self, filter_weights, destination, pool); diff --git a/src/rgba_u8.rs b/src/rgba_u8.rs index 597ffe4..b4466ff 100644 --- a/src/rgba_u8.rs +++ b/src/rgba_u8.rs @@ -36,6 +36,7 @@ use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::handler_provider::{ handle_fixed_column_u8, handle_fixed_row_u8, handle_fixed_rows_4_u8, }; +use crate::image_store::ImageStoreMut; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::neon::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -54,7 +55,7 @@ impl 
HorizontalConvolutionPass for ImageStore<'_, u8, 4> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, _pool: &Option, ) { let _scale_factor = self.width as f32 / destination.width as f32; @@ -100,7 +101,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 4> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let _scale_factor = self.height as f32 / destination.height as f32; diff --git a/src/scaler.rs b/src/scaler.rs index 513183a..70841d7 100644 --- a/src/scaler.rs +++ b/src/scaler.rs @@ -26,22 +26,20 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use crate::alpha_check::{ - has_non_constant_cap_alpha_rgba16, has_non_constant_cap_alpha_rgba8, - has_non_constant_cap_alpha_rgba_f32, -}; use crate::ar30::{Ar30ByteOrder, Rgb30}; use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::image_size::ImageSize; -use crate::image_store::ImageStore; +use crate::image_store::{ + AssociateAlpha, CheckStoreDensity, ImageStore, ImageStoreMut, UnassociateAlpha, +}; use crate::nearest_sampler::resize_nearest; use crate::pic_scale_error::PicScaleError; use crate::resize_ar30::resize_ar30_impl; use crate::support::check_image_size_overflow; use crate::threading_policy::ThreadingPolicy; use crate::{ConstPI, ConstSqrt2, Jinc, ResamplingFunction}; -use num_traits::{AsPrimitive, Float, Signed}; +use num_traits::{AsPrimitive, Float, FromPrimitive, Signed}; use rayon::ThreadPool; use std::fmt::Debug; use std::ops::{AddAssign, MulAssign, Neg}; @@ -59,34 +57,34 @@ pub trait Scaling { /// Performs rescaling for RGB, channel order does not matter fn resize_rgb<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 3>, - ) -> Result, PicScaleError>; + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError>; /// Performs rescaling for RGBA, for pre-multiplying alpha, converting to LUV or LAB alpha must be last channel fn resize_rgba<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError>; + ) -> Result<(), PicScaleError>; } pub trait ScalingF32 { /// Performs rescaling for RGB f32, channel order does not matter fn resize_rgb_f32<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, f32, 3>, - ) -> Result, PicScaleError>; + store: &ImageStore<'a, f32, 3>, + into: &mut ImageStoreMut<'a, f32, 3>, + ) -> Result<(), PicScaleError>; /// Performs rescaling for RGBA f32, alpha expected to be last fn resize_rgba_f32<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, f32, 4>, + store: &ImageStore<'a, f32, 4>, + into: &mut ImageStoreMut<'a, f32, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError>; + ) -> Result<(), PicScaleError>; } pub trait ScalingU16 { @@ -100,11 +98,10 @@ pub trait ScalingU16 { /// # Panics /// Panic if bit-depth < 1 or bit-depth > 16 fn resize_plane_u16<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, u16, 1>, - bit_depth: usize, - ) -> Result, PicScaleError>; + &'a self, + store: &ImageStore<'a, u16, 1>, + into: &mut ImageStoreMut<'a, u16, 1>, + ) -> Result<(), PicScaleError>; /// Performs 
rescaling for RGB, channel order does not matter /// @@ -116,11 +113,10 @@ pub trait ScalingU16 { /// # Panics /// Panic if bit-depth < 1 or bit-depth > 16 fn resize_rgb_u16<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, u16, 3>, - bit_depth: usize, - ) -> Result, PicScaleError>; + &'a self, + store: &ImageStore<'a, u16, 3>, + into: &mut ImageStoreMut<'a, u16, 3>, + ) -> Result<(), PicScaleError>; /// Performs rescaling for RGBA, for pre-multiplying alpha should be last /// @@ -133,12 +129,11 @@ pub trait ScalingU16 { /// # Panics /// Panic if bit-depth < 1 or bit-depth > 16 fn resize_rgba_u16<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, u16, 4>, - bit_depth: usize, + &'a self, + store: &ImageStore<'a, u16, 4>, + into: &mut ImageStoreMut<'a, u16, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError>; + ) -> Result<(), PicScaleError>; } impl Scaler { @@ -213,8 +208,8 @@ impl Scaler { let start: usize = (center_x - filter_radius).floor().max(0f32.as_()).as_(); let end: usize = (center_x + filter_radius) .ceil() - .min(in_size.as_()) .min(start.as_() + kernel_size.as_()) + .min(in_size.as_()) .as_(); let center = center_x - 0.5.as_(); @@ -325,8 +320,8 @@ impl Scaler { let start: usize = sx.floor().max(0f32.as_()).as_(); let end: usize = (sx + kernel_size.as_()) .ceil() - .min(in_size.as_()) .min(start.as_() + kernel_size.as_()) + .min(in_size.as_()) .as_(); let size = end - start; @@ -368,13 +363,21 @@ impl Scaler { } impl Scaler { - pub(crate) fn resize_rgba_impl<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, - premultiply_alpha: bool, - pool: &Option, - ) -> Result, PicScaleError> { + pub(crate) fn generic_resize< + 'a, + T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + const N: usize, + >( + &'a self, + store: &ImageStore<'a, T, N>, + into: &mut ImageStoreMut<'a, T, N>, + ) -> Result<(), PicScaleError> + where + ImageStore<'a, T, N>: VerticalConvolutionPass + HorizontalConvolutionPass, + ImageStoreMut<'a, T, N>: CheckStoreDensity, + { + let new_size = into.get_size(); + into.validate()?; if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -387,96 +390,15 @@ impl Scaler { return Err(PicScaleError::DestinationImageIsTooLarge); } - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); - } - - let should_do_horizontal = store.width != new_size.width; - let should_do_vertical = store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); - - let mut src_store = store; - - if self.function == ResamplingFunction::Nearest { - let mut new_image = ImageStore::::alloc(new_size.width, new_size.height); - resize_nearest::( - src_store.buffer.borrow(), - src_store.width, - src_store.height, - new_image.buffer.borrow_mut(), - new_size.width, - new_size.height, - pool, - ); - return Ok(new_image); - } - - let mut has_alpha_premultiplied = false; - - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, pool); - src_store = new_store; - has_alpha_premultiplied = true; + if into.should_have_bit_depth() { + if !(1..=16).contains(&into.bit_depth) { + return 
Err(PicScaleError::UnsupportedBitDepth(into.bit_depth)); } } - if should_do_vertical { - let mut new_image_vertical = - ImageStore::::alloc(src_store.width, new_size.height); - let vertical_filters = - self.generate_weights(src_store.height, new_image_vertical.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, pool); - src_store = new_image_vertical; - } - - assert_eq!(src_store.height, new_size.height); - - if should_do_horizontal { - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - let mut new_image_horizontal = - ImageStore::::alloc(new_size.width, new_size.height); - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, pool); - src_store = new_image_horizontal; - } - - assert_eq!(src_store.width, new_size.width); - - if premultiply_alpha && has_alpha_premultiplied { - src_store.unpremultiply_alpha(pool); - } - - Ok(src_store) - } -} - -impl Scaling for Scaler { - fn set_threading_policy(&mut self, threading_policy: ThreadingPolicy) { - self.threading_policy = threading_policy; - } - - fn resize_rgb<'a>( - &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 3>, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } - - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } - - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); - } - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } let pool = self @@ -484,164 +406,257 @@ impl Scaling for Scaler { .get_pool(ImageSize::new(new_size.width, new_size.height)); if self.function == ResamplingFunction::Nearest { - let mut allocated_store: Vec = vec![0u8; new_size.width * 3 * new_size.height]; - resize_nearest::( - store.buffer.borrow(), + resize_nearest::( + store.buffer.as_ref(), store.width, store.height, - &mut allocated_store, + into.buffer.borrow_mut(), new_size.width, new_size.height, &pool, ); - return ImageStore::::new(allocated_store, new_size.width, new_size.height); + return Ok(()); } let should_do_horizontal = store.width != new_size.width; let should_do_vertical = store.height != new_size.height; assert!(should_do_horizontal || should_do_vertical); - let mut src_store = store; - - if should_do_vertical { - let vertical_filters = self.generate_weights(src_store.height, new_size.height); - let mut new_image_vertical = - ImageStore::::alloc(src_store.width, new_size.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; - } - - assert_eq!(src_store.height, new_size.height); + if should_do_vertical && should_do_horizontal { + let mut target_vertical = vec![T::default(); store.width * new_size.height * N]; - if should_do_horizontal { - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - let mut new_image_horizontal = - ImageStore::::alloc(new_size.width, new_size.height); - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; + let mut new_image_vertical = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + new_size.height, + )?; + new_image_vertical.bit_depth = into.bit_depth; + let 
vertical_filters = self.generate_weights(store.height, new_size.height); + store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: N, + width: store.width, + height: new_size.height, + bit_depth: into.bit_depth, + }; + let horizontal_filters = self.generate_weights(store.width, new_size.width); + new_immutable_store.convolve_horizontal(horizontal_filters, into, &pool); + Ok(()) + } else if should_do_vertical { + let vertical_filters = self.generate_weights(store.height, new_size.height); + store.convolve_vertical(vertical_filters, into, &pool); + Ok(()) + } else { + assert!(should_do_horizontal); + let horizontal_filters = self.generate_weights(store.width, new_size.width); + store.convolve_horizontal(horizontal_filters, into, &pool); + Ok(()) } - - assert_eq!(src_store.width, new_size.width); - - Ok(src_store) - } - - fn resize_rgba<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, - premultiply_alpha: bool, - ) -> Result, PicScaleError> { - let pool = self - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - self.resize_rgba_impl(new_size, store, premultiply_alpha, &pool) } -} -impl Scaler { - pub(crate) fn resize_rgba_f32_impl<'a>( + fn forward_resize_with_alpha< + 'a, + T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + const N: usize, + >( &'a self, - new_size: ImageSize, - store: ImageStore<'a, f32, 4>, - premultiply_alpha: bool, + store: &ImageStore<'a, T, N>, + into: &mut ImageStoreMut<'a, T, N>, + premultiply_alpha_requested: bool, pool: &Option, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } + ) -> Result<(), PicScaleError> + where + ImageStore<'a, T, N>: + VerticalConvolutionPass + HorizontalConvolutionPass + AssociateAlpha, + ImageStoreMut<'a, T, N>: CheckStoreDensity + UnassociateAlpha, + { + let new_size = into.get_size(); + let mut src_store: std::borrow::Cow<'_, ImageStore<'_, T, N>> = + std::borrow::Cow::Borrowed(store); - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } + let mut has_alpha_premultiplied = true; - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); + if premultiply_alpha_requested { + let is_alpha_premultiplication_reasonable = + src_store.is_alpha_premultiplication_needed(); + if is_alpha_premultiplication_reasonable { + let mut target_premultiplied = + vec![T::default(); src_store.width * src_store.height * N]; + let mut new_store = ImageStoreMut::::from_slice( + &mut target_premultiplied, + src_store.width, + src_store.height, + )?; + new_store.bit_depth = into.bit_depth; + src_store.premultiply_alpha(&mut new_store, &pool); + src_store = std::borrow::Cow::Owned(ImageStore:: { + buffer: std::borrow::Cow::Owned(target_premultiplied), + channels: N, + width: src_store.width, + height: src_store.height, + bit_depth: into.bit_depth, + }); + has_alpha_premultiplied = true; + } } - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); - } + let mut target_vertical = vec![T::default(); src_store.width * new_size.height * N]; - let mut src_store = store; + let mut new_image_vertical = ImageStoreMut::::from_slice( + &mut 
target_vertical, + src_store.width, + new_size.height, + )?; + new_image_vertical.bit_depth = into.bit_depth; + let vertical_filters = self.generate_weights(src_store.height, new_size.height); + src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - if self.function == ResamplingFunction::Nearest { - let mut allocated_store: Vec = vec![0f32; new_size.width * 4 * new_size.height]; - resize_nearest::( - src_store.buffer.borrow(), - src_store.width, - src_store.height, - &mut allocated_store, - new_size.width, - new_size.height, - pool, - ); - let new_image = ImageStore::new(allocated_store, new_size.width, new_size.height)?; - return Ok(new_image); + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: N, + width: src_store.width, + height: new_size.height, + bit_depth: into.bit_depth, + }; + let horizontal_filters = self.generate_weights(src_store.width, new_size.width); + new_immutable_store.convolve_horizontal(horizontal_filters, into, &pool); + + if premultiply_alpha_requested && has_alpha_premultiplied { + into.unpremultiply_alpha(&pool); } - let should_do_horizontal = src_store.width != new_size.width; - let should_do_vertical = src_store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); + Ok(()) + } - let mut has_alpha_premultiplied = false; + fn forward_resize_vertical_with_alpha< + 'a, + T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + const N: usize, + >( + &'a self, + store: &ImageStore<'a, T, N>, + into: &mut ImageStoreMut<'a, T, N>, + premultiply_alpha_requested: bool, + pool: &Option, + ) -> Result<(), PicScaleError> + where + ImageStore<'a, T, N>: + VerticalConvolutionPass + HorizontalConvolutionPass + AssociateAlpha, + ImageStoreMut<'a, T, N>: CheckStoreDensity + UnassociateAlpha, + { + let new_size = into.get_size(); + let mut src_store = std::borrow::Cow::Borrowed(store); + + let mut has_alpha_premultiplied = true; - if premultiply_alpha { + if premultiply_alpha_requested { let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba_f32(src_store.buffer.borrow(), src_store.width); + src_store.is_alpha_premultiplication_needed(); if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, new_size.height); - src_store.premultiply_alpha(&mut new_store, pool); - src_store = new_store; + let mut target_premultiplied = + vec![T::default(); src_store.width * src_store.height * N]; + let mut new_store = ImageStoreMut::::from_slice( + &mut target_premultiplied, + src_store.width, + src_store.height, + )?; + new_store.bit_depth = into.bit_depth; + src_store.premultiply_alpha(&mut new_store, &pool); + src_store = std::borrow::Cow::Owned(ImageStore:: { + buffer: std::borrow::Cow::Owned(target_premultiplied), + channels: N, + width: src_store.width, + height: src_store.height, + bit_depth: into.bit_depth, + }); has_alpha_premultiplied = true; } } - if should_do_vertical { - let allocated_store_vertical: Vec = - vec![0f32; src_store.width * 4 * new_size.height]; - let mut new_image_vertical = ImageStore::::new( - allocated_store_vertical, - src_store.width, - new_size.height, - )?; - let vertical_filters = - self.generate_weights(src_store.height, new_image_vertical.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, pool); - src_store = new_image_vertical; + let vertical_filters = self.generate_weights(src_store.height, new_size.height); + 
src_store.convolve_vertical(vertical_filters, into, &pool); + + if premultiply_alpha_requested && has_alpha_premultiplied { + into.unpremultiply_alpha(&pool); } - assert_eq!(src_store.height, new_size.height); + Ok(()) + } - if should_do_horizontal { - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - let allocated_store_horizontal: Vec = - vec![0f32; new_size.width * 4 * new_size.height]; - let mut new_image_horizontal = ImageStore::::new( - allocated_store_horizontal, - new_size.width, - new_size.height, - )?; - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, pool); - src_store = new_image_horizontal; + fn forward_resize_horizontal_with_alpha< + 'a, + T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + const N: usize, + >( + &'a self, + store: &ImageStore<'a, T, N>, + into: &mut ImageStoreMut<'a, T, N>, + premultiply_alpha_requested: bool, + pool: &Option, + ) -> Result<(), PicScaleError> + where + ImageStore<'a, T, N>: + VerticalConvolutionPass + HorizontalConvolutionPass + AssociateAlpha, + ImageStoreMut<'a, T, N>: CheckStoreDensity + UnassociateAlpha, + { + let new_size = into.get_size(); + let mut src_store = std::borrow::Cow::Borrowed(store); + + let mut has_alpha_premultiplied = true; + + if premultiply_alpha_requested { + let is_alpha_premultiplication_reasonable = + src_store.is_alpha_premultiplication_needed(); + if is_alpha_premultiplication_reasonable { + let mut target_premultiplied = + vec![T::default(); src_store.width * src_store.height * N]; + let mut new_store = ImageStoreMut::::from_slice( + &mut target_premultiplied, + src_store.width, + src_store.height, + )?; + new_store.bit_depth = into.bit_depth; + src_store.premultiply_alpha(&mut new_store, &pool); + src_store = std::borrow::Cow::Owned(ImageStore:: { + buffer: std::borrow::Cow::Owned(target_premultiplied), + channels: N, + width: src_store.width, + height: src_store.height, + bit_depth: into.bit_depth, + }); + has_alpha_premultiplied = true; + } } - assert_eq!(src_store.width, new_size.width); + let horizontal_filters = self.generate_weights(src_store.width, new_size.width); + src_store.convolve_horizontal(horizontal_filters, into, &pool); - if premultiply_alpha && has_alpha_premultiplied { - src_store.unpremultiply_alpha(pool); + if premultiply_alpha_requested && has_alpha_premultiplied { + into.unpremultiply_alpha(&pool); } - Ok(src_store) + Ok(()) } -} -impl ScalingF32 for Scaler { - fn resize_rgb_f32<'a>( + pub(crate) fn generic_resize_with_alpha< + 'a, + T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + const N: usize, + >( &'a self, - new_size: ImageSize, - store: ImageStore<'a, f32, 3>, - ) -> Result, PicScaleError> { + store: &ImageStore<'a, T, N>, + into: &mut ImageStoreMut<'a, T, N>, + premultiply_alpha_requested: bool, + ) -> Result<(), PicScaleError> + where + ImageStore<'a, T, N>: + VerticalConvolutionPass + HorizontalConvolutionPass + AssociateAlpha, + ImageStoreMut<'a, T, N>: CheckStoreDensity + UnassociateAlpha, + { + let new_size = into.get_size(); + into.validate()?; if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -654,8 +669,15 @@ impl ScalingF32 for Scaler { return Err(PicScaleError::DestinationImageIsTooLarge); } + if into.should_have_bit_depth() { + if !(1..=16).contains(&into.bit_depth) { + return Err(PicScaleError::UnsupportedBitDepth(into.bit_depth)); + } + } + if store.width == 
new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } let pool = self @@ -663,153 +685,88 @@ impl ScalingF32 for Scaler { .get_pool(ImageSize::new(new_size.width, new_size.height)); if self.function == ResamplingFunction::Nearest { - let mut allocated_store: Vec = vec![0f32; new_size.width * 3 * new_size.height]; - resize_nearest::( - store.buffer.borrow(), + resize_nearest::( + store.buffer.as_ref(), store.width, store.height, - &mut allocated_store, + into.buffer.borrow_mut(), new_size.width, new_size.height, &pool, ); - let new_image = - ImageStore::::new(allocated_store, new_size.width, new_size.height); - return new_image; + return Ok(()); } - let mut src_store = store; - - let should_do_horizontal = src_store.width != new_size.width; - let should_do_vertical = src_store.height != new_size.height; + let should_do_horizontal = store.width != new_size.width; + let should_do_vertical = store.height != new_size.height; assert!(should_do_horizontal || should_do_vertical); - if should_do_vertical { - let allocated_store_vertical: Vec = - vec![0f32; src_store.width * 3 * new_size.height]; - let mut new_image_vertical = ImageStore::::new( - allocated_store_vertical, - src_store.width, - new_size.height, - )?; - let vertical_filters = - self.generate_weights(src_store.height, new_image_vertical.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; + if should_do_vertical && should_do_horizontal { + self.forward_resize_with_alpha(store, into, premultiply_alpha_requested, &pool) + } else if should_do_vertical { + self.forward_resize_vertical_with_alpha(store, into, premultiply_alpha_requested, &pool) + } else { + assert!(should_do_horizontal); + self.forward_resize_horizontal_with_alpha( + store, + into, + premultiply_alpha_requested, + &pool, + ) } + } +} - assert_eq!(src_store.height, new_size.height); +impl Scaling for Scaler { + fn set_threading_policy(&mut self, threading_policy: ThreadingPolicy) { + self.threading_policy = threading_policy; + } - if should_do_horizontal { - let allocated_store_horizontal: Vec = - vec![0f32; new_size.width * 3 * new_size.height]; - let mut new_image_horizontal = ImageStore::::new( - allocated_store_horizontal, - new_size.width, - new_size.height, - )?; - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; - } + fn resize_rgb<'a>( + &'a self, + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError> { + self.generic_resize(store, into) + } - assert_eq!(src_store.width, new_size.width); + fn resize_rgba<'a>( + &'a self, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, + premultiply_alpha: bool, + ) -> Result<(), PicScaleError> { + self.generic_resize_with_alpha(store, into, premultiply_alpha) + } +} - Ok(src_store) +impl ScalingF32 for Scaler { + fn resize_rgb_f32<'a>( + &'a self, + store: &ImageStore<'a, f32, 3>, + into: &mut ImageStoreMut<'a, f32, 3>, + ) -> Result<(), PicScaleError> { + self.generic_resize(store, into) } fn resize_rgba_f32<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, f32, 4>, + store: &ImageStore<'a, f32, 4>, + into: &mut ImageStoreMut<'a, f32, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { - let pool = self - .threading_policy - 
.get_pool(ImageSize::new(new_size.width, new_size.height)); - self.resize_rgba_f32_impl(new_size, store, premultiply_alpha, &pool) + ) -> Result<(), PicScaleError> { + self.generic_resize_with_alpha(store, into, premultiply_alpha) } } impl Scaler { /// Performs rescaling for f32 plane pub fn resize_plane_f32<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, f32, 1>, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } - - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } - - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); - } - - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); - } - - let pool = self - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - if self.function == ResamplingFunction::Nearest { - let mut allocated_store: Vec = vec![0f32; new_size.width * new_size.height]; - resize_nearest::( - store.buffer.borrow(), - store.width, - store.height, - &mut allocated_store, - new_size.width, - new_size.height, - &pool, - ); - let new_image = - ImageStore::::new(allocated_store, new_size.width, new_size.height)?; - return Ok(new_image); - } - - let mut src_store = store; - - let should_do_horizontal = src_store.width != new_size.width; - let should_do_vertical = src_store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); - - if should_do_vertical { - let allocated_store_vertical: Vec = vec![0f32; src_store.width * new_size.height]; - let mut new_image_vertical = ImageStore::::new( - allocated_store_vertical, - src_store.width, - new_size.height, - )?; - let vertical_filters = - self.generate_weights(src_store.height, new_image_vertical.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; - } - - assert_eq!(src_store.height, new_size.height); - - if should_do_horizontal { - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - let allocated_store_horizontal: Vec = vec![0f32; new_size.width * new_size.height]; - let mut new_image_horizontal = ImageStore::::new( - allocated_store_horizontal, - new_size.width, - new_size.height, - )?; - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; - } - - assert_eq!(src_store.width, new_size.width); - - Ok(src_store) + &'a self, + store: &ImageStore<'a, f32, 1>, + into: &mut ImageStoreMut<'a, f32, 1>, + ) -> Result<(), PicScaleError> { + self.generic_resize(store, into) } } @@ -817,154 +774,20 @@ impl Scaler { /// Performs rescaling for u8 plane pub fn resize_plane<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 1>, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } - - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } - - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); - } - - if store.width == new_size.width && store.height == new_size.height { - return 
Ok(store.copied()); - } - - let pool = self - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - if self.function == ResamplingFunction::Nearest { - let mut allocated_store: Vec = vec![0u8; new_size.width * new_size.height]; - resize_nearest::( - store.buffer.borrow(), - store.width, - store.height, - &mut allocated_store, - new_size.width, - new_size.height, - &pool, - ); - let new_image = - ImageStore::::new(allocated_store, new_size.width, new_size.height)?; - return Ok(new_image); - } - - let should_do_horizontal = store.width != new_size.width; - let should_do_vertical = store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); - - let mut src_store = store; - - if should_do_vertical { - let vertical_filters = self.generate_weights(src_store.height, new_size.height); - let mut new_image_vertical = - ImageStore::::alloc(src_store.width, new_size.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; - } - - assert_eq!(src_store.height, new_size.height); - - if should_do_horizontal { - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - let mut new_image_horizontal = - ImageStore::::alloc(new_size.width, new_size.height); - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; - } - - assert_eq!(src_store.width, new_size.width); - - Ok(src_store) + store: &ImageStore<'a, u8, 1>, + into: &mut ImageStoreMut<'a, u8, 1>, + ) -> Result<(), PicScaleError> { + self.generic_resize(store, into) } } impl ScalingU16 for Scaler { fn resize_rgb_u16<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, u16, 3>, - bit_depth: usize, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } - - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } - - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); - } - - if !(1..=16).contains(&bit_depth) { - return Err(PicScaleError::UnsupportedBitDepth(bit_depth)); - } - - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); - } - - let should_do_horizontal = store.width != new_size.width; - let should_do_vertical = store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); - - let pool = self - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - if self.function == ResamplingFunction::Nearest { - let mut allocated_store: Vec = vec![0u16; new_size.width * 3 * new_size.height]; - resize_nearest::( - store.buffer.borrow(), - store.width, - store.height, - &mut allocated_store, - new_size.width, - new_size.height, - &pool, - ); - let mut new_image = - ImageStore::::new(allocated_store, new_size.width, new_size.height)?; - new_image.bit_depth = bit_depth; - return Ok(new_image); - } - - let mut src_store = store; - src_store.bit_depth = bit_depth; - - if should_do_vertical { - let vertical_filters = self.generate_weights(src_store.height, new_size.height); - let mut new_image_vertical = - ImageStore::::alloc(src_store.width, new_size.height); - new_image_vertical.bit_depth = bit_depth; - src_store.bit_depth = bit_depth; - 
src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; - } - - assert_eq!(src_store.height, new_size.height); - - if should_do_horizontal { - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - let mut new_image_horizontal = - ImageStore::::alloc(new_size.width, new_size.height); - new_image_horizontal.bit_depth = bit_depth; - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; - } - - assert_eq!(src_store.width, new_size.width); - - Ok(src_store) + &'a self, + store: &ImageStore<'a, u16, 3>, + into: &mut ImageStoreMut<'a, u16, 3>, + ) -> Result<(), PicScaleError> { + self.generic_resize(store, into) } /// Resizes u16 image @@ -978,180 +801,21 @@ impl ScalingU16 for Scaler { /// # Panics /// Panic if bit depth < 1 or bit depth > 16 fn resize_rgba_u16<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, u16, 4>, - bit_depth: usize, + &'a self, + store: &ImageStore<'a, u16, 4>, + into: &mut ImageStoreMut<'a, u16, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } - - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } - - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); - } - - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); - } - - let should_do_horizontal = store.width != new_size.width; - let should_do_vertical = store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); - - if !(1..=16).contains(&bit_depth) { - return Err(PicScaleError::UnsupportedBitDepth(bit_depth)); - } - - let mut src_store = store; - src_store.bit_depth = bit_depth; - - let pool = self - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - if self.function == ResamplingFunction::Nearest { - let mut new_image = ImageStore::::alloc(new_size.width, new_size.height); - resize_nearest::( - src_store.buffer.borrow(), - src_store.width, - src_store.height, - new_image.buffer.borrow_mut(), - new_size.width, - new_size.height, - &pool, - ); - new_image.bit_depth = bit_depth; - return Ok(new_image); - } - - let mut has_alpha_premultiplied = false; - - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba16(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - new_store.bit_depth = src_store.bit_depth; - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } - - if should_do_vertical { - let mut new_image_vertical = - ImageStore::::alloc(src_store.width, new_size.height); - let vertical_filters = - self.generate_weights(src_store.height, new_image_vertical.height); - src_store.bit_depth = bit_depth; - new_image_vertical.bit_depth = bit_depth; - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; - } - - assert_eq!(src_store.height, new_size.height); - - if should_do_horizontal { - let horizontal_filters = self.generate_weights(src_store.width, 
new_size.width); - let mut new_image_horizontal = - ImageStore::::alloc(new_size.width, new_size.height); - new_image_horizontal.bit_depth = bit_depth; - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; - } - - assert_eq!(src_store.width, new_size.width); - - if premultiply_alpha && has_alpha_premultiplied { - src_store.unpremultiply_alpha(&pool); - return Ok(src_store); - } - Ok(src_store) + ) -> Result<(), PicScaleError> { + self.generic_resize_with_alpha(store, into, premultiply_alpha) } /// Performs rescaling for u16 plane fn resize_plane_u16<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, u16, 1>, - bit_depth: usize, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } - - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } - - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); - } - - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); - } - - if !(1..=16).contains(&bit_depth) { - return Err(PicScaleError::UnsupportedBitDepth(bit_depth)); - } - - let should_do_horizontal = store.width != new_size.width; - let should_do_vertical = store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); - - let pool = self - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - if self.function == ResamplingFunction::Nearest { - let mut allocated_store: Vec = vec![0u16; new_size.width * new_size.height]; - resize_nearest::( - store.buffer.borrow(), - store.width, - store.height, - &mut allocated_store, - new_size.width, - new_size.height, - &pool, - ); - let mut new_image = - ImageStore::::new(allocated_store, new_size.width, new_size.height)?; - new_image.bit_depth = bit_depth; - return Ok(new_image); - } - - let mut src_store = store; - src_store.bit_depth = bit_depth; - - if should_do_vertical { - let vertical_filters = self.generate_weights(src_store.height, new_size.height); - let mut new_image_vertical = - ImageStore::::alloc(src_store.width, new_size.height); - new_image_vertical.bit_depth = bit_depth; - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; - } - - assert_eq!(src_store.height, new_size.height); - - if should_do_horizontal { - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - let mut new_image_horizontal = - ImageStore::::alloc(new_size.width, new_size.height); - new_image_horizontal.bit_depth = bit_depth; - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; - } - assert_eq!(src_store.width, new_size.width); - - Ok(src_store) + &'a self, + store: &ImageStore<'a, u16, 1>, + into: &mut ImageStoreMut<'a, u16, 1>, + ) -> Result<(), PicScaleError> { + self.generic_resize(store, into) } } diff --git a/src/scaler_f16.rs b/src/scaler_f16.rs index d7c3d72..ba8096f 100644 --- a/src/scaler_f16.rs +++ b/src/scaler_f16.rs @@ -27,12 +27,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; -use crate::nearest_sampler::resize_nearest; +use crate::image_store::ImageStoreMut; use crate::pic_scale_error::PicScaleError; -use crate::support::check_image_size_overflow; -use crate::ResamplingFunction::Nearest; -use crate::{ImageSize, ImageStore, Scaler}; +use crate::{ImageStore, Scaler}; use half::f16; // f16 @@ -40,253 +37,28 @@ impl Scaler { /// Resize f16 RGBA image pub fn resize_rgba_f16<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, f16, 4>, + store: &ImageStore<'a, f16, 4>, + into: &mut ImageStoreMut<'a, f16, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } - - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } - - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); - } - - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); - } - - let mut src_store = store; - - let pool = self - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - if self.function == Nearest { - let mut allocated_store: Vec = - vec![f16::from_f32(0.); new_size.width * 4 * new_size.height]; - resize_nearest::( - &src_store.buffer.borrow(), - src_store.width, - src_store.height, - &mut allocated_store, - new_size.width, - new_size.height, - &pool, - ); - let new_image = - ImageStore::::new(allocated_store, new_size.width, new_size.height)?; - - return Ok(new_image); - } - - let should_do_horizontal = src_store.width != new_size.width; - let should_do_vertical = src_store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); - - if premultiply_alpha { - let mut new_store = ImageStore::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - } - - if should_do_vertical { - let allocated_store_vertical: Vec = - vec![f16::from_f32(0.); src_store.width * 4 * new_size.height]; - let mut new_image_vertical = ImageStore::::new( - allocated_store_vertical, - src_store.width, - new_size.height, - )?; - let vertical_filters = - self.generate_weights(src_store.height, new_image_vertical.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; - } - - assert_eq!(src_store.height, new_size.height); - - if should_do_horizontal { - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - let allocated_store_horizontal: Vec = - vec![f16::from_f32(0.); new_size.width * 4 * new_size.height]; - let mut new_image_horizontal = ImageStore::::new( - allocated_store_horizontal, - new_size.width, - new_size.height, - )?; - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; - } - - assert_eq!(src_store.width, new_size.width); - - if premultiply_alpha { - src_store.unpremultiply_alpha(&pool); - } - - Ok(src_store) + ) -> Result<(), PicScaleError> { + self.generic_resize_with_alpha(store, into, premultiply_alpha) } /// Resize f16 RGB image pub fn resize_rgb_f16<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, f16, 3>, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height 
== 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } - - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } - - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); - } - - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); - } - - let pool = self - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - if self.function == Nearest { - let mut allocated_store: Vec = - vec![f16::from_f32(0.); new_size.width * 3 * new_size.height]; - resize_nearest::( - &store.buffer.borrow(), - store.width, - store.height, - &mut allocated_store, - new_size.width, - new_size.height, - &pool, - ); - let new_image = - ImageStore::::new(allocated_store, new_size.width, new_size.height)?; - return Ok(new_image); - } - - let mut src_store = store; - - let should_do_horizontal = src_store.width != new_size.width; - let should_do_vertical = src_store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); - - if should_do_vertical { - let allocated_store_vertical: Vec = - vec![f16::from_f32(0.); src_store.width * 3 * new_size.height]; - let mut new_image_vertical = ImageStore::::new( - allocated_store_vertical, - src_store.width, - new_size.height, - )?; - let vertical_filters = - self.generate_weights(src_store.height, new_image_vertical.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; - } - - if should_do_horizontal { - let allocated_store_horizontal: Vec = - vec![f16::from_f32(0.); new_size.width * 3 * new_size.height]; - let mut new_image_horizontal = ImageStore::::new( - allocated_store_horizontal, - new_size.width, - new_size.height, - )?; - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; - } - - Ok(src_store) + &'a self, + store: &ImageStore<'a, f16, 3>, + into: &mut ImageStoreMut<'a, f16, 3>, + ) -> Result<(), PicScaleError> { + self.generic_resize(store, into) } /// Resize f16 plane pub fn resize_plane_f16<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, f16, 1>, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } - - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } - - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); - } - - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); - } - - let pool = self - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - if self.function == Nearest { - let mut allocated_store: Vec = - vec![f16::from_f32(0.); new_size.width * new_size.height]; - resize_nearest::( - &store.buffer.borrow(), - store.width, - store.height, - &mut allocated_store, - new_size.width, - new_size.height, - &pool, - ); - let new_image = - ImageStore::::new(allocated_store, new_size.width, new_size.height)?; - return Ok(new_image); - } - - let mut src_store = store; - - let 
should_do_horizontal = src_store.width != new_size.width; - let should_do_vertical = src_store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); - - if should_do_vertical { - let allocated_store_vertical: Vec = - vec![f16::from_f32(0.); src_store.width * new_size.height]; - let mut new_image_vertical = ImageStore::::new( - allocated_store_vertical, - src_store.width, - new_size.height, - )?; - let vertical_filters = - self.generate_weights(src_store.height, new_image_vertical.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; - } - - if should_do_horizontal { - let allocated_store_horizontal: Vec = - vec![f16::from_f32(0.); new_size.width * new_size.height]; - let mut new_image_horizontal = ImageStore::::new( - allocated_store_horizontal, - new_size.width, - new_size.height, - )?; - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; - } - - Ok(src_store) + &'a self, + store: &ImageStore<'a, f16, 1>, + into: &mut ImageStoreMut<'a, f16, 1>, + ) -> Result<(), PicScaleError> { + self.generic_resize(store, into) } } From 1d38af30828531ca8b26fc744e4318e44174641d Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Sat, 28 Dec 2024 11:40:20 +0000 Subject: [PATCH 5/9] Updates for x86 --- app/benches/resize_rgb/main.rs | 16 ++--- app/benches/resize_rgba/main.rs | 32 ++++----- fuzz/resize_plane/resize_plane.rs | 7 +- fuzz/resize_plane_f32/resize_plane_f32.rs | 12 ++-- fuzz/resize_plane_u16/resize_plane_u16.rs | 13 ++-- fuzz/resize_rgb/resize_rgb.rs | 7 +- fuzz/resize_rgb_f32/resize_rgb_f32.rs | 12 ++-- fuzz/resize_rgb_u16/resize_rgb_u16.rs | 13 ++-- fuzz/resize_rgba/resize_rgba.rs | 11 ++- fuzz/resize_rgba_f32/resize_rgba_f32.rs | 12 ++-- fuzz/resize_rgba_u16/resize_rgba_u16.rs | 24 +++---- src/avx2/vertical_u8_lp.rs | 50 ++++++------- src/sse/rgba_u8_lb.rs | 20 +++--- src/sse/vertical_u8_lp.rs | 86 ++++++++++++----------- 14 files changed, 147 insertions(+), 168 deletions(-) diff --git a/app/benches/resize_rgb/main.rs b/app/benches/resize_rgb/main.rs index 329a933..aeb1de2 100644 --- a/app/benches/resize_rgb/main.rs +++ b/app/benches/resize_rgb/main.rs @@ -4,7 +4,7 @@ use fast_image_resize::FilterType::Lanczos3; use fast_image_resize::{CpuExtensions, PixelType, ResizeAlg, ResizeOptions, Resizer}; use image::{EncodableLayout, GenericImageView, ImageReader}; use pic_scale::{ - ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ScalingF32, ThreadingPolicy, + ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ScalingF32, ThreadingPolicy, }; pub fn criterion_benchmark(c: &mut Criterion) { @@ -27,10 +27,9 @@ pub fn criterion_benchmark(c: &mut Criterion) { dimensions.1 as usize, ) .unwrap(); - _ = scaler.resize_rgb( - ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4), - store, - ); + let mut target = + ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + scaler.resize_rgb(&store, &mut target).unwrap(); }) }); @@ -47,10 +46,9 @@ pub fn criterion_benchmark(c: &mut Criterion) { dimensions.1 as usize, ) .unwrap(); - _ = scaler.resize_rgb_f32( - ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4), - store, - ); + let mut target = + ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + scaler.resize_rgb_f32(&store, &mut target).unwrap(); }) }); diff 
--git a/app/benches/resize_rgba/main.rs b/app/benches/resize_rgba/main.rs index 5635f51..3ac926c 100644 --- a/app/benches/resize_rgba/main.rs +++ b/app/benches/resize_rgba/main.rs @@ -4,7 +4,7 @@ use fast_image_resize::FilterType::Lanczos3; use fast_image_resize::{CpuExtensions, PixelType, ResizeAlg, ResizeOptions, Resizer}; use image::{GenericImageView, ImageReader}; use pic_scale::{ - ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ScalingF32, ThreadingPolicy, + ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ScalingF32, ThreadingPolicy, }; pub fn criterion_benchmark(c: &mut Criterion) { @@ -25,11 +25,9 @@ pub fn criterion_benchmark(c: &mut Criterion) { dimensions.1 as usize, ) .unwrap(); - _ = scaler.resize_rgba( - ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), - store, - true, - ); + let mut target = + ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + _ = scaler.resize_rgba(&store, &mut target, true); }) }); @@ -46,17 +44,15 @@ pub fn criterion_benchmark(c: &mut Criterion) { dimensions.1 as usize, ) .unwrap(); - _ = scaler.resize_rgba_f32( - ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), - store, - false, - ); + let mut target = + ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + _ = scaler.resize_rgba_f32(&store, &mut target, false); }) }); c.bench_function("Fast image resize RGBA with alpha: Lanczos 3", |b| { + let mut vc = Vec::from(img.as_bytes()); b.iter(|| { - let mut vc = Vec::from(img.as_bytes()); let pixel_type: PixelType = PixelType::U8x4; let src_image = Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap(); @@ -84,27 +80,25 @@ pub fn criterion_benchmark(c: &mut Criterion) { }); c.bench_function("Pic scale RGBA without alpha: Lanczos 3", |b| { + let mut copied: Vec = Vec::from(src_bytes); b.iter(|| { let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); scaler.set_threading_policy(ThreadingPolicy::Single); - let mut copied: Vec = Vec::from(src_bytes); let store = ImageStore::::from_slice( &mut copied, dimensions.0 as usize, dimensions.1 as usize, ) .unwrap(); - _ = scaler.resize_rgba( - ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), - store, - false, - ); + let mut target = + ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + _ = scaler.resize_rgba(&store, &mut target, false); }) }); c.bench_function("Fast image resize RGBA without alpha: Lanczos 3", |b| { + let mut vc = Vec::from(img.as_bytes()); b.iter(|| { - let mut vc = Vec::from(img.as_bytes()); let pixel_type: PixelType = PixelType::U8x4; let src_image = Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap(); diff --git a/fuzz/resize_plane/resize_plane.rs b/fuzz/resize_plane/resize_plane.rs index 829cca4..7238732 100644 --- a/fuzz/resize_plane/resize_plane.rs +++ b/fuzz/resize_plane/resize_plane.rs @@ -30,7 +30,7 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler}; +use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler}; fuzz_target!(|data: (u16, u16, u16, u16)| { resize_plane( @@ -64,8 +64,7 @@ fn resize_plane( let mut src_data = vec![15u8; src_width * src_height]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut target = ImageStoreMut::alloc(dst_width, dst_height); let scaler = Scaler::new(sampler); - _ = scaler - .resize_plane(ImageSize::new(dst_width, 
dst_height), store) - .unwrap(); + scaler.resize_plane(&store, &mut target).unwrap(); } diff --git a/fuzz/resize_plane_f32/resize_plane_f32.rs b/fuzz/resize_plane_f32/resize_plane_f32.rs index bb128e0..74f2d31 100644 --- a/fuzz/resize_plane_f32/resize_plane_f32.rs +++ b/fuzz/resize_plane_f32/resize_plane_f32.rs @@ -30,7 +30,7 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler}; +use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler}; fuzz_target!(|data: (u16, u16, u16, u16)| { resize_plane( @@ -64,12 +64,8 @@ fn resize_plane( let mut src_data = vec![0f32; src_width * src_height]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut target = ImageStoreMut::alloc(dst_width, dst_height); + let scaler = Scaler::new(sampler); - _ = scaler - .resize_plane_f32(ImageSize::new(dst_width, dst_height), store) - .unwrap(); - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); - _ = scaler - .resize_plane_f32(ImageSize::new(dst_width, dst_height), store) - .unwrap(); + scaler.resize_plane_f32(&store, &mut target).unwrap(); } diff --git a/fuzz/resize_plane_u16/resize_plane_u16.rs b/fuzz/resize_plane_u16/resize_plane_u16.rs index 8a59c96..0b7f44f 100644 --- a/fuzz/resize_plane_u16/resize_plane_u16.rs +++ b/fuzz/resize_plane_u16/resize_plane_u16.rs @@ -30,7 +30,7 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingU16}; +use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, ScalingU16}; fuzz_target!(|data: (u16, u16, u16, u16)| { resize_rgb( @@ -64,13 +64,12 @@ fn resize_rgb( let mut src_data = vec![1u16; src_width * src_height]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 10); + let scaler = Scaler::new(sampler); - _ = scaler - .resize_plane_u16(ImageSize::new(dst_width, dst_height), store, 10) - .unwrap(); + scaler.resize_plane_u16(&store, &mut target).unwrap(); let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); - _ = scaler - .resize_plane_u16(ImageSize::new(dst_width, dst_height), store, 16) - .unwrap(); + let mut target16 = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 16); + scaler.resize_plane_u16(&store, &mut target16).unwrap(); } diff --git a/fuzz/resize_rgb/resize_rgb.rs b/fuzz/resize_rgb/resize_rgb.rs index ecc74d3..40ad6ae 100644 --- a/fuzz/resize_rgb/resize_rgb.rs +++ b/fuzz/resize_rgb/resize_rgb.rs @@ -30,7 +30,7 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling}; +use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling}; fuzz_target!(|data: (u16, u16, u16, u16)| { resize_rgb( @@ -64,8 +64,7 @@ fn resize_rgb( let mut src_data = vec![0u8; src_width * src_height * 3]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut target = ImageStoreMut::alloc(dst_width, dst_height); let scaler = Scaler::new(sampler); - _ = scaler - .resize_rgb(ImageSize::new(dst_width, dst_height), store) - .unwrap(); + scaler.resize_rgb(&store, &mut target).unwrap(); } diff --git a/fuzz/resize_rgb_f32/resize_rgb_f32.rs b/fuzz/resize_rgb_f32/resize_rgb_f32.rs index f2d4773..68228c9 100644 --- a/fuzz/resize_rgb_f32/resize_rgb_f32.rs +++ b/fuzz/resize_rgb_f32/resize_rgb_f32.rs @@ -30,7 +30,7 @@ 
#![no_main] use libfuzzer_sys::fuzz_target; -use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingF32}; +use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, ScalingF32}; fuzz_target!(|data: (u16, u16, u16, u16)| { resize_rgb( @@ -64,12 +64,8 @@ fn resize_rgb( let mut src_data = vec![0f32; src_width * src_height * 3]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut target = ImageStoreMut::alloc(dst_width, dst_height); + let scaler = Scaler::new(sampler); - _ = scaler - .resize_rgb_f32(ImageSize::new(dst_width, dst_height), store) - .unwrap(); - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); - _ = scaler - .resize_rgb_f32(ImageSize::new(dst_width, dst_height), store) - .unwrap(); + scaler.resize_rgb_f32(&store, &mut target).unwrap(); } diff --git a/fuzz/resize_rgb_u16/resize_rgb_u16.rs b/fuzz/resize_rgb_u16/resize_rgb_u16.rs index 47e48fd..7018c61 100644 --- a/fuzz/resize_rgb_u16/resize_rgb_u16.rs +++ b/fuzz/resize_rgb_u16/resize_rgb_u16.rs @@ -30,7 +30,7 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingU16}; +use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, ScalingU16}; fuzz_target!(|data: (u16, u16, u16, u16)| { resize_rgb( @@ -64,13 +64,12 @@ fn resize_rgb( let mut src_data = vec![1u16; src_width * src_height * 3]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 10); + let scaler = Scaler::new(sampler); - _ = scaler - .resize_rgb_u16(ImageSize::new(dst_width, dst_height), store, 10) - .unwrap(); + scaler.resize_rgb_u16(&store, &mut target).unwrap(); let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); - _ = scaler - .resize_rgb_u16(ImageSize::new(dst_width, dst_height), store, 16) - .unwrap(); + let mut target16 = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 16); + scaler.resize_rgb_u16(&store, &mut target16).unwrap(); } diff --git a/fuzz/resize_rgba/resize_rgba.rs b/fuzz/resize_rgba/resize_rgba.rs index dab34f5..ecf7055 100644 --- a/fuzz/resize_rgba/resize_rgba.rs +++ b/fuzz/resize_rgba/resize_rgba.rs @@ -30,7 +30,7 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling}; +use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling}; fuzz_target!(|data: (u16, u16, u16, u16)| { resize_rgba( @@ -64,12 +64,9 @@ fn resize_rgba( let mut src_data = vec![0u8; src_width * src_height * 4]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut target = ImageStoreMut::alloc(dst_width, dst_height); let scaler = Scaler::new(sampler); - _ = scaler - .resize_rgba(ImageSize::new(dst_width, dst_height), store, false) - .unwrap(); + scaler.resize_rgba(&store, &mut target, false).unwrap(); let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); - _ = scaler - .resize_rgba(ImageSize::new(dst_width, dst_height), store, true) - .unwrap(); + scaler.resize_rgba(&store, &mut target, true).unwrap(); } diff --git a/fuzz/resize_rgba_f32/resize_rgba_f32.rs b/fuzz/resize_rgba_f32/resize_rgba_f32.rs index 8c08146..67a2e47 100644 --- a/fuzz/resize_rgba_f32/resize_rgba_f32.rs +++ b/fuzz/resize_rgba_f32/resize_rgba_f32.rs @@ -30,7 +30,7 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use 
pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingF32}; +use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, ScalingF32}; fuzz_target!(|data: (u16, u16, u16, u16)| { resize_rgba( @@ -64,12 +64,10 @@ fn resize_rgba( let mut src_data = vec![0f32; src_width * src_height * 4]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut target = ImageStoreMut::alloc(dst_width, dst_height); + let scaler = Scaler::new(sampler); - _ = scaler - .resize_rgba_f32(ImageSize::new(dst_width, dst_height), store, false) - .unwrap(); + scaler.resize_rgba_f32(&store, &mut target, false).unwrap(); let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); - _ = scaler - .resize_rgba_f32(ImageSize::new(dst_width, dst_height), store, true) - .unwrap(); + scaler.resize_rgba_f32(&store, &mut target, true).unwrap(); } diff --git a/fuzz/resize_rgba_u16/resize_rgba_u16.rs b/fuzz/resize_rgba_u16/resize_rgba_u16.rs index 494da64..57d8090 100644 --- a/fuzz/resize_rgba_u16/resize_rgba_u16.rs +++ b/fuzz/resize_rgba_u16/resize_rgba_u16.rs @@ -30,7 +30,9 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use pic_scale::{Ar30ByteOrder, ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingU16}; +use pic_scale::{ + Ar30ByteOrder, ImageSize, ImageStore, ImageStoreMut, ResamplingFunction, Scaler, ScalingU16, +}; fuzz_target!(|data: (u16, u16, u16, u16)| { resize_rgba( @@ -64,24 +66,20 @@ fn resize_rgba( let mut src_data = vec![1u16; src_width * src_height * 4]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 10); + let scaler = Scaler::new(sampler); - _ = scaler - .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 10, false) - .unwrap(); + scaler.resize_rgba_u16(&store, &mut target, false).unwrap(); let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); - _ = scaler - .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 10, true) - .unwrap(); + scaler.resize_rgba_u16(&store, &mut target, true).unwrap(); + + let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 16); let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); - _ = scaler - .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 16, false) - .unwrap(); + scaler.resize_rgba_u16(&store, &mut target, false).unwrap(); let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); - _ = scaler - .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 16, true) - .unwrap(); + scaler.resize_rgba_u16(&store, &mut target, true).unwrap(); let src_data_ar30 = vec![1u32; src_width * src_height]; let mut dst_data_ar30 = vec![1u32; dst_width * dst_height]; diff --git a/src/avx2/vertical_u8_lp.rs b/src/avx2/vertical_u8_lp.rs index 48ea85e..c63a4ca 100644 --- a/src/avx2/vertical_u8_lp.rs +++ b/src/avx2/vertical_u8_lp.rs @@ -59,11 +59,11 @@ unsafe fn m256dot( let store0 = _mm256_add_epi16( store0, - _mm256_mulhi_epi16(_mm256_slli_epi16::(lo), weight), + _mm256_mulhrs_epi16(_mm256_slli_epi16::(lo), weight), ); let store1 = _mm256_add_epi16( store1, - _mm256_mulhi_epi16(_mm256_slli_epi16::(hi), weight), + _mm256_mulhrs_epi16(_mm256_slli_epi16::(hi), weight), ); (store0, store1) } @@ -81,8 +81,8 @@ unsafe fn convolve_vertical_avx2_row_impl( let bounds_size = bounds.size; const SCALE: i32 = 6; - const R_SHR_SCALE: i32 = SCALE - 1; - const 
ROUNDING: i16 = 1 << (SCALE - 1); + const R_SHR_SCALE: i32 = SCALE; + const ROUNDING: i16 = 1 << (R_SHR_SCALE - 1); let mut cx = 0usize; @@ -342,7 +342,7 @@ unsafe fn convolve_vertical_avx2_row_impl( store0 = _mm256_add_epi16( store0, - _mm256_mulhi_epi16(_mm256_slli_epi16::(item_row), v_weight), + _mm256_mulhrs_epi16(_mm256_slli_epi16::(item_row), v_weight), ); } @@ -377,13 +377,13 @@ unsafe fn convolve_vertical_avx2_row_impl( let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); } else if bounds_size == 3 { let py = bounds.start; @@ -401,19 +401,19 @@ unsafe fn convolve_vertical_avx2_row_impl( let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); let item_row2 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr2.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row2), v_weight2), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), ); } else if bounds_size == 4 { let py = bounds.start; @@ -434,25 +434,25 @@ unsafe fn convolve_vertical_avx2_row_impl( let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); let item_row2 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr2.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row2), v_weight2), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), ); let item_row3 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr3.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row3), v_weight3), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row3), v_weight3), ); } else { for j in 0..bounds_size { @@ -464,7 +464,7 @@ unsafe fn convolve_vertical_avx2_row_impl( let item_row = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr.as_ptr()), zeros); let low = _mm_slli_epi16::(item_row); - store = _mm_add_epi16(store, _mm_mulhi_epi16(low, v_weight)); + store = _mm_add_epi16(store, _mm_mulhrs_epi16(low, v_weight)); } } @@ -497,13 +497,13 @@ unsafe fn convolve_vertical_avx2_row_impl( store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); } else if bounds_size == 3 { let py = bounds.start; @@ -522,19 
+522,19 @@ unsafe fn convolve_vertical_avx2_row_impl( store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); let item_row2 = _mm_set1_epi16(src_ptr2[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row2), v_weight2), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), ); } else if bounds_size == 4 { let py = bounds.start; @@ -556,25 +556,25 @@ unsafe fn convolve_vertical_avx2_row_impl( store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); let item_row2 = _mm_set1_epi16(src_ptr2[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row2), v_weight2), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), ); let item_row3 = _mm_set1_epi16(src_ptr3[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row3), v_weight3), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row3), v_weight3), ); } else { for j in 0..bounds_size { @@ -587,7 +587,7 @@ unsafe fn convolve_vertical_avx2_row_impl( store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row), v_weight), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row), v_weight), ); } } diff --git a/src/sse/rgba_u8_lb.rs b/src/sse/rgba_u8_lb.rs index 9f7ffe9..1eded53 100644 --- a/src/sse/rgba_u8_lb.rs +++ b/src/sse/rgba_u8_lb.rs @@ -47,7 +47,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_sse( let rgba_pixel = _mm_cvtsi32_si128(src_ptr_32.read_unaligned()); let lo = _mm_slli_epi16::(_mm_unpacklo_epi8(rgba_pixel, _mm_setzero_si128())); - _mm_add_epi16(store_0, _mm_mulhi_epi16(lo, weight0)) + _mm_add_epi16(store_0, _mm_mulhrs_epi16(lo, weight0)) } pub(crate) fn convolve_horizontal_rgba_sse_rows_4_lb( @@ -77,10 +77,10 @@ unsafe fn hdot4( let hi0 = _mm_slli_epi16::(_mm_unpackhi_epi8(v0, zeros)); let lo1 = _mm_slli_epi16::(_mm_unpacklo_epi8(v1, zeros)); let hi1 = _mm_slli_epi16::(_mm_unpackhi_epi8(v1, zeros)); - let mut p = _mm_mulhi_epi16(lo0, w01); - p = _mm_add_epi16(p, _mm_mulhi_epi16(hi0, w23)); - p = _mm_add_epi16(p, _mm_mulhi_epi16(lo1, w45)); - p = _mm_add_epi16(p, _mm_mulhi_epi16(hi1, w67)); + let mut p = _mm_mulhrs_epi16(lo0, w01); + p = _mm_add_epi16(p, _mm_mulhrs_epi16(hi0, w23)); + p = _mm_add_epi16(p, _mm_mulhrs_epi16(lo1, w45)); + p = _mm_add_epi16(p, _mm_mulhrs_epi16(hi1, w67)); let hi_part = _mm_unpackhi_epi64(p, p); p = _mm_add_epi16(hi_part, p); _mm_add_epi16(store, p) @@ -96,8 +96,8 @@ unsafe fn hdot2( let zeros = _mm_setzero_si128(); let lo = _mm_slli_epi16::(_mm_unpacklo_epi8(v, zeros)); let hi = _mm_slli_epi16::(_mm_unpackhi_epi8(v, zeros)); - let mut p = _mm_mulhi_epi16(lo, w01); - p = _mm_add_epi16(p, _mm_mulhi_epi16(hi, w23)); + let mut p = _mm_mulhrs_epi16(lo, w01); + p = _mm_add_epi16(p, _mm_mulhrs_epi16(hi, w23)); let hi_part = _mm_unpackhi_epi64(p, p); p = _mm_add_epi16(hi_part, p); _mm_add_epi16(store, p) @@ -107,7 +107,7 @@ unsafe fn hdot2( unsafe fn hdot(store: __m128i, v: __m128i, w01: __m128i) -> __m128i { let zeros = 
_mm_setzero_si128(); let lo = _mm_slli_epi16::(_mm_unpacklo_epi8(v, zeros)); - let mut p = _mm_mulhi_epi16(lo, w01); + let mut p = _mm_mulhrs_epi16(lo, w01); let hi_part = _mm_unpackhi_epi64(p, p); p = _mm_add_epi16(hi_part, p); _mm_add_epi16(store, p) @@ -125,8 +125,8 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( const CHANNELS: usize = 4; const SCALE: i32 = 6; - const ROUNDING: i16 = 1 << (SCALE - 1); - const V_SHR: i32 = SCALE - 1; + const V_SHR: i32 = SCALE; + const ROUNDING: i16 = 1 << (V_SHR - 1); let vld = _mm_set1_epi16(ROUNDING); diff --git a/src/sse/vertical_u8_lp.rs b/src/sse/vertical_u8_lp.rs index 7b1a651..5deefdf 100644 --- a/src/sse/vertical_u8_lp.rs +++ b/src/sse/vertical_u8_lp.rs @@ -56,8 +56,14 @@ unsafe fn mdot( let lo = _mm_unpacklo_epi8(row, zeros); let hi = _mm_unpackhi_epi8(row, zeros); - let store0 = _mm_add_epi16(store0, _mm_mulhi_epi16(_mm_slli_epi16::(lo), weight)); - let store1 = _mm_add_epi16(store1, _mm_mulhi_epi16(_mm_slli_epi16::(hi), weight)); + let store0 = _mm_add_epi16( + store0, + _mm_mulhrs_epi16(_mm_slli_epi16::(lo), weight), + ); + let store1 = _mm_add_epi16( + store1, + _mm_mulhrs_epi16(_mm_slli_epi16::(hi), weight), + ); (store0, store1) } @@ -74,8 +80,8 @@ unsafe fn convolve_vertical_sse_row_impl( let bounds_size = bounds.size; const SCALE: i32 = 6; - const R_SHR_SCALE: i32 = SCALE - 1; - const ROUNDING: i16 = 1 << (SCALE - 1); + const R_SHR_SCALE: i32 = SCALE; + const ROUNDING: i16 = 1 << (R_SHR_SCALE - 1); let mut cx = 0usize; @@ -111,14 +117,14 @@ unsafe fn convolve_vertical_sse_row_impl( (store6, store7) = mdot::(store6, store7, item_row3, v_weight); } - let rebased0 = _mm_srli_epi16::(store0); - let rebased1 = _mm_srli_epi16::(store1); - let rebased2 = _mm_srli_epi16::(store2); - let rebased3 = _mm_srli_epi16::(store3); - let rebased4 = _mm_srli_epi16::(store4); - let rebased5 = _mm_srli_epi16::(store5); - let rebased6 = _mm_srli_epi16::(store6); - let rebased7 = _mm_srli_epi16::(store7); + let rebased0 = _mm_srai_epi16::(store0); + let rebased1 = _mm_srai_epi16::(store1); + let rebased2 = _mm_srai_epi16::(store2); + let rebased3 = _mm_srai_epi16::(store3); + let rebased4 = _mm_srai_epi16::(store4); + let rebased5 = _mm_srai_epi16::(store5); + let rebased6 = _mm_srai_epi16::(store6); + let rebased7 = _mm_srai_epi16::(store7); let shrank0 = _mm_packus_epi16(rebased0, rebased1); let shrank1 = _mm_packus_epi16(rebased2, rebased3); let shrank2 = _mm_packus_epi16(rebased4, rebased5); @@ -164,10 +170,10 @@ unsafe fn convolve_vertical_sse_row_impl( (store2, store3) = mdot::(store2, store3, item_row1, v_weight); } - let rebased0 = _mm_srli_epi16::(store0); - let rebased1 = _mm_srli_epi16::(store1); - let rebased2 = _mm_srli_epi16::(store2); - let rebased3 = _mm_srli_epi16::(store3); + let rebased0 = _mm_srai_epi16::(store0); + let rebased1 = _mm_srai_epi16::(store1); + let rebased2 = _mm_srai_epi16::(store2); + let rebased3 = _mm_srai_epi16::(store3); let shrank0 = _mm_packus_epi16(rebased0, rebased1); let shrank1 = _mm_packus_epi16(rebased2, rebased3); _mm_storeu_si128(dst.as_mut_ptr() as *mut __m128i, shrank0); @@ -258,8 +264,8 @@ unsafe fn convolve_vertical_sse_row_impl( } } - let rebased0 = _mm_srli_epi16::(store0); - let rebased1 = _mm_srli_epi16::(store1); + let rebased0 = _mm_srai_epi16::(store0); + let rebased1 = _mm_srai_epi16::(store1); let shrank = _mm_packus_epi16(rebased0, rebased1); _mm_storeu_si128(dst.as_mut_ptr() as *mut __m128i, shrank); @@ -287,13 +293,13 @@ unsafe fn convolve_vertical_sse_row_impl( let item_row0 = 
_mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); } else if bounds_size == 3 { let py = bounds.start; @@ -311,19 +317,19 @@ unsafe fn convolve_vertical_sse_row_impl( let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); let item_row2 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr2.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row2), v_weight2), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), ); } else if bounds_size == 4 { let py = bounds.start; @@ -344,25 +350,25 @@ unsafe fn convolve_vertical_sse_row_impl( let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); let item_row2 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr2.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row2), v_weight2), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), ); let item_row3 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr3.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row3), v_weight3), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row3), v_weight3), ); } else { for j in 0..bounds_size { @@ -374,11 +380,11 @@ unsafe fn convolve_vertical_sse_row_impl( let item_row = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr.as_ptr()), zeros); let low = _mm_slli_epi16::(item_row); - store = _mm_add_epi16(store, _mm_mulhi_epi16(low, v_weight)); + store = _mm_add_epi16(store, _mm_mulhrs_epi16(low, v_weight)); } } - let rebased = _mm_srli_epi16::(store); + let rebased = _mm_srai_epi16::(store); let shrank = _mm_packus_epi16(rebased, rebased); _mm_storeu_si64(dst.as_mut_ptr(), shrank); @@ -407,13 +413,13 @@ unsafe fn convolve_vertical_sse_row_impl( store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); } else if bounds_size == 3 { let py = bounds.start; @@ -432,19 +438,19 @@ unsafe fn convolve_vertical_sse_row_impl( store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); store = 
_mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); let item_row2 = _mm_set1_epi16(src_ptr2[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row2), v_weight2), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), ); } else if bounds_size == 4 { let py = bounds.start; @@ -466,25 +472,25 @@ unsafe fn convolve_vertical_sse_row_impl( store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); let item_row2 = _mm_set1_epi16(src_ptr2[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row2), v_weight2), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), ); let item_row3 = _mm_set1_epi16(src_ptr3[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row3), v_weight3), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row3), v_weight3), ); } else { for j in 0..bounds_size { @@ -497,12 +503,12 @@ unsafe fn convolve_vertical_sse_row_impl( store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row), v_weight), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row), v_weight), ); } } - let rebased = _mm_srli_epi16::(store); + let rebased = _mm_srai_epi16::(store); let value = _mm_extract_epi8::<0>(_mm_packus_epi16(rebased, rebased)); *dst = value as u8; From b169d4535304d00c823a1620fe0296ae69e9ae3e Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Sat, 28 Dec 2024 11:46:59 +0000 Subject: [PATCH 6/9] Updates for x86 --- fuzz/resize_plane_u16/resize_plane_u16.rs | 6 ++---- fuzz/resize_rgb_u16/resize_rgb_u16.rs | 6 ++---- fuzz/resize_rgba_f32/resize_rgba_f32.rs | 6 ++---- fuzz/resize_rgba_u16/resize_rgba_u16.rs | 10 ++++------ 4 files changed, 10 insertions(+), 18 deletions(-) diff --git a/fuzz/resize_plane_u16/resize_plane_u16.rs b/fuzz/resize_plane_u16/resize_plane_u16.rs index 0b7f44f..c6e983d 100644 --- a/fuzz/resize_plane_u16/resize_plane_u16.rs +++ b/fuzz/resize_plane_u16/resize_plane_u16.rs @@ -61,15 +61,13 @@ fn resize_rgb( return; } - let mut src_data = vec![1u16; src_width * src_height]; - - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 10); let scaler = Scaler::new(sampler); scaler.resize_plane_u16(&store, &mut target).unwrap(); - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); let mut target16 = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 16); scaler.resize_plane_u16(&store, &mut target16).unwrap(); } diff --git a/fuzz/resize_rgb_u16/resize_rgb_u16.rs b/fuzz/resize_rgb_u16/resize_rgb_u16.rs index 7018c61..2cbc018 100644 --- a/fuzz/resize_rgb_u16/resize_rgb_u16.rs +++ b/fuzz/resize_rgb_u16/resize_rgb_u16.rs @@ -61,15 +61,13 @@ fn resize_rgb( return; } - let mut src_data = vec![1u16; src_width * src_height * 3]; - - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 10); let 
scaler = Scaler::new(sampler); scaler.resize_rgb_u16(&store, &mut target).unwrap(); - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); let mut target16 = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 16); scaler.resize_rgb_u16(&store, &mut target16).unwrap(); } diff --git a/fuzz/resize_rgba_f32/resize_rgba_f32.rs b/fuzz/resize_rgba_f32/resize_rgba_f32.rs index 67a2e47..628b885 100644 --- a/fuzz/resize_rgba_f32/resize_rgba_f32.rs +++ b/fuzz/resize_rgba_f32/resize_rgba_f32.rs @@ -61,13 +61,11 @@ fn resize_rgba( return; } - let mut src_data = vec![0f32; src_width * src_height * 4]; - - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); let mut target = ImageStoreMut::alloc(dst_width, dst_height); let scaler = Scaler::new(sampler); scaler.resize_rgba_f32(&store, &mut target, false).unwrap(); - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); scaler.resize_rgba_f32(&store, &mut target, true).unwrap(); } diff --git a/fuzz/resize_rgba_u16/resize_rgba_u16.rs b/fuzz/resize_rgba_u16/resize_rgba_u16.rs index 57d8090..571290d 100644 --- a/fuzz/resize_rgba_u16/resize_rgba_u16.rs +++ b/fuzz/resize_rgba_u16/resize_rgba_u16.rs @@ -63,22 +63,20 @@ fn resize_rgba( return; } - let mut src_data = vec![1u16; src_width * src_height * 4]; - - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 10); let scaler = Scaler::new(sampler); scaler.resize_rgba_u16(&store, &mut target, false).unwrap(); - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); scaler.resize_rgba_u16(&store, &mut target, true).unwrap(); let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 16); - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); scaler.resize_rgba_u16(&store, &mut target, false).unwrap(); - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); scaler.resize_rgba_u16(&store, &mut target, true).unwrap(); let src_data_ar30 = vec![1u32; src_width * src_height]; From 1b7444a525ee7e03d772f165e17786c2dd754a6b Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Sat, 28 Dec 2024 12:21:18 +0000 Subject: [PATCH 7/9] Fuzzing fixes --- fuzz/resize_rgba/resize_rgba.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fuzz/resize_rgba/resize_rgba.rs b/fuzz/resize_rgba/resize_rgba.rs index ecf7055..2654ba7 100644 --- a/fuzz/resize_rgba/resize_rgba.rs +++ b/fuzz/resize_rgba/resize_rgba.rs @@ -61,12 +61,10 @@ fn resize_rgba( return; } - let mut src_data = vec![0u8; src_width * src_height * 4]; - - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); let mut target = ImageStoreMut::alloc(dst_width, dst_height); let scaler = Scaler::new(sampler); scaler.resize_rgba(&store, &mut target, false).unwrap(); - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = 
ImageStore::::alloc(src_width, src_height); scaler.resize_rgba(&store, &mut target, true).unwrap(); } From 60200971613829276bdb47119a97cfc0854a31d8 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Sat, 28 Dec 2024 13:45:15 +0000 Subject: [PATCH 8/9] Improvements on x86 u8 --- Cargo.lock | 4 +- Cargo.toml | 2 +- app/src/main.rs | 5 +- src/avx2/vertical_u8_lp.rs | 245 ++++---------------------------- src/sse/rgba_u8_lb.rs | 97 +++++-------- src/sse/vertical_u8_lp.rs | 278 ++++++------------------------------- 6 files changed, 115 insertions(+), 516 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f1d0691..441a439 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -259,9 +259,9 @@ checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" [[package]] name = "colorutils-rs" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31ca2cc8ed986672b15bfd3e416014e40cada05196bdfaa51168985f3c2e81f1" +checksum = "c06bb7c7479a238be740a3312b5693d76e234eb49b73b3e61ae768132c79d06a" dependencies = [ "erydanos", "half", diff --git a/Cargo.toml b/Cargo.toml index 8dd615e..12abb08 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ exclude = ["*.jpg", "/assets", "*.png", "*.sh", "/assets/*"] rust-version = "1.82.0" [dependencies] -colorutils-rs = {version = "0.7.0", optional = true} +colorutils-rs = {version = "0.7.4", optional = true} half = { version = "2.4.1", optional = true, features = ["alloc", "std", "num-traits"] } num-traits = { version = "0.2.19", features = ["std"] } rayon = "1.10.0" diff --git a/app/src/main.rs b/app/src/main.rs index 8166b23..91e5cc3 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -11,8 +11,9 @@ use fast_image_resize::{ }; use image::{EncodableLayout, GenericImageView, ImageReader}; use pic_scale::{ - Ar30ByteOrder, ImageSize, ImageStore, ImageStoreMut, LinearScaler, ResamplingFunction, Scaler, - Scaling, ScalingU16, ThreadingPolicy, + Ar30ByteOrder, ImageSize, ImageStore, ImageStoreMut, JzazbzScaler, LChScaler, LabScaler, + LinearApproxScaler, LinearScaler, LuvScaler, OklabScaler, ResamplingFunction, Scaler, Scaling, + ScalingU16, SigmoidalScaler, ThreadingPolicy, TransferFunction, XYZScaler, }; fn resize_plane( diff --git a/src/avx2/vertical_u8_lp.rs b/src/avx2/vertical_u8_lp.rs index c63a4ca..1ccf827 100644 --- a/src/avx2/vertical_u8_lp.rs +++ b/src/avx2/vertical_u8_lp.rs @@ -53,17 +53,16 @@ unsafe fn m256dot( row: __m256i, weight: __m256i, ) -> (__m256i, __m256i) { - let zeros = _mm256_setzero_si256(); - let lo = _mm256_unpacklo_epi8(row, zeros); - let hi = _mm256_unpackhi_epi8(row, zeros); + let lo = _mm256_unpacklo_epi8(row, row); + let hi = _mm256_unpackhi_epi8(row, row); let store0 = _mm256_add_epi16( store0, - _mm256_mulhrs_epi16(_mm256_slli_epi16::(lo), weight), + _mm256_mulhrs_epi16(_mm256_srli_epi16::<2>(lo), weight), ); let store1 = _mm256_add_epi16( store1, - _mm256_mulhrs_epi16(_mm256_slli_epi16::(hi), weight), + _mm256_mulhrs_epi16(_mm256_srli_epi16::<2>(hi), weight), ); (store0, store1) } @@ -77,8 +76,6 @@ unsafe fn convolve_vertical_avx2_row_impl( src_stride: usize, weight: &[i16], ) { - let zeros = _mm_setzero_si128(); - let bounds_size = bounds.size; const SCALE: i32 = 6; const R_SHR_SCALE: i32 = SCALE; @@ -337,12 +334,13 @@ unsafe fn convolve_vertical_avx2_row_impl( let v_weight = _mm256_set1_epi16(weight[0]); let v_offset = src_stride * py + px; let src_ptr = src.get_unchecked(v_offset..); - let item_row = - 
_mm256_cvtepu8_epi16(_mm_loadu_si128(src_ptr.as_ptr() as *const __m128i)); - + let mut item_row = _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256( + _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i), + )); + item_row = _mm256_unpacklo_epi8(item_row, item_row); store0 = _mm256_add_epi16( store0, - _mm256_mulhrs_epi16(_mm256_slli_epi16::(item_row), v_weight), + _mm256_mulhrs_epi16(_mm256_srli_epi16::<2>(item_row), v_weight), ); } @@ -364,108 +362,17 @@ unsafe fn convolve_vertical_avx2_row_impl( let px = cx; - if bounds_size == 2 { - let py = bounds.start; - let weights = weight.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - - let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - } else if bounds_size == 3 { - let py = bounds.start; - let weights = weight.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..); - - let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - - let item_row2 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr2.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), - ); - } else if bounds_size == 4 { - let py = bounds.start; - let weights = weight.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_weight3 = _mm_set1_epi16(weights[3]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..); - let v_offset3 = src_stride * (py + 3) + px; - let src_ptr3 = src.get_unchecked(v_offset3..); - - let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - - let item_row2 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr2.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), - ); - - let item_row3 = 
_mm_unpacklo_epi8(_mm_loadu_si64(src_ptr3.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row3), v_weight3), - ); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..(j + 1)); - let v_weight = _mm_set1_epi16(weight[0]); - let v_offset = src_stride * py + px; - let src_ptr = src.get_unchecked(v_offset..); - let item_row = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr.as_ptr()), zeros); + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..(j + 1)); + let v_weight = _mm_set1_epi16(weight[0]); + let v_offset = src_stride * py + px; + let src_ptr = src.get_unchecked(v_offset..); + let mut item_row = _mm_loadu_si64(src_ptr.as_ptr()); + item_row = _mm_unpacklo_epi8(item_row, item_row); - let low = _mm_slli_epi16::(item_row); - store = _mm_add_epi16(store, _mm_mulhrs_epi16(low, v_weight)); - } + let low = _mm_srli_epi16::<2>(item_row); + store = _mm_add_epi16(store, _mm_mulhrs_epi16(low, v_weight)); } let rebased = _mm_srai_epi16::(store); @@ -483,113 +390,21 @@ unsafe fn convolve_vertical_avx2_row_impl( let px = cx; - if bounds_size == 2 { - let py = bounds.start; - let weights = weight.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..(v_offset0 + 1)); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..(v_offset1 + 1)); - - let item_row0 = _mm_set1_epi16(src_ptr0[0] as i16); - - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - } else if bounds_size == 3 { - let py = bounds.start; - let weights = weight.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..(v_offset0 + 1)); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..(v_offset1 + 1)); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..(v_offset2 + 1)); - - let item_row0 = _mm_set1_epi16(src_ptr0[0] as i16); - - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - - let item_row2 = _mm_set1_epi16(src_ptr2[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), - ); - } else if bounds_size == 4 { - let py = bounds.start; - let weights = weight.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_weight3 = _mm_set1_epi16(weights[3]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..(v_offset0 + 1)); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..(v_offset1 + 1)); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..(v_offset2 + 1)); - let v_offset3 = src_stride * (py + 3) + 
px; - let src_ptr3 = src.get_unchecked(v_offset3..(v_offset3 + 1)); - - let item_row0 = _mm_set1_epi16(src_ptr0[0] as i16); - - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - - let item_row2 = _mm_set1_epi16(src_ptr2[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), - ); + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..(j + 1)); + let v_weight = _mm_set1_epi16(weight[0]); + let v_offset = src_stride * py + px; + let src_ptr = src.get_unchecked(v_offset..(v_offset + 1)); + let item_row = _mm_set1_epi8(src_ptr[0] as i8); - let item_row3 = _mm_set1_epi16(src_ptr3[0] as i16); store = _mm_add_epi16( store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row3), v_weight3), + _mm_mulhrs_epi16( + _mm_srli_epi16::<2>(_mm_unpacklo_epi8(item_row, item_row)), + v_weight, + ), ); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..(j + 1)); - let v_weight = _mm_set1_epi16(weight[0]); - let v_offset = src_stride * py + px; - let src_ptr = src.get_unchecked(v_offset..(v_offset + 1)); - let item_row = _mm_set1_epi16(src_ptr[0] as i16); - - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row), v_weight), - ); - } } let rebased = _mm_srai_epi16::(store); diff --git a/src/sse/rgba_u8_lb.rs b/src/sse/rgba_u8_lb.rs index 1eded53..b72751b 100644 --- a/src/sse/rgba_u8_lb.rs +++ b/src/sse/rgba_u8_lb.rs @@ -34,7 +34,7 @@ use std::arch::x86::*; use std::arch::x86_64::*; #[inline] -unsafe fn convolve_horizontal_parts_one_rgba_sse( +unsafe fn convolve_horizontal_parts_one_rgba_sse( start_x: usize, src: &[u8], weight0: __m128i, @@ -45,7 +45,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_sse( let src_ptr_32 = src_ptr.as_ptr() as *const i32; let rgba_pixel = _mm_cvtsi32_si128(src_ptr_32.read_unaligned()); - let lo = _mm_slli_epi16::(_mm_unpacklo_epi8(rgba_pixel, _mm_setzero_si128())); + let lo = _mm_srli_epi16::<2>(_mm_unpacklo_epi8(rgba_pixel, rgba_pixel)); _mm_add_epi16(store_0, _mm_mulhrs_epi16(lo, weight0)) } @@ -63,7 +63,7 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_lb( } #[inline(always)] -unsafe fn hdot4( +unsafe fn hdot4( store: __m128i, v0: __m128i, v1: __m128i, @@ -72,11 +72,10 @@ unsafe fn hdot4( w45: __m128i, w67: __m128i, ) -> __m128i { - let zeros = _mm_setzero_si128(); - let lo0 = _mm_slli_epi16::(_mm_unpacklo_epi8(v0, zeros)); - let hi0 = _mm_slli_epi16::(_mm_unpackhi_epi8(v0, zeros)); - let lo1 = _mm_slli_epi16::(_mm_unpacklo_epi8(v1, zeros)); - let hi1 = _mm_slli_epi16::(_mm_unpackhi_epi8(v1, zeros)); + let lo0 = _mm_srli_epi16::<2>(_mm_unpacklo_epi8(v0, v0)); + let hi0 = _mm_srli_epi16::<2>(_mm_unpackhi_epi8(v0, v0)); + let lo1 = _mm_srli_epi16::<2>(_mm_unpacklo_epi8(v1, v1)); + let hi1 = _mm_srli_epi16::<2>(_mm_unpackhi_epi8(v1, v1)); let mut p = _mm_mulhrs_epi16(lo0, w01); p = _mm_add_epi16(p, _mm_mulhrs_epi16(hi0, w23)); p = _mm_add_epi16(p, _mm_mulhrs_epi16(lo1, w45)); @@ -87,15 +86,9 @@ unsafe fn hdot4( } #[inline(always)] -unsafe fn hdot2( - store: __m128i, - v: __m128i, - w01: __m128i, - w23: __m128i, -) -> __m128i { - let zeros = _mm_setzero_si128(); - let lo = _mm_slli_epi16::(_mm_unpacklo_epi8(v, zeros)); - let hi = _mm_slli_epi16::(_mm_unpackhi_epi8(v, zeros)); +unsafe fn hdot2(store: __m128i, v: 
__m128i, w01: __m128i, w23: __m128i) -> __m128i { + let lo = _mm_srli_epi16::<2>(_mm_unpacklo_epi8(v, v)); + let hi = _mm_srli_epi16::<2>(_mm_unpackhi_epi8(v, v)); let mut p = _mm_mulhrs_epi16(lo, w01); p = _mm_add_epi16(p, _mm_mulhrs_epi16(hi, w23)); let hi_part = _mm_unpackhi_epi64(p, p); @@ -104,9 +97,8 @@ unsafe fn hdot2( } #[inline(always)] -unsafe fn hdot(store: __m128i, v: __m128i, w01: __m128i) -> __m128i { - let zeros = _mm_setzero_si128(); - let lo = _mm_slli_epi16::(_mm_unpacklo_epi8(v, zeros)); +unsafe fn hdot(store: __m128i, v: __m128i, w01: __m128i) -> __m128i { + let lo = _mm_srli_epi16::<2>(_mm_unpacklo_epi8(v, v)); let mut p = _mm_mulhrs_epi16(lo, w01); let hi_part = _mm_unpackhi_epi64(p, p); p = _mm_add_epi16(hi_part, p); @@ -222,7 +214,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( .as_ptr() as *const __m128i, ); - store_0 = hdot4::( + store_0 = hdot4( store_0, rgb_pixel_0, rgb_pixel_0_1, @@ -231,7 +223,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( weight45, weight67, ); - store_1 = hdot4::( + store_1 = hdot4( store_1, rgb_pixel_1, rgb_pixel_1_0, @@ -240,7 +232,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( weight45, weight67, ); - store_2 = hdot4::( + store_2 = hdot4( store_2, rgb_pixel_2, rgb_pixel_2_1, @@ -249,7 +241,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( weight45, weight67, ); - store_3 = hdot4::( + store_3 = hdot4( store_3, rgb_pixel_3, rgb_pixel_3_1, @@ -290,10 +282,10 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); - store_0 = hdot2::(store_0, rgb_pixel_0, weight01, weight23); - store_1 = hdot2::(store_1, rgb_pixel_1, weight01, weight23); - store_2 = hdot2::(store_2, rgb_pixel_2, weight01, weight23); - store_3 = hdot2::(store_3, rgb_pixel_3, weight01, weight23); + store_0 = hdot2(store_0, rgb_pixel_0, weight01, weight23); + store_1 = hdot2(store_1, rgb_pixel_1, weight01, weight23); + store_2 = hdot2(store_2, rgb_pixel_2, weight01, weight23); + store_3 = hdot2(store_3, rgb_pixel_3, weight01, weight23); jx += 4; } @@ -316,10 +308,10 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( let rgb_pixel_3 = _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - store_0 = hdot::(store_0, rgb_pixel_0, weight01); - store_1 = hdot::(store_1, rgb_pixel_1, weight01); - store_2 = hdot::(store_2, rgb_pixel_2, weight01); - store_3 = hdot::(store_3, rgb_pixel_3, weight01); + store_0 = hdot(store_0, rgb_pixel_0, weight01); + store_1 = hdot(store_1, rgb_pixel_1, weight01); + store_2 = hdot(store_2, rgb_pixel_2, weight01); + store_3 = hdot(store_3, rgb_pixel_3, weight01); jx += 2; } @@ -331,30 +323,14 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( let start_bounds = bounds.start + jx; - store_0 = convolve_horizontal_parts_one_rgba_sse::( - start_bounds, - src0, - weight0, - store_0, - ); - store_1 = convolve_horizontal_parts_one_rgba_sse::( - start_bounds, - src1, - weight0, - store_1, - ); - store_2 = convolve_horizontal_parts_one_rgba_sse::( - start_bounds, - src2, - weight0, - store_2, - ); - store_3 = convolve_horizontal_parts_one_rgba_sse::( - start_bounds, - src3, - weight0, - store_3, - ); + store_0 = + convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); + store_1 = + convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); + store_2 = + convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); + store_3 = + 
convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); jx += 1; } @@ -452,7 +428,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_one_impl( src.get_unchecked((start_bounds * CHANNELS + 16)..).as_ptr() as *const __m128i, ); - store = hdot4::( + store = hdot4( store, rgb_pixel_0, rgb_pixel_0_1, @@ -482,7 +458,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_one_impl( let rgb_pixel = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - store = hdot2::(store, rgb_pixel, weight01, weight23); + store = hdot2(store, rgb_pixel, weight01, weight23); jx += 4; } @@ -500,7 +476,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_one_impl( let rgb_pixel = _mm_loadu_si64(src_ptr.as_ptr()); - store = hdot::(store, rgb_pixel, weight01); + store = hdot(store, rgb_pixel, weight01); jx += 2; } @@ -511,8 +487,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_one_impl( let start_bounds = bounds.start + jx; - store = - convolve_horizontal_parts_one_rgba_sse::(start_bounds, src, weight0, store); + store = convolve_horizontal_parts_one_rgba_sse(start_bounds, src, weight0, store); jx += 1; } diff --git a/src/sse/vertical_u8_lp.rs b/src/sse/vertical_u8_lp.rs index 5deefdf..4236b58 100644 --- a/src/sse/vertical_u8_lp.rs +++ b/src/sse/vertical_u8_lp.rs @@ -46,24 +46,17 @@ pub(crate) fn convolve_vertical_sse_row_lp( } #[inline(always)] -unsafe fn mdot( +unsafe fn mdot( store0: __m128i, store1: __m128i, row: __m128i, weight: __m128i, ) -> (__m128i, __m128i) { - let zeros = _mm_setzero_si128(); - let lo = _mm_unpacklo_epi8(row, zeros); - let hi = _mm_unpackhi_epi8(row, zeros); - - let store0 = _mm_add_epi16( - store0, - _mm_mulhrs_epi16(_mm_slli_epi16::(lo), weight), - ); - let store1 = _mm_add_epi16( - store1, - _mm_mulhrs_epi16(_mm_slli_epi16::(hi), weight), - ); + let lo = _mm_unpacklo_epi8(row, row); + let hi = _mm_unpackhi_epi8(row, row); + + let store0 = _mm_add_epi16(store0, _mm_mulhrs_epi16(_mm_srli_epi16::<2>(lo), weight)); + let store1 = _mm_add_epi16(store1, _mm_mulhrs_epi16(_mm_srli_epi16::<2>(hi), weight)); (store0, store1) } @@ -76,8 +69,6 @@ unsafe fn convolve_vertical_sse_row_impl( src_stride: usize, weight: &[i16], ) { - let zeros = _mm_setzero_si128(); - let bounds_size = bounds.size; const SCALE: i32 = 6; const R_SHR_SCALE: i32 = SCALE; @@ -111,10 +102,10 @@ unsafe fn convolve_vertical_sse_row_impl( let item_row2 = _mm_loadu_si128(src_ptr.get_unchecked(32..).as_ptr() as *const __m128i); let item_row3 = _mm_loadu_si128(src_ptr.get_unchecked(48..).as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row0, v_weight); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight); - (store4, store5) = mdot::(store4, store5, item_row2, v_weight); - (store6, store7) = mdot::(store6, store7, item_row3, v_weight); + (store0, store1) = mdot(store0, store1, item_row0, v_weight); + (store2, store3) = mdot(store2, store3, item_row1, v_weight); + (store4, store5) = mdot(store4, store5, item_row2, v_weight); + (store6, store7) = mdot(store6, store7, item_row3, v_weight); } let rebased0 = _mm_srai_epi16::(store0); @@ -125,10 +116,12 @@ unsafe fn convolve_vertical_sse_row_impl( let rebased5 = _mm_srai_epi16::(store5); let rebased6 = _mm_srai_epi16::(store6); let rebased7 = _mm_srai_epi16::(store7); + let shrank0 = _mm_packus_epi16(rebased0, rebased1); let shrank1 = _mm_packus_epi16(rebased2, rebased3); let shrank2 = _mm_packus_epi16(rebased4, rebased5); let shrank3 = _mm_packus_epi16(rebased6, rebased7); + _mm_storeu_si128(dst.as_mut_ptr() as *mut __m128i, shrank0); 
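// What the new fixed-point path above computes, in scalar form. Two changes run
// through this patch: `_mm_mulhi_epi16` ((a*b) >> 16) is replaced by
// `_mm_mulhrs_epi16` ((a*b + (1 << 14)) >> 15, i.e. one bit less shift plus
// rounding), which is matched by the final normalization shift growing from
// SCALE - 1 to SCALE and becoming the arithmetic `_mm_srai_epi16`; and the
// widening step `_mm_unpacklo_epi8(row, row)` duplicates each byte (x * 257 per
// 16-bit lane) so that the following `>> 2` approximates the old zero-extend +
// `_mm_slli_epi16::<SCALE>` (x << 6) without needing a zero register. A sketch
// of one lane, mirroring the intrinsics (not the exact weight quantization):
fn mulhrs(a: i16, b: i16) -> i16 {
    (((a as i32) * (b as i32) + (1 << 14)) >> 15) as i16
}
fn vertical_lane_model(pixels: &[u8], weights: &[i16]) -> u8 {
    const SCALE: i32 = 6;
    let mut acc: i16 = 1 << (SCALE - 1); // ROUNDING
    for (&px, &w) in pixels.iter().zip(weights.iter()) {
        let widened = ((px as u16 * 257) >> 2) as i16; // unpacklo(row, row), then srli::<2>
        acc = acc.wrapping_add(mulhrs(widened, w)); // _mm_add_epi16 of _mm_mulhrs_epi16
    }
    (acc >> SCALE).clamp(0, 255) as u8 // _mm_srai_epi16::<SCALE>, then _mm_packus_epi16
}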
_mm_storeu_si128( dst.get_unchecked_mut(16..).as_mut_ptr() as *mut __m128i, @@ -166,8 +159,8 @@ unsafe fn convolve_vertical_sse_row_impl( let item_row0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); let item_row1 = _mm_loadu_si128(src_ptr.get_unchecked(16..).as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row0, v_weight); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight); + (store0, store1) = mdot(store0, store1, item_row0, v_weight); + (store2, store3) = mdot(store2, store3, item_row1, v_weight); } let rebased0 = _mm_srai_epi16::(store0); @@ -205,9 +198,9 @@ unsafe fn convolve_vertical_sse_row_impl( let src_ptr1 = src.get_unchecked(v_offset1..); let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row0, v_weight0); + (store0, store1) = mdot(store0, store1, item_row0, v_weight0); let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row1, v_weight1); + (store0, store1) = mdot(store0, store1, item_row1, v_weight1); } else if bounds_size == 3 { let py = bounds.start; let weights = weight.get_unchecked(0..3); @@ -222,11 +215,11 @@ unsafe fn convolve_vertical_sse_row_impl( let src_ptr2 = src.get_unchecked(v_offset2..); let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row0, v_weight0); + (store0, store1) = mdot(store0, store1, item_row0, v_weight0); let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row1, v_weight1); + (store0, store1) = mdot(store0, store1, item_row1, v_weight1); let item_row2 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row2, v_weight2); + (store0, store1) = mdot(store0, store1, item_row2, v_weight2); } else if bounds_size == 4 { let py = bounds.start; let weights = weight.get_unchecked(0..4); @@ -244,13 +237,13 @@ unsafe fn convolve_vertical_sse_row_impl( let src_ptr3 = src.get_unchecked(v_offset3..); let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row0, v_weight0); + (store0, store1) = mdot(store0, store1, item_row0, v_weight0); let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row1, v_weight1); + (store0, store1) = mdot(store0, store1, item_row1, v_weight1); let item_row2 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row2, v_weight2); + (store0, store1) = mdot(store0, store1, item_row2, v_weight2); let item_row3 = _mm_loadu_si128(src_ptr3.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row3, v_weight3); + (store0, store1) = mdot(store0, store1, item_row3, v_weight3); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -260,7 +253,7 @@ unsafe fn convolve_vertical_sse_row_impl( let src_ptr = src.get_unchecked(v_offset..); let item_row = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row, v_weight); + (store0, store1) = mdot(store0, store1, item_row, v_weight); } } @@ -280,108 +273,17 @@ unsafe fn convolve_vertical_sse_row_impl( let px = cx; - if bounds_size == 2 { - let py = bounds.start; - let weights = weight.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = 
_mm_set1_epi16(weights[1]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - - let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - } else if bounds_size == 3 { - let py = bounds.start; - let weights = weight.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..); - - let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - - let item_row2 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr2.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), - ); - } else if bounds_size == 4 { - let py = bounds.start; - let weights = weight.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_weight3 = _mm_set1_epi16(weights[3]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..); - let v_offset3 = src_stride * (py + 3) + px; - let src_ptr3 = src.get_unchecked(v_offset3..); - - let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - - let item_row2 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr2.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), - ); - - let item_row3 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr3.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row3), v_weight3), - ); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..(j + 1)); - let v_weight = _mm_set1_epi16(weight[0]); - let v_offset = src_stride * py + px; - let src_ptr = src.get_unchecked(v_offset..); - let item_row = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr.as_ptr()), zeros); + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..(j + 1)); + let v_weight = _mm_set1_epi16(weight[0]); + let v_offset = src_stride * py + px; + let src_ptr = 
src.get_unchecked(v_offset..); + let mut item_row = _mm_loadu_si64(src_ptr.as_ptr()); + item_row = _mm_unpacklo_epi8(item_row, item_row); - let low = _mm_slli_epi16::(item_row); - store = _mm_add_epi16(store, _mm_mulhrs_epi16(low, v_weight)); - } + let low = _mm_srli_epi16::<2>(item_row); + store = _mm_add_epi16(store, _mm_mulhrs_epi16(low, v_weight)); } let rebased = _mm_srai_epi16::(store); @@ -399,113 +301,19 @@ unsafe fn convolve_vertical_sse_row_impl( let px = cx; - if bounds_size == 2 { - let py = bounds.start; - let weights = weight.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..(v_offset0 + 1)); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..(v_offset1 + 1)); - - let item_row0 = _mm_set1_epi16(src_ptr0[0] as i16); - - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - } else if bounds_size == 3 { - let py = bounds.start; - let weights = weight.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..(v_offset0 + 1)); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..(v_offset1 + 1)); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..(v_offset2 + 1)); - - let item_row0 = _mm_set1_epi16(src_ptr0[0] as i16); - - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - - let item_row2 = _mm_set1_epi16(src_ptr2[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), - ); - } else if bounds_size == 4 { - let py = bounds.start; - let weights = weight.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_weight3 = _mm_set1_epi16(weights[3]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..(v_offset0 + 1)); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..(v_offset1 + 1)); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..(v_offset2 + 1)); - let v_offset3 = src_stride * (py + 3) + px; - let src_ptr3 = src.get_unchecked(v_offset3..(v_offset3 + 1)); - - let item_row0 = _mm_set1_epi16(src_ptr0[0] as i16); - - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - - let item_row2 = _mm_set1_epi16(src_ptr2[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), - ); + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..(j + 1)); + let v_weight = _mm_set1_epi16(weight[0]); + let 
v_offset = src_stride * py + px; + let src_ptr = src.get_unchecked(v_offset..(v_offset + 1)); + let mut item_row = _mm_set1_epi8(src_ptr[0] as i8); + item_row = _mm_unpacklo_epi8(item_row, item_row); - let item_row3 = _mm_set1_epi16(src_ptr3[0] as i16); store = _mm_add_epi16( store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row3), v_weight3), + _mm_mulhrs_epi16(_mm_srli_epi16::<2>(item_row), v_weight), ); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..(j + 1)); - let v_weight = _mm_set1_epi16(weight[0]); - let v_offset = src_stride * py + px; - let src_ptr = src.get_unchecked(v_offset..(v_offset + 1)); - let item_row = _mm_set1_epi16(src_ptr[0] as i16); - - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row), v_weight), - ); - } } let rebased = _mm_srai_epi16::(store); From 642f00dde8771682e03eccfdbd6c6f28a17b0835 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Sat, 28 Dec 2024 13:52:11 +0000 Subject: [PATCH 9/9] Fuzzing fixes --- src/avx2/vertical_u8_lp.rs | 62 +++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/src/avx2/vertical_u8_lp.rs b/src/avx2/vertical_u8_lp.rs index 1ccf827..d560c5f 100644 --- a/src/avx2/vertical_u8_lp.rs +++ b/src/avx2/vertical_u8_lp.rs @@ -47,7 +47,7 @@ pub(crate) fn convolve_vertical_avx_row_lp( } #[inline(always)] -unsafe fn m256dot( +unsafe fn m256dot( store0: __m256i, store1: __m256i, row: __m256i, @@ -108,15 +108,15 @@ unsafe fn convolve_vertical_avx2_row_impl( let item_row1 = _mm256_loadu_si256(src_ptr0.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row0, v_weight0); - (store2, store3) = m256dot::(store2, store3, item_row1, v_weight0); + (store0, store1) = m256dot(store0, store1, item_row0, v_weight0); + (store2, store3) = m256dot(store2, store3, item_row1, v_weight0); let item_row10 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); let item_row11 = _mm256_loadu_si256(src_ptr1.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row10, v_weight1); - (store2, store3) = m256dot::(store2, store3, item_row11, v_weight1); + (store0, store1) = m256dot(store0, store1, item_row10, v_weight1); + (store2, store3) = m256dot(store2, store3, item_row11, v_weight1); } else if bounds_size == 3 { let py = bounds.start; let weights = weight.get_unchecked(0..3); @@ -134,22 +134,22 @@ unsafe fn convolve_vertical_avx2_row_impl( let item_row1 = _mm256_loadu_si256(src_ptr0.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row0, v_weight0); - (store2, store3) = m256dot::(store2, store3, item_row1, v_weight0); + (store0, store1) = m256dot(store0, store1, item_row0, v_weight0); + (store2, store3) = m256dot(store2, store3, item_row1, v_weight0); let item_row10 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); let item_row11 = _mm256_loadu_si256(src_ptr1.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row10, v_weight1); - (store2, store3) = m256dot::(store2, store3, item_row11, v_weight1); + (store0, store1) = m256dot(store0, store1, item_row10, v_weight1); + (store2, store3) = m256dot(store2, store3, item_row11, v_weight1); let item_row20 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i); let item_row21 = _mm256_loadu_si256(src_ptr2.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = 
m256dot::(store0, store1, item_row20, v_weight2); - (store2, store3) = m256dot::(store2, store3, item_row21, v_weight2); + (store0, store1) = m256dot(store0, store1, item_row20, v_weight2); + (store2, store3) = m256dot(store2, store3, item_row21, v_weight2); } else if bounds_size == 4 { let py = bounds.start; let weights = weight.get_unchecked(0..4); @@ -170,29 +170,29 @@ unsafe fn convolve_vertical_avx2_row_impl( let item_row1 = _mm256_loadu_si256(src_ptr0.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row0, v_weight0); - (store2, store3) = m256dot::(store2, store3, item_row1, v_weight0); + (store0, store1) = m256dot(store0, store1, item_row0, v_weight0); + (store2, store3) = m256dot(store2, store3, item_row1, v_weight0); let item_row10 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); let item_row11 = _mm256_loadu_si256(src_ptr1.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row10, v_weight1); - (store2, store3) = m256dot::(store2, store3, item_row11, v_weight1); + (store0, store1) = m256dot(store0, store1, item_row10, v_weight1); + (store2, store3) = m256dot(store2, store3, item_row11, v_weight1); let item_row20 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i); let item_row21 = _mm256_loadu_si256(src_ptr2.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row20, v_weight2); - (store2, store3) = m256dot::(store2, store3, item_row21, v_weight2); + (store0, store1) = m256dot(store0, store1, item_row20, v_weight2); + (store2, store3) = m256dot(store2, store3, item_row21, v_weight2); let item_row30 = _mm256_loadu_si256(src_ptr3.as_ptr() as *const __m256i); let item_row31 = _mm256_loadu_si256(src_ptr3.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row30, v_weight3); - (store2, store3) = m256dot::(store2, store3, item_row31, v_weight3); + (store0, store1) = m256dot(store0, store1, item_row30, v_weight3); + (store2, store3) = m256dot(store2, store3, item_row31, v_weight3); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -204,8 +204,8 @@ unsafe fn convolve_vertical_avx2_row_impl( let item_row1 = _mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row0, v_weight); - (store2, store3) = m256dot::(store2, store3, item_row1, v_weight); + (store0, store1) = m256dot(store0, store1, item_row0, v_weight); + (store2, store3) = m256dot(store2, store3, item_row1, v_weight); } } @@ -246,10 +246,10 @@ unsafe fn convolve_vertical_avx2_row_impl( let src_ptr1 = src.get_unchecked(v_offset1..); let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row0, v_weight0); + (store0, store1) = m256dot(store0, store1, item_row0, v_weight0); let item_row1 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row1, v_weight1); + (store0, store1) = m256dot(store0, store1, item_row1, v_weight1); } else if bounds_size == 3 { let py = bounds.start; let weights = weight.get_unchecked(0..3); @@ -264,13 +264,13 @@ unsafe fn convolve_vertical_avx2_row_impl( let src_ptr2 = src.get_unchecked(v_offset2..); let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row0, v_weight0); + (store0, store1) = m256dot(store0, store1, 
item_row0, v_weight0); let item_row1 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row1, v_weight1); + (store0, store1) = m256dot(store0, store1, item_row1, v_weight1); let item_row2 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row2, v_weight2); + (store0, store1) = m256dot(store0, store1, item_row2, v_weight2); } else if bounds_size == 4 { let py = bounds.start; let weights = weight.get_unchecked(0..4); @@ -288,16 +288,16 @@ unsafe fn convolve_vertical_avx2_row_impl( let src_ptr3 = src.get_unchecked(v_offset3..); let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row0, v_weight0); + (store0, store1) = m256dot(store0, store1, item_row0, v_weight0); let item_row1 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row1, v_weight1); + (store0, store1) = m256dot(store0, store1, item_row1, v_weight1); let item_row2 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row2, v_weight2); + (store0, store1) = m256dot(store0, store1, item_row2, v_weight2); let item_row3 = _mm256_loadu_si256(src_ptr3.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row3, v_weight3); + (store0, store1) = m256dot(store0, store1, item_row3, v_weight3); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -307,7 +307,7 @@ unsafe fn convolve_vertical_avx2_row_impl( let src_ptr = src.get_unchecked(v_offset..); let item_row0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row0, v_weight); + (store0, store1) = m256dot(store0, store1, item_row0, v_weight); } }