diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml index 3e8fb85..e10a6e8 100644 --- a/.github/workflows/build_push.yml +++ b/.github/workflows/build_push.yml @@ -8,7 +8,6 @@ on: push: branches: - 'master' - - 'dev' - '!ci_test_*' tags-ignore: - '*' diff --git a/app/src/main.rs b/app/src/main.rs index 3b4cb82..6d8dcec 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -11,11 +11,8 @@ use fast_image_resize::{ }; use image::{EncodableLayout, GenericImageView, ImageReader}; use pic_scale::{ - Ar30ByteOrder, ImageSize, ImageStore, LinearApproxScaler, ResamplingFunction, Scaler, Scaling, - ScalingU16, ThreadingPolicy, -}; -use yuvutils_rs::{ - ar30_to_rgba8, ra30_to_rgba8, rgb8_to_ar30, rgba8_to_ar30, rgba8_to_ra30, Rgb30ByteOrder, + Ar30ByteOrder, ImageSize, ImageStore, LinearApproxScaler, LinearScaler, ResamplingFunction, + Scaler, Scaling, ScalingU16, ThreadingPolicy, }; fn resize_plane( @@ -46,7 +43,6 @@ fn resize_plane( .unwrap(); } - fn main() { // test_fast_image(); let img = ImageReader::open("./assets/nasa-4928x3279-rgba.png") @@ -57,10 +53,10 @@ fn main() { let transient = img.to_rgba8(); let mut bytes = Vec::from(transient.as_bytes()); - let mut scaler = Scaler::new(ResamplingFunction::Bilinear); + let mut scaler = LinearScaler::new(ResamplingFunction::Bilinear); scaler.set_threading_policy(ThreadingPolicy::Single); - resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear); + // resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear); // let mut choke: Vec = bytes.iter().map(|&x| (x as u16) << 2).collect(); // diff --git a/src/avx2/rgba_f16.rs b/src/avx2/rgba_f16.rs index e9d645d..85207bb 100644 --- a/src/avx2/rgba_f16.rs +++ b/src/avx2/rgba_f16.rs @@ -120,8 +120,8 @@ pub(crate) fn convolve_horizontal_rgba_avx_row_one_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { unsafe { if FMA { @@ -129,16 +129,16 @@ pub(crate) fn convolve_horizontal_rgba_avx_row_one_f16( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } else { convolve_horizontal_rgba_avx_row_one_f16_regular( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } } @@ -150,15 +150,15 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { convolve_horizontal_rgba_avx_row_one_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -168,15 +168,15 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { convolve_horizontal_rgba_avx_row_one_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -185,8 +185,8 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { const CHANNELS: usize = 4; let mut filter_offset = 0usize; @@ -203,7 +203,7 @@ unsafe fn 
convolve_horizontal_rgba_avx_row_one_f16_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_8_rgba_f16::( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, weight2, @@ -219,7 +219,7 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_4_rgba_f16::( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, store, @@ -235,7 +235,7 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_2_rgba_f16::( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight, store, ); @@ -248,7 +248,7 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_one_rgba_f16::( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, store, ); @@ -256,7 +256,7 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_impl( } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); let converted_f16 = _mm_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(_mm_add_ps( _mm256_castps256_ps128(store), _mm256_extractf128_ps::<1>(store), @@ -275,9 +275,9 @@ pub(crate) fn convolve_horizontal_rgba_avx_rows_4_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { unsafe { @@ -286,9 +286,9 @@ pub(crate) fn convolve_horizontal_rgba_avx_rows_4_f16( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } else { @@ -296,9 +296,9 @@ pub(crate) fn convolve_horizontal_rgba_avx_rows_4_f16( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -311,18 +311,18 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { convolve_horizontal_rgba_avx_rows_4_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -333,18 +333,18 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { convolve_horizontal_rgba_avx_rows_4_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -354,9 +354,9 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { const CHANNELS: usize = 4; @@ -379,7 +379,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( store_0 = convolve_horizontal_parts_8_rgba_f16::( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, weight2, @@ -388,7 +388,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( ); 
store_1 = convolve_horizontal_parts_8_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..).as_ptr(), weight0, weight1, weight2, @@ -397,7 +397,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( ); store_2 = convolve_horizontal_parts_8_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight0, weight1, weight2, @@ -406,7 +406,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( ); store_3 = convolve_horizontal_parts_8_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight0, weight1, weight2, @@ -423,28 +423,28 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( store_0 = convolve_horizontal_parts_4_rgba_f16::( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, store_0, ); store_1 = convolve_horizontal_parts_4_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..).as_ptr(), weight0, weight1, store_1, ); store_2 = convolve_horizontal_parts_4_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight0, weight1, store_2, ); store_3 = convolve_horizontal_parts_4_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight0, weight1, store_3, @@ -460,25 +460,25 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( let filter_start = jx + bounds.start; store_0 = convolve_horizontal_parts_2_rgba_f16::( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight, store_0, ); store_1 = convolve_horizontal_parts_2_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..).as_ptr(), weight, store_1, ); store_2 = convolve_horizontal_parts_2_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight, store_2, ); store_3 = convolve_horizontal_parts_2_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight, store_3, ); @@ -491,25 +491,25 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( let weight0 = _mm256_set1_ps(ptr.read_unaligned()); store_0 = convolve_horizontal_parts_one_rgba_f16::( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, store_0, ); store_1 = convolve_horizontal_parts_one_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..).as_ptr(), weight0, store_1, ); store_2 = convolve_horizontal_parts_one_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight0, store_2, ); store_3 = convolve_horizontal_parts_one_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight0, store_3, ); @@ -517,7 +517,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); let converted_f16_0 = _mm_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(_mm_add_ps( _mm256_castps256_ps128(store_0), _mm256_extractf128_ps::<1>(store_0), @@ -528,7 +528,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( 8, ); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr = dst.get_unchecked_mut(px + 
dst_stride..).as_mut_ptr(); let converted_f16_1 = _mm_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(_mm_add_ps( _mm256_castps256_ps128(store_1), _mm256_extractf128_ps::<1>(store_1), @@ -539,7 +539,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( 8, ); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); let converted_f16_2 = _mm_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(_mm_add_ps( _mm256_castps256_ps128(store_2), _mm256_extractf128_ps::<1>(store_2), @@ -550,7 +550,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( 8, ); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); let converted_f16_3 = _mm_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(_mm_add_ps( _mm256_castps256_ps128(store_3), _mm256_extractf128_ps::<1>(store_3), diff --git a/src/avx2/vertical_f16.rs b/src/avx2/vertical_f16.rs index 63b2871..bfe1436 100644 --- a/src/avx2/vertical_f16.rs +++ b/src/avx2/vertical_f16.rs @@ -37,9 +37,9 @@ use std::arch::x86_64::*; unsafe fn convolve_vertical_part_avx_f16( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -51,7 +51,7 @@ unsafe fn convolve_vertical_part_avx_f16( let py = start_y + j; let weight = *filter.get_unchecked(j); let v_weight = _mm256_set1_ps(weight); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py..).as_ptr(); let s_ptr = src_ptr.add(px); let item_row_0 = _mm256_set1_epi16(s_ptr.read_unaligned().to_bits() as i16); @@ -63,7 +63,7 @@ unsafe fn convolve_vertical_part_avx_f16( ); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT; @@ -76,9 +76,9 @@ unsafe fn convolve_vertical_part_avx_f16( unsafe fn convolve_vertical_part_avx_4_f16( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -90,7 +90,7 @@ unsafe fn convolve_vertical_part_avx_4_f16( let py = start_y + j; let weight = *filter.get_unchecked(j); let v_weight = _mm256_set1_ps(weight); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py..).as_ptr(); let s_ptr = src_ptr.add(px); let item_row_0 = _mm_loadu_si64(s_ptr as *const u8); @@ -100,7 +100,7 @@ unsafe fn convolve_vertical_part_avx_4_f16( const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT; - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); let acc = _mm256_cvtps_ph::(store_0); std::ptr::copy_nonoverlapping(&acc as *const _ as *const u8, dst_ptr as *mut u8, 8); } @@ -109,9 +109,9 @@ unsafe fn convolve_vertical_part_avx_4_f16( unsafe fn convolve_vertical_part_avx_32_f16( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -126,7 +126,7 @@ unsafe fn convolve_vertical_part_avx_32_f16( let py = start_y + j; let weight = *filter.get_unchecked(j); let v_weight = _mm256_set1_ps(weight); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py..).as_ptr(); let s_ptr = src_ptr.add(px); let item_row_0 = _mm256_loadu_si256(s_ptr as *const 
__m256i); @@ -143,7 +143,7 @@ unsafe fn convolve_vertical_part_avx_32_f16( store_3 = _mm256_fma_ps::(store_3, items3, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT; @@ -164,9 +164,9 @@ unsafe fn convolve_vertical_part_avx_32_f16( unsafe fn convolve_vertical_part_avx_16_f16( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -179,7 +179,7 @@ unsafe fn convolve_vertical_part_avx_16_f16( let py = start_y + j; let weight = *filter.get_unchecked(j); let v_weight = _mm256_set1_ps(weight); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py..).as_ptr(); let s_ptr = src_ptr.add(px); let item_row = _mm256_loadu_si256(s_ptr as *const __m256i); @@ -193,7 +193,7 @@ unsafe fn convolve_vertical_part_avx_16_f16( const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT; - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); let acc0 = avx_combine_epi( _mm256_cvtps_ph::(store_0), _mm256_cvtps_ph::(store_1), @@ -204,29 +204,19 @@ unsafe fn convolve_vertical_part_avx_16_f16( pub(crate) fn convolve_vertical_avx_row_f16( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { unsafe { if FMA { convolve_vertical_avx_row_f16_fma::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } else { convolve_vertical_avx_row_f16_regular::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } } @@ -237,18 +227,13 @@ pub(crate) fn convolve_vertical_avx_row_f16( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { convolve_vertical_avx_row_f16_impl::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } @@ -257,18 +242,13 @@ unsafe fn convolve_vertical_avx_row_f16_regular( unsafe fn convolve_vertical_avx_row_f16_fma( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { convolve_vertical_avx_row_f16_impl::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } @@ -276,8 +256,8 @@ unsafe fn convolve_vertical_avx_row_f16_fma( pub(crate) fn convolve_vertical_avx_row_f16_impl( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -289,9 +269,9 @@ pub(crate) fn convolve_vertical_avx_row_f16_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -305,9 +285,9 @@ pub(crate) fn convolve_vertical_avx_row_f16_impl( bounds.start, cx, - 
unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -321,9 +301,9 @@ pub(crate) fn convolve_vertical_avx_row_f16_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -337,9 +317,9 @@ pub(crate) fn convolve_vertical_avx_row_f16_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); diff --git a/src/convolve_naive_u16.rs b/src/convolve_naive_u16.rs deleted file mode 100644 index 1a2f849..0000000 --- a/src/convolve_naive_u16.rs +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) Radzivon Bartoshyk. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ -use crate::filter_weights::FilterBounds; -use crate::floating_point_vertical::{ - convolve_column_handler_floating_point, convolve_column_handler_floating_point_4, -}; - -#[allow(dead_code)] -pub(crate) fn convolve_vertical_rgb_native_row_u16( - dst_width: usize, - bounds: &FilterBounds, - src: &[u16], - dst: &mut [u16], - src_stride: usize, - weight: &[f32], - bit_depth: usize, -) { - let mut cx = 0usize; - - while cx + 4 < dst_width { - convolve_column_handler_floating_point_4::( - src, - src_stride, - dst, - weight, - bounds, - bit_depth as u32, - cx, - ); - - cx += 4; - } - - while cx < dst_width { - convolve_column_handler_floating_point::( - src, - src_stride, - dst, - weight, - bounds, - bit_depth as u32, - cx, - ); - - cx += 1; - } -} diff --git a/src/dispatch_group_f16.rs b/src/dispatch_group_f16.rs index 8c162c0..dd811b4 100644 --- a/src/dispatch_group_f16.rs +++ b/src/dispatch_group_f16.rs @@ -28,71 +28,52 @@ */ use crate::filter_weights::{FilterBounds, FilterWeights}; -use crate::unsafe_slice::UnsafeSlice; use crate::ImageStore; use half::f16; +use rayon::iter::{IndexedParallelIterator, ParallelIterator}; +use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; -use std::sync::Arc; pub(crate) fn convolve_vertical_dispatch_f16( image_store: &ImageStore, filter_weights: FilterWeights, destination: &mut ImageStore, pool: &Option, - dispatcher: fn(usize, &FilterBounds, *const f16, *mut f16, usize, &[f32]), + dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]), ) { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let src_stride = image_store.width * image_store.channels; - - let mut filter_offset = 0usize; - let dst_stride = destination.width * image_store.channels; + let dst_width = destination.width; if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let bounds = unsafe { weights.bounds.get_unchecked(y) }; - let weight_ptr = - unsafe { weights.weights.get_unchecked((weights.aligned_size * y)..) }; - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - dispatcher( - dst_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); + pool.install(|| { + destination + .buffer + .borrow_mut() + .par_chunks_exact_mut(dst_stride) + .enumerate() + .for_each(|(y, row)| { + let bounds = filter_weights.bounds[y]; + let filter_offset = y * filter_weights.aligned_size; + let weights = &filter_weights.weights[filter_offset..]; + let source_buffer = image_store.buffer.borrow(); + dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); }); - } }); } else { - for y in 0..destination.height { - let bounds = unsafe { filter_weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { filter_weights.weights.get_unchecked(filter_offset..) 
}; - - dispatcher( - dst_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - - filter_offset += filter_weights.aligned_size; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } + destination + .buffer + .borrow_mut() + .chunks_exact_mut(dst_stride) + .enumerate() + .for_each(|(y, row)| { + let bounds = filter_weights.bounds[y]; + let filter_offset = y * filter_weights.aligned_size; + let weights = &filter_weights.weights[filter_offset..]; + let source_buffer = image_store.buffer.borrow(); + dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); + }); } } @@ -102,95 +83,120 @@ pub(crate) fn convolve_horizontal_dispatch_f16( destination: &mut ImageStore, pool: &Option, dispatcher_4_rows: Option< - fn(usize, usize, &FilterWeights, *const f16, usize, *mut f16, usize), + fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), >, - dispatcher_row: fn(usize, usize, &FilterWeights, *const f16, *mut f16), + dispatcher_row: fn(usize, usize, &FilterWeights, &[f16], &mut [f16]), ) { - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let src_stride = image_store.width * image_store.channels; let dst_stride = destination.width * image_store.channels; let dst_width = destination.width; let src_width = image_store.width; if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - let mut yy = 0usize; + pool.install(|| { + let mut processed_4 = false; + if let Some(dispatcher) = dispatcher_4_rows { - for y in (0..destination.height.saturating_sub(4)).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; + image_store + .buffer + .borrow() + .par_chunks_exact(src_stride * 4) + .zip( + destination + .buffer + .borrow_mut() + .par_chunks_exact_mut(dst_stride * 4), + ) + .for_each(|(src, dst)| { dispatcher( dst_width, src_width, - &weights, - unsafe_source_ptr_0, + &filter_weights, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); }); - yy = y; - } + processed_4 = true; } - for y in yy..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - dispatcher_row( - dst_width, - src_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); + + let left_src_rows = if processed_4 { + image_store + .buffer + .borrow() + .chunks_exact(src_stride * 4) + .remainder() + } else { + image_store.buffer.borrow() + }; + let left_dst_rows = if processed_4 { + destination + .buffer + .borrow_mut() + .chunks_exact_mut(dst_stride * 4) + .into_remainder() + } else { + destination.buffer.borrow_mut() + }; + + left_src_rows + .par_chunks_exact(src_stride) + .zip(left_dst_rows.par_chunks_exact_mut(dst_stride)) + .for_each(|(src, dst)| { + dispatcher_row(dst_width, src_width, &filter_weights, src, dst); }); - } }); } else { - let mut yy = 0usize; - + let mut processed_4 = false; if let 
Some(dispatcher) = dispatcher_4_rows { - while yy + 4 < destination.height { + for (src, dst) in image_store + .buffer + .borrow() + .chunks_exact(src_stride * 4) + .zip( + destination + .buffer + .borrow_mut() + .chunks_exact_mut(dst_stride * 4), + ) + { dispatcher( dst_width, src_width, &filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; - - yy += 4; } + processed_4 = true; } - for _ in yy..destination.height { - dispatcher_row( - dst_width, - src_width, - &filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; + let left_src_rows = if processed_4 { + image_store + .buffer + .borrow() + .chunks_exact(src_stride * 4) + .remainder() + } else { + image_store.buffer.borrow() + }; + let left_dst_rows = if processed_4 { + destination + .buffer + .borrow_mut() + .chunks_exact_mut(dst_stride * 4) + .into_remainder() + } else { + destination.buffer.borrow_mut() + }; + for (src, dst) in left_src_rows + .chunks_exact(src_stride) + .zip(left_dst_rows.chunks_exact_mut(dst_stride)) + { + dispatcher_row(dst_width, src_width, &filter_weights, src, dst); } } } diff --git a/src/f16.rs b/src/f16.rs index 5d74f8b..3e5c23c 100644 --- a/src/f16.rs +++ b/src/f16.rs @@ -35,13 +35,14 @@ use crate::avx2::{ convolve_vertical_avx_row_f16, }; use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; -use crate::convolve_naive_f32::{ - convolve_horizontal_rgb_native_row, convolve_horizontal_rgba_4_row_f32, -}; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::cpu_features::{is_aarch_f16_supported, is_aarch_f16c_supported}; use crate::dispatch_group_f16::{convolve_horizontal_dispatch_f16, convolve_vertical_dispatch_f16}; use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::floating_point_horizontal::{ + convolve_row_handler_floating_point, convolve_row_handler_floating_point_4, +}; +use crate::floating_point_vertical::column_handler_floating_point; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::neon::{ convolve_horizontal_rgb_neon_row_one_f16, convolve_horizontal_rgb_neon_rows_4_f16, @@ -54,7 +55,6 @@ use crate::neon::{ xconvolve_horizontal_rgba_neon_row_one_f16, xconvolve_horizontal_rgba_neon_rows_4_f16, xconvolve_vertical_rgb_neon_row_f16, }; -use crate::rgb_f32::convolve_vertical_rgb_native_row_f32; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::sse::{ convolve_horizontal_rgb_sse_row_one_f16, convolve_horizontal_rgb_sse_rows_4_f16, @@ -65,6 +65,35 @@ use crate::ImageStore; use half::f16; use rayon::ThreadPool; +fn convolve_horizontal_rgba_4_row_f16( + _: usize, + _: usize, + filter_weights: &FilterWeights, + src: &[f16], + src_stride: usize, + dst: &mut [f16], + dst_stride: usize, +) { + convolve_row_handler_floating_point_4::( + src, + src_stride, + dst, + dst_stride, + filter_weights, + 8, + ) +} + +fn convolve_horizontal_rgb_native_row_f16( + _: usize, + _: usize, + filter_weights: &FilterWeights, + src: &[f16], + dst: &mut [f16], +) { + convolve_row_handler_floating_point::(src, dst, filter_weights, 8) +} + impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 4> { fn convolve_horizontal( &self, @@ -73,10 +102,10 
@@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 4> { pool: &Option, ) { let mut _dispatcher_4_rows: Option< - fn(usize, usize, &FilterWeights, *const f16, usize, *mut f16, usize), - > = Some(convolve_horizontal_rgba_4_row_f32::); - let mut _dispatcher_row: fn(usize, usize, &FilterWeights, *const f16, *mut f16) = - convolve_horizontal_rgb_native_row::; + fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), + > = Some(convolve_horizontal_rgba_4_row_f16::<4>); + let mut _dispatcher_row: fn(usize, usize, &FilterWeights, &[f16], &mut [f16]) = + convolve_horizontal_rgb_native_row_f16::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { if is_aarch_f16c_supported() { @@ -126,6 +155,17 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 4> { } } +fn convolve_vertical_rgb_native_row_f16( + _: usize, + bounds: &FilterBounds, + src: &[f16], + dst: &mut [f16], + src_stride: usize, + weight: &[f32], +) { + column_handler_floating_point::(bounds, src, dst, src_stride, weight, 8); +} + impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 4> { fn convolve_vertical( &self, @@ -133,8 +173,8 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 4> { destination: &mut ImageStore, pool: &Option, ) { - let mut _dispatcher: fn(usize, &FilterBounds, *const f16, *mut f16, usize, &[f32]) = - convolve_vertical_rgb_native_row_f32::; + let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = + convolve_vertical_rgb_native_row_f16::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { if is_aarch_f16c_supported() { @@ -178,10 +218,10 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 3> { pool: &Option, ) { let mut _dispatcher_4_rows: Option< - fn(usize, usize, &FilterWeights, *const f16, usize, *mut f16, usize), - > = Some(convolve_horizontal_rgba_4_row_f32::); - let mut _dispatcher_row: fn(usize, usize, &FilterWeights, *const f16, *mut f16) = - convolve_horizontal_rgb_native_row::; + fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), + > = Some(convolve_horizontal_rgba_4_row_f16::<3>); + let mut _dispatcher_row: fn(usize, usize, &FilterWeights, &[f16], &mut [f16]) = + convolve_horizontal_rgb_native_row_f16::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { if is_aarch_f16c_supported() { @@ -229,8 +269,8 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 3> { destination: &mut ImageStore, pool: &Option, ) { - let mut _dispatcher: fn(usize, &FilterBounds, *const f16, *mut f16, usize, &[f32]) = - convolve_vertical_rgb_native_row_f32::; + let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = + convolve_vertical_rgb_native_row_f16::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { if is_aarch_f16c_supported() { @@ -274,10 +314,10 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 1> { pool: &Option, ) { let _dispatcher_4_rows: Option< - fn(usize, usize, &FilterWeights, *const f16, usize, *mut f16, usize), - > = Some(convolve_horizontal_rgba_4_row_f32::); - let _dispatcher_row: fn(usize, usize, &FilterWeights, *const f16, *mut f16) = - convolve_horizontal_rgb_native_row::; + fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), + > = Some(convolve_horizontal_rgba_4_row_f16::<1>); + let _dispatcher_row: fn(usize, usize, &FilterWeights, &[f16], &mut [f16]) = + convolve_horizontal_rgb_native_row_f16::<1>; convolve_horizontal_dispatch_f16( self, filter_weights, @@ -296,8 +336,8 @@ impl<'a> 
VerticalConvolutionPass for ImageStore<'a, f16, 1> {
         destination: &mut ImageStore,
         pool: &Option<ThreadPool>,
     ) {
-        let mut _dispatcher: fn(usize, &FilterBounds, *const f16, *mut f16, usize, &[f32]) =
-            convolve_vertical_rgb_native_row_f32::<f16, 1>;
+        let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) =
+            convolve_vertical_rgb_native_row_f16::<1>;
         #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
         {
             if is_aarch_f16c_supported() {
diff --git a/src/lib.rs b/src/lib.rs
index 929b771..de47fb5 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -43,7 +43,6 @@ mod color_group;
 mod colors;
 mod convolution;
 mod convolve_naive_f32;
-mod convolve_naive_u16;
 mod cpu_features;
 mod dispatch_group_ar30;
 #[cfg(feature = "half")]
diff --git a/src/mixed_storage.rs b/src/mixed_storage.rs
index 594b8c4..1cc63ba 100644
--- a/src/mixed_storage.rs
+++ b/src/mixed_storage.rs
@@ -55,6 +55,15 @@ impl MixedStorage for f32 {
     }
 }
 
+#[cfg(feature = "half")]
+impl MixedStorage<half::f16> for f32 {
+    #[inline(always)]
+    #[allow(clippy::manual_clamp)]
+    fn to_mixed(self, _: u32) -> half::f16 {
+        half::f16::from_f32(self)
+    }
+}
+
 impl MixedStorage for f64 {
     #[inline(always)]
     #[allow(clippy::manual_clamp)]
diff --git a/src/neon/alpha_f16.rs b/src/neon/alpha_f16.rs
index 20a320c..71bd739 100644
--- a/src/neon/alpha_f16.rs
+++ b/src/neon/alpha_f16.rs
@@ -90,7 +90,7 @@ unsafe fn neon_premultiply_alpha_rgba_row_f16(dst: &mut [half::f16], src: &[half
     premultiply_pixel_f16_row(rem, src_rem);
 }
 
-pub fn neon_premultiply_alpha_rgba_f16(
+pub(crate) fn neon_premultiply_alpha_rgba_f16(
     dst: &mut [half::f16],
     src: &[half::f16],
     width: usize,
@@ -197,7 +197,7 @@ unsafe fn neon_unpremultiply_alpha_rgba_row_f16(in_place: &mut [half::f16]) {
     unpremultiply_pixel_f16_row(rem);
 }
 
-pub fn neon_unpremultiply_alpha_rgba_f16(
+pub(crate) fn neon_unpremultiply_alpha_rgba_f16(
     in_place: &mut [half::f16],
     width: usize,
     _: usize,
diff --git a/src/neon/alpha_f16_full.rs b/src/neon/alpha_f16_full.rs
index c3c16ec..0a2f1c4 100644
--- a/src/neon/alpha_f16_full.rs
+++ b/src/neon/alpha_f16_full.rs
@@ -67,7 +67,7 @@ unsafe fn neon_premultiply_alpha_rgba_row_f16_full(dst: &mut [half::f16], src: &
     premultiply_pixel_f16_row(rem, src_rem);
 }
 
-pub fn neon_premultiply_alpha_rgba_f16_full(
+pub(crate) fn neon_premultiply_alpha_rgba_f16_full(
     dst: &mut [half::f16],
     src: &[half::f16],
     width: usize,
@@ -135,7 +135,7 @@ unsafe fn neon_unpremultiply_alpha_rgba_f16_row_full(in_place: &mut [half::f16])
     unpremultiply_pixel_f16_row(rem);
 }
 
-pub fn neon_unpremultiply_alpha_rgba_f16_full(
+pub(crate) fn neon_unpremultiply_alpha_rgba_f16_full(
     in_place: &mut [half::f16],
     width: usize,
     _: usize,
diff --git a/src/neon/alpha_f32.rs b/src/neon/alpha_f32.rs
index d280c19..dbcfe7d 100644
--- a/src/neon/alpha_f32.rs
+++ b/src/neon/alpha_f32.rs
@@ -64,7 +64,7 @@ unsafe fn neon_premultiply_alpha_rgba_row_f32(dst: &mut [f32], src: &[f32]) {
     premultiply_pixel_f32_row(rem, src_rem);
 }
 
-pub fn neon_premultiply_alpha_rgba_f32(
+pub(crate) fn neon_premultiply_alpha_rgba_f32(
     dst: &mut [f32],
     src: &[f32],
     width: usize,
@@ -108,7 +108,7 @@ unsafe fn neon_unpremultiply_alpha_rgba_f32_row(in_place: &mut [f32]) {
     unpremultiply_pixel_f32_row(rem);
 }
 
-pub fn neon_unpremultiply_alpha_rgba_f32(
+pub(crate) fn neon_unpremultiply_alpha_rgba_f32(
     in_place: &mut [f32],
     width: usize,
     _: usize,
diff --git a/src/neon/alpha_u16.rs b/src/neon/alpha_u16.rs
index dbdee0e..bc7048f 100644
--- a/src/neon/alpha_u16.rs
+++ b/src/neon/alpha_u16.rs
@@ -161,7 +161,7 @@ pub fn 
neon_premultiply_alpha_rgba_row_u16(dst: &mut [u16], src: &[u16], bit_dep premultiply_alpha_rgba_row(rem, src_rem, max_colors); } -pub fn neon_premultiply_alpha_rgba_u16( +pub(crate) fn neon_premultiply_alpha_rgba_u16( dst: &mut [u16], src: &[u16], width: usize, @@ -252,7 +252,7 @@ fn neon_unpremultiply_alpha_rgba_row_u16(in_place: &mut [u16], bit_depth: usize) unpremultiply_alpha_rgba_row(rem, max_colors); } -pub fn neon_unpremultiply_alpha_rgba_u16( +pub(crate) fn neon_unpremultiply_alpha_rgba_u16( in_place: &mut [u16], width: usize, _: usize, diff --git a/src/neon/alpha_u8.rs b/src/neon/alpha_u8.rs index c021518..c296a63 100644 --- a/src/neon/alpha_u8.rs +++ b/src/neon/alpha_u8.rs @@ -134,7 +134,7 @@ unsafe fn neon_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { premultiply_alpha_rgba_row_impl(rem, src_rem); } -pub fn neon_premultiply_alpha_rgba( +pub(crate) fn neon_premultiply_alpha_rgba( dst: &mut [u8], src: &[u8], width: usize, @@ -178,7 +178,7 @@ unsafe fn neon_unpremultiply_alpha_rgba_impl_row(in_place: &mut [u8]) { unpremultiply_alpha_rgba_row_impl(rem); } -pub fn neon_unpremultiply_alpha_rgba( +pub(crate) fn neon_unpremultiply_alpha_rgba( in_place: &mut [u8], width: usize, _: usize, diff --git a/src/neon/convolve_f16.rs b/src/neon/convolve_f16.rs index e7276ee..8d0ada8 100644 --- a/src/neon/convolve_f16.rs +++ b/src/neon/convolve_f16.rs @@ -36,9 +36,9 @@ use crate::neon::*; pub(crate) unsafe fn convolve_vertical_part_neon_8_f16( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, blend_length: usize, @@ -52,7 +52,7 @@ pub(crate) unsafe fn convolve_vertical_part_neon_8_f16 let py = start_y + j; let weight = filter.get_unchecked(j..); let v_weight = vld1q_dup_f32(weight.as_ptr()); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py..).as_ptr(); let s_ptr = src_ptr.add(px); let item_row = if USE_BLENDING { @@ -72,7 +72,7 @@ pub(crate) unsafe fn convolve_vertical_part_neon_8_f16 let item = xcombine_f16(xvcvt_f16_f32(store_0), xvcvt_f16_f32(store_1)); - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); if USE_BLENDING { let mut transient: [half::f16; 8] = [half::f16::from_f32(0.); 8]; xvstq_f16(transient.as_mut_ptr(), item); diff --git a/src/neon/f16_utils.rs b/src/neon/f16_utils.rs index 6a23a70..0a94d05 100644 --- a/src/neon/f16_utils.rs +++ b/src/neon/f16_utils.rs @@ -33,23 +33,56 @@ use std::arch::asm; /// Provides basic support for f16 +#[allow(unused)] +macro_rules! static_assert { + ($e:expr) => { + const { + assert!($e); + } + }; + ($e:expr, $msg:expr) => { + const { + assert!($e, $msg); + } + }; +} + +#[allow(unused_macros)] +macro_rules! 
static_assert_uimm_bits { + ($imm:ident, $bits:expr) => { + // `0 <= $imm` produces a warning if the immediate has an unsigned type + #[allow(unused_comparisons)] + { + static_assert!( + 0 <= $imm && $imm < (1 << $bits), + concat!( + stringify!($imm), + " doesn't fit in ", + stringify!($bits), + " bits", + ) + ) + } + }; +} + #[derive(Debug, Clone, Copy)] #[allow(non_camel_case_types)] #[allow(dead_code)] -pub struct x_float16x4_t(pub(crate) uint16x4_t); +pub(crate) struct x_float16x4_t(pub(crate) uint16x4_t); #[derive(Debug, Clone, Copy)] #[allow(non_camel_case_types)] #[allow(dead_code)] -pub struct x_float16x8_t(pub(crate) uint16x8_t); +pub(crate) struct x_float16x8_t(pub(crate) uint16x8_t); #[derive(Debug, Clone, Copy)] #[allow(non_camel_case_types)] -pub struct x_float16x8x2_t(pub(crate) x_float16x8_t, pub(crate) x_float16x8_t); +pub(crate) struct x_float16x8x2_t(pub(crate) x_float16x8_t, pub(crate) x_float16x8_t); #[derive(Debug, Clone, Copy)] #[allow(non_camel_case_types)] -pub struct x_float16x8x4_t( +pub(crate) struct x_float16x8x4_t( pub(crate) x_float16x8_t, pub(crate) x_float16x8_t, pub(crate) x_float16x8_t, @@ -57,41 +90,49 @@ pub struct x_float16x8x4_t( ); #[inline] -pub unsafe fn xvld_f16(ptr: *const half::f16) -> x_float16x4_t { +pub(crate) unsafe fn xvld_f16(ptr: *const half::f16) -> x_float16x4_t { let store: uint16x4_t = vld1_u16(std::mem::transmute(ptr)); std::mem::transmute(store) } #[inline] -pub unsafe fn xvldq_f16(ptr: *const half::f16) -> x_float16x8_t { +pub(crate) unsafe fn xvldq_f16(ptr: *const half::f16) -> x_float16x8_t { let store: uint16x8_t = vld1q_u16(std::mem::transmute(ptr)); std::mem::transmute(store) } #[inline] -pub unsafe fn xvldq_f16_x2(ptr: *const half::f16) -> x_float16x8x2_t { - let store = vld1q_u16_x2(std::mem::transmute(ptr)); - std::mem::transmute(store) +pub(crate) unsafe fn xvldq_f16_x2(ptr: *const half::f16) -> x_float16x8x2_t { + let ptr_u16 = ptr as *const u16; + x_float16x8x2_t( + xreinterpretq_f16_u16(vld1q_u16(ptr_u16)), + xreinterpretq_f16_u16(vld1q_u16(ptr_u16.add(8))), + ) } #[inline] -pub unsafe fn xvldq_f16_x4(ptr: *const half::f16) -> x_float16x8x4_t { - let store = vld1q_u16_x4(std::mem::transmute(ptr)); - std::mem::transmute(store) +pub(crate) unsafe fn xvldq_f16_x4(ptr: *const half::f16) -> x_float16x8x4_t { + let ptr_u16 = ptr as *const u16; + x_float16x8x4_t( + xreinterpretq_f16_u16(vld1q_u16(ptr_u16)), + xreinterpretq_f16_u16(vld1q_u16(ptr_u16.add(8))), + xreinterpretq_f16_u16(vld1q_u16(ptr_u16.add(16))), + xreinterpretq_f16_u16(vld1q_u16(ptr_u16.add(24))), + ) } #[inline] -pub unsafe fn xvget_low_f16(x: x_float16x8_t) -> x_float16x4_t { +pub(crate) unsafe fn xvget_low_f16(x: x_float16x8_t) -> x_float16x4_t { std::mem::transmute(vget_low_u16(std::mem::transmute(x))) } #[inline] -pub unsafe fn xvget_high_f16(x: x_float16x8_t) -> x_float16x4_t { +pub(crate) unsafe fn xvget_high_f16(x: x_float16x8_t) -> x_float16x4_t { std::mem::transmute(vget_high_u16(std::mem::transmute(x))) } #[inline] -pub unsafe fn xcombine_f16(low: x_float16x4_t, high: x_float16x4_t) -> x_float16x8_t { +pub(crate) unsafe fn xcombine_f16(low: x_float16x4_t, high: x_float16x4_t) -> x_float16x8_t { std::mem::transmute(vcombine_u16( std::mem::transmute(low), std::mem::transmute(high), @@ -99,22 +140,22 @@ pub unsafe fn xcombine_f16(low: x_float16x4_t, high: x_float16x4_t) -> x_float16 } #[inline] -pub unsafe fn xreinterpret_u16_f16(x: x_float16x4_t) -> uint16x4_t { +pub(crate) unsafe fn xreinterpret_u16_f16(x: x_float16x4_t) -> uint16x4_t { 
std::mem::transmute(x) } #[inline] -pub unsafe fn xreinterpretq_u16_f16(x: x_float16x8_t) -> uint16x8_t { +pub(crate) unsafe fn xreinterpretq_u16_f16(x: x_float16x8_t) -> uint16x8_t { std::mem::transmute(x) } #[inline] -pub unsafe fn xreinterpret_f16_u16(x: uint16x4_t) -> x_float16x4_t { +pub(crate) unsafe fn xreinterpret_f16_u16(x: uint16x4_t) -> x_float16x4_t { std::mem::transmute(x) } #[inline] -pub unsafe fn xreinterpretq_f16_u16(x: uint16x8_t) -> x_float16x8_t { +pub(crate) unsafe fn xreinterpretq_f16_u16(x: uint16x8_t) -> x_float16x8_t { std::mem::transmute(x) } @@ -129,7 +170,7 @@ pub(super) unsafe fn xvzeros_f16() -> x_float16x4_t { } #[inline] -pub unsafe fn xvcvt_f32_f16(x: x_float16x4_t) -> float32x4_t { +pub(crate) unsafe fn xvcvt_f32_f16(x: x_float16x4_t) -> float32x4_t { let src: uint16x4_t = xreinterpret_u16_f16(x); let dst: float32x4_t; asm!( @@ -216,6 +257,130 @@ pub(super) unsafe fn xvfmla_f16( xreinterpret_f16_u16(result) } +#[target_feature(enable = "fp16")] +#[inline] +pub(super) unsafe fn xvfmla_laneq_f16( + a: x_float16x4_t, + b: x_float16x4_t, + c: x_float16x8_t, +) -> x_float16x4_t { + static_assert_uimm_bits!(LANE, 3); + let mut result: uint16x4_t = xreinterpret_u16_f16(a); + + if LANE == 0 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[0]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 1 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[1]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 2 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[2]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 3 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[3]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 4 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[4]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 5 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[5]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 6 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[6]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 7 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[7]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } + xreinterpret_f16_u16(result) +} + +#[target_feature(enable = "fp16")] +#[inline] +pub(super) unsafe fn xvfmla_lane_f16( + a: x_float16x4_t, + b: x_float16x4_t, + c: x_float16x4_t, +) -> x_float16x4_t { + static_assert_uimm_bits!(LANE, 3); + let mut result: uint16x4_t = xreinterpret_u16_f16(a); + + if LANE == 0 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[0]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpret_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 1 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[1]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpret_u16_f16(c), + options(pure, nomem, nostack) + 
);
+    } else if LANE == 2 {
+        asm!(
+            "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[2]",
+            inout(vreg) result,
+            in(vreg) xreinterpret_u16_f16(b),
+            in(vreg) xreinterpret_u16_f16(c),
+            options(pure, nomem, nostack)
+        );
+    } else if LANE == 3 {
+        asm!(
+            "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[3]",
+            inout(vreg) result,
+            in(vreg) xreinterpret_u16_f16(b),
+            in(vreg) xreinterpret_u16_f16(c),
+            options(pure, nomem, nostack)
+        );
+    }
+    xreinterpret_f16_u16(result)
+}
+
 #[target_feature(enable = "fp16")]
 #[inline]
 pub(super) unsafe fn xvfmlaq_f16(
@@ -309,38 +474,66 @@ pub(super) unsafe fn xvbslq_f16(
     )
 }
 
 #[inline]
-pub unsafe fn xvst_f16(ptr: *const half::f16, x: x_float16x4_t) {
+pub(crate) unsafe fn xvst_f16(ptr: *mut half::f16, x: x_float16x4_t) {
     vst1_u16(std::mem::transmute(ptr), xreinterpret_u16_f16(x))
 }
 
 #[inline]
-pub unsafe fn xvstq_f16(ptr: *const half::f16, x: x_float16x8_t) {
+pub(crate) unsafe fn xvstq_f16(ptr: *mut half::f16, x: x_float16x8_t) {
     vst1q_u16(std::mem::transmute(ptr), xreinterpretq_u16_f16(x))
 }
 
 #[inline]
-pub unsafe fn xvstq_f16_x2(ptr: *const half::f16, x: x_float16x8x2_t) {
-    vst1q_u16_x2(std::mem::transmute(ptr), std::mem::transmute(x))
+pub(crate) unsafe fn xvstq_f16_x2(ptr: *mut half::f16, x: x_float16x8x2_t) {
+    let ptr_u16 = ptr as *mut u16;
+    vst1q_u16(ptr_u16, xreinterpretq_u16_f16(x.0));
+    vst1q_u16(ptr_u16.add(8), xreinterpretq_u16_f16(x.1));
 }
 
 #[inline]
-pub unsafe fn xvstq_f16_x4(ptr: *const half::f16, x: x_float16x8x4_t) {
-    vst1q_u16_x4(std::mem::transmute(ptr), std::mem::transmute(x))
+pub(crate) unsafe fn xvstq_f16_x4(ptr: *mut half::f16, x: x_float16x8x4_t) {
+    let ptr_u16 = ptr as *mut u16;
+    vst1q_u16(ptr_u16, xreinterpretq_u16_f16(x.0));
+    vst1q_u16(ptr_u16.add(8), xreinterpretq_u16_f16(x.1));
+    vst1q_u16(ptr_u16.add(16), xreinterpretq_u16_f16(x.2));
+    vst1q_u16(ptr_u16.add(24), xreinterpretq_u16_f16(x.3));
 }
 
 #[inline]
-pub unsafe fn xvdup_lane_f16<const N: i32>(a: x_float16x4_t) -> x_float16x4_t {
+pub(crate) unsafe fn xvdup_lane_f16<const N: i32>(a: x_float16x4_t) -> x_float16x4_t {
     xreinterpret_f16_u16(vdup_lane_u16::<N>(xreinterpret_u16_f16(a)))
 }
 
 #[inline]
-pub unsafe fn xvdup_laneq_f16<const N: i32>(a: x_float16x8_t) -> x_float16x4_t {
+pub(crate) unsafe fn xvdup_laneq_f16<const N: i32>(a: x_float16x8_t) -> x_float16x4_t {
     xreinterpret_f16_u16(vdup_laneq_u16::<N>(xreinterpretq_u16_f16(a)))
 }
 
+#[inline]
+pub(crate) unsafe fn xvld1q_lane_f16<const LANE: i32>(
+    ptr: *const half::f16,
+    src: x_float16x8_t,
+) -> x_float16x8_t {
+    xreinterpretq_f16_u16(vld1q_lane_u16::<LANE>(
+        ptr as *const u16,
+        xreinterpretq_u16_f16(src),
+    ))
+}
+
+#[inline]
+pub(crate) unsafe fn xvsetq_lane_f16<const LANE: i32>(
+    v: half::f16,
+    r: x_float16x8_t,
+) -> x_float16x8_t {
+    xreinterpretq_f16_u16(vsetq_lane_u16::<LANE>(
+        v.to_bits(),
+        xreinterpretq_u16_f16(r),
+    ))
+}
+
 #[target_feature(enable = "fp16")]
 #[inline]
-pub unsafe fn vceqzq_f16(a: x_float16x8_t) -> uint16x8_t {
+pub(crate) unsafe fn vceqzq_f16(a: x_float16x8_t) -> uint16x8_t {
     let mut result: uint16x8_t;
     asm!(
         "fcmeq {0:v}.8h, {1:v}.8h, #0",
diff --git a/src/neon/mod.rs b/src/neon/mod.rs
index f937e95..00c0c9d 100644
--- a/src/neon/mod.rs
+++ b/src/neon/mod.rs
@@ -66,51 +66,62 @@ mod vertical_u16_lb;
 mod vertical_u8;
 
 #[cfg(feature = "half")]
-pub use alpha_f16::{neon_premultiply_alpha_rgba_f16, neon_unpremultiply_alpha_rgba_f16};
+pub(crate) use alpha_f16::{neon_premultiply_alpha_rgba_f16, neon_unpremultiply_alpha_rgba_f16};
 #[cfg(feature = "half")]
-pub use alpha_f16_full::{
+pub(crate) use alpha_f16_full::{
     neon_premultiply_alpha_rgba_f16_full, neon_unpremultiply_alpha_rgba_f16_full,
 };
-pub use 
alpha_f32::neon_premultiply_alpha_rgba_f32; -pub use alpha_f32::neon_unpremultiply_alpha_rgba_f32; -pub use alpha_u16::{neon_premultiply_alpha_rgba_u16, neon_unpremultiply_alpha_rgba_u16}; -pub use alpha_u8::neon_premultiply_alpha_rgba; -pub use alpha_u8::neon_unpremultiply_alpha_rgba; +pub(crate) use alpha_f32::neon_premultiply_alpha_rgba_f32; +pub(crate) use alpha_f32::neon_unpremultiply_alpha_rgba_f32; +pub(crate) use alpha_u16::{neon_premultiply_alpha_rgba_u16, neon_unpremultiply_alpha_rgba_u16}; +pub(crate) use alpha_u8::neon_premultiply_alpha_rgba; +pub(crate) use alpha_u8::neon_unpremultiply_alpha_rgba; #[cfg(feature = "half")] -pub use f16_utils::*; +pub(crate) use f16_utils::*; pub(crate) use horizontal_ar30::neon_convolve_horizontal_rgba_rows_4_ar30; -pub use plane_f32::convolve_horizontal_plane_neon_row_one; -pub use plane_f32::convolve_horizontal_plane_neon_rows_4; +pub(crate) use plane_f32::convolve_horizontal_plane_neon_row_one; +pub(crate) use plane_f32::convolve_horizontal_plane_neon_rows_4; pub use plane_u8::{convolve_horizontal_plane_neon_row, convolve_horizontal_plane_neon_rows_4_u8}; #[cfg(feature = "half")] -pub use rgb_f16::{ +pub(crate) use rgb_f16::{ convolve_horizontal_rgb_neon_row_one_f16, convolve_horizontal_rgb_neon_rows_4_f16, }; #[cfg(feature = "half")] -pub use rgb_f16_full::{ +pub(crate) use rgb_f16_full::{ xconvolve_horizontal_rgb_neon_row_one_f16, xconvolve_horizontal_rgb_neon_rows_4_f16, }; -pub use rgb_f32::*; -pub use rgb_u8::*; +pub(crate) use rgb_f32::{ + convolve_horizontal_rgb_neon_row_one_f32, convolve_horizontal_rgb_neon_rows_4_f32, +}; +pub(crate) use rgb_u8::{ + convolve_horizontal_rgb_neon_row_one, convolve_horizontal_rgb_neon_rows_4, +}; #[cfg(feature = "half")] -pub use rgba_f16::convolve_horizontal_rgba_neon_row_one_f16; +pub(crate) use rgba_f16::convolve_horizontal_rgba_neon_row_one_f16; #[cfg(feature = "half")] -pub use rgba_f16::convolve_horizontal_rgba_neon_rows_4_f16; +pub(crate) use rgba_f16::convolve_horizontal_rgba_neon_rows_4_f16; #[cfg(feature = "half")] -pub use rgba_f16_full::{ +pub(crate) use rgba_f16_full::{ xconvolve_horizontal_rgba_neon_row_one_f16, xconvolve_horizontal_rgba_neon_rows_4_f16, }; -pub use rgba_f32::*; -pub use rgba_u16_lb::{ +pub(crate) use rgba_f32::{ + convolve_horizontal_rgba_neon_row_one, convolve_horizontal_rgba_neon_rows_4, +}; +pub(crate) use rgba_u16_lb::{ convolve_horizontal_rgba_neon_rows_4_lb_u16, convolve_horizontal_rgba_neon_u16_lb_row, }; -pub use rgba_u8::*; +pub(crate) use rgba_u8::{ + convolve_horizontal_rgba_neon_row, convolve_horizontal_rgba_neon_row_i16, + convolve_horizontal_rgba_neon_rows_4_u8, convolve_horizontal_rgba_neon_rows_4_u8_i16, +}; pub(crate) use vertical_ar30::neon_column_handler_fixed_point_ar30; #[cfg(feature = "half")] -pub use vertical_f16::convolve_vertical_rgb_neon_row_f16; +pub(crate) use vertical_f16::convolve_vertical_rgb_neon_row_f16; #[cfg(feature = "half")] -pub use vertical_f16_full::xconvolve_vertical_rgb_neon_row_f16; -pub use vertical_f32::convolve_vertical_rgb_neon_row_f32; -pub use vertical_u16::convolve_column_u16; -pub use vertical_u16_lb::convolve_column_lb_u16; -pub use vertical_u8::{convolve_vertical_neon_i16_precision, convolve_vertical_neon_i32_precision}; +pub(crate) use vertical_f16_full::xconvolve_vertical_rgb_neon_row_f16; +pub(crate) use vertical_f32::convolve_vertical_rgb_neon_row_f32; +pub(crate) use vertical_u16::convolve_column_u16; +pub(crate) use vertical_u16_lb::convolve_column_lb_u16; +pub(crate) use vertical_u8::{ + 
convolve_vertical_neon_i16_precision, convolve_vertical_neon_i32_precision, +}; diff --git a/src/neon/plane_f32.rs b/src/neon/plane_f32.rs index e13e5b3..3db9629 100644 --- a/src/neon/plane_f32.rs +++ b/src/neon/plane_f32.rs @@ -28,7 +28,7 @@ */ use crate::filter_weights::FilterWeights; -use crate::neon::utils::{prefer_vfmaq_f32, xvld1q_f32_x4}; +use crate::neon::utils::{prefer_vfmaq_f32, xvld1q_f32_x2, xvld1q_f32_x4}; use std::arch::aarch64::*; macro_rules! conv_horiz_plane_16_f32 { @@ -49,7 +49,7 @@ macro_rules! conv_horiz_plane_8_f32 { ($start_x: expr, $src: expr, $set1: expr, $set2: expr, $store: expr) => {{ let src_ptr = $src.add($start_x); - let rgb_pixel = vld1q_f32_x2(src_ptr); + let rgb_pixel = xvld1q_f32_x2(src_ptr); let mut acc = prefer_vfmaq_f32($store, rgb_pixel.0, $set1); acc = prefer_vfmaq_f32(acc, rgb_pixel.1, $set2); @@ -87,7 +87,7 @@ macro_rules! conv_horiz_plane_1_f32 { }}; } -pub fn convolve_horizontal_plane_neon_row_one( +pub(crate) fn convolve_horizontal_plane_neon_row_one( dst_width: usize, _: usize, filter_weights: &FilterWeights, @@ -119,7 +119,7 @@ pub fn convolve_horizontal_plane_neon_row_one( while jx + 8 < bounds.size { let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = vld1q_f32_x2(ptr); + let read_weights = xvld1q_f32_x2(ptr); store = conv_horiz_plane_8_f32!( bounds_start, unsafe_source_ptr_0, @@ -165,7 +165,7 @@ pub fn convolve_horizontal_plane_neon_row_one( } } -pub fn convolve_horizontal_plane_neon_rows_4( +pub(crate) fn convolve_horizontal_plane_neon_rows_4( dst_width: usize, _: usize, filter_weights: &FilterWeights, @@ -208,7 +208,7 @@ pub fn convolve_horizontal_plane_neon_rows_4( while jx + 8 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = vld1q_f32_x2(ptr); + let read_weights = xvld1q_f32_x2(ptr); let bounds_start = bounds.start + jx; store_0 = conv_horiz_plane_8_f32!( bounds_start, diff --git a/src/neon/rgb_f16.rs b/src/neon/rgb_f16.rs index ea47563..b2f9737 100644 --- a/src/neon/rgb_f16.rs +++ b/src/neon/rgb_f16.rs @@ -30,7 +30,7 @@ use std::arch::aarch64::*; use crate::filter_weights::FilterWeights; -use crate::neon::utils::prefer_vfmaq_f32; +use crate::neon::utils::{prefer_vfmaq_f32, prefer_vfmaq_lane_f32, prefer_vfmaq_laneq_f32}; use crate::neon::*; macro_rules! write_rgb_f16 { @@ -43,45 +43,8 @@ macro_rules! write_rgb_f16 { }}; } -macro_rules! 
conv_horiz_5_rgb_f16 { - ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ - const COMPONENTS: usize = 3; - let src_ptr = $src.add($start_x * COMPONENTS); - - let rgb_pixel_s = xvldq_f16_x2(src_ptr); - let rgb_first_u = vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)); - let rgb_first = xreinterpret_f16_u16(rgb_first_u); - let rgb_second_u = vext_u16::<3>( - vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)), - vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)), - ); - let rgb_second = xreinterpret_f16_u16(rgb_second_u); - - let rgb_third_u = vext_u16::<2>( - vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)), - vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.1)), - ); - let rgb_third = xreinterpret_f16_u16(rgb_third_u); - - let rgb_fourth_u = vext_u16::<1>( - vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.1)), - vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s.1)), - ); - let rgb_fourth = xreinterpret_f16_u16(rgb_fourth_u); - - let rgb_fifth = xvget_high_f16(rgb_pixel_s.1); - - let mut acc = prefer_vfmaq_f32($store, xvcvt_f32_f16(rgb_first), $set.0); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(rgb_second), $set.1); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(rgb_third), $set.2); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(rgb_fourth), $set.3); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(rgb_fifth), $set.4); - acc - }}; -} - macro_rules! conv_horiz_4_rgb_f16 { - ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ + ($start_x: expr, $src: expr, $weights: expr, $store: expr) => {{ const COMPONENTS: usize = 3; let src_ptr = $src.add($start_x * COMPONENTS); @@ -106,10 +69,10 @@ macro_rules! conv_horiz_4_rgb_f16 { ); let rgb_fourth = xreinterpret_f16_u16(rgb_fourth_u); - let acc = prefer_vfmaq_f32($store, xvcvt_f32_f16(rgb_first), $set.0); - let acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(rgb_second), $set.1); - let acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(rgb_third), $set.2); - let acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(rgb_fourth), $set.3); + let acc = prefer_vfmaq_laneq_f32::<0>($store, xvcvt_f32_f16(rgb_first), $weights); + let acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(rgb_second), $weights); + let acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(rgb_third), $weights); + let acc = prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(rgb_fourth), $weights); acc }}; } @@ -138,8 +101,8 @@ macro_rules! conv_horiz_2_rgb_f16 { ); let rgb_second = xreinterpret_f16_u16(rgb_second_u); - let acc = prefer_vfmaq_f32($store, xvcvt_f32_f16(rgb_first), $set.0); - let acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(rgb_second), $set.1); + let acc = prefer_vfmaq_lane_f32::<0>($store, xvcvt_f32_f16(rgb_first), $set); + let acc = prefer_vfmaq_lane_f32::<1>(acc, xvcvt_f32_f16(rgb_second), $set); acc }}; } @@ -164,13 +127,13 @@ macro_rules! 
conv_horiz_1_rgb_f16 { }}; } -pub fn convolve_horizontal_rgb_neon_rows_4_f16( +pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, + src: &[half::f16], src_stride: usize, - unsafe_destination_ptr_0: *mut half::f16, + dst: &mut [half::f16], dst_stride: usize, ) { unsafe { @@ -189,43 +152,17 @@ pub fn convolve_horizontal_rgb_neon_rows_4_f16( let mut store_2 = zeros; let mut store_3 = zeros; - while jx + 5 < bounds.size && bounds.start + jx + 6 < src_width { - let bounds_start = bounds.start + jx; - let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let w4 = vld1q_dup_f32(ptr.add(4)); - let set = (w0, w1, w2, w3, w4); - let b_start = bounds_start; - store_0 = conv_horiz_5_rgb_f16!(b_start, unsafe_source_ptr_0, set, store_0); - let s_ptr1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_5_rgb_f16!(b_start, s_ptr1, set, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_5_rgb_f16!(b_start, s_ptr2, set, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_5_rgb_f16!(b_start, s_ptr3, set, store_3); - jx += 5; - } - while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let set = (w0, w1, w2, w3); - store_0 = conv_horiz_4_rgb_f16!(bounds_start, unsafe_source_ptr_0, set, store_0); - let s_ptr1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr1, set, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr2, set, store_2); - let s_ptr = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr, set, store_3); + store_0 = conv_horiz_4_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr, read_weights, store_3); jx += 4; } @@ -233,16 +170,13 @@ pub fn convolve_horizontal_rgb_neon_rows_4_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - let w0 = vdupq_lane_f32::<0>(read_weights); - let w1 = vdupq_lane_f32::<1>(read_weights); - let set = (w0, w1); - store_0 = conv_horiz_2_rgb_f16!(bounds_start, unsafe_source_ptr_0, set, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr_1, set, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr2, set, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = 
conv_horiz_2_rgb_f16!(bounds_start, s_ptr3, set, store_3); + store_0 = conv_horiz_2_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr3, read_weights, store_3); jx += 2; } @@ -250,28 +184,27 @@ pub fn convolve_horizontal_rgb_neon_rows_4_f16( let ptr = weights_ptr.add(jx + filter_offset); let bounds_start = bounds.start + jx; let weight0 = vld1q_dup_f32(ptr); - store_0 = - conv_horiz_1_rgb_f16!(bounds_start, unsafe_source_ptr_0, weight0, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_1_rgb_f16!(bounds_start, src.as_ptr(), weight0, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_1_rgb_f16!(bounds_start, s_ptr_1, weight0, store_1); - let s_ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_1_rgb_f16!(bounds_start, s_ptr_2, weight0, store_2); - let s_ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_1_rgb_f16!(bounds_start, s_ptr_3, weight0, store_3); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); write_rgb_f16!(store_0, dest_ptr); - let dest_ptr_1 = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr_1 = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); write_rgb_f16!(store_1, dest_ptr_1); - let dest_ptr_2 = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr_2 = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); write_rgb_f16!(store_2, dest_ptr_2); - let dest_ptr_3 = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr_3 = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); write_rgb_f16!(store_3, dest_ptr_3); filter_offset += filter_weights.aligned_size; @@ -279,12 +212,12 @@ pub fn convolve_horizontal_rgb_neon_rows_4_f16( } } -pub fn convolve_horizontal_rgb_neon_row_one_f16( +pub(crate) fn convolve_horizontal_rgb_neon_row_one_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], ) { unsafe { const CHANNELS: usize = 3; @@ -300,13 +233,7 @@ pub fn convolve_horizontal_rgb_neon_row_one_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let set = (w0, w1, w2, w3); - - store = conv_horiz_4_rgb_f16!(bounds_start, unsafe_source_ptr_0, set, store); + store = conv_horiz_4_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store); jx += 4; } @@ -314,10 +241,7 @@ pub fn convolve_horizontal_rgb_neon_row_one_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - let w0 = vdupq_lane_f32::<0>(read_weights); - let w1 = 
vdupq_lane_f32::<1>(read_weights); - let set = (w0, w1); - store = conv_horiz_2_rgb_f16!(bounds_start, unsafe_source_ptr_0, set, store); + store = conv_horiz_2_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store); jx += 2; } @@ -325,12 +249,12 @@ pub fn convolve_horizontal_rgb_neon_row_one_f16( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); let bounds_start = bounds.start + jx; - store = conv_horiz_1_rgb_f16!(bounds_start, unsafe_source_ptr_0, weight0, store); + store = conv_horiz_1_rgb_f16!(bounds_start, src.as_ptr(), weight0, store); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); write_rgb_f16!(store, dest_ptr); filter_offset += filter_weights.aligned_size; diff --git a/src/neon/rgb_f16_full.rs b/src/neon/rgb_f16_full.rs index da8e18a..9a5e407 100644 --- a/src/neon/rgb_f16_full.rs +++ b/src/neon/rgb_f16_full.rs @@ -44,43 +44,6 @@ macro_rules! write_rgb_f16 { }}; } -macro_rules! conv_horiz_5_rgb_f16 { - ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ - const COMPONENTS: usize = 3; - let src_ptr = $src.add($start_x * COMPONENTS); - - let rgb_pixel_s = xvldq_f16_x2(src_ptr); - let rgb_first_u = vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)); - let rgb_first = xreinterpret_f16_u16(rgb_first_u); - let rgb_second_u = vext_u16::<3>( - vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)), - vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)), - ); - let rgb_second = xreinterpret_f16_u16(rgb_second_u); - - let rgb_third_u = vext_u16::<2>( - vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)), - vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.1)), - ); - let rgb_third = xreinterpret_f16_u16(rgb_third_u); - - let rgb_fourth_u = vext_u16::<1>( - vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.1)), - vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s.1)), - ); - let rgb_fourth = xreinterpret_f16_u16(rgb_fourth_u); - - let rgb_fifth = xvget_high_f16(rgb_pixel_s.1); - - let mut acc = xvfmla_f16($store, rgb_first, $set.0); - acc = xvfmla_f16(acc, rgb_second, $set.1); - acc = xvfmla_f16(acc, rgb_third, $set.2); - acc = xvfmla_f16(acc, rgb_fourth, $set.3); - acc = xvfmla_f16(acc, rgb_fifth, $set.4); - acc - }}; -} - macro_rules! conv_horiz_4_rgb_f16 { ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ const COMPONENTS: usize = 3; @@ -107,10 +70,10 @@ macro_rules! conv_horiz_4_rgb_f16 { ); let rgb_fourth = xreinterpret_f16_u16(rgb_fourth_u); - let acc = xvfmla_f16($store, rgb_first, $set.0); - let acc = xvfmla_f16(acc, rgb_second, $set.1); - let acc = xvfmla_f16(acc, rgb_third, $set.2); - let acc = xvfmla_f16(acc, rgb_fourth, $set.3); + let acc = xvfmla_lane_f16::<0>($store, rgb_first, $set); + let acc = xvfmla_lane_f16::<1>(acc, rgb_second, $set); + let acc = xvfmla_lane_f16::<2>(acc, rgb_third, $set); + let acc = xvfmla_lane_f16::<3>(acc, rgb_fourth, $set); acc }}; } @@ -141,8 +104,8 @@ macro_rules! conv_horiz_2_rgb_f16 { rgb_second_u = vset_lane_u16::<3>(0, rgb_second_u); let rgb_second = xreinterpret_f16_u16(rgb_second_u); - let acc = xvfmla_f16($store, rgb_first, $set.0); - let acc = xvfmla_f16(acc, rgb_second, $set.1); + let acc = xvfmla_lane_f16::<0>($store, rgb_first, $set); + let acc = xvfmla_lane_f16::<1>(acc, rgb_second, $set); acc }}; } @@ -167,13 +130,13 @@ macro_rules! 
conv_horiz_1_rgb_f16 { }}; } -pub fn xconvolve_horizontal_rgb_neon_rows_4_f16( +pub(crate) fn xconvolve_horizontal_rgb_neon_rows_4_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { unsafe { @@ -181,9 +144,9 @@ pub fn xconvolve_horizontal_rgb_neon_rows_4_f16( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -194,9 +157,9 @@ unsafe fn xconvolve_horizontal_rgb_neon_rows_4_f16_impl( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { unsafe { @@ -213,43 +176,17 @@ unsafe fn xconvolve_horizontal_rgb_neon_rows_4_f16_impl( let mut store_2 = xvzeros_f16(); let mut store_3 = xvzeros_f16(); - while jx + 5 < bounds.size && bounds.start + jx + 6 < src_width { - let bounds_start = bounds.start + jx; - let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = xvcvt_f16_f32(vld1q_f32(ptr)); - let w0 = xvdup_lane_f16::<0>(read_weights); - let w1 = xvdup_lane_f16::<1>(read_weights); - let w2 = xvdup_lane_f16::<2>(read_weights); - let w3 = xvdup_lane_f16::<3>(read_weights); - let w4 = xvcvt_f16_f32(vld1q_dup_f32(ptr.add(4))); - let set = (w0, w1, w2, w3, w4); - let b_start = bounds_start; - store_0 = conv_horiz_5_rgb_f16!(b_start, unsafe_source_ptr_0, set, store_0); - let s_ptr1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_5_rgb_f16!(b_start, s_ptr1, set, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_5_rgb_f16!(b_start, s_ptr2, set, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_5_rgb_f16!(b_start, s_ptr3, set, store_3); - jx += 5; - } - while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = xvcvt_f16_f32(vld1q_f32(ptr)); - let w0 = xvdup_lane_f16::<0>(read_weights); - let w1 = xvdup_lane_f16::<1>(read_weights); - let w2 = xvdup_lane_f16::<2>(read_weights); - let w3 = xvdup_lane_f16::<3>(read_weights); - let set = (w0, w1, w2, w3); - store_0 = conv_horiz_4_rgb_f16!(bounds_start, unsafe_source_ptr_0, set, store_0); - let s_ptr1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr1, set, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr2, set, store_2); - let s_ptr = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr, set, store_3); + store_0 = conv_horiz_4_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr, read_weights, store_3); jx += 4; } @@ -258,16 +195,13 @@ unsafe fn xconvolve_horizontal_rgb_neon_rows_4_f16_impl( let ptr = weights_ptr.add(jx + filter_offset); let read_weights_h = vld1_f32(ptr); let read_weights = 
xvcvt_f16_f32(vcombine_f32(read_weights_h, read_weights_h)); - let w0 = xvdup_lane_f16::<0>(read_weights); - let w1 = xvdup_lane_f16::<1>(read_weights); - let set = (w0, w1); - store_0 = conv_horiz_2_rgb_f16!(bounds_start, unsafe_source_ptr_0, set, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr_1, set, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr2, set, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr3, set, store_3); + store_0 = conv_horiz_2_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr3, read_weights, store_3); jx += 2; } @@ -275,28 +209,27 @@ unsafe fn xconvolve_horizontal_rgb_neon_rows_4_f16_impl( let ptr = weights_ptr.add(jx + filter_offset); let bounds_start = bounds.start + jx; let weight0 = xvcvt_f16_f32(vld1q_dup_f32(ptr)); - store_0 = - conv_horiz_1_rgb_f16!(bounds_start, unsafe_source_ptr_0, weight0, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_1_rgb_f16!(bounds_start, src.as_ptr(), weight0, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_1_rgb_f16!(bounds_start, s_ptr_1, weight0, store_1); - let s_ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_1_rgb_f16!(bounds_start, s_ptr_2, weight0, store_2); - let s_ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_1_rgb_f16!(bounds_start, s_ptr_3, weight0, store_3); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); write_rgb_f16!(store_0, dest_ptr); - let dest_ptr_1 = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr_1 = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); write_rgb_f16!(store_1, dest_ptr_1); - let dest_ptr_2 = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr_2 = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); write_rgb_f16!(store_2, dest_ptr_2); - let dest_ptr_3 = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr_3 = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); write_rgb_f16!(store_3, dest_ptr_3); filter_offset += filter_weights.aligned_size; @@ -304,20 +237,20 @@ unsafe fn xconvolve_horizontal_rgb_neon_rows_4_f16_impl( } } -pub fn xconvolve_horizontal_rgb_neon_row_one_f16( +pub(crate) fn xconvolve_horizontal_rgb_neon_row_one_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[f16], + dst: &mut [f16], ) { unsafe { xconvolve_horizontal_rgb_neon_row_one_f16_impl( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } } @@ -327,8 +260,8 @@ unsafe fn xconvolve_horizontal_rgb_neon_row_one_f16_impl( dst_width: usize, src_width: usize, 
filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[f16], + dst: &mut [f16], ) { const CHANNELS: usize = 3; let weights_ptr = filter_weights.weights.as_ptr(); @@ -343,13 +276,7 @@ unsafe fn xconvolve_horizontal_rgb_neon_row_one_f16_impl( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = xvcvt_f16_f32(vld1q_f32(ptr)); - let w0 = xvdup_lane_f16::<0>(read_weights); - let w1 = xvdup_lane_f16::<1>(read_weights); - let w2 = xvdup_lane_f16::<2>(read_weights); - let w3 = xvdup_lane_f16::<3>(read_weights); - let set = (w0, w1, w2, w3); - - store = conv_horiz_4_rgb_f16!(bounds_start, unsafe_source_ptr_0, set, store); + store = conv_horiz_4_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store); jx += 4; } @@ -358,10 +285,7 @@ unsafe fn xconvolve_horizontal_rgb_neon_row_one_f16_impl( let ptr = weights_ptr.add(jx + filter_offset); let read_weights_h = vld1_f32(ptr); let read_weights = xvcvt_f16_f32(vcombine_f32(read_weights_h, read_weights_h)); - let w0 = xvdup_lane_f16::<0>(read_weights); - let w1 = xvdup_lane_f16::<1>(read_weights); - let set = (w0, w1); - store = conv_horiz_2_rgb_f16!(bounds_start, unsafe_source_ptr_0, set, store); + store = conv_horiz_2_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store); jx += 2; } @@ -369,12 +293,12 @@ unsafe fn xconvolve_horizontal_rgb_neon_row_one_f16_impl( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = xvcvt_f16_f32(vld1q_dup_f32(ptr)); let bounds_start = bounds.start + jx; - store = conv_horiz_1_rgb_f16!(bounds_start, unsafe_source_ptr_0, weight0, store); + store = conv_horiz_1_rgb_f16!(bounds_start, src.as_ptr(), weight0, store); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); write_rgb_f16!(store, dest_ptr); filter_offset += filter_weights.aligned_size; diff --git a/src/neon/rgb_f32.rs b/src/neon/rgb_f32.rs index 6934bbb..473f745 100644 --- a/src/neon/rgb_f32.rs +++ b/src/neon/rgb_f32.rs @@ -30,8 +30,8 @@ use std::arch::aarch64::*; use crate::filter_weights::FilterWeights; -use crate::neon::utils::xvld1q_f32_x4; -use crate::neon::utils::{prefer_vfmaq_f32, vsplit_rgb_5}; +use crate::neon::utils::prefer_vfmaq_laneq_f32; +use crate::neon::utils::{prefer_vfmaq_f32, prefer_vfmaq_lane_f32}; macro_rules! write_rgb_f32 { ($store: expr, $dest_ptr: expr) => {{ @@ -42,25 +42,8 @@ macro_rules! write_rgb_f32 { }}; } -macro_rules! conv_horiz_5_rgb_f32 { - ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ - const COMPONENTS: usize = 3; - let src_ptr = $src.add($start_x * COMPONENTS); - - let full_pixel = xvld1q_f32_x4(src_ptr); - let splat = vsplit_rgb_5(full_pixel); - - let mut acc = prefer_vfmaq_f32($store, splat.0, $set.0); - acc = prefer_vfmaq_f32(acc, splat.1, $set.1); - acc = prefer_vfmaq_f32(acc, splat.2, $set.2); - acc = prefer_vfmaq_f32(acc, splat.3, $set.3); - acc = prefer_vfmaq_f32(acc, splat.4, $set.4); - acc - }}; -} - macro_rules! conv_horiz_4_rgb_f32 { - ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ + ($start_x: expr, $src: expr, $weights: expr, $store: expr) => {{ const COMPONENTS: usize = 3; let src_ptr = $src.add($start_x * COMPONENTS); @@ -77,10 +60,10 @@ macro_rules! 
conv_horiz_4_rgb_f32 { .as_ptr(), ); - let acc = prefer_vfmaq_f32($store, rgb_pixel_0, $set.0); - let acc = prefer_vfmaq_f32(acc, rgb_pixel_1, $set.1); - let acc = prefer_vfmaq_f32(acc, rgb_pixel_2, $set.2); - let acc = prefer_vfmaq_f32(acc, rgb_pixel_3, $set.3); + let acc = prefer_vfmaq_laneq_f32::<0>($store, rgb_pixel_0, $weights); + let acc = prefer_vfmaq_laneq_f32::<1>(acc, rgb_pixel_1, $weights); + let acc = prefer_vfmaq_laneq_f32::<2>(acc, rgb_pixel_2, $weights); + let acc = prefer_vfmaq_laneq_f32::<3>(acc, rgb_pixel_3, $weights); acc }}; } @@ -102,8 +85,8 @@ macro_rules! conv_horiz_2_rgb_f32 { .as_ptr(), ); - let acc = prefer_vfmaq_f32($store, rgb_pixel_0, $set.0); - let acc = prefer_vfmaq_f32(acc, rgb_pixel_1, $set.1); + let acc = prefer_vfmaq_lane_f32::<0>($store, rgb_pixel_0, $set); + let acc = prefer_vfmaq_lane_f32::<1>(acc, rgb_pixel_1, $set); acc }}; } @@ -126,7 +109,7 @@ macro_rules! conv_horiz_1_rgb_f32 { }}; } -pub fn convolve_horizontal_rgb_neon_rows_4_f32( +pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, @@ -151,43 +134,18 @@ pub fn convolve_horizontal_rgb_neon_rows_4_f32( let mut store_2 = zeros; let mut store_3 = zeros; - while jx + 5 < bounds.size && bounds.start + jx + 6 < src_width { - let bounds_start = bounds.start + jx; - let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let w4 = vld1q_dup_f32(ptr.add(4)); - let set = (w0, w1, w2, w3, w4); - let b_start = bounds_start; - store_0 = conv_horiz_5_rgb_f32!(b_start, unsafe_source_ptr_0, set, store_0); - let s_ptr1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_5_rgb_f32!(b_start, s_ptr1, set, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_5_rgb_f32!(b_start, s_ptr2, set, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_5_rgb_f32!(b_start, s_ptr3, set, store_3); - jx += 5; - } - while jx + 4 < bounds.size && bounds.start + jx + 5 < src_width { let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let set = (w0, w1, w2, w3); - store_0 = conv_horiz_4_rgb_f32!(bounds_start, unsafe_source_ptr_0, set, store_0); + store_0 = + conv_horiz_4_rgb_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store_0); let s_ptr1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_4_rgb_f32!(bounds_start, s_ptr1, set, store_1); + store_1 = conv_horiz_4_rgb_f32!(bounds_start, s_ptr1, read_weights, store_1); let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_4_rgb_f32!(bounds_start, s_ptr2, set, store_2); + store_2 = conv_horiz_4_rgb_f32!(bounds_start, s_ptr2, read_weights, store_2); let s_ptr = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_4_rgb_f32!(bounds_start, s_ptr, set, store_3); + store_3 = conv_horiz_4_rgb_f32!(bounds_start, s_ptr, read_weights, store_3); jx += 4; } @@ -195,16 +153,14 @@ pub fn convolve_horizontal_rgb_neon_rows_4_f32( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = 
vld1_f32(ptr); - let w0 = vdupq_lane_f32::<0>(read_weights); - let w1 = vdupq_lane_f32::<1>(read_weights); - let set = (w0, w1); - store_0 = conv_horiz_2_rgb_f32!(bounds_start, unsafe_source_ptr_0, set, store_0); + store_0 = + conv_horiz_2_rgb_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store_0); let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_2_rgb_f32!(bounds_start, s_ptr_1, set, store_1); + store_1 = conv_horiz_2_rgb_f32!(bounds_start, s_ptr_1, read_weights, store_1); let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_2_rgb_f32!(bounds_start, s_ptr2, set, store_2); + store_2 = conv_horiz_2_rgb_f32!(bounds_start, s_ptr2, read_weights, store_2); let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_2_rgb_f32!(bounds_start, s_ptr3, set, store_3); + store_3 = conv_horiz_2_rgb_f32!(bounds_start, s_ptr3, read_weights, store_3); jx += 2; } @@ -241,7 +197,7 @@ pub fn convolve_horizontal_rgb_neon_rows_4_f32( } } -pub fn convolve_horizontal_rgb_neon_row_one_f32( +pub(crate) fn convolve_horizontal_rgb_neon_row_one_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, @@ -262,13 +218,8 @@ pub fn convolve_horizontal_rgb_neon_row_one_f32( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let set = (w0, w1, w2, w3); - - store = conv_horiz_4_rgb_f32!(bounds_start, unsafe_source_ptr_0, set, store); + store = + conv_horiz_4_rgb_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store); jx += 4; } @@ -276,10 +227,8 @@ pub fn convolve_horizontal_rgb_neon_row_one_f32( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - let w0 = vdupq_lane_f32::<0>(read_weights); - let w1 = vdupq_lane_f32::<1>(read_weights); - let set = (w0, w1); - store = conv_horiz_2_rgb_f32!(bounds_start, unsafe_source_ptr_0, set, store); + store = + conv_horiz_2_rgb_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store); jx += 2; } diff --git a/src/neon/rgb_u8.rs b/src/neon/rgb_u8.rs index 8192ec2..44c0862 100644 --- a/src/neon/rgb_u8.rs +++ b/src/neon/rgb_u8.rs @@ -109,7 +109,7 @@ unsafe fn write_accumulator_u8(store: int32x4_t, dst: &mut [u8]) { *dst.get_unchecked_mut(2) = bytes[2]; } -pub fn convolve_horizontal_rgb_neon_rows_4( +pub(crate) fn convolve_horizontal_rgb_neon_rows_4( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -200,7 +200,7 @@ pub fn convolve_horizontal_rgb_neon_rows_4( } } -pub fn convolve_horizontal_rgb_neon_row_one( +pub(crate) fn convolve_horizontal_rgb_neon_row_one( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, diff --git a/src/neon/rgba_f16.rs b/src/neon/rgba_f16.rs index ac1ea1e..3926a16 100644 --- a/src/neon/rgba_f16.rs +++ b/src/neon/rgba_f16.rs @@ -29,7 +29,9 @@ use crate::filter_weights::FilterWeights; use crate::neon::f16_utils::xvcvt_f16_f32; -use crate::neon::utils::prefer_vfmaq_f32; +use crate::neon::utils::{ + prefer_vfmaq_f32, prefer_vfmaq_lane_f32, prefer_vfmaq_laneq_f32, xvld1q_f32_x2, +}; use crate::neon::{ xvcvt_f32_f16, xvget_high_f16, xvget_low_f16, xvld_f16, xvldq_f16, xvldq_f16_x2, xvldq_f16_x4, xvst_f16, @@ -43,14 +45,15 @@ macro_rules! 
conv_horiz_rgba_8_f16 { let rgb_pixel = xvldq_f16_x4(src_ptr); - let mut acc = prefer_vfmaq_f32($store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.0)), $set1.0); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), $set1.1); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.1)), $set1.2); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.1)), $set1.3); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.2)), $set2.0); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.2)), $set2.1); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.3)), $set2.2); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.3)), $set2.3); + let mut acc = + prefer_vfmaq_laneq_f32::<0>($store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.0)), $set1); + acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), $set1); + acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.1)), $set1); + acc = prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.1)), $set1); + acc = prefer_vfmaq_laneq_f32::<0>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.2)), $set2); + acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.2)), $set2); + acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.3)), $set2); + acc = prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.3)), $set2); acc }}; } @@ -62,10 +65,14 @@ macro_rules! conv_horiz_rgba_4_f16 { let rgb_pixel = xvldq_f16_x2(src_ptr); - let acc = prefer_vfmaq_f32($store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.0)), $set1.0); - let acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), $set1.1); - let acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.1)), $set1.2); - let acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), $set1.3); + let acc = + prefer_vfmaq_laneq_f32::<0>($store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.0)), $set1); + let acc = + prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), $set1); + let acc = + prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.1)), $set1); + let acc = + prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.1)), $set1); acc }}; } @@ -77,8 +84,9 @@ macro_rules! conv_horiz_rgba_2_f32 { let rgb_pixel = xvldq_f16(src_ptr); - let mut acc = prefer_vfmaq_f32($store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel)), $set.0); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel)), $set.1); + let mut acc = + prefer_vfmaq_lane_f32::<0>($store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel)), $set); + acc = prefer_vfmaq_lane_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel)), $set); acc }}; } @@ -93,12 +101,12 @@ macro_rules! 
conv_horiz_rgba_1_f16 { }}; } -pub fn convolve_horizontal_rgba_neon_row_one_f16( +pub(crate) fn convolve_horizontal_rgba_neon_row_one_f16( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], ) { unsafe { const CHANNELS: usize = 4; @@ -114,12 +122,7 @@ pub fn convolve_horizontal_rgba_neon_row_one_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let set1 = (w0, w1, w2, w3); - store = conv_horiz_rgba_4_f16!(bounds_start, unsafe_source_ptr_0, set1, store); + store = conv_horiz_rgba_4_f16!(bounds_start, src.as_ptr(), read_weights, store); jx += 4; } @@ -127,10 +130,7 @@ pub fn convolve_horizontal_rgba_neon_row_one_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - let w0 = vdupq_lane_f32::<0>(read_weights); - let w1 = vdupq_lane_f32::<1>(read_weights); - let set = (w0, w1); - store = conv_horiz_rgba_2_f32!(bounds_start, unsafe_source_ptr_0, set, store); + store = conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 2; } @@ -138,12 +138,12 @@ pub fn convolve_horizontal_rgba_neon_row_one_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); - store = conv_horiz_rgba_1_f16!(bounds_start, unsafe_source_ptr_0, weight0, store); + store = conv_horiz_rgba_1_f16!(bounds_start, src.as_ptr(), weight0, store); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); xvst_f16(dest_ptr, xvcvt_f16_f32(store)); filter_offset += filter_weights.aligned_size; @@ -151,13 +151,13 @@ pub fn convolve_horizontal_rgba_neon_row_one_f16( } } -pub fn convolve_horizontal_rgba_neon_rows_4_f16( +pub(crate) fn convolve_horizontal_rgba_neon_rows_4_f16( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, + src: &[half::f16], src_stride: usize, - unsafe_destination_ptr_0: *mut half::f16, + dst: &mut [half::f16], dst_stride: usize, ) { unsafe { @@ -176,62 +176,67 @@ pub fn convolve_horizontal_rgba_neon_rows_4_f16( while jx + 8 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = vld1q_f32_x2(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights.0); - let w1 = vdupq_laneq_f32::<1>(read_weights.0); - let w2 = vdupq_laneq_f32::<2>(read_weights.0); - let w3 = vdupq_laneq_f32::<3>(read_weights.0); - let w4 = vdupq_laneq_f32::<0>(read_weights.1); - let w5 = vdupq_laneq_f32::<1>(read_weights.1); - let w6 = vdupq_laneq_f32::<2>(read_weights.1); - let w7 = vdupq_laneq_f32::<3>(read_weights.1); - let set1 = (w0, w1, w2, w3); - let set2 = (w4, w5, w6, w7); + let read_weights = xvld1q_f32_x2(ptr); let bounds_start = bounds.start + jx; - store_0 = - conv_horiz_rgba_8_f16!(bounds_start, unsafe_source_ptr_0, set1, set2, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_rgba_8_f16!(bounds_start, s_ptr_1, set1, set2, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_rgba_8_f16!(bounds_start, s_ptr2, set1, set2, store_2); - let s_ptr3 = 
unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_rgba_8_f16!(bounds_start, s_ptr3, set1, set2, store_3); + store_0 = conv_horiz_rgba_8_f16!( + bounds_start, + src.as_ptr(), + read_weights.0, + read_weights.1, + store_0 + ); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_rgba_8_f16!( + bounds_start, + s_ptr_1, + read_weights.0, + read_weights.1, + store_1 + ); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_rgba_8_f16!( + bounds_start, + s_ptr2, + read_weights.0, + read_weights.1, + store_2 + ); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_rgba_8_f16!( + bounds_start, + s_ptr3, + read_weights.0, + read_weights.1, + store_3 + ); jx += 8; } while jx + 4 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let set1 = (w0, w1, w2, w3); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_4_f16!(bounds_start, unsafe_source_ptr_0, set1, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr_1, set1, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr2, set1, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr3, set1, store_3); + store_0 = conv_horiz_rgba_4_f16!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr3, read_weights, store_3); jx += 4; } while jx + 2 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - let w0 = vdupq_lane_f32::<0>(read_weights); - let w1 = vdupq_lane_f32::<1>(read_weights); - let set = (w0, w1); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_2_f32!(bounds_start, unsafe_source_ptr_0, set, store_0); - let ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_rgba_2_f32!(bounds_start, ptr_1, set, store_1); - let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_rgba_2_f32!(bounds_start, ptr_2, set, store_2); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_rgba_2_f32!(bounds_start, ptr_3, set, store_3); + store_0 = conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store_0); + let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_rgba_2_f32!(bounds_start, ptr_1, read_weights, store_1); + let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_rgba_2_f32!(bounds_start, ptr_2, read_weights, store_2); + let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_rgba_2_f32!(bounds_start, ptr_3, read_weights, store_3); jx += 2; } @@ -239,28 +244,27 @@ pub fn convolve_horizontal_rgba_neon_rows_4_f16( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); let bounds_start = bounds.start + 
jx; - store_0 = - conv_horiz_rgba_1_f16!(bounds_start, unsafe_source_ptr_0, weight0, store_0); - let ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_rgba_1_f16!(bounds_start, src.as_ptr(), weight0, store_0); + let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_rgba_1_f16!(bounds_start, ptr_1, weight0, store_1); - let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_rgba_1_f16!(bounds_start, ptr_2, weight0, store_2); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_rgba_1_f16!(bounds_start, ptr_3, weight0, store_3); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); xvst_f16(dest_ptr, xvcvt_f16_f32(store_0)); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); xvst_f16(dest_ptr, xvcvt_f16_f32(store_1)); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); xvst_f16(dest_ptr, xvcvt_f16_f32(store_2)); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); xvst_f16(dest_ptr, xvcvt_f16_f32(store_3)); filter_offset += filter_weights.aligned_size; diff --git a/src/neon/rgba_f16_full.rs b/src/neon/rgba_f16_full.rs index 758293a..9b582ff 100644 --- a/src/neon/rgba_f16_full.rs +++ b/src/neon/rgba_f16_full.rs @@ -30,55 +30,57 @@ use std::arch::aarch64::*; use crate::filter_weights::FilterWeights; -use crate::neon::f16_utils::{xvcombine_f16, xvcvt_f16_f32, xvfmla_f16, xvzeros_f16}; +use crate::neon::f16_utils::{ + xvcombine_f16, xvcvt_f16_f32, xvfmla_f16, xvfmla_lane_f16, xvfmla_laneq_f16, xvzeros_f16, +}; +use crate::neon::utils::xvld1q_f32_x2; use crate::neon::{ - xvdup_lane_f16, xvdup_laneq_f16, xvget_high_f16, xvget_low_f16, xvld_f16, xvldq_f16, - xvldq_f16_x2, xvldq_f16_x4, xvst_f16, + xvget_high_f16, xvget_low_f16, xvld_f16, xvldq_f16, xvldq_f16_x2, xvldq_f16_x4, xvst_f16, }; macro_rules! 
conv_horiz_rgba_8_f16 { - ($start_x: expr, $src: expr, $set1: expr, $set2: expr, $store: expr) => {{ + ($start_x: expr, $src: expr, $weights: expr, $store: expr) => {{ const COMPONENTS: usize = 4; let src_ptr = $src.add($start_x * COMPONENTS); let rgb_pixel = xvldq_f16_x4(src_ptr); - let mut acc = xvfmla_f16($store, xvget_low_f16(rgb_pixel.0), $set1.0); - acc = xvfmla_f16(acc, xvget_high_f16(rgb_pixel.0), $set1.1); - acc = xvfmla_f16(acc, xvget_low_f16(rgb_pixel.1), $set1.2); - acc = xvfmla_f16(acc, xvget_high_f16(rgb_pixel.1), $set1.3); - acc = xvfmla_f16(acc, xvget_low_f16(rgb_pixel.2), $set2.0); - acc = xvfmla_f16(acc, xvget_high_f16(rgb_pixel.2), $set2.1); - acc = xvfmla_f16(acc, xvget_low_f16(rgb_pixel.3), $set2.2); - acc = xvfmla_f16(acc, xvget_high_f16(rgb_pixel.3), $set2.3); + let mut acc = xvfmla_laneq_f16::<0>($store, xvget_low_f16(rgb_pixel.0), $weights); + acc = xvfmla_laneq_f16::<1>(acc, xvget_high_f16(rgb_pixel.0), $weights); + acc = xvfmla_laneq_f16::<2>(acc, xvget_low_f16(rgb_pixel.1), $weights); + acc = xvfmla_laneq_f16::<3>(acc, xvget_high_f16(rgb_pixel.1), $weights); + acc = xvfmla_laneq_f16::<4>(acc, xvget_low_f16(rgb_pixel.2), $weights); + acc = xvfmla_laneq_f16::<5>(acc, xvget_high_f16(rgb_pixel.2), $weights); + acc = xvfmla_laneq_f16::<6>(acc, xvget_low_f16(rgb_pixel.3), $weights); + acc = xvfmla_laneq_f16::<7>(acc, xvget_high_f16(rgb_pixel.3), $weights); acc }}; } macro_rules! conv_horiz_rgba_4_f16 { - ($start_x: expr, $src: expr, $set1: expr, $store: expr) => {{ + ($start_x: expr, $src: expr, $weights: expr, $store: expr) => {{ const COMPONENTS: usize = 4; let src_ptr = $src.add($start_x * COMPONENTS); let rgb_pixel = xvldq_f16_x2(src_ptr); - let acc = xvfmla_f16($store, xvget_low_f16(rgb_pixel.0), $set1.0); - let acc = xvfmla_f16(acc, xvget_high_f16(rgb_pixel.0), $set1.1); - let acc = xvfmla_f16(acc, xvget_low_f16(rgb_pixel.1), $set1.2); - let acc = xvfmla_f16(acc, xvget_high_f16(rgb_pixel.0), $set1.3); + let acc = xvfmla_lane_f16::<0>($store, xvget_low_f16(rgb_pixel.0), $weights); + let acc = xvfmla_lane_f16::<1>(acc, xvget_high_f16(rgb_pixel.0), $weights); + let acc = xvfmla_lane_f16::<2>(acc, xvget_low_f16(rgb_pixel.1), $weights); + let acc = xvfmla_lane_f16::<3>(acc, xvget_high_f16(rgb_pixel.1), $weights); acc }}; } macro_rules! conv_horiz_rgba_2_f32 { - ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ + ($start_x: expr, $src: expr, $weights: expr, $store: expr) => {{ const COMPONENTS: usize = 4; let src_ptr = $src.add($start_x * COMPONENTS); let rgb_pixel = xvldq_f16(src_ptr); - let mut acc = xvfmla_f16($store, xvget_low_f16(rgb_pixel), $set.0); - acc = xvfmla_f16(acc, xvget_high_f16(rgb_pixel), $set.1); + let mut acc = xvfmla_lane_f16::<0>($store, xvget_low_f16(rgb_pixel), $weights); + acc = xvfmla_lane_f16::<1>(acc, xvget_high_f16(rgb_pixel), $weights); acc }}; } @@ -93,20 +95,20 @@ macro_rules! 
conv_horiz_rgba_1_f16 { }}; } -pub fn xconvolve_horizontal_rgba_neon_row_one_f16( +pub(crate) fn xconvolve_horizontal_rgba_neon_row_one_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], ) { unsafe { xconvolve_horizontal_rgba_neon_row_one_f16_impl( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } } @@ -116,8 +118,8 @@ unsafe fn xconvolve_horizontal_rgba_neon_row_one_f16_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], ) { unsafe { const CHANNELS: usize = 4; @@ -133,12 +135,7 @@ unsafe fn xconvolve_horizontal_rgba_neon_row_one_f16_impl( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = xvcvt_f16_f32(vld1q_f32(ptr)); - let w0 = xvdup_lane_f16::<0>(read_weights); - let w1 = xvdup_lane_f16::<1>(read_weights); - let w2 = xvdup_lane_f16::<2>(read_weights); - let w3 = xvdup_lane_f16::<3>(read_weights); - let set1 = (w0, w1, w2, w3); - store = conv_horiz_rgba_4_f16!(bounds_start, unsafe_source_ptr_0, set1, store); + store = conv_horiz_rgba_4_f16!(bounds_start, src.as_ptr(), read_weights, store); jx += 4; } @@ -147,10 +144,7 @@ unsafe fn xconvolve_horizontal_rgba_neon_row_one_f16_impl( let ptr = weights_ptr.add(jx + filter_offset); let read_weights_h = vld1_f32(ptr); let read_weights = xvcvt_f16_f32(vcombine_f32(read_weights_h, read_weights_h)); - let w0 = xvdup_lane_f16::<0>(read_weights); - let w1 = xvdup_lane_f16::<1>(read_weights); - let set = (w0, w1); - store = conv_horiz_rgba_2_f32!(bounds_start, unsafe_source_ptr_0, set, store); + store = conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 2; } @@ -158,12 +152,12 @@ unsafe fn xconvolve_horizontal_rgba_neon_row_one_f16_impl( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let weight0 = xvcvt_f16_f32(vld1q_dup_f32(ptr)); - store = conv_horiz_rgba_1_f16!(bounds_start, unsafe_source_ptr_0, weight0, store); + store = conv_horiz_rgba_1_f16!(bounds_start, src.as_ptr(), weight0, store); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); xvst_f16(dest_ptr, store); filter_offset += filter_weights.aligned_size; @@ -171,13 +165,13 @@ unsafe fn xconvolve_horizontal_rgba_neon_row_one_f16_impl( } } -pub fn xconvolve_horizontal_rgba_neon_rows_4_f16( +pub(crate) fn xconvolve_horizontal_rgba_neon_rows_4_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, + unsafe_source_ptr_0: &[half::f16], src_stride: usize, - unsafe_destination_ptr_0: *mut half::f16, + unsafe_destination_ptr_0: &mut [half::f16], dst_stride: usize, ) { unsafe { @@ -198,9 +192,9 @@ unsafe fn xconvolve_horizontal_rgba_neon_rows_4_f16_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, + src: &[half::f16], src_stride: usize, - unsafe_destination_ptr_0: *mut half::f16, + dst: &mut [half::f16], dst_stride: usize, ) { const CHANNELS: usize = 4; @@ -218,49 +212,33 @@ unsafe fn xconvolve_horizontal_rgba_neon_rows_4_f16_impl( while jx + 8 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); - let read_weights_h = 
vld1q_f32_x2(ptr); + let read_weights_h = xvld1q_f32_x2(ptr); let read_weights = xvcombine_f16( xvcvt_f16_f32(read_weights_h.0), xvcvt_f16_f32(read_weights_h.1), ); - let w0 = xvdup_laneq_f16::<0>(read_weights); - let w1 = xvdup_laneq_f16::<1>(read_weights); - let w2 = xvdup_laneq_f16::<2>(read_weights); - let w3 = xvdup_laneq_f16::<3>(read_weights); - let w4 = xvdup_laneq_f16::<4>(read_weights); - let w5 = xvdup_laneq_f16::<5>(read_weights); - let w6 = xvdup_laneq_f16::<6>(read_weights); - let w7 = xvdup_laneq_f16::<7>(read_weights); - let set1 = (w0, w1, w2, w3); - let set2 = (w4, w5, w6, w7); let bounds_start = bounds.start + jx; - store_0 = - conv_horiz_rgba_8_f16!(bounds_start, unsafe_source_ptr_0, set1, set2, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_rgba_8_f16!(bounds_start, s_ptr_1, set1, set2, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_rgba_8_f16!(bounds_start, s_ptr2, set1, set2, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_rgba_8_f16!(bounds_start, s_ptr3, set1, set2, store_3); + store_0 = conv_horiz_rgba_8_f16!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_rgba_8_f16!(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_rgba_8_f16!(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_rgba_8_f16!(bounds_start, s_ptr3, read_weights, store_3); jx += 8; } while jx + 4 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); let read_weights = xvcvt_f16_f32(vld1q_f32(ptr)); - let w0 = xvdup_lane_f16::<0>(read_weights); - let w1 = xvdup_lane_f16::<1>(read_weights); - let w2 = xvdup_lane_f16::<2>(read_weights); - let w3 = xvdup_lane_f16::<3>(read_weights); - let set1 = (w0, w1, w2, w3); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_4_f16!(bounds_start, unsafe_source_ptr_0, set1, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr_1, set1, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr2, set1, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr3, set1, store_3); + store_0 = conv_horiz_rgba_4_f16!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr3, read_weights, store_3); jx += 4; } @@ -268,17 +246,14 @@ unsafe fn xconvolve_horizontal_rgba_neon_rows_4_f16_impl( let ptr = weights_ptr.add(jx + filter_offset); let read_weights_h = vld1_f32(ptr); let read_weights = xvcvt_f16_f32(vcombine_f32(read_weights_h, read_weights_h)); - let w0 = xvdup_lane_f16::<0>(read_weights); - let w1 = xvdup_lane_f16::<1>(read_weights); - let set = (w0, w1); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_2_f32!(bounds_start, unsafe_source_ptr_0, set, store_0); - let ptr_1 = 
unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_rgba_2_f32!(bounds_start, ptr_1, set, store_1); - let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_rgba_2_f32!(bounds_start, ptr_2, set, store_2); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_rgba_2_f32!(bounds_start, ptr_3, set, store_3); + store_0 = conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store_0); + let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_rgba_2_f32!(bounds_start, ptr_1, read_weights, store_1); + let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_rgba_2_f32!(bounds_start, ptr_2, read_weights, store_2); + let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_rgba_2_f32!(bounds_start, ptr_3, read_weights, store_3); jx += 2; } @@ -286,27 +261,27 @@ unsafe fn xconvolve_horizontal_rgba_neon_rows_4_f16_impl( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = xvcvt_f16_f32(vld1q_dup_f32(ptr)); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_1_f16!(bounds_start, unsafe_source_ptr_0, weight0, store_0); - let ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_rgba_1_f16!(bounds_start, src.as_ptr(), weight0, store_0); + let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_rgba_1_f16!(bounds_start, ptr_1, weight0, store_1); - let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_rgba_1_f16!(bounds_start, ptr_2, weight0, store_2); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_rgba_1_f16!(bounds_start, ptr_3, weight0, store_3); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); xvst_f16(dest_ptr, store_0); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); xvst_f16(dest_ptr, store_1); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); xvst_f16(dest_ptr, store_2); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); xvst_f16(dest_ptr, store_3); filter_offset += filter_weights.aligned_size; diff --git a/src/neon/rgba_f32.rs b/src/neon/rgba_f32.rs index 0f60df9..3db6b0e 100644 --- a/src/neon/rgba_f32.rs +++ b/src/neon/rgba_f32.rs @@ -28,41 +28,41 @@ */ use crate::filter_weights::FilterWeights; -use crate::neon::utils::prefer_vfmaq_f32; -use crate::neon::utils::xvld1q_f32_x4; +use crate::neon::utils::{prefer_vfmaq_f32, prefer_vfmaq_laneq_f32, xvld1q_f32_x2}; +use crate::neon::utils::{prefer_vfmaq_lane_f32, xvld1q_f32_x4}; use std::arch::aarch64::*; macro_rules! 
conv_horiz_rgba_8_f32 { - ($start_x: expr, $src: expr, $set1: expr, $set2: expr, $store: expr) => {{ + ($start_x: expr, $src: expr, $weights1: expr, $weights2: expr, $store: expr) => {{ const COMPONENTS: usize = 4; let src_ptr = $src.add($start_x * COMPONENTS); let rgb_pixel0 = xvld1q_f32_x4(src_ptr); let rgb_pixel1 = xvld1q_f32_x4(src_ptr.add(16)); - let mut acc = prefer_vfmaq_f32($store, rgb_pixel0.0, $set1.0); - acc = prefer_vfmaq_f32(acc, rgb_pixel0.1, $set1.1); - acc = prefer_vfmaq_f32(acc, rgb_pixel0.2, $set1.2); - acc = prefer_vfmaq_f32(acc, rgb_pixel0.3, $set1.3); - acc = prefer_vfmaq_f32(acc, rgb_pixel1.0, $set2.0); - acc = prefer_vfmaq_f32(acc, rgb_pixel1.1, $set2.1); - acc = prefer_vfmaq_f32(acc, rgb_pixel1.2, $set2.2); - acc = prefer_vfmaq_f32(acc, rgb_pixel1.3, $set2.3); + let mut acc = prefer_vfmaq_laneq_f32::<0>($store, rgb_pixel0.0, $weights1); + acc = prefer_vfmaq_laneq_f32::<1>(acc, rgb_pixel0.1, $weights1); + acc = prefer_vfmaq_laneq_f32::<2>(acc, rgb_pixel0.2, $weights1); + acc = prefer_vfmaq_laneq_f32::<3>(acc, rgb_pixel0.3, $weights1); + acc = prefer_vfmaq_laneq_f32::<0>(acc, rgb_pixel1.0, $weights2); + acc = prefer_vfmaq_laneq_f32::<1>(acc, rgb_pixel1.1, $weights2); + acc = prefer_vfmaq_laneq_f32::<2>(acc, rgb_pixel1.2, $weights2); + acc = prefer_vfmaq_laneq_f32::<3>(acc, rgb_pixel1.3, $weights2); acc }}; } macro_rules! conv_horiz_rgba_4_f32 { - ($start_x: expr, $src: expr, $set1: expr, $store: expr) => {{ + ($start_x: expr, $src: expr, $weights: expr, $store: expr) => {{ const COMPONENTS: usize = 4; let src_ptr = $src.add($start_x * COMPONENTS); let rgb_pixel = xvld1q_f32_x4(src_ptr); - let acc = prefer_vfmaq_f32($store, rgb_pixel.0, $set1.0); - let acc = prefer_vfmaq_f32(acc, rgb_pixel.1, $set1.1); - let acc = prefer_vfmaq_f32(acc, rgb_pixel.2, $set1.2); - let acc = prefer_vfmaq_f32(acc, rgb_pixel.3, $set1.3); + let acc = prefer_vfmaq_laneq_f32::<0>($store, rgb_pixel.0, $weights); + let acc = prefer_vfmaq_laneq_f32::<1>(acc, rgb_pixel.1, $weights); + let acc = prefer_vfmaq_laneq_f32::<2>(acc, rgb_pixel.2, $weights); + let acc = prefer_vfmaq_laneq_f32::<3>(acc, rgb_pixel.3, $weights); acc }}; } @@ -72,10 +72,10 @@ macro_rules! conv_horiz_rgba_2_f32 { const COMPONENTS: usize = 4; let src_ptr = $src.add($start_x * COMPONENTS); - let rgb_pixel = vld1q_f32_x2(src_ptr); + let rgb_pixel = xvld1q_f32_x2(src_ptr); - let mut acc = prefer_vfmaq_f32($store, rgb_pixel.0, $set.0); - acc = prefer_vfmaq_f32(acc, rgb_pixel.1, $set.1); + let mut acc = prefer_vfmaq_lane_f32::<0>($store, rgb_pixel.0, $set); + acc = prefer_vfmaq_lane_f32::<1>(acc, rgb_pixel.1, $set); acc }}; } @@ -90,7 +90,7 @@ macro_rules! 
conv_horiz_rgba_1_f32 { }}; } -pub fn convolve_horizontal_rgba_neon_row_one( +pub(crate) fn convolve_horizontal_rgba_neon_row_one( dst_width: usize, _: usize, filter_weights: &FilterWeights, @@ -111,12 +111,8 @@ pub fn convolve_horizontal_rgba_neon_row_one( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let set1 = (w0, w1, w2, w3); - store = conv_horiz_rgba_4_f32!(bounds_start, unsafe_source_ptr_0, set1, store); + store = + conv_horiz_rgba_4_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store); jx += 4; } @@ -124,10 +120,8 @@ pub fn convolve_horizontal_rgba_neon_row_one( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - let w0 = vdupq_lane_f32::<0>(read_weights); - let w1 = vdupq_lane_f32::<1>(read_weights); - let set = (w0, w1); - store = conv_horiz_rgba_2_f32!(bounds_start, unsafe_source_ptr_0, set, store); + store = + conv_horiz_rgba_2_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store); jx += 2; } @@ -148,7 +142,7 @@ pub fn convolve_horizontal_rgba_neon_row_one( } } -pub fn convolve_horizontal_rgba_neon_rows_4( +pub(crate) fn convolve_horizontal_rgba_neon_rows_4( dst_width: usize, _: usize, filter_weights: &FilterWeights, @@ -173,62 +167,77 @@ pub fn convolve_horizontal_rgba_neon_rows_4( while jx + 8 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = vld1q_f32_x2(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights.0); - let w1 = vdupq_laneq_f32::<1>(read_weights.0); - let w2 = vdupq_laneq_f32::<2>(read_weights.0); - let w3 = vdupq_laneq_f32::<3>(read_weights.0); - let w4 = vdupq_laneq_f32::<0>(read_weights.1); - let w5 = vdupq_laneq_f32::<1>(read_weights.1); - let w6 = vdupq_laneq_f32::<2>(read_weights.1); - let w7 = vdupq_laneq_f32::<3>(read_weights.1); - let set1 = (w0, w1, w2, w3); - let set2 = (w4, w5, w6, w7); + let read_weights = xvld1q_f32_x2(ptr); let bounds_start = bounds.start + jx; - store_0 = - conv_horiz_rgba_8_f32!(bounds_start, unsafe_source_ptr_0, set1, set2, store_0); + store_0 = conv_horiz_rgba_8_f32!( + bounds_start, + unsafe_source_ptr_0, + read_weights.0, + read_weights.1, + store_0 + ); let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_rgba_8_f32!(bounds_start, s_ptr_1, set1, set2, store_1); + store_1 = conv_horiz_rgba_8_f32!( + bounds_start, + s_ptr_1, + read_weights.0, + read_weights.1, + store_1 + ); let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_rgba_8_f32!(bounds_start, s_ptr2, set1, set2, store_2); + store_2 = conv_horiz_rgba_8_f32!( + bounds_start, + s_ptr2, + read_weights.0, + read_weights.1, + store_2 + ); let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_rgba_8_f32!(bounds_start, s_ptr3, set1, set2, store_3); + store_3 = conv_horiz_rgba_8_f32!( + bounds_start, + s_ptr3, + read_weights.0, + read_weights.1, + store_3 + ); jx += 8; } while jx + 4 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let set1 = (w0, w1, w2, w3); let bounds_start = bounds.start + jx; - 
store_0 = conv_horiz_rgba_4_f32!(bounds_start, unsafe_source_ptr_0, set1, store_0); + store_0 = conv_horiz_rgba_4_f32!( + bounds_start, + unsafe_source_ptr_0, + read_weights, + store_0 + ); let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_rgba_4_f32!(bounds_start, s_ptr_1, set1, store_1); + store_1 = conv_horiz_rgba_4_f32!(bounds_start, s_ptr_1, read_weights, store_1); let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_rgba_4_f32!(bounds_start, s_ptr2, set1, store_2); + store_2 = conv_horiz_rgba_4_f32!(bounds_start, s_ptr2, read_weights, store_2); let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_rgba_4_f32!(bounds_start, s_ptr3, set1, store_3); + store_3 = conv_horiz_rgba_4_f32!(bounds_start, s_ptr3, read_weights, store_3); jx += 4; } while jx + 2 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - let w0 = vdupq_lane_f32::<0>(read_weights); - let w1 = vdupq_lane_f32::<1>(read_weights); - let set = (w0, w1); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_2_f32!(bounds_start, unsafe_source_ptr_0, set, store_0); + store_0 = conv_horiz_rgba_2_f32!( + bounds_start, + unsafe_source_ptr_0, + read_weights, + store_0 + ); let ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_rgba_2_f32!(bounds_start, ptr_1, set, store_1); + store_1 = conv_horiz_rgba_2_f32!(bounds_start, ptr_1, read_weights, store_1); let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_rgba_2_f32!(bounds_start, ptr_2, set, store_2); + store_2 = conv_horiz_rgba_2_f32!(bounds_start, ptr_2, read_weights, store_2); let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_rgba_2_f32!(bounds_start, ptr_3, set, store_3); + store_3 = conv_horiz_rgba_2_f32!(bounds_start, ptr_3, read_weights, store_3); jx += 2; } diff --git a/src/neon/rgba_u16_lb.rs b/src/neon/rgba_u16_lb.rs index 36f2d91..8570fe1 100644 --- a/src/neon/rgba_u16_lb.rs +++ b/src/neon/rgba_u16_lb.rs @@ -113,7 +113,7 @@ unsafe fn conv_horiz_rgba_8_u16( acc } -pub fn convolve_horizontal_rgba_neon_rows_4_lb_u16( +pub(crate) fn convolve_horizontal_rgba_neon_rows_4_lb_u16( src: &[u16], src_stride: usize, dst: &mut [u16], @@ -218,7 +218,7 @@ pub fn convolve_horizontal_rgba_neon_rows_4_lb_u16( } } -pub fn convolve_horizontal_rgba_neon_u16_lb_row( +pub(crate) fn convolve_horizontal_rgba_neon_u16_lb_row( src: &[u16], dst: &mut [u16], filter_weights: &FilterWeights, diff --git a/src/neon/rgba_u8.rs b/src/neon/rgba_u8.rs index 62373f0..d4fa251 100644 --- a/src/neon/rgba_u8.rs +++ b/src/neon/rgba_u8.rs @@ -195,7 +195,7 @@ unsafe fn conv_horiz_rgba_1_u8_i16( vqrdmlah_s16(store, lo, w0) } -pub fn convolve_horizontal_rgba_neon_rows_4_u8_i16( +pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8_i16( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -334,7 +334,7 @@ unsafe fn convolve_horizontal_rgba_neon_rows_4_u8_i16_impl( } } -pub fn convolve_horizontal_rgba_neon_rows_4_u8( +pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -452,7 +452,7 @@ pub fn convolve_horizontal_rgba_neon_rows_4_u8( } } -pub fn convolve_horizontal_rgba_neon_row( +pub(crate) fn convolve_horizontal_rgba_neon_row( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, @@ -516,7 +516,7 @@ pub fn convolve_horizontal_rgba_neon_row( } } -pub fn convolve_horizontal_rgba_neon_row_i16( +pub(crate) fn convolve_horizontal_rgba_neon_row_i16( src: &[u8], dst: &mut [u8], 
filter_weights: &FilterWeights<i16>, diff --git a/src/neon/utils.rs b/src/neon/utils.rs index e21a1d8..a17f401 100644 --- a/src/neon/utils.rs +++ b/src/neon/utils.rs @@ -69,6 +69,11 @@ pub(crate) unsafe fn xvld1q_f32_x4(a: *const f32) -> float32x4x4_t { ) } +#[inline(always)] +pub(crate) unsafe fn xvld1q_f32_x2(a: *const f32) -> float32x4x2_t { + float32x4x2_t(vld1q_f32(a), vld1q_f32(a.add(4))) +} + #[inline(always)] pub(crate) unsafe fn xvst1q_u8_x2(ptr: *mut u8, b: uint8x16x2_t) { vst1q_u8(ptr, b.0); @@ -99,6 +104,24 @@ pub(crate) unsafe fn prefer_vfmaq_f32( } } +#[inline(always)] +pub(crate) unsafe fn prefer_vfmaq_laneq_f32<const LANE: i32>( + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, +) -> float32x4_t { + vfmaq_laneq_f32::<LANE>(a, b, c) +} + +#[inline(always)] +pub(crate) unsafe fn prefer_vfmaq_lane_f32<const LANE: i32>( + a: float32x4_t, + b: float32x4_t, + c: float32x2_t, +) -> float32x4_t { + vfmaq_lane_f32::<LANE>(a, b, c) +} + #[inline(always)] pub(crate) unsafe fn load_3b_as_u16x4(src_ptr: *const u8) -> uint16x4_t { let v_new_value1 = u16::from_le_bytes([src_ptr.read_unaligned(), 0]); @@ -117,20 +140,3 @@ pub(crate) unsafe fn load_4b_as_u16x4(src_ptr: *const u8) -> uint16x4_t { let arr = [v_new_value1, v_new_value2, v_new_value3, v_new_value4]; vld1_u16(arr.as_ptr()) } - -#[inline(always)] -pub(crate) unsafe fn vsplit_rgb_5(px: float32x4x4_t) -> Float32x5T { - let first_pixel = px.0; - let second_pixel = vextq_f32::<3>(px.0, px.1); - let third_pixel = vextq_f32::<2>(px.1, px.2); - let four_pixel = vextq_f32::<1>(px.2, px.3); - Float32x5T(first_pixel, second_pixel, third_pixel, four_pixel, px.3) -} - -pub(crate) struct Float32x5T( - pub(crate) float32x4_t, - pub(crate) float32x4_t, - pub(crate) float32x4_t, - pub(crate) float32x4_t, - pub(crate) float32x4_t, -); diff --git a/src/neon/vertical_f16.rs b/src/neon/vertical_f16.rs index 1a7a6ef..49e8315 100644 --- a/src/neon/vertical_f16.rs +++ b/src/neon/vertical_f16.rs @@ -46,7 +46,7 @@ macro_rules! conv_vertical_part_neon_16_f16 { for j in 0..$bounds.size { let py = $start_y + j; let v_weight = vld1q_dup_f32($filter.get_unchecked(j..).as_ptr()); - let src_ptr = $src.add($src_stride * py); + let src_ptr = $src.get_unchecked($src_stride * py..).as_ptr(); let s_ptr = src_ptr.add(px); let item_row = xvldq_f16_x2(s_ptr); @@ -61,7 +61,7 @@ macro_rules! conv_vertical_part_neon_16_f16 { prefer_vfmaq_f32(store_3, xvcvt_f32_f16(xvget_high_f16(item_row.1)), v_weight); } - let dst_ptr = $dst.add(px); + let dst_ptr = $dst.get_unchecked_mut(px..).as_mut_ptr(); let f_set = x_float16x8x2_t( xcombine_f16(xvcvt_f16_f32(store_0), xvcvt_f16_f32(store_1)), xcombine_f16(xvcvt_f16_f32(store_2), xvcvt_f16_f32(store_3)), @@ -88,7 +88,7 @@ macro_rules! conv_vertical_part_neon_32_f16 { for j in 0..$bounds.size { let py = $start_y + j; let v_weight = vld1q_dup_f32($filter.get_unchecked(j..).as_ptr()); - let src_ptr = $src.add($src_stride * py); + let src_ptr = $src.get_unchecked($src_stride * py..).as_ptr(); let s_ptr = src_ptr.add(px); let item_row = xvldq_f16_x4(s_ptr); @@ -112,7 +112,7 @@ macro_rules! conv_vertical_part_neon_32_f16 { prefer_vfmaq_f32(store_7, xvcvt_f32_f16(xvget_high_f16(item_row.3)), v_weight); } - let dst_ptr = $dst.add(px); + let dst_ptr = $dst.get_unchecked_mut(px..).as_mut_ptr(); let f_set = x_float16x8x4_t( xcombine_f16(xvcvt_f16_f32(store_0), xvcvt_f16_f32(store_1)), xcombine_f16(xvcvt_f16_f32(store_2), xvcvt_f16_f32(store_3)), @@ -147,7 +147,7 @@ macro_rules! 
conv_vertical_part_neon_48_f16 { for j in 0..$bounds.size { let py = $start_y + j; let v_weight = vld1q_dup_f32($filter.get_unchecked(j..).as_ptr()); - let src_ptr = $src.add($src_stride * py); + let src_ptr = $src.get_unchecked($src_stride * py..).as_ptr(); let s_ptr = src_ptr.add(px); let item_row_0 = xvldq_f16_x4(s_ptr); @@ -217,7 +217,7 @@ macro_rules! conv_vertical_part_neon_48_f16 { ); } - let dst_ptr = $dst.add(px); + let dst_ptr = $dst.get_unchecked_mut(px..).as_mut_ptr(); let f_set = x_float16x8x4_t( xcombine_f16(xvcvt_f16_f32(store_0), xvcvt_f16_f32(store_1)), xcombine_f16(xvcvt_f16_f32(store_2), xvcvt_f16_f32(store_3)), @@ -236,11 +236,11 @@ macro_rules! conv_vertical_part_neon_48_f16 { }}; } -pub fn convolve_vertical_rgb_neon_row_f16<const CHANNELS: usize>( +pub(crate) fn convolve_vertical_rgb_neon_row_f16<const CHANNELS: usize>( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -248,43 +248,19 @@ pub fn convolve_vertical_rgb_neon_row_f16<const CHANNELS: usize>( let dst_width = width * CHANNELS; while cx + 48 < dst_width { - conv_vertical_part_neon_48_f16!( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds - ); + conv_vertical_part_neon_48_f16!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 48; } while cx + 32 < dst_width { - conv_vertical_part_neon_32_f16!( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds - ); + conv_vertical_part_neon_32_f16!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 32; } while cx + 16 < dst_width { - conv_vertical_part_neon_16_f16!( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds - ); + conv_vertical_part_neon_16_f16!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 16; } @@ -294,9 +270,9 @@ pub fn convolve_vertical_rgb_neon_row_f16<const CHANNELS: usize>( convolve_vertical_part_neon_8_f16::<false>( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, 8, @@ -313,9 +289,9 @@ pub fn convolve_vertical_rgb_neon_row_f16<const CHANNELS: usize>( convolve_vertical_part_neon_8_f16::<true>( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, left, diff --git a/src/neon/vertical_f16_full.rs b/src/neon/vertical_f16_full.rs index d6ae278..2630b25 100644 --- a/src/neon/vertical_f16_full.rs +++ b/src/neon/vertical_f16_full.rs @@ -35,9 +35,9 @@ use crate::neon::*; pub(crate) unsafe fn xconvolve_vertical_part_neon_8_f16<const USE_BLENDING: bool>( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, blend_length: usize, ) { @@ -51,7 +51,7 @@ pub(crate) unsafe fn xconvolve_vertical_part_neon_8_f16< -pub fn xconvolve_vertical_rgb_neon_row_f16<const CHANNELS: usize>( +pub(crate) fn xconvolve_vertical_rgb_neon_row_f16<const CHANNELS: usize>( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { unsafe { xconvolve_vertical_rgb_neon_row_f16_impl::<CHANNELS>( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } } @@ -202,8 +197,8 @@ pub(crate) fn xconvolve_vertical_rgb_neon_row_f16<const CHANNELS: usize>( pub unsafe fn 
xconvolve_vertical_rgb_neon_row_f16_impl<const CHANNELS: usize>( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -211,43 +206,19 @@ pub unsafe fn xconvolve_vertical_rgb_neon_row_f16_impl<const CHANNELS: usize>( let dst_width = width * CHANNELS; while cx + 48 < dst_width { - conv_vertical_part_neon_48_f16!( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds - ); + conv_vertical_part_neon_48_f16!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 48; } while cx + 32 < dst_width { - conv_vertical_part_neon_32_f16!( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds - ); + conv_vertical_part_neon_32_f16!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 32; } while cx + 16 < dst_width { - conv_vertical_part_neon_16_f16!( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds - ); + conv_vertical_part_neon_16_f16!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 16; } @@ -257,9 +228,9 @@ pub unsafe fn xconvolve_vertical_rgb_neon_row_f16_impl<const CHANNELS: usize>( xconvolve_vertical_part_neon_8_f16::<false>( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, 8, @@ -276,9 +247,9 @@ pub unsafe fn xconvolve_vertical_rgb_neon_row_f16_impl<const CHANNELS: usize>( xconvolve_vertical_part_neon_8_f16::<true>( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, left, diff --git a/src/neon/vertical_f32.rs b/src/neon/vertical_f32.rs index d1a241f..dbb1b6d 100644 --- a/src/neon/vertical_f32.rs +++ b/src/neon/vertical_f32.rs @@ -27,8 +27,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ use crate::filter_weights::FilterBounds; -use crate::neon::utils::prefer_vfmaq_f32; use crate::neon::utils::xvld1q_f32_x4; +use crate::neon::utils::{prefer_vfmaq_f32, xvld1q_f32_x2}; use std::arch::aarch64::*; macro_rules! 
conv_vertical_part_neon_16_f32 { @@ -187,7 +187,7 @@ unsafe fn convolve_vertical_part_neon_8_f32( let src_ptr = src.add(src_stride * py); let s_ptr = src_ptr.add(px); - let item_row = vld1q_f32_x2(s_ptr); + let item_row = xvld1q_f32_x2(s_ptr); store_0 = prefer_vfmaq_f32(store_0, item_row.0, v_weight); store_1 = prefer_vfmaq_f32(store_1, item_row.1, v_weight); @@ -220,7 +220,7 @@ unsafe fn convolve_vertical_part_neon_4_f32( let src_ptr = src.add(src_stride * py); let s_ptr = src_ptr.add(px); - let item_row = vld1q_f32_x2(s_ptr); + let item_row = xvld1q_f32_x2(s_ptr); store_0 = prefer_vfmaq_f32(store_0, item_row.0, v_weight); } @@ -259,7 +259,7 @@ unsafe fn convolve_vertical_part_neon_1_f32( dst_ptr.write_unaligned(vgetq_lane_f32::<0>(store_0)); } -pub fn convolve_vertical_rgb_neon_row_f32( +pub(crate) fn convolve_vertical_rgb_neon_row_f32( width: usize, bounds: &FilterBounds, unsafe_source_ptr_0: *const f32, diff --git a/src/neon/vertical_u16.rs b/src/neon/vertical_u16.rs index a268f4c..bf34f50 100644 --- a/src/neon/vertical_u16.rs +++ b/src/neon/vertical_u16.rs @@ -32,7 +32,7 @@ use crate::neon::utils::prefer_vfmaq_f32; use std::arch::aarch64::*; #[inline(always)] -pub fn convolve_column_u16( +pub(crate) fn convolve_column_u16( _: usize, bounds: &FilterBounds, src: &[u16], diff --git a/src/neon/vertical_u16_lb.rs b/src/neon/vertical_u16_lb.rs index 7dc925d..f76a525 100644 --- a/src/neon/vertical_u16_lb.rs +++ b/src/neon/vertical_u16_lb.rs @@ -31,7 +31,7 @@ use crate::support::{PRECISION, ROUNDING_CONST}; use std::arch::aarch64::*; #[inline(always)] -pub fn convolve_column_lb_u16( +pub(crate) fn convolve_column_lb_u16( _: usize, bounds: &FilterBounds, src: &[u16], diff --git a/src/neon/vertical_u8.rs b/src/neon/vertical_u8.rs index 667fb39..1e01da3 100644 --- a/src/neon/vertical_u8.rs +++ b/src/neon/vertical_u8.rs @@ -69,7 +69,7 @@ macro_rules! 
accumulate_4_into_lane { }}; } -pub fn convolve_vertical_neon_i16_precision( +pub(crate) fn convolve_vertical_neon_i16_precision( width: usize, bounds: &FilterBounds, src: &[u8], @@ -82,7 +82,7 @@ pub fn convolve_vertical_neon_i16_precision( } } -pub fn convolve_vertical_neon_i32_precision( +pub(crate) fn convolve_vertical_neon_i32_precision( width: usize, bounds: &FilterBounds, src: &[u8], diff --git a/src/sse/rgb_f16.rs b/src/sse/rgb_f16.rs index 18ba209..4f145c5 100644 --- a/src/sse/rgb_f16.rs +++ b/src/sse/rgb_f16.rs @@ -150,8 +150,8 @@ pub(crate) fn convolve_horizontal_rgb_sse_row_one_f16, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { unsafe { if F16C { @@ -160,16 +160,16 @@ pub(crate) fn convolve_horizontal_rgb_sse_row_one_f16, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { convolve_horizontal_rgb_sse_row_one_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -208,15 +208,15 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f16c( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { convolve_horizontal_rgb_sse_row_one_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -226,15 +226,15 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f16c_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { convolve_horizontal_rgb_sse_row_one_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -243,8 +243,8 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f16_impl, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { const CHANNELS: usize = 3; let mut filter_offset = 0usize; @@ -261,7 +261,7 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, weight2, @@ -283,7 +283,7 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, store, @@ -297,7 +297,7 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, store, ); @@ -307,7 +307,7 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f16_impl(store); let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); (dest_ptr as *mut i32).write_unaligned(_mm_extract_epi32::<0>(store_ph)); (dest_ptr as *mut i16) .add(2) @@ -321,9 +321,9 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f16, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { unsafe { @@ -333,9 +333,9 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f16, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { convolve_horizontal_rgb_sse_rows_4_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); 
} @@ -391,18 +391,18 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f16c( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { convolve_horizontal_rgb_sse_rows_4_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -413,30 +413,30 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f16c_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { convolve_horizontal_rgb_sse_rows_4_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } -#[inline] +#[inline(always)] unsafe fn convolve_horizontal_rgb_sse_rows_4_f16_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { const CHANNELS: usize = 3; @@ -458,7 +458,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, weight2, @@ -467,7 +467,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..).as_ptr(), weight0, weight1, weight2, @@ -476,7 +476,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight0, weight1, weight2, @@ -485,7 +485,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight0, weight1, weight2, @@ -507,28 +507,28 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, store_0, ); store_1 = convolve_horizontal_parts_2_rgb_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..).as_ptr(), weight0, weight1, store_1, ); store_2 = convolve_horizontal_parts_2_rgb_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight0, weight1, store_2, ); store_3 = convolve_horizontal_parts_2_rgb_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight0, weight1, store_3, @@ -542,25 +542,25 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, store_0, ); store_1 = convolve_horizontal_parts_one_rgb_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..).as_ptr(), weight0, store_1, ); store_2 = convolve_horizontal_parts_one_rgb_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight0, store_2, ); store_3 = convolve_horizontal_parts_one_rgb_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight0, store_3, ); @@ -573,25 +573,25 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f16_impl(store_3); let px = x * CHANNELS; - let dest_ptr = 
unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); (dest_ptr as *mut i32).write_unaligned(_mm_extract_epi32::<0>(store_ph_0)); (dest_ptr as *mut i16) .add(2) .write_unaligned(_mm_extract_epi16::<2>(store_ph_0) as i16); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); (dest_ptr as *mut i32).write_unaligned(_mm_extract_epi32::<0>(store_ph_1)); (dest_ptr as *mut i16) .add(2) .write_unaligned(_mm_extract_epi16::<2>(store_ph_1) as i16); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); (dest_ptr as *mut i32).write_unaligned(_mm_extract_epi32::<0>(store_ph_2)); (dest_ptr as *mut i16) .add(2) .write_unaligned(_mm_extract_epi16::<2>(store_ph_2) as i16); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); (dest_ptr as *mut i32).write_unaligned(_mm_extract_epi32::<0>(store_ph_3)); (dest_ptr as *mut i16) .add(2) diff --git a/src/sse/rgba_f16.rs b/src/sse/rgba_f16.rs index 6b7b1e7..f29b541 100644 --- a/src/sse/rgba_f16.rs +++ b/src/sse/rgba_f16.rs @@ -107,8 +107,8 @@ pub(crate) fn convolve_horizontal_rgba_sse_row_one_f16, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { unsafe { if F16C { @@ -117,16 +117,16 @@ pub(crate) fn convolve_horizontal_rgba_sse_row_one_f16, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { convolve_horizontal_rgba_sse_row_one_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -165,15 +165,15 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f16c( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { convolve_horizontal_rgba_sse_row_one_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -183,15 +183,15 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f16c_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { convolve_horizontal_rgba_sse_row_one_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -200,8 +200,8 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f16_impl, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { unsafe { const CHANNELS: usize = 4; @@ -219,7 +219,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, weight2, @@ -241,7 +241,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, store, @@ -255,7 +255,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, store, ); @@ -263,7 +263,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f16_impl(store); std::ptr::copy_nonoverlapping( &converted_f16 as *const _ as *const u8, @@ -280,9 +280,9 @@ pub(crate) fn 
convolve_horizontal_rgba_sse_rows_4_f16, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { unsafe { @@ -292,9 +292,9 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f16, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { convolve_horizontal_rgba_sse_rows_4_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -350,18 +350,18 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16c( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { convolve_horizontal_rgba_sse_rows_4_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -372,18 +372,18 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { convolve_horizontal_rgba_sse_rows_4_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -393,9 +393,9 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_impl, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { const CHANNELS: usize = 4; @@ -417,7 +417,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, weight2, @@ -426,7 +426,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..).as_ptr(), weight0, weight1, weight2, @@ -435,7 +435,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight0, weight1, weight2, @@ -444,7 +444,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight0, weight1, weight2, @@ -466,28 +466,28 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, store_0, ); store_1 = convolve_horizontal_parts_2_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..).as_ptr(), weight0, weight1, store_1, ); store_2 = convolve_horizontal_parts_2_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight0, weight1, store_2, ); store_3 = convolve_horizontal_parts_2_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight0, weight1, store_3, @@ -501,25 +501,25 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, store_0, ); store_1 = convolve_horizontal_parts_one_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride), + 
src.get_unchecked(src_stride..).as_ptr(), weight0, store_1, ); store_2 = convolve_horizontal_parts_one_rgba_f16::<F16C>( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight0, store_2, ); store_3 = convolve_horizontal_parts_one_rgba_f16::<F16C>( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight0, store_3, ); @@ -527,10 +527,10 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_impl< let converted_f16_0 = _mm_cvtps_phx::<F16C>(store_0); let converted_f16_1 = _mm_cvtps_phx::<F16C>(store_1); diff --git a/src/sse/vertical_f16.rs b/src/sse/vertical_f16.rs index 50e0ede..6d7ca93 100644 --- a/src/sse/vertical_f16.rs +++ b/src/sse/vertical_f16.rs @@ -39,9 +39,9 @@ use crate::sse::f16_utils::{_mm_cvtph_psx, _mm_cvtps_phx}; pub(crate) unsafe fn convolve_vertical_part_sse_f16<const F16C: bool, const FMA: bool>( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -53,7 +53,7 @@ pub(crate) unsafe fn convolve_vertical_part_sse_f16< (store_0, _mm_cvtph_psx::<F16C>(item_row_0), v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); let converted = _mm_cvtps_phx::<F16C>(store_0); let first_item = _mm_extract_epi16::<0>(converted) as u16; (dst_ptr as *mut u16).write_unaligned(first_item); @@ -71,9 +71,9 @@ pub(crate) unsafe fn convolve_vertical_part_sse_f16< pub(crate) unsafe fn convolve_vertical_part_sse_4_f16<const F16C: bool, const FMA: bool>( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -85,7 +85,7 @@ pub(crate) unsafe fn convolve_vertical_part_sse_4_f16< (store_0, _mm_cvtph_psx::<F16C>(item_row_0), v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); let acc = _mm_cvtps_phx::<F16C>(store_0); std::ptr::copy_nonoverlapping(&acc as *const _ as *const u8, dst_ptr as *mut u8, 8); } @@ -102,9 +102,9 @@ pub(crate) unsafe fn convolve_vertical_part_sse_4_f16< pub(crate) unsafe fn convolve_vertical_part_sse_16_16<const F16C: bool, const FMA: bool>( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -119,7 +119,7 @@ pub(crate) unsafe fn convolve_vertical_part_sse_16_16< (store_3, items3, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); let acc0 = _mm_unpacklo_epi64( _mm_cvtps_phx::<F16C>(store_0), @@ -155,9 +155,9 @@ pub(crate) unsafe fn convolve_vertical_part_sse_16_16< pub(crate) unsafe fn convolve_vertical_part_sse_8_f16<const F16C: bool, const FMA: bool>( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -170,7 +170,7 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8_f16< (store_1, items1, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); let acc0 = _mm_unpacklo_epi64( _mm_cvtps_phx::<F16C>(store_0), _mm_cvtps_phx::<F16C>(store_1), @@ -196,8 +196,8 @@ pub(crate) fn convolve_vertical_sse_row_f16< const CHANNELS: usize, const F16C: bool, const FMA: bool, >( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -205,31 +205,16 @@ pub(crate) fn convolve_vertical_sse_row_f16< if F16C { if FMA { convolve_vertical_sse_row_f16c_fma::<CHANNELS>( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, 
src_stride, weight_ptr, ); } else { convolve_vertical_sse_row_f16c::<CHANNELS>( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } } else { convolve_vertical_sse_row_f16_regular::<CHANNELS>( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } } @@ -240,18 +225,13 @@ pub(crate) fn convolve_vertical_sse_row_f16< unsafe fn convolve_vertical_sse_row_f16_regular<const CHANNELS: usize>( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { convolve_vertical_sse_row_f16_impl::<CHANNELS, false, false>( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } @@ -260,18 +240,13 @@ unsafe fn convolve_vertical_sse_row_f16_regular<const CHANNELS: usize>( unsafe fn convolve_vertical_sse_row_f16c_fma<const CHANNELS: usize>( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { convolve_vertical_sse_row_f16_impl::<CHANNELS, true, true>( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } @@ -280,18 +255,13 @@ unsafe fn convolve_vertical_sse_row_f16c_fma<const CHANNELS: usize>( unsafe fn convolve_vertical_sse_row_f16c<const CHANNELS: usize>( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { convolve_vertical_sse_row_f16_impl::<CHANNELS, true, false>( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } @@ -303,8 +273,8 @@ unsafe fn convolve_vertical_sse_row_f16_impl< const CHANNELS: usize, const F16C: bool, const FMA: bool, >( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -316,9 +286,9 @@ unsafe fn convolve_vertical_sse_row_f16_impl< convolve_vertical_part_sse_16_16::<F16C, FMA>( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -332,9 +302,9 @@ unsafe fn convolve_vertical_sse_row_f16_impl< convolve_vertical_part_sse_8_f16::<F16C, FMA>( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -348,9 +318,9 @@ unsafe fn convolve_vertical_sse_row_f16_impl< convolve_vertical_part_sse_4_f16::<F16C, FMA>( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -364,9 +334,9 @@ unsafe fn convolve_vertical_sse_row_f16_impl< convolve_vertical_part_sse_f16::<F16C, FMA>( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, );
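
Note on the two patterns this patch applies throughout: the NEON macros stop splatting each filter tap into its own register (vdupq_laneq_f32 / vdupq_lane_f32 followed by prefer_vfmaq_f32) and instead keep all taps in one vector, selecting the tap at multiply time with the lane-indexed FMA intrinsic. The sketch below is a minimal, self-contained illustration of that pattern only; dot4_rgba is a hypothetical helper written for this note, not code from pic_scale.

#[cfg(target_arch = "aarch64")]
unsafe fn dot4_rgba(pixels: &[f32; 16], taps: &[f32; 4]) -> [f32; 4] {
    use std::arch::aarch64::*;
    // All four filter taps stay in a single register; no per-tap broadcast.
    let w = vld1q_f32(taps.as_ptr());
    let p0 = vld1q_f32(pixels.as_ptr()); // RGBA pixel 0
    let p1 = vld1q_f32(pixels.as_ptr().add(4)); // RGBA pixel 1
    let p2 = vld1q_f32(pixels.as_ptr().add(8)); // RGBA pixel 2
    let p3 = vld1q_f32(pixels.as_ptr().add(12)); // RGBA pixel 3
    let mut acc = vdupq_n_f32(0.0);
    // vfmaq_laneq_f32::<LANE> computes acc + p * w[LANE], folding the
    // broadcast into the fused multiply-add and saving one dup per tap.
    acc = vfmaq_laneq_f32::<0>(acc, p0, w);
    acc = vfmaq_laneq_f32::<1>(acc, p1, w);
    acc = vfmaq_laneq_f32::<2>(acc, p2, w);
    acc = vfmaq_laneq_f32::<3>(acc, p3, w);
    let mut out = [0.0f32; 4];
    vst1q_f32(out.as_mut_ptr(), acc);
    out
}

The second pattern, converting *const f16 / *mut f16 parameters to &[f16] / &mut [f16] and materializing pointers only at the last moment via get_unchecked(..).as_ptr(), keeps the hot-loop codegen unchanged while making the row dispatchers slice-based at the call boundary; the same shape was already applied to the u8 and u16 paths earlier in this patch.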