diff --git a/app/src/main.rs b/app/src/main.rs index f64de06..87e234f 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -8,26 +8,38 @@ use fast_image_resize::{ use image::io::Reader as ImageReader; use image::{EncodableLayout, GenericImageView}; -use pic_scale::{ImageSize, ImageStore, LabScaler, LChScaler, LinearApproxScaler, LinearScaler, LuvScaler, ResamplingFunction, Scaler, Scaling, SigmoidalScaler, ThreadingPolicy, TransferFunction, XYZScaler}; +use pic_scale::{ + ImageSize, ImageStore, LinearScaler, ResamplingFunction, Scaler, Scaling, ThreadingPolicy, +}; fn main() { // test_fast_image(); - let img = ImageReader::open("./assets/beach_horizon.jpg") + let img = ImageReader::open("./assets/asset.jpg") .unwrap() .decode() .unwrap(); let dimensions = img.dimensions(); let mut bytes = Vec::from(img.as_bytes()); + let mut scaler = LinearScaler::new(ResamplingFunction::Lagrange3); + scaler.set_threading_policy(ThreadingPolicy::Single); + // let store = + // ImageStore::::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize); + // let resized = scaler.resize_rgba( + // ImageSize::new(dimensions.0 as usize / 3, dimensions.1 as usize / 3), + // store, + // false, + // ); + + let mut f_store: Vec = bytes.iter().map(|&x| x as f32 * (1f32 / 255f32)).collect(); + let start_time = Instant::now(); - let mut scaler = LChScaler::new(ResamplingFunction::Lanczos3); - scaler.set_threading_policy(ThreadingPolicy::Single); let store = - ImageStore::::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize); - let resized = scaler.resize_rgb( - ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), + ImageStore::::from_slice(&mut f_store, dimensions.0 as usize, dimensions.1 as usize); + let resized = scaler.resize_rgb_f32( + ImageSize::new(dimensions.0 as usize / 3, dimensions.1 as usize / 3), store, ); @@ -35,10 +47,13 @@ fn main() { // Print the elapsed time in milliseconds println!("Scaler: {:.2?}", elapsed_time); + let j_store: Vec = resized.as_bytes().iter().map(|&x| (x * 255f32) as u8).collect(); + let dst = j_store; + if resized.channels == 4 { image::save_buffer( "converted.png", - resized.as_bytes(), + &dst, resized.width as u32, resized.height as u32, image::ExtendedColorType::Rgba8, @@ -46,8 +61,8 @@ fn main() { .unwrap(); } else { image::save_buffer( - "converted_lch.jpg", - resized.as_bytes(), + "converted.jpg", + &dst, resized.width as u32, resized.height as u32, image::ExtendedColorType::Rgb8, diff --git a/src/lab_scaler.rs b/src/colors/lab_scaler.rs similarity index 100% rename from src/lab_scaler.rs rename to src/colors/lab_scaler.rs diff --git a/src/lch_scaler.rs b/src/colors/lch_scaler.rs similarity index 97% rename from src/lch_scaler.rs rename to src/colors/lch_scaler.rs index 73ef3b1..200c948 100644 --- a/src/lch_scaler.rs +++ b/src/colors/lch_scaler.rs @@ -5,10 +5,7 @@ * // license that can be found in the LICENSE file. 
*/ -use colorutils_rs::{ - lch_to_rgb, lch_with_alpha_to_rgba, - rgb_to_lch, rgba_to_lch_with_alpha, -}; +use colorutils_rs::{lch_to_rgb, lch_with_alpha_to_rgba, rgb_to_lch, rgba_to_lch_with_alpha}; use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; diff --git a/src/linear_precise_scaler.rs b/src/colors/linear_precise_scaler.rs similarity index 100% rename from src/linear_precise_scaler.rs rename to src/colors/linear_precise_scaler.rs diff --git a/src/linear_scaler.rs b/src/colors/linear_scaler.rs similarity index 100% rename from src/linear_scaler.rs rename to src/colors/linear_scaler.rs diff --git a/src/luv_scaler.rs b/src/colors/luv_scaler.rs similarity index 100% rename from src/luv_scaler.rs rename to src/colors/luv_scaler.rs diff --git a/src/colors/mod.rs b/src/colors/mod.rs new file mode 100644 index 0000000..01172d2 --- /dev/null +++ b/src/colors/mod.rs @@ -0,0 +1,15 @@ +mod lab_scaler; +mod lch_scaler; +mod linear_precise_scaler; +mod linear_scaler; +mod luv_scaler; +mod sigmoidal_scaler; +mod xyz_scaler; + +pub use lab_scaler::*; +pub use lch_scaler::*; +pub use linear_precise_scaler::*; +pub use linear_scaler::*; +pub use luv_scaler::*; +pub use sigmoidal_scaler::*; +pub use xyz_scaler::*; diff --git a/src/sigmoidal_scaler.rs b/src/colors/sigmoidal_scaler.rs similarity index 100% rename from src/sigmoidal_scaler.rs rename to src/colors/sigmoidal_scaler.rs diff --git a/src/xyz_scaler.rs b/src/colors/xyz_scaler.rs similarity index 100% rename from src/xyz_scaler.rs rename to src/colors/xyz_scaler.rs diff --git a/src/convolve_naive_f32.rs b/src/convolve_naive_f32.rs new file mode 100644 index 0000000..187a3f5 --- /dev/null +++ b/src/convolve_naive_f32.rs @@ -0,0 +1,253 @@ +/* + * // Copyright (c) the Radzivon Bartoshyk. All rights reserved. + * // + * // Use of this source code is governed by a BSD-style + * // license that can be found in the LICENSE file. 
+ */ + +use crate::filter_weights::{FilterBounds, FilterWeights}; + +#[inline(always)] +pub(crate) unsafe fn convolve_vertical_part_f32( + start_y: usize, + start_x: usize, + src: *const f32, + src_stride: usize, + dst: *mut f32, + filter: *const f32, + bounds: &FilterBounds, +) { + let mut store: [[f32; CHANNELS]; PART] = [[0f32; CHANNELS]; PART]; + + for j in 0..bounds.size { + let py = start_y + j; + let weight = unsafe { filter.add(j).read_unaligned() }; + let src_ptr = src.add(src_stride * py); + for x in 0..PART { + let px = (start_x + x) * CHANNELS; + let s_ptr = src_ptr.add(px); + for c in 0..CHANNELS { + let store_p = store.get_unchecked_mut(x); + let store_v = store_p.get_unchecked_mut(c); + *store_v += unsafe { s_ptr.add(c).read_unaligned() } * weight; + } + } + } + + for x in 0..PART { + let px = (start_x + x) * CHANNELS; + let dst_ptr = dst.add(px); + for c in 0..CHANNELS { + let vl = *(*store.get_unchecked_mut(x)).get_unchecked_mut(c); + dst_ptr.add(c).write_unaligned(vl); + } + } +} + +#[inline(always)] +pub(crate) fn convolve_horizontal_rgb_native_row( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const f32, + unsafe_destination_ptr_0: *mut f32, +) { + unsafe { + let weights_ptr = filter_weights.weights.as_ptr(); + let mut filter_offset = 0usize; + + for x in 0..dst_width { + let mut _sum_r = 0f32; + let mut _sum_g = 0f32; + let mut _sum_b = 0f32; + let mut _sum_a = 0f32; + + let bounds = filter_weights.bounds.get_unchecked(x); + let start_x = bounds.start; + for j in 0..bounds.size { + let px = (start_x + j) * CHANNELS; + let weight = weights_ptr.add(j + filter_offset).read_unaligned(); + let src = unsafe_source_ptr_0.add(px); + _sum_r += src.read_unaligned() * weight; + if CHANNELS > 1 { + _sum_g += src.add(1).read_unaligned() * weight; + } + if CHANNELS > 2 { + _sum_b += src.add(2).read_unaligned() * weight; + } + if CHANNELS == 4 { + _sum_a += src.add(3).read_unaligned() * weight; + } + } + + let px = x * CHANNELS; + + let dest_ptr = unsafe_destination_ptr_0.add(px); + dest_ptr.write_unaligned(_sum_r); + if CHANNELS > 1 { + dest_ptr.add(1).write_unaligned(_sum_g); + } + if CHANNELS > 2 { + dest_ptr.add(2).write_unaligned(_sum_b); + } + if CHANNELS == 4 { + dest_ptr.add(3).write_unaligned(_sum_a); + } + + filter_offset += filter_weights.aligned_size; + } + } +} + +#[allow(unused)] +pub(crate) fn convolve_horizontal_rgba_4_row_f32( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const f32, + src_stride: usize, + unsafe_destination_ptr_0: *mut f32, + dst_stride: usize, +) { + unsafe { + let mut filter_offset = 0usize; + let weights_ptr = filter_weights.weights.as_ptr(); + + let src_row0 = unsafe_source_ptr_0; + let src_row1 = unsafe_source_ptr_0.add(src_stride); + let src_row2 = unsafe_source_ptr_0.add(src_stride * 2); + let src_row3 = unsafe_source_ptr_0.add(src_stride * 3); + + let dst_row0 = unsafe_destination_ptr_0; + let dst_row1 = unsafe_destination_ptr_0.add(dst_stride); + let dst_row2 = unsafe_destination_ptr_0.add(dst_stride * 2); + let dst_row3 = unsafe_destination_ptr_0.add(dst_stride * 3); + + for x in 0..dst_width { + let mut sum_r_0 = 0f32; + let mut sum_g_0 = 0f32; + let mut sum_b_0 = 0f32; + let mut sum_a_0 = 0f32; + let mut sum_r_1 = 0f32; + let mut sum_g_1 = 0f32; + let mut sum_b_1 = 0f32; + let mut sum_a_1 = 0f32; + let mut sum_r_2 = 0f32; + let mut sum_g_2 = 0f32; + let mut sum_b_2 = 0f32; + let mut sum_a_2 = 0f32; + let mut sum_r_3 = 0f32; + let mut sum_g_3 = 0f32; + 
let mut sum_b_3 = 0f32; + let mut sum_a_3 = 0f32; + + let bounds = filter_weights.bounds.get_unchecked(x); + let start_x = bounds.start; + for j in 0..bounds.size { + let px = (start_x + j) * CHANNELS; + let weight = weights_ptr.add(j + filter_offset).read_unaligned(); + + let src0 = src_row0.add(px); + sum_r_0 += src0.read_unaligned() * weight; + if CHANNELS > 1 { + sum_g_0 += src0.add(1).read_unaligned() * weight; + } + if CHANNELS > 2 { + sum_b_0 += src0.add(2).read_unaligned() * weight; + } + if CHANNELS == 4 { + sum_a_0 += src0.add(3).read_unaligned() * weight; + } + + let src1 = src_row1.add(px); + sum_r_1 += src1.read_unaligned() * weight; + if CHANNELS > 1 { + sum_g_1 += src1.add(1).read_unaligned() * weight; + } + if CHANNELS > 2 { + sum_b_1 += src1.add(2).read_unaligned() * weight; + } + if CHANNELS == 4 { + sum_a_1 += src1.add(3).read_unaligned() * weight; + } + + let src2 = src_row2.add(px); + sum_r_2 += src2.read_unaligned() * weight; + if CHANNELS > 1 { + sum_g_2 += src2.add(1).read_unaligned() * weight; + } + if CHANNELS > 2 { + sum_b_2 += src2.add(2).read_unaligned() * weight; + } + if CHANNELS == 4 { + sum_a_2 += src2.add(3).read_unaligned() * weight; + } + + let src3 = src_row3.add(px); + sum_r_3 += src3.read_unaligned() * weight; + if CHANNELS > 1 { + sum_g_3 += src3.add(1).read_unaligned() * weight; + } + if CHANNELS > 2 { + sum_b_3 += src3.add(2).read_unaligned() * weight; + } + if CHANNELS == 4 { + sum_a_3 += src3.add(3).read_unaligned() * weight; + } + } + + let px = x * CHANNELS; + + let dest_ptr_0 = dst_row0.add(px); + let dest_ptr_1 = dst_row1.add(px); + let dest_ptr_2 = dst_row2.add(px); + let dest_ptr_3 = dst_row3.add(px); + + dest_ptr_0.write_unaligned(sum_r_0); + if CHANNELS > 1 { + dest_ptr_0.add(1).write_unaligned(sum_g_0); + } + if CHANNELS > 2 { + dest_ptr_0.add(2).write_unaligned(sum_b_0); + } + if CHANNELS == 4 { + dest_ptr_0.add(3).write_unaligned(sum_a_0); + } + + dest_ptr_1.write_unaligned(sum_r_1); + if CHANNELS > 1 { + dest_ptr_1.add(1).write_unaligned(sum_g_1); + } + if CHANNELS > 2 { + dest_ptr_1.add(2).write_unaligned(sum_b_1); + } + if CHANNELS == 4 { + dest_ptr_1.add(3).write_unaligned(sum_a_1); + } + + dest_ptr_2.write_unaligned(sum_r_2); + if CHANNELS > 1 { + dest_ptr_2.add(1).write_unaligned(sum_g_2); + } + if CHANNELS > 2 { + dest_ptr_2.add(2).write_unaligned(sum_b_2); + } + if CHANNELS == 4 { + dest_ptr_2.add(3).write_unaligned(sum_a_2); + } + + dest_ptr_3.write_unaligned(sum_r_3); + if CHANNELS > 1 { + dest_ptr_3.add(1).write_unaligned(sum_g_3); + } + if CHANNELS > 2 { + dest_ptr_3.add(2).write_unaligned(sum_b_3); + } + if CHANNELS == 4 { + dest_ptr_3.add(3).write_unaligned(sum_a_3); + } + + filter_offset += filter_weights.aligned_size; + } + } +} diff --git a/src/convolve_naive_u8.rs b/src/convolve_naive_u8.rs new file mode 100644 index 0000000..d0144aa --- /dev/null +++ b/src/convolve_naive_u8.rs @@ -0,0 +1,285 @@ +/* + * // Copyright (c) the Radzivon Bartoshyk. All rights reserved. + * // + * // Use of this source code is governed by a BSD-style + * // license that can be found in the LICENSE file. 
+ */ + +use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::support::{PRECISION, ROUNDING_APPROX}; + +#[inline(always)] +pub(crate) unsafe fn convolve_vertical_part( + start_y: usize, + start_x: usize, + src: *const u8, + src_stride: usize, + dst: *mut u8, + filter: *const i16, + bounds: &FilterBounds, +) { + let mut store: [[i32; CHANNELS]; PART] = [[ROUNDING_APPROX; CHANNELS]; PART]; + + for j in 0..bounds.size { + let py = start_y + j; + let weight = unsafe { filter.add(j).read_unaligned() } as i32; + let src_ptr = src.add(src_stride * py); + for x in 0..PART { + let px = (start_x + x) * CHANNELS; + let s_ptr = src_ptr.add(px); + for c in 0..CHANNELS { + let store_p = store.get_unchecked_mut(x); + let store_v = store_p.get_unchecked_mut(c); + *store_v += unsafe { s_ptr.add(c).read_unaligned() } as i32 * weight; + } + } + } + + for x in 0..PART { + let px = (start_x + x) * CHANNELS; + let dst_ptr = dst.add(px); + for c in 0..CHANNELS { + let vl = *(*store.get_unchecked_mut(x)).get_unchecked_mut(c); + let ck = vl >> PRECISION; + dst_ptr.add(c).write_unaligned(ck.max(0).min(255) as u8); + } + } +} + +pub(crate) fn convolve_horizontal_rgba_native_row( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const u8, + unsafe_destination_ptr_0: *mut u8, +) { + let mut filter_offset = 0usize; + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let mut sum_r = ROUNDING_APPROX; + let mut sum_g = ROUNDING_APPROX; + let mut sum_b = ROUNDING_APPROX; + let mut sum_a = ROUNDING_APPROX; + + let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; + let start_x = bounds.start; + for j in 0..bounds.size { + let px = (start_x + j) * CHANNELS; + let weight = unsafe { weights_ptr.add(j + filter_offset).read_unaligned() } as i32; + let src = unsafe { unsafe_source_ptr_0.add(px) }; + sum_r += unsafe { src.read_unaligned() } as i32 * weight; + if CHANNELS > 1 { + sum_g += unsafe { src.add(1).read_unaligned() } as i32 * weight; + } + if CHANNELS > 2 { + sum_b += unsafe { src.add(2).read_unaligned() } as i32 * weight; + } + if CHANNELS == 4 { + sum_a += unsafe { src.add(3).read_unaligned() } as i32 * weight; + } + } + + let px = x * CHANNELS; + + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; + + unsafe { + dest_ptr.write_unaligned((sum_r >> PRECISION).min(255).max(0) as u8); + if CHANNELS > 1 { + dest_ptr + .add(1) + .write_unaligned((sum_g >> PRECISION).min(255).max(0) as u8); + } + if CHANNELS > 2 { + dest_ptr + .add(2) + .write_unaligned((sum_b >> PRECISION).min(255).max(0) as u8); + } + if CHANNELS == 4 { + dest_ptr + .add(3) + .write_unaligned((sum_a >> PRECISION).min(255).max(0) as u8); + } + } + + filter_offset += filter_weights.aligned_size; + } +} + +#[allow(unused)] +pub(crate) fn convolve_horizontal_rgba_native_4_row( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const u8, + src_stride: usize, + unsafe_destination_ptr_0: *mut u8, + dst_stride: usize, +) { + unsafe { + let mut filter_offset = 0usize; + let weights_ptr = filter_weights.weights.as_ptr(); + + let src_row0 = unsafe_source_ptr_0; + let src_row1 = unsafe_source_ptr_0.add(src_stride); + let src_row2 = unsafe_source_ptr_0.add(src_stride * 2); + let src_row3 = unsafe_source_ptr_0.add(src_stride * 3); + + let dst_row0 = unsafe_destination_ptr_0; + let dst_row1 = unsafe_destination_ptr_0.add(dst_stride); + let dst_row2 = unsafe_destination_ptr_0.add(dst_stride * 2); + let dst_row3 = 
unsafe_destination_ptr_0.add(dst_stride * 3); + + for x in 0..dst_width { + let mut sum_r_0 = ROUNDING_APPROX; + let mut sum_g_0 = ROUNDING_APPROX; + let mut sum_b_0 = ROUNDING_APPROX; + let mut sum_a_0 = ROUNDING_APPROX; + let mut sum_r_1 = ROUNDING_APPROX; + let mut sum_g_1 = ROUNDING_APPROX; + let mut sum_b_1 = ROUNDING_APPROX; + let mut sum_a_1 = ROUNDING_APPROX; + let mut sum_r_2 = ROUNDING_APPROX; + let mut sum_g_2 = ROUNDING_APPROX; + let mut sum_b_2 = ROUNDING_APPROX; + let mut sum_a_2 = ROUNDING_APPROX; + let mut sum_r_3 = ROUNDING_APPROX; + let mut sum_g_3 = ROUNDING_APPROX; + let mut sum_b_3 = ROUNDING_APPROX; + let mut sum_a_3 = ROUNDING_APPROX; + + let bounds = filter_weights.bounds.get_unchecked(x); + let start_x = bounds.start; + for j in 0..bounds.size { + let px = (start_x + j) * CHANNELS; + let weight = weights_ptr.add(j + filter_offset).read_unaligned() as i32; + + let src0 = src_row0.add(px); + sum_r_0 += src0.read_unaligned() as i32 * weight; + if CHANNELS > 1 { + sum_g_0 += src0.add(1).read_unaligned() as i32 * weight; + } + if CHANNELS > 2 { + sum_b_0 += src0.add(2).read_unaligned() as i32 * weight; + } + if CHANNELS == 4 { + sum_a_0 += src0.add(3).read_unaligned() as i32 * weight; + } + + let src1 = src_row1.add(px); + sum_r_1 += src1.read_unaligned() as i32 * weight; + if CHANNELS > 1 { + sum_g_1 += src1.add(1).read_unaligned() as i32 * weight; + } + if CHANNELS > 2 { + sum_b_1 += src1.add(2).read_unaligned() as i32 * weight; + } + if CHANNELS == 4 { + sum_a_1 += src1.add(3).read_unaligned() as i32 * weight; + } + + let src2 = src_row2.add(px); + sum_r_2 += src2.read_unaligned() as i32 * weight; + if CHANNELS > 1 { + sum_g_2 += src2.add(1).read_unaligned() as i32 * weight; + } + if CHANNELS > 2 { + sum_b_2 += src2.add(2).read_unaligned() as i32 * weight; + } + if CHANNELS == 4 { + sum_a_2 += src2.add(3).read_unaligned() as i32 * weight; + } + + let src3 = src_row3.add(px); + sum_r_3 += src3.read_unaligned() as i32 * weight; + if CHANNELS > 1 { + sum_g_3 += src3.add(1).read_unaligned() as i32 * weight; + } + if CHANNELS > 2 { + sum_b_3 += src3.add(2).read_unaligned() as i32 * weight; + } + if CHANNELS == 4 { + sum_a_3 += src3.add(3).read_unaligned() as i32 * weight; + } + } + + let px = x * CHANNELS; + + let dest_ptr_0 = dst_row0.add(px); + let dest_ptr_1 = dst_row1.add(px); + let dest_ptr_2 = dst_row2.add(px); + let dest_ptr_3 = dst_row3.add(px); + + dest_ptr_0.write_unaligned((sum_r_0 >> PRECISION).min(255).max(0) as u8); + if CHANNELS > 1 { + dest_ptr_0 + .add(1) + .write_unaligned((sum_g_0 >> PRECISION).min(255).max(0) as u8); + } + if CHANNELS > 2 { + dest_ptr_0 + .add(2) + .write_unaligned((sum_b_0 >> PRECISION).min(255).max(0) as u8); + } + if CHANNELS == 4 { + dest_ptr_0 + .add(3) + .write_unaligned((sum_a_0 >> PRECISION).min(255).max(0) as u8); + } + + dest_ptr_1.write_unaligned((sum_r_1 >> PRECISION).min(255).max(0) as u8); + if CHANNELS > 1 { + dest_ptr_1 + .add(1) + .write_unaligned((sum_g_1 >> PRECISION).min(255).max(0) as u8); + } + if CHANNELS > 2 { + dest_ptr_1 + .add(2) + .write_unaligned((sum_b_1 >> PRECISION).min(255).max(0) as u8); + } + if CHANNELS == 4 { + dest_ptr_1 + .add(3) + .write_unaligned((sum_a_1 >> PRECISION).min(255).max(0) as u8); + } + + dest_ptr_2.write_unaligned((sum_r_2 >> PRECISION).min(255).max(0) as u8); + if CHANNELS > 1 { + dest_ptr_2 + .add(1) + .write_unaligned((sum_g_2 >> PRECISION).min(255).max(0) as u8); + } + if CHANNELS > 2 { + dest_ptr_2 + .add(2) + .write_unaligned((sum_b_2 >> PRECISION).min(255).max(0) as u8); + } 
+ if CHANNELS == 4 { + dest_ptr_2 + .add(3) + .write_unaligned((sum_a_2 >> PRECISION).min(255).max(0) as u8); + } + + dest_ptr_3.write_unaligned((sum_r_3 >> PRECISION).min(255).max(0) as u8); + if CHANNELS > 1 { + dest_ptr_3 + .add(1) + .write_unaligned((sum_g_3 >> PRECISION).min(255).max(0) as u8); + } + if CHANNELS > 2 { + dest_ptr_3 + .add(2) + .write_unaligned((sum_b_3 >> PRECISION).min(255).max(0) as u8); + } + if CHANNELS == 4 { + dest_ptr_3 + .add(3) + .write_unaligned((sum_a_3 >> PRECISION).min(255).max(0) as u8); + } + + filter_offset += filter_weights.aligned_size; + } + } +} diff --git a/src/convolve_u8.rs b/src/convolve_u8.rs deleted file mode 100644 index 32533d7..0000000 --- a/src/convolve_u8.rs +++ /dev/null @@ -1,48 +0,0 @@ -/* - * // Copyright (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. - */ - -use crate::filter_weights::FilterBounds; -use crate::support::{PRECISION, ROUNDING_APPROX}; - -#[inline(always)] -#[allow(unused)] -pub(crate) unsafe fn convolve_vertical_part( - start_y: usize, - start_x: usize, - src: *const u8, - src_stride: usize, - dst: *mut u8, - filter: *const i16, - bounds: &FilterBounds, -) { - let mut store: [[i32; CHANNELS]; PART] = [[ROUNDING_APPROX; CHANNELS]; PART]; - - for j in 0..bounds.size { - let py = start_y + j; - let weight = unsafe { filter.add(j).read_unaligned() } as i32; - let src_ptr = src.add(src_stride * py); - for x in 0..PART { - let px = (start_x + x) * CHANNELS; - let s_ptr = src_ptr.add(px); - for c in 0..CHANNELS { - let store_p = store.get_unchecked_mut(x); - let store_v = store_p.get_unchecked_mut(c); - *store_v += unsafe { s_ptr.add(c).read_unaligned() } as i32 * weight; - } - } - } - - for x in 0..PART { - let px = (start_x + x) * CHANNELS; - let dst_ptr = dst.add(px); - for c in 0..CHANNELS { - let vl = *(*store.get_unchecked_mut(x)).get_unchecked_mut(c); - let ck = vl >> PRECISION; - dst_ptr.add(c).write_unaligned(ck.max(0).min(255) as u8); - } - } -} diff --git a/src/dispatch_group_f32.rs b/src/dispatch_group_f32.rs new file mode 100644 index 0000000..f4b7ee4 --- /dev/null +++ b/src/dispatch_group_f32.rs @@ -0,0 +1,165 @@ +use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::unsafe_slice::UnsafeSlice; +use crate::ImageStore; +use rayon::ThreadPool; +use std::sync::Arc; + +pub(crate) fn convolve_vertical_dispatch_f32( + image_store: &ImageStore, + filter_weights: FilterWeights, + destination: &mut ImageStore, + pool: &Option, + dispatcher: fn(usize, &FilterBounds, *const f32, *mut f32, usize, *const f32), +) { + let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); + let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); + + let src_stride = image_store.width * image_store.channels; + + let mut filter_offset = 0usize; + + let dst_stride = destination.width * image_store.channels; + let dst_width = destination.width; + + if let Some(pool) = pool { + let arc_weights = Arc::new(filter_weights); + let borrowed = destination.buffer.borrow_mut(); + let unsafe_slice = UnsafeSlice::new(borrowed); + pool.scope(|scope| { + for y in 0..destination.height { + let weights = arc_weights.clone(); + scope.spawn(move |_| { + let bounds = unsafe { weights.bounds.get_unchecked(y) }; + let weight_ptr = + unsafe { weights.weights.as_ptr().add(weights.aligned_size * y) }; + let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); + let dst_ptr = 
unsafe_slice.mut_ptr(); + let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; + dispatcher( + dst_width, + bounds, + unsafe_source_ptr_0, + unsafe_destination_ptr_0, + src_stride, + weight_ptr, + ); + }); + } + }); + } else { + for y in 0..destination.height { + let bounds = unsafe { filter_weights.bounds.get_unchecked(y) }; + let weight_ptr = unsafe { filter_weights.weights.as_ptr().add(filter_offset) }; + + dispatcher( + dst_width, + bounds, + unsafe_source_ptr_0, + unsafe_destination_ptr_0, + src_stride, + weight_ptr, + ); + + filter_offset += filter_weights.aligned_size; + unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; + } + } +} + +#[inline(always)] +pub(crate) fn convolve_horizontal_dispatch_f32( + image_store: &ImageStore, + filter_weights: FilterWeights, + destination: &mut ImageStore, + pool: &Option, + dispatcher_4_rows: Option, *const f32, usize, *mut f32, usize)>, + dispatcher_row: fn(usize, usize, &FilterWeights, *const f32, *mut f32), +) { + let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); + let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); + + let src_stride = image_store.width * image_store.channels; + let dst_stride = destination.width * image_store.channels; + let dst_width = destination.width; + let src_width = image_store.width; + + if let Some(pool) = pool { + let arc_weights = Arc::new(filter_weights); + let borrowed = destination.buffer.borrow_mut(); + let unsafe_slice = UnsafeSlice::new(borrowed); + pool.scope(|scope| { + let mut yy = 0usize; + if let Some(dispatcher) = dispatcher_4_rows { + for y in (0..destination.height.saturating_sub(4)).step_by(4) { + let weights = arc_weights.clone(); + scope.spawn(move |_| { + let unsafe_source_ptr_0 = + unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; + let dst_ptr = unsafe_slice.mut_ptr(); + let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; + dispatcher( + dst_width, + src_width, + &weights, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + dst_stride, + ); + }); + yy = y; + } + } + for y in (yy..destination.height).step_by(4) { + let weights = arc_weights.clone(); + scope.spawn(move |_| { + let unsafe_source_ptr_0 = + unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; + let dst_ptr = unsafe_slice.mut_ptr(); + let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; + dispatcher_row( + dst_width, + src_width, + &weights, + unsafe_source_ptr_0, + unsafe_destination_ptr_0, + ); + }); + } + }); + } else { + let mut yy = 0usize; + + if let Some(dispatcher) = dispatcher_4_rows { + while yy + 4 < destination.height { + dispatcher( + dst_width, + src_width, + &filter_weights, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + dst_stride, + ); + + unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; + unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; + + yy += 4; + } + } + + for _ in yy..destination.height { + dispatcher_row( + dst_width, + src_width, + &filter_weights, + unsafe_source_ptr_0, + unsafe_destination_ptr_0, + ); + + unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; + unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; + } + } +} diff --git a/src/dispatch_group_u8.rs b/src/dispatch_group_u8.rs new file mode 100644 index 0000000..1204dd7 --- /dev/null +++ b/src/dispatch_group_u8.rs @@ -0,0 +1,164 @@ +use 
crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::support::PRECISION; +use crate::unsafe_slice::UnsafeSlice; +use crate::ImageStore; +use rayon::ThreadPool; +use std::sync::Arc; + +pub(crate) fn convolve_horizontal_dispatch_u8( + image_store: &ImageStore, + filter_weights: FilterWeights, + destination: &mut ImageStore, + pool: &Option, + dispatcher_4_rows: Option< + fn(usize, usize, &FilterWeights, *const u8, usize, *mut u8, usize), + >, + dispatcher_1_row: fn(usize, usize, &FilterWeights, *const u8, *mut u8), +) { + let approx_weights = filter_weights.numerical_approximation_i16::(0); + + let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); + let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); + + let src_stride = image_store.width * image_store.channels; + let dst_stride = destination.width * image_store.channels; + let dst_width = destination.width; + let src_width = image_store.width; + + if let Some(pool) = pool { + let arc_weights = Arc::new(approx_weights); + let borrowed = destination.buffer.borrow_mut(); + let unsafe_slice = UnsafeSlice::new(borrowed); + pool.scope(|scope| { + let mut yy = 0usize; + if let Some(dispatcher) = dispatcher_4_rows { + for y in (0..destination.height.saturating_sub(4)).step_by(4) { + let weights = arc_weights.clone(); + scope.spawn(move |_| { + let unsafe_source_ptr_0 = + unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; + let dst_ptr = unsafe_slice.mut_ptr(); + let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; + dispatcher( + dst_width, + src_width, + &weights, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + dst_stride, + ); + }); + yy = y; + } + } + for y in (yy..destination.height).step_by(4) { + let weights = arc_weights.clone(); + scope.spawn(move |_| { + let unsafe_source_ptr_0 = + unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; + let dst_ptr = unsafe_slice.mut_ptr(); + let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; + dispatcher_1_row( + dst_width, + src_width, + &weights, + unsafe_source_ptr_0, + unsafe_destination_ptr_0, + ); + }); + } + }); + } else { + let mut yy = 0usize; + if let Some(dispatcher) = dispatcher_4_rows { + while yy + 4 < destination.height { + dispatcher( + dst_width, + src_width, + &approx_weights, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + dst_stride, + ); + unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; + unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; + yy += 4; + } + } + + for _ in yy..destination.height { + dispatcher_1_row( + dst_width, + src_width, + &approx_weights, + unsafe_source_ptr_0, + unsafe_destination_ptr_0, + ); + unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; + unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; + } + } +} + +pub(crate) fn convolve_vertical_dispatch_u8<'a, const COMPONENTS: usize>( + image_store: &ImageStore, + filter_weights: FilterWeights, + destination: &mut ImageStore<'a, u8, COMPONENTS>, + pool: &Option, + dispatcher: fn(usize, &FilterBounds, *const u8, *mut u8, usize, *const i16), +) { + let approx_weights = filter_weights.numerical_approximation_i16::(0); + + let src_stride = image_store.width * image_store.channels; + let dst_stride = destination.width * image_store.channels; + + let dst_width = destination.width; + + if let Some(pool) = pool { + let arc_weights = 
Arc::new(approx_weights); + let borrowed = destination.buffer.borrow_mut(); + let unsafe_slice = UnsafeSlice::new(borrowed); + pool.scope(|scope| { + for y in 0..destination.height { + let weights = arc_weights.clone(); + scope.spawn(move |_| { + let bounds = unsafe { weights.bounds.get_unchecked(y) }; + let weight_ptr = + unsafe { weights.weights.as_ptr().add(weights.aligned_size * y) }; + let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); + let dst_ptr = unsafe_slice.mut_ptr(); + let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; + dispatcher( + dst_width, + bounds, + unsafe_source_ptr_0, + unsafe_destination_ptr_0, + src_stride, + weight_ptr, + ); + }); + } + }); + } else { + let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); + let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); + let mut filter_offset = 0usize; + for y in 0..destination.height { + let bounds = unsafe { approx_weights.bounds.get_unchecked(y) }; + let weight_ptr = unsafe { approx_weights.weights.as_ptr().add(filter_offset) }; + dispatcher( + dst_width, + bounds, + unsafe_source_ptr_0, + unsafe_destination_ptr_0, + src_stride, + weight_ptr, + ); + + filter_offset += approx_weights.aligned_size; + unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; + } + } +} diff --git a/src/filter_weights.rs b/src/filter_weights.rs index 0c1c5ab..2839e3d 100644 --- a/src/filter_weights.rs +++ b/src/filter_weights.rs @@ -7,6 +7,7 @@ use crate::chunking::chunked; +#[derive(Debug, Clone)] pub struct FilterWeights { pub weights: Vec, pub bounds: Vec, @@ -16,7 +17,7 @@ pub struct FilterWeights { pub coeffs_size: i32, } -#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] pub struct FilterBounds { pub start: usize, pub size: usize, diff --git a/src/image_size.rs b/src/image_size.rs index 1ada831..f2e3ec5 100644 --- a/src/image_size.rs +++ b/src/image_size.rs @@ -5,7 +5,7 @@ * // license that can be found in the LICENSE file. 
*/ -#[derive(Copy, Clone)] +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] pub struct ImageSize { pub width: usize, pub height: usize, diff --git a/src/lib.rs b/src/lib.rs index 050d4fe..2bcb529 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,50 +9,43 @@ mod acceleration_feature; mod alpha_handle; mod avx2_utils; mod chunking; +mod colors; mod convolution; -mod convolve_f32; -mod convolve_u8; +mod convolve_naive_f32; +mod convolve_naive_u8; +mod dispatch_group_f32; +mod dispatch_group_u8; mod filter_weights; mod image_size; mod image_store; -mod lab_scaler; -mod linear_precise_scaler; -mod linear_scaler; -mod luv_scaler; mod math; mod nearest_sampler; -mod neon_rgb_f32; -mod neon_rgb_u8; -mod neon_simd_u8; +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +mod neon; mod rgb_f32; mod rgb_u8; mod rgba_f32; mod rgba_u8; mod sampler; mod scaler; -mod sigmoidal_scaler; -mod sse_rgb_f32; -mod sse_rgb_u8; -mod sse_simd_u8; -mod sse_utils; +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +mod sse; mod support; mod threading_policy; mod unsafe_slice; -mod xyz_scaler; -mod lch_scaler; +pub use colors::LChScaler; +pub use colors::LabScaler; +pub use colors::LinearApproxScaler; +pub use colors::LinearScaler; +pub use colors::SigmoidalScaler; +pub use colors::XYZScaler; +pub use colors::*; pub use colorutils_rs::TransferFunction; pub use image_size::ImageSize; pub use image_store::ImageStore; -pub use lab_scaler::LabScaler; -pub use linear_precise_scaler::LinearScaler; -pub use linear_scaler::LinearApproxScaler; -pub use luv_scaler::*; pub use math::*; pub use sampler::*; pub use scaler::Scaler; pub use scaler::Scaling; -pub use sigmoidal_scaler::SigmoidalScaler; pub use threading_policy::*; -pub use xyz_scaler::XYZScaler; -pub use lch_scaler::LChScaler; diff --git a/src/convolve_f32.rs b/src/neon/convolve_f32.rs similarity index 70% rename from src/convolve_f32.rs rename to src/neon/convolve_f32.rs index 42d423b..0f92778 100644 --- a/src/convolve_f32.rs +++ b/src/neon/convolve_f32.rs @@ -1,73 +1,8 @@ -/* - * // Copyright (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. 
- */ - use crate::filter_weights::FilterBounds; -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +use crate::neon::utils::prefer_vfmaq_f32; use std::arch::aarch64::*; -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -#[inline(always)] -#[allow(dead_code)] -pub(crate) unsafe fn prefer_vfmaq_f32( - a: float32x4_t, - b: float32x4_t, - c: float32x4_t, -) -> float32x4_t { - #[cfg(target_arch = "aarch64")] - { - return vfmaq_f32(a, b, c); - } - #[cfg(target_arch = "arm")] - { - return vmlaq_f32(a, b, c); - } -} - -#[inline(always)] -#[allow(unused)] -pub(crate) unsafe fn convolve_vertical_part_f32( - start_y: usize, - start_x: usize, - src: *const f32, - src_stride: usize, - dst: *mut f32, - filter: *const f32, - bounds: &FilterBounds, -) { - let mut store: [[f32; CHANNELS]; PART] = [[0f32; CHANNELS]; PART]; - - for j in 0..bounds.size { - let py = start_y + j; - let weight = *unsafe { filter.add(j) }; - let src_ptr = src.add(src_stride * py); - for x in 0..PART { - let px = (start_x + x) * CHANNELS; - let s_ptr = src_ptr.add(px); - for c in 0..CHANNELS { - let store_p = store.get_unchecked_mut(x); - let store_v = store_p.get_unchecked_mut(c); - *store_v += unsafe { s_ptr.add(c).read_unaligned() } * weight; - } - } - } - - for x in 0..PART { - let px = (start_x + x) * CHANNELS; - let dst_ptr = dst.add(px); - for c in 0..CHANNELS { - let vl = *(*store.get_unchecked_mut(x)).get_unchecked_mut(c); - dst_ptr.add(c).write_unaligned(vl); - } - } -} - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] #[allow(unused)] -#[inline(always)] pub(crate) unsafe fn convolve_vertical_part_neon_16_f32( start_y: usize, start_x: usize, @@ -86,7 +21,7 @@ pub(crate) unsafe fn convolve_vertical_part_neon_16_f32( for j in 0..bounds.size { let py = start_y + j; - let weight = *unsafe { filter.add(j) }; + let weight = unsafe { filter.add(j).read_unaligned() }; let v_weight = vdupq_n_f32(weight); let src_ptr = src.add(src_stride * py); @@ -104,7 +39,6 @@ pub(crate) unsafe fn convolve_vertical_part_neon_16_f32( vst1q_f32_x4(dst_ptr, f_set); } -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] #[inline(always)] pub(crate) unsafe fn convolve_vertical_part_neon_8_f32( start_y: usize, @@ -152,7 +86,6 @@ pub(crate) unsafe fn convolve_vertical_part_neon_8_f32 } } -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] #[inline(always)] pub unsafe fn vtransposeq_f32(matrix: float32x4x4_t) -> float32x4x4_t { let row0 = matrix.0; @@ -172,7 +105,6 @@ pub unsafe fn vtransposeq_f32(matrix: float32x4x4_t) -> float32x4x4_t { return r; } -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] #[inline(always)] pub(crate) unsafe fn convolve_horizontal_parts_4_rgb_f32( start_x: usize, @@ -203,7 +135,6 @@ pub(crate) unsafe fn convolve_horizontal_parts_4_rgb_f32( acc } -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] #[inline(always)] pub(crate) unsafe fn convolve_horizontal_parts_one_rgb_f32( start_x: usize, @@ -226,7 +157,6 @@ pub(crate) unsafe fn convolve_horizontal_parts_one_rgb_f32( acc } -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] #[inline(always)] pub(crate) unsafe fn convolve_horizontal_parts_4_rgba_f32( start_x: usize, @@ -249,7 +179,6 @@ pub(crate) unsafe fn convolve_horizontal_parts_4_rgba_f32( acc } -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] #[inline(always)] pub(crate) unsafe fn convolve_horizontal_parts_one_rgba_f32( start_x: usize, diff --git a/src/neon/mod.rs b/src/neon/mod.rs new file mode 100644 index 
0000000..450bf00 --- /dev/null +++ b/src/neon/mod.rs @@ -0,0 +1,10 @@ +mod convolve_f32; +mod rgb_f32; +mod rgb_u8; +mod rgba_u8; +mod utils; + +pub use convolve_f32::*; +pub use rgb_f32::neon_convolve_floats::*; +pub use rgb_u8::neon_rgb::*; +pub use rgba_u8::*; diff --git a/src/neon/rgb_f32.rs b/src/neon/rgb_f32.rs new file mode 100644 index 0000000..d878c8b --- /dev/null +++ b/src/neon/rgb_f32.rs @@ -0,0 +1,448 @@ +/* + * // Copyright (c) the Radzivon Bartoshyk. All rights reserved. + * // + * // Use of this source code is governed by a BSD-style + * // license that can be found in the LICENSE file. + */ + +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +pub mod neon_convolve_floats { + use crate::filter_weights::{FilterBounds, FilterWeights}; + use crate::neon::*; + use std::arch::aarch64::*; + + pub fn convolve_horizontal_rgba_neon_row_one( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const f32, + unsafe_destination_ptr_0: *mut f32, + ) { + unsafe { + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store = vdupq_n_f32(0f32); + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = ptr.read_unaligned(); + let weight1 = ptr.add(1).read_unaligned(); + let weight2 = ptr.add(2).read_unaligned(); + let weight3 = ptr.add(3).read_unaligned(); + store = convolve_horizontal_parts_4_rgba_f32( + bounds.start, + unsafe_source_ptr_0, + weight0, + weight1, + weight2, + weight3, + store, + ); + jx += 4; + } + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = ptr.read_unaligned(); + store = convolve_horizontal_parts_one_rgba_f32( + bounds.start, + unsafe_source_ptr_0, + weight0, + store, + ); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); + vst1q_f32(dest_ptr, store); + + filter_offset += filter_weights.aligned_size; + } + } + } + + pub fn convolve_horizontal_rgba_neon_rows_4( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const f32, + src_stride: usize, + unsafe_destination_ptr_0: *mut f32, + dst_stride: usize, + ) { + unsafe { + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + let zeros = vdupq_n_f32(0f32); + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = zeros; + let mut store_1 = zeros; + let mut store_2 = zeros; + let mut store_3 = zeros; + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = ptr.read_unaligned(); + let weight1 = ptr.add(1).read_unaligned(); + let weight2 = ptr.add(2).read_unaligned(); + let weight3 = ptr.add(3).read_unaligned(); + store_0 = convolve_horizontal_parts_4_rgba_f32( + bounds.start, + unsafe_source_ptr_0, + weight0, + weight1, + weight2, + weight3, + store_0, + ); + store_1 = convolve_horizontal_parts_4_rgba_f32( + bounds.start, + unsafe_source_ptr_0.add(src_stride), + weight0, + weight1, + weight2, + weight3, + store_1, + ); + store_2 = convolve_horizontal_parts_4_rgba_f32( + bounds.start, + unsafe_source_ptr_0.add(src_stride * 2), + weight0, + weight1, + weight2, + weight3, + store_2, + ); + store_3 = convolve_horizontal_parts_4_rgba_f32( + bounds.start, + 
unsafe_source_ptr_0.add(src_stride * 3), + weight0, + weight1, + weight2, + weight3, + store_3, + ); + jx += 4; + } + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = ptr.read_unaligned(); + store_0 = convolve_horizontal_parts_one_rgba_f32( + bounds.start, + unsafe_source_ptr_0, + weight0, + store_0, + ); + store_1 = convolve_horizontal_parts_one_rgba_f32( + bounds.start, + unsafe_source_ptr_0.add(src_stride), + weight0, + store_1, + ); + store_2 = convolve_horizontal_parts_one_rgba_f32( + bounds.start, + unsafe_source_ptr_0.add(src_stride * 2), + weight0, + store_2, + ); + store_3 = convolve_horizontal_parts_one_rgba_f32( + bounds.start, + unsafe_source_ptr_0.add(src_stride * 3), + weight0, + store_3, + ); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); + vst1q_f32(dest_ptr, store_0); + + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + vst1q_f32(dest_ptr, store_1); + + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + vst1q_f32(dest_ptr, store_2); + + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + vst1q_f32(dest_ptr, store_3); + + filter_offset += filter_weights.aligned_size; + } + } + } + + pub fn convolve_horizontal_rgb_neon_rows_4_f32( + dst_width: usize, + src_width: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const f32, + src_stride: usize, + unsafe_destination_ptr_0: *mut f32, + dst_stride: usize, + ) { + unsafe { + const CHANNELS: usize = 3; + let mut filter_offset = 0usize; + + let zeros = vdupq_n_f32(0f32); + + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = zeros; + let mut store_1 = zeros; + let mut store_2 = zeros; + let mut store_3 = zeros; + + while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = vdupq_n_f32(ptr.read_unaligned()); + let weight1 = vdupq_n_f32(ptr.add(1).read_unaligned()); + let weight2 = vdupq_n_f32(ptr.add(2).read_unaligned()); + let weight3 = vdupq_n_f32(ptr.add(3).read_unaligned()); + store_0 = convolve_horizontal_parts_4_rgb_f32( + bounds_start, + unsafe_source_ptr_0, + weight0, + weight1, + weight2, + weight3, + store_0, + ); + store_1 = convolve_horizontal_parts_4_rgb_f32( + bounds_start, + unsafe_source_ptr_0.add(src_stride), + weight0, + weight1, + weight2, + weight3, + store_1, + ); + store_2 = convolve_horizontal_parts_4_rgb_f32( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 2), + weight0, + weight1, + weight2, + weight3, + store_2, + ); + store_3 = convolve_horizontal_parts_4_rgb_f32( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 3), + weight0, + weight1, + weight2, + weight3, + store_3, + ); + jx += 4; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let bounds_start = bounds.start + jx; + let weight0 = vdupq_n_f32(ptr.read_unaligned()); + store_0 = convolve_horizontal_parts_one_rgb_f32( + bounds_start, + unsafe_source_ptr_0, + weight0, + store_0, + ); + store_1 = convolve_horizontal_parts_one_rgb_f32( + bounds_start, + unsafe_source_ptr_0.add(src_stride), + weight0, + store_1, + ); + store_2 = convolve_horizontal_parts_one_rgb_f32( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 2), + weight0, + store_2, + ); + store_3 = convolve_horizontal_parts_one_rgb_f32( + bounds_start, + 
unsafe_source_ptr_0.add(src_stride * 3), + weight0, + store_3, + ); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); + let l1 = vgetq_lane_f32::<0>(store_0); + let l2 = vgetq_lane_f32::<1>(store_0); + let l3 = vgetq_lane_f32::<2>(store_0); + dest_ptr.write_unaligned(l1); + dest_ptr.add(1).write_unaligned(l2); + dest_ptr.add(2).write_unaligned(l3); + + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let l1 = vgetq_lane_f32::<0>(store_1); + let l2 = vgetq_lane_f32::<1>(store_1); + let l3 = vgetq_lane_f32::<2>(store_1); + dest_ptr.write_unaligned(l1); + dest_ptr.add(1).write_unaligned(l2); + dest_ptr.add(2).write_unaligned(l3); + + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let l1 = vgetq_lane_f32::<0>(store_2); + let l2 = vgetq_lane_f32::<1>(store_2); + let l3 = vgetq_lane_f32::<2>(store_2); + dest_ptr.write_unaligned(l1); + dest_ptr.add(1).write_unaligned(l2); + dest_ptr.add(2).write_unaligned(l3); + + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let l1 = vgetq_lane_f32::<0>(store_3); + let l2 = vgetq_lane_f32::<1>(store_3); + let l3 = vgetq_lane_f32::<2>(store_3); + dest_ptr.write_unaligned(l1); + dest_ptr.add(1).write_unaligned(l2); + dest_ptr.add(2).write_unaligned(l3); + + filter_offset += filter_weights.aligned_size; + } + } + } + + pub fn convolve_horizontal_rgb_neon_row_one_f32( + dst_width: usize, + src_width: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const f32, + unsafe_destination_ptr_0: *mut f32, + ) { + unsafe { + const CHANNELS: usize = 3; + let weights_ptr = filter_weights.weights.as_ptr(); + let mut filter_offset = 0usize; + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store = vdupq_n_f32(0f32); + + while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = vdupq_n_f32(ptr.read_unaligned()); + let weight1 = vdupq_n_f32(ptr.add(1).read_unaligned()); + let weight2 = vdupq_n_f32(ptr.add(2).read_unaligned()); + let weight3 = vdupq_n_f32(ptr.add(3).read_unaligned()); + store = convolve_horizontal_parts_4_rgb_f32( + bounds_start, + unsafe_source_ptr_0, + weight0, + weight1, + weight2, + weight3, + store, + ); + jx += 4; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = vdupq_n_f32(ptr.read_unaligned()); + store = convolve_horizontal_parts_one_rgb_f32( + bounds.start + jx, + unsafe_source_ptr_0, + weight0, + store, + ); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); + let l1 = vgetq_lane_f32::<0>(store); + let l2 = vgetq_lane_f32::<1>(store); + let l3 = vgetq_lane_f32::<2>(store); + dest_ptr.write_unaligned(l1); + dest_ptr.add(1).write_unaligned(l2); + dest_ptr.add(2).write_unaligned(l3); + + filter_offset += filter_weights.aligned_size; + } + } + } + + #[inline(always)] + pub(crate) fn convolve_vertical_rgb_neon_row_f32( + width: usize, + bounds: &FilterBounds, + unsafe_source_ptr_0: *const f32, + unsafe_destination_ptr_0: *mut f32, + src_stride: usize, + weight_ptr: *const f32, + ) { + let mut cx = 0usize; + let dst_width = width * CHANNELS; + + while cx + 16 < dst_width { + unsafe { + convolve_vertical_part_neon_16_f32( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + ); + } + + cx += 16; + } + + while cx + 8 < 
dst_width { + unsafe { + convolve_vertical_part_neon_8_f32::( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + 8, + ); + } + + cx += 8; + } + + let left = dst_width - cx; + + if left > 0 { + unsafe { + convolve_vertical_part_neon_8_f32::( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + left, + ); + } + } + } +} diff --git a/src/neon_rgb_u8.rs b/src/neon/rgb_u8.rs similarity index 68% rename from src/neon_rgb_u8.rs rename to src/neon/rgb_u8.rs index 571ecd9..1f913ed 100644 --- a/src/neon_rgb_u8.rs +++ b/src/neon/rgb_u8.rs @@ -8,11 +8,11 @@ #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] pub mod neon_rgb { use crate::filter_weights::{FilterBounds, FilterWeights}; - use crate::neon_simd_u8::neon_convolve_u8; + use crate::neon::utils::neon_convolve_u8; use crate::support::ROUNDING_APPROX; use std::arch::aarch64::*; - pub unsafe fn convolve_horizontal_rgb_neon_rows_4( + pub fn convolve_horizontal_rgb_neon_rows_4( dst_width: usize, src_width: usize, approx_weights: &FilterWeights, @@ -21,29 +21,29 @@ pub mod neon_rgb { unsafe_destination_ptr_0: *mut u8, dst_stride: usize, ) { - let shuf_table_1: [u8; 8] = [0, 1, 2, 255, 3, 4, 5, 255]; - let shuffle_1 = vld1_u8(shuf_table_1.as_ptr()); - let shuf_table_2: [u8; 8] = [6, 7, 8, 255, 9, 10, 11, 255]; - let shuffle_2 = vld1_u8(shuf_table_2.as_ptr()); - let shuffle = vcombine_u8(shuffle_1, shuffle_2); - - let mut filter_offset = 0usize; - let weights_ptr = approx_weights.weights.as_ptr(); - const CHANNELS: usize = 3; - let zeros = vdupq_n_s32(0i32); - let init = vdupq_n_s32(ROUNDING_APPROX); - for x in 0..dst_width { - let bounds = unsafe { approx_weights.bounds.get_unchecked(x) }; - let mut jx = 0usize; - let mut store_0 = init; - let mut store_1 = init; - let mut store_2 = init; - let mut store_3 = init; - - while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { - let bounds_start = bounds.start + jx; - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - unsafe { + unsafe { + let shuf_table_1: [u8; 8] = [0, 1, 2, 255, 3, 4, 5, 255]; + let shuffle_1 = vld1_u8(shuf_table_1.as_ptr()); + let shuf_table_2: [u8; 8] = [6, 7, 8, 255, 9, 10, 11, 255]; + let shuffle_2 = vld1_u8(shuf_table_2.as_ptr()); + let shuffle = vcombine_u8(shuffle_1, shuffle_2); + + let mut filter_offset = 0usize; + let weights_ptr = approx_weights.weights.as_ptr(); + const CHANNELS: usize = 3; + let zeros = vdupq_n_s32(0i32); + let init = vdupq_n_s32(ROUNDING_APPROX); + for x in 0..dst_width { + let bounds = approx_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = init; + let mut store_1 = init; + let mut store_2 = init; + let mut store_3 = init; + + while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vdup_n_s16(ptr.read_unaligned()); let weight1 = vdupq_n_s16(ptr.add(1).read_unaligned()); let weight2 = vdup_n_s16(ptr.add(2).read_unaligned()); @@ -88,14 +88,12 @@ pub mod neon_rgb { store_3, shuffle, ); + jx += 4; } - jx += 4; - } - while jx + 2 < bounds.size && bounds.start + jx + 3 < src_width { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let bounds_start = bounds.start + jx; - unsafe { + while jx + 2 < bounds.size && bounds.start + jx + 3 < src_width { + let ptr = weights_ptr.add(jx + filter_offset); + let bounds_start = bounds.start + jx; let weight0 = 
vdup_n_s16(ptr.read_unaligned()); let weight1 = vdupq_n_s16(ptr.add(1).read_unaligned()); store_0 = neon_convolve_u8::convolve_horizontal_parts_2_rgb( @@ -130,14 +128,12 @@ pub mod neon_rgb { store_3, shuffle_1, ); + jx += 2; } - jx += 2; - } - while jx < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let bounds_start = bounds.start + jx; - unsafe { + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let bounds_start = bounds.start + jx; let weight0 = vdup_n_s16(ptr.read_unaligned()); store_0 = neon_convolve_u8::convolve_horizontal_parts_one_rgb( bounds_start, @@ -163,93 +159,86 @@ pub mod neon_rgb { weight0, store_3, ); + jx += 1; } - jx += 1; - } - let store_16 = unsafe { vqshrun_n_s32::<12>(vmaxq_s32(store_0, zeros)) }; - let store_16_8 = unsafe { vqmovn_u16(vcombine_u16(store_16, store_16)) }; + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store_0, zeros)); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - unsafe { + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); let bytes = pixel.to_le_bytes(); dest_ptr.write_unaligned(bytes[0]); dest_ptr.add(1).write_unaligned(bytes[1]); dest_ptr.add(2).write_unaligned(bytes[2]); - } - let store_16 = unsafe { vqshrun_n_s32::<12>(vmaxq_s32(store_1, zeros)) }; - let store_16_8 = unsafe { vqmovn_u16(vcombine_u16(store_16, store_16)) }; + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store_1, zeros)); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride) }; - unsafe { let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); let bytes = pixel.to_le_bytes(); dest_ptr.write_unaligned(bytes[0]); dest_ptr.add(1).write_unaligned(bytes[1]); dest_ptr.add(2).write_unaligned(bytes[2]); - } - let store_16 = unsafe { vqshrun_n_s32::<12>(vmaxq_s32(store_2, zeros)) }; - let store_16_8 = unsafe { vqmovn_u16(vcombine_u16(store_16, store_16)) }; + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store_2, zeros)); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 2) }; - unsafe { + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); let bytes = pixel.to_le_bytes(); dest_ptr.write_unaligned(bytes[0]); dest_ptr.add(1).write_unaligned(bytes[1]); dest_ptr.add(2).write_unaligned(bytes[2]); - } - let store_16 = unsafe { vqshrun_n_s32::<12>(vmaxq_s32(store_3, zeros)) }; - let store_16_8 = unsafe { vqmovn_u16(vcombine_u16(store_16, store_16)) }; + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store_3, zeros)); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 3) }; - unsafe { + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); let bytes = pixel.to_le_bytes(); dest_ptr.write_unaligned(bytes[0]); dest_ptr.add(1).write_unaligned(bytes[1]); dest_ptr.add(2).write_unaligned(bytes[2]); - } - filter_offset += 
approx_weights.aligned_size; + filter_offset += approx_weights.aligned_size; + } } } - pub unsafe fn convolve_horizontal_rgb_neon_row_one( + pub fn convolve_horizontal_rgb_neon_row_one( dst_width: usize, src_width: usize, approx_weights: &FilterWeights, unsafe_source_ptr_0: *const u8, unsafe_destination_ptr_0: *mut u8, ) { - const CHANNELS: usize = 3; - let mut filter_offset = 0usize; - let zeros = vdupq_n_s32(0i32); - let weights_ptr = approx_weights.weights.as_ptr(); - - let shuf_table_1: [u8; 8] = [0, 1, 2, 255, 3, 4, 5, 255]; - let shuffle_1 = vld1_u8(shuf_table_1.as_ptr()); - let shuf_table_2: [u8; 8] = [6, 7, 8, 255, 9, 10, 11, 255]; - let shuffle_2 = vld1_u8(shuf_table_2.as_ptr()); - let shuffle = vcombine_u8(shuffle_1, shuffle_2); - - for x in 0..dst_width { - let bounds = unsafe { approx_weights.bounds.get_unchecked(x) }; - let mut jx = 0usize; - let mut store = vdupq_n_s32(ROUNDING_APPROX); - - while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { - let bounds_start = bounds.start + jx; - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - unsafe { + unsafe { + const CHANNELS: usize = 3; + let mut filter_offset = 0usize; + let zeros = vdupq_n_s32(0i32); + let weights_ptr = approx_weights.weights.as_ptr(); + + let shuf_table_1: [u8; 8] = [0, 1, 2, 255, 3, 4, 5, 255]; + let shuffle_1 = vld1_u8(shuf_table_1.as_ptr()); + let shuf_table_2: [u8; 8] = [6, 7, 8, 255, 9, 10, 11, 255]; + let shuffle_2 = vld1_u8(shuf_table_2.as_ptr()); + let shuffle = vcombine_u8(shuffle_1, shuffle_2); + + for x in 0..dst_width { + let bounds = approx_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store = vdupq_n_s32(ROUNDING_APPROX); + + while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vdup_n_s16(ptr.read_unaligned()); let weight1 = vdupq_n_s16(ptr.add(1).read_unaligned()); let weight2 = vdup_n_s16(ptr.add(2).read_unaligned()); @@ -264,14 +253,12 @@ pub mod neon_rgb { store, shuffle, ); + jx += 4; } - jx += 4; - } - while jx + 2 < bounds.size && bounds.start + jx + 3 < src_width { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let bounds_start = bounds.start + jx; - unsafe { + while jx + 2 < bounds.size && bounds.start + jx + 3 < src_width { + let ptr = weights_ptr.add(jx + filter_offset); + let bounds_start = bounds.start + jx; let weight0 = vdup_n_s16(ptr.read_unaligned()); let weight1 = vdupq_n_s16(ptr.add(1).read_unaligned()); store = neon_convolve_u8::convolve_horizontal_parts_2_rgb( @@ -282,13 +269,11 @@ pub mod neon_rgb { store, shuffle_1, ); + jx += 2; } - jx += 2; - } - while jx < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - unsafe { + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vdup_n_s16(ptr.read_unaligned()); store = neon_convolve_u8::convolve_horizontal_parts_one_rgb( bounds.start + jx, @@ -296,30 +281,28 @@ pub mod neon_rgb { weight0, store, ); + jx += 1; } - jx += 1; - } - let store_16 = unsafe { vqshrun_n_s32::<12>(vmaxq_s32(store, zeros)) }; - let store_16_8 = unsafe { vqmovn_u16(vcombine_u16(store_16, store_16)) }; + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store, zeros)); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - unsafe { + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); let pixel = 
vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); let bytes = pixel.to_le_bytes(); dest_ptr.write_unaligned(bytes[0]); dest_ptr.add(1).write_unaligned(bytes[1]); dest_ptr.add(2).write_unaligned(bytes[2]); - } - filter_offset += approx_weights.aligned_size; + filter_offset += approx_weights.aligned_size; + } } } #[inline(always)] - pub fn convolve_vertical_rgb_neon_row( - dst_width: usize, + pub fn convolve_vertical_rgb_neon_row( + width: usize, bounds: &FilterBounds, unsafe_source_ptr_0: *const u8, unsafe_destination_ptr_0: *mut u8, @@ -327,6 +310,7 @@ pub mod neon_rgb { weight_ptr: *const i16, ) { let mut cx = 0usize; + let dst_width = width * CHANNELS; while cx + 32 < dst_width { unsafe { neon_convolve_u8::convolve_vertical_part_neon_32( diff --git a/src/neon/rgba_u8.rs b/src/neon/rgba_u8.rs new file mode 100644 index 0000000..5abdc31 --- /dev/null +++ b/src/neon/rgba_u8.rs @@ -0,0 +1,261 @@ +use crate::filter_weights::FilterWeights; +use crate::neon::utils::neon_convolve_u8::{ + convolve_horizontal_parts_2_rgba, convolve_horizontal_parts_4_rgba, + convolve_horizontal_parts_one_rgba, +}; +use crate::support::ROUNDING_APPROX; +use std::arch::aarch64::*; + +pub fn convolve_horizontal_rgba_neon_rows_4_u8( + dst_width: usize, + _: usize, + approx_weights: &FilterWeights, + unsafe_source_ptr_0: *const u8, + src_stride: usize, + unsafe_destination_ptr_0: *mut u8, + dst_stride: usize, +) { + unsafe { + let mut filter_offset = 0usize; + let weights_ptr = approx_weights.weights.as_ptr(); + const CHANNELS: usize = 4; + let zeros = vdupq_n_s32(0i32); + let init = vdupq_n_s32(ROUNDING_APPROX); + for x in 0..dst_width { + let bounds = approx_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = init; + let mut store_1 = init; + let mut store_2 = init; + let mut store_3 = init; + + while jx + 4 < bounds.size { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = vdup_n_s16(ptr.read_unaligned()); + let weight1 = vdupq_n_s16(ptr.add(1).read_unaligned()); + let weight2 = vdup_n_s16(ptr.add(2).read_unaligned()); + let weight3 = vdupq_n_s16(ptr.add(3).read_unaligned()); + store_0 = convolve_horizontal_parts_4_rgba( + bounds_start, + unsafe_source_ptr_0, + weight0, + weight1, + weight2, + weight3, + store_0, + ); + store_1 = convolve_horizontal_parts_4_rgba( + bounds_start, + unsafe_source_ptr_0.add(src_stride), + weight0, + weight1, + weight2, + weight3, + store_1, + ); + store_2 = convolve_horizontal_parts_4_rgba( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 2), + weight0, + weight1, + weight2, + weight3, + store_2, + ); + store_3 = convolve_horizontal_parts_4_rgba( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 3), + weight0, + weight1, + weight2, + weight3, + store_3, + ); + jx += 4; + } + + while jx + 2 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let bounds_start = bounds.start + jx; + let weight0 = vdup_n_s16(ptr.read_unaligned()); + let weight1 = vdupq_n_s16(ptr.add(1).read_unaligned()); + store_0 = convolve_horizontal_parts_2_rgba( + bounds_start, + unsafe_source_ptr_0, + weight0, + weight1, + store_0, + ); + store_1 = convolve_horizontal_parts_2_rgba( + bounds_start, + unsafe_source_ptr_0.add(src_stride), + weight0, + weight1, + store_1, + ); + store_2 = convolve_horizontal_parts_2_rgba( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 2), + weight0, + weight1, + store_2, + ); + store_3 = convolve_horizontal_parts_2_rgba( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 
3), + weight0, + weight1, + store_3, + ); + jx += 2; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let bounds_start = bounds.start + jx; + let weight0 = vdup_n_s16(ptr.read_unaligned()); + store_0 = convolve_horizontal_parts_one_rgba( + bounds_start, + unsafe_source_ptr_0, + weight0, + store_0, + ); + store_1 = convolve_horizontal_parts_one_rgba( + bounds_start, + unsafe_source_ptr_0.add(src_stride), + weight0, + store_1, + ); + store_2 = convolve_horizontal_parts_one_rgba( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 2), + weight0, + store_2, + ); + store_3 = convolve_horizontal_parts_one_rgba( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 3), + weight0, + store_3, + ); + jx += 1; + } + + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store_0, zeros)); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); + let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); + let dest_ptr_32 = dest_ptr as *mut u32; + dest_ptr_32.write_unaligned(pixel); + + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store_1, zeros)); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); + let dest_ptr_32 = dest_ptr as *mut u32; + dest_ptr_32.write_unaligned(pixel); + + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store_2, zeros)); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); + let dest_ptr_32 = dest_ptr as *mut u32; + dest_ptr_32.write_unaligned(pixel); + + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store_3, zeros)); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); + let dest_ptr_32 = dest_ptr as *mut u32; + dest_ptr_32.write_unaligned(pixel); + + filter_offset += approx_weights.aligned_size; + } + } +} + +pub fn convolve_horizontal_rgba_neon_row( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const u8, + unsafe_destination_ptr_0: *mut u8, +) { + unsafe { + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store = vdupq_n_s32(ROUNDING_APPROX); + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = vdup_n_s16(ptr.read_unaligned()); + let weight1 = vdupq_n_s16(ptr.add(1).read_unaligned()); + let weight2 = vdup_n_s16(ptr.add(2).read_unaligned()); + let weight3 = vdupq_n_s16(ptr.add(3).read_unaligned()); + store = convolve_horizontal_parts_4_rgba( + bounds.start + jx, + unsafe_source_ptr_0, + weight0, + weight1, + weight2, + weight3, + store, + ); + jx += 4; + } + + while jx + 2 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let bounds_start = bounds.start + jx; + let weight0 = vdup_n_s16(ptr.read_unaligned()); + let weight1 = vdupq_n_s16(ptr.add(1).read_unaligned()); + store = convolve_horizontal_parts_2_rgba( + bounds_start, + unsafe_source_ptr_0, + weight0, + 
weight1, + store, + ); + jx += 2; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = vdup_n_s16(ptr.read_unaligned()); + store = convolve_horizontal_parts_one_rgba( + bounds.start + jx, + unsafe_source_ptr_0, + weight0, + store, + ); + jx += 1; + } + + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store, vdupq_n_s32(0i32))); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); + let value = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); + let dest_ptr_32 = dest_ptr as *mut u32; + dest_ptr_32.write_unaligned(value); + + filter_offset += filter_weights.aligned_size; + } + } +} diff --git a/src/neon_simd_u8.rs b/src/neon/utils.rs similarity index 92% rename from src/neon_simd_u8.rs rename to src/neon/utils.rs index 1289d72..14781c6 100644 --- a/src/neon_simd_u8.rs +++ b/src/neon/utils.rs @@ -5,6 +5,8 @@ * // license that can be found in the LICENSE file. */ +use std::arch::aarch64::{float32x4_t, vfmaq_f32}; + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] pub mod neon_convolve_u8 { use crate::filter_weights::FilterBounds; @@ -109,6 +111,25 @@ pub mod neon_convolve_u8 { acc } + #[inline(always)] + pub(crate) unsafe fn convolve_horizontal_parts_2_rgba( + start_x: usize, + src: *const u8, + weight0: int16x4_t, + weight1: int16x8_t, + store_0: int32x4_t, + ) -> int32x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.add(start_x * COMPONENTS); + + let rgb_pixel = vld1_u8(src_ptr); + let wide = vreinterpretq_s16_u16(vmovl_u8(rgb_pixel)); + + let acc = vmlal_high_s16(store_0, wide, weight1); + let acc = vmlal_s16(acc, vget_low_s16(wide), weight0); + acc + } + #[inline(always)] pub(crate) unsafe fn convolve_vertical_part_neon_32( start_y: usize, @@ -319,3 +340,19 @@ pub mod neon_convolve_u8 { acc } } + +#[inline(always)] +pub(crate) unsafe fn prefer_vfmaq_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, +) -> float32x4_t { + #[cfg(target_arch = "aarch64")] + { + return vfmaq_f32(a, b, c); + } + #[cfg(target_arch = "arm")] + { + return vmlaq_f32(a, b, c); + } +} diff --git a/src/neon_rgb_f32.rs b/src/neon_rgb_f32.rs deleted file mode 100644 index 6f36586..0000000 --- a/src/neon_rgb_f32.rs +++ /dev/null @@ -1,413 +0,0 @@ -/* - * // Copyright (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. 
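The integer NEON kernels above (including the new src/neon/rgba_u8.rs) all follow the same Q12 fixed-point scheme: weights are i16 values from numerical_approximation_i16::<12>, pixels are widened from u8 to i16, sums are accumulated in i32 starting from a rounding bias, and vmaxq_s32 plus vqshrun_n_s32::<12> clamp, shift and saturate the result back to u8. A minimal scalar sketch of one horizontal RGBA output pixel, assuming ROUNDING_APPROX is the half bias 1 << 11 (its value is not shown in this diff):

// Scalar reference for the Q12 pipeline used by the NEON kernels above.
// The ROUNDING constant is an assumption standing in for ROUNDING_APPROX.
fn horizontal_px_rgba_q12(src: &[u8], weights: &[i16], start_x: usize) -> [u8; 4] {
    const PRECISION: i32 = 12;
    const ROUNDING: i32 = 1 << (PRECISION - 1); // assumed value of ROUNDING_APPROX
    let mut out = [0u8; 4];
    for c in 0..4 {
        let mut acc = ROUNDING;
        for (j, &w) in weights.iter().enumerate() {
            acc += src[(start_x + j) * 4 + c] as i32 * w as i32;
        }
        // vmaxq_s32 + vqshrun_n_s32::<12> == clamp at zero, shift right, saturate to u8
        out[c] = (acc >> PRECISION).clamp(0, 255) as u8;
    }
    out
}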
- */ - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -pub mod neon_convolve_floats { - use crate::convolve_f32::{ - convolve_horizontal_parts_4_rgb_f32, convolve_horizontal_parts_4_rgba_f32, - convolve_horizontal_parts_one_rgb_f32, convolve_horizontal_parts_one_rgba_f32, - }; - use crate::filter_weights::FilterWeights; - use std::arch::aarch64::*; - - pub unsafe fn convolve_horizontal_rgba_neon_row_one( - dst_width: usize, - filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, - ) { - const CHANNELS: usize = 4; - let mut filter_offset = 0usize; - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let mut jx = 0usize; - let mut store = unsafe { vdupq_n_f32(0f32) }; - - while jx + 4 < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let weight0 = unsafe { ptr.read_unaligned() }; - let weight1 = unsafe { ptr.add(1).read_unaligned() }; - let weight2 = unsafe { ptr.add(2).read_unaligned() }; - let weight3 = unsafe { ptr.add(3).read_unaligned() }; - unsafe { - store = convolve_horizontal_parts_4_rgba_f32( - bounds.start, - unsafe_source_ptr_0, - weight0, - weight1, - weight2, - weight3, - store, - ); - } - jx += 4; - } - while jx < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let weight0 = unsafe { ptr.read_unaligned() }; - unsafe { - store = convolve_horizontal_parts_one_rgba_f32( - bounds.start, - unsafe_source_ptr_0, - weight0, - store, - ); - } - jx += 1; - } - - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - unsafe { - vst1q_f32(dest_ptr, store); - } - - filter_offset += filter_weights.aligned_size; - } - } - - pub unsafe fn convolve_horizontal_rgba_neon_rows_4( - dst_width: usize, - filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - src_stride: usize, - unsafe_destination_ptr_0: *mut f32, - dst_stride: usize, - ) { - const CHANNELS: usize = 4; - let mut filter_offset = 0usize; - let zeros = unsafe { vdupq_n_f32(0f32) }; - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let mut jx = 0usize; - let mut store_0 = zeros; - let mut store_1 = zeros; - let mut store_2 = zeros; - let mut store_3 = zeros; - - while jx + 4 < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let weight0 = unsafe { ptr.read_unaligned() }; - let weight1 = unsafe { ptr.add(1).read_unaligned() }; - let weight2 = unsafe { ptr.add(2).read_unaligned() }; - let weight3 = unsafe { ptr.add(3).read_unaligned() }; - unsafe { - store_0 = convolve_horizontal_parts_4_rgba_f32( - bounds.start, - unsafe_source_ptr_0, - weight0, - weight1, - weight2, - weight3, - store_0, - ); - store_1 = convolve_horizontal_parts_4_rgba_f32( - bounds.start, - unsafe_source_ptr_0.add(src_stride), - weight0, - weight1, - weight2, - weight3, - store_1, - ); - store_2 = convolve_horizontal_parts_4_rgba_f32( - bounds.start, - unsafe_source_ptr_0.add(src_stride * 2), - weight0, - weight1, - weight2, - weight3, - store_2, - ); - store_3 = convolve_horizontal_parts_4_rgba_f32( - bounds.start, - unsafe_source_ptr_0.add(src_stride * 3), - weight0, - weight1, - weight2, - weight3, - store_3, - ); - } - jx += 4; - } - while jx < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let weight0 = unsafe { ptr.read_unaligned() }; - unsafe { - 
store_0 = convolve_horizontal_parts_one_rgba_f32( - bounds.start, - unsafe_source_ptr_0, - weight0, - store_0, - ); - store_1 = convolve_horizontal_parts_one_rgba_f32( - bounds.start, - unsafe_source_ptr_0.add(src_stride), - weight0, - store_1, - ); - store_2 = convolve_horizontal_parts_one_rgba_f32( - bounds.start, - unsafe_source_ptr_0.add(src_stride * 2), - weight0, - store_2, - ); - store_3 = convolve_horizontal_parts_one_rgba_f32( - bounds.start, - unsafe_source_ptr_0.add(src_stride * 3), - weight0, - store_3, - ); - } - jx += 1; - } - - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - unsafe { - vst1q_f32(dest_ptr, store_0); - } - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride) }; - unsafe { - vst1q_f32(dest_ptr, store_1); - } - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 2) }; - unsafe { - vst1q_f32(dest_ptr, store_2); - } - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 3) }; - unsafe { - vst1q_f32(dest_ptr, store_3); - } - - filter_offset += filter_weights.aligned_size; - } - } - - pub unsafe fn convolve_horizontal_rgb_neon_rows_4( - dst_width: usize, - src_width: usize, - filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - src_stride: usize, - unsafe_destination_ptr_0: *mut f32, - dst_stride: usize, - ) { - const CHANNELS: usize = 3; - let mut filter_offset = 0usize; - - let zeros = unsafe { vdupq_n_f32(0f32) }; - - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let mut jx = 0usize; - let mut store_0 = zeros; - let mut store_1 = zeros; - let mut store_2 = zeros; - let mut store_3 = zeros; - - while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { - let bounds_start = bounds.start + jx; - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - unsafe { - let weight0 = vdupq_n_f32(ptr.read_unaligned()); - let weight1 = vdupq_n_f32(ptr.add(1).read_unaligned()); - let weight2 = vdupq_n_f32(ptr.add(2).read_unaligned()); - let weight3 = vdupq_n_f32(ptr.add(3).read_unaligned()); - store_0 = convolve_horizontal_parts_4_rgb_f32( - bounds_start, - unsafe_source_ptr_0, - weight0, - weight1, - weight2, - weight3, - store_0, - ); - store_1 = convolve_horizontal_parts_4_rgb_f32( - bounds_start, - unsafe_source_ptr_0.add(src_stride), - weight0, - weight1, - weight2, - weight3, - store_1, - ); - store_2 = convolve_horizontal_parts_4_rgb_f32( - bounds_start, - unsafe_source_ptr_0.add(src_stride * 2), - weight0, - weight1, - weight2, - weight3, - store_2, - ); - store_3 = convolve_horizontal_parts_4_rgb_f32( - bounds_start, - unsafe_source_ptr_0.add(src_stride * 3), - weight0, - weight1, - weight2, - weight3, - store_3, - ); - } - jx += 4; - } - - while jx < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - unsafe { - let bounds_start = bounds.start + jx; - let weight0 = vdupq_n_f32(ptr.read_unaligned()); - store_0 = convolve_horizontal_parts_one_rgb_f32( - bounds_start, - unsafe_source_ptr_0, - weight0, - store_0, - ); - store_1 = convolve_horizontal_parts_one_rgb_f32( - bounds_start, - unsafe_source_ptr_0.add(src_stride), - weight0, - store_1, - ); - store_2 = convolve_horizontal_parts_one_rgb_f32( - bounds_start, - unsafe_source_ptr_0.add(src_stride * 2), - weight0, - store_2, - ); - store_3 = convolve_horizontal_parts_one_rgb_f32( - bounds_start, - unsafe_source_ptr_0.add(src_stride * 3), - weight0, - store_3, - ); - } - jx 
+= 1; - } - - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - unsafe { - let l1 = vgetq_lane_f32::<0>(store_0); - let l2 = vgetq_lane_f32::<1>(store_0); - let l3 = vgetq_lane_f32::<2>(store_0); - dest_ptr.write_unaligned(l1); - dest_ptr.add(1).write_unaligned(l2); - dest_ptr.add(2).write_unaligned(l3); - } - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride) }; - unsafe { - let l1 = vgetq_lane_f32::<0>(store_1); - let l2 = vgetq_lane_f32::<1>(store_1); - let l3 = vgetq_lane_f32::<2>(store_1); - dest_ptr.write_unaligned(l1); - dest_ptr.add(1).write_unaligned(l2); - dest_ptr.add(2).write_unaligned(l3); - } - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 2) }; - unsafe { - let l1 = vgetq_lane_f32::<0>(store_2); - let l2 = vgetq_lane_f32::<1>(store_2); - let l3 = vgetq_lane_f32::<2>(store_2); - dest_ptr.write_unaligned(l1); - dest_ptr.add(1).write_unaligned(l2); - dest_ptr.add(2).write_unaligned(l3); - } - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 3) }; - unsafe { - let l1 = vgetq_lane_f32::<0>(store_3); - let l2 = vgetq_lane_f32::<1>(store_3); - let l3 = vgetq_lane_f32::<2>(store_3); - dest_ptr.write_unaligned(l1); - dest_ptr.add(1).write_unaligned(l2); - dest_ptr.add(2).write_unaligned(l3); - } - - filter_offset += filter_weights.aligned_size; - } - } - - pub unsafe fn convolve_horizontal_rgb_neon_row_one( - dst_width: usize, - filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, - ) { - const CHANNELS: usize = 3; - let weights_ptr = filter_weights.weights.as_ptr(); - let mut filter_offset = 0usize; - - for x in 0..dst_width { - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let mut jx = 0usize; - let mut store = unsafe { vdupq_n_f32(0f32) }; - - while jx + 4 < bounds.size && bounds.start + jx + 6 < dst_width { - let bounds_start = bounds.start + jx; - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - unsafe { - let weight0 = vdupq_n_f32(ptr.read_unaligned()); - let weight1 = vdupq_n_f32(ptr.add(1).read_unaligned()); - let weight2 = vdupq_n_f32(ptr.add(2).read_unaligned()); - let weight3 = vdupq_n_f32(ptr.add(3).read_unaligned()); - store = convolve_horizontal_parts_4_rgb_f32( - bounds_start, - unsafe_source_ptr_0, - weight0, - weight1, - weight2, - weight3, - store, - ); - } - jx += 4; - } - - while jx < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - unsafe { - let weight0 = vdupq_n_f32(ptr.read_unaligned()); - store = convolve_horizontal_parts_one_rgb_f32( - bounds.start + jx, - unsafe_source_ptr_0, - weight0, - store, - ); - } - jx += 1; - } - - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - unsafe { - let l1 = vgetq_lane_f32::<0>(store); - let l2 = vgetq_lane_f32::<1>(store); - let l3 = vgetq_lane_f32::<2>(store); - dest_ptr.write_unaligned(l1); - dest_ptr.add(1).write_unaligned(l2); - dest_ptr.add(2).write_unaligned(l3); - } - - filter_offset += filter_weights.aligned_size; - } - } -} diff --git a/src/rgb_f32.rs b/src/rgb_f32.rs index e81bd47..0a0668f 100644 --- a/src/rgb_f32.rs +++ b/src/rgb_f32.rs @@ -5,461 +5,20 @@ * // license that can be found in the LICENSE file. 
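The f32 kernels that take over from the deleted neon_rgb_f32 module accumulate with fused multiply-add, which is what the prefer_vfmaq_f32 helper added to src/neon/utils.rs above is for. A minimal sketch of that accumulation step for one weighted RGBA f32 pixel; the function name and signature here are illustrative, not the crate's API:

// Illustrative only: one tap of an f32 horizontal convolution, RGBA layout.
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
unsafe fn mac_rgba_f32(
    src: *const f32,                           // base of the source row
    px: usize,                                 // pixel index of this tap
    weight: f32,                               // filter weight for this tap
    acc: std::arch::aarch64::float32x4_t,
) -> std::arch::aarch64::float32x4_t {
    use std::arch::aarch64::*;
    let pixel = vld1q_f32(src.add(px * 4));    // load R, G, B, A
    vfmaq_f32(acc, pixel, vdupq_n_f32(weight)) // acc + pixel * weight, fused
}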
*/ -use crate::acceleration_feature::AccelerationFeature; +use rayon::ThreadPool; + use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; -use crate::convolve_f32::*; +use crate::convolve_naive_f32::*; +use crate::dispatch_group_f32::{convolve_horizontal_dispatch_f32, convolve_vertical_dispatch_f32}; use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::image_store::ImageStore; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon_rgb_f32::neon_convolve_floats; -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::sse_rgb_f32::sse_convolve_f32::*; -use crate::unsafe_slice::UnsafeSlice; -use rayon::ThreadPool; -use std::sync::Arc; - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -#[inline(always)] -fn convolve_horizontal_neon( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - let dst_width = destination.width; - let src_width = image_store.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - let mut yy = 0usize; - for y in (0..destination.height.saturating_sub(4)).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - neon_convolve_floats::convolve_horizontal_rgb_neon_rows_4( - dst_width, - src_width, - &weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - }); - yy = y; - } - for y in yy..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - neon_convolve_floats::convolve_horizontal_rgb_neon_row_one( - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - }); - } - }); - } else { - let mut yy = 0usize; - while yy + 4 < destination.height { - unsafe { - neon_convolve_floats::convolve_horizontal_rgb_neon_rows_4( - dst_width, - src_width, - &filter_weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; - yy += 4; - } - - for _ in yy..destination.height { - unsafe { - neon_convolve_floats::convolve_horizontal_rgb_neon_row_one( - dst_width, - &filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -#[inline(always)] -fn convolve_horizontal_rgb_native_row( - dst_width: usize, - filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - 
unsafe_destination_ptr_0: *mut f32, -) { - let mut filter_offset = 0usize; - let weights_ptr = filter_weights.weights.as_ptr(); - - const CHANNELS: usize = 3; - for x in 0..dst_width { - let mut sum_r = 0f32; - let mut sum_g = 0f32; - let mut sum_b = 0f32; - - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let start_x = bounds.start; - for j in 0..bounds.size { - let px = (start_x + j) * CHANNELS; - let weight = unsafe { weights_ptr.add(j + filter_offset).read_unaligned() }; - let src = unsafe { unsafe_source_ptr_0.add(px) }; - sum_r += unsafe { src.read_unaligned() } * weight; - sum_g += unsafe { src.add(1).read_unaligned() } * weight; - sum_b += unsafe { src.add(2).read_unaligned() } * weight; - } - - let px = x * CHANNELS; - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - - unsafe { - dest_ptr.write_unaligned(sum_r); - dest_ptr.add(1).write_unaligned(sum_g); - dest_ptr.add(2).write_unaligned(sum_b); - } - - filter_offset += filter_weights.aligned_size; - } -} - -fn convolve_horizontal_native( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - - let dst_width = destination.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - convolve_horizontal_rgb_native_row( - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - }); - } - }); - } else { - for _ in 0..destination.height { - convolve_horizontal_rgb_native_row( - dst_width, - &filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -#[inline(always)] -fn convolve_vertical_rgb_native_row( - total_width: usize, - src_stride: usize, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, - weight_ptr: *const f32, - bounds: &FilterBounds, -) { - let mut cx = 0usize; - - while cx + 16 < total_width { - unsafe { - convolve_vertical_part_neon_16_f32( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - ); - } - - cx += 16; - } - while cx + 8 < total_width { - unsafe { - convolve_vertical_part_neon_8_f32::( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - 8, - ); - } - - cx += 8; - } - - let left = total_width - cx; - - if left > 0 { - unsafe { - convolve_vertical_part_neon_8_f32::( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - left, - ); - } - } -} - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -fn convolve_vertical_neon( - 
image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let src_stride = image_store.width * image_store.channels; - let mut filter_offset = 0usize; - let dst_stride = destination.width * image_store.channels; - let total_width = destination.width * image_store.channels; - - if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - let bounds = unsafe { weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { weights.weights.as_ptr().add(filter_offset) }; - convolve_vertical_rgb_native_row( - total_width, - src_stride, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - ); - }); - } - }); - } else { - for y in 0..destination.height { - let bounds = unsafe { filter_weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { filter_weights.weights.as_ptr().add(filter_offset) }; - - convolve_vertical_rgb_native_row( - total_width, - src_stride, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - ); - - filter_offset += filter_weights.aligned_size; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - +use crate::neon::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -#[inline(always)] -fn convolve_horizontal_rgb_sse_row( - total_width: usize, - src_stride: usize, - bounds: &FilterBounds, - weight_ptr: *const f32, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, -) { - let mut cx = 0usize; - - while cx + 16 < total_width { - unsafe { - convolve_vertical_part_sse_16_f32( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - ); - } - - cx += 16; - } - - while cx + 8 < total_width { - unsafe { - convolve_vertical_part_sse_8_f32( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - ); - } - - cx += 8; - } - - while cx + 4 < total_width { - unsafe { - convolve_vertical_part_sse_4_f32( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - ); - } - - cx += 4; - } - - while cx < total_width { - unsafe { - convolve_vertical_part_sse_f32( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - ); - } - cx += 1; - } -} - -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -pub(crate) fn convolve_vertical_sse_rgb_f32( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - let total_width = destination.width * image_store.channels; - - if let Some(pool) = pool { - let arc_weights = 
Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - let filter_offset = y * weights.aligned_size; - let bounds = unsafe { weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { weights.weights.as_ptr().add(filter_offset) }; - convolve_horizontal_rgb_sse_row( - total_width, - src_stride, - &bounds, - weight_ptr, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - }); - } - }); - } else { - let mut filter_offset = 0usize; - for y in 0..destination.height { - let bounds = unsafe { filter_weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { filter_weights.weights.as_ptr().add(filter_offset) }; - - convolve_horizontal_rgb_sse_row( - total_width, - src_stride, - &bounds, - weight_ptr, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - - filter_offset += filter_weights.aligned_size; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} +use crate::sse::sse_convolve_f32::convolve_vertical_rgb_sse_row_f32; #[inline(always)] -fn convolve_vertical_rgb_native_row_f32( +pub(crate) fn convolve_vertical_rgb_native_row_f32( dst_width: usize, bounds: &FilterBounds, unsafe_source_ptr_0: *const f32, @@ -517,67 +76,6 @@ fn convolve_vertical_rgb_native_row_f32( } } -pub(crate) fn convolve_vertical_native_f32( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - - let mut filter_offset = 0usize; - - let dst_stride = destination.width * image_store.channels; - let dst_width = destination.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let bounds = unsafe { weights.bounds.get_unchecked(y) }; - let weight_ptr = - unsafe { weights.weights.as_ptr().add(weights.aligned_size * y) }; - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - convolve_vertical_rgb_native_row_f32::( - dst_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - }); - } - }); - } else { - for y in 0..destination.height { - let bounds = unsafe { filter_weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { filter_weights.weights.as_ptr().add(filter_offset) }; - - convolve_vertical_rgb_native_row_f32::( - dst_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - - filter_offset += filter_weights.aligned_size; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - impl<'a> HorizontalConvolutionPass for ImageStore<'a, f32, 3> { #[inline(always)] fn convolve_horizontal( @@ -586,22 +84,34 @@ impl<'a> HorizontalConvolutionPass 
for ImageStore<'a, f32, 3> { destination: &mut ImageStore, pool: &Option, ) { - let mut _using_feature = AccelerationFeature::Native; + let mut _dispatcher_4_rows: Option< + fn(usize, usize, &FilterWeights, *const f32, usize, *mut f32, usize), + > = Some(convolve_horizontal_rgba_4_row_f32::<3>); + let mut _dispatcher_row: fn(usize, usize, &FilterWeights, *const f32, *mut f32) = + convolve_horizontal_rgb_native_row::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - _using_feature = AccelerationFeature::Neon; - } - match _using_feature { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - AccelerationFeature::Neon => { - convolve_horizontal_neon(self, filter_weights, destination, pool); - } - AccelerationFeature::Native => { - convolve_horizontal_native(self, filter_weights, destination, pool); - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - AccelerationFeature::Sse => {} - } + _dispatcher_4_rows = Some(convolve_horizontal_rgb_neon_rows_4_f32); + _dispatcher_row = convolve_horizontal_rgb_neon_row_one_f32; + } + // #[cfg(all( + // any(target_arch = "x86_64", target_arch = "x86"), + // target_feature = "sse4.1" + // ))] + // { + // if is_x86_feature_detected!("sse4.1") { + // _dispatcher_4_rows = Some(convolve_horizontal_rgba_sse_rows_4_f32); + // _dispatcher_row = convolve_horizontal_rgb_sse_row_f32; + // } + // } + convolve_horizontal_dispatch_f32( + self, + filter_weights, + destination, + pool, + _dispatcher_4_rows, + _dispatcher_row, + ); } } @@ -612,12 +122,11 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f32, 3> { destination: &mut ImageStore, pool: &Option, ) { - #[allow(unused_assignments)] - #[allow(unused_mut)] - let mut using_feature = AccelerationFeature::Native; + let mut _dispatcher: fn(usize, &FilterBounds, *const f32, *mut f32, usize, *const f32) = + convolve_vertical_rgb_native_row_f32::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - using_feature = AccelerationFeature::Neon; + _dispatcher = convolve_vertical_rgb_neon_row_f32::<3>; } #[cfg(all( any(target_arch = "x86_64", target_arch = "x86"), @@ -625,21 +134,9 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f32, 3> { ))] { if is_x86_feature_detected!("sse4.1") { - using_feature = AccelerationFeature::Sse; - } - } - match using_feature { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - AccelerationFeature::Neon => { - convolve_vertical_neon(self, filter_weights, destination, pool); - } - AccelerationFeature::Native => { - convolve_vertical_native_f32(self, filter_weights, destination, pool); - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - AccelerationFeature::Sse => { - convolve_vertical_sse_rgb_f32(self, filter_weights, destination, pool); + _dispatcher = convolve_vertical_rgb_sse_row_f32; } } + convolve_vertical_dispatch_f32(self, filter_weights, destination, pool, _dispatcher); } } diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs index b1c9dc2..d680d0a 100644 --- a/src/rgb_u8.rs +++ b/src/rgb_u8.rs @@ -5,432 +5,24 @@ * // license that can be found in the LICENSE file. 
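The convolution passes above (and the u8 ones below) now share one shape: pick a portable row kernel as the default, let cfg blocks and is_x86_feature_detected! swap in the NEON or SSE variant, and hand the chosen function pointers to a common convolve_*_dispatch_* routine instead of matching on an AccelerationFeature enum. A minimal sketch of the pattern with made-up kernel names:

// Sketch only: the kernel signature is simplified and the override names are illustrative.
type RowKernel = fn(dst: &mut [f32], src: &[f32], weights: &[f32]);

fn native_row(dst: &mut [f32], src: &[f32], weights: &[f32]) {
    // Portable fallback: each output is a weighted sum over a window of the source row.
    for (x, d) in dst.iter_mut().enumerate() {
        *d = weights
            .iter()
            .enumerate()
            .map(|(j, &w)| src.get(x + j).copied().unwrap_or(0.0) * w)
            .sum();
    }
}

fn resolve_row_kernel() -> RowKernel {
    let mut kernel: RowKernel = native_row;
    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
    {
        // kernel = neon_row; // compile-time override when NEON is available
    }
    #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), target_feature = "sse4.1"))]
    {
        if is_x86_feature_detected!("sse4.1") {
            // kernel = sse_row; // runtime-detected override
        }
    }
    kernel
}

In the actual diff each pass carries two such pointers: an Option-wrapped kernel that handles four rows per call plus a single-row kernel for the remainder, so the dispatcher can batch rows without duplicating the per-architecture selection logic.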
*/ -use rayon::ThreadPool; -use std::sync::Arc; - -use crate::acceleration_feature::AccelerationFeature; use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; -use crate::convolve_u8::*; +use crate::convolve_naive_u8::*; +use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::image_store::ImageStore; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon_rgb_u8::neon_rgb::*; +use crate::neon::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::sse_rgb_u8::sse_rgb::*; -use crate::support::{PRECISION, ROUNDING_APPROX}; -use crate::unsafe_slice::UnsafeSlice; - +use crate::sse::sse_rgb::{ + convolve_horizontal_rgb_sse_row_one, convolve_horizontal_rgb_sse_rows_4, + convolve_vertical_rgb_sse_row, +}; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -fn convolve_horizontal_rgb_sse( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let approx_weights = filter_weights.numerical_approximation_i16::<12>(0); - - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - - if let Some(pool) = pool { - let arc_weights = Arc::new(approx_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - let destination_height = destination.height; - let dst_width = destination.width; - pool.scope(|scope| { - let mut yy = 0usize; - while yy + 4 < destination_height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * yy) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * yy) }; - unsafe { - convolve_horizontal_rgb_sse_rows_4( - image_store.width, - dst_width, - &weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - }); - yy += 4; - } - for y in (yy..destination.height).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - convolve_horizontal_rgb_sse_row_one( - image_store.width, - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - }); - } - }); - } else { - let mut yy = 0usize; - - while yy + 4 < destination.height { - unsafe { - convolve_horizontal_rgb_sse_rows_4( - image_store.width, - destination.width, - &approx_weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; - yy += 4; - } - - for _ in yy..destination.height { - unsafe { - convolve_horizontal_rgb_sse_row_one( - image_store.width, - destination.width, - &approx_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { 
unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -fn convolve_horizontal_rgb_neon( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let approx_weights = filter_weights.numerical_approximation_i16::<12>(0); - - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - let dst_width = destination.width; - let src_width = image_store.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(approx_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - let mut yy = 0usize; - for y in (0..destination.height.saturating_sub(4)).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - convolve_horizontal_rgb_neon_rows_4( - dst_width, - src_width, - &weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - }); - yy = y; - } - for y in (yy..destination.height).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - convolve_horizontal_rgb_neon_row_one( - dst_width, - src_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - }); - } - }); - } else { - let mut yy = 0usize; - while yy + 4 < destination.height { - unsafe { - convolve_horizontal_rgb_neon_rows_4( - dst_width, - src_width, - &approx_weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; - - yy += 4; - } - - for _ in yy..destination.height { - unsafe { - convolve_horizontal_rgb_neon_row_one( - dst_width, - src_width, - &approx_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -fn convolve_horizontal_rgb_native_row( - dst_width: usize, - filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const u8, - unsafe_destination_ptr_0: *mut u8, -) { - const CHANNELS: usize = 3; - let mut filter_offset = 0usize; - let weights_ptr = filter_weights.weights.as_ptr(); - for x in 0..dst_width { - let mut sum_r = ROUNDING_APPROX; - let mut sum_g = ROUNDING_APPROX; - let mut sum_b = ROUNDING_APPROX; - - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let start_x = bounds.start; - for j in 0..bounds.size { - let px = (start_x + j) * CHANNELS; - let weight = unsafe { weights_ptr.add(j + filter_offset).read_unaligned() } as i32; - let src = unsafe { unsafe_source_ptr_0.add(px) }; - sum_r += unsafe { src.read_unaligned() } as i32 * weight; - sum_g += unsafe { 
src.add(1).read_unaligned() } as i32 * weight; - sum_b += unsafe { src.add(2).read_unaligned() } as i32 * weight; - } - - let px = x * CHANNELS; - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - - unsafe { - dest_ptr.write_unaligned((sum_r >> PRECISION).min(255).max(0) as u8); - dest_ptr - .add(1) - .write_unaligned((sum_g >> PRECISION).min(255).max(0) as u8); - dest_ptr - .add(2) - .write_unaligned((sum_b >> PRECISION).min(255).max(0) as u8); - } - - filter_offset += filter_weights.aligned_size; - } -} - -fn convolve_horizontal_rgb_native( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let approx_weights = filter_weights.numerical_approximation_i16::<12>(0); - - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - let dst_width = destination.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(approx_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - convolve_horizontal_rgb_native_row( - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - }); - } - }); - } else { - for _ in 0..destination.height { - convolve_horizontal_rgb_native_row( - destination.width, - &approx_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -pub(crate) fn convolve_vertical_rgb_sse_8( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let approx_weights = filter_weights.numerical_approximation_i16::<12>(0); - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let src_stride = image_store.width * image_store.channels; - let mut filter_offset = 0usize; - let dst_stride = destination.width * image_store.channels; - let total_width = destination.width * image_store.channels; - - if let Some(pool) = pool { - let arc_weights = Arc::new(approx_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let bounds = unsafe { weights.bounds.get_unchecked(y) }; - let weight_ptr = - unsafe { weights.weights.as_ptr().add(weights.aligned_size * y) }; - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - convolve_vertical_rgb_sse_row( - total_width, - &bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - }); - } - }); - } else { - for y in 0..destination.height { - let 
bounds = unsafe { approx_weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { approx_weights.weights.as_ptr().add(filter_offset) }; - convolve_vertical_rgb_sse_row( - total_width, - &bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - filter_offset += approx_weights.aligned_size; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -#[inline(always)] -pub(crate) fn convolve_vertical_rgb_neon( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let approx_weights = filter_weights.numerical_approximation_i16::<12>(0); - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let src_stride = image_store.width * image_store.channels; - let mut filter_offset = 0usize; - let dst_stride = destination.width * image_store.channels; - let total_width = destination.width * image_store.channels; - - if let Some(pool) = pool { - let arc_weights = Arc::new(approx_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let bounds = unsafe { weights.bounds.get_unchecked(y) }; - let weight_ptr = - unsafe { weights.weights.as_ptr().add(weights.aligned_size * y) }; - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - convolve_vertical_rgb_neon_row( - total_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - }); - } - }); - } else { - for y in 0..destination.height { - let bounds = unsafe { approx_weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { approx_weights.weights.as_ptr().add(filter_offset) }; - convolve_vertical_rgb_neon_row( - total_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - filter_offset += approx_weights.aligned_size; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} +use crate::sse::*; +use rayon::ThreadPool; #[inline(always)] -pub(crate) fn convolve_vertical_rgb_native_row( +pub(crate) fn convolve_vertical_rgb_native_row_u8( dst_width: usize, bounds: &FilterBounds, unsafe_source_ptr_0: *const u8, @@ -441,7 +33,7 @@ pub(crate) fn convolve_vertical_rgb_native_row( let mut cx = 0usize; while cx + 12 < dst_width { unsafe { - convolve_vertical_part::<12, 3>( + convolve_vertical_part::<12, COMPONENTS>( bounds.start, cx, unsafe_source_ptr_0, @@ -457,7 +49,7 @@ pub(crate) fn convolve_vertical_rgb_native_row( while cx + 8 < dst_width { unsafe { - convolve_vertical_part::<8, 3>( + convolve_vertical_part::<8, COMPONENTS>( bounds.start, cx, unsafe_source_ptr_0, @@ -473,7 +65,7 @@ pub(crate) fn convolve_vertical_rgb_native_row( while cx < dst_width { unsafe { - convolve_vertical_part::<1, 3>( + convolve_vertical_part::<1, COMPONENTS>( bounds.start, cx, unsafe_source_ptr_0, @@ -488,67 +80,6 @@ pub(crate) fn convolve_vertical_rgb_native_row( } } -#[inline(always)] -pub(crate) fn convolve_vertical_rgb_native_8<'a, const COMPONENTS: usize>( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore<'a, u8, 
COMPONENTS>, - pool: &Option, -) { - let approx_weights = filter_weights.numerical_approximation_i16::<12>(0); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - - let dst_width = destination.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(approx_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let bounds = unsafe { weights.bounds.get_unchecked(y) }; - let weight_ptr = - unsafe { weights.weights.as_ptr().add(weights.aligned_size * y) }; - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - convolve_vertical_rgb_native_row( - dst_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - }); - } - }); - } else { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let mut filter_offset = 0usize; - for y in 0..destination.height { - let bounds = unsafe { approx_weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { approx_weights.weights.as_ptr().add(filter_offset) }; - convolve_vertical_rgb_native_row( - dst_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - - filter_offset += approx_weights.aligned_size; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - impl<'a> HorizontalConvolutionPass for ImageStore<'a, u8, 3> { fn convolve_horizontal( &self, @@ -556,33 +87,35 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, u8, 3> { destination: &mut ImageStore, pool: &Option, ) { - let mut _using_feature = AccelerationFeature::Native; + let mut _dispatcher_4_rows: Option< + fn(usize, usize, &FilterWeights, *const u8, usize, *mut u8, usize), + > = None; + let mut _dispatcher_1_row: fn(usize, usize, &FilterWeights, *const u8, *mut u8) = + convolve_horizontal_rgba_native_row::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - _using_feature = AccelerationFeature::Neon; + _dispatcher_4_rows = Some(convolve_horizontal_rgb_neon_rows_4); + _dispatcher_1_row = convolve_horizontal_rgb_neon_row_one; } #[cfg(all( any(target_arch = "x86_64", target_arch = "x86"), target_feature = "sse4.1" ))] { + _dispatcher_4_rows = Some(convolve_horizontal_rgba_native_4_row::<3>); if is_x86_feature_detected!("sse4.1") { - _using_feature = AccelerationFeature::Sse; - } - } - match _using_feature { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - AccelerationFeature::Neon => { - convolve_horizontal_rgb_neon(self, filter_weights, destination, pool); - } - AccelerationFeature::Native => { - convolve_horizontal_rgb_native(self, filter_weights, destination, pool); - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - AccelerationFeature::Sse => { - convolve_horizontal_rgb_sse(self, filter_weights, destination, pool); - } - } + _dispatcher_4_rows = Some(convolve_horizontal_rgb_sse_rows_4); + _dispatcher_1_row = convolve_horizontal_rgb_sse_row_one; + } + } + convolve_horizontal_dispatch_u8( + self, + filter_weights, + destination, + pool, + _dispatcher_4_rows, + _dispatcher_1_row, + ); } } @@ -593,10 +126,17 @@ impl<'a> VerticalConvolutionPass 
for ImageStore<'a, u8, 3> { destination: &mut ImageStore, pool: &Option, ) { - let mut _using_feature = AccelerationFeature::Native; + let mut _dispatcher: fn( + dst_width: usize, + bounds: &FilterBounds, + unsafe_source_ptr_0: *const u8, + unsafe_destination_ptr_0: *mut u8, + src_stride: usize, + weight_ptr: *const i16, + ) = convolve_vertical_rgb_native_row_u8::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - _using_feature = AccelerationFeature::Neon; + _dispatcher = convolve_vertical_rgb_neon_row::<3>; } #[cfg(all( any(target_arch = "x86_64", target_arch = "x86"), @@ -604,21 +144,9 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, u8, 3> { ))] { if is_x86_feature_detected!("sse4.1") { - _using_feature = AccelerationFeature::Sse; - } - } - match _using_feature { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - AccelerationFeature::Neon => { - convolve_vertical_rgb_neon(self, filter_weights, destination, pool); - } - AccelerationFeature::Native => { - convolve_vertical_rgb_native_8(self, filter_weights, destination, pool); - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - AccelerationFeature::Sse => { - convolve_vertical_rgb_sse_8(self, filter_weights, destination, pool); + _dispatcher = convolve_vertical_rgb_sse_row; } } + convolve_vertical_dispatch_u8(self, filter_weights, destination, pool, _dispatcher); } } diff --git a/src/rgba_f32.rs b/src/rgba_f32.rs index ac3779d..c92f2ba 100644 --- a/src/rgba_f32.rs +++ b/src/rgba_f32.rs @@ -5,426 +5,21 @@ * // license that can be found in the LICENSE file. */ -use std::sync::Arc; - use rayon::ThreadPool; -use crate::acceleration_feature::AccelerationFeature; use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::convolve_f32::*; +use crate::convolve_naive_f32::{convolve_horizontal_rgb_native_row, convolve_horizontal_rgba_4_row_f32}; +use crate::dispatch_group_f32::{convolve_horizontal_dispatch_f32, convolve_vertical_dispatch_f32}; use crate::filter_weights::*; -use crate::rgb_f32::convolve_vertical_native_f32; -use crate::unsafe_slice::UnsafeSlice; use crate::ImageStore; - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -#[inline(always)] -fn convolve_horizontal_rgba_f32_neon( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - let dst_width = destination.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - let mut yy = 0usize; - for y in (0..destination.height.saturating_sub(4)).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - crate::neon_rgb_f32::neon_convolve_floats::convolve_horizontal_rgba_neon_rows_4( - dst_width, - &weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - }); - yy = y; - } - for 
y in (yy..destination.height).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - crate::neon_rgb_f32::neon_convolve_floats::convolve_horizontal_rgba_neon_row_one( - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - }); - } - }); - } else { - let mut yy = 0usize; - - while yy + 4 < destination.height { - unsafe { - crate::neon_rgb_f32::neon_convolve_floats::convolve_horizontal_rgba_neon_rows_4( - dst_width, - &filter_weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; - - yy += 4; - } - - for _ in yy..destination.height { - unsafe { - crate::neon_rgb_f32::neon_convolve_floats::convolve_horizontal_rgba_neon_row_one( - dst_width, - &filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - +use crate::neon::*; +use crate::rgb_f32::convolve_vertical_rgb_native_row_f32; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -#[inline(always)] -fn convolve_horizontal_rgba_f32_sse( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - let dst_width = destination.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - let mut yy = 0usize; - for y in (0..destination.height.saturating_sub(4)).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - crate::sse_rgb_f32::sse_convolve_f32::convolve_horizontal_rgba_sse_rows_4( - dst_width, - &weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - }); - yy = y; - } - for y in (yy..destination.height).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - crate::sse_rgb_f32::sse_convolve_f32::convolve_horizontal_rgba_sse_row_one( - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - }); - } - }); - } else { - let mut yy = 0usize; - - while yy + 4 < destination.height { - unsafe { - crate::sse_rgb_f32::sse_convolve_f32::convolve_horizontal_rgba_sse_rows_4( - dst_width, - &filter_weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, 
- dst_stride, - ); - } - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; - - yy += 4; - } - - for _ in yy..destination.height { - unsafe { - crate::sse_rgb_f32::sse_convolve_f32::convolve_horizontal_rgba_sse_row_one( - dst_width, - &filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -#[inline(always)] -fn convolve_horizontal_rgb_native_row( - dst_width: usize, - filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, -) { - const CHANNELS: usize = 4; - let weights_ptr = filter_weights.weights.as_ptr(); - let mut filter_offset = 0usize; - - for x in 0..dst_width { - let mut sum_r = 0f32; - let mut sum_g = 0f32; - let mut sum_b = 0f32; - let mut sum_a = 0f32; - - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let start_x = bounds.start; - for j in 0..bounds.size { - let px = (start_x + j) * CHANNELS; - let weight = unsafe { weights_ptr.add(j + filter_offset).read_unaligned() }; - let src = unsafe { unsafe_source_ptr_0.add(px) }; - sum_r += unsafe { src.read_unaligned() } * weight; - sum_g += unsafe { src.add(1).read_unaligned() } * weight; - sum_b += unsafe { src.add(2).read_unaligned() } * weight; - sum_a += unsafe { src.add(3).read_unaligned() } * weight; - } - - let px = x * CHANNELS; - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - - unsafe { - dest_ptr.write_unaligned(sum_r); - dest_ptr.add(1).write_unaligned(sum_g); - dest_ptr.add(2).write_unaligned(sum_b); - dest_ptr.add(3).write_unaligned(sum_a); - } - - filter_offset += filter_weights.aligned_size; - } -} - -fn convolve_horizontal_rgba_f32_native( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - - let dst_width = destination.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - convolve_horizontal_rgb_native_row( - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - }); - } - }); - } else { - for _ in 0..destination.height { - convolve_horizontal_rgb_native_row( - dst_width, - &filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -#[inline(always)] -fn convolve_vertical_rgb_native_row( - total_width: usize, - src_stride: usize, - unsafe_source_ptr_0: *const f32, - 
unsafe_destination_ptr_0: *mut f32, - weight_ptr: *const f32, - bounds: &FilterBounds, -) { - let mut cx = 0usize; - - while cx + 16 < total_width { - unsafe { - convolve_vertical_part_neon_16_f32( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - ); - } - - cx += 16; - } - while cx + 8 < total_width { - unsafe { - convolve_vertical_part_neon_8_f32::( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - 8, - ); - } - - cx += 8; - } - - let left = total_width - cx; - - if left > 0 { - unsafe { - convolve_vertical_part_neon_8_f32::( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - left, - ); - } - } -} - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -fn convolve_vertical_rgba_f32_neon( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - let total_width = destination.width * image_store.channels; - - if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - let filter_offset = y * weights.aligned_size; - let bounds = unsafe { weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { weights.weights.as_ptr().add(filter_offset) }; - - convolve_vertical_rgb_native_row( - total_width, - src_stride, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - weight_ptr, - &bounds, - ); - }); - } - }); - } else { - let mut filter_offset = 0usize; - for y in 0..destination.height { - let bounds = unsafe { filter_weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { filter_weights.weights.as_ptr().add(filter_offset) }; - - convolve_vertical_rgb_native_row( - total_width, - src_stride, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - weight_ptr, - &bounds, - ); - - filter_offset += filter_weights.aligned_size; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} +use crate::sse::sse_convolve_f32::{ + convolve_vertical_rgb_sse_row_f32, convolve_horizontal_rgba_sse_row_one_f32, + convolve_horizontal_rgba_sse_rows_4_f32, +}; impl<'a> HorizontalConvolutionPass for ImageStore<'a, f32, 4> { #[inline(always)] @@ -434,12 +29,15 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f32, 4> { destination: &mut ImageStore, pool: &Option, ) { - #[allow(unused_assignments)] - #[allow(unused_mut)] - let mut using_feature = AccelerationFeature::Native; + let mut _dispatcher_4_rows: Option< + fn(usize, usize, &FilterWeights, *const f32, usize, *mut f32, usize), + > = Some(convolve_horizontal_rgba_4_row_f32::<4>); + let mut _dispatcher_row: fn(usize, usize, &FilterWeights, *const f32, *mut f32) = + convolve_horizontal_rgb_native_row::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - using_feature = 
AccelerationFeature::Neon; + _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4); + _dispatcher_row = convolve_horizontal_rgba_neon_row_one; } #[cfg(all( any(target_arch = "x86_64", target_arch = "x86"), @@ -447,22 +45,18 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f32, 4> { ))] { if is_x86_feature_detected!("sse4.1") { - using_feature = AccelerationFeature::Sse; - } - } - match using_feature { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - AccelerationFeature::Neon => { - convolve_horizontal_rgba_f32_neon(self, filter_weights, destination, pool); - } - AccelerationFeature::Native => { - convolve_horizontal_rgba_f32_native(self, filter_weights, destination, pool); - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - AccelerationFeature::Sse => { - convolve_horizontal_rgba_f32_sse(self, filter_weights, destination, pool); - } - } + _dispatcher_4_rows = Some(convolve_horizontal_rgba_sse_rows_4_f32); + _dispatcher_row = convolve_horizontal_rgba_sse_row_one_f32; + } + } + convolve_horizontal_dispatch_f32( + self, + filter_weights, + destination, + pool, + _dispatcher_4_rows, + _dispatcher_row, + ); } } @@ -473,12 +67,11 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f32, 4> { destination: &mut ImageStore, pool: &Option, ) { - #[allow(unused_assignments)] - #[allow(unused_mut)] - let mut using_feature = AccelerationFeature::Native; + let mut _dispatcher: fn(usize, &FilterBounds, *const f32, *mut f32, usize, *const f32) = + convolve_vertical_rgb_native_row_f32::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - using_feature = AccelerationFeature::Neon; + _dispatcher = convolve_vertical_rgb_neon_row_f32::<4>; } #[cfg(all( any(target_arch = "x86_64", target_arch = "x86"), @@ -486,26 +79,9 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f32, 4> { ))] { if is_x86_feature_detected!("sse4.1") { - using_feature = AccelerationFeature::Sse; - } - } - match using_feature { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - AccelerationFeature::Neon => { - convolve_vertical_rgba_f32_neon(self, filter_weights, destination, pool); - } - AccelerationFeature::Native => { - convolve_vertical_native_f32(self, filter_weights, destination, pool); - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - AccelerationFeature::Sse => { - crate::rgb_f32::convolve_vertical_sse_rgb_f32( - self, - filter_weights, - destination, - pool, - ); + _dispatcher = convolve_vertical_rgb_sse_row_f32; } } + convolve_vertical_dispatch_f32(self, filter_weights, destination, pool, _dispatcher); } } diff --git a/src/rgba_u8.rs b/src/rgba_u8.rs index 855b505..47bde4d 100644 --- a/src/rgba_u8.rs +++ b/src/rgba_u8.rs @@ -5,346 +5,22 @@ * // license that can be found in the LICENSE file. 
*/ -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use std::arch::aarch64::*; -use std::sync::Arc; - use rayon::ThreadPool; -use crate::acceleration_feature::AccelerationFeature; use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; -use crate::filter_weights::FilterWeights; +use crate::convolve_naive_u8::convolve_horizontal_rgba_native_row; +use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; +use crate::filter_weights::{FilterBounds, FilterWeights}; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon_simd_u8::*; +use crate::neon::*; use crate::rgb_u8::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::sse_rgb_u8::sse_rgb::*; -use crate::support::{PRECISION, ROUNDING_APPROX}; -use crate::unsafe_slice::UnsafeSlice; +use crate::sse::sse_rgb::{ + convolve_horizontal_rgba_sse_rows_4, convolve_horizontal_rgba_sse_rows_one, + convolve_vertical_rgb_sse_row, +}; use crate::ImageStore; -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -fn convolve_horizontal_rgba_sse( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let approx_weights = filter_weights.numerical_approximation_i16::<12>(0); - - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - let dst_width = destination.width; - - let mut yy = 0usize; - - if let Some(pool) = pool { - let arc_weights = Arc::new(approx_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - let destination_height = destination.height; - let dst_width = destination.width; - pool.scope(|scope| { - let mut yy = 0usize; - for y in (0..destination_height.saturating_sub(4)).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - convolve_horizontal_rgba_sse_rows_4( - dst_width, - &weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - }); - yy = y; - } - for y in (yy..destination.height).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - convolve_horizontal_rgba_sse_rows_one( - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - }); - } - }); - } else { - while yy < destination.height.saturating_sub(4) { - unsafe { - convolve_horizontal_rgba_sse_rows_4( - dst_width, - &approx_weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; - - yy += 4; - } - - for _ in yy..destination.height { - unsafe { - convolve_horizontal_rgba_sse_rows_one( - dst_width, - &approx_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - - 
unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -pub unsafe fn convolve_horizontal_rgba_neon_row( - dst_width: usize, - approx_weights: &FilterWeights, - unsafe_source_ptr_0: *const u8, - unsafe_destination_ptr_0: *mut u8, -) { - const CHANNELS: usize = 4; - let mut filter_offset = 0usize; - - let weights_ptr = approx_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = unsafe { approx_weights.bounds.get_unchecked(x) }; - let mut jx = 0usize; - let mut store = unsafe { vdupq_n_s32(ROUNDING_APPROX) }; - - while jx + 4 < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - unsafe { - let weight0 = vdup_n_s16(ptr.read_unaligned()); - let weight1 = vdupq_n_s16(ptr.add(1).read_unaligned()); - let weight2 = vdup_n_s16(ptr.add(2).read_unaligned()); - let weight3 = vdupq_n_s16(ptr.add(3).read_unaligned()); - store = neon_convolve_u8::convolve_horizontal_parts_4_rgba( - bounds.start + jx, - unsafe_source_ptr_0, - weight0, - weight1, - weight2, - weight3, - store, - ); - } - jx += 4; - } - - while jx < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - unsafe { - let weight0 = vdup_n_s16(ptr.read_unaligned()); - store = neon_convolve_u8::convolve_horizontal_parts_one_rgba( - bounds.start + jx, - unsafe_source_ptr_0, - weight0, - store, - ); - } - jx += 1; - } - - let store_16 = unsafe { vqshrun_n_s32::<12>(vmaxq_s32(store, vdupq_n_s32(0i32))) }; - let store_16_8 = unsafe { vqmovn_u16(vcombine_u16(store_16, store_16)) }; - - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - let value = unsafe { vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)) }; - let dest_ptr_32 = dest_ptr as *mut u32; - unsafe { - dest_ptr_32.write_unaligned(value); - } - - filter_offset += approx_weights.aligned_size; - } -} - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -fn convolve_horizontal_rgba_neon( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let approx_weights = filter_weights.numerical_approximation_i16::<12>(0); - - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - - let dst_width = destination.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(approx_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - let destination_height = destination.height; - let dst_width = destination.width; - pool.scope(|_| { - let weights = arc_weights.clone(); - for y in 0..destination_height { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - convolve_horizontal_rgba_neon_row( - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - } - }); - } else { - for _ in 0..destination.height { - unsafe { - convolve_horizontal_rgba_neon_row( - dst_width, - &approx_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - - unsafe_source_ptr_0 = unsafe { 
unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -fn convolve_horizontal_rgba_native_row( - dst_width: usize, - filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const u8, - unsafe_destination_ptr_0: *mut u8, -) { - const CHANNELS: usize = 4; - let mut filter_offset = 0usize; - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let mut sum_r = ROUNDING_APPROX; - let mut sum_g = ROUNDING_APPROX; - let mut sum_b = ROUNDING_APPROX; - let mut sum_a = ROUNDING_APPROX; - - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let start_x = bounds.start; - for j in 0..bounds.size { - let px = (start_x + j) * CHANNELS; - let weight = unsafe { weights_ptr.add(j + filter_offset).read_unaligned() } as i32; - let src = unsafe { unsafe_source_ptr_0.add(px) }; - sum_r += unsafe { src.read_unaligned() } as i32 * weight; - sum_g += unsafe { src.add(1).read_unaligned() } as i32 * weight; - sum_b += unsafe { src.add(2).read_unaligned() } as i32 * weight; - sum_a += unsafe { src.add(3).read_unaligned() } as i32 * weight; - } - - let px = x * CHANNELS; - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - - unsafe { - dest_ptr.write_unaligned((sum_r >> PRECISION).min(255).max(0) as u8); - dest_ptr - .add(1) - .write_unaligned((sum_g >> PRECISION).min(255).max(0) as u8); - dest_ptr - .add(2) - .write_unaligned((sum_b >> PRECISION).min(255).max(0) as u8); - dest_ptr - .add(3) - .write_unaligned((sum_a >> PRECISION).min(255).max(0) as u8); - } - - filter_offset += filter_weights.aligned_size; - } -} - -fn convolve_horizontal_rgba_native( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - _pool: &Option, -) { - let approx_weights = filter_weights.numerical_approximation_i16::<12>(0); - - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let dst_width = destination.width; - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - - if let Some(pool) = _pool { - let arc_weights = Arc::new(approx_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - convolve_horizontal_rgba_native_row( - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - }); - } - }); - } else { - for _ in 0..destination.height { - convolve_horizontal_rgba_native_row( - dst_width, - &approx_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - impl<'a> HorizontalConvolutionPass for ImageStore<'a, u8, 4> { fn convolve_horizontal( &self, @@ -352,33 +28,35 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, u8, 4> { destination: &mut ImageStore, _pool: &Option, ) { - let mut _using_feature = AccelerationFeature::Native; + let mut _dispatcher_4_rows: Option< + fn(usize, usize, &FilterWeights, *const u8, 
usize, *mut u8, usize), + > = None; + let mut _dispatcher_1_row: fn(usize, usize, &FilterWeights, *const u8, *mut u8) = + convolve_horizontal_rgba_native_row::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - _using_feature = AccelerationFeature::Neon; + _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4_u8); + _dispatcher_1_row = convolve_horizontal_rgba_neon_row; } #[cfg(all( any(target_arch = "x86_64", target_arch = "x86"), target_feature = "sse4.1" ))] { + _dispatcher_4_rows = Some(convolve_horizontal_rgba_native_4_row::<4>); if is_x86_feature_detected!("sse4.1") { - _using_feature = AccelerationFeature::Sse; - } - } - match _using_feature { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - AccelerationFeature::Neon => { - convolve_horizontal_rgba_neon(self, filter_weights, destination, _pool); - } - AccelerationFeature::Native => { - convolve_horizontal_rgba_native(self, filter_weights, destination, _pool); - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - AccelerationFeature::Sse => { - convolve_horizontal_rgba_sse(self, filter_weights, destination, _pool); + _dispatcher_4_rows = Some(convolve_horizontal_rgba_sse_rows_4); + _dispatcher_1_row = convolve_horizontal_rgba_sse_rows_one; } } + convolve_horizontal_dispatch_u8( + self, + filter_weights, + destination, + _pool, + _dispatcher_4_rows, + _dispatcher_1_row, + ); } } @@ -389,10 +67,17 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, u8, 4> { fn convolve_vertical( &self, filter_weights: FilterWeights, destination: &mut ImageStore, pool: &Option, ) { - let mut _using_feature = AccelerationFeature::Native; + let mut _dispatcher: fn( + dst_width: usize, + bounds: &FilterBounds, + unsafe_source_ptr_0: *const u8, + unsafe_destination_ptr_0: *mut u8, + src_stride: usize, + weight_ptr: *const i16, + ) = convolve_vertical_rgb_native_row_u8::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - _using_feature = AccelerationFeature::Neon; + _dispatcher = convolve_vertical_rgb_neon_row::<4>; } #[cfg(all( any(target_arch = "x86_64", target_arch = "x86"), @@ -400,21 +85,9 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, u8, 4> { ))] { if is_x86_feature_detected!("sse4.1") { - _using_feature = AccelerationFeature::Sse; - } - } - match _using_feature { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - AccelerationFeature::Neon => { - convolve_vertical_rgb_neon(self, filter_weights, destination, pool); - } - AccelerationFeature::Native => { - convolve_vertical_rgb_native_8(self, filter_weights, destination, pool); - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - AccelerationFeature::Sse => { - convolve_vertical_rgb_sse_8(self, filter_weights, destination, pool); + _dispatcher = convolve_vertical_rgb_sse_row; } } + convolve_vertical_dispatch_u8(self, filter_weights, destination, pool, _dispatcher); } } diff --git a/src/scaler.rs b/src/scaler.rs index 9b838ea..75ea329 100644 --- a/src/scaler.rs +++ b/src/scaler.rs @@ -28,7 +28,7 @@ pub trait Scaling { fn resize_rgb(&self, new_size: ImageSize, store: ImageStore) -> ImageStore; /// Performs rescaling for RGB f32, channel order does not matter fn resize_rgb_f32(&self, new_size: ImageSize, store: ImageStore) -> ImageStore; - /// Performs rescaling for RGBA, for pre-multiplying alpha, converting to LUV, LAB alpha must be last channel + /// Performs rescaling for RGBA; when pre-multiplying alpha or converting to LUV or LAB, the alpha channel must be last fn resize_rgba( &self, new_size: ImageSize, diff --git a/src/sse/mod.rs b/src/sse/mod.rs new 
file mode 100644 index 0000000..0c4fc30 --- /dev/null +++ b/src/sse/mod.rs @@ -0,0 +1,9 @@ +mod rgb_f32; +mod rgb_u8; +mod simd_u8; +mod utils; + +pub use rgb_f32::*; +pub use rgb_u8::*; +pub use simd_u8::*; +pub use utils::*; diff --git a/src/sse_rgb_f32.rs b/src/sse/rgb_f32.rs similarity index 69% rename from src/sse_rgb_f32.rs rename to src/sse/rgb_f32.rs index dd1a3bc..eda46b8 100644 --- a/src/sse_rgb_f32.rs +++ b/src/sse/rgb_f32.rs @@ -52,34 +52,35 @@ pub mod sse_convolve_f32 { acc } - pub unsafe fn convolve_horizontal_rgba_sse_rows_4( + pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32( dst_width: usize, + _: usize, filter_weights: &FilterWeights, unsafe_source_ptr_0: *const f32, src_stride: usize, unsafe_destination_ptr_0: *mut f32, dst_stride: usize, ) { - const CHANNELS: usize = 4; - let mut filter_offset = 0usize; - let zeros = unsafe { _mm_setzero_ps() }; - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let mut jx = 0usize; - let mut store_0 = zeros; - let mut store_1 = zeros; - let mut store_2 = zeros; - let mut store_3 = zeros; - - while jx + 4 < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let weight0 = unsafe { _mm_set1_ps(ptr.read_unaligned()) }; - let weight1 = unsafe { _mm_set1_ps(ptr.add(1).read_unaligned()) }; - let weight2 = unsafe { _mm_set1_ps(ptr.add(2).read_unaligned()) }; - let weight3 = unsafe { _mm_set1_ps(ptr.add(3).read_unaligned()) }; - unsafe { + unsafe { + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + let zeros = _mm_setzero_ps(); + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = zeros; + let mut store_1 = zeros; + let mut store_2 = zeros; + let mut store_3 = zeros; + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = _mm_set1_ps(ptr.read_unaligned()); + let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned()); + let weight2 = _mm_set1_ps(ptr.add(2).read_unaligned()); + let weight3 = _mm_set1_ps(ptr.add(3).read_unaligned()); store_0 = convolve_horizontal_parts_4_rgba_f32( bounds.start, unsafe_source_ptr_0, @@ -116,13 +117,11 @@ pub mod sse_convolve_f32 { weight3, store_3, ); + jx += 4; } - jx += 4; - } - while jx < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let weight0 = unsafe { _mm_set1_ps(ptr.read_unaligned()) }; - unsafe { + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = _mm_set1_ps(ptr.read_unaligned()); store_0 = convolve_horizontal_parts_one_rgba_f32( bounds.start, unsafe_source_ptr_0, @@ -147,57 +146,50 @@ pub mod sse_convolve_f32 { weight0, store_3, ); + jx += 1; } - jx += 1; - } - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - unsafe { + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); _mm_storeu_ps(dest_ptr, store_0); - } - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride) }; - unsafe { + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); _mm_storeu_ps(dest_ptr, store_1); - } - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 2) }; - unsafe { + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); _mm_storeu_ps(dest_ptr, store_2); - } - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 3) }; - unsafe { + 
let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); _mm_storeu_ps(dest_ptr, store_3); - } - filter_offset += filter_weights.aligned_size; + filter_offset += filter_weights.aligned_size; + } } } - pub unsafe fn convolve_horizontal_rgba_sse_row_one( + pub fn convolve_horizontal_rgba_sse_row_one_f32( dst_width: usize, + _: usize, filter_weights: &FilterWeights, unsafe_source_ptr_0: *const f32, unsafe_destination_ptr_0: *mut f32, ) { - const CHANNELS: usize = 4; - let mut filter_offset = 0usize; - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let mut jx = 0usize; - let mut store = unsafe { _mm_setzero_ps() }; - - while jx + 4 < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let weight0 = unsafe { _mm_set1_ps(ptr.read_unaligned()) }; - let weight1 = unsafe { _mm_set1_ps(ptr.add(1).read_unaligned()) }; - let weight2 = unsafe { _mm_set1_ps(ptr.add(2).read_unaligned()) }; - let weight3 = unsafe { _mm_set1_ps(ptr.add(3).read_unaligned()) }; - unsafe { + unsafe { + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store = _mm_setzero_ps(); + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = _mm_set1_ps(ptr.read_unaligned()); + let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned()); + let weight2 = _mm_set1_ps(ptr.add(2).read_unaligned()); + let weight3 = _mm_set1_ps(ptr.add(3).read_unaligned()); store = convolve_horizontal_parts_4_rgba_f32( bounds.start, unsafe_source_ptr_0, @@ -207,30 +199,26 @@ pub mod sse_convolve_f32 { weight3, store, ); + jx += 4; } - jx += 4; - } - while jx < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let weight0 = unsafe { _mm_set1_ps(ptr.read_unaligned()) }; - unsafe { + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = _mm_set1_ps(ptr.read_unaligned()); store = convolve_horizontal_parts_one_rgba_f32( bounds.start, unsafe_source_ptr_0, weight0, store, ); + jx += 1; } - jx += 1; - } - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - unsafe { + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); _mm_storeu_ps(dest_ptr, store); - } - filter_offset += filter_weights.aligned_size; + filter_offset += filter_weights.aligned_size; + } } } @@ -374,7 +362,81 @@ pub mod sse_convolve_f32 { dst_ptr.write_unaligned(f32::from_bits(_mm_extract_ps::<0>(store_0) as u32)); } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + #[inline(always)] + pub(crate) fn convolve_vertical_rgb_sse_row_f32( + dst_width: usize, + bounds: &FilterBounds, + unsafe_source_ptr_0: *const f32, + unsafe_destination_ptr_0: *mut f32, + src_stride: usize, + weight_ptr: *const f32, + ) { + let mut cx = 0usize; + + while cx + 16 < dst_width { + unsafe { + convolve_vertical_part_sse_16_f32( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + ); + } + + cx += 16; + } + + while cx + 8 < dst_width { + unsafe { + convolve_vertical_part_sse_8_f32( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + ); + } + + cx += 8; + } + + while cx + 4 < dst_width { + unsafe { + convolve_vertical_part_sse_4_f32( + 
bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + ); + } + + cx += 4; + } + + while cx < dst_width { + unsafe { + convolve_vertical_part_sse_f32( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + ); + } + cx += 1; + } + } + #[cfg(not(target_feature = "fma"))] #[inline] #[allow(dead_code)] @@ -382,7 +444,6 @@ pub mod sse_convolve_f32 { return _mm_add_ps(_mm_mul_ps(b, c), a); } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[cfg(target_feature = "fma")] #[inline] #[allow(dead_code)] diff --git a/src/sse/rgb_u8.rs b/src/sse/rgb_u8.rs new file mode 100644 index 0000000..515e882 --- /dev/null +++ b/src/sse/rgb_u8.rs @@ -0,0 +1,678 @@ +/* + * // Copyright (c) the Radzivon Bartoshyk. All rights reserved. + * // + * // Use of this source code is governed by a BSD-style + * // license that can be found in the LICENSE file. + */ + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +pub mod sse_rgb { + use crate::filter_weights::{FilterBounds, FilterWeights}; + use crate::sse::sse_convolve_u8; + use crate::support::ROUNDING_APPROX; + #[cfg(target_arch = "x86")] + use std::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use std::arch::x86_64::*; + + pub(crate) fn convolve_horizontal_rgba_sse_rows_4( + dst_width: usize, + _: usize, + approx_weights: &FilterWeights, + unsafe_source_ptr_0: *const u8, + src_stride: usize, + unsafe_destination_ptr_0: *mut u8, + dst_stride: usize, + ) { + unsafe { + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + let weights_ptr = approx_weights.weights.as_ptr(); + + #[rustfmt::skip] + let shuffle_lo =_mm_setr_epi8(0, -1, + 4, -1, + 1, -1, + 5, -1, + 2, -1 , + 6,-1, + 3, -1, + 7, -1); + + #[rustfmt::skip] + let shuffle_hi =_mm_setr_epi8(8, -1, + 12, -1, + 9, -1, + 13, -1 , + 10,-1, + 14, -1, + 11, -1, + 15, -1); + + let vld = _mm_set1_epi32(ROUNDING_APPROX); + + for x in 0..dst_width { + let bounds = unsafe { approx_weights.bounds.get_unchecked(x) }; + let mut jx = 0usize; + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned()); + let weight23 = _mm_set1_epi32((ptr.add(2) as *const i32).read_unaligned()); + let start_bounds = bounds.start + jx; + + let src_ptr = unsafe_source_ptr_0.add(start_bounds * CHANNELS); + let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i); + + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo, weight01)); + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi, weight23)); + + let rgb_pixel = _mm_loadu_si128(src_ptr.add(src_stride) as *const __m128i); + + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo, weight01)); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi, weight23)); + + let rgb_pixel = _mm_loadu_si128(src_ptr.add(src_stride * 2) as *const __m128i); + + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi, weight23)); + + let rgb_pixel = _mm_loadu_si128(src_ptr.add(src_stride * 3) as *const __m128i); + + 
let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi, weight23)); + jx += 4; + } + + while jx + 2 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let bounds_start = bounds.start + jx; + + let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned()); + let src_ptr = unsafe_source_ptr_0.add(bounds_start * CHANNELS); + let rgb_pixel = _mm_loadu_si64(src_ptr); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo, weight01)); + + let rgb_pixel = _mm_loadu_si64(src_ptr.add(src_stride)); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo, weight01)); + + let rgb_pixel = _mm_loadu_si64(src_ptr.add(src_stride * 2)); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo, weight01)); + + let rgb_pixel = _mm_loadu_si64(src_ptr.add(src_stride * 3)); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo, weight01)); + jx += 2; + } + + while jx < bounds.size { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + let weight0 = _mm_set1_epi32(ptr.read_unaligned() as i32); + let start_bounds = bounds.start + jx; + store_0 = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse( + start_bounds, + unsafe_source_ptr_0, + weight0, + store_0, + ); + store_1 = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse( + start_bounds, + unsafe_source_ptr_0.add(src_stride), + weight0, + store_1, + ); + store_2 = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse( + start_bounds, + unsafe_source_ptr_0.add(src_stride * 2), + weight0, + store_2, + ); + store_3 = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse( + start_bounds, + unsafe_source_ptr_0.add(src_stride * 3), + weight0, + store_3, + ); + jx += 1; + } + let store_16_8 = sse_convolve_u8::compress_i32(store_0); + let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) }; + + let px = x * CHANNELS; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; + let dest_ptr_32 = dest_ptr as *mut i32; + dest_ptr_32.write_unaligned(pixel); + + let store_16_8 = sse_convolve_u8::compress_i32(store_1); + let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) }; + + let px = x * CHANNELS; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride) }; + let dest_ptr_32 = dest_ptr as *mut i32; + dest_ptr_32.write_unaligned(pixel); + + let store_16_8 = sse_convolve_u8::compress_i32(store_2); + let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) }; + + let px = x * CHANNELS; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 2) }; + let dest_ptr_32 = dest_ptr as *mut i32; + dest_ptr_32.write_unaligned(pixel); + + let store_16_8 = sse_convolve_u8::compress_i32(store_3); + let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) }; + + let px = x * CHANNELS; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 3) }; + let dest_ptr_32 = dest_ptr as *mut i32; + dest_ptr_32.write_unaligned(pixel); + + filter_offset += approx_weights.aligned_size; + } + } + } + + pub(crate) fn convolve_horizontal_rgba_sse_rows_one( + dst_width: usize, + _: usize, + approx_weights: &FilterWeights, + unsafe_source_ptr_0: *const u8, + unsafe_destination_ptr_0: *mut u8, + ) { + unsafe { + const CHANNELS: usize = 4; + let mut 
filter_offset = 0usize; + let weights_ptr = approx_weights.weights.as_ptr(); + + #[rustfmt::skip] + let shuffle_lo =_mm_setr_epi8(0, -1, + 4, -1, + 1, -1, + 5, -1, + 2, -1 , + 6,-1, + 3, -1, + 7, -1); + + #[rustfmt::skip] + let shuffle_hi =_mm_setr_epi8(8, -1, + 12, -1, + 9, -1, + 13, -1 , + 10,-1, + 14, -1, + 11, -1, + 15, -1); + + let vld = _mm_set1_epi32(ROUNDING_APPROX); + + for x in 0..dst_width { + let bounds = unsafe { approx_weights.bounds.get_unchecked(x) }; + let mut jx = 0usize; + let mut store = vld; + + while jx + 4 < bounds.size { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + let bounds_start = bounds.start + jx; + let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned()); + let weight23 = _mm_set1_epi32((ptr.add(2) as *const i32).read_unaligned()); + + let src_ptr = unsafe_source_ptr_0.add(bounds_start * CHANNELS); + let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i); + + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01)); + store = _mm_add_epi32(store, _mm_madd_epi16(hi, weight23)); + jx += 4; + } + + while jx + 2 < bounds.size { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + let bounds_start = bounds.start + jx; + let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned()); + let src_ptr = unsafe_source_ptr_0.add(bounds_start * CHANNELS); + let rgb_pixel = _mm_loadu_si64(src_ptr); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01)); + jx += 2; + } + + while jx < bounds.size { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + let weight0 = _mm_set1_epi32(ptr.read_unaligned() as i32); + store = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse( + bounds.start + jx, + unsafe_source_ptr_0, + weight0, + store, + ); + jx += 1; + } + + let store_16_8 = sse_convolve_u8::compress_i32(store); + let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) }; + + let px = x * CHANNELS; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; + let dest_ptr_32 = dest_ptr as *mut i32; + dest_ptr_32.write_unaligned(pixel); + + filter_offset += approx_weights.aligned_size; + } + } + } + + pub fn convolve_horizontal_rgb_sse_rows_4( + src_width: usize, + dst_width: usize, + approx_weights: &FilterWeights, + unsafe_source_ptr_0: *const u8, + src_stride: usize, + unsafe_destination_ptr_0: *mut u8, + dst_stride: usize, + ) { + unsafe { + const CHANNES: usize = 3; + let mut filter_offset = 0usize; + let weights_ptr = approx_weights.weights.as_ptr(); + + #[rustfmt::skip] + let shuffle_lo = unsafe { _mm_setr_epi8(0, -1, + 3, -1, + 1, -1, + 4, -1, + 2, -1 , + 5,-1, + -1, -1, + -1, -1) }; + + #[rustfmt::skip] + let shuffle_hi = unsafe { _mm_setr_epi8(6, -1, + 9, -1, + 7, -1, + 10, -1 , + 8,-1, + 11, -1, + -1, -1, + -1, -1) }; + + let vld = unsafe { _mm_set1_epi32(ROUNDING_APPROX) }; + + for x in 0..dst_width { + let bounds = unsafe { approx_weights.bounds.get_unchecked(x) }; + let mut jx = 0usize; + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + // Will make step in 4 items however since it is RGB it is necessary to make a safe offset + while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + unsafe { + let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned()); + let weight23 = _mm_set1_epi32((ptr.add(2) as *const 
i32).read_unaligned()); + let bounds_start = bounds.start + jx; + + let src_ptr_0 = unsafe_source_ptr_0.add(bounds_start * CHANNES); + + let rgb_pixel = _mm_loadu_si128(src_ptr_0 as *const __m128i); + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo, weight01)); + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi, weight23)); + + let src_ptr = src_ptr_0.add(src_stride); + let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i); + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo, weight01)); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi, weight23)); + + let src_ptr = src_ptr_0.add(src_stride * 2); + let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i); + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi, weight23)); + + let src_ptr = src_ptr_0.add(src_stride * 3); + let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i); + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi, weight23)); + } + jx += 4; + } + + while jx + 2 < bounds.size && bounds.start + jx + 3 < src_width { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + unsafe { + let bounds_start = bounds.start + jx; + let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned()); + store_0 = sse_convolve_u8::convolve_horizontal_parts_two_sse_rgb( + bounds_start, + unsafe_source_ptr_0, + weight01, + store_0, + shuffle_lo, + ); + store_1 = sse_convolve_u8::convolve_horizontal_parts_two_sse_rgb( + bounds_start, + unsafe_source_ptr_0.add(src_stride), + weight01, + store_1, + shuffle_lo, + ); + store_2 = sse_convolve_u8::convolve_horizontal_parts_two_sse_rgb( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 2), + weight01, + store_2, + shuffle_lo, + ); + store_3 = sse_convolve_u8::convolve_horizontal_parts_two_sse_rgb( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 3), + weight01, + store_3, + shuffle_lo, + ); + } + jx += 2; + } + + while jx < bounds.size { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + let bounds_start = bounds.start + jx; + unsafe { + let weight0 = _mm_set1_epi32(ptr.read_unaligned() as i32); + store_0 = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb( + bounds_start, + unsafe_source_ptr_0, + weight0, + store_0, + ); + store_1 = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb( + bounds_start, + unsafe_source_ptr_0.add(src_stride), + weight0, + store_1, + ); + store_2 = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 2), + weight0, + store_2, + ); + store_3 = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 3), + weight0, + store_3, + ); + } + jx += 1; + } + let store_0_8 = sse_convolve_u8::compress_i32(store_0); + + let px = x * CHANNES; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; + + let element = unsafe { _mm_extract_epi32::<0>(store_0_8) }; + let bytes = element.to_le_bytes(); + unsafe { + dest_ptr.write_unaligned(bytes[0]); + 
dest_ptr.add(1).write_unaligned(bytes[1]); + dest_ptr.add(2).write_unaligned(bytes[2]); + } + + let store_1_8 = sse_convolve_u8::compress_i32(store_1); + + let px = x * CHANNES; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride) }; + + let element = unsafe { _mm_extract_epi32::<0>(store_1_8) }; + let bytes = element.to_le_bytes(); + unsafe { + dest_ptr.write_unaligned(bytes[0]); + dest_ptr.add(1).write_unaligned(bytes[1]); + dest_ptr.add(2).write_unaligned(bytes[2]); + } + + let store_2_8 = sse_convolve_u8::compress_i32(store_2); + + let px = x * CHANNES; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 2) }; + + let element = unsafe { _mm_extract_epi32::<0>(store_2_8) }; + let bytes = element.to_le_bytes(); + unsafe { + dest_ptr.write_unaligned(bytes[0]); + dest_ptr.add(1).write_unaligned(bytes[1]); + dest_ptr.add(2).write_unaligned(bytes[2]); + } + + let store_3_8 = sse_convolve_u8::compress_i32(store_3); + + let px = x * CHANNES; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 3) }; + + let element = unsafe { _mm_extract_epi32::<0>(store_3_8) }; + let bytes = element.to_le_bytes(); + unsafe { + dest_ptr.write_unaligned(bytes[0]); + dest_ptr.add(1).write_unaligned(bytes[1]); + dest_ptr.add(2).write_unaligned(bytes[2]); + } + + filter_offset += approx_weights.aligned_size; + } + } + } + + pub fn convolve_horizontal_rgb_sse_row_one( + src_width: usize, + dst_width: usize, + approx_weights: &FilterWeights, + unsafe_source_ptr_0: *const u8, + unsafe_destination_ptr_0: *mut u8, + ) { + const CHANNELS: usize = 3; + let mut filter_offset = 0usize; + let weights_ptr = approx_weights.weights.as_ptr(); + + #[rustfmt::skip] + let shuffle_lo = unsafe { _mm_setr_epi8(0, -1, + 3, -1, + 1, -1, + 4, -1, + 2, -1 , + 5,-1, + -1, -1, + -1, -1) }; + + #[rustfmt::skip] + let shuffle_hi = unsafe { _mm_setr_epi8(6, -1, + 9, -1, + 7, -1, + 10, -1 , + 8,-1, + 11, -1, + -1, -1, + -1, -1) }; + + for x in 0..dst_width { + let bounds = unsafe { approx_weights.bounds.get_unchecked(x) }; + let mut jx = 0usize; + let mut store = unsafe { _mm_setzero_si128() }; + + while jx + 4 < bounds.size && x + 6 < src_width { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + unsafe { + let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned()); + let weight23 = _mm_set1_epi32((ptr.add(2) as *const i32).read_unaligned()); + let bounds_start = bounds.start + jx; + let src_ptr_0 = unsafe_source_ptr_0.add(bounds_start * CHANNELS); + + let rgb_pixel = _mm_loadu_si128(src_ptr_0 as *const __m128i); + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01)); + store = _mm_add_epi32(store, _mm_madd_epi16(hi, weight23)); + } + jx += 4; + } + + while jx + 2 < bounds.size && x + 3 < src_width { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + unsafe { + let weight0 = _mm_set1_epi32((ptr as *const i32).read_unaligned()); + let src_ptr = unsafe_source_ptr_0.add((bounds.start + jx) * 3); + let rgb_pixel = _mm_loadu_si64(src_ptr); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight0)); + } + jx += 2; + } + + while jx < bounds.size { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + unsafe { + let weight0 = _mm_set1_epi32(ptr.read_unaligned() as i32); + store = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb( + bounds.start + jx, + unsafe_source_ptr_0, + weight0, + 
store, + ); + } + jx += 1; + } + + let store_16_8 = sse_convolve_u8::compress_i32(store); + + let px = x * CHANNELS; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; + + let element = unsafe { _mm_extract_epi32::<0>(store_16_8) }; + let bytes = element.to_le_bytes(); + unsafe { + dest_ptr.write_unaligned(bytes[0]); + dest_ptr.add(1).write_unaligned(bytes[1]); + dest_ptr.add(2).write_unaligned(bytes[2]); + } + unsafe { + dest_ptr.write_unaligned(bytes[0]); + dest_ptr.add(1).write_unaligned(bytes[1]); + dest_ptr.add(2).write_unaligned(bytes[2]); + } + + filter_offset += approx_weights.aligned_size; + } + } + + #[inline] + pub(crate) fn convolve_vertical_rgb_sse_row( + total_width: usize, + bounds: &FilterBounds, + unsafe_source_ptr_0: *const u8, + unsafe_destination_ptr_0: *mut u8, + src_stride: usize, + weight_ptr: *const i16, + ) { + let mut cx = 0usize; + + while cx + 32 < total_width { + unsafe { + sse_convolve_u8::convolve_vertical_part_sse_32( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + ); + } + + cx += 32; + } + + while cx + 16 < total_width { + unsafe { + sse_convolve_u8::convolve_vertical_part_sse_16( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + ); + } + + cx += 16; + } + + while cx + 8 < total_width { + unsafe { + sse_convolve_u8::convolve_vertical_part_sse_8::( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + 8, + ); + } + + cx += 8; + } + + let left = total_width - cx; + if left > 0 { + unsafe { + sse_convolve_u8::convolve_vertical_part_sse_8::( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + left, + ); + } + } + } +} diff --git a/src/sse_simd_u8.rs b/src/sse/simd_u8.rs similarity index 100% rename from src/sse_simd_u8.rs rename to src/sse/simd_u8.rs diff --git a/src/sse_utils.rs b/src/sse/utils.rs similarity index 100% rename from src/sse_utils.rs rename to src/sse/utils.rs diff --git a/src/sse_rgb_u8.rs b/src/sse_rgb_u8.rs deleted file mode 100644 index c59bb50..0000000 --- a/src/sse_rgb_u8.rs +++ /dev/null @@ -1,692 +0,0 @@ -/* - * // Copyright (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. 
- */
-
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
-pub mod sse_rgb {
-    use crate::filter_weights::{FilterBounds, FilterWeights};
-    use crate::sse_simd_u8::sse_convolve_u8;
-    use crate::support::ROUNDING_APPROX;
-    #[cfg(target_arch = "x86")]
-    use std::arch::x86::*;
-    #[cfg(target_arch = "x86_64")]
-    use std::arch::x86_64::*;
-
-    pub(crate) unsafe fn convolve_horizontal_rgba_sse_rows_4(
-        dst_width: usize,
-        approx_weights: &FilterWeights,
-        unsafe_source_ptr_0: *const u8,
-        src_stride: usize,
-        unsafe_destination_ptr_0: *mut u8,
-        dst_stride: usize,
-    ) {
-        const CHANNELS: usize = 4;
-        let mut filter_offset = 0usize;
-        let weights_ptr = approx_weights.weights.as_ptr();
-
-        #[rustfmt::skip]
-        let shuffle_lo =_mm_setr_epi8(0, -1,
-            4, -1,
-            1, -1,
-            5, -1,
-            2, -1 ,
-            6,-1,
-            3, -1,
-            7, -1);
-
-        #[rustfmt::skip]
-        let shuffle_hi =_mm_setr_epi8(8, -1,
-            12, -1,
-            9, -1,
-            13, -1 ,
-            10,-1,
-            14, -1,
-            11, -1,
-            15, -1);
-
-        let vld = unsafe { _mm_set1_epi32(ROUNDING_APPROX) };
-
-        for x in 0..dst_width {
-            let bounds = unsafe { approx_weights.bounds.get_unchecked(x) };
-            let mut jx = 0usize;
-            let mut store_0 = vld;
-            let mut store_1 = vld;
-            let mut store_2 = vld;
-            let mut store_3 = vld;
-
-            while jx + 4 < bounds.size {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned());
-                    let weight23 = _mm_set1_epi32((ptr.add(2) as *const i32).read_unaligned());
-                    let start_bounds = bounds.start + jx;
-
-                    let src_ptr = unsafe_source_ptr_0.add(start_bounds * CHANNELS);
-                    let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i);
-
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo, weight01));
-                    store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi, weight23));
-
-                    let rgb_pixel = _mm_loadu_si128(src_ptr.add(src_stride) as *const __m128i);
-
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo, weight01));
-                    store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi, weight23));
-
-                    let rgb_pixel = _mm_loadu_si128(src_ptr.add(src_stride * 2) as *const __m128i);
-
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo, weight01));
-                    store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi, weight23));
-
-                    let rgb_pixel = _mm_loadu_si128(src_ptr.add(src_stride * 3) as *const __m128i);
-
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo, weight01));
-                    store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi, weight23));
-                }
-                jx += 4;
-            }
-
-            while jx + 2 < bounds.size {
-                unsafe {
-                    let ptr = weights_ptr.add(jx + filter_offset);
-                    let bounds_start = bounds.start + jx;
-
-                    let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned());
-                    let src_ptr = unsafe_source_ptr_0.add(bounds_start * CHANNELS);
-                    let rgb_pixel = _mm_loadu_si64(src_ptr);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-                    store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo, weight01));
-
-                    let rgb_pixel = _mm_loadu_si64(src_ptr.add(src_stride));
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-                    store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo, weight01));
-
-                    let rgb_pixel = _mm_loadu_si64(src_ptr.add(src_stride * 2));
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-                    store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo, weight01));
-
-                    let rgb_pixel = _mm_loadu_si64(src_ptr.add(src_stride * 3));
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-                    store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo, weight01));
-                }
-                jx += 2;
-            }
-
-            while jx < bounds.size {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let weight0 = _mm_set1_epi32(ptr.read_unaligned() as i32);
-                    let start_bounds = bounds.start + jx;
-                    store_0 = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse(
-                        start_bounds,
-                        unsafe_source_ptr_0,
-                        weight0,
-                        store_0,
-                    );
-                    store_1 = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse(
-                        start_bounds,
-                        unsafe_source_ptr_0.add(src_stride),
-                        weight0,
-                        store_1,
-                    );
-                    store_2 = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse(
-                        start_bounds,
-                        unsafe_source_ptr_0.add(src_stride * 2),
-                        weight0,
-                        store_2,
-                    );
-                    store_3 = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse(
-                        start_bounds,
-                        unsafe_source_ptr_0.add(src_stride * 3),
-                        weight0,
-                        store_3,
-                    );
-                }
-                jx += 1;
-            }
-            let store_16_8 = sse_convolve_u8::compress_i32(store_0);
-            let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) };
-
-            let px = x * CHANNELS;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) };
-            let dest_ptr_32 = dest_ptr as *mut i32;
-            unsafe {
-                dest_ptr_32.write_unaligned(pixel);
-            }
-
-            let store_16_8 = sse_convolve_u8::compress_i32(store_1);
-            let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) };
-
-            let px = x * CHANNELS;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride) };
-            let dest_ptr_32 = dest_ptr as *mut i32;
-            unsafe {
-                dest_ptr_32.write_unaligned(pixel);
-            }
-
-            let store_16_8 = sse_convolve_u8::compress_i32(store_2);
-            let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) };
-
-            let px = x * CHANNELS;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 2) };
-            let dest_ptr_32 = dest_ptr as *mut i32;
-            unsafe {
-                dest_ptr_32.write_unaligned(pixel);
-            }
-
-            let store_16_8 = sse_convolve_u8::compress_i32(store_3);
-            let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) };
-
-            let px = x * CHANNELS;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 3) };
-            let dest_ptr_32 = dest_ptr as *mut i32;
-            unsafe {
-                dest_ptr_32.write_unaligned(pixel);
-            }
-
-            filter_offset += approx_weights.aligned_size;
-        }
-    }
-
-    pub(crate) unsafe fn convolve_horizontal_rgba_sse_rows_one(
-        dst_width: usize,
-        approx_weights: &FilterWeights,
-        unsafe_source_ptr_0: *const u8,
-        unsafe_destination_ptr_0: *mut u8,
-    ) {
-        const CHANNELS: usize = 4;
-        let mut filter_offset = 0usize;
-        let weights_ptr = approx_weights.weights.as_ptr();
-
-        #[rustfmt::skip]
-        let shuffle_lo =_mm_setr_epi8(0, -1,
-            4, -1,
-            1, -1,
-            5, -1,
-            2, -1 ,
-            6,-1,
-            3, -1,
-            7, -1);
-
-        #[rustfmt::skip]
-        let shuffle_hi =_mm_setr_epi8(8, -1,
-            12, -1,
-            9, -1,
-            13, -1 ,
-            10,-1,
-            14, -1,
-            11, -1,
-            15, -1);
-
-        let vld = unsafe { _mm_set1_epi32(ROUNDING_APPROX) };
-
-        for x in 0..dst_width {
-            let bounds = unsafe { approx_weights.bounds.get_unchecked(x) };
-            let mut jx = 0usize;
-            let mut store = vld;
-
-            while jx + 4 < bounds.size {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let bounds_start = bounds.start + jx;
-                    let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned());
-                    let weight23 = _mm_set1_epi32((ptr.add(2) as *const i32).read_unaligned());
-
-                    let src_ptr = unsafe_source_ptr_0.add(bounds_start * CHANNELS);
-                    let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i);
-
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01));
-                    store = _mm_add_epi32(store, _mm_madd_epi16(hi, weight23));
-                }
-                jx += 4;
-            }
-
-            while jx + 2 < bounds.size {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let bounds_start = bounds.start + jx;
-                    let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned());
-                    let src_ptr = unsafe_source_ptr_0.add(bounds_start * CHANNELS);
-                    let rgb_pixel = _mm_loadu_si64(src_ptr);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-                    store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01));
-                }
-                jx += 2;
-            }
-
-            while jx < bounds.size {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let weight0 = _mm_set1_epi32(ptr.read_unaligned() as i32);
-                    store = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse(
-                        bounds.start + jx,
-                        unsafe_source_ptr_0,
-                        weight0,
-                        store,
-                    );
-                }
-                jx += 1;
-            }
-
-            let store_16_8 = sse_convolve_u8::compress_i32(store);
-            let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) };
-
-            let px = x * CHANNELS;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) };
-            let dest_ptr_32 = dest_ptr as *mut i32;
-            unsafe {
-                dest_ptr_32.write_unaligned(pixel);
-            }
-
-            filter_offset += approx_weights.aligned_size;
-        }
-    }
-
-    pub unsafe fn convolve_horizontal_rgb_sse_rows_4(
-        src_width: usize,
-        dst_width: usize,
-        approx_weights: &FilterWeights,
-        unsafe_source_ptr_0: *const u8,
-        src_stride: usize,
-        unsafe_destination_ptr_0: *mut u8,
-        dst_stride: usize,
-    ) {
-        const CHANNES: usize = 3;
-        let mut filter_offset = 0usize;
-        let weights_ptr = approx_weights.weights.as_ptr();
-
-        #[rustfmt::skip]
-        let shuffle_lo = unsafe { _mm_setr_epi8(0, -1,
-            3, -1,
-            1, -1,
-            4, -1,
-            2, -1 ,
-            5,-1,
-            -1, -1,
-            -1, -1) };
-
-        #[rustfmt::skip]
-        let shuffle_hi = unsafe { _mm_setr_epi8(6, -1,
-            9, -1,
-            7, -1,
-            10, -1 ,
-            8,-1,
-            11, -1,
-            -1, -1,
-            -1, -1) };
-
-        let vld = unsafe { _mm_set1_epi32(ROUNDING_APPROX) };
-
-        for x in 0..dst_width {
-            let bounds = unsafe { approx_weights.bounds.get_unchecked(x) };
-            let mut jx = 0usize;
-            let mut store_0 = vld;
-            let mut store_1 = vld;
-            let mut store_2 = vld;
-            let mut store_3 = vld;
-
-            // Will make step in 4 items however since it is RGB it is necessary to make a safe offset
-            while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned());
-                    let weight23 = _mm_set1_epi32((ptr.add(2) as *const i32).read_unaligned());
-                    let bounds_start = bounds.start + jx;
-
-                    let src_ptr_0 = unsafe_source_ptr_0.add(bounds_start * CHANNES);
-
-                    let rgb_pixel = _mm_loadu_si128(src_ptr_0 as *const __m128i);
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo, weight01));
-                    store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi, weight23));
-
-                    let src_ptr = src_ptr_0.add(src_stride);
-                    let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i);
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo, weight01));
-                    store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi, weight23));
-
-                    let src_ptr = src_ptr_0.add(src_stride * 2);
-                    let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i);
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo, weight01));
-                    store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi, weight23));
-
-                    let src_ptr = src_ptr_0.add(src_stride * 3);
-                    let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i);
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo, weight01));
-                    store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi, weight23));
-                }
-                jx += 4;
-            }
-
-            while jx + 2 < bounds.size && bounds.start + jx + 3 < src_width {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let bounds_start = bounds.start + jx;
-                    let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned());
-                    store_0 = sse_convolve_u8::convolve_horizontal_parts_two_sse_rgb(
-                        bounds_start,
-                        unsafe_source_ptr_0,
-                        weight01,
-                        store_0,
-                        shuffle_lo,
-                    );
-                    store_1 = sse_convolve_u8::convolve_horizontal_parts_two_sse_rgb(
-                        bounds_start,
-                        unsafe_source_ptr_0.add(src_stride),
-                        weight01,
-                        store_1,
-                        shuffle_lo,
-                    );
-                    store_2 = sse_convolve_u8::convolve_horizontal_parts_two_sse_rgb(
-                        bounds_start,
-                        unsafe_source_ptr_0.add(src_stride * 2),
-                        weight01,
-                        store_2,
-                        shuffle_lo,
-                    );
-                    store_3 = sse_convolve_u8::convolve_horizontal_parts_two_sse_rgb(
-                        bounds_start,
-                        unsafe_source_ptr_0.add(src_stride * 3),
-                        weight01,
-                        store_3,
-                        shuffle_lo,
-                    );
-                }
-                jx += 2;
-            }
-
-            while jx < bounds.size {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                let bounds_start = bounds.start + jx;
-                unsafe {
-                    let weight0 = _mm_set1_epi32(ptr.read_unaligned() as i32);
-                    store_0 = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb(
-                        bounds_start,
-                        unsafe_source_ptr_0,
-                        weight0,
-                        store_0,
-                    );
-                    store_1 = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb(
-                        bounds_start,
-                        unsafe_source_ptr_0.add(src_stride),
-                        weight0,
-                        store_1,
-                    );
-                    store_2 = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb(
-                        bounds_start,
-                        unsafe_source_ptr_0.add(src_stride * 2),
-                        weight0,
-                        store_2,
-                    );
-                    store_3 = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb(
-                        bounds_start,
-                        unsafe_source_ptr_0.add(src_stride * 3),
-                        weight0,
-                        store_3,
-                    );
-                }
-                jx += 1;
-            }
-            let store_0_8 = sse_convolve_u8::compress_i32(store_0);
-
-            let px = x * CHANNES;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) };
-
-            let element = unsafe { _mm_extract_epi32::<0>(store_0_8) };
-            let bytes = element.to_le_bytes();
-            unsafe {
-                dest_ptr.write_unaligned(bytes[0]);
-                dest_ptr.add(1).write_unaligned(bytes[1]);
-                dest_ptr.add(2).write_unaligned(bytes[2]);
-            }
-
-            let store_1_8 = sse_convolve_u8::compress_i32(store_1);
-
-            let px = x * CHANNES;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride) };
-
-            let element = unsafe { _mm_extract_epi32::<0>(store_1_8) };
-            let bytes = element.to_le_bytes();
-            unsafe {
-                dest_ptr.write_unaligned(bytes[0]);
-                dest_ptr.add(1).write_unaligned(bytes[1]);
-                dest_ptr.add(2).write_unaligned(bytes[2]);
-            }
-
-            let store_2_8 = sse_convolve_u8::compress_i32(store_2);
-
-            let px = x * CHANNES;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 2) };
-
-            let element = unsafe { _mm_extract_epi32::<0>(store_2_8) };
-            let bytes = element.to_le_bytes();
-            unsafe {
-                dest_ptr.write_unaligned(bytes[0]);
-                dest_ptr.add(1).write_unaligned(bytes[1]);
-                dest_ptr.add(2).write_unaligned(bytes[2]);
-            }
-
-            let store_3_8 = sse_convolve_u8::compress_i32(store_3);
-
-            let px = x * CHANNES;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 3) };
-
-            let element = unsafe { _mm_extract_epi32::<0>(store_3_8) };
-            let bytes = element.to_le_bytes();
-            unsafe {
-                dest_ptr.write_unaligned(bytes[0]);
-                dest_ptr.add(1).write_unaligned(bytes[1]);
-                dest_ptr.add(2).write_unaligned(bytes[2]);
-            }
-
-            filter_offset += approx_weights.aligned_size;
-        }
-    }
-
-    pub unsafe fn convolve_horizontal_rgb_sse_row_one(
-        src_width: usize,
-        dst_width: usize,
-        approx_weights: &FilterWeights,
-        unsafe_source_ptr_0: *const u8,
-        unsafe_destination_ptr_0: *mut u8,
-    ) {
-        const CHANNELS: usize = 3;
-        let mut filter_offset = 0usize;
-        let weights_ptr = approx_weights.weights.as_ptr();
-
-        #[rustfmt::skip]
-        let shuffle_lo = unsafe { _mm_setr_epi8(0, -1,
-            3, -1,
-            1, -1,
-            4, -1,
-            2, -1 ,
-            5,-1,
-            -1, -1,
-            -1, -1) };
-
-        #[rustfmt::skip]
-        let shuffle_hi = unsafe { _mm_setr_epi8(6, -1,
-            9, -1,
-            7, -1,
-            10, -1 ,
-            8,-1,
-            11, -1,
-            -1, -1,
-            -1, -1) };
-
-        for x in 0..dst_width {
-            let bounds = unsafe { approx_weights.bounds.get_unchecked(x) };
-            let mut jx = 0usize;
-            let mut store = unsafe { _mm_setzero_si128() };
-
-            while jx + 4 < bounds.size && x + 6 < src_width {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned());
-                    let weight23 = _mm_set1_epi32((ptr.add(2) as *const i32).read_unaligned());
-                    let bounds_start = bounds.start + jx;
-                    let src_ptr_0 = unsafe_source_ptr_0.add(bounds_start * CHANNELS);
-
-                    let rgb_pixel = _mm_loadu_si128(src_ptr_0 as *const __m128i);
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01));
-                    store = _mm_add_epi32(store, _mm_madd_epi16(hi, weight23));
-                }
-                jx += 4;
-            }
-
-            while jx + 2 < bounds.size && x + 3 < src_width {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let weight0 = _mm_set1_epi32((ptr as *const i32).read_unaligned());
-                    let src_ptr = unsafe_source_ptr_0.add((bounds.start + jx) * 3);
-                    let rgb_pixel = _mm_loadu_si64(src_ptr);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-                    store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight0));
-                }
-                jx += 2;
-            }
-
-            while jx < bounds.size {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let weight0 = _mm_set1_epi32(ptr.read_unaligned() as i32);
-                    store = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb(
-                        bounds.start + jx,
-                        unsafe_source_ptr_0,
-                        weight0,
-                        store,
-                    );
-                }
-                jx += 1;
-            }
-
-            let store_16_8 = sse_convolve_u8::compress_i32(store);
-
-            let px = x * CHANNELS;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) };
-
-            let element = unsafe { _mm_extract_epi32::<0>(store_16_8) };
-            let bytes = element.to_le_bytes();
-            unsafe {
-                dest_ptr.write_unaligned(bytes[0]);
-                dest_ptr.add(1).write_unaligned(bytes[1]);
-                dest_ptr.add(2).write_unaligned(bytes[2]);
-            }
-            unsafe {
-                dest_ptr.write_unaligned(bytes[0]);
-                dest_ptr.add(1).write_unaligned(bytes[1]);
-                dest_ptr.add(2).write_unaligned(bytes[2]);
-            }
-
-            filter_offset += approx_weights.aligned_size;
-        }
-    }
-
-    #[inline]
-    pub(crate) fn convolve_vertical_rgb_sse_row(
-        total_width: usize,
-        bounds: &FilterBounds,
-        unsafe_source_ptr_0: *const u8,
-        unsafe_destination_ptr_0: *mut u8,
-        src_stride: usize,
-        weight_ptr: *const i16,
-    ) {
-        let mut cx = 0usize;
-
-        while cx + 32 < total_width {
-            unsafe {
-                sse_convolve_u8::convolve_vertical_part_sse_32(
-                    bounds.start,
-                    cx,
-                    unsafe_source_ptr_0,
-                    src_stride,
-                    unsafe_destination_ptr_0,
-                    weight_ptr,
-                    bounds,
-                );
-            }
-
-            cx += 32;
-        }
-
-        while cx + 16 < total_width {
-            unsafe {
-                sse_convolve_u8::convolve_vertical_part_sse_16(
-                    bounds.start,
-                    cx,
-                    unsafe_source_ptr_0,
-                    src_stride,
-                    unsafe_destination_ptr_0,
-                    weight_ptr,
-                    bounds,
-                );
-            }
-
-            cx += 16;
-        }
-
-        while cx + 8 < total_width {
-            unsafe {
-                sse_convolve_u8::convolve_vertical_part_sse_8::(
-                    bounds.start,
-                    cx,
-                    unsafe_source_ptr_0,
-                    src_stride,
-                    unsafe_destination_ptr_0,
-                    weight_ptr,
-                    bounds,
-                    8,
-                );
-            }
-
-            cx += 8;
-        }
-
-        let left = total_width - cx;
-        if left > 0 {
-            unsafe {
-                sse_convolve_u8::convolve_vertical_part_sse_8::(
-                    bounds.start,
-                    cx,
-                    unsafe_source_ptr_0,
-                    src_stride,
-                    unsafe_destination_ptr_0,
-                    weight_ptr,
-                    bounds,
-                    left,
-                );
-            }
-        }
-    }
-}