diff --git a/Cargo.lock b/Cargo.lock index 8576e9b..b33039f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -768,7 +768,7 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pic-scale" -version = "0.1.24" +version = "0.1.25" dependencies = [ "colorutils-rs", "half", diff --git a/Cargo.toml b/Cargo.toml index b277761..f83778c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ workspace = { members = ["app"] } [package] name = "pic-scale" -version = "0.1.24" +version = "0.1.25" edition = "2021" description = "High performance image scaling" readme = "README.md" diff --git a/app/src/main.rs b/app/src/main.rs index b9ba759..ff02fd1 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -1,3 +1,6 @@ +mod merge; +mod split; + use std::time::Instant; use fast_image_resize::images::Image; @@ -9,6 +12,8 @@ use half::f16; use image::io::Reader as ImageReader; use image::{EncodableLayout, GenericImageView}; +use crate::merge::merge_channels_3; +use crate::split::split_channels_3; use pic_scale::{ ImageSize, ImageStore, JzazbzScaler, OklabScaler, ResamplingFunction, Scaler, Scaling, ThreadingPolicy, TransferFunction, @@ -39,26 +44,61 @@ fn main() { let start_time = Instant::now(); - let store = ImageStore::::from_slice( - &mut f16_bytes, + let mut rec1 = vec![0f32; dimensions.0 as usize * dimensions.1 as usize]; + let mut rec2 = vec![0f32; dimensions.0 as usize * dimensions.1 as usize]; + let mut rec3 = vec![0f32; dimensions.0 as usize * dimensions.1 as usize]; + + split_channels_3( + &f16_bytes, dimensions.0 as usize, dimensions.1 as usize, + &mut rec1, + &mut rec2, + &mut rec3, + ); + + let store = + ImageStore::::from_slice(&mut rec1, dimensions.0 as usize, dimensions.1 as usize); + + let resized = scaler.resize_plane_f32( + ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), + store, ); + rec1 = Vec::from(resized.as_bytes()); - let resized = scaler.resize_rgb_f32( + let store = + 
ImageStore::::from_slice(&mut rec2, dimensions.0 as usize, dimensions.1 as usize); + + let resized = scaler.resize_plane_f32( ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), store, ); + rec2 = Vec::from(resized.as_bytes()); + + let store = + ImageStore::::from_slice(&mut rec3, dimensions.0 as usize, dimensions.1 as usize); + + let resized = scaler.resize_plane_f32( + ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), + store, + ); + rec3 = Vec::from(resized.as_bytes()); + + let mut resized_data: Vec = vec![0f32; resized.width * resized.height * 3]; + merge_channels_3( + &mut resized_data, + resized.width, + resized.height, + &rec1, + &rec2, + &rec3, + ); let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds println!("Scaler: {:.2?}", elapsed_time); - let dst: Vec = resized - .as_bytes() - .iter() - .map(|&x| (x * 255f32) as u8) - .collect(); + let dst: Vec = resized_data.iter().map(|&x| (x * 255f32) as u8).collect(); // let dst = resized.as_bytes(); if resized.channels == 4 { diff --git a/app/src/merge.rs b/app/src/merge.rs new file mode 100644 index 0000000..a4ef42d --- /dev/null +++ b/app/src/merge.rs @@ -0,0 +1,25 @@ +pub(crate) fn merge_channels_3( + image: &mut [T], + width: usize, + height: usize, + first: &[T], + second: &[T], + third: &[T], +) { + let mut shift = 0usize; + let mut shift_plane = 0usize; + for _ in 0..height { + let shifted_image = &mut image[shift..]; + let shifted_first_plane = &first[shift_plane..]; + let shifted_second_plane = &second[shift_plane..]; + let shifted_third_plane = &third[shift_plane..]; + for x in 0..width { + let px = x * 3; + shifted_image[px] = shifted_first_plane[x]; + shifted_image[px + 1] = shifted_second_plane[x]; + shifted_image[px + 2] = shifted_third_plane[x]; + } + shift += width * 3; + shift_plane += width; + } +} diff --git a/app/src/split.rs b/app/src/split.rs new file mode 100644 index 0000000..2bcbab3 --- /dev/null +++ b/app/src/split.rs @@ 
-0,0 +1,25 @@ +pub(crate) fn split_channels_3( + image: &[T], + width: usize, + height: usize, + first: &mut [T], + second: &mut [T], + third: &mut [T], +) { + let mut shift = 0usize; + let mut shift_plane = 0usize; + for _ in 0..height { + let shifted_image = &image[shift..]; + let shifted_first_plane = &mut first[shift_plane..]; + let shifted_second_plane = &mut second[shift_plane..]; + let shifted_third_plane = &mut third[shift_plane..]; + for x in 0..width { + let px = x * 3; + shifted_first_plane[x] = shifted_image[px]; + shifted_second_plane[x] = shifted_image[px + 1]; + shifted_third_plane[x] = shifted_image[px + 2]; + } + shift += width * 3; + shift_plane += width; + } +} diff --git a/src/neon/mod.rs b/src/neon/mod.rs index fc07377..6e3e1fd 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -33,6 +33,7 @@ mod convolve_f16; mod convolve_f32; #[cfg(all(feature = "half"))] mod f16_utils; +mod plane_f32; #[cfg(all(feature = "half"))] mod rgb_f16; mod rgb_f32; @@ -51,6 +52,8 @@ pub use alpha::neon_premultiply_alpha_rgba; pub use alpha::neon_unpremultiply_alpha_rgba; #[cfg(all(feature = "half"))] pub use f16_utils::*; +pub use plane_f32::convolve_horizontal_plane_neon_row_one; +pub use plane_f32::convolve_horizontal_plane_neon_rows_4; #[cfg(all(feature = "half"))] pub use rgb_f16::{ convolve_horizontal_rgb_neon_row_one_f16, convolve_horizontal_rgb_neon_rows_4_f16, diff --git a/src/neon/plane_f32.rs b/src/neon/plane_f32.rs new file mode 100644 index 0000000..ad5001c --- /dev/null +++ b/src/neon/plane_f32.rs @@ -0,0 +1,320 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +use crate::filter_weights::FilterWeights; +use crate::neon::utils::prefer_vfmaq_f32; +use std::arch::aarch64::*; + +macro_rules! conv_horiz_plane_16_f32 { + ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ + let src_ptr = $src.add($start_x); + + let rgb_pixel = vld1q_f32_x4(src_ptr); + + let mut acc = prefer_vfmaq_f32($store, rgb_pixel.0, $set.0); + acc = prefer_vfmaq_f32(acc, rgb_pixel.1, $set.1); + acc = prefer_vfmaq_f32(acc, rgb_pixel.2, $set.2); + acc = prefer_vfmaq_f32(acc, rgb_pixel.3, $set.3); + acc + }}; +} + +macro_rules! 
conv_horiz_plane_8_f32 { + ($start_x: expr, $src: expr, $set1: expr, $set2: expr, $store: expr) => {{ + let src_ptr = $src.add($start_x); + + let rgb_pixel = vld1q_f32_x2(src_ptr); + + let mut acc = prefer_vfmaq_f32($store, rgb_pixel.0, $set1); + acc = prefer_vfmaq_f32(acc, rgb_pixel.1, $set2); + acc + }}; +} + +macro_rules! conv_horiz_plane_4_f32 { + ($start_x: expr, $src: expr, $set1: expr, $store: expr) => {{ + let src_ptr = $src.add($start_x); + + let rgb_pixel = vld1q_f32(src_ptr); + + prefer_vfmaq_f32($store, rgb_pixel, $set1) + }}; +} + +macro_rules! conv_horiz_plane_2_f32 { + ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ + let src_ptr = $src.add($start_x); + + let rgb_pixel_0 = vld1_f32(src_ptr); + let rgb_pixel = vcombine_f32(rgb_pixel_0, vdup_n_f32(0.)); + + prefer_vfmaq_f32($store, rgb_pixel, $set) + }}; +} + +macro_rules! conv_horiz_plane_1_f32 { + ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ + let src_ptr = $src.add($start_x); + let mut rgb_pixel = vdupq_n_f32(0.); + rgb_pixel = vsetq_lane_f32::<0>(src_ptr.read_unaligned(), rgb_pixel); + prefer_vfmaq_f32($store, rgb_pixel, $set) + }}; +} + +macro_rules! 
vfullq_sum_f32 {
+    ($reg: expr) => {{
+        let acc = vadd_f32(vget_low_f32($reg), vget_high_f32($reg));
+        vpadds_f32(acc)
+    }};
+}
+
+/// Convolves one horizontal row of a single-channel (plane) f32 image with NEON.
+/// `dst_width` is the number of output pixels; weights/bounds come from
+/// `filter_weights`. Reads `bounds.size` taps per output pixel and writes one
+/// horizontally-reduced f32 per destination pixel.
+pub fn convolve_horizontal_plane_neon_row_one(
+    dst_width: usize,
+    _: usize,
+    filter_weights: &FilterWeights<f32>,
+    unsafe_source_ptr_0: *const f32,
+    unsafe_destination_ptr_0: *mut f32,
+) {
+    unsafe {
+        let mut filter_offset = 0usize;
+        let weights_ptr = filter_weights.weights.as_ptr();
+
+        for x in 0..dst_width {
+            let bounds = filter_weights.bounds.get_unchecked(x);
+            let mut jx = 0usize;
+            let mut store = vdupq_n_f32(0f32);
+
+            while jx + 16 < bounds.size {
+                let bounds_start = bounds.start + jx;
+                let ptr = weights_ptr.add(jx + filter_offset);
+                let read_weights = vld1q_f32_x4(ptr);
+                store = conv_horiz_plane_16_f32!(
+                    bounds_start,
+                    unsafe_source_ptr_0,
+                    read_weights,
+                    store
+                );
+                jx += 16; // fixed: was `jx += 8`, which re-accumulated taps 8..16 with stale weights
+            }
+
+            while jx + 8 < bounds.size {
+                let bounds_start = bounds.start + jx;
+                let ptr = weights_ptr.add(jx + filter_offset);
+                let read_weights = vld1q_f32_x2(ptr);
+                store = conv_horiz_plane_8_f32!(
+                    bounds_start,
+                    unsafe_source_ptr_0,
+                    read_weights.0,
+                    read_weights.1,
+                    store
+                );
+                jx += 8;
+            }
+
+            while jx + 4 < bounds.size {
+                let bounds_start = bounds.start + jx;
+                let ptr = weights_ptr.add(jx + filter_offset);
+                let read_weights = vld1q_f32(ptr);
+                store =
+                    conv_horiz_plane_4_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store);
+                jx += 4;
+            }
+
+            while jx + 2 < bounds.size {
+                let bounds_start = bounds.start + jx;
+                let ptr = weights_ptr.add(jx + filter_offset);
+                let weights0 = vld1_f32(ptr);
+                let weights = vcombine_f32(weights0, vdup_n_f32(0.));
+                store = conv_horiz_plane_2_f32!(bounds_start, unsafe_source_ptr_0, weights, store);
+                jx += 2;
+            }
+
+            while jx < bounds.size {
+                let bounds_start = bounds.start + jx;
+                let ptr = weights_ptr.add(jx + filter_offset);
+                let weight0 = vdupq_n_f32(ptr.read_unaligned());
+                store = conv_horiz_plane_1_f32!(bounds_start, unsafe_source_ptr_0, weight0, store);
+                jx += 1;
+            }
+
+            
let px = x; + let dest_ptr = unsafe_destination_ptr_0.add(px); + dest_ptr.write_unaligned(vfullq_sum_f32!(store)); + + filter_offset += filter_weights.aligned_size; + } + } +} + +pub fn convolve_horizontal_plane_neon_rows_4( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const f32, + src_stride: usize, + unsafe_destination_ptr_0: *mut f32, + dst_stride: usize, +) { + unsafe { + let mut filter_offset = 0usize; + let zeros = vdupq_n_f32(0f32); + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = zeros; + let mut store_1 = zeros; + let mut store_2 = zeros; + let mut store_3 = zeros; + + while jx + 16 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = vld1q_f32_x4(ptr); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_16_f32!( + bounds_start, + unsafe_source_ptr_0, + read_weights, + store_0 + ); + let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_1 = conv_horiz_plane_16_f32!(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + store_2 = conv_horiz_plane_16_f32!(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + store_3 = conv_horiz_plane_16_f32!(bounds_start, s_ptr3, read_weights, store_3); + jx += 16; + } + + while jx + 8 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = vld1q_f32_x2(ptr); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_8_f32!( + bounds_start, + unsafe_source_ptr_0, + read_weights.0, + read_weights.1, + store_0 + ); + let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_1 = conv_horiz_plane_8_f32!( + bounds_start, + s_ptr_1, + read_weights.0, + read_weights.1, + store_1 + ); + let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + store_2 = 
conv_horiz_plane_8_f32!( + bounds_start, + s_ptr2, + read_weights.0, + read_weights.1, + store_2 + ); + let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + store_3 = conv_horiz_plane_8_f32!( + bounds_start, + s_ptr3, + read_weights.0, + read_weights.1, + store_3 + ); + jx += 8; + } + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = vld1q_f32(ptr); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_4_f32!( + bounds_start, + unsafe_source_ptr_0, + read_weights, + store_0 + ); + let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_1 = conv_horiz_plane_4_f32!(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + store_2 = conv_horiz_plane_4_f32!(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + store_3 = conv_horiz_plane_4_f32!(bounds_start, s_ptr3, read_weights, store_3); + jx += 4; + } + + while jx + 2 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weights0 = vld1_f32(ptr); + let weights = vcombine_f32(weights0, vdup_n_f32(0.)); + let bounds_start = bounds.start + jx; + store_0 = + conv_horiz_plane_2_f32!(bounds_start, unsafe_source_ptr_0, weights, store_0); + let ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_1 = conv_horiz_plane_2_f32!(bounds_start, ptr_1, weights, store_1); + let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + store_2 = conv_horiz_plane_2_f32!(bounds_start, ptr_2, weights, store_2); + let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + store_3 = conv_horiz_plane_2_f32!(bounds_start, ptr_3, weights, store_3); + jx += 2; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = vdupq_n_f32(ptr.read_unaligned()); + let bounds_start = bounds.start + jx; + store_0 = + conv_horiz_plane_1_f32!(bounds_start, unsafe_source_ptr_0, weight0, store_0); + let ptr_1 = unsafe_source_ptr_0.add(src_stride); + 
store_1 = conv_horiz_plane_1_f32!(bounds_start, ptr_1, weight0, store_1); + let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + store_2 = conv_horiz_plane_1_f32!(bounds_start, ptr_2, weight0, store_2); + let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + store_3 = conv_horiz_plane_1_f32!(bounds_start, ptr_3, weight0, store_3); + jx += 1; + } + + let px = x; + let dest_ptr = unsafe_destination_ptr_0.add(px); + dest_ptr.write_unaligned(vfullq_sum_f32!(store_0)); + + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + dest_ptr.write_unaligned(vfullq_sum_f32!(store_1)); + + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + dest_ptr.write_unaligned(vfullq_sum_f32!(store_2)); + + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + dest_ptr.write_unaligned(vfullq_sum_f32!(store_3)); + + filter_offset += filter_weights.aligned_size; + } + } +} diff --git a/src/plane_f32.rs b/src/plane_f32.rs index 412e1c2..77a8359 100644 --- a/src/plane_f32.rs +++ b/src/plane_f32.rs @@ -33,7 +33,17 @@ use crate::convolve_naive_f32::{ }; use crate::dispatch_group_f32::{convolve_horizontal_dispatch_f32, convolve_vertical_dispatch_f32}; use crate::filter_weights::{FilterBounds, FilterWeights}; +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +use crate::neon::{ + convolve_horizontal_plane_neon_row_one, convolve_horizontal_plane_neon_rows_4, + convolve_vertical_rgb_neon_row_f32, +}; use crate::rgb_f32::convolve_vertical_rgb_native_row_f32; +#[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" +))] +use crate::sse::convolve_vertical_rgb_sse_row_f32; use crate::ImageStore; use rayon::ThreadPool; @@ -45,11 +55,16 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f32, 1> { destination: &mut ImageStore, pool: &Option, ) { - let _dispatcher_4_rows: Option< + let mut _dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, *const f32, usize, *mut f32, usize), > = 
Some(convolve_horizontal_rgba_4_row_f32::); - let _dispatcher_row: fn(usize, usize, &FilterWeights, *const f32, *mut f32) = + let mut _dispatcher_row: fn(usize, usize, &FilterWeights, *const f32, *mut f32) = convolve_horizontal_rgb_native_row::; + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + { + _dispatcher_4_rows = Some(convolve_horizontal_plane_neon_rows_4); + _dispatcher_row = convolve_horizontal_plane_neon_row_one; + } convolve_horizontal_dispatch_f32( self, filter_weights, @@ -68,8 +83,21 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f32, 1> { destination: &mut ImageStore, pool: &Option, ) { - let _dispatcher: fn(usize, &FilterBounds, *const f32, *mut f32, usize, *const f32) = + let mut _dispatcher: fn(usize, &FilterBounds, *const f32, *mut f32, usize, *const f32) = convolve_vertical_rgb_native_row_f32::; + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + { + _dispatcher = convolve_vertical_rgb_neon_row_f32::<1>; + } + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" + ))] + { + if is_x86_feature_detected!("sse4.1") { + _dispatcher = convolve_vertical_rgb_sse_row_f32::<1>; + } + } convolve_vertical_dispatch_f32(self, filter_weights, destination, pool, _dispatcher); } }