diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml index 873aea2..fa78e62 100644 --- a/.github/workflows/build_push.yml +++ b/.github/workflows/build_push.yml @@ -23,14 +23,14 @@ jobs: - uses: actions/checkout@v4 - uses: actions-rust-lang/setup-rust-toolchain@v1 - run: rustup target add aarch64-unknown-linux-gnu x86_64-unknown-linux-gnu i686-unknown-linux-gnu powerpc-unknown-linux-gnu riscv64gc-unknown-linux-gnu - - run: RUSTFLAGS="-C target-feature=+neon,-fp16" cargo build --target aarch64-unknown-linux-gnu --features half - - run: RUSTFLAGS="-C target-feature=+neon,+fp16" cargo build --target aarch64-unknown-linux-gnu --features half + - run: RUSTFLAGS="-C target-feature=+neon,-fp16" cargo build --target aarch64-unknown-linux-gnu + - run: RUSTFLAGS="-C target-feature=+neon,+fp16" cargo +nightly build --target aarch64-unknown-linux-gnu --features nightly_f16 - run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target i686-unknown-linux-gnu - run: cargo build --target powerpc-unknown-linux-gnu - run: cargo build --target riscv64gc-unknown-linux-gnu - run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target x86_64-unknown-linux-gnu - - run: RUSTFLAGS="-C target-feature=+sse4.1,+f16c" cargo build --features half --target x86_64-unknown-linux-gnu - - run: RUSTFLAGS="-C target-feature=+avx2,+f16c" cargo build --features half --target x86_64-unknown-linux-gnu + - run: RUSTFLAGS="-C target-feature=+sse4.1,+f16c" cargo +nightly build --features nightly_f16 --target x86_64-unknown-linux-gnu + - run: RUSTFLAGS="-C target-feature=+avx2,+f16c" cargo +nightly build --features nightly_f16 --target x86_64-unknown-linux-gnu - run: RUSTFLAGS="-C target-feature=+avx2" cargo build --target x86_64-unknown-linux-gnu clippy: @@ -42,7 +42,18 @@ jobs: steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - - run: cargo clippy + - run: cargo clippy -- -D warnings + + clippy_nightly: + name: Clippy Nightly + strategy: + matrix: + os: [ ubuntu-latest, macos-latest ] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - run: cargo clippy --all-features -- -D warnings tests: name: Testing @@ -71,6 +82,18 @@ jobs: - run: cargo fuzz run resize_plane -- -max_total_time=30 - run: cargo fuzz run colorspaces -- -max_total_time=10 + fuzz_rgba_8bit_no_rdm: + name: Fuzzing 8bit wo RDM + runs-on: macos-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@nightly + - run: cargo install cargo-fuzz + - run: cargo fuzz run resize_rgba --no-default-features -- -max_total_time=30 + - run: cargo fuzz run resize_rgb --no-default-features -- -max_total_time=30 + - run: cargo fuzz run resize_cbcr8 --no-default-features -- -max_total_time=30 + - run: cargo fuzz run resize_plane --no-default-features -- -max_total_time=30 + fuzz_rgba_high_bit: name: Fuzzing High bit-depth strategy: @@ -82,6 +105,7 @@ jobs: - uses: dtolnay/rust-toolchain@nightly - run: cargo install cargo-fuzz - run: cargo fuzz run resize_rgba_u16 -- -max_total_time=30 + - run: cargo fuzz run resize_rgba_f16 -- -max_total_time=30 - run: cargo fuzz run resize_rgb_u16 -- -max_total_time=30 - run: cargo fuzz run resize_cbcr16 -- -max_total_time=30 - run: cargo fuzz run resize_plane_u16 -- -max_total_time=30 diff --git a/Cargo.lock b/Cargo.lock index 29d65f6..192a101 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -469,7 +469,6 @@ checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" dependencies = [ "cfg-if 
1.0.0", "crunchy", - "num-traits", ] [[package]] @@ -632,12 +631,6 @@ dependencies = [ "cc", ] -[[package]] -name = "libm" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" - [[package]] name = "litrs" version = "0.4.1" @@ -767,7 +760,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", - "libm", ] [[package]] @@ -790,10 +782,9 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pic-scale" -version = "0.4.2" +version = "0.5.0" dependencies = [ "colorutils-rs", - "half", "libc", "num-traits", "rayon", diff --git a/Cargo.toml b/Cargo.toml index 8e76123..4c1bf4b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ workspace = { members = ["app", "wasm", "fuzz", "app/accelerate"], exclude = ["p [package] name = "pic-scale" -version = "0.4.2" +version = "0.5.0" edition = "2021" description = "High performance image scaling" readme = "README.md" @@ -18,7 +18,6 @@ rust-version = "1.82.0" [dependencies] colorutils-rs = {version = "0.7.4", optional = true} -half = { version = "2.4.1", optional = true, features = ["alloc", "std", "num-traits"] } num-traits = { version = "0.2.19", features = ["std"] } rayon = "1.10.0" @@ -26,7 +25,9 @@ rayon = "1.10.0" libc = "0.2.158" [features] -default = ["colorspaces"] +default = ["colorspaces", "rdm"] colorspaces = ["dep:colorutils-rs"] nightly_avx512 = [] -nightly_avx512fp16 = ["nightly_avx512"] \ No newline at end of file +nightly_avx512fp16 = ["nightly_avx512"] +nightly_f16 = [] +rdm = [] \ No newline at end of file diff --git a/app/Cargo.toml b/app/Cargo.toml index ba9a88c..182eb4b 100644 --- a/app/Cargo.toml +++ b/app/Cargo.toml @@ -5,8 +5,7 @@ edition = "2021" [dependencies] image = { version = "0.25.5", features = ["default"] } -#image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] } -pic-scale = { path = "..", features = ["half"], default-features = true } +pic-scale = { path = "..", features = ["nightly_f16"], default-features = false } fast_image_resize = { version = "5.0.0", features = [] } half = { version = "2.4.1", default-features = true } accelerate = {path = "accelerate/"} @@ -16,7 +15,7 @@ libc = "0.2.169" criterion = "0.5.1" #image = { version = "0.25.2", features = ["default"] } fast_image_resize = { version = "5.0.0", features = [] } -pic-scale = { path = "..", features = ["half"] } +pic-scale = { path = "..", features = ["nightly_f16", "rdm"], default-features = false } [[bench]] name = "resize_rgb" diff --git a/app/accelerate/src/lib.rs b/app/accelerate/src/lib.rs index e7a46b5..138a9ef 100644 --- a/app/accelerate/src/lib.rs +++ b/app/accelerate/src/lib.rs @@ -42,6 +42,15 @@ mod accelerate { flags: libc::c_uint, ) -> libc::c_int; + #[allow(non_camel_case_types)] + #[allow(non_snake_case)] + pub fn vImageScale_XRGB2101010W( + src: *const vImage_Buffer, + dest: *mut vImage_Buffer, + temp_buffer: *mut libc::c_void, + flags: libc::c_uint, + ) -> libc::c_int; + #[allow(non_camel_case_types)] #[allow(non_snake_case)] pub fn vImageScale_ARGBFFFF( diff --git a/app/benches/resize_rgb/main.rs b/app/benches/resize_rgb/main.rs index 7853693..a7ca580 100644 --- a/app/benches/resize_rgb/main.rs +++ b/app/benches/resize_rgb/main.rs @@ -5,7 +5,7 @@ use fast_image_resize::{CpuExtensions, PixelType, ResizeAlg, ResizeOptions, 
Resi use image::{EncodableLayout, GenericImageView, ImageReader}; use pic_scale::{ ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ScalingF32, ScalingU16, - ThreadingPolicy, + ThreadingPolicy, WorkloadStrategy, }; pub fn criterion_benchmark(c: &mut Criterion) { @@ -25,6 +25,22 @@ pub fn criterion_benchmark(c: &mut Criterion) { b.iter(|| { let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); scaler.set_threading_policy(ThreadingPolicy::Single); + scaler.set_workload_strategy(WorkloadStrategy::PreferSpeed); + let mut target = + ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + scaler.resize_rgb(&store, &mut target).unwrap(); + }) + }); + + c.bench_function("Pic scale RGB: Lanczos 3/Quality", |b| { + let copied: Vec = Vec::from(src_bytes); + let store = + ImageStore::::from_slice(&copied, dimensions.0 as usize, dimensions.1 as usize) + .unwrap(); + b.iter(|| { + let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + scaler.set_threading_policy(ThreadingPolicy::Single); + scaler.set_workload_strategy(WorkloadStrategy::PreferQuality); let mut target = ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); scaler.resize_rgb(&store, &mut target).unwrap(); diff --git a/app/benches/resize_rgba/main.rs b/app/benches/resize_rgba/main.rs index 69fbed7..cc681b1 100644 --- a/app/benches/resize_rgba/main.rs +++ b/app/benches/resize_rgba/main.rs @@ -1,11 +1,12 @@ +#![feature(f16)] use criterion::{criterion_group, criterion_main, Criterion}; use fast_image_resize::images::Image; use fast_image_resize::FilterType::Lanczos3; use fast_image_resize::{CpuExtensions, PixelType, ResizeAlg, ResizeOptions, Resizer}; use image::{GenericImageView, ImageReader}; use pic_scale::{ - ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ScalingF32, ScalingU16, - ThreadingPolicy, + Ar30ByteOrder, ImageSize, ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, + ScalingF32, ScalingU16, ThreadingPolicy, WorkloadStrategy, }; pub fn criterion_benchmark(c: &mut Criterion) { @@ -16,7 +17,7 @@ pub fn criterion_benchmark(c: &mut Criterion) { let dimensions = img.dimensions(); let src_bytes = img.as_bytes(); - c.bench_function("Pic scale RGBA with alpha: Lanczos 3", |b| { + /*c.bench_function("Pic scale RGBA with alpha: Lanczos 3", |b| { let copied: Vec = Vec::from(src_bytes); b.iter(|| { let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); @@ -98,8 +99,26 @@ pub fn criterion_benchmark(c: &mut Criterion) { }) }); + c.bench_function("Pic scale RGBA without alpha: Lanczos 3/Quality", |b| { + let copied: Vec = Vec::from(src_bytes); + b.iter(|| { + let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + scaler.set_threading_policy(ThreadingPolicy::Single); + scaler.set_workload_strategy(WorkloadStrategy::PreferQuality); + let store = ImageStore::::from_slice( + &copied, + dimensions.0 as usize, + dimensions.1 as usize, + ) + .unwrap(); + let mut target = + ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + _ = scaler.resize_rgba(&store, &mut target, false); + }) + }); + #[cfg(any(target_os = "macos", target_os = "ios"))] - c.bench_function("Apple Accelerate: Lanczos 3", |b| { + c.bench_function("Apple Accelerate RGBA: Lanczos 3", |b| { let copied: Vec = Vec::from(src_bytes); use accelerate::{kvImageDoNotTile, vImageScale_ARGB8888, vImage_Buffer}; b.iter(|| { @@ -331,6 +350,152 @@ pub fn criterion_benchmark(c: &mut Criterion) { } }) }); + + use core::f16; + + c.bench_function("Pic scale RGBA F16 
without alpha: Lanczos 3/Quality", |b| { + let copied: Vec = vec![0.; src_bytes.len()]; + b.iter(|| { + let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + scaler.set_threading_policy(ThreadingPolicy::Single); + scaler.set_workload_strategy(WorkloadStrategy::PreferQuality); + let store = ImageStore::::from_slice( + &copied, + dimensions.0 as usize, + dimensions.1 as usize, + ) + .unwrap(); + let mut target = + ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + scaler.resize_rgba_f16(&store, &mut target, false).unwrap(); + }) + }); + + #[cfg(any(target_os = "macos", target_os = "ios"))] + c.bench_function("Apple Accelerate RGBAF16: Lanczos 3", |b| { + let copied: Vec = vec![0.; src_bytes.len()]; + use accelerate::{kvImageDoNotTile, vImageScale_ARGB16F, vImage_Buffer}; + b.iter(|| { + let mut target = ImageStoreMut::::alloc( + dimensions.0 as usize / 4, + dimensions.1 as usize / 4, + ); + + let src_buffer = vImage_Buffer { + data: copied.as_ptr() as *mut libc::c_void, + width: dimensions.0 as usize, + height: dimensions.1 as usize, + row_bytes: dimensions.0 as usize * 4 * std::mem::size_of::(), + }; + + let target_stride = target.stride(); + let target_ptr = target.buffer.borrow_mut().as_mut_ptr() as *mut libc::c_void; + + let mut dst_buffer = vImage_Buffer { + data: target_ptr, + width: target.width, + height: target.height, + row_bytes: target_stride * std::mem::size_of::(), + }; + + let result = unsafe { + vImageScale_ARGB16F( + &src_buffer, + &mut dst_buffer, + std::ptr::null_mut(), + kvImageDoNotTile, + ) + }; + if result != 0 { + panic!("Can't resize by accelerate"); + } + }) + });*/ + + c.bench_function("Pic scale RGBA1010102(N0: Lanczos 3/Speed", |b| { + let copied: Vec = Vec::from(src_bytes); + b.iter(|| { + let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + scaler.set_threading_policy(ThreadingPolicy::Single); + scaler.set_workload_strategy(WorkloadStrategy::PreferSpeed); + + let mut dst_data_ar30 = + vec![1u8; (dimensions.0 as usize / 4) * (dimensions.1 as usize / 4) * 4]; + scaler + .resize_ar30( + &copied, + dimensions.0 as usize * 4, + ImageSize::new(dimensions.0 as usize, dimensions.1 as usize), + &mut dst_data_ar30, + (dimensions.0 as usize / 4) * 4, + ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4), + Ar30ByteOrder::Network, + ) + .unwrap(); + }) + }); + + c.bench_function("Pic scale RGBA1010102(N): Lanczos 3/Quality", |b| { + let copied: Vec = Vec::from(src_bytes); + b.iter(|| { + let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + scaler.set_threading_policy(ThreadingPolicy::Single); + scaler.set_workload_strategy(WorkloadStrategy::PreferQuality); + + let mut dst_data_ar30 = + vec![1u8; (dimensions.0 as usize / 4) * (dimensions.1 as usize / 4) * 4]; + scaler + .resize_ar30( + &copied, + dimensions.0 as usize * 4, + ImageSize::new(dimensions.0 as usize, dimensions.1 as usize), + &mut dst_data_ar30, + (dimensions.0 as usize / 4) * 4, + ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4), + Ar30ByteOrder::Network, + ) + .unwrap(); + }) + }); + + #[cfg(any(target_os = "macos", target_os = "ios"))] + c.bench_function("Apple Accelerate RGBX1010102(N): Lanczos 3", |b| { + let copied: Vec = Vec::from(src_bytes); + use accelerate::{kvImageDoNotTile, vImageScale_XRGB2101010W, vImage_Buffer}; + b.iter(|| { + let mut target = + ImageStoreMut::::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + + let src_buffer = vImage_Buffer { + data: copied.as_ptr() as *mut libc::c_void, + 
width: dimensions.0 as usize, + height: dimensions.1 as usize, + row_bytes: dimensions.0 as usize * 4, + }; + + let target_stride = target.stride(); + let target_ptr = target.buffer.borrow_mut().as_mut_ptr() as *mut libc::c_void; + + let mut dst_buffer = vImage_Buffer { + data: target_ptr, + width: target.width, + height: target.height, + row_bytes: target_stride, + }; + + let result = unsafe { + vImageScale_XRGB2101010W( + &src_buffer, + &mut dst_buffer, + std::ptr::null_mut(), + kvImageDoNotTile, + ) + }; + if result != 0 { + panic!("Can't resize by accelerate"); + } + }) + }); } criterion_group!(benches, criterion_benchmark); diff --git a/app/src/main.rs b/app/src/main.rs index de82ee5..03d7565 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -1,17 +1,20 @@ #![feature(avx512_target_feature)] +#![feature(f16)] mod merge; mod split; use std::time::Instant; +use core::f16; use fast_image_resize::images::Image; use fast_image_resize::{ CpuExtensions, FilterType, IntoImageView, PixelType, ResizeAlg, ResizeOptions, Resizer, }; use image::{EncodableLayout, GenericImageView, ImageReader}; use pic_scale::{ - CbCr8ImageStore, CbCr8ImageStoreMut, ImageSize, ImageStore, ImageStoreMut, ImageStoreScaling, - ResamplingFunction, Scaler, Scaling, ScalingU16, ThreadingPolicy, + Ar30ByteOrder, ImageSize, ImageStore, ImageStoreMut, ImageStoreScaling, ResamplingFunction, + RgbF16ImageStore, RgbF16ImageStoreMut, Rgba16ImageStoreMut, RgbaF16ImageStore, + RgbaF16ImageStoreMut, Scaler, Scaling, ScalingU16, ThreadingPolicy, WorkloadStrategy, }; fn resize_plane( @@ -48,19 +51,44 @@ fn main() { .decode() .unwrap(); let dimensions = img.dimensions(); - let transient = img.to_luma_alpha8(); + let transient = img.to_rgb8(); let mut bytes = Vec::from(transient.as_bytes()); let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); scaler.set_threading_policy(ThreadingPolicy::Single); + scaler.set_workload_strategy(WorkloadStrategy::PreferQuality); // resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear); // let mut choke: Vec = bytes.iter().map(|&x| (x as u16) << 2).collect(); + let src_width = 289; + let src_height = 257; + let dst_width = 257; + let dst_height = 511; + let src_data_ar30 = vec![1u8; src_width * src_height * 4]; + let mut dst_data_ar30 = vec![1u8; dst_width * dst_height * 4]; + scaler + .resize_ar30( + &src_data_ar30, + src_width * 4, + ImageSize::new(src_width, src_height), + &mut dst_data_ar30, + dst_width * 4, + ImageSize::new(dst_width, dst_height), + Ar30ByteOrder::Host, + ) + .unwrap(); + + let rgb_feature16 = transient + .iter() + .map(|&x| (x as f32 / 255f32) as f16) + .collect::>(); + // let store = - CbCr8ImageStore::from_slice(&bytes, dimensions.0 as usize, dimensions.1 as usize).unwrap(); + RgbF16ImageStore::from_slice(&rgb_feature16, dimensions.0 as usize, dimensions.1 as usize) + .unwrap(); let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4); // let mut resized_ar = vec![0u32; dst_size.width * dst_size.height]; @@ -74,7 +102,7 @@ fn main() { // ) // .unwrap(); - let mut dst_store = CbCr8ImageStoreMut::alloc_with_depth( + let mut dst_store = RgbF16ImageStoreMut::alloc_with_depth( dimensions.0 as usize / 4, dimensions.1 as usize / 4, 10, @@ -82,7 +110,7 @@ fn main() { // for i in 0..25 { let start_time = Instant::now(); - scaler.resize_cbcr8(&store, &mut dst_store).unwrap(); + scaler.resize_rgb_f16(&store, &mut dst_store).unwrap(); let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds @@ -130,7 +158,11 @@ fn main() 
{ // .map(|&x| (x >> 2) as u8) // .collect(); - let dst = dst_store.as_bytes(); + let dst = dst_store + .as_bytes() + .iter() + .map(|&x| (x as f32 * 255.).round() as u8) + .collect::>(); // let dst = resized; // image::save_buffer( // "converted.png", @@ -152,11 +184,11 @@ fn main() { .unwrap(); } else { image::save_buffer( - "converted.webp", + "converted.png", &dst, dst_store.width as u32, dst_store.height as u32, - image::ColorType::La8, + image::ColorType::Rgb8, ) .unwrap(); } diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 973b015..1b25765 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -9,10 +9,11 @@ cargo-fuzz = true [dependencies] libfuzzer-sys = "0.4" -pic-scale = { path = "../" } +pic-scale = { path = "../", features = ["nightly_f16"], default-features = true } [features] nightly_avx512 = ["pic-scale/nightly_avx512"] +rdm = [] [[bin]] name = "resize_rgba" @@ -49,6 +50,13 @@ test = false doc = false bench = false +[[bin]] +name = "resize_rgba_f16" +path = "resize_rgba_f16/resize_rgba_f16.rs" +test = false +doc = false +bench = false + [[bin]] name = "resize_cbcr16" path = "resize_cbcr16/resize_cbcr16.rs" diff --git a/fuzz/resize_rgba_f16/resize_rgba_f16.rs b/fuzz/resize_rgba_f16/resize_rgba_f16.rs new file mode 100644 index 0000000..bcb24ad --- /dev/null +++ b/fuzz/resize_rgba_f16/resize_rgba_f16.rs @@ -0,0 +1,82 @@ +#![feature(f16)] +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#![no_main] + +use core::f16; +use libfuzzer_sys::fuzz_target; +use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, WorkloadStrategy}; + +fuzz_target!(|data: (u16, u16, u16, u16, bool, bool)| { + let strategy = if data.5 { + WorkloadStrategy::PreferQuality + } else { + WorkloadStrategy::PreferSpeed + }; + resize_rgba( + data.0 as usize, + data.1 as usize, + data.2 as usize, + data.3 as usize, + ResamplingFunction::Lanczos3, + data.4, + strategy, + ) +}); + +fn resize_rgba( + src_width: usize, + src_height: usize, + dst_width: usize, + dst_height: usize, + sampler: ResamplingFunction, + premultiply_alpha: bool, + workload_strategy: WorkloadStrategy, +) { + if src_width == 0 + || src_width > 2000 + || src_height == 0 + || src_height > 2000 + || dst_width == 0 + || dst_width > 512 + || dst_height == 0 + || dst_height > 512 + { + return; + } + + let store = ImageStore::::alloc(src_width, src_height); + let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 10); + + let mut scaler = Scaler::new(sampler); + scaler.set_workload_strategy(workload_strategy); + scaler + .resize_rgba_f16(&store, &mut target, premultiply_alpha) + .unwrap(); +} diff --git a/fuzz/resize_rgba_u16/resize_rgba_u16.rs b/fuzz/resize_rgba_u16/resize_rgba_u16.rs index 571290d..e916f24 100644 --- a/fuzz/resize_rgba_u16/resize_rgba_u16.rs +++ b/fuzz/resize_rgba_u16/resize_rgba_u16.rs @@ -32,15 +32,23 @@ use libfuzzer_sys::fuzz_target; use pic_scale::{ Ar30ByteOrder, ImageSize, ImageStore, ImageStoreMut, ResamplingFunction, Scaler, ScalingU16, + WorkloadStrategy, }; -fuzz_target!(|data: (u16, u16, u16, u16)| { +fuzz_target!(|data: (u16, u16, u16, u16, bool, bool)| { + let strategy = if data.5 { + WorkloadStrategy::PreferQuality + } else { + WorkloadStrategy::PreferSpeed + }; resize_rgba( data.0 as usize, data.1 as usize, data.2 as usize, data.3 as usize, ResamplingFunction::Lanczos3, + data.4, + strategy, ) }); @@ -50,6 +58,8 @@ fn resize_rgba( dst_width: usize, dst_height: usize, sampler: ResamplingFunction, + premultiply_alpha: bool, + workload_strategy: WorkloadStrategy, ) { if src_width == 0 || src_width > 2000 @@ -66,25 +76,27 @@ fn resize_rgba( let store = ImageStore::::alloc(src_width, src_height); let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 10); - let scaler = Scaler::new(sampler); - scaler.resize_rgba_u16(&store, &mut target, false).unwrap(); - let store = ImageStore::::alloc(src_width, src_height); - scaler.resize_rgba_u16(&store, &mut target, true).unwrap(); + let mut scaler = Scaler::new(sampler); + scaler.set_workload_strategy(workload_strategy); + scaler + .resize_rgba_u16(&store, &mut target, premultiply_alpha) + .unwrap(); let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 16); let store = ImageStore::::alloc(src_width, src_height); - scaler.resize_rgba_u16(&store, &mut target, false).unwrap(); - - let store = ImageStore::::alloc(src_width, src_height); - scaler.resize_rgba_u16(&store, &mut target, true).unwrap(); + scaler + .resize_rgba_u16(&store, &mut target, premultiply_alpha) + .unwrap(); - let src_data_ar30 = vec![1u32; src_width * src_height]; - let mut dst_data_ar30 = vec![1u32; dst_width * dst_height]; + let src_data_ar30 = vec![1u8; src_width * src_height * 4]; + let mut dst_data_ar30 = vec![1u8; dst_width * dst_height * 4]; _ = scaler.resize_ar30( &src_data_ar30, + src_width * 4, ImageSize::new(src_width, src_height), &mut dst_data_ar30, + dst_height * 4, ImageSize::new(dst_width, dst_height), Ar30ByteOrder::Host, 
); diff --git a/picscale/Cargo.lock b/picscale/Cargo.lock index 779b028..0215a43 100644 --- a/picscale/Cargo.lock +++ b/picscale/Cargo.lock @@ -65,9 +65,9 @@ checksum = "1be3f42a67d6d345ecd59f675f3f012d6974981560836e938c22b424b85ce1be" [[package]] name = "cbindgen" -version = "0.27.0" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fce8dd7fcfcbf3a0a87d8f515194b49d6135acab73e18bd380d1d93bb1a15eb" +checksum = "eadd868a2ce9ca38de7eeafdcec9c7065ef89b42b32f0839278d55f35c54d1ff" dependencies = [ "clap", "heck", @@ -220,7 +220,6 @@ checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" dependencies = [ "cfg-if", "crunchy", - "num-traits", ] [[package]] @@ -263,12 +262,6 @@ version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" -[[package]] -name = "libm" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" - [[package]] name = "linux-raw-sys" version = "0.4.15" @@ -294,7 +287,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", - "libm", ] [[package]] @@ -305,10 +297,9 @@ checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "pic-scale" -version = "0.4.2" +version = "0.5.0" dependencies = [ "colorutils-rs", - "half", "libc", "num-traits", "rayon", @@ -319,7 +310,6 @@ name = "picscale" version = "0.1.0" dependencies = [ "cbindgen", - "half", "num-traits", "pic-scale", ] diff --git a/picscale/Cargo.toml b/picscale/Cargo.toml index 215c206..f6875ee 100644 --- a/picscale/Cargo.toml +++ b/picscale/Cargo.toml @@ -5,8 +5,7 @@ edition = "2024" build = "build.rs" [dependencies] -pic-scale = { path = "../", features = ["half"] } -half = "2.4.1" +pic-scale = { path = "../", features = ["nightly_f16"] } num-traits = "0.2.19" [features] @@ -17,7 +16,7 @@ full_support = [] crate-type = ["staticlib", "rlib"] [build-dependencies] -cbindgen = "0.27.0" +cbindgen = "0.28.0" [profile.release] strip = true diff --git a/picscale/src/lib.rs b/picscale/src/lib.rs index b8b7ffe..a835ca3 100644 --- a/picscale/src/lib.rs +++ b/picscale/src/lib.rs @@ -740,6 +740,8 @@ pub extern "C" fn pic_scale_resize_planar_f32( ) } +use core::f16; + /// Resizes an RGBAF16 image /// /// # Arguments @@ -769,12 +771,12 @@ pub extern "C" fn pic_scale_resize_rgba_f16( resizing_filter: ScalingFilter, flags: u32, ) -> usize { - pic_scale_scale_generic::( - src as *const half::f16, + pic_scale_scale_generic::( + src as *const f16, src_stride, width, height, - dst as *mut half::f16, + dst as *mut f16, dst_stride, new_width, new_height, @@ -814,12 +816,12 @@ pub extern "C" fn pic_scale_resize_rgb_f16( resizing_filter: ScalingFilter, flags: u32, ) -> usize { - pic_scale_scale_generic::( - src as *const half::f16, + pic_scale_scale_generic::( + src as *const f16, src_stride, width, height, - dst as *mut half::f16, + dst as *mut f16, dst_stride, new_width, new_height, @@ -859,12 +861,12 @@ pub extern "C" fn pic_scale_resize_cbcr_f16( resizing_filter: ScalingFilter, flags: u32, ) -> usize { - pic_scale_scale_generic::( - src as *const half::f16, + pic_scale_scale_generic::( + src as *const f16, src_stride, width, height, - dst as *mut half::f16, + dst as *mut f16, dst_stride, new_width, new_height, @@ 
-904,12 +906,12 @@ pub extern "C" fn pic_scale_resize_planar_f16( resizing_filter: ScalingFilter, flags: u32, ) -> usize { - pic_scale_scale_generic::( - src as *const half::f16, + pic_scale_scale_generic::( + src as *const f16, src_stride, width, height, - dst as *mut half::f16, + dst as *mut f16, dst_stride, new_width, new_height, diff --git a/src/alpha_handle_f16.rs b/src/alpha_handle_f16.rs index 10dfef0..835df49 100644 --- a/src/alpha_handle_f16.rs +++ b/src/alpha_handle_f16.rs @@ -36,17 +36,18 @@ use crate::neon::{neon_premultiply_alpha_rgba_f16, neon_unpremultiply_alpha_rgba use crate::neon::{neon_premultiply_alpha_rgba_f16_full, neon_unpremultiply_alpha_rgba_f16_full}; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::sse::{sse_premultiply_alpha_rgba_f16, sse_unpremultiply_alpha_rgba_f16}; +use core::f16; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; #[inline] -pub(crate) fn unpremultiply_pixel_f16_row(in_place: &mut [half::f16]) { +pub(crate) fn unpremultiply_pixel_f16_row(in_place: &mut [f16]) { for dst in in_place.chunks_exact_mut(4) { - let mut r = dst[0].to_f32(); - let mut g = dst[1].to_f32(); - let mut b = dst[2].to_f32(); - let a = dst[3].to_f32(); + let mut r = dst[0] as f32; + let mut g = dst[1] as f32; + let mut b = dst[2] as f32; + let a = dst[3] as f32; if a != 0. { let scale_alpha = 1. / a; r *= scale_alpha; @@ -57,33 +58,33 @@ pub(crate) fn unpremultiply_pixel_f16_row(in_place: &mut [half::f16]) { g = 0.; b = 0.; } - dst[0] = half::f16::from_f32(r); - dst[1] = half::f16::from_f32(g); - dst[2] = half::f16::from_f32(b); + dst[0] = r as f16; + dst[1] = g as f16; + dst[2] = b as f16; } } #[inline] -pub(crate) fn premultiply_pixel_f16_row(dst: &mut [half::f16], src: &[half::f16]) { +pub(crate) fn premultiply_pixel_f16_row(dst: &mut [f16], src: &[f16]) { for (dst, src) in dst.chunks_exact_mut(4).zip(src.chunks_exact(4)) { - let mut r = src[0].to_f32(); - let mut g = src[1].to_f32(); - let mut b = src[2].to_f32(); - let a = src[3].to_f32(); + let mut r = src[0] as f32; + let mut g = src[1] as f32; + let mut b = src[2] as f32; + let a = src[3] as f32; r *= a; g *= a; b *= a; - dst[0] = half::f16::from_f32(r); - dst[1] = half::f16::from_f32(g); - dst[2] = half::f16::from_f32(b); - dst[3] = half::f16::from_f32(a); + dst[0] = r as f16; + dst[1] = g as f16; + dst[2] = b as f16; + dst[3] = a as f16; } } fn premultiply_alpha_rgba_impl_f16( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, _: usize, @@ -107,7 +108,7 @@ fn premultiply_alpha_rgba_impl_f16( } fn unpremultiply_alpha_rgba_impl_f16( - dst: &mut [half::f16], + dst: &mut [f16], stride: usize, width: usize, _: usize, @@ -127,18 +128,19 @@ fn unpremultiply_alpha_rgba_impl_f16( } pub(crate) fn premultiply_alpha_rgba_f16( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, height: usize, pool: &Option, ) { + #[allow(clippy::type_complexity)] let mut _dispatcher: fn( - &mut [half::f16], + &mut [f16], usize, - &[half::f16], + &[f16], usize, usize, usize, @@ -167,13 +169,13 @@ pub(crate) fn premultiply_alpha_rgba_f16( } pub(crate) fn unpremultiply_alpha_rgba_f16( - in_place: &mut [half::f16], + in_place: &mut [f16], stride: usize, width: usize, height: usize, pool: &Option, ) { - let mut _dispatcher: fn(&mut [half::f16], usize, usize, usize, &Option) = + let 
mut _dispatcher: fn(&mut [f16], usize, usize, usize, &Option) = unpremultiply_alpha_rgba_impl_f16; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { diff --git a/src/ar30.rs b/src/ar30.rs index b3ccf4c..f97ecec 100644 --- a/src/ar30.rs +++ b/src/ar30.rs @@ -58,11 +58,24 @@ const fn ntohl(netlong: u32) -> u32 { } impl Rgb30 { + // #[inline] + // pub(crate) const fn pack_w_a(self, r: i32, g: i32, b: i32, a: i32) -> u32 { + // let value: u32 = match self { + // Rgb30::Ar30 => (((a << 30) | (b << 20)) | ((g << 10) | r)) as u32, + // Rgb30::Ra30 => (((r << 22) | (g << 12)) | ((b << 2) | a)) as u32, + // }; + // if STORE == 0 { + // value + // } else { + // htonl(value) + // } + // } + #[inline] - pub(crate) const fn pack_w_a(self, r: i32, g: i32, b: i32, a: i32) -> u32 { + pub(crate) const fn pack_w_a(self, r: i32, g: i32, b: i32, _: i32) -> u32 { let value: u32 = match self { - Rgb30::Ar30 => (((a << 30) | (b << 20)) | ((g << 10) | r)) as u32, - Rgb30::Ra30 => (((r << 22) | (g << 12)) | ((b << 2) | a)) as u32, + Rgb30::Ar30 => (((3 << 30) | (b << 20)) | ((g << 10) | r)) as u32, + Rgb30::Ra30 => (((r << 22) | (g << 12)) | ((b << 2) | 3)) as u32, }; if STORE == 0 { value @@ -79,15 +92,15 @@ impl Rgb30 { let r10 = pixel & 0x3ff; let g10 = (pixel >> 10) & 0x3ff; let b10 = (pixel >> 20) & 0x3ff; - let a10 = pixel >> 30; - (r10, g10, b10, a10) + // let a10 = pixel >> 30; + (r10, g10, b10, 3) } Rgb30::Ra30 => { - let a2 = pixel & 0x3; + // let a2 = pixel & 0x3; let r10 = (pixel >> 22) & 0x3ff; let g10 = (pixel >> 12) & 0x3ff; let b10 = (pixel >> 2) & 0x3ff; - (r10, g10, b10, a2) + (r10, g10, b10, 3) } } } diff --git a/src/avx2/alpha_f16.rs b/src/avx2/alpha_f16.rs index d74f2e1..9cafee2 100644 --- a/src/avx2/alpha_f16.rs +++ b/src/avx2/alpha_f16.rs @@ -29,6 +29,7 @@ use crate::alpha_handle_f16::{premultiply_pixel_f16_row, unpremultiply_pixel_f16_row}; use crate::avx2::utils::{avx_combine_epi, avx_deinterleave_rgba_epi16, avx_interleave_rgba_epi16}; +use core::f16; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; @@ -38,9 +39,9 @@ use std::arch::x86::*; use std::arch::x86_64::*; pub(crate) fn avx_premultiply_alpha_rgba_f16( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, height: usize, @@ -53,7 +54,7 @@ pub(crate) fn avx_premultiply_alpha_rgba_f16( #[target_feature(enable = "avx2", enable = "f16c")] /// This inlining is required to activate all features for runtime dispatch -unsafe fn avx_premultiply_alpha_rgba_f16_row_impl(dst: &mut [half::f16], src: &[half::f16]) { +unsafe fn avx_premultiply_alpha_rgba_f16_row_impl(dst: &mut [f16], src: &[f16]) { let mut rem = dst; let mut src_rem = src; @@ -116,9 +117,9 @@ unsafe fn avx_premultiply_alpha_rgba_f16_row_impl(dst: &mut [half::f16], src: &[ #[target_feature(enable = "avx2", enable = "f16c")] /// This inlining is required to activate all features for runtime dispatch unsafe fn avx_premultiply_alpha_rgba_f16_impl( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, _: usize, @@ -145,7 +146,7 @@ unsafe fn avx_premultiply_alpha_rgba_f16_impl( } pub(crate) fn avx_unpremultiply_alpha_rgba_f16( - in_place: &mut [half::f16], + in_place: &mut [f16], stride: usize, width: usize, height: usize, @@ -158,7 +159,7 @@ pub(crate) fn avx_unpremultiply_alpha_rgba_f16( #[target_feature(enable = "avx2", enable 
= "f16c")] /// This inlining is required to activate all features for runtime dispatch -unsafe fn avx_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [half::f16]) { +unsafe fn avx_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [f16]) { let mut rem = in_place; for dst in rem.chunks_exact_mut(16 * 4) { @@ -243,7 +244,7 @@ unsafe fn avx_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [half::f16]) #[target_feature(enable = "avx2", enable = "f16c")] /// This inlining is required to activate all features for runtime dispatch unsafe fn avx_unpremultiply_alpha_rgba_f16_impl( - in_place: &mut [half::f16], + in_place: &mut [f16], stride: usize, width: usize, _: usize, diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs index 48aa472..72176a1 100644 --- a/src/avx2/mod.rs +++ b/src/avx2/mod.rs @@ -27,26 +27,26 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod alpha_f16; mod alpha_f32; mod alpha_u16; mod alpha_u8; mod check_alpha; mod rgb_u8; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod rgba_f16; mod rgba_f32; mod rgba_u8_lb; pub(crate) mod utils; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod vertical_f16; mod vertical_f32; mod vertical_u16_lb; mod vertical_u8; mod vertical_u8_lp; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use alpha_f16::{avx_premultiply_alpha_rgba_f16, avx_unpremultiply_alpha_rgba_f16}; pub(crate) use alpha_f32::avx_premultiply_alpha_rgba_f32; pub(crate) use alpha_f32::avx_unpremultiply_alpha_rgba_f32; @@ -57,7 +57,7 @@ pub(crate) use check_alpha::{ avx_has_non_constant_cap_alpha_rgba16, avx_has_non_constant_cap_alpha_rgba8, }; pub(crate) use rgb_u8::{convolve_horizontal_rgb_avx_row_one, convolve_horizontal_rgb_avx_rows_4}; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use rgba_f16::{ convolve_horizontal_rgba_avx_row_one_f16, convolve_horizontal_rgba_avx_rows_4_f16, }; @@ -67,7 +67,7 @@ pub(crate) use rgba_f32::{ pub(crate) use rgba_u8_lb::{ convolve_horizontal_rgba_avx_rows_4_lb, convolve_horizontal_rgba_avx_rows_one_lb, }; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use vertical_f16::convolve_vertical_avx_row_f16; pub(crate) use vertical_f32::convolve_vertical_avx_row_f32; pub(crate) use vertical_u16_lb::convolve_column_lb_avx2_u16; diff --git a/src/avx2/rgba_f16.rs b/src/avx2/rgba_f16.rs index b2bc4d5..67a8cd1 100644 --- a/src/avx2/rgba_f16.rs +++ b/src/avx2/rgba_f16.rs @@ -32,7 +32,7 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -use half::f16; +use core::f16; use crate::avx2::utils::{_mm256_fma_ps, avx_combine_ps}; use crate::filter_weights::FilterWeights; diff --git a/src/avx2/vertical_f16.rs b/src/avx2/vertical_f16.rs index 67f3f0d..3c2e710 100644 --- a/src/avx2/vertical_f16.rs +++ b/src/avx2/vertical_f16.rs @@ -28,6 +28,7 @@ */ use crate::avx2::utils::{_mm256_fma_ps, avx_combine_epi}; use crate::filter_weights::FilterBounds; +use core::f16; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] @@ -37,9 +38,9 @@ use std::arch::x86_64::*; unsafe fn convolve_vertical_part_avx_f16( start_y: usize, start_x: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -76,9 +77,9 @@ unsafe fn convolve_vertical_part_avx_f16( unsafe fn convolve_vertical_part_avx_4_f16( start_y: usize, start_x: usize, - src: &[half::f16], + 
src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -109,9 +110,9 @@ unsafe fn convolve_vertical_part_avx_4_f16( unsafe fn convolve_vertical_part_avx_32_f16( start_y: usize, start_x: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -164,9 +165,9 @@ unsafe fn convolve_vertical_part_avx_32_f16( unsafe fn convolve_vertical_part_avx_16_f16( start_y: usize, start_x: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -204,8 +205,8 @@ unsafe fn convolve_vertical_part_avx_16_f16( pub(crate) fn convolve_vertical_avx_row_f16( width: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -223,8 +224,8 @@ pub(crate) fn convolve_vertical_avx_row_f16( unsafe fn convolve_vertical_avx_row_f16_regular( width: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -236,8 +237,8 @@ unsafe fn convolve_vertical_avx_row_f16_regular( unsafe fn convolve_vertical_avx_row_f16_fma( width: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -248,8 +249,8 @@ unsafe fn convolve_vertical_avx_row_f16_fma( unsafe fn convolve_vertical_avx_row_f16_impl( _: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) { diff --git a/src/avx512/alpha_u8.rs b/src/avx512/alpha_u8.rs index 3f2e223..998e056 100644 --- a/src/avx512/alpha_u8.rs +++ b/src/avx512/alpha_u8.rs @@ -94,7 +94,7 @@ impl AssociateAlpha for AssociateAlphaDefault { if !rem.is_empty() { assert!(rem.len() <= 64); assert!(src_rem.len() <= 64); - self.associate_chunk(&mut rem, &src_rem); + self.associate_chunk(rem, src_rem); } } } diff --git a/src/cbcr16.rs b/src/cbcr16.rs index fd2d13d..e882d35 100644 --- a/src/cbcr16.rs +++ b/src/cbcr16.rs @@ -26,7 +26,7 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u16::{convolve_horizontal_dispatch_u16, convolve_vertical_dispatch_u16}; use crate::filter_weights::FilterWeights; use crate::{ImageStore, ImageStoreMut}; @@ -39,6 +39,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u16, 2> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, _pool: &Option, + _: ConvolutionOptions, ) { convolve_horizontal_dispatch_u16(self, filter_weights, destination, _pool); } @@ -50,7 +51,8 @@ impl VerticalConvolutionPass for ImageStore<'_, u16, 2> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + options: ConvolutionOptions, ) { - convolve_vertical_dispatch_u16(self, filter_weights, destination, pool); + convolve_vertical_dispatch_u16(self, filter_weights, destination, pool, options); } } diff --git a/src/cbcr8.rs b/src/cbcr8.rs index d1be3e8..3153506 100644 --- a/src/cbcr8.rs +++ b/src/cbcr8.rs @@ -28,14 +28,12 @@ */ #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::avx2::{convolve_vertical_avx_row, convolve_vertical_avx_row_lp}; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; use crate::filter_weights::{DefaultWeightsConverter, FilterBounds, FilterWeights}; use crate::handler_provider::{ handle_fixed_column_u8, handle_fixed_row_u8, handle_fixed_rows_4_u8, }; -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon::{convolve_vertical_neon_i16_precision, convolve_vertical_neon_i32_precision}; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::sse::{convolve_vertical_sse_row, convolve_vertical_sse_row_lp}; #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] @@ -50,6 +48,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 2> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, _pool: &Option, + _options: ConvolutionOptions, ) { let _scale_factor = self.width as f32 / destination.width as f32; let mut _dispatcher_4_rows: Option< @@ -59,7 +58,11 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 2> { handle_fixed_row_u8::<2>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { + #[cfg(feature = "rdm")] + if _scale_factor < 8. + && crate::cpu_features::is_aarch_rdm_supported() + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { use crate::neon::{ convolve_horizontal_cbcr_neon_rdm_row, convolve_horizontal_cbcr_neon_rows_rdm_4_u8, @@ -70,7 +73,10 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 2> { } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { - if std::arch::is_x86_feature_detected!("sse4.1") && _scale_factor < 8. { + if std::arch::is_x86_feature_detected!("sse4.1") + && _scale_factor < 8. 
+ && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { use crate::sse::{ convolve_horizontal_cbcr_sse_hrs_row_one, convolve_horizontal_cbcr_sse_hrs_rows_4, @@ -97,6 +103,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 2> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { let _scale_factor = self.height as f32 / destination.height as f32; #[allow(clippy::type_complexity)] @@ -105,34 +112,56 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 2> { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { // For more downscaling better to use more precise version - if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { - _dispatcher = convolve_vertical_neon_i16_precision; - } else { - _dispatcher = convolve_vertical_neon_i32_precision; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::neon::convolve_vertical_neon_i32_precision_d; + _dispatcher = convolve_vertical_neon_i32_precision_d; + } + crate::WorkloadStrategy::PreferSpeed => { + // For more downscaling better to use more precise version + #[cfg(feature = "rdm")] + if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { + use crate::neon::convolve_vertical_neon_i16_precision; + _dispatcher = convolve_vertical_neon_i16_precision; + } else { + use crate::neon::convolve_vertical_neon_i32_precision; + _dispatcher = convolve_vertical_neon_i32_precision; + } + #[cfg(not(feature = "rdm"))] + { + use crate::neon::convolve_vertical_neon_i32_precision; + _dispatcher = convolve_vertical_neon_i32_precision; + } + } } } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { if is_x86_feature_detected!("sse4.1") { - if _scale_factor < 8. { + if _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher = convolve_vertical_sse_row_lp; } else { _dispatcher = convolve_vertical_sse_row; } } if is_x86_feature_detected!("avx2") { - if _scale_factor < 8. { + if _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher = convolve_vertical_avx_row_lp; } else { _dispatcher = convolve_vertical_avx_row; } } #[cfg(feature = "nightly_avx512")] - if std::arch::is_x86_feature_detected!("avx512bw") { - if _scale_factor < 8. { - use crate::avx512::convolve_vertical_avx512_row_lp; - _dispatcher = convolve_vertical_avx512_row_lp; - } + if std::arch::is_x86_feature_detected!("avx512bw") + && _scale_factor < 8. 
+ && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::avx512::convolve_vertical_avx512_row_lp; + _dispatcher = convolve_vertical_avx512_row_lp; } } #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] diff --git a/src/cbcr_f32.rs b/src/cbcr_f32.rs index cbd372e..b2c72d5 100644 --- a/src/cbcr_f32.rs +++ b/src/cbcr_f32.rs @@ -28,7 +28,7 @@ */ #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::avx2::convolve_vertical_avx_row_f32; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::convolve_naive_f32::{ convolve_horizontal_rgb_native_row, convolve_horizontal_rgba_4_row_f32, }; @@ -49,6 +49,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 2> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { let _dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, &[f32], usize, &mut [f32], usize), @@ -72,6 +73,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 2> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = diff --git a/src/color_group.rs b/src/color_group.rs index 9f71eb6..d90de83 100644 --- a/src/color_group.rs +++ b/src/color_group.rs @@ -417,7 +417,8 @@ where macro_rules! load_ar30 { ($store: expr, $ar_type: expr, $ar_order: ty) => {{ let ar_type: crate::ar30::Rgb30 = $ar_type.into(); - let unpacked = ar_type.unpack::<$ar_order>($store[0]); + let read_bits = u32::from_ne_bytes([$store[0], $store[1], $store[2], $store[3]]); + let unpacked = ar_type.unpack::<$ar_order>(read_bits); ColorGroup::<4, i32> { r: unpacked.0 as i32, g: unpacked.1 as i32, @@ -432,7 +433,8 @@ pub(crate) use load_ar30; macro_rules! load_ar30_p { ($store: expr, $ar_type: expr, $ar_order: ty) => {{ let ar_type: crate::ar30::Rgb30 = $ar_type.into(); - let unpacked = ar_type.unpack::<$ar_order>(*$store); + let read_bits = u32::from_ne_bytes([$store[0], $store[1], $store[2], $store[3]]); + let unpacked = ar_type.unpack::<$ar_order>(read_bits); ColorGroup::<4, i32> { r: unpacked.0 as i32, g: unpacked.1 as i32, @@ -447,7 +449,10 @@ pub(crate) use load_ar30_p; macro_rules! load_ar30_with_offset { ($store: expr, $ar_type: expr, $ar_order: ty, $offset: expr) => {{ let ar_type: crate::ar30::Rgb30 = $ar_type.into(); - let unpacked = ar_type.unpack::<$ar_order>($store[$offset]); + let cn = $offset * 4; + let read_bits = + u32::from_ne_bytes([$store[cn], $store[cn + 1], $store[cn + 2], $store[cn + 3]]); + let unpacked = ar_type.unpack::<$ar_order>(read_bits); ColorGroup::<4, i32> { r: unpacked.0 as i32, g: unpacked.1 as i32, diff --git a/src/convolution.rs b/src/convolution.rs index 8c67463..b8af2fe 100644 --- a/src/convolution.rs +++ b/src/convolution.rs @@ -27,33 +27,48 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -use num_traits::FromPrimitive; use rayon::ThreadPool; use std::fmt::Debug; use crate::filter_weights::FilterWeights; use crate::image_store::ImageStoreMut; +use crate::scaler::WorkloadStrategy; + +#[derive(Debug, Clone, Copy, Ord, PartialOrd, Eq, PartialEq)] +pub(crate) struct ConvolutionOptions { + pub(crate) workload_strategy: WorkloadStrategy, +} + +impl ConvolutionOptions { + pub(crate) fn new(strategy: WorkloadStrategy) -> Self { + Self { + workload_strategy: strategy, + } + } +} pub(crate) trait HorizontalConvolutionPass where - T: FromPrimitive + Clone + Copy + Debug, + T: Clone + Copy + Debug, { fn convolve_horizontal( &self, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + options: ConvolutionOptions, ); } pub(crate) trait VerticalConvolutionPass where - T: FromPrimitive + Clone + Copy + Debug, + T: Clone + Copy + Debug, { fn convolve_vertical( &self, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + options: ConvolutionOptions, ); } diff --git a/src/cpu_features.rs b/src/cpu_features.rs index e110a04..95400df 100644 --- a/src/cpu_features.rs +++ b/src/cpu_features.rs @@ -80,7 +80,11 @@ pub(crate) fn is_aarch_f16_supported() -> bool { /// on *Apple* platform [libc](https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics) be used /// otherwise consider it is always available #[allow(clippy::too_long_first_doc_paragraph)] -#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "half"))] +#[cfg(all( + target_arch = "aarch64", + target_feature = "neon", + feature = "nightly_f16" +))] pub(crate) fn is_aarch_f16c_supported() -> bool { #[cfg(any(target_os = "macos", target_os = "ios"))] { @@ -96,6 +100,7 @@ pub(crate) fn is_aarch_f16c_supported() -> bool { /// /// on *Apple* platform [libc](https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics) be used #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +#[cfg(feature = "rdm")] pub(crate) fn is_aarch_rdm_supported() -> bool { #[cfg(any(target_os = "macos", target_os = "ios"))] { diff --git a/src/dispatch_group_ar30.rs b/src/dispatch_group_ar30.rs index 95de42b..406453a 100644 --- a/src/dispatch_group_ar30.rs +++ b/src/dispatch_group_ar30.rs @@ -27,15 +27,12 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +use crate::convolution::ConvolutionOptions; use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::fixed_point_horizontal_ar30::{ convolve_row_handler_fixed_point_4_ar30, convolve_row_handler_fixed_point_ar30, }; use crate::fixed_point_vertical_ar30::column_handler_fixed_point_ar30; -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon::{ - neon_column_handler_fixed_point_ar30, neon_convolve_horizontal_rgba_rows_4_ar30, -}; use crate::support::PRECISION; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; @@ -43,14 +40,15 @@ use rayon::ThreadPool; #[allow(clippy::type_complexity)] pub(crate) fn convolve_horizontal_dispatch_ar30( - src: &[u32], + src: &[u8], src_stride: usize, filter_weights: FilterWeights, - dst: &mut [u32], + dst: &mut [u8], dst_stride: usize, pool: &Option, + _options: ConvolutionOptions, ) { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + #[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "rdm"))] let is_rdm_available = std::arch::is_aarch64_feature_detected!("rdm"); if let Some(pool) = pool { pool.install(|| { @@ -58,10 +56,13 @@ pub(crate) fn convolve_horizontal_dispatch_ar30) = + let mut _dispatch: fn(&[u8], usize, &mut [u8], usize, &FilterWeights) = convolve_row_handler_fixed_point_4_ar30::; - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - if is_rdm_available { + #[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "rdm"))] + if is_rdm_available + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::neon::neon_convolve_horizontal_rgba_rows_4_ar30; _dispatch = neon_convolve_horizontal_rgba_rows_4_ar30::; } @@ -85,10 +86,13 @@ pub(crate) fn convolve_horizontal_dispatch_ar30) = + let mut _dispatch: fn(&[u8], usize, &mut [u8], usize, &FilterWeights) = convolve_row_handler_fixed_point_4_ar30::; - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - if is_rdm_available { + #[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "rdm"))] + if is_rdm_available + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::neon::neon_convolve_horizontal_rgba_rows_4_ar30; _dispatch = neon_convolve_horizontal_rgba_rows_4_ar30::; } _dispatch(src, src_stride, dst, dst_stride, &approx); @@ -107,14 +111,16 @@ pub(crate) fn convolve_horizontal_dispatch_ar30( - src: &[u32], + src: &[u8], src_stride: usize, filter_weights: FilterWeights, - dst: &mut [u32], + dst: &mut [u8], dst_stride: usize, pool: &Option, + width: usize, + _options: ConvolutionOptions, ) { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + #[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "rdm"))] let is_rdm_available = std::arch::is_aarch64_feature_detected!("rdm"); if let Some(pool) = pool { pool.install(|| { @@ -125,13 +131,18 @@ pub(crate) fn convolve_vertical_dispatch_ar30; - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - if is_rdm_available { + #[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "rdm"))] + if is_rdm_available + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::neon::neon_column_handler_fixed_point_ar30; _dispatch = neon_column_handler_fixed_point_ar30::; } + let row = &mut row[0..4 * width]; + _dispatch(&bounds, src, row, src_stride, weights); }); }); @@ -144,13 +155,18 @@ pub(crate) fn convolve_vertical_dispatch_ar30; - 
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - if is_rdm_available { + #[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "rdm"))] + if is_rdm_available + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::neon::neon_column_handler_fixed_point_ar30; _dispatch = neon_column_handler_fixed_point_ar30::; } + let row = &mut row[0..4 * width]; + _dispatch(&bounds, src, row, src_stride, weights); }); } diff --git a/src/dispatch_group_f16.rs b/src/dispatch_group_f16.rs index ee2ab1d..e20d91a 100644 --- a/src/dispatch_group_f16.rs +++ b/src/dispatch_group_f16.rs @@ -27,25 +27,28 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::filter_weights::{FilterBounds, FilterWeights, WeightsConverter}; use crate::image_store::ImageStoreMut; use crate::ImageStore; -use half::f16; +use core::f16; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; #[allow(clippy::type_complexity)] -pub(crate) fn convolve_vertical_dispatch_f16( +pub(crate) fn convolve_vertical_dispatch_f16( image_store: &ImageStore, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, - dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]), + dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[V]), + weights_converter: impl WeightsConverter, ) { let src_stride = image_store.stride(); let dst_stride = destination.stride(); + let c_weights = weights_converter.prepare_weights(&filter_weights).weights; + let dst_width = destination.width; if let Some(pool) = pool { @@ -58,7 +61,7 @@ pub(crate) fn convolve_vertical_dispatch_f16( .for_each(|(y, row)| { let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; - let weights = &filter_weights.weights[filter_offset..]; + let weights = &c_weights[filter_offset..]; let source_buffer = image_store.buffer.as_ref(); dispatcher( dst_width, @@ -79,7 +82,7 @@ pub(crate) fn convolve_vertical_dispatch_f16( .for_each(|(y, row)| { let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; - let weights = &filter_weights.weights[filter_offset..]; + let weights = &c_weights[filter_offset..]; let source_buffer = image_store.buffer.as_ref(); dispatcher( dst_width, @@ -94,21 +97,24 @@ pub(crate) fn convolve_vertical_dispatch_f16( } #[allow(clippy::type_complexity)] -pub(crate) fn convolve_horizontal_dispatch_f16( +pub(crate) fn convolve_horizontal_dispatch_f16( image_store: &ImageStore, filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, dispatcher_4_rows: Option< - fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), + fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), >, - dispatcher_row: fn(usize, usize, &FilterWeights, &[f16], &mut [f16]), + dispatcher_row: fn(usize, usize, &FilterWeights, &[f16], &mut [f16]), + weights_converter: impl WeightsConverter, ) { let src_stride = image_store.stride(); let dst_stride = destination.stride(); let dst_width = destination.width; let src_width = image_store.width; + let c_weights = weights_converter.prepare_weights(&filter_weights); + if let Some(pool) = pool { pool.install(|| { let mut processed_4 = false; @@ -126,13 +132,7 @@ pub(crate) fn convolve_horizontal_dispatch_f16( ) .for_each(|(src, dst)| { dispatcher( - dst_width, - src_width, - 
&filter_weights, - src, - src_stride, - dst, - dst_stride, + dst_width, src_width, &c_weights, src, src_stride, dst, dst_stride, ); }); processed_4 = true; @@ -161,7 +161,7 @@ pub(crate) fn convolve_horizontal_dispatch_f16( .par_chunks_exact(src_stride) .zip(left_dst_rows.par_chunks_exact_mut(dst_stride)) .for_each(|(src, dst)| { - dispatcher_row(dst_width, src_width, &filter_weights, src, dst); + dispatcher_row(dst_width, src_width, &c_weights, src, dst); }); }); } else { @@ -179,13 +179,7 @@ pub(crate) fn convolve_horizontal_dispatch_f16( ) { dispatcher( - dst_width, - src_width, - &filter_weights, - src, - src_stride, - dst, - dst_stride, + dst_width, src_width, &c_weights, src, src_stride, dst, dst_stride, ); } processed_4 = true; @@ -213,7 +207,7 @@ pub(crate) fn convolve_horizontal_dispatch_f16( .chunks_exact(src_stride) .zip(left_dst_rows.chunks_exact_mut(dst_stride)) { - dispatcher_row(dst_width, src_width, &filter_weights, src, dst); + dispatcher_row(dst_width, src_width, &c_weights, src, dst); } } } diff --git a/src/dispatch_group_u16.rs b/src/dispatch_group_u16.rs index 4eeb1b4..fb2ada7 100644 --- a/src/dispatch_group_u16.rs +++ b/src/dispatch_group_u16.rs @@ -27,6 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +use crate::convolution::ConvolutionOptions; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] use crate::cpu_features::is_aarch_f16_supported; use crate::filter_weights::{ @@ -163,6 +164,7 @@ pub(crate) fn convolve_vertical_dispatch_u16( filter_weights: FilterWeights, destination: &mut ImageStoreMut<'_, u16, COMPONENTS>, pool: &Option, + _options: ConvolutionOptions, ) { let src_stride = image_store.stride(); let dst_stride = destination.stride(); @@ -195,8 +197,10 @@ pub(crate) fn convolve_vertical_dispatch_u16( } else { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - if is_aarch_f16_supported() { - use crate::filter_weights::WeightFloat16Converter; + if is_aarch_f16_supported() + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::filter_weights::WeightFloat16ConverterCast; execute_low_precision_row( true, image_store, @@ -207,7 +211,7 @@ pub(crate) fn convolve_vertical_dispatch_u16( dst_width, destination_image, HighBitDepthFloat16LowerHandler::default(), - WeightFloat16Converter::default(), + WeightFloat16ConverterCast::default(), ); } else { execute_low_precision_row( @@ -262,8 +266,10 @@ pub(crate) fn convolve_vertical_dispatch_u16( } else { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - if is_aarch_f16_supported() { - use crate::filter_weights::WeightFloat16Converter; + if is_aarch_f16_supported() + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::filter_weights::WeightFloat16ConverterCast; execute_low_precision_row( false, image_store, @@ -274,7 +280,7 @@ pub(crate) fn convolve_vertical_dispatch_u16( dst_width, destination.buffer.borrow_mut(), HighBitDepthFloat16LowerHandler::default(), - WeightFloat16Converter::default(), + WeightFloat16ConverterCast::default(), ); } else { execute_low_precision_row( diff --git a/src/f16.rs b/src/f16.rs index 00ba6e1..e43bb1e 100644 --- a/src/f16.rs +++ b/src/f16.rs @@ -34,11 +34,11 @@ use crate::avx2::{ convolve_horizontal_rgba_avx_row_one_f16, convolve_horizontal_rgba_avx_rows_4_f16, convolve_vertical_avx_row_f16, }; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, 
VerticalConvolutionPass}; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::cpu_features::{is_aarch_f16_supported, is_aarch_f16c_supported}; use crate::dispatch_group_f16::{convolve_horizontal_dispatch_f16, convolve_vertical_dispatch_f16}; -use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::filter_weights::{FilterBounds, FilterWeights, PasshroughWeightsConverter}; use crate::floating_point_horizontal::{ convolve_row_handler_floating_point, convolve_row_handler_floating_point_4, }; @@ -63,7 +63,7 @@ use crate::sse::{ convolve_vertical_sse_row_f16, }; use crate::ImageStore; -use half::f16; +use core::{f16, f32}; use rayon::ThreadPool; fn convolve_horizontal_rgba_4_row_f16( @@ -75,14 +75,19 @@ fn convolve_horizontal_rgba_4_row_f16( dst: &mut [f16], dst_stride: usize, ) { - convolve_row_handler_floating_point_4::( - src, + let transient_src = src.iter().map(|&x| x as f32).collect::>(); + let mut transient_dst = vec![0f32; dst.len()]; + convolve_row_handler_floating_point_4::( + &transient_src, src_stride, - dst, + &mut transient_dst, dst_stride, filter_weights, 8, - ) + ); + for (dst, src) in dst.iter_mut().zip(transient_dst.iter()) { + *dst = *src as f16; + } } fn convolve_horizontal_rgb_native_row_f16( @@ -92,7 +97,17 @@ fn convolve_horizontal_rgb_native_row_f16( src: &[f16], dst: &mut [f16], ) { - convolve_row_handler_floating_point::(src, dst, filter_weights, 8) + let transient_src = src.iter().map(|&x| x as f32).collect::>(); + let mut transient_dst = vec![0f32; dst.len()]; + convolve_row_handler_floating_point::( + &transient_src, + &mut transient_dst, + filter_weights, + 8, + ); + for (dst, src) in dst.iter_mut().zip(transient_dst.iter()) { + *dst = *src as f16; + } } impl HorizontalConvolutionPass for ImageStore<'_, f16, 4> { @@ -101,6 +116,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 4> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher_4_rows: Option< @@ -114,9 +130,31 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 4> { if is_aarch_f16c_supported() { _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4_f16); _dispatcher_row = convolve_horizontal_rgba_neon_row_one_f16; - if is_aarch_f16_supported() { - _dispatcher_4_rows = Some(xconvolve_horizontal_rgba_neon_rows_4_f16); - _dispatcher_row = xconvolve_horizontal_rgba_neon_row_one_f16; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferSpeed => { + if is_aarch_f16_supported() { + _dispatcher_4_rows = Some(xconvolve_horizontal_rgba_neon_rows_4_f16); + _dispatcher_row = xconvolve_horizontal_rgba_neon_row_one_f16; + } + } + crate::WorkloadStrategy::PreferQuality => { + if std::arch::is_aarch64_feature_detected!("fhm") { + use crate::filter_weights::WeightFloat16Converter; + use crate::neon::{ + convolve_horizontal_rgba_neon_row_one_f16_fhm, + convolve_horizontal_rgba_neon_rows_4_f16_fhm, + }; + return convolve_horizontal_dispatch_f16( + self, + filter_weights, + destination, + pool, + Some(convolve_horizontal_rgba_neon_rows_4_f16_fhm), + convolve_horizontal_rgba_neon_row_one_f16_fhm, + WeightFloat16Converter::default(), + ); + } + } } } } @@ -154,6 +192,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 4> { pool, _dispatcher_4_rows, _dispatcher_row, + PasshroughWeightsConverter::default(), ); } } @@ -166,7 +205,19 @@ fn convolve_vertical_rgb_native_row_f16( src_stride: usize, weight: &[f32], ) { - 
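// Editor's illustrative sketch, not part of the patch: the fallback used by the scalar
// f16 row handlers above. Half-precision samples are widened into transient f32 buffers,
// the shared f32 convolution runs on those, and the results are narrowed back to f16
// only at the end. This needs a nightly toolchain with the `f16` primitive (the crate
// gates it behind `nightly_f16`); `convolve_f32_row` below is a hypothetical stand-in
// for the crate's floating-point row handler, not its real signature.
#![feature(f16)]

fn convolve_f32_row(src: &[f32], dst: &mut [f32], weights: &[f32]) {
    // Stand-in: a plain sliding dot product per output sample.
    for (i, d) in dst.iter_mut().enumerate() {
        *d = weights
            .iter()
            .enumerate()
            .map(|(j, w)| src.get(i + j).copied().unwrap_or(0.0) * w)
            .sum();
    }
}

fn convolve_f16_row(src: &[f16], dst: &mut [f16], weights: &[f32]) {
    // Widen once into transient f32 buffers.
    let transient_src: Vec<f32> = src.iter().map(|&x| x as f32).collect();
    let mut transient_dst = vec![0f32; dst.len()];
    convolve_f32_row(&transient_src, &mut transient_dst, weights);
    // Narrow back to half precision.
    for (d, s) in dst.iter_mut().zip(transient_dst.iter()) {
        *d = *s as f16;
    }
}

fn main() {
    let src: Vec<f16> = [1.0f32, 0.5, 0.25, 0.0].iter().map(|&x| x as f16).collect();
    let mut dst = vec![0.0f32 as f16; 4];
    convolve_f16_row(&src, &mut dst, &[0.5, 0.5]);
}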
column_handler_floating_point::(bounds, src, dst, src_stride, weight, 8); + let transient_src = src.iter().map(|&x| x as f32).collect::>(); + let mut transient_dst = vec![0f32; dst.len()]; + column_handler_floating_point::( + bounds, + &transient_src, + &mut transient_dst, + src_stride, + weight, + 8, + ); + for (dst, src) in dst.iter_mut().zip(transient_dst.iter()) { + *dst = *src as f16; + } } impl VerticalConvolutionPass for ImageStore<'_, f16, 4> { @@ -175,6 +226,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 4> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = @@ -183,8 +235,26 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 4> { { if is_aarch_f16c_supported() { _dispatcher = convolve_vertical_rgb_neon_row_f16; - if is_aarch_f16_supported() { - _dispatcher = xconvolve_vertical_rgb_neon_row_f16; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::filter_weights::WeightFloat16Converter; + use crate::neon::convolve_vertical_rgb_neon_row_f16_fhm; + if std::arch::is_aarch64_feature_detected!("fhm") { + return convolve_vertical_dispatch_f16( + self, + filter_weights, + destination, + pool, + convolve_vertical_rgb_neon_row_f16_fhm, + WeightFloat16Converter {}, + ); + } + } + crate::WorkloadStrategy::PreferSpeed => { + if is_aarch_f16_supported() { + _dispatcher = xconvolve_vertical_rgb_neon_row_f16; + } + } } } } @@ -210,7 +280,14 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 4> { } } } - convolve_vertical_dispatch_f16(self, filter_weights, destination, pool, _dispatcher); + convolve_vertical_dispatch_f16( + self, + filter_weights, + destination, + pool, + _dispatcher, + PasshroughWeightsConverter {}, + ); } } @@ -220,6 +297,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 3> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher_4_rows: Option< @@ -233,9 +311,33 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 3> { if is_aarch_f16c_supported() { _dispatcher_4_rows = Some(convolve_horizontal_rgb_neon_rows_4_f16); _dispatcher_row = convolve_horizontal_rgb_neon_row_one_f16; - if is_aarch_f16_supported() { - _dispatcher_4_rows = Some(xconvolve_horizontal_rgb_neon_rows_4_f16); - _dispatcher_row = xconvolve_horizontal_rgb_neon_row_one_f16; + } + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + if std::arch::is_aarch64_feature_detected!("fhm") { + use crate::filter_weights::WeightFloat16Converter; + use crate::neon::{ + convolve_horizontal_rgb_neon_row_one_f16_fhm, + convolve_horizontal_rgb_neon_rows_4_f16_fhm, + }; + return convolve_horizontal_dispatch_f16( + self, + filter_weights, + destination, + pool, + Some(convolve_horizontal_rgb_neon_rows_4_f16_fhm), + convolve_horizontal_rgb_neon_row_one_f16_fhm, + WeightFloat16Converter::default(), + ); + } + } + crate::WorkloadStrategy::PreferSpeed => { + if is_aarch_f16_supported() + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + _dispatcher_4_rows = Some(xconvolve_horizontal_rgb_neon_rows_4_f16); + _dispatcher_row = xconvolve_horizontal_rgb_neon_row_one_f16; + } } } } @@ -264,6 +366,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 3> { pool, _dispatcher_4_rows, _dispatcher_row, + 
PasshroughWeightsConverter::default(), ); } } @@ -274,6 +377,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 3> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = @@ -282,8 +386,26 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 3> { { if is_aarch_f16c_supported() { _dispatcher = convolve_vertical_rgb_neon_row_f16; - if is_aarch_f16_supported() { - _dispatcher = xconvolve_vertical_rgb_neon_row_f16; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::filter_weights::WeightFloat16Converter; + use crate::neon::convolve_vertical_rgb_neon_row_f16_fhm; + if std::arch::is_aarch64_feature_detected!("fhm") { + return convolve_vertical_dispatch_f16( + self, + filter_weights, + destination, + pool, + convolve_vertical_rgb_neon_row_f16_fhm, + WeightFloat16Converter {}, + ); + } + } + crate::WorkloadStrategy::PreferSpeed => { + if is_aarch_f16_supported() { + _dispatcher = xconvolve_vertical_rgb_neon_row_f16; + } + } } } } @@ -309,7 +431,14 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 3> { } } } - convolve_vertical_dispatch_f16(self, filter_weights, destination, pool, _dispatcher); + convolve_vertical_dispatch_f16( + self, + filter_weights, + destination, + pool, + _dispatcher, + PasshroughWeightsConverter::default(), + ); } } @@ -319,6 +448,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 1> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let _dispatcher_4_rows: Option< @@ -333,6 +463,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 1> { pool, _dispatcher_4_rows, _dispatcher_row, + PasshroughWeightsConverter::default(), ); } } @@ -343,6 +474,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 1> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = @@ -351,8 +483,26 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 1> { { if is_aarch_f16c_supported() { _dispatcher = convolve_vertical_rgb_neon_row_f16; - if is_aarch_f16_supported() { - _dispatcher = xconvolve_vertical_rgb_neon_row_f16; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::filter_weights::WeightFloat16Converter; + use crate::neon::convolve_vertical_rgb_neon_row_f16_fhm; + if std::arch::is_aarch64_feature_detected!("fhm") { + return convolve_vertical_dispatch_f16( + self, + filter_weights, + destination, + pool, + convolve_vertical_rgb_neon_row_f16_fhm, + WeightFloat16Converter {}, + ); + } + } + crate::WorkloadStrategy::PreferSpeed => { + if is_aarch_f16_supported() { + _dispatcher = xconvolve_vertical_rgb_neon_row_f16; + } + } } } } @@ -377,7 +527,14 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 1> { } } } - convolve_vertical_dispatch_f16(self, filter_weights, destination, pool, _dispatcher); + convolve_vertical_dispatch_f16( + self, + filter_weights, + destination, + pool, + _dispatcher, + PasshroughWeightsConverter::default(), + ); } } @@ -387,6 +544,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 2> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { 
#[allow(clippy::type_complexity)] let _dispatcher_4_rows: Option< @@ -401,6 +559,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f16, 2> { pool, _dispatcher_4_rows, _dispatcher_row, + PasshroughWeightsConverter::default(), ); } } @@ -411,6 +570,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 2> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = @@ -419,8 +579,26 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 2> { { if is_aarch_f16c_supported() { _dispatcher = convolve_vertical_rgb_neon_row_f16; - if is_aarch_f16_supported() { - _dispatcher = xconvolve_vertical_rgb_neon_row_f16; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::filter_weights::WeightFloat16Converter; + use crate::neon::convolve_vertical_rgb_neon_row_f16_fhm; + if std::arch::is_aarch64_feature_detected!("fhm") { + return convolve_vertical_dispatch_f16( + self, + filter_weights, + destination, + pool, + convolve_vertical_rgb_neon_row_f16_fhm, + WeightFloat16Converter {}, + ); + } + } + crate::WorkloadStrategy::PreferSpeed => { + if is_aarch_f16_supported() { + _dispatcher = xconvolve_vertical_rgb_neon_row_f16; + } + } } } } @@ -445,6 +623,13 @@ impl VerticalConvolutionPass for ImageStore<'_, f16, 2> { } } } - convolve_vertical_dispatch_f16(self, filter_weights, destination, pool, _dispatcher); + convolve_vertical_dispatch_f16( + self, + filter_weights, + destination, + pool, + _dispatcher, + PasshroughWeightsConverter::default(), + ); } } diff --git a/src/filter_weights.rs b/src/filter_weights.rs index a665d96..be308e8 100644 --- a/src/filter_weights.rs +++ b/src/filter_weights.rs @@ -149,12 +149,23 @@ where } } +#[derive(Default)] +#[cfg(feature = "nightly_f16")] +pub(crate) struct PasshroughWeightsConverter {} + +#[cfg(feature = "nightly_f16")] +impl WeightsConverter for PasshroughWeightsConverter { + fn prepare_weights(&self, weights: &FilterWeights) -> FilterWeights { + weights.clone() + } +} + #[derive(Default)] #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -pub(crate) struct WeightFloat16Converter {} +pub(crate) struct WeightFloat16ConverterCast {} #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -impl WeightsConverter for WeightFloat16Converter { +impl WeightsConverter for WeightFloat16ConverterCast { fn prepare_weights(&self, weights: &FilterWeights) -> FilterWeights { use crate::neon::convert_weights_to_f16; let converted_weights = convert_weights_to_f16(&weights.weights); @@ -171,3 +182,32 @@ impl WeightsConverter for WeightFloat16Converter { ) } } + +#[derive(Default)] +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +#[cfg(feature = "nightly_f16")] +pub(crate) struct WeightFloat16Converter {} + +#[cfg(feature = "nightly_f16")] +#[allow(unused)] +use core::f16; + +#[cfg(feature = "nightly_f16")] +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +impl WeightsConverter for WeightFloat16Converter { + fn prepare_weights(&self, weights: &FilterWeights) -> FilterWeights { + use crate::neon::convert_weights_to_f16_fhm; + let converted_weights = convert_weights_to_f16_fhm(&weights.weights); + + let new_bounds = weights.bounds.to_vec(); + + FilterWeights::new( + converted_weights, + weights.kernel_size, + weights.kernel_size, + weights.distinct_elements, + weights.coeffs_size, + new_bounds, + ) + } +} diff --git 
a/src/fixed_point_horizontal_ar30.rs b/src/fixed_point_horizontal_ar30.rs index b46a7cb..8b4b6be 100644 --- a/src/fixed_point_horizontal_ar30.rs +++ b/src/fixed_point_horizontal_ar30.rs @@ -37,31 +37,36 @@ pub(crate) fn convolve_row_handler_fixed_point_ar30< const AR30_TYPE: usize, const AR30_ORDER: usize, >( - src: &[u32], - dst: &mut [u32], + src: &[u8], + dst: &mut [u8], filter_weights: &FilterWeights, ) { - for ((chunk, &bounds), weights) in dst.iter_mut().zip(filter_weights.bounds.iter()).zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) { + for ((chunk, &bounds), weights) in dst + .chunks_exact_mut(4) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { let mut sums = ColorGroup::<4, i32>::dup(ROUNDING_CONST.as_()); let start_x = bounds.start; let bounds_size = bounds.size; - let px = start_x; + const CN: usize = 4; + let px = start_x * CN; if bounds_size == 2 { - let src_ptr0 = &src[px..(px + 2)]; + let src_ptr0 = &src[px..(px + 2 * CN)]; let sliced_weights = &weights[0..2]; let weight0 = sliced_weights[0] as i32; let weight1 = sliced_weights[1] as i32; sums += load_ar30!(src_ptr0, AR30_TYPE, AR30_ORDER) * weight0 + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1; } else if bounds_size == 3 { - let src_ptr0 = &src[px..(px + 3)]; + let src_ptr0 = &src[px..(px + 3 * CN)]; let sliced_weights = &weights[0..3]; let weight0 = sliced_weights[0] as i32; let weight1 = sliced_weights[1] as i32; @@ -70,7 +75,7 @@ pub(crate) fn convolve_row_handler_fixed_point_ar30< + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1 + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 2) * weight2; } else if bounds_size == 4 { - let src_ptr0 = &src[px..(px + 4)]; + let src_ptr0 = &src[px..(px + 4 * CN)]; let sliced_weights = &weights[0..4]; let weight0 = sliced_weights[0] as i32; let weight1 = sliced_weights[1] as i32; @@ -81,7 +86,7 @@ pub(crate) fn convolve_row_handler_fixed_point_ar30< + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 2) * weight2 + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 3) * weight3; } else if bounds_size == 6 { - let src_ptr0 = &src[px..(px + 6)]; + let src_ptr0 = &src[px..(px + 6 * CN)]; let sliced_weights = &weights[0..6]; let weight0 = sliced_weights[0] as i32; @@ -98,7 +103,11 @@ pub(crate) fn convolve_row_handler_fixed_point_ar30< + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 5) * weight5; } else { let src_ptr0 = &src[px..(px + bounds_size)]; - for (&k_weight, src) in weights.iter().zip(src_ptr0.iter()).take(bounds.size) { + for (&k_weight, src) in weights + .iter() + .zip(src_ptr0.chunks_exact(4)) + .take(bounds.size) + { let weight: i32 = k_weight as i32; let new_px = load_ar30_p!(src, AR30_TYPE, AR30_ORDER); sums += new_px * weight; @@ -106,7 +115,11 @@ pub(crate) fn convolve_row_handler_fixed_point_ar30< } let narrowed = sums.saturate_ar30(); - *chunk = narrowed.to_ar30::(); + let bytes0 = narrowed.to_ar30::().to_ne_bytes(); + chunk[0] = bytes0[0]; + chunk[1] = bytes0[1]; + chunk[2] = bytes0[2]; + chunk[3] = bytes0[3]; } } @@ -115,9 +128,9 @@ pub(crate) fn convolve_row_handler_fixed_point_4_ar30< const AR30_TYPE: usize, const AR30_ORDER: usize, >( - src: &[u32], + src: &[u8], src_stride: usize, - dst: &mut [u32], + dst: &mut [u8], dst_stride: usize, filter_weights: &FilterWeights, ) { @@ -125,10 +138,12 @@ pub(crate) fn convolve_row_handler_fixed_point_4_ar30< let (row1_ref, rest) = 
rest.split_at_mut(dst_stride); let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - let iter_row0 = row0_ref.iter_mut(); - let iter_row1 = row1_ref.iter_mut(); - let iter_row2 = row2_ref.iter_mut(); - let iter_row3 = row3_ref.iter_mut(); + const CN: usize = 4; + + let iter_row0 = row0_ref.chunks_exact_mut(CN); + let iter_row1 = row1_ref.chunks_exact_mut(CN); + let iter_row2 = row2_ref.chunks_exact_mut(CN); + let iter_row3 = row3_ref.chunks_exact_mut(CN); for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 .zip(iter_row1) @@ -148,14 +163,14 @@ pub(crate) fn convolve_row_handler_fixed_point_4_ar30< let start_x = bounds.start; - let px = start_x; + let px = start_x * CN; let bounds_size = bounds.size; if bounds_size == 2 { - let src_ptr0 = &src[px..(px + 2)]; - let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 2)]; - let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 2)]; - let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 2)]; + let src_ptr0 = &src[px..(px + 2 * CN)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 2 * 4)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 2 * 4)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 2 * 4)]; let sliced_weights = &weights[0..2]; let weight0 = sliced_weights[0] as i32; @@ -169,10 +184,10 @@ pub(crate) fn convolve_row_handler_fixed_point_4_ar30< sums3 += load_ar30!(src_ptr3, AR30_TYPE, AR30_ORDER) * weight0 + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 1) * weight1; } else if bounds_size == 3 { - let src_ptr0 = &src[px..(px + 3)]; - let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 3)]; - let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 3)]; - let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 3)]; + let src_ptr0 = &src[px..(px + 3 * CN)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 3 * 4)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 3 * 4)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 3 * 4)]; let sliced_weights = &weights[0..3]; let weight0 = sliced_weights[0] as i32; @@ -191,10 +206,10 @@ pub(crate) fn convolve_row_handler_fixed_point_4_ar30< + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 1) * weight1 + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 2) * weight2; } else if bounds_size == 4 { - let src_ptr0 = &src[px..(px + 4)]; - let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 4)]; - let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 4)]; - let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 4)]; + let src_ptr0 = &src[px..(px + 4 * CN)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 4 * 4)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 4 * 4)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 4 * 4)]; let sliced_weights = &weights[0..4]; let weight0 = sliced_weights[0] as i32; @@ -218,10 +233,10 @@ pub(crate) fn convolve_row_handler_fixed_point_4_ar30< + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 2) * weight2 + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 3) * weight3; } else if bounds_size == 6 { - let src_ptr0 = &src[px..(px + 6)]; - let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 6)]; - let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 6)]; - let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 6)]; + let src_ptr0 = &src[px..(px + 6 * 
CN)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 6 * 4)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 6 * 4)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 6 * 4)]; let sliced_weights = &weights[0..6]; let weight0 = sliced_weights[0] as i32; @@ -255,17 +270,17 @@ pub(crate) fn convolve_row_handler_fixed_point_4_ar30< + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 4) * weight4 + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 5) * weight5; } else { - let src_ptr0 = &src[px..(px + bounds_size)]; - let src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds_size)]; - let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + bounds_size)]; - let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + bounds_size)]; + let src_ptr0 = &src[px..(px + bounds_size * CN)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds_size * CN)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + bounds_size * CN)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + bounds_size * CN)]; for ((((&k_weight, src0), src1), src2), src3) in weights .iter() - .zip(src_ptr0.iter()) - .zip(src_ptr1.iter()) - .zip(src_ptr2.iter()) - .zip(src_ptr3.iter()) + .zip(src_ptr0.chunks_exact(4)) + .zip(src_ptr1.chunks_exact(4)) + .zip(src_ptr2.chunks_exact(4)) + .zip(src_ptr3.chunks_exact(4)) .take(bounds.size) { let weight: i32 = k_weight as i32; @@ -287,9 +302,28 @@ pub(crate) fn convolve_row_handler_fixed_point_4_ar30< let narrowed2 = sums2.saturate_ar30(); let narrowed3 = sums3.saturate_ar30(); - *chunk0 = narrowed0.to_ar30::(); - *chunk1 = narrowed1.to_ar30::(); - *chunk2 = narrowed2.to_ar30::(); - *chunk3 = narrowed3.to_ar30::(); + let bytes0 = narrowed0.to_ar30::().to_ne_bytes(); + chunk0[0] = bytes0[0]; + chunk0[1] = bytes0[1]; + chunk0[2] = bytes0[2]; + chunk0[3] = bytes0[3]; + + let bytes1 = narrowed1.to_ar30::().to_ne_bytes(); + chunk1[0] = bytes1[0]; + chunk1[1] = bytes1[1]; + chunk1[2] = bytes1[2]; + chunk1[3] = bytes1[3]; + + let bytes2 = narrowed2.to_ar30::().to_ne_bytes(); + chunk2[0] = bytes2[0]; + chunk2[1] = bytes2[1]; + chunk2[2] = bytes2[2]; + chunk2[3] = bytes2[3]; + + let bytes3 = narrowed3.to_ar30::().to_ne_bytes(); + chunk3[0] = bytes3[0]; + chunk3[1] = bytes3[1]; + chunk3[2] = bytes3[2]; + chunk3[3] = bytes3[3]; } } diff --git a/src/fixed_point_vertical_ar30.rs b/src/fixed_point_vertical_ar30.rs index 86a84f7..26aa32c 100644 --- a/src/fixed_point_vertical_ar30.rs +++ b/src/fixed_point_vertical_ar30.rs @@ -39,9 +39,9 @@ pub(crate) fn convolve_column_handler_fip_db_ar30< const AR30_ORDER: usize, const BUFFER_SIZE: usize, >( - src: &[u32], + src: &[u8], src_stride: usize, - dst: &mut [u32], + dst: &mut [u8], filter: &[i16], bounds: &FilterBounds, x: usize, @@ -56,10 +56,10 @@ pub(crate) fn convolve_column_handler_fip_db_ar30< let py = bounds.start; let weight = filter[0] as i32; - let offset = src_stride * py + v_start_px; - let src_ptr = &src[offset..(offset + BUFFER_SIZE)]; + let offset = src_stride * py + v_start_px * 4; + let src_ptr = &src[offset..(offset + BUFFER_SIZE * 4)]; - for (dst, src) in direct_store.iter_mut().zip(src_ptr) { + for (dst, src) in direct_store.iter_mut().zip(src_ptr.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } @@ -67,18 +67,24 @@ pub(crate) fn convolve_column_handler_fip_db_ar30< // Adding 1 is necessary because skip do not incrementing value on values that skipped let py = bounds.start + j + 1; 
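// Editor's illustrative sketch, not part of the patch: why the AR30 paths above now take
// `&[u8]` and multiply pixel indices by CN = 4. Each 2-10-10-10 pixel occupies four bytes;
// a handler reconstructs the packed u32 with `from_ne_bytes`, extracts the 10-bit channels
// and the 2-bit alpha, and writes results back with `to_ne_bytes`. The shift positions and
// channel naming follow the Ar30 branch of the NEON unpack code in this patch and are not
// re-verified here.
fn unpack_ar30(px: &[u8]) -> (u32, u32, u32, u32) {
    let v = u32::from_ne_bytes([px[0], px[1], px[2], px[3]]);
    let r = v & 0x3ff;
    let g = (v >> 10) & 0x3ff;
    let b = (v >> 20) & 0x3ff;
    let a = v >> 30; // 2-bit alpha
    (r, g, b, a)
}

fn pack_ar30(r: u32, g: u32, b: u32, a: u32) -> [u8; 4] {
    let v = (a << 30) | ((b & 0x3ff) << 20) | ((g & 0x3ff) << 10) | (r & 0x3ff);
    v.to_ne_bytes()
}

fn main() {
    let px = pack_ar30(1023, 512, 0, 3);
    // Writing into a byte-based row: each output pixel is a 4-byte chunk.
    let mut row = vec![0u8; 4 * 2];
    row[0..4].copy_from_slice(&px);
    assert_eq!(unpack_ar30(&row[0..4]), (1023, 512, 0, 3));
}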
let weight = k_weight as i32; - let offset = src_stride * py + v_start_px; - let src_ptr = &src[offset..(offset + BUFFER_SIZE)]; + let offset = src_stride * py + v_start_px * 4; + let src_ptr = &src[offset..(offset + BUFFER_SIZE * 4)]; - for (dst, src) in direct_store.iter_mut().zip(src_ptr.iter()) { + for (dst, src) in direct_store.iter_mut().zip(src_ptr.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } } - let v_dst = &mut dst[v_start_px..(v_start_px + BUFFER_SIZE)]; - for (dst, src) in v_dst.iter_mut().zip(direct_store) { - let saturated = src.saturate_ar30().to_ar30::(); - *dst = saturated; + let v_dst = &mut dst[v_start_px * 4..(v_start_px * 4 + BUFFER_SIZE * 4)]; + for (dst, src) in v_dst.chunks_exact_mut(4).zip(direct_store) { + let saturated = src + .saturate_ar30() + .to_ar30::() + .to_ne_bytes(); + dst[0] = saturated[0]; + dst[1] = saturated[1]; + dst[2] = saturated[2]; + dst[3] = saturated[3]; } } @@ -91,9 +97,9 @@ fn convolve_column_handler_fixed_point_direct_buffer_double< const AR30_ORDER: usize, const BUFFER_SIZE: usize, >( - src: &[u32], + src: &[u8], src_stride: usize, - dst: &mut [u32], + dst: &mut [u8], filter: &[i16], bounds: &FilterBounds, x: usize, @@ -110,15 +116,15 @@ fn convolve_column_handler_fixed_point_direct_buffer_double< let py = bounds.start; let weight = filter[0] as i32; - let offset = src_stride * py + v_start_px; - let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)]; - let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)]; + let offset = src_stride * py + v_start_px * 4; + let src_ptr0 = &src[offset..(offset + BUFFER_SIZE * 4)]; + let src_ptr1 = &src[(offset + BUFFER_SIZE * 4)..(offset + BUFFER_SIZE * 2 * 4)]; - for (dst, src) in direct_store0.iter_mut().zip(src_ptr0) { + for (dst, src) in direct_store0.iter_mut().zip(src_ptr0.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } - for (dst, src) in direct_store1.iter_mut().zip(src_ptr1) { + for (dst, src) in direct_store1.iter_mut().zip(src_ptr1.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } @@ -126,28 +132,41 @@ fn convolve_column_handler_fixed_point_direct_buffer_double< // Adding 1 is necessary because skip do not incrementing value on values that skipped let py = bounds.start + j + 1; let weight = k_weight as i32; - let offset = src_stride * py + v_start_px; - let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)]; - let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)]; + let offset = src_stride * py + v_start_px * 4; + let src_ptr0 = &src[offset..(offset + BUFFER_SIZE * 4)]; + let src_ptr1 = &src[(offset + BUFFER_SIZE * 4)..(offset + BUFFER_SIZE * 2 * 4)]; - for (dst, src) in direct_store0.iter_mut().zip(src_ptr0.iter()) { + for (dst, src) in direct_store0.iter_mut().zip(src_ptr0.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } - for (dst, src) in direct_store1.iter_mut().zip(src_ptr1.iter()) { + for (dst, src) in direct_store1.iter_mut().zip(src_ptr1.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } } - let v_dst0 = &mut dst[v_start_px..(v_start_px + BUFFER_SIZE)]; - for (dst, src) in v_dst0.iter_mut().zip(direct_store0) { - let saturated = src.saturate_ar30().to_ar30::(); - *dst = saturated; + let v_dst0 = &mut dst[v_start_px * 4..(v_start_px * 4 + BUFFER_SIZE * 4)]; + for (dst, src) in v_dst0.chunks_exact_mut(4).zip(direct_store0) { + let saturated = src + .saturate_ar30() + .to_ar30::() + .to_ne_bytes(); + dst[0] 
= saturated[0]; + dst[1] = saturated[1]; + dst[2] = saturated[2]; + dst[3] = saturated[3]; } - let v_dst1 = &mut dst[(v_start_px + BUFFER_SIZE)..(v_start_px + BUFFER_SIZE * 2)]; - for (dst, src) in v_dst1.iter_mut().zip(direct_store1) { - let saturated = src.saturate_ar30().to_ar30::(); - *dst = saturated; + let v_dst1 = + &mut dst[(v_start_px * 4 + BUFFER_SIZE * 4)..(v_start_px * 4 + BUFFER_SIZE * 2 * 4)]; + for (dst, src) in v_dst1.chunks_exact_mut(4).zip(direct_store1) { + let saturated = src + .saturate_ar30() + .to_ar30::() + .to_ne_bytes(); + dst[0] = saturated[0]; + dst[1] = saturated[1]; + dst[2] = saturated[2]; + dst[3] = saturated[3]; } } @@ -160,9 +179,9 @@ fn convolve_column_handler_fixed_point_direct_buffer_four< const AR30_ORDER: usize, const BUFFER_SIZE: usize, >( - src: &[u32], + src: &[u8], src_stride: usize, - dst: &mut [u32], + dst: &mut [u8], filter: &[i16], bounds: &FilterBounds, x: usize, @@ -179,29 +198,29 @@ fn convolve_column_handler_fixed_point_direct_buffer_four< let mut direct_store3: [ColorGroup<4, i32>; BUFFER_SIZE] = [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE]; - let v_start_px = x; + let v_start_px = x * 4; let py = bounds.start; let weight = filter[0] as i32; let offset = src_stride * py + v_start_px; - let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)]; - let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)]; - let src_ptr2 = &src[(offset + BUFFER_SIZE * 2)..(offset + BUFFER_SIZE * 3)]; - let src_ptr3 = &src[(offset + BUFFER_SIZE * 3)..(offset + BUFFER_SIZE * 4)]; + let src_ptr0 = &src[offset..(offset + BUFFER_SIZE * 4)]; + let src_ptr1 = &src[(offset + BUFFER_SIZE * 4)..(offset + BUFFER_SIZE * 2 * 4)]; + let src_ptr2 = &src[(offset + BUFFER_SIZE * 2 * 4)..(offset + BUFFER_SIZE * 3 * 4)]; + let src_ptr3 = &src[(offset + BUFFER_SIZE * 3 * 4)..(offset + BUFFER_SIZE * 4 * 4)]; - for (dst, src) in direct_store0.iter_mut().zip(src_ptr0) { + for (dst, src) in direct_store0.iter_mut().zip(src_ptr0.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } - for (dst, src) in direct_store1.iter_mut().zip(src_ptr1) { + for (dst, src) in direct_store1.iter_mut().zip(src_ptr1.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } - for (dst, src) in direct_store2.iter_mut().zip(src_ptr2) { + for (dst, src) in direct_store2.iter_mut().zip(src_ptr2.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } - for (dst, src) in direct_store3.iter_mut().zip(src_ptr3) { + for (dst, src) in direct_store3.iter_mut().zip(src_ptr3.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } @@ -210,60 +229,84 @@ fn convolve_column_handler_fixed_point_direct_buffer_four< let py = bounds.start + j + 1; let weight = k_weight as i32; let offset = src_stride * py + v_start_px; - let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)]; - let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)]; - let src_ptr2 = &src[(offset + BUFFER_SIZE * 2)..(offset + BUFFER_SIZE * 3)]; - let src_ptr3 = &src[(offset + BUFFER_SIZE * 3)..(offset + BUFFER_SIZE * 4)]; + let src_ptr0 = &src[offset..(offset + BUFFER_SIZE * 4)]; + let src_ptr1 = &src[(offset + BUFFER_SIZE * 4)..(offset + BUFFER_SIZE * 2 * 4)]; + let src_ptr2 = &src[(offset + BUFFER_SIZE * 2 * 4)..(offset + BUFFER_SIZE * 3 * 4)]; + let src_ptr3 = &src[(offset + BUFFER_SIZE * 3 * 4)..(offset + BUFFER_SIZE * 4 * 4)]; - for (dst, src) in direct_store0.iter_mut().zip(src_ptr0.iter()) { + for (dst, src) in 
direct_store0.iter_mut().zip(src_ptr0.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } - for (dst, src) in direct_store1.iter_mut().zip(src_ptr1.iter()) { + for (dst, src) in direct_store1.iter_mut().zip(src_ptr1.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } - for (dst, src) in direct_store2.iter_mut().zip(src_ptr2.iter()) { + for (dst, src) in direct_store2.iter_mut().zip(src_ptr2.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } - for (dst, src) in direct_store3.iter_mut().zip(src_ptr3.iter()) { + for (dst, src) in direct_store3.iter_mut().zip(src_ptr3.chunks_exact(4)) { *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight; } } - let v_dst0 = &mut dst[v_start_px..(v_start_px + BUFFER_SIZE)]; - for (dst, src) in v_dst0.iter_mut().zip(direct_store0) { - let saturated = src.saturate_ar30().to_ar30::(); - *dst = saturated; + let v_dst0 = &mut dst[v_start_px..(v_start_px + BUFFER_SIZE * 4)]; + for (dst, src) in v_dst0.chunks_exact_mut(4).zip(direct_store0) { + let saturated = src + .saturate_ar30() + .to_ar30::() + .to_ne_bytes(); + dst[0] = saturated[0]; + dst[1] = saturated[1]; + dst[2] = saturated[2]; + dst[3] = saturated[3]; } - let v_dst1 = &mut dst[(v_start_px + BUFFER_SIZE)..(v_start_px + BUFFER_SIZE * 2)]; - for (dst, src) in v_dst1.iter_mut().zip(direct_store1) { - let saturated = src.saturate_ar30().to_ar30::(); - *dst = saturated; + let v_dst1 = &mut dst[(v_start_px + BUFFER_SIZE * 4)..(v_start_px + BUFFER_SIZE * 2 * 4)]; + for (dst, src) in v_dst1.chunks_exact_mut(4).zip(direct_store1) { + let saturated = src + .saturate_ar30() + .to_ar30::() + .to_ne_bytes(); + dst[0] = saturated[0]; + dst[1] = saturated[1]; + dst[2] = saturated[2]; + dst[3] = saturated[3]; } - let v_dst2 = &mut dst[(v_start_px + BUFFER_SIZE * 2)..(v_start_px + BUFFER_SIZE * 3)]; - for (dst, src) in v_dst2.iter_mut().zip(direct_store2) { - let saturated = src.saturate_ar30().to_ar30::(); - *dst = saturated; + let v_dst2 = &mut dst[(v_start_px + BUFFER_SIZE * 2 * 4)..(v_start_px + BUFFER_SIZE * 3 * 4)]; + for (dst, src) in v_dst2.chunks_exact_mut(4).zip(direct_store2) { + let saturated = src + .saturate_ar30() + .to_ar30::() + .to_ne_bytes(); + dst[0] = saturated[0]; + dst[1] = saturated[1]; + dst[2] = saturated[2]; + dst[3] = saturated[3]; } - let v_dst3 = &mut dst[(v_start_px + BUFFER_SIZE * 3)..(v_start_px + BUFFER_SIZE * 4)]; - for (dst, src) in v_dst3.iter_mut().zip(direct_store3) { - let saturated = src.saturate_ar30().to_ar30::(); - *dst = saturated; + let v_dst3 = &mut dst[(v_start_px + BUFFER_SIZE * 3 * 4)..(v_start_px + BUFFER_SIZE * 4 * 4)]; + for (dst, src) in v_dst3.chunks_exact_mut(4).zip(direct_store3) { + let saturated = src + .saturate_ar30() + .to_ar30::() + .to_ne_bytes(); + dst[0] = saturated[0]; + dst[1] = saturated[1]; + dst[2] = saturated[2]; + dst[3] = saturated[3]; } } pub(crate) fn column_handler_fixed_point_ar30( bounds: &FilterBounds, - src: &[u32], - dst: &mut [u32], + src: &[u8], + dst: &mut [u8], src_stride: usize, weight: &[i16], ) { let mut cx = 0usize; - let total_width = dst.len(); + let total_width = dst.len() / 4; while cx + 64 < total_width { convolve_column_handler_fixed_point_direct_buffer_four::( diff --git a/src/image_store.rs b/src/image_store.rs index 17fff1a..4f9c2b5 100644 --- a/src/image_store.rs +++ b/src/image_store.rs @@ -27,14 +27,15 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ use crate::alpha_check::has_non_constant_cap_alpha_rgba_f32; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] use crate::alpha_handle_f16::{premultiply_alpha_rgba_f16, unpremultiply_alpha_rgba_f16}; use crate::alpha_handle_f32::{premultiply_alpha_rgba_f32, unpremultiply_alpha_rgba_f32}; use crate::alpha_handle_u16::{premultiply_alpha_rgba_u16, unpremultiply_alpha_rgba_u16}; use crate::alpha_handle_u8::{premultiply_alpha_rgba, unpremultiply_alpha_rgba}; use crate::pic_scale_error::{PicScaleBufferMismatch, PicScaleError}; use crate::ImageSize; -use num_traits::FromPrimitive; +#[cfg(feature = "nightly_f16")] +use core::f16; use rayon::ThreadPool; use std::borrow::Cow; use std::fmt::Debug; @@ -51,7 +52,7 @@ use std::fmt::Debug; #[derive(Debug, Clone)] pub struct ImageStore<'a, T, const N: usize> where - T: FromPrimitive + Clone + Copy + Debug, + T: Clone + Copy + Debug, { pub buffer: std::borrow::Cow<'a, [T]>, /// Channels in the image @@ -78,7 +79,7 @@ where #[derive(Debug)] pub struct ImageStoreMut<'a, T, const N: usize> where - T: FromPrimitive + Clone + Copy + Debug, + T: Clone + Copy + Debug, { pub buffer: BufferStore<'a, T>, /// Channels in the image @@ -123,7 +124,7 @@ impl BufferStore<'_, T> { impl<'a, T, const N: usize> ImageStore<'a, T, N> where - T: FromPrimitive + Clone + Copy + Debug + Default, + T: Clone + Copy + Debug + Default, { pub fn new( slice_ref: Vec, @@ -151,7 +152,7 @@ where } pub fn alloc(width: usize, height: usize) -> ImageStore<'a, T, N> { - let vc = vec![T::from_u32(0).unwrap_or_default(); width * N * height]; + let vc = vec![T::default(); width * N * height]; ImageStore:: { buffer: std::borrow::Cow::Owned(vc), channels: N, @@ -175,8 +176,8 @@ impl CheckStoreDensity for ImageStoreMut<'_, f32, N> { } } -#[cfg(feature = "half")] -impl CheckStoreDensity for ImageStoreMut<'_, half::f16, N> { +#[cfg(feature = "nightly_f16")] +impl CheckStoreDensity for ImageStoreMut<'_, f16, N> { fn should_have_bit_depth(&self) -> bool { false } @@ -190,7 +191,7 @@ impl CheckStoreDensity for ImageStoreMut<'_, u16, N> { impl ImageStoreMut<'_, T, N> where - T: FromPrimitive + Clone + Copy + Debug + Default, + T: Clone + Copy + Debug + Default, { pub(crate) fn validate(&self) -> Result<(), PicScaleError> { let expected_size = self.stride() * self.height; @@ -212,7 +213,7 @@ where impl ImageStore<'_, T, N> where - T: FromPrimitive + Clone + Copy + Debug + Default, + T: Clone + Copy + Debug + Default, { pub(crate) fn validate(&self) -> Result<(), PicScaleError> { let expected_size = self.stride() * self.height; @@ -234,7 +235,7 @@ where impl<'a, T, const N: usize> ImageStoreMut<'a, T, N> where - T: FromPrimitive + Clone + Copy + Debug + Default, + T: Clone + Copy + Debug + Default, { /// Creates new mutable storage from vectors /// @@ -268,7 +269,7 @@ where /// /// Always sets bit depth to `0` pub fn alloc(width: usize, height: usize) -> ImageStoreMut<'a, T, N> { - let vc = vec![T::from_u32(0).unwrap_or_default(); width * N * height]; + let vc = vec![T::default(); width * N * height]; ImageStoreMut:: { buffer: BufferStore::Owned(vc), channels: N, @@ -285,7 +286,7 @@ where height: usize, bit_depth: usize, ) -> ImageStoreMut<'a, T, N> { - let vc = vec![T::from_u32(0).unwrap_or_default(); width * N * height]; + let vc = vec![T::default(); width * N * height]; ImageStoreMut:: { buffer: BufferStore::Owned(vc), channels: N, @@ -299,7 +300,7 @@ where impl ImageStoreMut<'_, T, N> where - T: FromPrimitive + Clone + Copy + Debug, + T: Clone + Copy + Debug, { /// Returns safe stride /// @@ 
-315,7 +316,7 @@ where impl ImageStore<'_, T, N> where - T: FromPrimitive + Clone + Copy + Debug, + T: Clone + Copy + Debug, { /// Returns safe stride /// @@ -331,7 +332,7 @@ where impl<'a, T, const N: usize> ImageStore<'a, T, N> where - T: FromPrimitive + Clone + Copy + Debug, + T: Clone + Copy + Debug, { /// Returns bounded image size pub fn get_size(&self) -> ImageSize { @@ -398,7 +399,7 @@ where impl<'a, T, const N: usize> ImageStoreMut<'a, T, N> where - T: FromPrimitive + Clone + Copy + Debug, + T: Clone + Copy + Debug, { /// Returns bounded image size pub fn get_size(&self) -> ImageSize { @@ -460,12 +461,12 @@ where } } -pub(crate) trait AssociateAlpha { +pub(crate) trait AssociateAlpha { fn premultiply_alpha(&self, into: &mut ImageStoreMut<'_, T, N>, pool: &Option); fn is_alpha_premultiplication_needed(&self) -> bool; } -pub(crate) trait UnassociateAlpha { +pub(crate) trait UnassociateAlpha { fn unpremultiply_alpha(&mut self, pool: &Option); } @@ -615,13 +616,9 @@ impl AssociateAlpha for ImageStore<'_, f32, 4> { } } -#[cfg(feature = "half")] -impl AssociateAlpha for ImageStore<'_, half::f16, 4> { - fn premultiply_alpha( - &self, - into: &mut ImageStoreMut<'_, half::f16, 4>, - pool: &Option, - ) { +#[cfg(feature = "nightly_f16")] +impl AssociateAlpha for ImageStore<'_, f16, 4> { + fn premultiply_alpha(&self, into: &mut ImageStoreMut<'_, f16, 4>, pool: &Option) { let src_stride = self.stride(); let dst_stride = into.stride(); let dst = into.buffer.borrow_mut(); @@ -665,8 +662,8 @@ impl UnassociateAlpha for ImageStoreMut<'_, f32, 4> { } } -#[cfg(feature = "half")] -impl UnassociateAlpha for ImageStoreMut<'_, half::f16, 4> { +#[cfg(feature = "nightly_f16")] +impl UnassociateAlpha for ImageStoreMut<'_, f16, 4> { fn unpremultiply_alpha(&mut self, pool: &Option) { let stride = self.stride(); let dst = self.buffer.borrow_mut(); @@ -692,22 +689,22 @@ pub type Rgba16ImageStoreMut<'a> = ImageStoreMut<'a, u16, 4>; pub type Rgb16ImageStore<'a> = ImageStore<'a, u16, 3>; pub type Rgb16ImageStoreMut<'a> = ImageStoreMut<'a, u16, 3>; -#[cfg(feature = "half")] -pub type PlanarF16ImageStore<'a> = ImageStore<'a, half::f16, 1>; -#[cfg(feature = "half")] -pub type PlanarF16ImageStoreMut<'a> = ImageStoreMut<'a, half::f16, 1>; -#[cfg(feature = "half")] -pub type CbCrF16ImageStore<'a> = ImageStore<'a, half::f16, 2>; -#[cfg(feature = "half")] -pub type CbCrF16ImageStoreMut<'a> = ImageStoreMut<'a, half::f16, 2>; -#[cfg(feature = "half")] -pub type RgbaF16ImageStore<'a> = ImageStore<'a, half::f16, 4>; -#[cfg(feature = "half")] -pub type RgbaF16ImageStoreMut<'a> = ImageStoreMut<'a, half::f16, 4>; -#[cfg(feature = "half")] -pub type RgbF16ImageStore<'a> = ImageStore<'a, half::f16, 3>; -#[cfg(feature = "half")] -pub type RgbF16ImageStoreMut<'a> = ImageStoreMut<'a, half::f16, 3>; +#[cfg(feature = "nightly_f16")] +pub type PlanarF16ImageStore<'a> = ImageStore<'a, f16, 1>; +#[cfg(feature = "nightly_f16")] +pub type PlanarF16ImageStoreMut<'a> = ImageStoreMut<'a, f16, 1>; +#[cfg(feature = "nightly_f16")] +pub type CbCrF16ImageStore<'a> = ImageStore<'a, f16, 2>; +#[cfg(feature = "nightly_f16")] +pub type CbCrF16ImageStoreMut<'a> = ImageStoreMut<'a, f16, 2>; +#[cfg(feature = "nightly_f16")] +pub type RgbaF16ImageStore<'a> = ImageStore<'a, f16, 4>; +#[cfg(feature = "nightly_f16")] +pub type RgbaF16ImageStoreMut<'a> = ImageStoreMut<'a, f16, 4>; +#[cfg(feature = "nightly_f16")] +pub type RgbF16ImageStore<'a> = ImageStore<'a, f16, 3>; +#[cfg(feature = "nightly_f16")] +pub type RgbF16ImageStoreMut<'a> = ImageStoreMut<'a, f16, 
3>; pub type PlanarF32ImageStore<'a> = ImageStore<'a, f32, 1>; pub type PlanarF32ImageStoreMut<'a> = ImageStoreMut<'a, f32, 1>; diff --git a/src/lib.rs b/src/lib.rs index 8ba1341..1d31842 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -29,13 +29,38 @@ #![deny(deprecated)] // #![deny(unreachable_code, unused)] #![allow(clippy::too_many_arguments)] -#![cfg_attr(feature = "nightly_avx512", feature(cfg_version))] -#![cfg_attr(feature = "nightly_avx512", feature(avx512_target_feature))] -#![cfg_attr(feature = "nightly_avx512", feature(stdarch_x86_avx512))] -#![cfg_attr(feature = "nightly_avx512fp16", feature(stdarch_x86_avx512_f16))] +#![cfg_attr( + all( + feature = "nightly_avx512", + any(target_arch = "x86", target_arch = "x86_64") + ), + feature(cfg_version) +)] +#![cfg_attr( + all( + feature = "nightly_avx512", + any(target_arch = "x86", target_arch = "x86_64") + ), + feature(avx512_target_feature) +)] +#![cfg_attr( + all( + feature = "nightly_avx512", + any(target_arch = "x86", target_arch = "x86_64") + ), + feature(stdarch_x86_avx512) +)] +#![cfg_attr( + all( + feature = "nightly_avx512fp16", + any(target_arch = "x86", target_arch = "x86_64") + ), + feature(stdarch_x86_avx512_f16) +)] +#![cfg_attr(feature = "nightly_f16", feature(f16))] mod alpha_check; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod alpha_handle_f16; mod alpha_handle_f32; mod alpha_handle_u16; @@ -58,12 +83,12 @@ mod convolution; mod convolve_naive_f32; mod cpu_features; mod dispatch_group_ar30; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod dispatch_group_f16; mod dispatch_group_f32; mod dispatch_group_u16; mod dispatch_group_u8; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod f16; mod filter_weights; mod fixed_point_horizontal; @@ -95,7 +120,7 @@ mod rgba_u8; mod sampler; mod saturate_narrow; mod scaler; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod scaler_f16; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] mod sse; @@ -118,7 +143,7 @@ pub use image_store::{ RgbF32ImageStore, RgbF32ImageStoreMut, Rgba16ImageStore, Rgba16ImageStoreMut, Rgba8ImageStore, Rgba8ImageStoreMut, RgbaF32ImageStore, RgbaF32ImageStoreMut, }; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub use image_store::{ CbCrF16ImageStore, CbCrF16ImageStoreMut, PlanarF16ImageStore, PlanarF16ImageStoreMut, RgbF16ImageStore, RgbF16ImageStoreMut, RgbaF16ImageStore, RgbaF16ImageStoreMut, @@ -129,5 +154,5 @@ pub use sampler::*; pub use scaler::Scaling; pub use scaler::ScalingF32; pub use scaler::ScalingU16; -pub use scaler::{ImageStoreScaling, Scaler, ScalingOptions}; +pub use scaler::{ImageStoreScaling, Scaler, ScalingOptions, WorkloadStrategy}; pub use threading_policy::*; diff --git a/src/mixed_storage.rs b/src/mixed_storage.rs index 1cc63ba..3c14ccc 100644 --- a/src/mixed_storage.rs +++ b/src/mixed_storage.rs @@ -55,12 +55,15 @@ impl MixedStorage for f32 { } } -#[cfg(feature = "half")] -impl MixedStorage for f32 { +#[cfg(feature = "nightly_f16")] +use core::f16; + +#[cfg(feature = "nightly_f16")] +impl MixedStorage for f32 { #[inline(always)] #[allow(clippy::manual_clamp)] - fn to_mixed(self, _: u32) -> half::f16 { - half::f16::from_f32(self) + fn to_mixed(self, _: u32) -> f16 { + self as f16 } } diff --git a/src/neon/alpha_f16.rs b/src/neon/alpha_f16.rs index eac8f01..40adf23 100644 --- a/src/neon/alpha_f16.rs +++ b/src/neon/alpha_f16.rs @@ -31,11 +31,12 @@ use std::arch::aarch64::*; use crate::alpha_handle_f16::{premultiply_pixel_f16_row, unpremultiply_pixel_f16_row}; use 
crate::neon::f16_utils::*; +use core::f16; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; -unsafe fn neon_premultiply_alpha_rgba_row_f16(dst: &mut [half::f16], src: &[half::f16]) { +unsafe fn neon_premultiply_alpha_rgba_row_f16(dst: &mut [f16], src: &[f16]) { let mut rem = dst; let mut src_rem = src; @@ -89,9 +90,9 @@ unsafe fn neon_premultiply_alpha_rgba_row_f16(dst: &mut [half::f16], src: &[half } pub(crate) fn neon_premultiply_alpha_rgba_f16( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, _: usize, @@ -114,7 +115,7 @@ pub(crate) fn neon_premultiply_alpha_rgba_f16( } } -unsafe fn neon_unpremultiply_alpha_rgba_row_f16(in_place: &mut [half::f16]) { +unsafe fn neon_unpremultiply_alpha_rgba_row_f16(in_place: &mut [f16]) { let mut rem = in_place; for dst in rem.chunks_exact_mut(8 * 4) { @@ -182,7 +183,7 @@ unsafe fn neon_unpremultiply_alpha_rgba_row_f16(in_place: &mut [half::f16]) { } pub(crate) fn neon_unpremultiply_alpha_rgba_f16( - in_place: &mut [half::f16], + in_place: &mut [f16], stride: usize, width: usize, _: usize, diff --git a/src/neon/alpha_f16_full.rs b/src/neon/alpha_f16_full.rs index 633f7d7..29eb0ad 100644 --- a/src/neon/alpha_f16_full.rs +++ b/src/neon/alpha_f16_full.rs @@ -31,12 +31,13 @@ use std::arch::aarch64::*; use crate::alpha_handle_f16::{premultiply_pixel_f16_row, unpremultiply_pixel_f16_row}; use crate::neon::f16_utils::*; +use core::f16; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; #[target_feature(enable = "fp16")] -unsafe fn neon_premultiply_alpha_rgba_row_f16_full(dst: &mut [half::f16], src: &[half::f16]) { +unsafe fn neon_premultiply_alpha_rgba_row_f16_full(dst: &mut [f16], src: &[f16]) { let mut rem = dst; let mut src_rem = src; @@ -66,9 +67,9 @@ unsafe fn neon_premultiply_alpha_rgba_row_f16_full(dst: &mut [half::f16], src: & } pub(crate) fn neon_premultiply_alpha_rgba_f16_full( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, _: usize, @@ -95,7 +96,7 @@ pub(crate) fn neon_premultiply_alpha_rgba_f16_full( } #[target_feature(enable = "fp16")] -unsafe fn neon_unpremultiply_alpha_rgba_f16_row_full(in_place: &mut [half::f16]) { +unsafe fn neon_unpremultiply_alpha_rgba_f16_row_full(in_place: &mut [f16]) { let mut rem = in_place; for dst in rem.chunks_exact_mut(8 * 4) { @@ -137,7 +138,7 @@ unsafe fn neon_unpremultiply_alpha_rgba_f16_row_full(in_place: &mut [half::f16]) } pub(crate) fn neon_unpremultiply_alpha_rgba_f16_full( - in_place: &mut [half::f16], + in_place: &mut [f16], stride: usize, width: usize, _: usize, diff --git a/src/neon/ar30.rs b/src/neon/ar30.rs index d846e11..db16224 100644 --- a/src/neon/ar30.rs +++ b/src/neon/ar30.rs @@ -95,9 +95,9 @@ pub(crate) unsafe fn vunzips_4_ar30( +pub(crate) unsafe fn vunzip_3_ar30( v: uint32x4x2_t, -) -> int16x8x4_t { +) -> int16x8x3_t { let mask = vdupq_n_u32(0x3ff); let ar_type: Rgb30 = AR30_TYPE.into(); @@ -121,45 +121,13 @@ pub(crate) unsafe fn vunzip_4_ar30(v.0), mask)), vmovn_u32(vandq_u32(vshrq_n_u32::<20>(v.1), mask)), ); - let va = vcombine_u16( - vmovn_u32(vshrq_n_u32::<30>(v.0)), - vmovn_u32(vshrq_n_u32::<30>(v.1)), - ); - let a = vorrq_u16( - vorrq_u16( - vorrq_u16( - vorrq_u16(vshlq_n_u16::<8>(va), vshlq_n_u16::<6>(va)), - vshlq_n_u16::<4>(va), - ), - 
vshlq_n_u16::<2>(va), - ), - va, - ); - int16x8x4_t( + int16x8x3_t( vreinterpretq_s16_u16(r), vreinterpretq_s16_u16(g), vreinterpretq_s16_u16(b), - vreinterpretq_s16_u16(a), ) } Rgb30::Ra30 => { - let a_mask = vdupq_n_u32(0x3); - let va = vcombine_u16( - vmovn_u32(vandq_u32(v.0, a_mask)), - vmovn_u32(vandq_u32(v.1, a_mask)), - ); - - let a = vorrq_u16( - vorrq_u16( - vorrq_u16( - vorrq_u16(vshlq_n_u16::<8>(va), vshlq_n_u16::<6>(va)), - vshlq_n_u16::<4>(va), - ), - vshlq_n_u16::<2>(va), - ), - va, - ); - let r = vcombine_u16( vmovn_u32(vandq_u32(vshrq_n_u32::<22>(v.0), mask)), vmovn_u32(vandq_u32(vshrq_n_u32::<22>(v.1), mask)), @@ -172,23 +140,110 @@ pub(crate) unsafe fn vunzip_4_ar30(v.0), mask)), vmovn_u32(vandq_u32(vshrq_n_u32::<2>(v.1), mask)), ); - int16x8x4_t( + int16x8x3_t( vreinterpretq_s16_u16(r), vreinterpretq_s16_u16(g), vreinterpretq_s16_u16(b), - vreinterpretq_s16_u16(a), ) } } } +// #[inline(always)] +// pub(crate) unsafe fn vunzip_4_ar30( +// v: uint32x4x2_t, +// ) -> int16x8x4_t { +// let mask = vdupq_n_u32(0x3ff); +// let ar_type: Rgb30 = AR30_TYPE.into(); +// +// let v = if AR30_ORDER == 0 { +// v +// } else { +// uint32x4x2_t(vrev128_u32(v.0), vrev128_u32(v.1)) +// }; +// +// match ar_type { +// Rgb30::Ar30 => { +// let r = vcombine_u16( +// vmovn_u32(vandq_u32(v.0, mask)), +// vmovn_u32(vandq_u32(v.1, mask)), +// ); +// let g = vcombine_u16( +// vmovn_u32(vandq_u32(vshrq_n_u32::<10>(v.0), mask)), +// vmovn_u32(vandq_u32(vshrq_n_u32::<10>(v.1), mask)), +// ); +// let b = vcombine_u16( +// vmovn_u32(vandq_u32(vshrq_n_u32::<20>(v.0), mask)), +// vmovn_u32(vandq_u32(vshrq_n_u32::<20>(v.1), mask)), +// ); +// let va = vcombine_u16( +// vmovn_u32(vshrq_n_u32::<30>(v.0)), +// vmovn_u32(vshrq_n_u32::<30>(v.1)), +// ); +// let a = vorrq_u16( +// vorrq_u16( +// vorrq_u16( +// vorrq_u16(vshlq_n_u16::<8>(va), vshlq_n_u16::<6>(va)), +// vshlq_n_u16::<4>(va), +// ), +// vshlq_n_u16::<2>(va), +// ), +// va, +// ); +// int16x8x4_t( +// vreinterpretq_s16_u16(r), +// vreinterpretq_s16_u16(g), +// vreinterpretq_s16_u16(b), +// vreinterpretq_s16_u16(a), +// ) +// } +// Rgb30::Ra30 => { +// let a_mask = vdupq_n_u32(0x3); +// let va = vcombine_u16( +// vmovn_u32(vandq_u32(v.0, a_mask)), +// vmovn_u32(vandq_u32(v.1, a_mask)), +// ); +// +// let a = vorrq_u16( +// vorrq_u16( +// vorrq_u16( +// vorrq_u16(vshlq_n_u16::<8>(va), vshlq_n_u16::<6>(va)), +// vshlq_n_u16::<4>(va), +// ), +// vshlq_n_u16::<2>(va), +// ), +// va, +// ); +// +// let r = vcombine_u16( +// vmovn_u32(vandq_u32(vshrq_n_u32::<22>(v.0), mask)), +// vmovn_u32(vandq_u32(vshrq_n_u32::<22>(v.1), mask)), +// ); +// let g = vcombine_u16( +// vmovn_u32(vandq_u32(vshrq_n_u32::<12>(v.0), mask)), +// vmovn_u32(vandq_u32(vshrq_n_u32::<12>(v.1), mask)), +// ); +// let b = vcombine_u16( +// vmovn_u32(vandq_u32(vshrq_n_u32::<2>(v.0), mask)), +// vmovn_u32(vandq_u32(vshrq_n_u32::<2>(v.1), mask)), +// ); +// int16x8x4_t( +// vreinterpretq_s16_u16(r), +// vreinterpretq_s16_u16(g), +// vreinterpretq_s16_u16(b), +// vreinterpretq_s16_u16(a), +// ) +// } +// } +// } + #[inline(always)] -pub(crate) unsafe fn vunzip_4_ar30_separate( +pub(crate) unsafe fn vunzip_3_ar30_separate( v: uint32x4x2_t, ) -> int16x8x4_t { - let values = vunzip_4_ar30::(v); + let values = vunzip_3_ar30::(v); let a0 = vtrnq_s16(values.0, values.1); - let a1 = vtrnq_s16(values.2, values.3); + let a1 = vtrnq_s16(values.2, vdupq_n_s16(3)); let v1 = vtrnq_s32(vreinterpretq_s32_s16(a0.0), vreinterpretq_s32_s16(a1.0)); let v2 = vtrnq_s32(vreinterpretq_s32_s16(a0.1), 
vreinterpretq_s32_s16(a1.1)); let k0 = vreinterpretq_s16_s32(v1.0); @@ -219,12 +274,12 @@ pub(crate) unsafe fn vzip_4_ar30 uint32x4x2_t { let ar_type: Rgb30 = AR30_TYPE.into(); - let a_max = vdupq_n_s16(3); + // let a_max = vdupq_n_s16(3); match ar_type { Rgb30::Ar30 => { - let v3 = vminq_s16(vrshrq_n_s16::<8>(v.3), a_max); - let mut a0 = vshlq_n_u32::<30>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v3)))); - let mut a1 = vshlq_n_u32::<30>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v3)))); + // let v3 = vminq_s16(vrshrq_n_s16::<8>(v.3), a_max); + let mut a0 = vdupq_n_u32(3 << 30); //vshlq_n_u32::<30>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v3)))); + let mut a1 = vdupq_n_u32(3 << 30); // vshlq_n_u32::<30>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v3)))); let r0 = vshlq_n_u32::<20>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v.2)))); let r1 = vshlq_n_u32::<20>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v.2)))); @@ -248,9 +303,9 @@ pub(crate) unsafe fn vzip_4_ar30 { - let v3 = vminq_s16(vrshrq_n_s16::<8>(v.3), a_max); - let mut a0 = vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v3))); - let mut a1 = vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v3))); + // let v3 = vminq_s16(vrshrq_n_s16::<8>(v.3), a_max); + let mut a0 = vdupq_n_u32(3); //vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v3))); + let mut a1 = vdupq_n_u32(3); //vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v3))); let r0 = vshlq_n_u32::<22>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v.0)))); let r1 = vshlq_n_u32::<22>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v.0)))); @@ -284,9 +339,14 @@ pub(crate) unsafe fn vzip_4_ar30( - arr: &[u32], + arr: &[u8], ) -> int16x4_t { - let item = *arr.get_unchecked(0); + let item = u32::from_ne_bytes([ + *arr.get_unchecked(0), + *arr.get_unchecked(1), + *arr.get_unchecked(2), + *arr.get_unchecked(3), + ]); let ar_type: Rgb30 = AR30_TYPE.into(); let vl = ar_type.unpack::(item); let a_rep = (vl.3 as i16) << 8; diff --git a/src/neon/convolve_f16.rs b/src/neon/convolve_f16.rs index 8d0ada8..1bd57e7 100644 --- a/src/neon/convolve_f16.rs +++ b/src/neon/convolve_f16.rs @@ -31,14 +31,15 @@ use std::arch::aarch64::{vdupq_n_f32, vld1q_dup_f32}; use crate::filter_weights::FilterBounds; use crate::neon::utils::prefer_vfmaq_f32; use crate::neon::*; +use core::f16; #[inline(always)] pub(crate) unsafe fn convolve_vertical_part_neon_8_f16( start_y: usize, start_x: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, blend_length: usize, @@ -56,7 +57,7 @@ pub(crate) unsafe fn convolve_vertical_part_neon_8_f16 let s_ptr = src_ptr.add(px); let item_row = if USE_BLENDING { - let mut transient: [half::f16; 8] = [half::f16::from_f32(0.); 8]; + let mut transient: [f16; 8] = [0.; 8]; std::ptr::copy_nonoverlapping(s_ptr, transient.as_mut_ptr(), blend_length); xvldq_f16(transient.as_ptr()) } else { @@ -74,7 +75,7 @@ pub(crate) unsafe fn convolve_vertical_part_neon_8_f16 let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); if USE_BLENDING { - let mut transient: [half::f16; 8] = [half::f16::from_f32(0.); 8]; + let mut transient: [f16; 8] = [0.; 8]; xvstq_f16(transient.as_mut_ptr(), item); std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, blend_length); } else { diff --git a/src/neon/f16_utils.rs b/src/neon/f16_utils.rs index 8644646..de8a103 100644 --- a/src/neon/f16_utils.rs +++ b/src/neon/f16_utils.rs @@ -28,6 +28,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#[cfg(feature = "nightly_f16")] +use core::f16; use std::arch::aarch64::*; use std::arch::asm; @@ -89,22 +91,22 @@ pub(crate) struct x_float16x8x4_t( ); #[inline] -#[cfg(feature = "half")] -pub(crate) unsafe fn xvld_f16(ptr: *const half::f16) -> x_float16x4_t { +#[cfg(feature = "nightly_f16")] +pub(crate) unsafe fn xvld_f16(ptr: *const f16) -> x_float16x4_t { let store: uint16x4_t = vld1_u16(ptr as *const _); std::mem::transmute(store) } #[inline] -#[cfg(feature = "half")] -pub(crate) unsafe fn xvldq_f16(ptr: *const half::f16) -> x_float16x8_t { +#[cfg(feature = "nightly_f16")] +pub(crate) unsafe fn xvldq_f16(ptr: *const f16) -> x_float16x8_t { let store: uint16x8_t = vld1q_u16(ptr as *const _); std::mem::transmute(store) } #[inline] -#[cfg(feature = "half")] -pub(crate) unsafe fn xvldq_f16_x2(ptr: *const half::f16) -> x_float16x8x2_t { +#[cfg(feature = "nightly_f16")] +pub(crate) unsafe fn xvldq_f16_x2(ptr: *const f16) -> x_float16x8x2_t { let ptr_u16 = ptr as *const u16; x_float16x8x2_t( xreinterpretq_f16_u16(vld1q_u16(ptr_u16)), @@ -113,8 +115,8 @@ pub(crate) unsafe fn xvldq_f16_x2(ptr: *const half::f16) -> x_float16x8x2_t { } #[inline] -#[cfg(feature = "half")] -pub(crate) unsafe fn xvldq_f16_x4(ptr: *const half::f16) -> x_float16x8x4_t { +#[cfg(feature = "nightly_f16")] +pub(crate) unsafe fn xvldq_f16_x4(ptr: *const f16) -> x_float16x8x4_t { let ptr_u16 = ptr as *const u16; x_float16x8x4_t( xreinterpretq_f16_u16(vld1q_u16(ptr_u16)), @@ -362,6 +364,330 @@ pub(super) unsafe fn xvfmla_f16( xreinterpret_f16_u16(result) } +/// Floating-point fused Multiply-Add Long to accumulator (vector). +/// This instruction multiplies corresponding half-precision floating-point values +/// in the vectors in the two source SIMD&FP registers, and accumulates the product +/// to the corresponding vector element of the destination SIMD&FP register. +/// The instruction does not round the result of the multiply before the accumulation. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlalq_high_f16) +#[target_feature(enable = "fhm")] +#[inline] +pub(super) unsafe fn xvfmlalq_high_f16( + a: float32x4_t, + b: x_float16x8_t, + c: x_float16x8_t, +) -> float32x4_t { + let mut result: float32x4_t = a; + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.4h", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + result +} + +/// Floating-point fused Multiply-Add Long to accumulator (vector). +/// This instruction multiplies corresponding half-precision floating-point values +/// in the vectors in the two source SIMD&FP registers, and accumulates the product +/// to the corresponding vector element of the destination SIMD&FP register. +/// The instruction does not round the result of the multiply before the accumulation. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlalq_low_f16) +#[target_feature(enable = "fhm")] +#[inline] +pub(super) unsafe fn xvfmlalq_low_f16( + a: float32x4_t, + b: x_float16x8_t, + c: x_float16x8_t, +) -> float32x4_t { + let mut result: float32x4_t = a; + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.4h", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + result +} + +/// Floating-point fused Multiply-Add Long to accumulator (vector). 
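As a sanity reference for the fmlal and fmlal2 wrappers introduced here, the widening behaviour can be modelled in scalar code: the low form consumes half-precision lanes 0..3 of each source, the high form lanes 4..7, and each product is accumulated into an f32 lane without being rounded separately first (approximated below with mul_add). The f16 inputs are shown already widened to f32, which is a simplification so the sketch compiles on stable Rust.

/// Scalar model of FMLAL (low halves); b and c are assumed pre-widened from f16.
fn fmlal_low_ref(acc: [f32; 4], b: [f32; 8], c: [f32; 8]) -> [f32; 4] {
    let mut out = acc;
    for i in 0..4 {
        // mul_add keeps the product unrounded before the addition,
        // matching the fused semantics described above.
        out[i] = b[i].mul_add(c[i], out[i]);
    }
    out
}

/// Scalar model of FMLAL2 (high halves).
fn fmlal_high_ref(acc: [f32; 4], b: [f32; 8], c: [f32; 8]) -> [f32; 4] {
    let mut out = acc;
    for i in 0..4 {
        out[i] = b[i + 4].mul_add(c[i + 4], out[i]);
    }
    out
}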
+/// This instruction multiplies corresponding half-precision floating-point values +/// in the vectors in the two source SIMD&FP registers, and accumulates the product +/// to the corresponding vector element of the destination SIMD&FP register. +/// The instruction does not round the result of the multiply before the accumulation. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlalq_lane_low_f16) +#[target_feature(enable = "fhm")] +#[inline] +pub(super) unsafe fn xvfmlalq_lane_low_f16( + a: float32x4_t, + b: x_float16x8_t, + c: x_float16x4_t, +) -> float32x4_t { + let mut result: float32x4_t = a; + static_assert_uimm_bits!(LANE, 3); + let full_lane = xvcombine_f16(c, c); + if LANE == 0 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[0]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(full_lane), + options(pure, nomem, nostack) + ); + } else if LANE == 1 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[1]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(full_lane), + options(pure, nomem, nostack) + ); + } else if LANE == 2 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[2]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(full_lane), + options(pure, nomem, nostack) + ); + } else if LANE == 3 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[3]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(full_lane), + options(pure, nomem, nostack) + ); + } + result +} + +/// Floating-point fused Multiply-Add Long to accumulator (vector). +/// This instruction multiplies corresponding half-precision floating-point values +/// in the vectors in the two source SIMD&FP registers, and accumulates the product +/// to the corresponding vector element of the destination SIMD&FP register. +/// The instruction does not round the result of the multiply before the accumulation. 
+/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vfmlalq_laneq_low_f16) +#[target_feature(enable = "fhm")] +#[inline] +pub(super) unsafe fn xvfmlalq_laneq_low_f16( + a: float32x4_t, + b: x_float16x8_t, + c: x_float16x8_t, +) -> float32x4_t { + let mut result: float32x4_t = a; + static_assert_uimm_bits!(LANE, 3); + if LANE == 0 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[0]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 1 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[1]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 2 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[2]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 3 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[3]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 4 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[4]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 5 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[5]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 6 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[6]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 7 { + asm!( + "fmlal {0:v}.4s, {1:v}.4h, {2:v}.h[7]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } + result +} + +/// Floating-point fused Multiply-Add Long to accumulator (vector). +/// This instruction multiplies corresponding half-precision floating-point values +/// in the vectors in the two source SIMD&FP registers, and accumulates the product +/// to the corresponding vector element of the destination SIMD&FP register. +/// The instruction does not round the result of the multiply before the accumulation. 
+/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/xvfmlalq_lane_high_f16) +#[target_feature(enable = "fhm")] +#[inline] +pub(super) unsafe fn xvfmlalq_lane_high_f16( + a: float32x4_t, + b: x_float16x8_t, + c: x_float16x4_t, +) -> float32x4_t { + let mut result: float32x4_t = a; + static_assert_uimm_bits!(LANE, 3); + let full_lane = xvcombine_f16(c, c); + if LANE == 0 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[0]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(full_lane), + options(pure, nomem, nostack) + ); + } else if LANE == 1 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[1]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(full_lane), + options(pure, nomem, nostack) + ); + } else if LANE == 2 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[2]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(full_lane), + options(pure, nomem, nostack) + ); + } else if LANE == 3 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[3]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(full_lane), + options(pure, nomem, nostack) + ); + } + result +} + +/// Floating-point fused Multiply-Add Long to accumulator (vector). +/// This instruction multiplies corresponding half-precision floating-point values +/// in the vectors in the two source SIMD&FP registers, and accumulates the product +/// to the corresponding vector element of the destination SIMD&FP register. +/// The instruction does not round the result of the multiply before the accumulation. +/// +/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/xvfmlalq_laneq_high_f16) +#[target_feature(enable = "fhm")] +#[inline] +pub(super) unsafe fn xvfmlalq_laneq_high_f16( + a: float32x4_t, + b: x_float16x8_t, + c: x_float16x8_t, +) -> float32x4_t { + let mut result: float32x4_t = a; + static_assert_uimm_bits!(LANE, 3); + if LANE == 0 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[0]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 1 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[1]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 2 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[2]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 3 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[3]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 4 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[4]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 5 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[5]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 6 { + asm!( + "fmlal2 {0:v}.4s, {1:v}.4h, {2:v}.h[6]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 7 { + asm!( + "fmlal2 {0:v}.4s, 
{1:v}.4h, {2:v}.h[7]", + inout(vreg) result, + in(vreg) xreinterpretq_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } + result +} + /// Floating-point fused Multiply-Add to accumulator (vector). /// This instruction multiplies corresponding floating-point values in the vectors /// in the two source SIMD&FP registers, adds the product to the corresponding @@ -665,28 +991,28 @@ pub(super) unsafe fn xvbslq_f16( } #[inline] -#[cfg(feature = "half")] -pub(crate) unsafe fn xvst_f16(ptr: *mut half::f16, x: x_float16x4_t) { +#[cfg(feature = "nightly_f16")] +pub(crate) unsafe fn xvst_f16(ptr: *mut f16, x: x_float16x4_t) { vst1_u16(ptr as *mut u16, xreinterpret_u16_f16(x)) } #[inline] -#[cfg(feature = "half")] -pub(crate) unsafe fn xvstq_f16(ptr: *mut half::f16, x: x_float16x8_t) { +#[cfg(feature = "nightly_f16")] +pub(crate) unsafe fn xvstq_f16(ptr: *mut f16, x: x_float16x8_t) { vst1q_u16(ptr as *mut u16, xreinterpretq_u16_f16(x)) } #[inline] -#[cfg(feature = "half")] -pub(crate) unsafe fn xvstq_f16_x2(ptr: *mut half::f16, x: x_float16x8x2_t) { +#[cfg(feature = "nightly_f16")] +pub(crate) unsafe fn xvstq_f16_x2(ptr: *mut f16, x: x_float16x8x2_t) { let ptr_u16 = ptr as *mut u16; vst1q_u16(ptr_u16, xreinterpretq_u16_f16(x.0)); vst1q_u16(ptr_u16.add(8), xreinterpretq_u16_f16(x.1)); } #[inline] -#[cfg(feature = "half")] -pub(crate) unsafe fn xvstq_f16_x4(ptr: *const half::f16, x: x_float16x8x4_t) { +#[cfg(feature = "nightly_f16")] +pub(crate) unsafe fn xvstq_f16_x4(ptr: *const f16, x: x_float16x8x4_t) { let ptr_u16 = ptr as *mut u16; vst1q_u16(ptr_u16, xreinterpretq_u16_f16(x.0)); vst1q_u16(ptr_u16.add(8), xreinterpretq_u16_f16(x.1)); @@ -705,9 +1031,9 @@ pub(crate) unsafe fn xvdup_laneq_f16(a: x_float16x8_t) -> x_float1 } #[inline] -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) unsafe fn xvld1q_lane_f16( - ptr: *const half::f16, + ptr: *const f16, src: x_float16x8_t, ) -> x_float16x8_t { xreinterpretq_f16_u16(vld1q_lane_u16::( @@ -717,11 +1043,8 @@ pub(crate) unsafe fn xvld1q_lane_f16( } #[inline] -#[cfg(feature = "half")] -pub(crate) unsafe fn xvsetq_lane_f16( - v: half::f16, - r: x_float16x8_t, -) -> x_float16x8_t { +#[cfg(feature = "nightly_f16")] +pub(crate) unsafe fn xvsetq_lane_f16(v: f16, r: x_float16x8_t) -> x_float16x8_t { xreinterpretq_f16_u16(vsetq_lane_u16::( v.to_bits(), xreinterpretq_u16_f16(r), diff --git a/src/neon/horizontal_ar30.rs b/src/neon/horizontal_ar30_rdm.rs similarity index 88% rename from src/neon/horizontal_ar30.rs rename to src/neon/horizontal_ar30_rdm.rs index ea489ba..0aa7e99 100644 --- a/src/neon/horizontal_ar30.rs +++ b/src/neon/horizontal_ar30_rdm.rs @@ -28,7 +28,7 @@ */ use crate::filter_weights::FilterWeights; use crate::neon::ar30::{ - vextract_ar30, vld1_ar30_s16, vunzip_4_ar30_separate, vunzips_4_ar30_separate, + vextract_ar30, vld1_ar30_s16, vunzip_3_ar30_separate, vunzips_4_ar30_separate, }; use std::arch::aarch64::*; @@ -39,11 +39,11 @@ unsafe fn conv_horiz_rgba_1_u8_i16< const AR_ORDER: usize, >( start_x: usize, - src: &[u32], + src: &[u8], w0: int16x4_t, store: int16x4_t, ) -> int16x4_t { - let src_ptr = src.get_unchecked(start_x..); + let src_ptr = src.get_unchecked(start_x * 4..); let ld = vld1_ar30_s16::(src_ptr); let rgba_pixel = vshl_n_s16::(ld); vqrdmlah_s16(store, rgba_pixel, w0) @@ -56,14 +56,15 @@ unsafe fn conv_horiz_rgba_8_u8_i16< const AR_ORDER: usize, >( start_x: usize, - src: &[u32], + src: &[u8], set1: (int16x4_t, int16x4_t, int16x4_t, int16x4_t), set2: (int16x4_t, int16x4_t, 
int16x4_t, int16x4_t), store: int16x4_t, ) -> int16x4_t { - let src_ptr = src.get_unchecked(start_x..); + let src_ptr = src.get_unchecked(start_x * 4..); - let rgba_pixel = vunzip_4_ar30_separate::(vld1q_u32_x2(src_ptr.as_ptr())); + let rgba_pixel = + vunzip_3_ar30_separate::(vld1q_u32_x2(src_ptr.as_ptr() as *const _)); let hi0 = vshlq_n_s16::(rgba_pixel.1); let lo0 = vshlq_n_s16::(rgba_pixel.0); @@ -88,16 +89,17 @@ unsafe fn conv_horiz_rgba_4_u8_i16< const AR_ORDER: usize, >( start_x: usize, - src: &[u32], + src: &[u8], w0: int16x4_t, w1: int16x4_t, w2: int16x4_t, w3: int16x4_t, store: int16x4_t, ) -> int16x4_t { - let src_ptr = src.get_unchecked(start_x..); + let src_ptr = src.get_unchecked(start_x * 4..); - let rgba_pixel = vunzips_4_ar30_separate::(vld1q_u32(src_ptr.as_ptr())); + let rgba_pixel = + vunzips_4_ar30_separate::(vld1q_u32(src_ptr.as_ptr() as *const _)); let hi = vshlq_n_s16::(rgba_pixel.1); let lo = vshlq_n_s16::(rgba_pixel.0); @@ -115,9 +117,9 @@ pub(crate) fn neon_convolve_horizontal_rgba_rows_4_ar30< const AR_TYPE: usize, const AR_ORDER: usize, >( - src: &[u32], + src: &[u8], src_stride: usize, - dst: &mut [u32], + dst: &mut [u8], dst_stride: usize, filter_weights: &FilterWeights, ) { @@ -134,9 +136,9 @@ pub(crate) fn neon_convolve_horizontal_rgba_rows_4_ar30< #[target_feature(enable = "rdm")] unsafe fn neon_convolve_horizontal_rgba_rows_4_impl( - src: &[u32], + src: &[u8], src_stride: usize, - dst: &mut [u32], + dst: &mut [u8], dst_stride: usize, filter_weights: &FilterWeights, ) { @@ -153,10 +155,10 @@ unsafe fn neon_convolve_horizontal_rgba_rows_4_impl(store_16_0); - *chunk0 = packed0; - let packed1 = vextract_ar30::(store_16_1); - *chunk1 = packed1; - let packed2 = vextract_ar30::(store_16_2); - *chunk2 = packed2; - let packed3 = vextract_ar30::(store_16_3); - *chunk3 = packed3; + let packed0 = vextract_ar30::(store_16_0).to_ne_bytes(); + chunk0[0] = packed0[0]; + chunk0[1] = packed0[1]; + chunk0[2] = packed0[2]; + chunk0[3] = packed0[3]; + let packed1 = vextract_ar30::(store_16_1).to_ne_bytes(); + chunk1[0] = packed1[0]; + chunk1[1] = packed1[1]; + chunk1[2] = packed1[2]; + chunk1[3] = packed1[3]; + let packed2 = vextract_ar30::(store_16_2).to_ne_bytes(); + chunk2[0] = packed2[0]; + chunk2[1] = packed2[1]; + chunk2[2] = packed2[2]; + chunk2[3] = packed2[3]; + let packed3 = vextract_ar30::(store_16_3).to_ne_bytes(); + chunk3[0] = packed3[0]; + chunk3[1] = packed3[1]; + chunk3[2] = packed3[2]; + chunk3[3] = packed3[3]; } } } diff --git a/src/neon/mod.rs b/src/neon/mod.rs index 4ccabc4..ca5f2bb 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -26,53 +26,69 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
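Because the AR30 rows are now passed as byte slices rather than u32 slices, every pixel access goes through an explicit native-endian reassembly, which is also why the convolution helpers index with start_x * 4. The scalar equivalent of the load and store pattern used above is roughly:

/// Assemble one packed AR30 pixel from a byte row (x is a pixel index).
fn load_ar30(row: &[u8], x: usize) -> u32 {
    let p = &row[x * 4..x * 4 + 4];
    u32::from_ne_bytes([p[0], p[1], p[2], p[3]])
}

/// Scatter one packed AR30 pixel back into a byte row.
fn store_ar30(row: &mut [u8], x: usize, px: u32) {
    row[x * 4..x * 4 + 4].copy_from_slice(&px.to_ne_bytes());
}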
*/ -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod alpha_f16; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod alpha_f16_full; mod alpha_f32; mod alpha_u16; mod alpha_u8; +#[cfg(feature = "rdm")] mod ar30; +#[cfg(feature = "rdm")] mod cbcr8_rdm; mod check_alpha; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod convolve_f16; mod f16_utils; -mod horizontal_ar30; +#[cfg(feature = "rdm")] +mod horizontal_ar30_rdm; mod plane_f32; mod plane_u8; +#[cfg(feature = "rdm")] mod plane_u8_rdm; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod rgb_f16; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] +mod rgb_f16_fhm; +#[cfg(feature = "nightly_f16")] mod rgb_f16_full; mod rgb_f32; mod rgb_u8; +#[cfg(feature = "rdm")] mod rgb_u8_sqrdml; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod rgba_f16; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] +mod rgba_f16_fhm; +#[cfg(feature = "nightly_f16")] mod rgba_f16_full; mod rgba_f32; mod rgba_u16_lb; mod rgba_u8; +#[cfg(feature = "rdm")] +mod rgba_u8_rdm; mod utils; -mod vertical_ar30; -#[cfg(feature = "half")] +#[cfg(feature = "rdm")] +mod vertical_ar30_rdm; +#[cfg(feature = "nightly_f16")] mod vertical_f16; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] +mod vertical_f16_fhm; +#[cfg(feature = "nightly_f16")] mod vertical_f16_full; mod vertical_f32; mod vertical_u16; mod vertical_u16_lb; mod vertical_u16_lb_f16; mod vertical_u8; +#[cfg(feature = "rdm")] +mod vertical_u8_rdm; mod weights; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use alpha_f16::{neon_premultiply_alpha_rgba_f16, neon_unpremultiply_alpha_rgba_f16}; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use alpha_f16_full::{ neon_premultiply_alpha_rgba_f16_full, neon_unpremultiply_alpha_rgba_f16_full, }; @@ -81,6 +97,7 @@ pub(crate) use alpha_f32::neon_unpremultiply_alpha_rgba_f32; pub(crate) use alpha_u16::{neon_premultiply_alpha_rgba_u16, neon_unpremultiply_alpha_rgba_u16}; pub(crate) use alpha_u8::neon_premultiply_alpha_rgba; pub(crate) use alpha_u8::neon_unpremultiply_alpha_rgba; +#[cfg(feature = "rdm")] pub(crate) use cbcr8_rdm::{ convolve_horizontal_cbcr_neon_rdm_row, convolve_horizontal_cbcr_neon_rows_rdm_4_u8, }; @@ -88,18 +105,27 @@ pub(crate) use check_alpha::{ neon_has_non_constant_cap_alpha_rgba16, neon_has_non_constant_cap_alpha_rgba8, }; pub(crate) use f16_utils::*; -pub(crate) use horizontal_ar30::neon_convolve_horizontal_rgba_rows_4_ar30; +#[cfg(feature = "rdm")] +pub(crate) use horizontal_ar30_rdm::neon_convolve_horizontal_rgba_rows_4_ar30; pub(crate) use plane_f32::convolve_horizontal_plane_neon_row_one; pub(crate) use plane_f32::convolve_horizontal_plane_neon_rows_4; -pub use plane_u8::{convolve_horizontal_plane_neon_row, convolve_horizontal_plane_neon_rows_4_u8}; +pub use plane_u8::{ + convolve_horizontal_plane_neon_row, convolve_horizontal_plane_neon_row_q, + convolve_horizontal_plane_neon_rows_4_u8, convolve_horizontal_plane_neon_rows_4_u8_q, +}; +#[cfg(feature = "rdm")] pub(crate) use plane_u8_rdm::{ convolve_horizontal_plane_neon_rdm_row, convolve_horizontal_plane_neon_rows_rdm_4_u8, }; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use rgb_f16::{ convolve_horizontal_rgb_neon_row_one_f16, convolve_horizontal_rgb_neon_rows_4_f16, }; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] +pub(crate) use rgb_f16_fhm::{ + convolve_horizontal_rgb_neon_row_one_f16_fhm, 
convolve_horizontal_rgb_neon_rows_4_f16_fhm, +}; +#[cfg(feature = "nightly_f16")] pub(crate) use rgb_f16_full::{ xconvolve_horizontal_rgb_neon_row_one_f16, xconvolve_horizontal_rgb_neon_rows_4_f16, }; @@ -107,16 +133,22 @@ pub(crate) use rgb_f32::{ convolve_horizontal_rgb_neon_row_one_f32, convolve_horizontal_rgb_neon_rows_4_f32, }; pub(crate) use rgb_u8::{ - convolve_horizontal_rgb_neon_row_one, convolve_horizontal_rgb_neon_rows_4, + convolve_horizontal_rgb_neon_row_one, convolve_horizontal_rgb_neon_row_one_q, + convolve_horizontal_rgb_neon_rows_4, convolve_horizontal_rgb_neon_rows_4_q, }; +#[cfg(feature = "rdm")] pub(crate) use rgb_u8_sqrdml::{ convolve_horizontal_rgb_neon_rdm_row_one, convolve_horizontal_rgb_neon_rdm_rows_4, }; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use rgba_f16::convolve_horizontal_rgba_neon_row_one_f16; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use rgba_f16::convolve_horizontal_rgba_neon_rows_4_f16; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] +pub(crate) use rgba_f16_fhm::{ + convolve_horizontal_rgba_neon_row_one_f16_fhm, convolve_horizontal_rgba_neon_rows_4_f16_fhm, +}; +#[cfg(feature = "nightly_f16")] pub(crate) use rgba_f16_full::{ xconvolve_horizontal_rgba_neon_row_one_f16, xconvolve_horizontal_rgba_neon_rows_4_f16, }; @@ -127,19 +159,30 @@ pub(crate) use rgba_u16_lb::{ convolve_horizontal_rgba_neon_rows_4_lb_u16, convolve_horizontal_rgba_neon_u16_lb_row, }; pub(crate) use rgba_u8::{ - convolve_horizontal_rgba_neon_row, convolve_horizontal_rgba_neon_row_i16, - convolve_horizontal_rgba_neon_rows_4_u8, convolve_horizontal_rgba_neon_rows_4_u8_i16, + convolve_horizontal_rgba_neon_row, convolve_horizontal_rgba_neon_row_q, + convolve_horizontal_rgba_neon_rows_4_u8, convolve_horizontal_rgba_neon_rows_4_u8_q, +}; +#[cfg(feature = "rdm")] +pub(crate) use rgba_u8_rdm::{ + convolve_horizontal_rgba_neon_row_i16, convolve_horizontal_rgba_neon_rows_4_u8_i16, }; -pub(crate) use vertical_ar30::neon_column_handler_fixed_point_ar30; -#[cfg(feature = "half")] +#[cfg(feature = "rdm")] +pub(crate) use vertical_ar30_rdm::neon_column_handler_fixed_point_ar30; +#[cfg(feature = "nightly_f16")] pub(crate) use vertical_f16::convolve_vertical_rgb_neon_row_f16; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] +pub(crate) use vertical_f16_fhm::convolve_vertical_rgb_neon_row_f16_fhm; +#[cfg(feature = "nightly_f16")] pub(crate) use vertical_f16_full::xconvolve_vertical_rgb_neon_row_f16; pub(crate) use vertical_f32::convolve_vertical_rgb_neon_row_f32; pub(crate) use vertical_u16::convolve_column_u16; pub(crate) use vertical_u16_lb::convolve_column_lb_u16; pub(crate) use vertical_u16_lb_f16::convolve_column_lb_u16_f16; pub(crate) use vertical_u8::{ - convolve_vertical_neon_i16_precision, convolve_vertical_neon_i32_precision, + convolve_vertical_neon_i32_precision, convolve_vertical_neon_i32_precision_d, }; +#[cfg(feature = "rdm")] +pub(crate) use vertical_u8_rdm::convolve_vertical_neon_i16_precision; pub(crate) use weights::convert_weights_to_f16; +#[cfg(feature = "nightly_f16")] +pub(crate) use weights::convert_weights_to_f16_fhm; diff --git a/src/neon/plane_u8.rs b/src/neon/plane_u8.rs index 9dcde44..ccccb3a 100644 --- a/src/neon/plane_u8.rs +++ b/src/neon/plane_u8.rs @@ -27,55 +27,102 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
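The reorganised module list gates the RDM and f16 kernels behind the rdm and nightly_f16 cargo features; a dispatcher still has to confirm at runtime that the CPU exposes the matching extension before calling into a #[target_feature] function. A minimal sketch of such a check follows; the helper names are illustrative, not part of the crate's API.

#[cfg(all(target_arch = "aarch64", feature = "rdm"))]
fn cpu_supports_rdm() -> bool {
    // SQRDMLAH/SQRDMLSH availability (the ARMv8.1 RDM extension).
    std::arch::is_aarch64_feature_detected!("rdm")
}

#[cfg(all(target_arch = "aarch64", feature = "nightly_f16"))]
fn cpu_supports_fhm() -> bool {
    // FMLAL/FMLAL2 availability (FEAT_FHM), used by the *_fhm kernels.
    std::arch::is_aarch64_feature_detected!("fhm")
}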
*/ use crate::filter_weights::FilterWeights; -use crate::neon::utils::xvld1q_s16_x2; -use crate::support::{PRECISION, ROUNDING_CONST}; +use crate::neon::utils::{vxmlal_high_s16, vxmlal_s16, xvld1q_s16_x2}; +use crate::support::PRECISION; use std::arch::aarch64::*; -macro_rules! accumulate_16_horiz { - ($store: expr, $ptr: expr, $weights: expr) => {{ - let pixel_colors = vld1q_u8($ptr); - let px_high_16 = vreinterpretq_s16_u16(vmovl_high_u8(pixel_colors)); - let px_low_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixel_colors))); - - $store = vmlal_high_s16($store, px_high_16, $weights.1); - $store = vmlal_s16($store, vget_low_s16(px_high_16), vget_low_s16($weights.1)); - - $store = vmlal_high_s16($store, px_low_16, $weights.0); - $store = vmlal_s16($store, vget_low_s16(px_low_16), vget_low_s16($weights.0)); - }}; +#[must_use] +#[inline(always)] +unsafe fn accumulate_16_horiz( + store: int32x4_t, + ptr: &[u8], + weights: int16x8x2_t, +) -> int32x4_t { + let pixel_colors = vld1q_u8(ptr.as_ptr()); + let px_high_16 = vreinterpretq_s16_u16(vmovl_high_u8(pixel_colors)); + let px_low_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixel_colors))); + + let mut store = vxmlal_high_s16::(store, px_high_16, weights.1); + store = vxmlal_s16::(store, vget_low_s16(px_high_16), vget_low_s16(weights.1)); + + store = vxmlal_high_s16::(store, px_low_16, weights.0); + store = vxmlal_s16::(store, vget_low_s16(px_low_16), vget_low_s16(weights.0)); + store } -macro_rules! accumulate_8_horiz { - ($store: expr, $ptr: expr, $weights: expr) => {{ - let pixel_colors = vld1_u8($ptr); - let px_16 = vreinterpretq_s16_u16(vmovl_u8(pixel_colors)); +#[must_use] +#[inline(always)] +unsafe fn accumulate_8_horiz( + store: int32x4_t, + ptr: &[u8], + weight: int16x8_t, +) -> int32x4_t { + let pixel_colors = vld1_u8(ptr.as_ptr()); + let px_16 = vreinterpretq_s16_u16(vmovl_u8(pixel_colors)); + + let mut store = vxmlal_high_s16::(store, px_16, weight); + store = vxmlal_s16::(store, vget_low_s16(px_16), vget_low_s16(weight)); + store +} - $store = vmlal_high_s16($store, px_16, $weights); - $store = vmlal_s16($store, vget_low_s16(px_16), vget_low_s16($weights)); - }}; +#[inline(always)] +unsafe fn accumulate_4_horiz( + store: int32x4_t, + ptr: &[u8], + weight: int16x4_t, +) -> int32x4_t { + let pixel_colors = vmovl_u8(vreinterpret_u8_u32(vld1_lane_u32::<0>( + ptr.as_ptr() as *const u32, + vdup_n_u32(0), + ))); + let px_16 = vreinterpret_s16_u16(vget_low_u16(pixel_colors)); + vxmlal_s16::(store, px_16, weight) } -macro_rules! accumulate_4_horiz { - ($store: expr, $ptr: expr, $weights: expr) => {{ - let pixel_colors = vmovl_u8(vreinterpret_u8_u32(vld1_lane_u32::<0>( - $ptr as *const u32, - vdup_n_u32(0), - ))); - let px_16 = vreinterpret_s16_u16(vget_low_u16(pixel_colors)); +#[inline(always)] +unsafe fn accumulate_1_horiz( + store: int32x4_t, + ptr: &[u8], + weight: int16x4_t, +) -> int32x4_t { + let pixel_colors = vmovl_u8(vld1_lane_u8::<0>(ptr.as_ptr(), vdup_n_u8(0))); + let px_16 = vreinterpret_s16_u16(vget_low_u16(pixel_colors)); + vxmlal_s16::(store, px_16, weight) +} - $store = vmlal_s16($store, px_16, $weights); - }}; +pub fn convolve_horizontal_plane_neon_rows_4_u8( + src: &[u8], + src_stride: usize, + dst: &mut [u8], + dst_stride: usize, + filter_weights: &FilterWeights, +) { + convolve_horizontal_plane_neon_rows_4_u8_impl::( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); } -macro_rules! 
accumulate_1_horiz { - ($store: expr, $ptr: expr, $weight: expr) => {{ - let pixel_colors = vld1_u16([$ptr.read_unaligned() as u16, 0u16, 0u16, 0u16].as_ptr()); - let px_16 = vreinterpret_s16_u16(pixel_colors); - $store = vmlal_s16($store, px_16, $weight); - }}; +pub fn convolve_horizontal_plane_neon_rows_4_u8_q( + src: &[u8], + src_stride: usize, + dst: &mut [u8], + dst_stride: usize, + filter_weights: &FilterWeights, +) { + convolve_horizontal_plane_neon_rows_4_u8_impl::( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); } -pub fn convolve_horizontal_plane_neon_rows_4_u8( +fn convolve_horizontal_plane_neon_rows_4_u8_impl( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -92,9 +139,11 @@ pub fn convolve_horizontal_plane_neon_rows_4_u8( let iter_row2 = row2_ref.iter_mut(); let iter_row3 = row3_ref.iter_mut(); + let rnd_const = (1 << (PRECISION - 1)) - 1; + let base_val = { let j = vdupq_n_s32(0); - vsetq_lane_s32::<0>(ROUNDING_CONST, j) + vsetq_lane_s32::<0>(rnd_const, j) }; for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 @@ -125,16 +174,16 @@ pub fn convolve_horizontal_plane_neon_rows_4_u8( let bounds_start = bounds.start + jx; let src_ptr = src0.get_unchecked(bounds_start..); - accumulate_16_horiz!(store0, src_ptr.as_ptr(), weights); + store0 = accumulate_16_horiz::(store0, src_ptr, weights); let src_ptr1 = src1.get_unchecked(bounds_start..); - accumulate_16_horiz!(store1, src_ptr1.as_ptr(), weights); + store1 = accumulate_16_horiz::(store1, src_ptr1, weights); let src_ptr2 = src2.get_unchecked(bounds_start..); - accumulate_16_horiz!(store2, src_ptr2.as_ptr(), weights); + store2 = accumulate_16_horiz::(store2, src_ptr2, weights); let src_ptr3 = src3.get_unchecked(bounds_start..); - accumulate_16_horiz!(store3, src_ptr3.as_ptr(), weights); + store3 = accumulate_16_horiz::(store3, src_ptr3, weights); jx += 16; } @@ -145,16 +194,16 @@ pub fn convolve_horizontal_plane_neon_rows_4_u8( let bounds_start = bounds.start + jx; let src_ptr = src0.get_unchecked(bounds_start..); - accumulate_8_horiz!(store0, src_ptr.as_ptr(), weights); + store0 = accumulate_8_horiz::(store0, src_ptr, weights); let src_ptr1 = src1.get_unchecked(bounds_start..); - accumulate_8_horiz!(store1, src_ptr1.as_ptr(), weights); + store1 = accumulate_8_horiz::(store1, src_ptr1, weights); let src_ptr2 = src2.get_unchecked(bounds_start..); - accumulate_8_horiz!(store2, src_ptr2.as_ptr(), weights); + store2 = accumulate_8_horiz::(store2, src_ptr2, weights); let src_ptr3 = src3.get_unchecked(bounds_start..); - accumulate_8_horiz!(store3, src_ptr3.as_ptr(), weights); + store3 = accumulate_8_horiz::(store3, src_ptr3, weights); jx += 8; } @@ -165,16 +214,16 @@ pub fn convolve_horizontal_plane_neon_rows_4_u8( let bounds_start = bounds.start + jx; let src_ptr = src0.get_unchecked(bounds_start..); - accumulate_4_horiz!(store0, src_ptr.as_ptr(), weights); + accumulate_4_horiz::(store0, src_ptr, weights); let src_ptr1 = src1.get_unchecked(bounds_start..); - accumulate_4_horiz!(store1, src_ptr1.as_ptr(), weights); + accumulate_4_horiz::(store1, src_ptr1, weights); let src_ptr2 = src2.get_unchecked(bounds_start..); - accumulate_4_horiz!(store2, src_ptr2.as_ptr(), weights); + accumulate_4_horiz::(store2, src_ptr2, weights); let src_ptr3 = src3.get_unchecked(bounds_start..); - accumulate_4_horiz!(store3, src_ptr3.as_ptr(), weights); + accumulate_4_horiz::(store3, src_ptr3, weights); jx += 4; } @@ -185,16 +234,16 @@ pub fn convolve_horizontal_plane_neon_rows_4_u8( let bounds_start = bounds.start + jx; 
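The macro-to-function rewrite above also threads a Q flag through the accumulate helpers and derives the rounding seed as (1 << (PRECISION - 1)) - 1 instead of using ROUNDING_CONST. Ignoring the Q distinction, the arithmetic behind one output sample can be sketched in scalar form; PRECISION stands for the crate's fixed-point weight precision, and 15 is assumed here purely for illustration.

const PRECISION: u32 = 15; // assumed value, see the note above

/// Scalar sketch of one horizontal plane output: a weighted sum of u8 taps
/// with fixed-point weights, rounded and narrowed back to u8.
fn convolve_plane_pixel(src: &[u8], start: usize, weights: &[i16]) -> u8 {
    // The SIMD code seeds one accumulator lane with this rounding constant.
    let mut acc: i32 = (1 << (PRECISION - 1)) - 1;
    for (k, &w) in weights.iter().enumerate() {
        acc += src[start + k] as i32 * w as i32;
    }
    // Shift out of fixed point and saturate, in the spirit of vqshrun/vqmovn.
    (acc >> PRECISION).clamp(0, u8::MAX as i32) as u8
}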
let src_ptr = src0.get_unchecked(bounds_start..); - accumulate_1_horiz!(store0, src_ptr.as_ptr(), weight); + accumulate_1_horiz::(store0, src_ptr, weight); let src_ptr1 = src1.get_unchecked(bounds_start..); - accumulate_1_horiz!(store1, src_ptr1.as_ptr(), weight); + accumulate_1_horiz::(store1, src_ptr1, weight); let src_ptr2 = src2.get_unchecked(bounds_start..); - accumulate_1_horiz!(store2, src_ptr2.as_ptr(), weight); + accumulate_1_horiz::(store2, src_ptr2, weight); let src_ptr3 = src3.get_unchecked(bounds_start..); - accumulate_1_horiz!(store3, src_ptr3.as_ptr(), weight); + accumulate_1_horiz::(store3, src_ptr3, weight); jx += 1; } @@ -226,11 +275,28 @@ pub fn convolve_horizontal_plane_neon_row( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, +) { + convolve_horizontal_plane_neon_row_impl::(src, dst, filter_weights); +} + +pub fn convolve_horizontal_plane_neon_row_q( + src: &[u8], + dst: &mut [u8], + filter_weights: &FilterWeights, +) { + convolve_horizontal_plane_neon_row_impl::(src, dst, filter_weights); +} + +fn convolve_horizontal_plane_neon_row_impl( + src: &[u8], + dst: &mut [u8], + filter_weights: &FilterWeights, ) { unsafe { + let rnd_const = (1 << (PRECISION - 1)) - 1; let base_val = { let j = vdupq_n_s32(0); - vsetq_lane_s32::<0>(ROUNDING_CONST, j) + vsetq_lane_s32::<0>(rnd_const, j) }; for ((dst, bounds), weights) in dst.iter_mut().zip(filter_weights.bounds.iter()).zip( @@ -248,8 +314,8 @@ pub fn convolve_horizontal_plane_neon_row( let weights = xvld1q_s16_x2(w_ptr.as_ptr()); let bounds_start = bounds.start + jx; - let src_ptr = src.get_unchecked(bounds_start..).as_ptr(); - accumulate_16_horiz!(store, src_ptr, weights); + let src_ptr = src.get_unchecked(bounds_start..); + store = accumulate_16_horiz::(store, src_ptr, weights); jx += 16; } @@ -259,8 +325,8 @@ pub fn convolve_horizontal_plane_neon_row( let weights = vld1q_s16(w_ptr.as_ptr()); let bounds_start = bounds.start + jx; - let src_ptr = src.get_unchecked(bounds_start..).as_ptr(); - accumulate_8_horiz!(store, src_ptr, weights); + let src_ptr = src.get_unchecked(bounds_start..); + store = accumulate_8_horiz::(store, src_ptr, weights); jx += 8; } @@ -270,8 +336,8 @@ pub fn convolve_horizontal_plane_neon_row( let weights = vld1_s16(w_ptr.as_ptr()); let bounds_start = bounds.start + jx; - let src_ptr = src.get_unchecked(bounds_start..).as_ptr(); - accumulate_4_horiz!(store, src_ptr, weights); + let src_ptr = src.get_unchecked(bounds_start..); + accumulate_4_horiz::(store, src_ptr, weights); jx += 4; } @@ -280,8 +346,8 @@ pub fn convolve_horizontal_plane_neon_row( let w_ptr = weights.get_unchecked(jx..(jx + 1)); let weight = vld1_lane_s16::<0>(w_ptr.as_ptr(), vdup_n_s16(0)); let bounds_start = bounds.start + jx; - let src_ptr = src.get_unchecked(bounds_start..).as_ptr(); - accumulate_1_horiz!(store, src_ptr, weight); + let src_ptr = src.get_unchecked(bounds_start..); + accumulate_1_horiz::(store, src_ptr, weight); jx += 1; } diff --git a/src/neon/rgb_f16.rs b/src/neon/rgb_f16.rs index 3078cbf..a2622c0 100644 --- a/src/neon/rgb_f16.rs +++ b/src/neon/rgb_f16.rs @@ -27,105 +27,115 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use std::arch::aarch64::*; - use crate::filter_weights::FilterWeights; use crate::neon::utils::{prefer_vfmaq_f32, prefer_vfmaq_lane_f32, prefer_vfmaq_laneq_f32}; use crate::neon::*; +use core::f16; +use std::arch::aarch64::*; -macro_rules! 
write_rgb_f16 { - ($store: expr, $dest_ptr: expr) => {{ - let cvt = xreinterpret_u16_f16(xvcvt_f16_f32($store)); - let l1 = vget_lane_u32::<0>(vreinterpret_u32_u16(cvt)); - let l3 = vget_lane_u16::<2>(cvt); - ($dest_ptr as *mut u32).write_unaligned(l1); - ($dest_ptr as *mut u16).add(2).write_unaligned(l3); - }}; +#[inline(always)] +unsafe fn write_rgb_f16(store: float32x4_t, dest_ptr: &mut [f16]) { + let cvt = xreinterpret_u16_f16(xvcvt_f16_f32(store)); + let l1 = vget_lane_u32::<0>(vreinterpret_u32_u16(cvt)); + let l3 = vget_lane_u16::<2>(cvt); + (dest_ptr.as_mut_ptr() as *mut u32).write_unaligned(l1); + (dest_ptr.as_mut_ptr() as *mut u16) + .add(2) + .write_unaligned(l3); } -macro_rules! conv_horiz_4_rgb_f16 { - ($start_x: expr, $src: expr, $weights: expr, $store: expr) => {{ - const COMPONENTS: usize = 3; - let src_ptr = $src.add($start_x * COMPONENTS); - - let rgb_pixel_s = xvldq_f16_x2(src_ptr); - let rgb_first_u = vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)); - let rgb_first = xreinterpret_f16_u16(rgb_first_u); - let rgb_second_u = vext_u16::<3>( - vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)), - vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)), - ); - let rgb_second = xreinterpret_f16_u16(rgb_second_u); - - let rgb_third_u = vext_u16::<2>( - vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)), - vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.1)), - ); - let rgb_third = xreinterpret_f16_u16(rgb_third_u); - - let rgb_fourth_u = vext_u16::<1>( - vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.1)), - vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s.1)), - ); - let rgb_fourth = xreinterpret_f16_u16(rgb_fourth_u); - - let acc = prefer_vfmaq_laneq_f32::<0>($store, xvcvt_f32_f16(rgb_first), $weights); - let acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(rgb_second), $weights); - let acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(rgb_third), $weights); - let acc = prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(rgb_fourth), $weights); - acc - }}; +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_4_rgb_f16( + start_x: usize, + src: &[f16], + w: float32x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 3; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + + let rgb_pixel_s = xvldq_f16(src_ptr as *const _); + let rgb_pixel_n = xvld_f16(src_ptr.add(8) as *const _); + + let rgb_first_u = vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s)); + let rgb_first = xreinterpret_f16_u16(rgb_first_u); + let rgb_second_u = vext_u16::<3>( + vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s)), + vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s)), + ); + let rgb_second = xreinterpret_f16_u16(rgb_second_u); + + let rgb_third_u = vext_u16::<2>( + vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s)), + xreinterpret_u16_f16(rgb_pixel_n), + ); + let rgb_third = xreinterpret_f16_u16(rgb_third_u); + + let rgb_fourth_u = vext_u16::<1>( + xreinterpret_u16_f16(rgb_pixel_n), + xreinterpret_u16_f16(rgb_pixel_n), + ); + let rgb_fourth = xreinterpret_f16_u16(rgb_fourth_u); + + let acc = prefer_vfmaq_laneq_f32::<0>(store, xvcvt_f32_f16(rgb_first), w); + let acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(rgb_second), w); + let acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(rgb_third), w); + prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(rgb_fourth), w) } -macro_rules! 
conv_horiz_2_rgb_f16 { - ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ - const COMPONENTS: usize = 3; - let src_ptr = $src.add($start_x * COMPONENTS); - - let rgb_pixel = xvld_f16(src_ptr); - let second_px = vreinterpret_u16_u32(vld1_lane_u32::<0>( - src_ptr.add(4) as *const u32, - vdup_n_u32(0), - )); - - let rgb_first_u = xreinterpret_u16_f16(rgb_pixel); - let rgb_first = xreinterpret_f16_u16(rgb_first_u); - let rgb_second_u = vext_u16::<3>(xreinterpret_u16_f16(rgb_pixel), second_px); - let rgb_second = xreinterpret_f16_u16(rgb_second_u); - - let acc = prefer_vfmaq_lane_f32::<0>($store, xvcvt_f32_f16(rgb_first), $set); - let acc = prefer_vfmaq_lane_f32::<1>(acc, xvcvt_f32_f16(rgb_second), $set); - acc - }}; +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_2_rgb_f16( + start_x: usize, + src: &[f16], + w: float32x2_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 3; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + + let rgb_pixel = xvld_f16(src_ptr); + let second_px = vreinterpret_u16_u32(vld1_lane_u32::<0>( + src_ptr.add(4) as *const u32, + vdup_n_u32(0), + )); + + let rgb_first_u = xreinterpret_u16_f16(rgb_pixel); + let rgb_first = xreinterpret_f16_u16(rgb_first_u); + let rgb_second_u = vext_u16::<3>(xreinterpret_u16_f16(rgb_pixel), second_px); + let rgb_second = xreinterpret_f16_u16(rgb_second_u); + + let acc = prefer_vfmaq_lane_f32::<0>(store, xvcvt_f32_f16(rgb_first), w); + prefer_vfmaq_lane_f32::<1>(acc, xvcvt_f32_f16(rgb_second), w) } -macro_rules! conv_horiz_1_rgb_f16 { - ($start_x: expr, $src: expr, $weight: expr, $store: expr) => {{ - const COMPONENTS: usize = 3; - let src_ptr = $src.add($start_x * COMPONENTS); +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_1_rgb_f16( + start_x: usize, + src: &[f16], + w: float32x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 3; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); - const ZEROS_F16: half::f16 = half::f16::from_bits(0); + let mut fq = vreinterpret_u16_u32(vld1_lane_u32::<0>(src_ptr as *const _, vdup_n_u32(0))); + fq = vld1_lane_u16::<2>(src_ptr.add(2) as *const _, fq); - let transient: [half::f16; 4] = [ - src_ptr.read_unaligned(), - src_ptr.add(1).read_unaligned(), - src_ptr.add(2).read_unaligned(), - ZEROS_F16, - ]; - let rgb_pixel = xvld_f16(transient.as_ptr()); + let rgb_pixel = xreinterpret_f16_u16(fq); - let acc = prefer_vfmaq_f32($store, xvcvt_f32_f16(rgb_pixel), $weight); - acc - }}; + prefer_vfmaq_f32(store, xvcvt_f32_f16(rgb_pixel), w) } pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f16( dst_width: usize, - src_width: usize, + _: usize, filter_weights: &FilterWeights, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, ) { unsafe { @@ -144,17 +154,17 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f16( let mut store_2 = zeros; let mut store_3 = zeros; - while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { + while jx + 4 < bounds.size { let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - store_0 = conv_horiz_4_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store_0); - let s_ptr1 = src.get_unchecked(src_stride..).as_ptr(); - store_1 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr1, read_weights, store_1); - let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); - store_2 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr2, read_weights, store_2); - let s_ptr 
= src.get_unchecked(src_stride * 3..).as_ptr(); - store_3 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr, read_weights, store_3); + store_0 = conv_horiz_4_rgb_f16(bounds_start, src, read_weights, store_0); + let s_ptr1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_4_rgb_f16(bounds_start, s_ptr1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_4_rgb_f16(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_4_rgb_f16(bounds_start, s_ptr, read_weights, store_3); jx += 4; } @@ -162,13 +172,13 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - store_0 = conv_horiz_2_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store_0); - let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); - store_1 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr_1, read_weights, store_1); - let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); - store_2 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr2, read_weights, store_2); - let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); - store_3 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr3, read_weights, store_3); + store_0 = conv_horiz_2_rgb_f16(bounds_start, src, read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_2_rgb_f16(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_2_rgb_f16(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_2_rgb_f16(bounds_start, s_ptr3, read_weights, store_3); jx += 2; } @@ -176,28 +186,28 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f16( let ptr = weights_ptr.add(jx + filter_offset); let bounds_start = bounds.start + jx; let weight0 = vld1q_dup_f32(ptr); - store_0 = conv_horiz_1_rgb_f16!(bounds_start, src.as_ptr(), weight0, store_0); - let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); - store_1 = conv_horiz_1_rgb_f16!(bounds_start, s_ptr_1, weight0, store_1); - let s_ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); - store_2 = conv_horiz_1_rgb_f16!(bounds_start, s_ptr_2, weight0, store_2); - let s_ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); - store_3 = conv_horiz_1_rgb_f16!(bounds_start, s_ptr_3, weight0, store_3); + store_0 = conv_horiz_1_rgb_f16(bounds_start, src, weight0, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_1_rgb_f16(bounds_start, s_ptr_1, weight0, store_1); + let s_ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_1_rgb_f16(bounds_start, s_ptr_2, weight0, store_2); + let s_ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_1_rgb_f16(bounds_start, s_ptr_3, weight0, store_3); jx += 1; } let px = x * CHANNELS; - let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); - write_rgb_f16!(store_0, dest_ptr); + let dest_ptr = dst.get_unchecked_mut(px..); + write_rgb_f16(store_0, dest_ptr); - let dest_ptr_1 = dst.get_unchecked_mut(px + dst_stride..).as_ptr(); - write_rgb_f16!(store_1, dest_ptr_1); + let dest_ptr_1 = dst.get_unchecked_mut(px + dst_stride..); + write_rgb_f16(store_1, dest_ptr_1); - let dest_ptr_2 = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); - write_rgb_f16!(store_2, dest_ptr_2); + let dest_ptr_2 = dst.get_unchecked_mut(px + dst_stride * 2..); + write_rgb_f16(store_2, 
dest_ptr_2); - let dest_ptr_3 = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); - write_rgb_f16!(store_3, dest_ptr_3); + let dest_ptr_3 = dst.get_unchecked_mut(px + dst_stride * 3..); + write_rgb_f16(store_3, dest_ptr_3); filter_offset += filter_weights.aligned_size; } @@ -206,10 +216,10 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f16( pub(crate) fn convolve_horizontal_rgb_neon_row_one_f16( dst_width: usize, - src_width: usize, + _: usize, filter_weights: &FilterWeights, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], ) { unsafe { const CHANNELS: usize = 3; @@ -221,11 +231,11 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one_f16( let mut jx = 0usize; let mut store = vdupq_n_f32(0f32); - while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { + while jx + 4 < bounds.size { let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - store = conv_horiz_4_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store); + store = conv_horiz_4_rgb_f16(bounds_start, src, read_weights, store); jx += 4; } @@ -233,7 +243,7 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - store = conv_horiz_2_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store); + store = conv_horiz_2_rgb_f16(bounds_start, src, read_weights, store); jx += 2; } @@ -241,13 +251,13 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one_f16( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); let bounds_start = bounds.start + jx; - store = conv_horiz_1_rgb_f16!(bounds_start, src.as_ptr(), weight0, store); + store = conv_horiz_1_rgb_f16(bounds_start, src, weight0, store); jx += 1; } let px = x * CHANNELS; - let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); - write_rgb_f16!(store, dest_ptr); + let dest_ptr = dst.get_unchecked_mut(px..); + write_rgb_f16(store, dest_ptr); filter_offset += filter_weights.aligned_size; } diff --git a/src/neon/rgb_f16_fhm.rs b/src/neon/rgb_f16_fhm.rs new file mode 100644 index 0000000..fc2fe0a --- /dev/null +++ b/src/neon/rgb_f16_fhm.rs @@ -0,0 +1,301 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +use crate::filter_weights::FilterWeights; +use crate::neon::*; +use core::f16; +use std::arch::aarch64::*; + +#[inline(always)] +unsafe fn write_rgb_f16(store: float32x4_t, dest_ptr: &mut [f16]) { + let cvt = xreinterpret_u16_f16(xvcvt_f16_f32(store)); + let l1 = vget_lane_u32::<0>(vreinterpret_u32_u16(cvt)); + let l3 = vget_lane_u16::<2>(cvt); + (dest_ptr.as_mut_ptr() as *mut u32).write_unaligned(l1); + (dest_ptr.as_mut_ptr() as *mut u16) + .add(2) + .write_unaligned(l3); +} + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_4_rgb_f16( + start_x: usize, + src: &[f16], + w: x_float16x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 3; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + + let rgb_pixel_s = xvldq_f16(src_ptr as *const _); + let rgb_pixel_n = xvld_f16(src_ptr.add(8) as *const _); + + let rgb_first_u = vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s)); + let rgb_first = xreinterpret_f16_u16(rgb_first_u); + let rgb_second_u = vext_u16::<3>( + vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s)), + vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s)), + ); + let rgb_second = xreinterpret_f16_u16(rgb_second_u); + + let rgb_third_u = vext_u16::<2>( + vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s)), + xreinterpret_u16_f16(rgb_pixel_n), + ); + let rgb_third = xreinterpret_f16_u16(rgb_third_u); + + let rgb_fourth_u = vext_u16::<1>( + xreinterpret_u16_f16(rgb_pixel_n), + xreinterpret_u16_f16(rgb_pixel_n), + ); + let rgb_fourth = xreinterpret_f16_u16(rgb_fourth_u); + + let f0 = xvcombine_f16(rgb_first, rgb_second); + let f1 = xvcombine_f16(rgb_third, rgb_fourth); + + let acc = xvfmlalq_lane_low_f16::<0>(store, f0, w); + let acc = xvfmlalq_lane_high_f16::<1>(acc, f0, w); + let acc = xvfmlalq_lane_low_f16::<2>(acc, f1, w); + xvfmlalq_lane_high_f16::<3>(acc, f1, w) +} + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_2_rgb_f16( + start_x: usize, + src: &[f16], + w: x_float16x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 3; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + + let rgb_pixel = xvld_f16(src_ptr); + let second_px = vreinterpret_u16_u32(vld1_lane_u32::<0>( + src_ptr.add(4) as *const u32, + vdup_n_u32(0), + )); + + let rgb_first_u = xreinterpret_u16_f16(rgb_pixel); + let rgb_first = xreinterpret_f16_u16(rgb_first_u); + let rgb_second_u = vext_u16::<3>(xreinterpret_u16_f16(rgb_pixel), second_px); + let rgb_second = xreinterpret_f16_u16(rgb_second_u); + + let f0 = xvcombine_f16(rgb_first, rgb_second); + + let acc = xvfmlalq_lane_low_f16::<0>(store, f0, w); + xvfmlalq_lane_high_f16::<1>(acc, f0, w) +} + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_1_rgb_f16( + start_x: usize, + src: &[f16], + w: x_float16x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 3; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + + let mut fq = vreinterpret_u16_u32(vld1_lane_u32::<0>(src_ptr as *const _, vdup_n_u32(0))); 
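Both the plain and the FHM variants of the RGB f16 horizontal kernels perform the same 4-tap step: deinterleave four packed RGB pixels, widen them, and accumulate each one against its weight lane. A scalar reference of that step, with f32 standing in for f16 so it compiles on stable Rust, looks like this:

/// Scalar reference for the 4-tap packed-RGB horizontal accumulation.
/// acc holds the running (r, g, b) sums for one output pixel.
fn conv_horiz_4_rgb_ref(src: &[f32], start_x: usize, w: [f32; 4], acc: [f32; 3]) -> [f32; 3] {
    let mut out = acc;
    for (k, &wk) in w.iter().enumerate() {
        let px = &src[(start_x + k) * 3..][..3]; // three components per pixel
        for (o, &p) in out.iter_mut().zip(px) {
            *o = p.mul_add(wk, *o);
        }
    }
    out
}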
+ fq = vld1_lane_u16::<2>(src_ptr.add(2) as *const _, fq); + + let rgb_pixel = xreinterpret_f16_u16(fq); + + xvfmlalq_lane_low_f16::<0>(store, xvcombine_f16(rgb_pixel, rgb_pixel), w) +} + +pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f16_fhm( + dst_width: usize, + w: usize, + filter_weights: &FilterWeights, + src: &[f16], + src_stride: usize, + dst: &mut [f16], + dst_stride: usize, +) { + unsafe { + convolve_horizontal_rgb_neon_rows_4_f16_impl( + dst_width, + w, + filter_weights, + src, + src_stride, + dst, + dst_stride, + ) + } +} + +#[target_feature(enable = "fhm")] +unsafe fn convolve_horizontal_rgb_neon_rows_4_f16_impl( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + src: &[f16], + src_stride: usize, + dst: &mut [f16], + dst_stride: usize, +) { + const CHANNELS: usize = 3; + let mut filter_offset = 0usize; + + let zeros = vdupq_n_f32(0.); + + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = zeros; + let mut store_1 = zeros; + let mut store_2 = zeros; + let mut store_3 = zeros; + + while jx + 4 < bounds.size { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = xvld_f16(ptr); + store_0 = conv_horiz_4_rgb_f16(bounds_start, src, read_weights, store_0); + let s_ptr1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_4_rgb_f16(bounds_start, s_ptr1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_4_rgb_f16(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_4_rgb_f16(bounds_start, s_ptr, read_weights, store_3); + jx += 4; + } + + while jx + 2 < bounds.size { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = + xreinterpret_f16_u16(vreinterpret_u16_u32(vld1_dup_u32(ptr as *const _))); + store_0 = conv_horiz_2_rgb_f16(bounds_start, src, read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_2_rgb_f16(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_2_rgb_f16(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_2_rgb_f16(bounds_start, s_ptr3, read_weights, store_3); + jx += 2; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let bounds_start = bounds.start + jx; + let weight0 = xreinterpret_f16_u16(vld1_dup_u16(ptr as *const _)); + store_0 = conv_horiz_1_rgb_f16(bounds_start, src, weight0, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_1_rgb_f16(bounds_start, s_ptr_1, weight0, store_1); + let s_ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_1_rgb_f16(bounds_start, s_ptr_2, weight0, store_2); + let s_ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_1_rgb_f16(bounds_start, s_ptr_3, weight0, store_3); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = dst.get_unchecked_mut(px..); + write_rgb_f16(store_0, dest_ptr); + + let dest_ptr_1 = dst.get_unchecked_mut(px + dst_stride..); + write_rgb_f16(store_1, dest_ptr_1); + + let dest_ptr_2 = dst.get_unchecked_mut(px + dst_stride * 2..); + write_rgb_f16(store_2, dest_ptr_2); + + let dest_ptr_3 = dst.get_unchecked_mut(px + dst_stride * 3..); + write_rgb_f16(store_3, 
dest_ptr_3); + + filter_offset += filter_weights.aligned_size; + } +} + +pub(crate) fn convolve_horizontal_rgb_neon_row_one_f16_fhm( + dst_width: usize, + w: usize, + filter_weights: &FilterWeights, + src: &[f16], + dst: &mut [f16], +) { + unsafe { convolve_horizontal_rgb_neon_row_one_f16_impl(dst_width, w, filter_weights, src, dst) } +} + +#[target_feature(enable = "fhm")] +unsafe fn convolve_horizontal_rgb_neon_row_one_f16_impl( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + src: &[f16], + dst: &mut [f16], +) { + const CHANNELS: usize = 3; + let weights_ptr = filter_weights.weights.as_ptr(); + let mut filter_offset = 0usize; + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store = vdupq_n_f32(0f32); + + while jx + 4 < bounds.size { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = xvld_f16(ptr); + store = conv_horiz_4_rgb_f16(bounds_start, src, read_weights, store); + jx += 4; + } + + while jx + 2 < bounds.size { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = + xreinterpret_f16_u16(vreinterpret_u16_u32(vld1_dup_u32(ptr as *const _))); + store = conv_horiz_2_rgb_f16(bounds_start, src, read_weights, store); + jx += 2; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = xreinterpret_f16_u16(vld1_dup_u16(ptr as *const _)); + let bounds_start = bounds.start + jx; + store = conv_horiz_1_rgb_f16(bounds_start, src, weight0, store); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = dst.get_unchecked_mut(px..); + write_rgb_f16(store, dest_ptr); + + filter_offset += filter_weights.aligned_size; + } +} diff --git a/src/neon/rgb_f16_full.rs b/src/neon/rgb_f16_full.rs index 031fd92..b30d07f 100644 --- a/src/neon/rgb_f16_full.rs +++ b/src/neon/rgb_f16_full.rs @@ -29,7 +29,7 @@ use std::arch::aarch64::*; -use half::f16; +use core::f16; use crate::filter_weights::FilterWeights; use crate::neon::*; @@ -132,9 +132,7 @@ unsafe fn conv_horiz_1_rgb_f16( rgb_pixel_u = vld1_lane_u16::<2>(src_ptr as *const _, rgb_pixel_u); let rgb_pixel = xreinterpret_f16_u16(rgb_pixel_u); - - let acc = xvfmla_f16(store, rgb_pixel, set); - acc + xvfmla_f16(store, rgb_pixel, set) } pub(crate) fn xconvolve_horizontal_rgb_neon_rows_4_f16( diff --git a/src/neon/rgb_u8.rs b/src/neon/rgb_u8.rs index 550191c..88ba854 100644 --- a/src/neon/rgb_u8.rs +++ b/src/neon/rgb_u8.rs @@ -28,13 +28,13 @@ */ use crate::filter_weights::FilterWeights; -use crate::neon::utils::load_3b_as_u16x4; -use crate::support::{PRECISION, ROUNDING_CONST}; +use crate::neon::utils::{load_3b_as_u16x4, vxmlal_high_lane_s16, vxmlal_lane_s16, vxmlal_s16}; +use crate::support::PRECISION; use std::arch::aarch64::*; #[must_use] #[inline(always)] -unsafe fn conv_horiz_rgba_4_u8( +unsafe fn conv_horiz_rgb_4_u8( start_x: usize, src: &[u8], weights: int16x4_t, @@ -55,15 +55,15 @@ unsafe fn conv_horiz_rgba_4_u8( let hi = vreinterpretq_s16_u16(vmovl_high_u8(rgb_pixel)); let lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgb_pixel))); - let acc = vmlal_high_lane_s16::<3>(store, hi, weights); - let acc = vmlal_lane_s16::<2>(acc, vget_low_s16(hi), weights); - let acc = vmlal_high_lane_s16::<1>(acc, lo, weights); - vmlal_lane_s16::<0>(acc, vget_low_s16(lo), weights) + let acc = vxmlal_high_lane_s16::(store, hi, weights); + let acc = vxmlal_lane_s16::(acc, vget_low_s16(hi), weights); + let acc = 
vxmlal_high_lane_s16::(acc, lo, weights); + vxmlal_lane_s16::(acc, vget_low_s16(lo), weights) } #[must_use] #[inline(always)] -unsafe fn conv_horiz_rgba_2_u8( +unsafe fn conv_horiz_rgba_2_u8( start_x: usize, src: &[u8], weights: int16x4_t, @@ -81,13 +81,13 @@ unsafe fn conv_horiz_rgba_2_u8( let wide = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(rgb_pixel))); - let acc = vmlal_high_lane_s16::<1>(store, wide, weights); - vmlal_lane_s16::<0>(acc, vget_low_s16(wide), weights) + let acc = vxmlal_high_lane_s16::(store, wide, weights); + vxmlal_lane_s16::(acc, vget_low_s16(wide), weights) } #[must_use] #[inline(always)] -unsafe fn conv_horiz_rgba_1_u8( +unsafe fn conv_horiz_rgba_1_u8( start_x: usize, src: &[u8], w0: int16x4_t, @@ -97,11 +97,11 @@ unsafe fn conv_horiz_rgba_1_u8( let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); let rgb_pixel = load_3b_as_u16x4(src_ptr.as_ptr()); let lo = vreinterpret_s16_u16(rgb_pixel); - vmlal_s16(store, lo, w0) + vxmlal_s16::(store, lo, w0) } #[inline(always)] -unsafe fn write_accumulator_u8(store: int32x4_t, dst: &mut [u8]) { +unsafe fn write_accumulator_u8(store: int32x4_t, dst: &mut [u8]) { let store_16 = vqshrun_n_s32::(store); let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); vst1_lane_u16::<0>( @@ -117,6 +117,38 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4( dst: &mut [u8], dst_stride: usize, filter_weights: &FilterWeights, +) { + convolve_horizontal_rgb_neon_rows_4_impl::( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); +} + +pub(crate) fn convolve_horizontal_rgb_neon_rows_4_q( + src: &[u8], + src_stride: usize, + dst: &mut [u8], + dst_stride: usize, + filter_weights: &FilterWeights, +) { + convolve_horizontal_rgb_neon_rows_4_impl::( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); +} + +fn convolve_horizontal_rgb_neon_rows_4_impl( + src: &[u8], + src_stride: usize, + dst: &mut [u8], + dst_stride: usize, + filter_weights: &FilterWeights, ) { unsafe { let shuf_table_1: [u8; 8] = [0, 1, 2, 255, 3, 4, 5, 255]; @@ -127,8 +159,10 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4( // (r0 g0 b0 r1) (g2 b2 r3 g3) (b3 r4 g4 b4) (r5 g5 b5 r6) + let rnd_const: i32 = (1 << (PRECISION - 1)) - 1; + const CHANNELS: usize = 3; - let init = vdupq_n_s32(ROUNDING_CONST); + let init = vdupq_n_s32(rnd_const); let (row0_ref, rest) = dst.split_at_mut(dst_stride); let (row1_ref, rest) = rest.split_at_mut(dst_stride); let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); @@ -164,10 +198,10 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4( let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 4)); let weights = vld1_s16(w_ptr.as_ptr()); - store_0 = conv_horiz_rgba_4_u8(bounds_start, src0, weights, store_0, shuffle); - store_1 = conv_horiz_rgba_4_u8(bounds_start, src1, weights, store_1, shuffle); - store_2 = conv_horiz_rgba_4_u8(bounds_start, src2, weights, store_2, shuffle); - store_3 = conv_horiz_rgba_4_u8(bounds_start, src3, weights, store_3, shuffle); + store_0 = conv_horiz_rgb_4_u8::(bounds_start, src0, weights, store_0, shuffle); + store_1 = conv_horiz_rgb_4_u8::(bounds_start, src1, weights, store_1, shuffle); + store_2 = conv_horiz_rgb_4_u8::(bounds_start, src2, weights, store_2, shuffle); + store_3 = conv_horiz_rgb_4_u8::(bounds_start, src3, weights, store_3, shuffle); jx += 4; } @@ -176,10 +210,10 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4( let bnds = bounds.start + jx; let mut v_weight = vld1_dup_s16(w_ptr.as_ptr()); v_weight = 
vld1_lane_s16::<1>(w_ptr.as_ptr().add(1), v_weight); - store_0 = conv_horiz_rgba_2_u8(bnds, src0, v_weight, store_0, shuffle_1); - store_1 = conv_horiz_rgba_2_u8(bnds, src1, v_weight, store_1, shuffle_1); - store_2 = conv_horiz_rgba_2_u8(bnds, src2, v_weight, store_2, shuffle_1); - store_3 = conv_horiz_rgba_2_u8(bnds, src3, v_weight, store_3, shuffle_1); + store_0 = conv_horiz_rgba_2_u8::(bnds, src0, v_weight, store_0, shuffle_1); + store_1 = conv_horiz_rgba_2_u8::(bnds, src1, v_weight, store_1, shuffle_1); + store_2 = conv_horiz_rgba_2_u8::(bnds, src2, v_weight, store_2, shuffle_1); + store_3 = conv_horiz_rgba_2_u8::(bnds, src3, v_weight, store_3, shuffle_1); jx += 2; } @@ -187,17 +221,17 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4( let w_ptr = weights.get_unchecked(jx..(jx + 1)); let bnds = bounds.start + jx; let weight0 = vld1_dup_s16(w_ptr.as_ptr()); - store_0 = conv_horiz_rgba_1_u8(bnds, src0, weight0, store_0); - store_1 = conv_horiz_rgba_1_u8(bnds, src1, weight0, store_1); - store_2 = conv_horiz_rgba_1_u8(bnds, src2, weight0, store_2); - store_3 = conv_horiz_rgba_1_u8(bnds, src3, weight0, store_3); + store_0 = conv_horiz_rgba_1_u8::(bnds, src0, weight0, store_0); + store_1 = conv_horiz_rgba_1_u8::(bnds, src1, weight0, store_1); + store_2 = conv_horiz_rgba_1_u8::(bnds, src2, weight0, store_2); + store_3 = conv_horiz_rgba_1_u8::(bnds, src3, weight0, store_3); jx += 1; } - write_accumulator_u8(store_0, chunk0); - write_accumulator_u8(store_1, chunk1); - write_accumulator_u8(store_2, chunk2); - write_accumulator_u8(store_3, chunk3); + write_accumulator_u8::(store_0, chunk0); + write_accumulator_u8::(store_1, chunk1); + write_accumulator_u8::(store_2, chunk2); + write_accumulator_u8::(store_3, chunk3); } } } @@ -206,6 +240,22 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, +) { + convolve_horizontal_rgb_neon_row_one_impl::(src, dst, filter_weights); +} + +pub(crate) fn convolve_horizontal_rgb_neon_row_one_q( + src: &[u8], + dst: &mut [u8], + filter_weights: &FilterWeights, +) { + convolve_horizontal_rgb_neon_row_one_impl::(src, dst, filter_weights); +} + +fn convolve_horizontal_rgb_neon_row_one_impl( + src: &[u8], + dst: &mut [u8], + filter_weights: &FilterWeights, ) { unsafe { const CHANNELS: usize = 3; @@ -216,6 +266,8 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one( let shuffle_2 = vld1_u8(shuf_table_2.as_ptr()); let shuffle = vcombine_u8(shuffle_1, shuffle_2); + let rnd_const: i32 = (1 << (PRECISION - 1)) - 1; + for ((dst, bounds), weights) in dst .chunks_exact_mut(CHANNELS) .zip(filter_weights.bounds.iter()) @@ -228,13 +280,13 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one( let bounds_size = bounds.size; let mut jx = 0usize; - let mut store = vdupq_n_s32(ROUNDING_CONST); + let mut store = vdupq_n_s32(rnd_const); while jx + 4 < bounds_size { let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 4)); let weights = vld1_s16(w_ptr.as_ptr()); - store = conv_horiz_rgba_4_u8(bounds_start, src, weights, store, shuffle); + store = conv_horiz_rgb_4_u8::(bounds_start, src, weights, store, shuffle); jx += 4; } @@ -242,7 +294,7 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one( let w_ptr = weights.get_unchecked(jx..(jx + 2)); let bounds_start = bounds.start + jx; let v_weight = vreinterpret_s16_s32(vld1_dup_s32(w_ptr.as_ptr() as *const _)); - store = conv_horiz_rgba_2_u8(bounds_start, src, v_weight, store, shuffle_1); + store = conv_horiz_rgba_2_u8::(bounds_start, src, 
v_weight, store, shuffle_1); jx += 2; } @@ -250,11 +302,11 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one( let w_ptr = weights.get_unchecked(jx..(jx + 1)); let weight0 = vld1_dup_s16(w_ptr.as_ptr()); let bnds = bounds.start + jx; - store = conv_horiz_rgba_1_u8(bnds, src, weight0, store); + store = conv_horiz_rgba_1_u8::(bnds, src, weight0, store); jx += 1; } - write_accumulator_u8(store, dst); + write_accumulator_u8::(store, dst); } } } diff --git a/src/neon/rgba_f16.rs b/src/neon/rgba_f16.rs index 3926a16..d69bf58 100644 --- a/src/neon/rgba_f16.rs +++ b/src/neon/rgba_f16.rs @@ -36,77 +36,90 @@ use crate::neon::{ xvcvt_f32_f16, xvget_high_f16, xvget_low_f16, xvld_f16, xvldq_f16, xvldq_f16_x2, xvldq_f16_x4, xvst_f16, }; +use core::f16; use std::arch::aarch64::*; -macro_rules! conv_horiz_rgba_8_f16 { - ($start_x: expr, $src: expr, $set1: expr, $set2: expr, $store: expr) => {{ - const COMPONENTS: usize = 4; - let src_ptr = $src.add($start_x * COMPONENTS); +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_8_f16( + start_x: usize, + src: &[f16], + set1: float32x4_t, + set2: float32x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); - let rgb_pixel = xvldq_f16_x4(src_ptr); + let rgb_pixel = xvldq_f16_x4(src_ptr); - let mut acc = - prefer_vfmaq_laneq_f32::<0>($store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.0)), $set1); - acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), $set1); - acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.1)), $set1); - acc = prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.1)), $set1); - acc = prefer_vfmaq_laneq_f32::<0>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.2)), $set2); - acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.2)), $set2); - acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.3)), $set2); - acc = prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.3)), $set2); - acc - }}; + let mut acc = + prefer_vfmaq_laneq_f32::<0>(store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.0)), set1); + acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), set1); + acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.1)), set1); + acc = prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.1)), set1); + acc = prefer_vfmaq_laneq_f32::<0>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.2)), set2); + acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.2)), set2); + acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.3)), set2); + prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.3)), set2) } -macro_rules! 
conv_horiz_rgba_4_f16 { - ($start_x: expr, $src: expr, $set1: expr, $store: expr) => {{ - const COMPONENTS: usize = 4; - let src_ptr = $src.add($start_x * COMPONENTS); +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_4_f16( + start_x: usize, + src: &[f16], + set1: float32x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); - let rgb_pixel = xvldq_f16_x2(src_ptr); + let rgb_pixel = xvldq_f16_x2(src_ptr); - let acc = - prefer_vfmaq_laneq_f32::<0>($store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.0)), $set1); - let acc = - prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), $set1); - let acc = - prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.1)), $set1); - let acc = - prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), $set1); - acc - }}; + let acc = prefer_vfmaq_laneq_f32::<0>(store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.0)), set1); + let acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), set1); + let acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.1)), set1); + prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), set1) } -macro_rules! conv_horiz_rgba_2_f32 { - ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ - const COMPONENTS: usize = 4; - let src_ptr = $src.add($start_x * COMPONENTS); +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_2_f32( + start_x: usize, + src: &[f16], + set: float32x2_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); - let rgb_pixel = xvldq_f16(src_ptr); + let rgb_pixel = xvldq_f16(src_ptr); - let mut acc = - prefer_vfmaq_lane_f32::<0>($store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel)), $set); - acc = prefer_vfmaq_lane_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel)), $set); - acc - }}; + let acc = prefer_vfmaq_lane_f32::<0>(store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel)), set); + prefer_vfmaq_lane_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel)), set) } -macro_rules! 
conv_horiz_rgba_1_f16 { - ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ - const COMPONENTS: usize = 4; - let src_ptr = $src.add($start_x * COMPONENTS); - let rgb_pixel = xvld_f16(src_ptr); - let acc = prefer_vfmaq_f32($store, xvcvt_f32_f16(rgb_pixel), $set); - acc - }}; +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_1_f16( + start_x: usize, + src: &[f16], + set: float32x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + let rgb_pixel = xvld_f16(src_ptr); + prefer_vfmaq_f32(store, xvcvt_f32_f16(rgb_pixel), set) } pub(crate) fn convolve_horizontal_rgba_neon_row_one_f16( dst_width: usize, _: usize, filter_weights: &FilterWeights, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], ) { unsafe { const CHANNELS: usize = 4; @@ -122,7 +135,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row_one_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - store = conv_horiz_rgba_4_f16!(bounds_start, src.as_ptr(), read_weights, store); + store = conv_horiz_rgba_4_f16(bounds_start, src, read_weights, store); jx += 4; } @@ -130,7 +143,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row_one_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - store = conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store); + store = conv_horiz_rgba_2_f32(bounds_start, src, read_weights, store); jx += 2; } @@ -138,7 +151,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row_one_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); - store = conv_horiz_rgba_1_f16!(bounds_start, src.as_ptr(), weight0, store); + store = conv_horiz_rgba_1_f16(bounds_start, src, weight0, store); jx += 1; } @@ -155,9 +168,9 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_f16( dst_width: usize, _: usize, filter_weights: &FilterWeights, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, ) { unsafe { @@ -178,36 +191,36 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_f16( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = xvld1q_f32_x2(ptr); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_8_f16!( + store_0 = conv_horiz_rgba_8_f16( bounds_start, - src.as_ptr(), + src, read_weights.0, read_weights.1, - store_0 + store_0, ); - let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); - store_1 = conv_horiz_rgba_8_f16!( + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_rgba_8_f16( bounds_start, s_ptr_1, read_weights.0, read_weights.1, - store_1 + store_1, ); - let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); - store_2 = conv_horiz_rgba_8_f16!( + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_rgba_8_f16( bounds_start, s_ptr2, read_weights.0, read_weights.1, - store_2 + store_2, ); - let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); - store_3 = conv_horiz_rgba_8_f16!( + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_rgba_8_f16( bounds_start, s_ptr3, read_weights.0, read_weights.1, - store_3 + store_3, ); jx += 8; } @@ -216,13 +229,13 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_f16( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); let 
bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_4_f16!(bounds_start, src.as_ptr(), read_weights, store_0); - let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); - store_1 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr_1, read_weights, store_1); - let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); - store_2 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr2, read_weights, store_2); - let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); - store_3 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr3, read_weights, store_3); + store_0 = conv_horiz_rgba_4_f16(bounds_start, src, read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_rgba_4_f16(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_rgba_4_f16(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_rgba_4_f16(bounds_start, s_ptr3, read_weights, store_3); jx += 4; } @@ -230,13 +243,13 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_f16( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store_0); - let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); - store_1 = conv_horiz_rgba_2_f32!(bounds_start, ptr_1, read_weights, store_1); - let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); - store_2 = conv_horiz_rgba_2_f32!(bounds_start, ptr_2, read_weights, store_2); - let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); - store_3 = conv_horiz_rgba_2_f32!(bounds_start, ptr_3, read_weights, store_3); + store_0 = conv_horiz_rgba_2_f32(bounds_start, src, read_weights, store_0); + let ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_rgba_2_f32(bounds_start, ptr_1, read_weights, store_1); + let ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_rgba_2_f32(bounds_start, ptr_2, read_weights, store_2); + let ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_rgba_2_f32(bounds_start, ptr_3, read_weights, store_3); jx += 2; } @@ -244,13 +257,13 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_f16( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_1_f16!(bounds_start, src.as_ptr(), weight0, store_0); - let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); - store_1 = conv_horiz_rgba_1_f16!(bounds_start, ptr_1, weight0, store_1); - let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); - store_2 = conv_horiz_rgba_1_f16!(bounds_start, ptr_2, weight0, store_2); - let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); - store_3 = conv_horiz_rgba_1_f16!(bounds_start, ptr_3, weight0, store_3); + store_0 = conv_horiz_rgba_1_f16(bounds_start, src, weight0, store_0); + let ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_rgba_1_f16(bounds_start, ptr_1, weight0, store_1); + let ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_rgba_1_f16(bounds_start, ptr_2, weight0, store_2); + let ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_rgba_1_f16(bounds_start, ptr_3, weight0, store_3); jx += 1; } diff --git a/src/neon/rgba_f16_fhm.rs b/src/neon/rgba_f16_fhm.rs new file mode 100644 index 0000000..33514e3 --- /dev/null +++ b/src/neon/rgba_f16_fhm.rs @@ -0,0 +1,294 @@ +/* + * Copyright (c) Radzivon Bartoshyk. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +use crate::filter_weights::FilterWeights; +use crate::neon::f16_utils::{ + xvcombine_f16, xvcvt_f16_f32, xvfmlalq_lane_high_f16, xvfmlalq_lane_low_f16, + xvfmlalq_laneq_high_f16, xvfmlalq_laneq_low_f16, +}; +use crate::neon::{ + x_float16x4_t, x_float16x8_t, xreinterpret_f16_u16, xvld_f16, xvldq_f16, xvldq_f16_x2, + xvldq_f16_x4, xvst_f16, +}; +use core::f16; +use std::arch::aarch64::*; + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_8_f16( + start_x: usize, + src: &[f16], + w: x_float16x8_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + + let rgb_pixel = xvldq_f16_x4(src_ptr); + + let mut acc = xvfmlalq_laneq_low_f16::<0>(store, rgb_pixel.0, w); + acc = xvfmlalq_laneq_high_f16::<1>(acc, rgb_pixel.0, w); + acc = xvfmlalq_laneq_low_f16::<2>(acc, rgb_pixel.1, w); + acc = xvfmlalq_laneq_high_f16::<3>(acc, rgb_pixel.1, w); + acc = xvfmlalq_laneq_low_f16::<4>(acc, rgb_pixel.2, w); + acc = xvfmlalq_laneq_high_f16::<5>(acc, rgb_pixel.2, w); + acc = xvfmlalq_laneq_low_f16::<6>(acc, rgb_pixel.3, w); + xvfmlalq_laneq_high_f16::<7>(acc, rgb_pixel.3, w) +} + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_4_f16( + start_x: usize, + src: &[f16], + set1: x_float16x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + + let rgb_pixel = xvldq_f16_x2(src_ptr); + + let acc = xvfmlalq_lane_low_f16::<0>(store, rgb_pixel.0, set1); + let acc = xvfmlalq_lane_high_f16::<1>(acc, rgb_pixel.0, set1); + let acc = xvfmlalq_lane_low_f16::<2>(acc, rgb_pixel.1, set1); + xvfmlalq_lane_high_f16::<3>(acc, rgb_pixel.0, set1) +} + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_2_f32( + start_x: usize, + src: &[f16], + set: x_float16x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + + 
let rgb_pixel = xvldq_f16(src_ptr); + + let acc = xvfmlalq_lane_low_f16::<0>(store, rgb_pixel, set); + xvfmlalq_lane_high_f16::<1>(acc, rgb_pixel, set) +} + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_1_f16( + start_x: usize, + src: &[f16], + set: x_float16x4_t, + store: float32x4_t, +) -> float32x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); + let rgb_pixel = xvld_f16(src_ptr); + xvfmlalq_lane_low_f16::<0>(store, xvcombine_f16(rgb_pixel, rgb_pixel), set) +} + +pub(crate) fn convolve_horizontal_rgba_neon_row_one_f16_fhm( + dst_width: usize, + w: usize, + filter_weights: &FilterWeights, + src: &[f16], + dst: &mut [f16], +) { + unsafe { + convolve_horizontal_rgba_neon_row_one_f16_impl(dst_width, w, filter_weights, src, dst) + } +} + +#[target_feature(enable = "fhm")] +unsafe fn convolve_horizontal_rgba_neon_row_one_f16_impl( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + src: &[f16], + dst: &mut [f16], +) { + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store = vdupq_n_f32(0f32); + + while jx + 4 < bounds.size { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = xvld_f16(ptr); + store = conv_horiz_rgba_4_f16(bounds_start, src, read_weights, store); + jx += 4; + } + + while jx + 2 < bounds.size { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = + xreinterpret_f16_u16(vreinterpret_u16_u32(vld1_dup_u32(ptr as *const _))); + store = conv_horiz_rgba_2_f32(bounds_start, src, read_weights, store); + jx += 2; + } + + while jx < bounds.size { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = xreinterpret_f16_u16(vld1_dup_u16(ptr as *const _)); + store = conv_horiz_rgba_1_f16(bounds_start, src, weight0, store); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); + xvst_f16(dest_ptr, xvcvt_f16_f32(store)); + + filter_offset += filter_weights.aligned_size; + } +} + +pub(crate) fn convolve_horizontal_rgba_neon_rows_4_f16_fhm( + dst_width: usize, + w: usize, + filter_weights: &FilterWeights, + src: &[f16], + src_stride: usize, + dst: &mut [f16], + dst_stride: usize, +) { + unsafe { + convolve_horizontal_rgba_neon_rows_4_f16_impl( + dst_width, + w, + filter_weights, + src, + src_stride, + dst, + dst_stride, + ) + } +} + +#[target_feature(enable = "fhm")] +unsafe fn convolve_horizontal_rgba_neon_rows_4_f16_impl( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + src: &[f16], + src_stride: usize, + dst: &mut [f16], + dst_stride: usize, +) { + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + let zeros = vdupq_n_f32(0f32); + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = zeros; + let mut store_1 = zeros; + let mut store_2 = zeros; + let mut store_3 = zeros; + + while jx + 8 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = xvldq_f16(ptr); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_rgba_8_f16(bounds_start, src, read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = 
conv_horiz_rgba_8_f16(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_rgba_8_f16(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_rgba_8_f16(bounds_start, s_ptr3, read_weights, store_3); + jx += 8; + } + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = xvld_f16(ptr); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_rgba_4_f16(bounds_start, src, read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_rgba_4_f16(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_rgba_4_f16(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_rgba_4_f16(bounds_start, s_ptr3, read_weights, store_3); + jx += 4; + } + + while jx + 2 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = + xreinterpret_f16_u16(vreinterpret_u16_u32(vld1_dup_u32(ptr as *const _))); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_rgba_2_f32(bounds_start, src, read_weights, store_0); + let ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_rgba_2_f32(bounds_start, ptr_1, read_weights, store_1); + let ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_rgba_2_f32(bounds_start, ptr_2, read_weights, store_2); + let ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_rgba_2_f32(bounds_start, ptr_3, read_weights, store_3); + jx += 2; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = xreinterpret_f16_u16(vld1_dup_u16(ptr as *const _)); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_rgba_1_f16(bounds_start, src, weight0, store_0); + let ptr_1 = src.get_unchecked(src_stride..); + store_1 = conv_horiz_rgba_1_f16(bounds_start, ptr_1, weight0, store_1); + let ptr_2 = src.get_unchecked(src_stride * 2..); + store_2 = conv_horiz_rgba_1_f16(bounds_start, ptr_2, weight0, store_2); + let ptr_3 = src.get_unchecked(src_stride * 3..); + store_3 = conv_horiz_rgba_1_f16(bounds_start, ptr_3, weight0, store_3); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); + xvst_f16(dest_ptr, xvcvt_f16_f32(store_0)); + + let dest_ptr = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); + xvst_f16(dest_ptr, xvcvt_f16_f32(store_1)); + + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); + xvst_f16(dest_ptr, xvcvt_f16_f32(store_2)); + + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); + xvst_f16(dest_ptr, xvcvt_f16_f32(store_3)); + + filter_offset += filter_weights.aligned_size; + } +} diff --git a/src/neon/rgba_f16_full.rs b/src/neon/rgba_f16_full.rs index 38fb45c..7f24625 100644 --- a/src/neon/rgba_f16_full.rs +++ b/src/neon/rgba_f16_full.rs @@ -36,7 +36,7 @@ use crate::neon::{ x_float16x4_t, x_float16x8_t, xvget_high_f16, xvget_low_f16, xvld_f16, xvldq_f16, xvldq_f16_x2, xvldq_f16_x4, xvst_f16, }; -use half::f16; +use core::f16; use std::arch::aarch64::*; #[must_use] @@ -117,8 +117,8 @@ pub(crate) fn xconvolve_horizontal_rgba_neon_row_one_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], ) { unsafe { 
xconvolve_horizontal_rgba_neon_row_one_f16_impl( @@ -136,8 +136,8 @@ unsafe fn xconvolve_horizontal_rgba_neon_row_one_f16_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], ) { const CHANNELS: usize = 4; let mut filter_offset = 0usize; @@ -185,9 +185,9 @@ pub(crate) fn xconvolve_horizontal_rgba_neon_rows_4_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: &[half::f16], + unsafe_source_ptr_0: &[f16], src_stride: usize, - unsafe_destination_ptr_0: &mut [half::f16], + unsafe_destination_ptr_0: &mut [f16], dst_stride: usize, ) { unsafe { @@ -208,9 +208,9 @@ unsafe fn xconvolve_horizontal_rgba_neon_rows_4_f16_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, ) { const CHANNELS: usize = 4; diff --git a/src/neon/rgba_u8.rs b/src/neon/rgba_u8.rs index 04aaa03..ce6b36e 100644 --- a/src/neon/rgba_u8.rs +++ b/src/neon/rgba_u8.rs @@ -29,15 +29,14 @@ use crate::filter_weights::FilterWeights; use crate::neon::utils::{ - expand8_high_to_14, expand8_to_14, load_4b_as_u16x4, load_4b_as_u8x8, xvld1q_u8_x2, + load_4b_as_u16x4, vxmlal_high_lane_s16, vxmlal_high_laneq_s16, vxmlal_lane_s16, + vxmlal_laneq_s16, vxmlal_s16, xvld1q_u8_x2, }; -use crate::support::PRECISION; -use crate::support::ROUNDING_CONST; use std::arch::aarch64::*; #[must_use] #[inline(always)] -unsafe fn conv_horiz_rgba_8_u8( +unsafe fn conv_horiz_rgba_8_u8( start_x: usize, src: &[u8], weights: int16x8_t, @@ -53,48 +52,21 @@ unsafe fn conv_horiz_rgba_8_u8( let hi1 = vreinterpretq_s16_u16(vmovl_high_u8(rgba_pixel.1)); let lo1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgba_pixel.1))); - let mut acc = vmlal_high_laneq_s16::<3>(store, hi0, weights); - acc = vmlal_laneq_s16::<2>(acc, vget_low_s16(hi0), weights); - acc = vmlal_high_laneq_s16::<1>(acc, lo0, weights); - acc = vmlal_laneq_s16::<0>(acc, vget_low_s16(lo0), weights); + let mut acc = vxmlal_high_laneq_s16::(store, hi0, weights); + acc = vxmlal_laneq_s16::(acc, vget_low_s16(hi0), weights); + acc = vxmlal_high_laneq_s16::(acc, lo0, weights); + acc = vxmlal_laneq_s16::(acc, vget_low_s16(lo0), weights); - acc = vmlal_high_laneq_s16::<7>(acc, hi1, weights); - acc = vmlal_laneq_s16::<6>(acc, vget_low_s16(hi1), weights); - acc = vmlal_high_laneq_s16::<5>(acc, lo1, weights); - acc = vmlal_laneq_s16::<4>(acc, vget_low_s16(lo1), weights); + acc = vxmlal_high_laneq_s16::(acc, hi1, weights); + acc = vxmlal_laneq_s16::(acc, vget_low_s16(hi1), weights); + acc = vxmlal_high_laneq_s16::(acc, lo1, weights); + acc = vxmlal_laneq_s16::(acc, vget_low_s16(lo1), weights); acc } #[must_use] #[inline(always)] -unsafe fn conv_horiz_rgba_8_u8_i16( - start_x: usize, - src: &[u8], - w0: int16x8_t, - w1: int16x8_t, - w2: int16x8_t, - w3: int16x8_t, - store: int16x8_t, -) -> int16x8_t { - const COMPONENTS: usize = 4; - let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - - let rgba_pixel = xvld1q_u8_x2(src_ptr.as_ptr()); - - let hi0 = expand8_high_to_14(rgba_pixel.0); - let lo0 = expand8_to_14(vget_low_u8(rgba_pixel.0)); - let hi1 = expand8_high_to_14(rgba_pixel.1); - let lo1 = expand8_to_14(vget_low_u8(rgba_pixel.1)); - - let mut p = vqrdmlahq_s16(store, lo0, w0); - p = vqrdmlahq_s16(p, hi0, w1); - p = vqrdmlahq_s16(p, lo1, w2); - vqrdmlahq_s16(p, hi1, w3) -} - -#[must_use] -#[inline(always)] -unsafe fn conv_horiz_rgba_2_u8( +unsafe fn 
conv_horiz_rgba_2_u8( start_x: usize, src: &[u8], weights: int16x4_t, @@ -106,30 +78,13 @@ unsafe fn conv_horiz_rgba_2_u8( let rgb_pixel = vld1_u8(src_ptr.as_ptr()); let wide = vreinterpretq_s16_u16(vmovl_u8(rgb_pixel)); - let acc = vmlal_high_lane_s16::<1>(store, wide, weights); - vmlal_lane_s16::<0>(acc, vget_low_s16(wide), weights) -} - -#[must_use] -#[inline(always)] -unsafe fn conv_horiz_rgba_2_u8_i16( - start_x: usize, - src: &[u8], - weights: int16x8_t, - store: int16x8_t, -) -> int16x8_t { - const COMPONENTS: usize = 4; - let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - - let rgb_pixel = vld1_u8(src_ptr.as_ptr()); - let wide = expand8_to_14(rgb_pixel); - - vqrdmlahq_s16(store, wide, weights) + let acc = vxmlal_high_lane_s16::(store, wide, weights); + vxmlal_lane_s16::(acc, vget_low_s16(wide), weights) } #[must_use] #[inline(always)] -unsafe fn conv_horiz_rgba_4_u8( +unsafe fn conv_horiz_rgba_4_u8( start_x: usize, src: &[u8], weights: int16x4_t, @@ -143,35 +98,15 @@ unsafe fn conv_horiz_rgba_4_u8( let hi = vreinterpretq_s16_u16(vmovl_high_u8(rgba_pixel)); let lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgba_pixel))); - let acc = vmlal_high_lane_s16::<3>(store, hi, weights); - let acc = vmlal_lane_s16::<2>(acc, vget_low_s16(hi), weights); - let acc = vmlal_high_lane_s16::<1>(acc, lo, weights); - vmlal_lane_s16::<0>(acc, vget_low_s16(lo), weights) -} - -#[inline(always)] -unsafe fn conv_horiz_rgba_4_u8_i16( - start_x: usize, - src: &[u8], - w0: int16x8_t, - w1: int16x8_t, - store: int16x8_t, -) -> int16x8_t { - const COMPONENTS: usize = 4; - let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - - let rgba_pixel = vld1q_u8(src_ptr.as_ptr()); - - let hi = expand8_high_to_14(rgba_pixel); - let lo = expand8_to_14(vget_low_u8(rgba_pixel)); - - let p = vqrdmlahq_s16(store, lo, w0); - vqrdmlahq_s16(p, hi, w1) + let acc = vxmlal_high_lane_s16::(store, hi, weights); + let acc = vxmlal_lane_s16::(acc, vget_low_s16(hi), weights); + let acc = vxmlal_high_lane_s16::(acc, lo, weights); + vxmlal_lane_s16::(acc, vget_low_s16(lo), weights) } #[must_use] #[inline(always)] -unsafe fn conv_horiz_rgba_1_u8( +unsafe fn conv_horiz_rgba_1_u8( start_x: usize, src: &[u8], w0: int16x4_t, @@ -181,225 +116,42 @@ unsafe fn conv_horiz_rgba_1_u8( let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); let rgba_pixel = load_4b_as_u16x4(src_ptr.as_ptr()); let lo = vreinterpret_s16_u16(rgba_pixel); - vmlal_s16(store, lo, w0) -} - -#[must_use] -#[inline(always)] -unsafe fn conv_horiz_rgba_1_u8_i16( - start_x: usize, - src: &[u8], - w0: int16x4_t, - store: int16x4_t, -) -> int16x4_t { - const COMPONENTS: usize = 4; - let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - let rgba_pixel = expand8_to_14(load_4b_as_u8x8(src_ptr.as_ptr())); - vqrdmlah_s16(store, vget_low_s16(rgba_pixel), w0) + vxmlal_s16::(store, lo, w0) } -/// Checking NEON `rdm` availability is required before a call. -/// -/// RDM feature has slightly lower precision and won't work really well on huge kernel which -/// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. -/// -/// # Safety -/// - Check `rdm` availability before the call. 
-pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8_i16( +pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8( src: &[u8], src_stride: usize, dst: &mut [u8], dst_stride: usize, filter_weights: &FilterWeights, ) { - unsafe { - convolve_horizontal_rgba_neon_rows_4_u8_i16_impl( - src, - src_stride, - dst, - dst_stride, - filter_weights, - ); - } + convolve_horizontal_rgba_neon_rows_4_u8_impl::( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); } -/// Slightly lower precision scale option -/// -/// # Safety -/// - Check `rdm` availability before the call. -#[target_feature(enable = "rdm")] -unsafe fn convolve_horizontal_rgba_neon_rows_4_u8_i16_impl( +pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8_q( src: &[u8], src_stride: usize, dst: &mut [u8], dst_stride: usize, filter_weights: &FilterWeights, ) { - const CHANNELS: usize = 4; - const SCALE: i32 = 6; - const ROUNDING: i16 = 1 << (SCALE - 1); - - let weights_distribute: [u8; 16] = [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]; - let v_w_distribute0 = vld1q_u8(weights_distribute.as_ptr()); - let weights_distribute1: [u8; 16] = [4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7]; - let v_w_distribute1 = vld1q_u8(weights_distribute1.as_ptr()); - let weights_distribute2: [u8; 16] = [8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11]; - let v_w_distribute2 = vld1q_u8(weights_distribute2.as_ptr()); - let weights_distribute3: [u8; 16] = - [12, 13, 14, 15, 2, 13, 14, 15, 2, 13, 14, 15, 2, 13, 14, 15]; - let v_w_distribute3 = vld1q_u8(weights_distribute3.as_ptr()); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - let initial_val = vcombine_s16(vdup_n_s16(ROUNDING), vdup_n_s16(0)); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - - let bounds_size = bounds.size; - - let mut store_0 = initial_val; - let mut store_1 = initial_val; - let mut store_2 = initial_val; - let mut store_3 = initial_val; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); - - while jx + 8 < bounds_size { - let bounds_start = bounds.start + jx; - let w_ptr = weights.get_unchecked(jx..(jx + 8)); - let weights_set = vld1q_s16(w_ptr.as_ptr()); - - let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(weights_set), - v_w_distribute0, - )); - let w1 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(weights_set), - v_w_distribute1, - )); - let w2 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(weights_set), - v_w_distribute2, - )); - let w3 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(weights_set), - v_w_distribute3, - )); - - store_0 = - conv_horiz_rgba_8_u8_i16::(bounds_start, src0, w0, w1, w2, w3, store_0); - store_1 = - conv_horiz_rgba_8_u8_i16::(bounds_start, src1, w0, w1, w2, w3, store_1); - store_2 = - conv_horiz_rgba_8_u8_i16::(bounds_start, src2, w0, w1, w2, w3, store_2); - store_3 = - 
conv_horiz_rgba_8_u8_i16::(bounds_start, src3, w0, w1, w2, w3, store_3); - jx += 8; - } - - while jx + 4 < bounds_size { - let bounds_start = bounds.start + jx; - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = vld1_s16(w_ptr.as_ptr()); - - let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(vcombine_s16(weights, vdup_n_s16(0))), - v_w_distribute0, - )); - let w1 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(vcombine_s16(weights, vdup_n_s16(0))), - v_w_distribute1, - )); - - store_0 = conv_horiz_rgba_4_u8_i16::(bounds_start, src0, w0, w1, store_0); - store_1 = conv_horiz_rgba_4_u8_i16::(bounds_start, src1, w0, w1, store_1); - store_2 = conv_horiz_rgba_4_u8_i16::(bounds_start, src2, w0, w1, store_2); - store_3 = conv_horiz_rgba_4_u8_i16::(bounds_start, src3, w0, w1, store_3); - jx += 4; - } - - while jx + 2 < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - let v_weight = vreinterpret_s16_s32(vld1_dup_s32(w_ptr.as_ptr() as *const i32)); - let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(vcombine_s16(v_weight, vdup_n_s16(0))), - v_w_distribute0, - )); - store_0 = conv_horiz_rgba_2_u8_i16::(bounds_start, src0, w0, store_0); - store_1 = conv_horiz_rgba_2_u8_i16::(bounds_start, src1, w0, store_1); - store_2 = conv_horiz_rgba_2_u8_i16::(bounds_start, src2, w0, store_2); - store_3 = conv_horiz_rgba_2_u8_i16::(bounds_start, src3, w0, store_3); - jx += 2; - } - - let mut store_0 = vadd_s16(vget_low_s16(store_0), vget_high_s16(store_0)); - let mut store_1 = vadd_s16(vget_low_s16(store_1), vget_high_s16(store_1)); - let mut store_2 = vadd_s16(vget_low_s16(store_2), vget_high_s16(store_2)); - let mut store_3 = vadd_s16(vget_low_s16(store_3), vget_high_s16(store_3)); - - while jx < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let bounds_start = bounds.start + jx; - let weight0 = vld1_dup_s16(w_ptr.as_ptr()); - store_0 = conv_horiz_rgba_1_u8_i16::(bounds_start, src0, weight0, store_0); - store_1 = conv_horiz_rgba_1_u8_i16::(bounds_start, src1, weight0, store_1); - store_2 = conv_horiz_rgba_1_u8_i16::(bounds_start, src2, weight0, store_2); - store_3 = conv_horiz_rgba_1_u8_i16::(bounds_start, src3, weight0, store_3); - jx += 1; - } - - let store_16_0 = vshr_n_s16::(store_0); - let store_16_1 = vshr_n_s16::(store_1); - let store_16_2 = vshr_n_s16::(store_2); - let store_16_3 = vshr_n_s16::(store_3); - - let store_16_8_0 = vqmovun_s16(vcombine_s16(store_16_0, store_16_0)); - let store_16_8_1 = vqmovun_s16(vcombine_s16(store_16_1, store_16_1)); - let store_16_8_2 = vqmovun_s16(vcombine_s16(store_16_2, store_16_2)); - let store_16_8 = vqmovun_s16(vcombine_s16(store_16_3, store_16_3)); - - vst1_lane_u32::<0>( - chunk0.as_mut_ptr() as *mut u32, - vreinterpret_u32_u8(store_16_8_0), - ); - vst1_lane_u32::<0>( - chunk1.as_mut_ptr() as *mut u32, - vreinterpret_u32_u8(store_16_8_1), - ); - vst1_lane_u32::<0>( - chunk2.as_mut_ptr() as *mut u32, - vreinterpret_u32_u8(store_16_8_2), - ); - vst1_lane_u32::<0>( - chunk3.as_mut_ptr() as *mut u32, - vreinterpret_u32_u8(store_16_8), - ); - } + convolve_horizontal_rgba_neon_rows_4_u8_impl::( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); } -pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8( +fn convolve_horizontal_rgba_neon_rows_4_u8_impl( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -408,7 +160,8 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8( ) { unsafe { const CHANNELS: usize = 4; - let init = 
vdupq_n_s32(ROUNDING_CONST); + let rnd_const: i32 = (1 << (PRECISION - 1)) - 1; + let init = vdupq_n_s32(rnd_const); let (row0_ref, rest) = dst.split_at_mut(dst_stride); let (row1_ref, rest) = rest.split_at_mut(dst_stride); @@ -447,10 +200,10 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8( let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 8)); let weights_set = vld1q_s16(w_ptr.as_ptr()); - store_0 = conv_horiz_rgba_8_u8(bounds_start, src0, weights_set, store_0); - store_1 = conv_horiz_rgba_8_u8(bounds_start, src1, weights_set, store_1); - store_2 = conv_horiz_rgba_8_u8(bounds_start, src2, weights_set, store_2); - store_3 = conv_horiz_rgba_8_u8(bounds_start, src3, weights_set, store_3); + store_0 = conv_horiz_rgba_8_u8::(bounds_start, src0, weights_set, store_0); + store_1 = conv_horiz_rgba_8_u8::(bounds_start, src1, weights_set, store_1); + store_2 = conv_horiz_rgba_8_u8::(bounds_start, src2, weights_set, store_2); + store_3 = conv_horiz_rgba_8_u8::(bounds_start, src3, weights_set, store_3); jx += 8; } @@ -458,10 +211,10 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8( let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 4)); let weights = vld1_s16(w_ptr.as_ptr()); - store_0 = conv_horiz_rgba_4_u8(bounds_start, src0, weights, store_0); - store_1 = conv_horiz_rgba_4_u8(bounds_start, src1, weights, store_1); - store_2 = conv_horiz_rgba_4_u8(bounds_start, src2, weights, store_2); - store_3 = conv_horiz_rgba_4_u8(bounds_start, src3, weights, store_3); + store_0 = conv_horiz_rgba_4_u8::(bounds_start, src0, weights, store_0); + store_1 = conv_horiz_rgba_4_u8::(bounds_start, src1, weights, store_1); + store_2 = conv_horiz_rgba_4_u8::(bounds_start, src2, weights, store_2); + store_3 = conv_horiz_rgba_4_u8::(bounds_start, src3, weights, store_3); jx += 4; } @@ -470,10 +223,10 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8( let bounds_start = bounds.start + jx; let mut v_weight = vld1_dup_s16(w_ptr.as_ptr()); v_weight = vld1_lane_s16::<1>(w_ptr.as_ptr().add(1), v_weight); - store_0 = conv_horiz_rgba_2_u8(bounds_start, src0, v_weight, store_0); - store_1 = conv_horiz_rgba_2_u8(bounds_start, src1, v_weight, store_1); - store_2 = conv_horiz_rgba_2_u8(bounds_start, src2, v_weight, store_2); - store_3 = conv_horiz_rgba_2_u8(bounds_start, src3, v_weight, store_3); + store_0 = conv_horiz_rgba_2_u8::(bounds_start, src0, v_weight, store_0); + store_1 = conv_horiz_rgba_2_u8::(bounds_start, src1, v_weight, store_1); + store_2 = conv_horiz_rgba_2_u8::(bounds_start, src2, v_weight, store_2); + store_3 = conv_horiz_rgba_2_u8::(bounds_start, src3, v_weight, store_3); jx += 2; } @@ -481,10 +234,10 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8( let w_ptr = weights.get_unchecked(jx..(jx + 1)); let bounds_start = bounds.start + jx; let weight0 = vld1_dup_s16(w_ptr.as_ptr()); - store_0 = conv_horiz_rgba_1_u8(bounds_start, src0, weight0, store_0); - store_1 = conv_horiz_rgba_1_u8(bounds_start, src1, weight0, store_1); - store_2 = conv_horiz_rgba_1_u8(bounds_start, src2, weight0, store_2); - store_3 = conv_horiz_rgba_1_u8(bounds_start, src3, weight0, store_3); + store_0 = conv_horiz_rgba_1_u8::(bounds_start, src0, weight0, store_0); + store_1 = conv_horiz_rgba_1_u8::(bounds_start, src1, weight0, store_1); + store_2 = conv_horiz_rgba_1_u8::(bounds_start, src2, weight0, store_2); + store_3 = conv_horiz_rgba_1_u8::(bounds_start, src3, weight0, store_3); jx += 1; } @@ -522,9 +275,26 @@ pub(crate) fn 
convolve_horizontal_rgba_neon_row( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, +) { + convolve_horizontal_rgba_neon_row_impl::(src, dst, filter_weights); +} + +pub(crate) fn convolve_horizontal_rgba_neon_row_q( + src: &[u8], + dst: &mut [u8], + filter_weights: &FilterWeights, +) { + convolve_horizontal_rgba_neon_row_impl::(src, dst, filter_weights); +} + +fn convolve_horizontal_rgba_neon_row_impl( + src: &[u8], + dst: &mut [u8], + filter_weights: &FilterWeights, ) { unsafe { const CHANNELS: usize = 4; + let rnd_const: i32 = (1 << (PRECISION - 1)) - 1; for ((dst, bounds), weights) in dst .chunks_exact_mut(CHANNELS) @@ -537,13 +307,13 @@ pub(crate) fn convolve_horizontal_rgba_neon_row( { let bounds_size = bounds.size; let mut jx = 0usize; - let mut store = vdupq_n_s32(ROUNDING_CONST); + let mut store = vdupq_n_s32(rnd_const); while jx + 8 < bounds_size { let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 8)); let weights_set = vld1q_s16(w_ptr.as_ptr()); - store = conv_horiz_rgba_8_u8(bounds_start, src, weights_set, store); + store = conv_horiz_rgba_8_u8::(bounds_start, src, weights_set, store); jx += 8; } @@ -551,7 +321,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row( let w_ptr = weights.get_unchecked(jx..(jx + 4)); let weights = vld1_s16(w_ptr.as_ptr()); let bounds_start = bounds.start + jx; - store = conv_horiz_rgba_4_u8(bounds_start, src, weights, store); + store = conv_horiz_rgba_4_u8::(bounds_start, src, weights, store); jx += 4; } @@ -560,7 +330,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row( let bounds_start = bounds.start + jx; let mut v_weight = vld1_dup_s16(w_ptr.as_ptr()); v_weight = vld1_lane_s16::<1>(w_ptr.as_ptr().add(1), v_weight); - store = conv_horiz_rgba_2_u8(bounds_start, src, v_weight, store); + store = conv_horiz_rgba_2_u8::(bounds_start, src, v_weight, store); jx += 2; } @@ -568,7 +338,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row( let w_ptr = weights.get_unchecked(jx..(jx + 1)); let weight0 = vld1_dup_s16(w_ptr.as_ptr()); let bounds_start = bounds.start + jx; - store = conv_horiz_rgba_1_u8(bounds_start, src, weight0, store); + store = conv_horiz_rgba_1_u8::(bounds_start, src, weight0, store); jx += 1; } @@ -582,132 +352,3 @@ pub(crate) fn convolve_horizontal_rgba_neon_row( } } } - -/// Checking NEON `rdm` availability is required before a call. -/// -/// RDM feature has slightly lower precision and won't work really well on huge kernel which -/// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. -/// -/// # Safety -/// - Check `rdm` availability before the call. 
-pub(crate) fn convolve_horizontal_rgba_neon_row_i16( - src: &[u8], - dst: &mut [u8], - filter_weights: &FilterWeights, -) { - unsafe { - convolve_horizontal_rgba_neon_row_i16_impl(src, dst, filter_weights); - } -} - -#[target_feature(enable = "rdm")] -unsafe fn convolve_horizontal_rgba_neon_row_i16_impl( - src: &[u8], - dst: &mut [u8], - filter_weights: &FilterWeights, -) { - const SCALE: i32 = 6; - const ROUNDING: i16 = 1 << (SCALE - 1); - const CHANNELS: usize = 4; - - let weights_distribute: [u8; 16] = [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]; - let v_w_distribute0 = vld1q_u8(weights_distribute.as_ptr()); - let weights_distribute1: [u8; 16] = [4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7]; - let v_w_distribute1 = vld1q_u8(weights_distribute1.as_ptr()); - let weights_distribute2: [u8; 16] = [8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11]; - let v_w_distribute2 = vld1q_u8(weights_distribute2.as_ptr()); - let weights_distribute3: [u8; 16] = - [12, 13, 14, 15, 2, 13, 14, 15, 2, 13, 14, 15, 2, 13, 14, 15]; - let v_w_distribute3 = vld1q_u8(weights_distribute3.as_ptr()); - - let initial_val = vcombine_s16(vdup_n_s16(ROUNDING), vdup_n_s16(0)); - - for ((dst, bounds), weights) in dst - .chunks_exact_mut(CHANNELS) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let bounds_size = bounds.size; - let mut jx = 0usize; - let mut store = initial_val; - - while jx + 8 < bounds_size { - let bounds_start = bounds.start + jx; - let w_ptr = weights.get_unchecked(jx..(jx + 8)); - let weights_set = vld1q_s16(w_ptr.as_ptr()); - - let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(weights_set), - v_w_distribute0, - )); - let w1 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(weights_set), - v_w_distribute1, - )); - let w2 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(weights_set), - v_w_distribute2, - )); - let w3 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(weights_set), - v_w_distribute3, - )); - - store = conv_horiz_rgba_8_u8_i16::(bounds_start, src, w0, w1, w2, w3, store); - jx += 8; - } - - while jx + 4 < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = vld1_s16(w_ptr.as_ptr()); - let bounds_start = bounds.start + jx; - - let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(vcombine_s16(weights, vdup_n_s16(0))), - v_w_distribute0, - )); - let w1 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(vcombine_s16(weights, vdup_n_s16(0))), - v_w_distribute1, - )); - - store = conv_horiz_rgba_4_u8_i16::(bounds_start, src, w0, w1, store); - jx += 4; - } - - while jx + 2 < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - let v_weight = vreinterpret_s16_s32(vld1_dup_s32(w_ptr.as_ptr() as *const i32)); - let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( - vreinterpretq_u8_s16(vcombine_s16(v_weight, vdup_n_s16(0))), - v_w_distribute0, - )); - store = conv_horiz_rgba_2_u8_i16::(bounds_start, src, w0, store); - jx += 2; - } - - let mut store = vadd_s16(vget_low_s16(store), vget_high_s16(store)); - - while jx < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let weight0 = vld1_dup_s16(w_ptr.as_ptr()); - let bounds_start = bounds.start + jx; - store = conv_horiz_rgba_1_u8_i16::(bounds_start, src, weight0, store); - jx += 1; - } - - let store_16 = vshr_n_s16::(store); - - let store_16_8 = vqmovun_s16(vcombine_s16(store_16, store_16)); - - vst1_lane_u32::<0>( 
- dst.as_mut_ptr() as *mut u32, - vreinterpret_u32_u8(store_16_8), - ); - } -} diff --git a/src/neon/rgba_u8_rdm.rs b/src/neon/rgba_u8_rdm.rs new file mode 100644 index 0000000..591dc6d --- /dev/null +++ b/src/neon/rgba_u8_rdm.rs @@ -0,0 +1,440 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +use crate::filter_weights::FilterWeights; +use crate::neon::utils::{expand8_high_to_14, expand8_to_14, load_4b_as_u8x8, xvld1q_u8_x2}; +use std::arch::aarch64::*; + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_8_u8_i16( + start_x: usize, + src: &[u8], + w0: int16x8_t, + w1: int16x8_t, + w2: int16x8_t, + w3: int16x8_t, + store: int16x8_t, +) -> int16x8_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); + + let rgba_pixel = xvld1q_u8_x2(src_ptr.as_ptr()); + + let hi0 = expand8_high_to_14(rgba_pixel.0); + let lo0 = expand8_to_14(vget_low_u8(rgba_pixel.0)); + let hi1 = expand8_high_to_14(rgba_pixel.1); + let lo1 = expand8_to_14(vget_low_u8(rgba_pixel.1)); + + let mut p = vqrdmlahq_s16(store, lo0, w0); + p = vqrdmlahq_s16(p, hi0, w1); + p = vqrdmlahq_s16(p, lo1, w2); + vqrdmlahq_s16(p, hi1, w3) +} + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_2_u8_i16( + start_x: usize, + src: &[u8], + weights: int16x8_t, + store: int16x8_t, +) -> int16x8_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); + + let rgb_pixel = vld1_u8(src_ptr.as_ptr()); + let wide = expand8_to_14(rgb_pixel); + + vqrdmlahq_s16(store, wide, weights) +} + +#[inline(always)] +unsafe fn conv_horiz_rgba_4_u8_i16( + start_x: usize, + src: &[u8], + w0: int16x8_t, + w1: int16x8_t, + store: int16x8_t, +) -> int16x8_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); + + let rgba_pixel = vld1q_u8(src_ptr.as_ptr()); + + let hi = expand8_high_to_14(rgba_pixel); + let lo = expand8_to_14(vget_low_u8(rgba_pixel)); + + let p = vqrdmlahq_s16(store, lo, w0); + vqrdmlahq_s16(p, hi, w1) +} + +#[must_use] +#[inline(always)] +unsafe fn conv_horiz_rgba_1_u8_i16( + start_x: usize, + src: &[u8], + w0: int16x4_t, + store: int16x4_t, +) -> int16x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); + let rgba_pixel = expand8_to_14(load_4b_as_u8x8(src_ptr.as_ptr())); + vqrdmlah_s16(store, vget_low_s16(rgba_pixel), w0) +} + +/// Checking NEON `rdm` availability is required before a call. +/// +/// RDM feature has slightly lower precision and won't work really well on huge kernel which +/// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. +/// +/// # Safety +/// - Check `rdm` availability before the call. +pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8_i16( + src: &[u8], + src_stride: usize, + dst: &mut [u8], + dst_stride: usize, + filter_weights: &FilterWeights, +) { + unsafe { + convolve_horizontal_rgba_neon_rows_4_u8_i16_impl( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); + } +} + +/// Slightly lower precision scale option +/// +/// # Safety +/// - Check `rdm` availability before the call. 
+#[target_feature(enable = "rdm")] +unsafe fn convolve_horizontal_rgba_neon_rows_4_u8_i16_impl( + src: &[u8], + src_stride: usize, + dst: &mut [u8], + dst_stride: usize, + filter_weights: &FilterWeights, +) { + const CHANNELS: usize = 4; + const SCALE: i32 = 6; + const ROUNDING: i16 = 1 << (SCALE - 1); + + let weights_distribute: [u8; 16] = [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]; + let v_w_distribute0 = vld1q_u8(weights_distribute.as_ptr()); + let weights_distribute1: [u8; 16] = [4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7]; + let v_w_distribute1 = vld1q_u8(weights_distribute1.as_ptr()); + let weights_distribute2: [u8; 16] = [8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11]; + let v_w_distribute2 = vld1q_u8(weights_distribute2.as_ptr()); + let weights_distribute3: [u8; 16] = + [12, 13, 14, 15, 2, 13, 14, 15, 2, 13, 14, 15, 2, 13, 14, 15]; + let v_w_distribute3 = vld1q_u8(weights_distribute3.as_ptr()); + + let (row0_ref, rest) = dst.split_at_mut(dst_stride); + let (row1_ref, rest) = rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); + let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); + let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); + let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); + + let initial_val = vcombine_s16(vdup_n_s16(ROUNDING), vdup_n_s16(0)); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut jx = 0usize; + + let bounds_size = bounds.size; + + let mut store_0 = initial_val; + let mut store_1 = initial_val; + let mut store_2 = initial_val; + let mut store_3 = initial_val; + + let src0 = src; + let src1 = src0.get_unchecked(src_stride..); + let src2 = src1.get_unchecked(src_stride..); + let src3 = src2.get_unchecked(src_stride..); + + while jx + 8 < bounds_size { + let bounds_start = bounds.start + jx; + let w_ptr = weights.get_unchecked(jx..(jx + 8)); + let weights_set = vld1q_s16(w_ptr.as_ptr()); + + let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(weights_set), + v_w_distribute0, + )); + let w1 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(weights_set), + v_w_distribute1, + )); + let w2 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(weights_set), + v_w_distribute2, + )); + let w3 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(weights_set), + v_w_distribute3, + )); + + store_0 = + conv_horiz_rgba_8_u8_i16::(bounds_start, src0, w0, w1, w2, w3, store_0); + store_1 = + conv_horiz_rgba_8_u8_i16::(bounds_start, src1, w0, w1, w2, w3, store_1); + store_2 = + conv_horiz_rgba_8_u8_i16::(bounds_start, src2, w0, w1, w2, w3, store_2); + store_3 = + conv_horiz_rgba_8_u8_i16::(bounds_start, src3, w0, w1, w2, w3, store_3); + jx += 8; + } + + while jx + 4 < bounds_size { + let bounds_start = bounds.start + jx; + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = vld1_s16(w_ptr.as_ptr()); + + let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(vcombine_s16(weights, vdup_n_s16(0))), + v_w_distribute0, + )); + let w1 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(vcombine_s16(weights, vdup_n_s16(0))), + v_w_distribute1, + )); + + store_0 = conv_horiz_rgba_4_u8_i16::(bounds_start, src0, w0, w1, store_0); + store_1 = conv_horiz_rgba_4_u8_i16::(bounds_start, src1, w0, w1, 
store_1); + store_2 = conv_horiz_rgba_4_u8_i16::(bounds_start, src2, w0, w1, store_2); + store_3 = conv_horiz_rgba_4_u8_i16::(bounds_start, src3, w0, w1, store_3); + jx += 4; + } + + while jx + 2 < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = bounds.start + jx; + let v_weight = vreinterpret_s16_s32(vld1_dup_s32(w_ptr.as_ptr() as *const i32)); + let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(vcombine_s16(v_weight, vdup_n_s16(0))), + v_w_distribute0, + )); + store_0 = conv_horiz_rgba_2_u8_i16::(bounds_start, src0, w0, store_0); + store_1 = conv_horiz_rgba_2_u8_i16::(bounds_start, src1, w0, store_1); + store_2 = conv_horiz_rgba_2_u8_i16::(bounds_start, src2, w0, store_2); + store_3 = conv_horiz_rgba_2_u8_i16::(bounds_start, src3, w0, store_3); + jx += 2; + } + + let mut store_0 = vadd_s16(vget_low_s16(store_0), vget_high_s16(store_0)); + let mut store_1 = vadd_s16(vget_low_s16(store_1), vget_high_s16(store_1)); + let mut store_2 = vadd_s16(vget_low_s16(store_2), vget_high_s16(store_2)); + let mut store_3 = vadd_s16(vget_low_s16(store_3), vget_high_s16(store_3)); + + while jx < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + let bounds_start = bounds.start + jx; + let weight0 = vld1_dup_s16(w_ptr.as_ptr()); + store_0 = conv_horiz_rgba_1_u8_i16::(bounds_start, src0, weight0, store_0); + store_1 = conv_horiz_rgba_1_u8_i16::(bounds_start, src1, weight0, store_1); + store_2 = conv_horiz_rgba_1_u8_i16::(bounds_start, src2, weight0, store_2); + store_3 = conv_horiz_rgba_1_u8_i16::(bounds_start, src3, weight0, store_3); + jx += 1; + } + + let store_16_0 = vshr_n_s16::(store_0); + let store_16_1 = vshr_n_s16::(store_1); + let store_16_2 = vshr_n_s16::(store_2); + let store_16_3 = vshr_n_s16::(store_3); + + let store_16_8_0 = vqmovun_s16(vcombine_s16(store_16_0, store_16_0)); + let store_16_8_1 = vqmovun_s16(vcombine_s16(store_16_1, store_16_1)); + let store_16_8_2 = vqmovun_s16(vcombine_s16(store_16_2, store_16_2)); + let store_16_8 = vqmovun_s16(vcombine_s16(store_16_3, store_16_3)); + + vst1_lane_u32::<0>( + chunk0.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8_0), + ); + vst1_lane_u32::<0>( + chunk1.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8_1), + ); + vst1_lane_u32::<0>( + chunk2.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8_2), + ); + vst1_lane_u32::<0>( + chunk3.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8), + ); + } +} + +/// Checking NEON `rdm` availability is required before a call. +/// +/// RDM feature has slightly lower precision and won't work really well on huge kernel which +/// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. +/// +/// # Safety +/// - Check `rdm` availability before the call. 
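
Before the single-row variant below: the safety notes in this file all assume the caller has already verified the CPU's RDM extension. A hedged sketch of the runtime gate a dispatcher higher up might use (the helper name is illustrative, not the crate's actual selection logic):

    #[cfg(target_arch = "aarch64")]
    fn use_rdm_path() -> bool {
        // Both the cargo feature and the runtime CPU flag have to agree before the
        // vqrdmlah-based rows are safe to call.
        cfg!(feature = "rdm") && std::arch::is_aarch64_feature_detected!("rdm")
    }

    // e.g.
    // if use_rdm_path() {
    //     convolve_horizontal_rgba_neon_rows_4_u8_i16(src, src_stride, dst, dst_stride, weights);
    // } else {
    //     // higher-precision i32 path
    // }
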
+pub(crate) fn convolve_horizontal_rgba_neon_row_i16( + src: &[u8], + dst: &mut [u8], + filter_weights: &FilterWeights, +) { + unsafe { + convolve_horizontal_rgba_neon_row_i16_impl(src, dst, filter_weights); + } +} + +#[target_feature(enable = "rdm")] +unsafe fn convolve_horizontal_rgba_neon_row_i16_impl( + src: &[u8], + dst: &mut [u8], + filter_weights: &FilterWeights, +) { + const SCALE: i32 = 6; + const ROUNDING: i16 = 1 << (SCALE - 1); + const CHANNELS: usize = 4; + + let weights_distribute: [u8; 16] = [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]; + let v_w_distribute0 = vld1q_u8(weights_distribute.as_ptr()); + let weights_distribute1: [u8; 16] = [4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7]; + let v_w_distribute1 = vld1q_u8(weights_distribute1.as_ptr()); + let weights_distribute2: [u8; 16] = [8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11]; + let v_w_distribute2 = vld1q_u8(weights_distribute2.as_ptr()); + let weights_distribute3: [u8; 16] = + [12, 13, 14, 15, 2, 13, 14, 15, 2, 13, 14, 15, 2, 13, 14, 15]; + let v_w_distribute3 = vld1q_u8(weights_distribute3.as_ptr()); + + let initial_val = vcombine_s16(vdup_n_s16(ROUNDING), vdup_n_s16(0)); + + for ((dst, bounds), weights) in dst + .chunks_exact_mut(CHANNELS) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let bounds_size = bounds.size; + let mut jx = 0usize; + let mut store = initial_val; + + while jx + 8 < bounds_size { + let bounds_start = bounds.start + jx; + let w_ptr = weights.get_unchecked(jx..(jx + 8)); + let weights_set = vld1q_s16(w_ptr.as_ptr()); + + let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(weights_set), + v_w_distribute0, + )); + let w1 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(weights_set), + v_w_distribute1, + )); + let w2 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(weights_set), + v_w_distribute2, + )); + let w3 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(weights_set), + v_w_distribute3, + )); + + store = conv_horiz_rgba_8_u8_i16::(bounds_start, src, w0, w1, w2, w3, store); + jx += 8; + } + + while jx + 4 < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = vld1_s16(w_ptr.as_ptr()); + let bounds_start = bounds.start + jx; + + let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(vcombine_s16(weights, vdup_n_s16(0))), + v_w_distribute0, + )); + let w1 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(vcombine_s16(weights, vdup_n_s16(0))), + v_w_distribute1, + )); + + store = conv_horiz_rgba_4_u8_i16::(bounds_start, src, w0, w1, store); + jx += 4; + } + + while jx + 2 < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = bounds.start + jx; + let v_weight = vreinterpret_s16_s32(vld1_dup_s32(w_ptr.as_ptr() as *const i32)); + let w0 = vreinterpretq_s16_u8(vqtbl1q_u8( + vreinterpretq_u8_s16(vcombine_s16(v_weight, vdup_n_s16(0))), + v_w_distribute0, + )); + store = conv_horiz_rgba_2_u8_i16::(bounds_start, src, w0, store); + jx += 2; + } + + let mut store = vadd_s16(vget_low_s16(store), vget_high_s16(store)); + + while jx < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + let weight0 = vld1_dup_s16(w_ptr.as_ptr()); + let bounds_start = bounds.start + jx; + store = conv_horiz_rgba_1_u8_i16::(bounds_start, src, weight0, store); + jx += 1; + } + + let store_16 = vshr_n_s16::(store); + + let store_16_8 = vqmovun_s16(vcombine_s16(store_16, store_16)); + + vst1_lane_u32::<0>( 
+ dst.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8), + ); + } +} diff --git a/src/neon/utils.rs b/src/neon/utils.rs index a755d62..67b3546 100644 --- a/src/neon/utils.rs +++ b/src/neon/utils.rs @@ -30,12 +30,14 @@ use std::arch::aarch64::*; #[inline(always)] +#[cfg(feature = "rdm")] pub(crate) unsafe fn expand8_to_14(row: uint8x8_t) -> int16x8_t { let row = vcombine_u8(row, row); vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip1q_u8(row, row)))) } #[inline(always)] +#[cfg(feature = "rdm")] pub(crate) unsafe fn expand8_high_to_14(row: uint8x16_t) -> int16x8_t { vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip2q_u8(row, row)))) } @@ -155,6 +157,7 @@ pub(crate) unsafe fn load_3b_as_u16x4(src_ptr: *const u8) -> uint16x4_t { } #[inline(always)] +#[cfg(feature = "rdm")] pub(crate) unsafe fn load_3b_as_u8x16(src_ptr: *const u8) -> uint8x16_t { let v = vreinterpretq_u8_u16(vld1q_lane_u16::<0>(src_ptr as *const u16, vdupq_n_u16(0))); vld1q_lane_u8::<2>(src_ptr.add(2), v) @@ -167,6 +170,7 @@ pub(crate) unsafe fn load_4b_as_u16x4(src_ptr: *const u8) -> uint16x4_t { } #[inline(always)] +#[cfg(feature = "rdm")] pub(crate) unsafe fn load_4b_as_u8x8(src_ptr: *const u8) -> uint8x8_t { vreinterpret_u8_u32(vld1_lane_u32::<0>(src_ptr as *const u32, vdup_n_u32(0))) } @@ -179,6 +183,7 @@ pub(crate) unsafe fn xvld1q_s16_x2(a: *const i16) -> int16x8x2_t { } #[inline(always)] +#[cfg(feature = "rdm")] pub(crate) unsafe fn xvld1q_s16_x4(a: *const i16) -> int16x8x4_t { let v0 = vld1q_s16(a); let v1 = vld1q_s16(a.add(8)); @@ -186,3 +191,81 @@ pub(crate) unsafe fn xvld1q_s16_x4(a: *const i16) -> int16x8x4_t { let v3 = vld1q_s16(a.add(24)); int16x8x4_t(v0, v1, v2, v3) } + +#[inline(always)] +pub(crate) unsafe fn vxmlal_high_lane_s16( + a: int32x4_t, + b: int16x8_t, + c: int16x4_t, +) -> int32x4_t { + if D { + vqdmlal_high_lane_s16::(a, b, c) + } else { + vmlal_high_lane_s16::(a, b, c) + } +} + +#[inline(always)] +pub(crate) unsafe fn vxmlal_lane_s16( + a: int32x4_t, + b: int16x4_t, + c: int16x4_t, +) -> int32x4_t { + if D { + vqdmlal_lane_s16::(a, b, c) + } else { + vmlal_lane_s16::(a, b, c) + } +} + +#[inline(always)] +pub(crate) unsafe fn vxmlal_s16( + a: int32x4_t, + b: int16x4_t, + c: int16x4_t, +) -> int32x4_t { + if D { + vqdmlal_s16(a, b, c) + } else { + vmlal_s16(a, b, c) + } +} + +#[inline(always)] +pub(crate) unsafe fn vxmlal_high_s16( + a: int32x4_t, + b: int16x8_t, + c: int16x8_t, +) -> int32x4_t { + if D { + vqdmlal_high_s16(a, b, c) + } else { + vmlal_high_s16(a, b, c) + } +} + +#[inline(always)] +pub(crate) unsafe fn vxmlal_high_laneq_s16( + a: int32x4_t, + b: int16x8_t, + c: int16x8_t, +) -> int32x4_t { + if D { + vqdmlal_high_laneq_s16::(a, b, c) + } else { + vmlal_high_laneq_s16::(a, b, c) + } +} + +#[inline(always)] +pub unsafe fn vxmlal_laneq_s16( + a: int32x4_t, + b: int16x4_t, + c: int16x8_t, +) -> int32x4_t { + if D { + vqdmlal_laneq_s16::(a, b, c) + } else { + vmlal_laneq_s16::(a, b, c) + } +} diff --git a/src/neon/vertical_ar30.rs b/src/neon/vertical_ar30_rdm.rs similarity index 77% rename from src/neon/vertical_ar30.rs rename to src/neon/vertical_ar30_rdm.rs index b5c3f2f..c843c85 100644 --- a/src/neon/vertical_ar30.rs +++ b/src/neon/vertical_ar30_rdm.rs @@ -28,7 +28,7 @@ */ use crate::filter_weights::FilterBounds; use crate::fixed_point_vertical_ar30::convolve_column_handler_fip_db_ar30; -use crate::neon::ar30::{vunzip_4_ar30, vzip_4_ar30}; +use crate::neon::ar30::{vunzip_3_ar30, vzip_4_ar30}; use std::arch::aarch64::{ int16x8x4_t, vdupq_n_s16, 
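
Stepping back to the `vxmlal_*` helpers added to `src/neon/utils.rs` above: they switch between the plain widening multiply-accumulate (`vmlal_*`) and the saturating doubling variant (`vqdmlal_*`) on a const flag (the `if D` branch; the angle-bracketed parameter lists are lost in this rendering, presumably something like `<const D: bool, const LANE: i32>`), so each caller monomorphises to exactly one instruction with no runtime branch. A scalar sketch of the same pattern, with a hypothetical name:

    // Hypothetical scalar analogue of the vxmlal_* dispatch: `D` picks the
    // "doubling" flavour at compile time.
    #[inline(always)]
    fn xmlal<const D: bool>(acc: i32, a: i16, b: i16) -> i32 {
        if D {
            // vqdmlal-style: widen, double, saturating add.
            acc.saturating_add(2 * i32::from(a) * i32::from(b))
        } else {
            // vmlal-style: plain widening multiply-accumulate.
            acc + i32::from(a) * i32::from(b)
        }
    }

    fn main() {
        assert_eq!(xmlal::<false>(0, 100, 3), 300);
        assert_eq!(xmlal::<true>(0, 100, 3), 600); // doubled product
    }
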
vld1q_u32_x2, vmaxq_s16, vminq_s16, vqrdmlahq_s16, vqrdmulhq_s16, vrshrq_n_s16, vshlq_n_s16, vst1q_u32_x2, @@ -40,8 +40,8 @@ pub(crate) fn neon_column_handler_fixed_point_ar30< const AR30_ORDER: usize, >( bounds: &FilterBounds, - src: &[u32], - dst: &mut [u32], + src: &[u8], + dst: &mut [u8], src_stride: usize, weight: &[i16], ) { @@ -58,14 +58,14 @@ unsafe fn neon_column_handler_fixed_point_ar30_impl< const AR30_ORDER: usize, >( bounds: &FilterBounds, - src: &[u32], - dst: &mut [u32], + src: &[u8], + dst: &mut [u8], src_stride: usize, weight: &[i16], ) { let mut cx = 0usize; - let total_width = dst.len(); + let total_width = dst.len() / 4; const PREC: i32 = 5; const BACK: i32 = 5; @@ -77,18 +77,18 @@ unsafe fn neon_column_handler_fixed_point_ar30_impl< let v_max = vdupq_n_s16(1023); let zeros = vdupq_n_s16(0); let filter = weight; - let v_start_px = cx; + let v_start_px = cx * 4; let py = bounds.start; let weight = vdupq_n_s16(filter[0]); let offset = src_stride * py + v_start_px; let src_ptr = src.get_unchecked(offset..(offset + 8)); - let ps = vunzip_4_ar30::(vld1q_u32_x2(src_ptr.as_ptr())); + let ps = + vunzip_3_ar30::(vld1q_u32_x2(src_ptr.as_ptr() as *const _)); let mut v0 = vqrdmulhq_s16(vshlq_n_s16::(ps.0), weight); let mut v1 = vqrdmulhq_s16(vshlq_n_s16::(ps.1), weight); let mut v2 = vqrdmulhq_s16(vshlq_n_s16::(ps.2), weight); - let mut v3 = vqrdmulhq_s16(vshlq_n_s16::(ps.3), weight); if bounds_size == 2 { let weights = filter.get_unchecked(0..2); @@ -97,11 +97,12 @@ unsafe fn neon_column_handler_fixed_point_ar30_impl< let v_weight1 = vdupq_n_s16(weights[1]); - let ps1 = vunzip_4_ar30::(vld1q_u32_x2(src_ptr1.as_ptr())); + let ps1 = vunzip_3_ar30::(vld1q_u32_x2( + src_ptr1.as_ptr() as *const _ + )); v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps1.0), v_weight1); v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps1.1), v_weight1); v2 = vqrdmlahq_s16(v2, vshlq_n_s16::(ps1.2), v_weight1); - v3 = vqrdmlahq_s16(v3, vshlq_n_s16::(ps1.3), v_weight1); } else if bounds_size == 3 { let weights = filter.get_unchecked(0..3); let py = bounds.start; @@ -111,16 +112,18 @@ unsafe fn neon_column_handler_fixed_point_ar30_impl< let v_weight1 = vdupq_n_s16(weights[1]); let v_weight2 = vdupq_n_s16(weights[2]); - let ps1 = vunzip_4_ar30::(vld1q_u32_x2(src_ptr1.as_ptr())); + let ps1 = vunzip_3_ar30::(vld1q_u32_x2( + src_ptr1.as_ptr() as *const _ + )); v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps1.0), v_weight1); v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps1.1), v_weight1); v2 = vqrdmlahq_s16(v2, vshlq_n_s16::(ps1.2), v_weight1); - v3 = vqrdmlahq_s16(v3, vshlq_n_s16::(ps1.3), v_weight1); - let ps2 = vunzip_4_ar30::(vld1q_u32_x2(src_ptr2.as_ptr())); + let ps2 = vunzip_3_ar30::(vld1q_u32_x2( + src_ptr2.as_ptr() as *const _ + )); v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps2.0), v_weight2); v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps2.1), v_weight2); v2 = vqrdmlahq_s16(v2, vshlq_n_s16::(ps2.2), v_weight2); - v3 = vqrdmlahq_s16(v3, vshlq_n_s16::(ps2.3), v_weight2); } else if bounds_size == 4 { let weights = filter.get_unchecked(0..4); let py = bounds.start; @@ -132,46 +135,54 @@ unsafe fn neon_column_handler_fixed_point_ar30_impl< let v_weight2 = vdupq_n_s16(weights[2]); let v_weight3 = vdupq_n_s16(weights[3]); - let ps1 = vunzip_4_ar30::(vld1q_u32_x2(src_ptr1.as_ptr())); + let ps1 = vunzip_3_ar30::(vld1q_u32_x2( + src_ptr1.as_ptr() as *const _ + )); v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps1.0), v_weight1); v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps1.1), v_weight1); v2 = vqrdmlahq_s16(v2, vshlq_n_s16::(ps1.2), v_weight1); - v3 = vqrdmlahq_s16(v3, 
vshlq_n_s16::(ps1.3), v_weight1); - let ps2 = vunzip_4_ar30::(vld1q_u32_x2(src_ptr2.as_ptr())); + let ps2 = vunzip_3_ar30::(vld1q_u32_x2( + src_ptr2.as_ptr() as *const _ + )); v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps2.0), v_weight2); v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps2.1), v_weight2); v2 = vqrdmlahq_s16(v2, vshlq_n_s16::(ps2.2), v_weight2); - v3 = vqrdmlahq_s16(v3, vshlq_n_s16::(ps2.3), v_weight2); - let ps3 = vunzip_4_ar30::(vld1q_u32_x2(src_ptr3.as_ptr())); + let ps3 = vunzip_3_ar30::(vld1q_u32_x2( + src_ptr3.as_ptr() as *const _ + )); v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps3.0), v_weight3); v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps3.1), v_weight3); v2 = vqrdmlahq_s16(v2, vshlq_n_s16::(ps3.2), v_weight3); - v3 = vqrdmlahq_s16(v3, vshlq_n_s16::(ps3.3), v_weight3); } else { for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() { // Adding 1 is necessary because skip do not incrementing value on values that skipped let py = bounds.start + j + 1; let weight = vdupq_n_s16(k_weight); let offset = src_stride * py + v_start_px; - let src_ptr = src.get_unchecked(offset..(offset + 8)); + let src_ptr = src.get_unchecked(offset..(offset + 8 * 4)); - let ps = vunzip_4_ar30::(vld1q_u32_x2(src_ptr.as_ptr())); + let ps = vunzip_3_ar30::(vld1q_u32_x2( + src_ptr.as_ptr() as *const _ + )); v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps.0), weight); v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps.1), weight); v2 = vqrdmlahq_s16(v2, vshlq_n_s16::(ps.2), weight); - v3 = vqrdmlahq_s16(v3, vshlq_n_s16::(ps.3), weight); } } - let v_dst = dst.get_unchecked_mut(v_start_px..(v_start_px + 8)); + let v_dst = dst.get_unchecked_mut(v_start_px..(v_start_px + 8 * 4)); - v0 = vmaxq_s16(vminq_s16(vrshrq_n_s16::(v0), v_max), zeros); - v1 = vmaxq_s16(vminq_s16(vrshrq_n_s16::(v1), v_max), zeros); - v2 = vmaxq_s16(vminq_s16(vrshrq_n_s16::(v2), v_max), zeros); - v3 = vmaxq_s16(vrshrq_n_s16::(v3), zeros); + v0 = vrshrq_n_s16::(v0); + v1 = vrshrq_n_s16::(v1); + v2 = vrshrq_n_s16::(v2); - let vals = vzip_4_ar30::(int16x8x4_t(v0, v1, v2, v3)); - vst1q_u32_x2(v_dst.as_mut_ptr(), vals); + v0 = vmaxq_s16(vminq_s16(v0, v_max), zeros); + v1 = vmaxq_s16(vminq_s16(v1, v_max), zeros); + v2 = vmaxq_s16(vminq_s16(v2, v_max), zeros); + + let vals = + vzip_4_ar30::(int16x8x4_t(v0, v1, v2, vdupq_n_s16(3))); + vst1q_u32_x2(v_dst.as_mut_ptr() as *mut _, vals); } cx += 8; diff --git a/src/neon/vertical_f16.rs b/src/neon/vertical_f16.rs index 5a3fd5b..ce4496a 100644 --- a/src/neon/vertical_f16.rs +++ b/src/neon/vertical_f16.rs @@ -32,6 +32,7 @@ use crate::filter_weights::FilterBounds; use crate::neon::convolve_f16::convolve_vertical_part_neon_8_f16; use crate::neon::utils::prefer_vfmaq_f32; use crate::neon::*; +use core::f16; macro_rules! conv_vertical_part_neon_16_f16 { ($start_y: expr, $start_x: expr, $src: expr, $src_stride: expr, $dst: expr, $filter: expr, $bounds: expr) => {{ @@ -239,8 +240,8 @@ macro_rules! conv_vertical_part_neon_48_f16 { pub(crate) fn convolve_vertical_rgb_neon_row_f16( _: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) { diff --git a/src/neon/vertical_f16_fhm.rs b/src/neon/vertical_f16_fhm.rs new file mode 100644 index 0000000..482d110 --- /dev/null +++ b/src/neon/vertical_f16_fhm.rs @@ -0,0 +1,323 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. 
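
Before the body of the new `vertical_f16_fhm.rs` file below, a note on the `vertical_ar30_rdm.rs` hunks above: the column pass now takes raw bytes (`&[u8]`), unpacks only the three 10-bit colour channels (`vunzip_3_ar30` instead of `vunzip_4_ar30`, so the fourth `v3` accumulator disappears), and re-packs with a constant alpha lane of 3, i.e. fully opaque in the 2-bit alpha field. A scalar sketch of that pack, assuming the usual 2:10:10:10 layout with alpha in the top two bits (an assumption; the crate's AR30 order/type generics cover the byte-order variants):

    // Hypothetical scalar pack for one AR30 pixel: three 10-bit channels plus a
    // 2-bit alpha forced to 3 (opaque), mirroring the vdupq_n_s16(3) lane passed
    // to vzip_4_ar30 in the patch. The exact bit order is an assumption here.
    fn pack_ar30(r: u16, g: u16, b: u16) -> u32 {
        let (r, g, b) = (r.min(1023) as u32, g.min(1023) as u32, b.min(1023) as u32);
        (3u32 << 30) | (r << 20) | (g << 10) | b
    }

    fn main() {
        let px = pack_ar30(1023, 0, 512);
        assert_eq!(px >> 30, 3);              // alpha
        assert_eq!((px >> 20) & 0x3ff, 1023); // r
        assert_eq!(px & 0x3ff, 512);          // b
    }
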
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +use std::arch::aarch64::*; + +use crate::filter_weights::FilterBounds; +use crate::neon::*; +use core::f16; + +#[inline(always)] +pub(crate) unsafe fn conv_vertical_part_neon_16_f16( + start_y: usize, + start_x: usize, + src: &[f16], + src_stride: usize, + dst: &mut [f16], + filter: &[f16], + bounds: &FilterBounds, +) { + unsafe { + let mut store_0 = vdupq_n_f32(0.); + let mut store_1 = vdupq_n_f32(0.); + let mut store_2 = vdupq_n_f32(0.); + let mut store_3 = vdupq_n_f32(0.); + + let px = start_x; + + for j in 0..bounds.size { + let py = start_y + j; + let v_weight = xreinterpretq_f16_u16(vld1q_dup_u16( + filter.get_unchecked(j..).as_ptr() as *const _ + )); + let src_ptr = src.get_unchecked(src_stride * py..).as_ptr(); + + let s_ptr = src_ptr.add(px); + let item_row = xvldq_f16_x2(s_ptr); + + store_0 = xvfmlalq_low_f16(store_0, item_row.0, v_weight); + store_1 = xvfmlalq_high_f16(store_1, item_row.0, v_weight); + store_2 = xvfmlalq_low_f16(store_2, item_row.1, v_weight); + store_3 = xvfmlalq_high_f16(store_3, item_row.1, v_weight); + } + + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); + let f_set = x_float16x8x2_t( + xcombine_f16(xvcvt_f16_f32(store_0), xvcvt_f16_f32(store_1)), + xcombine_f16(xvcvt_f16_f32(store_2), xvcvt_f16_f32(store_3)), + ); + xvstq_f16_x2(dst_ptr, f_set); + } +} + +#[inline(always)] +pub(crate) unsafe fn conv_vertical_part_neon_32_f16( + start_y: usize, + start_x: usize, + src: &[f16], + src_stride: usize, + dst: &mut [f16], + filter: &[f16], + bounds: &FilterBounds, +) { + let mut store_0 = vdupq_n_f32(0.); + let mut store_1 = vdupq_n_f32(0.); + let mut store_2 = vdupq_n_f32(0.); + let mut store_3 = vdupq_n_f32(0.); + let mut store_4 = vdupq_n_f32(0.); + let mut store_5 = vdupq_n_f32(0.); + let mut store_6 = vdupq_n_f32(0.); + let mut store_7 = vdupq_n_f32(0.); + + let px = start_x; + + for j in 0..bounds.size { + let py = start_y + j; + let v_weight = + xreinterpretq_f16_u16(vld1q_dup_u16(filter.get_unchecked(j..).as_ptr() as *const _)); + let src_ptr = 
src.get_unchecked(src_stride * py..).as_ptr(); + + let s_ptr = src_ptr.add(px); + let item_row = xvldq_f16_x4(s_ptr); + + store_0 = xvfmlalq_low_f16(store_0, item_row.0, v_weight); + store_1 = xvfmlalq_high_f16(store_1, item_row.0, v_weight); + store_2 = xvfmlalq_low_f16(store_2, item_row.1, v_weight); + store_3 = xvfmlalq_high_f16(store_3, item_row.1, v_weight); + + store_4 = xvfmlalq_low_f16(store_4, item_row.2, v_weight); + store_5 = xvfmlalq_high_f16(store_5, item_row.2, v_weight); + store_6 = xvfmlalq_low_f16(store_6, item_row.3, v_weight); + store_7 = xvfmlalq_high_f16(store_7, item_row.3, v_weight); + } + + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); + let f_set = x_float16x8x4_t( + xcombine_f16(xvcvt_f16_f32(store_0), xvcvt_f16_f32(store_1)), + xcombine_f16(xvcvt_f16_f32(store_2), xvcvt_f16_f32(store_3)), + xcombine_f16(xvcvt_f16_f32(store_4), xvcvt_f16_f32(store_5)), + xcombine_f16(xvcvt_f16_f32(store_6), xvcvt_f16_f32(store_7)), + ); + xvstq_f16_x4(dst_ptr, f_set); +} + +#[inline(always)] +pub(crate) unsafe fn conv_vertical_part_neon_48_f16( + start_y: usize, + start_x: usize, + src: &[f16], + src_stride: usize, + dst: &mut [f16], + filter: &[f16], + bounds: &FilterBounds, +) { + unsafe { + let mut store_0 = vdupq_n_f32(0.); + let mut store_1 = vdupq_n_f32(0.); + let mut store_2 = vdupq_n_f32(0.); + let mut store_3 = vdupq_n_f32(0.); + + let mut store_4 = vdupq_n_f32(0.); + let mut store_5 = vdupq_n_f32(0.); + let mut store_6 = vdupq_n_f32(0.); + let mut store_7 = vdupq_n_f32(0.); + + let mut store_8 = vdupq_n_f32(0.); + let mut store_9 = vdupq_n_f32(0.); + let mut store_10 = vdupq_n_f32(0.); + let mut store_11 = vdupq_n_f32(0.); + + let px = start_x; + + for j in 0..bounds.size { + let py = start_y + j; + let v_weight = xreinterpretq_f16_u16(vld1q_dup_u16( + filter.get_unchecked(j..).as_ptr() as *const _ + )); + let src_ptr = src.get_unchecked(src_stride * py..).as_ptr(); + + let s_ptr = src_ptr.add(px); + let item_row_0 = xvldq_f16_x4(s_ptr); + let item_row_1 = xvldq_f16_x2(s_ptr.add(32)); + + store_0 = xvfmlalq_low_f16(store_0, item_row_0.0, v_weight); + store_1 = xvfmlalq_high_f16(store_1, item_row_0.0, v_weight); + store_2 = xvfmlalq_low_f16(store_2, item_row_0.1, v_weight); + store_3 = xvfmlalq_high_f16(store_3, item_row_0.1, v_weight); + + store_4 = xvfmlalq_low_f16(store_4, item_row_0.2, v_weight); + store_5 = xvfmlalq_high_f16(store_5, item_row_0.2, v_weight); + store_6 = xvfmlalq_low_f16(store_6, item_row_0.3, v_weight); + store_7 = xvfmlalq_high_f16(store_7, item_row_0.3, v_weight); + + store_8 = xvfmlalq_low_f16(store_8, item_row_1.0, v_weight); + store_9 = xvfmlalq_high_f16(store_9, item_row_1.0, v_weight); + store_10 = xvfmlalq_low_f16(store_10, item_row_1.1, v_weight); + store_11 = xvfmlalq_high_f16(store_11, item_row_1.1, v_weight); + } + + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); + let f_set = x_float16x8x4_t( + xcombine_f16(xvcvt_f16_f32(store_0), xvcvt_f16_f32(store_1)), + xcombine_f16(xvcvt_f16_f32(store_2), xvcvt_f16_f32(store_3)), + xcombine_f16(xvcvt_f16_f32(store_4), xvcvt_f16_f32(store_5)), + xcombine_f16(xvcvt_f16_f32(store_6), xvcvt_f16_f32(store_7)), + ); + xvstq_f16_x4(dst_ptr, f_set); + let dst_ptr2 = dst_ptr.add(32); + + let f_set1 = x_float16x8x2_t( + xcombine_f16(xvcvt_f16_f32(store_8), xvcvt_f16_f32(store_9)), + xcombine_f16(xvcvt_f16_f32(store_10), xvcvt_f16_f32(store_11)), + ); + xvstq_f16_x2(dst_ptr2, f_set1); + } +} + +pub(crate) fn convolve_vertical_rgb_neon_row_f16_fhm( + w0: usize, + bounds: &FilterBounds, + src: 
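
A note on the accumulation used in the blocks above (the row entry point continues below): `xvfmlalq_low_f16`/`xvfmlalq_high_f16` wrap the FEAT_FHM widening half-precision multiply-accumulates (FMLAL), so products of `f16` inputs are summed in `f32` lanes and only narrowed back once per output chunk via `xvcvt_f16_f32`. A scalar model of one such step, using the unstable built-in `f16` that the `nightly_f16` feature targets:

    #![feature(f16)] // matches the crate's nightly_f16 gate

    // Model of one FMLAL lane: widen both f16 operands to f32 and fuse the
    // multiply-add, so precision is only lost at the final f32 -> f16 store.
    fn fmlal(acc: f32, a: f16, b: f16) -> f32 {
        (a as f32).mul_add(b as f32, acc)
    }

    fn main() {
        let mut acc = 0.0f32;
        for (&px, &w) in [1.0f16, 2.0, 3.0].iter().zip(&[0.25f16, 0.5, 0.25]) {
            acc = fmlal(acc, px, w);
        }
        let out = acc as f16; // narrowed once, like xvcvt_f16_f32 at the end of the column
        assert_eq!(out as f32, 2.0); // 1*0.25 + 2*0.5 + 3*0.25
    }
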
&[f16], + dst: &mut [f16], + src_stride: usize, + weight_ptr: &[f16], +) { + unsafe { convolve_vertical_rgb_neon_row_f16_impl(w0, bounds, src, dst, src_stride, weight_ptr) } +} + +#[inline(always)] +unsafe fn convolve_vertical_part_neon_8_f16_fhm( + start_y: usize, + start_x: usize, + src: &[f16], + src_stride: usize, + dst: &mut [f16], + filter: &[f16], + bounds: &FilterBounds, + blend_length: usize, +) { + let mut store_0 = vdupq_n_f32(0f32); + let mut store_1 = vdupq_n_f32(0f32); + + let px = start_x; + + for j in 0..bounds.size { + let py = start_y + j; + let v_weight = + xreinterpretq_f16_u16(vld1q_dup_u16(filter.get_unchecked(j..).as_ptr() as *const _)); + let src_ptr = src.get_unchecked(src_stride * py..).as_ptr(); + + let s_ptr = src_ptr.add(px); + let item_row = if USE_BLENDING { + let mut transient: [f16; 8] = [0.; 8]; + std::ptr::copy_nonoverlapping(s_ptr, transient.as_mut_ptr(), blend_length); + xvldq_f16(transient.as_ptr()) + } else { + xvldq_f16(s_ptr) + }; + + store_0 = xvfmlalq_low_f16(store_0, item_row, v_weight); + store_1 = xvfmlalq_high_f16(store_1, item_row, v_weight); + } + + let item = xcombine_f16(xvcvt_f16_f32(store_0), xvcvt_f16_f32(store_1)); + + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); + if USE_BLENDING { + let mut transient: [f16; 8] = [0.; 8]; + xvstq_f16(transient.as_mut_ptr(), item); + std::ptr::copy_nonoverlapping(transient.as_ptr(), dst_ptr, blend_length); + } else { + xvstq_f16(dst_ptr, item); + } +} + +#[target_feature(enable = "fhm")] +unsafe fn convolve_vertical_rgb_neon_row_f16_impl( + _: usize, + bounds: &FilterBounds, + src: &[f16], + dst: &mut [f16], + src_stride: usize, + weight_ptr: &[f16], +) { + let mut cx = 0usize; + let dst_width = dst.len(); + + while cx + 48 < dst_width { + conv_vertical_part_neon_48_f16(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); + + cx += 48; + } + + while cx + 32 < dst_width { + conv_vertical_part_neon_32_f16(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); + + cx += 32; + } + + while cx + 16 < dst_width { + conv_vertical_part_neon_16_f16(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); + + cx += 16; + } + + while cx + 8 < dst_width { + unsafe { + convolve_vertical_part_neon_8_f16_fhm::( + bounds.start, + cx, + src, + src_stride, + dst, + weight_ptr, + bounds, + 8, + ); + } + + cx += 8; + } + + let left = dst_width - cx; + + if left > 0 { + unsafe { + convolve_vertical_part_neon_8_f16_fhm::( + bounds.start, + cx, + src, + src_stride, + dst, + weight_ptr, + bounds, + left, + ); + } + } +} diff --git a/src/neon/vertical_f16_full.rs b/src/neon/vertical_f16_full.rs index 3109835..bd317c0 100644 --- a/src/neon/vertical_f16_full.rs +++ b/src/neon/vertical_f16_full.rs @@ -30,14 +30,15 @@ use std::arch::aarch64::*; use crate::filter_weights::FilterBounds; use crate::neon::*; +use core::f16; #[inline(always)] pub(crate) unsafe fn xconvolve_vertical_part_neon_8_f16( start_y: usize, start_x: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, blend_length: usize, @@ -55,7 +56,7 @@ pub(crate) unsafe fn xconvolve_vertical_part_neon_8_f16(store0, vget_low_s16(item_row0), v_weight); - store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); - store2 = vmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), v_weight); - store3 = vmlal_high_lane_s16::<0>(store3, item_row1, v_weight); + store0 = vqdmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = 
vqdmlal_high_lane_s16::<0>(store1, item_row0, v_weight); + store2 = vqdmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), v_weight); + store3 = vqdmlal_high_lane_s16::<0>(store3, item_row1, v_weight); let item_row10 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); let item_row11 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr().add(8))); - store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row10), v_weight); - store1 = vmlal_high_lane_s16::<1>(store1, item_row10, v_weight); - store2 = vmlal_lane_s16::<1>(store2, vget_low_s16(item_row11), v_weight); - store3 = vmlal_high_lane_s16::<1>(store3, item_row11, v_weight); + store0 = vqdmlal_lane_s16::<1>(store0, vget_low_s16(item_row10), v_weight); + store1 = vqdmlal_high_lane_s16::<1>(store1, item_row10, v_weight); + store2 = vqdmlal_lane_s16::<1>(store2, vget_low_s16(item_row11), v_weight); + store3 = vqdmlal_high_lane_s16::<1>(store3, item_row11, v_weight); } else if bounds_size == 3 { let weights = weight.get_unchecked(0..3); let mut v_weight = vld1_dup_s16(weights.as_ptr()); @@ -100,26 +102,26 @@ pub(crate) fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr().add(8))); - store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); - store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); - store2 = vmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), v_weight); - store3 = vmlal_high_lane_s16::<0>(store3, item_row1, v_weight); + store0 = vqdmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vqdmlal_high_lane_s16::<0>(store1, item_row0, v_weight); + store2 = vqdmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), v_weight); + store3 = vqdmlal_high_lane_s16::<0>(store3, item_row1, v_weight); let item_row10 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); let item_row11 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr().add(8))); - store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row10), v_weight); - store1 = vmlal_high_lane_s16::<1>(store1, item_row10, v_weight); - store2 = vmlal_lane_s16::<1>(store2, vget_low_s16(item_row11), v_weight); - store3 = vmlal_high_lane_s16::<1>(store3, item_row11, v_weight); + store0 = vqdmlal_lane_s16::<1>(store0, vget_low_s16(item_row10), v_weight); + store1 = vqdmlal_high_lane_s16::<1>(store1, item_row10, v_weight); + store2 = vqdmlal_lane_s16::<1>(store2, vget_low_s16(item_row11), v_weight); + store3 = vqdmlal_high_lane_s16::<1>(store3, item_row11, v_weight); let item_row20 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr())); let item_row21 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr().add(8))); - store0 = vmlal_lane_s16::<2>(store0, vget_low_s16(item_row20), v_weight); - store1 = vmlal_high_lane_s16::<2>(store1, item_row20, v_weight); - store2 = vmlal_lane_s16::<2>(store2, vget_low_s16(item_row21), v_weight); - store3 = vmlal_high_lane_s16::<2>(store3, item_row21, v_weight); + store0 = vqdmlal_lane_s16::<2>(store0, vget_low_s16(item_row20), v_weight); + store1 = vqdmlal_high_lane_s16::<2>(store1, item_row20, v_weight); + store2 = vqdmlal_lane_s16::<2>(store2, vget_low_s16(item_row21), v_weight); + store3 = vqdmlal_high_lane_s16::<2>(store3, item_row21, v_weight); } else if bounds_size == 4 { let weights = weight.get_unchecked(0..4); @@ -134,34 +136,34 @@ pub(crate) fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); let item_row1 = 
vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr().add(8))); - store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); - store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); - store2 = vmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), v_weight); - store3 = vmlal_high_lane_s16::<0>(store3, item_row1, v_weight); + store0 = vqdmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vqdmlal_high_lane_s16::<0>(store1, item_row0, v_weight); + store2 = vqdmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), v_weight); + store3 = vqdmlal_high_lane_s16::<0>(store3, item_row1, v_weight); let item_row10 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); let item_row11 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr().add(8))); - store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row10), v_weight); - store1 = vmlal_high_lane_s16::<1>(store1, item_row10, v_weight); - store2 = vmlal_lane_s16::<1>(store2, vget_low_s16(item_row11), v_weight); - store3 = vmlal_high_lane_s16::<1>(store3, item_row11, v_weight); + store0 = vqdmlal_lane_s16::<1>(store0, vget_low_s16(item_row10), v_weight); + store1 = vqdmlal_high_lane_s16::<1>(store1, item_row10, v_weight); + store2 = vqdmlal_lane_s16::<1>(store2, vget_low_s16(item_row11), v_weight); + store3 = vqdmlal_high_lane_s16::<1>(store3, item_row11, v_weight); let item_row20 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr())); let item_row21 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr().add(8))); - store0 = vmlal_lane_s16::<2>(store0, vget_low_s16(item_row20), v_weight); - store1 = vmlal_high_lane_s16::<2>(store1, item_row20, v_weight); - store2 = vmlal_lane_s16::<2>(store2, vget_low_s16(item_row21), v_weight); - store3 = vmlal_high_lane_s16::<2>(store3, item_row21, v_weight); + store0 = vqdmlal_lane_s16::<2>(store0, vget_low_s16(item_row20), v_weight); + store1 = vqdmlal_high_lane_s16::<2>(store1, item_row20, v_weight); + store2 = vqdmlal_lane_s16::<2>(store2, vget_low_s16(item_row21), v_weight); + store3 = vqdmlal_high_lane_s16::<2>(store3, item_row21, v_weight); let item_row30 = vreinterpretq_s16_u16(vld1q_u16(src_ptr3.as_ptr())); let item_row31 = vreinterpretq_s16_u16(vld1q_u16(src_ptr3.as_ptr().add(8))); - store0 = vmlal_lane_s16::<3>(store0, vget_low_s16(item_row30), v_weight); - store1 = vmlal_high_lane_s16::<3>(store1, item_row30, v_weight); - store2 = vmlal_lane_s16::<3>(store2, vget_low_s16(item_row31), v_weight); - store3 = vmlal_high_lane_s16::<3>(store3, item_row31, v_weight); + store0 = vqdmlal_lane_s16::<3>(store0, vget_low_s16(item_row30), v_weight); + store1 = vqdmlal_high_lane_s16::<3>(store1, item_row30, v_weight); + store2 = vqdmlal_lane_s16::<3>(store2, vget_low_s16(item_row31), v_weight); + store3 = vqdmlal_high_lane_s16::<3>(store3, item_row31, v_weight); } else { for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { let py = bounds.start + j; @@ -179,20 +181,13 @@ pub(crate) fn convolve_column_lb_u16( } } - let item0 = vminq_u16( - vcombine_u16( - vqshrun_n_s32::(store0), - vqshrun_n_s32::(store1), - ), - v_max_colors, - ); - let item1 = vminq_u16( - vcombine_u16( - vqshrun_n_s32::(store2), - vqshrun_n_s32::(store3), - ), - v_max_colors, - ); + let store0 = vqshrun_n_s32::(store0); + let store1 = vqshrun_n_s32::(store1); + let store2 = vqshrun_n_s32::(store2); + let store3 = vqshrun_n_s32::(store3); + + let item0 = vminq_u16(vcombine_u16(store0, store1), v_max_colors); + let item1 = vminq_u16(vcombine_u16(store2, store3), v_max_colors); vst1q_u16(dst.as_mut_ptr(), 
item0); vst1q_u16(dst.as_mut_ptr().add(8), item1); @@ -222,13 +217,13 @@ pub(crate) fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); - store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); - store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); + store0 = vqdmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vqdmlal_high_lane_s16::<0>(store1, item_row0, v_weight); let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); - store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row1), v_weight); - store1 = vmlal_high_lane_s16::<1>(store1, item_row1, v_weight); + store0 = vqdmlal_lane_s16::<1>(store0, vget_low_s16(item_row1), v_weight); + store1 = vqdmlal_high_lane_s16::<1>(store1, item_row1, v_weight); } else if bounds_size == 3 { let weights = weight.get_unchecked(0..3); let mut v_weight = vld1_dup_s16(weights.as_ptr()); @@ -242,18 +237,18 @@ pub(crate) fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); - store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); - store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); + store0 = vqdmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vqdmlal_high_lane_s16::<0>(store1, item_row0, v_weight); let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); - store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row1), v_weight); - store1 = vmlal_high_lane_s16::<1>(store1, item_row1, v_weight); + store0 = vqdmlal_lane_s16::<1>(store0, vget_low_s16(item_row1), v_weight); + store1 = vqdmlal_high_lane_s16::<1>(store1, item_row1, v_weight); let item_row2 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr())); - store0 = vmlal_lane_s16::<2>(store0, vget_low_s16(item_row2), v_weight); - store1 = vmlal_high_lane_s16::<2>(store1, item_row2, v_weight); + store0 = vqdmlal_lane_s16::<2>(store0, vget_low_s16(item_row2), v_weight); + store1 = vqdmlal_high_lane_s16::<2>(store1, item_row2, v_weight); } else if bounds_size == 4 { let weights = weight.get_unchecked(0..4); let v_weight = vld1_s16(weights.as_ptr()); @@ -266,23 +261,23 @@ pub(crate) fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); - store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); - store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); + store0 = vqdmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vqdmlal_high_lane_s16::<0>(store1, item_row0, v_weight); let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); - store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row1), v_weight); - store1 = vmlal_high_lane_s16::<1>(store1, item_row1, v_weight); + store0 = vqdmlal_lane_s16::<1>(store0, vget_low_s16(item_row1), v_weight); + store1 = vqdmlal_high_lane_s16::<1>(store1, item_row1, v_weight); let item_row2 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr())); - store0 = vmlal_lane_s16::<2>(store0, vget_low_s16(item_row2), v_weight); - store1 = vmlal_high_lane_s16::<2>(store1, item_row2, v_weight); + store0 = vqdmlal_lane_s16::<2>(store0, vget_low_s16(item_row2), v_weight); + store1 = vqdmlal_high_lane_s16::<2>(store1, item_row2, v_weight); let item_row3 = vreinterpretq_s16_u16(vld1q_u16(src_ptr3.as_ptr())); - store0 = vmlal_lane_s16::<3>(store0, vget_low_s16(item_row3), v_weight); - store1 = vmlal_high_lane_s16::<3>(store1, item_row3, v_weight); + store0 = 
vqdmlal_lane_s16::<3>(store0, vget_low_s16(item_row3), v_weight); + store1 = vqdmlal_high_lane_s16::<3>(store1, item_row3, v_weight); } else { for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { let py = bounds.start + j; @@ -329,10 +324,10 @@ pub(crate) fn convolve_column_lb_u16( let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); let item_row0 = vreinterpret_s16_u16(vld1_u16(src_ptr0.as_ptr())); - store0 = vmlal_lane_s16::<0>(store0, item_row0, v_weight); + store0 = vqdmlal_lane_s16::<0>(store0, item_row0, v_weight); let item_row1 = vreinterpret_s16_u16(vld1_u16(src_ptr1.as_ptr())); - store0 = vmlal_lane_s16::<1>(store0, item_row1, v_weight); + store0 = vqdmlal_lane_s16::<1>(store0, item_row1, v_weight); } else if bounds_size == 3 { let weights = weight.get_unchecked(0..3); let mut v_weight = vld1_dup_s16(weights.as_ptr()); @@ -345,13 +340,13 @@ pub(crate) fn convolve_column_lb_u16( let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_dx)..); let item_row0 = vreinterpret_s16_u16(vld1_u16(src_ptr0.as_ptr())); - store0 = vmlal_lane_s16::<0>(store0, item_row0, v_weight); + store0 = vqdmlal_lane_s16::<0>(store0, item_row0, v_weight); let item_row1 = vreinterpret_s16_u16(vld1_u16(src_ptr1.as_ptr())); - store0 = vmlal_lane_s16::<1>(store0, item_row1, v_weight); + store0 = vqdmlal_lane_s16::<1>(store0, item_row1, v_weight); let item_row2 = vreinterpret_s16_u16(vld1_u16(src_ptr2.as_ptr())); - store0 = vmlal_lane_s16::<2>(store0, item_row2, v_weight); + store0 = vqdmlal_lane_s16::<2>(store0, item_row2, v_weight); } else if bounds_size == 4 { let weights = weight.get_unchecked(0..4); let v_weight = vld1_s16(weights.as_ptr()); @@ -363,16 +358,16 @@ pub(crate) fn convolve_column_lb_u16( let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + v_dx)..); let item_row0 = vreinterpret_s16_u16(vld1_u16(src_ptr0.as_ptr())); - store0 = vmlal_lane_s16::<0>(store0, item_row0, v_weight); + store0 = vqdmlal_lane_s16::<0>(store0, item_row0, v_weight); let item_row1 = vreinterpret_s16_u16(vld1_u16(src_ptr1.as_ptr())); - store0 = vmlal_lane_s16::<1>(store0, item_row1, v_weight); + store0 = vqdmlal_lane_s16::<1>(store0, item_row1, v_weight); let item_row2 = vreinterpret_s16_u16(vld1_u16(src_ptr2.as_ptr())); - store0 = vmlal_lane_s16::<2>(store0, item_row2, v_weight); + store0 = vqdmlal_lane_s16::<2>(store0, item_row2, v_weight); let item_row3 = vreinterpret_s16_u16(vld1_u16(src_ptr3.as_ptr())); - store0 = vmlal_lane_s16::<3>(store0, item_row3, v_weight); + store0 = vqdmlal_lane_s16::<3>(store0, item_row3, v_weight); } else { for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { let py = bounds.start + j; @@ -415,8 +410,8 @@ pub(crate) fn convolve_column_lb_u16( let offset1 = src_stride * (py + 1) + v_px; let src_ptr1 = src.get_unchecked(offset1..(offset1 + 1)); - store0 += src_ptr0[0] as i32 * weight0 as i32; - store0 += src_ptr1[0] as i32 * weight1 as i32; + store0 += 2 * src_ptr0[0] as i32 * weight0 as i32; + store0 += 2 * src_ptr1[0] as i32 * weight1 as i32; } else if bounds_size == 3 { let weights = weight.get_unchecked(0..3); let weight0 = weights[0]; @@ -431,9 +426,9 @@ pub(crate) fn convolve_column_lb_u16( let offset2 = src_stride * (py + 2) + v_px; let src_ptr2 = src.get_unchecked(offset2..(offset2 + 1)); - store0 += src_ptr0[0] as i32 * weight0 as i32; - store0 += src_ptr1[0] as i32 * weight1 as i32; - store0 += src_ptr2[0] as i32 * weight2 as i32; + store0 += 2 * src_ptr0[0] as i32 * weight0 as i32; + store0 += 2 * src_ptr1[0] as i32 * weight1 as i32; 
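
On the `vmlal_*` to `vqdmlal_*` swap in this file and the matching `2 *` factors in the scalar tail around this hunk: SQDMLAL doubles every product it accumulates, so the plain-integer fallback is doubled too, keeping both paths on the same fixed-point scale ahead of the final shift and narrow. A small illustration (the weight scale and shift are illustrative, not the crate's constants):

    // vqdmlal computes acc + 2*a*b widened to i32 (with saturation); the scalar
    // tail mirrors that with an explicit `2 *` so one common shift applies at the end.
    const SHIFT: u32 = 16; // illustrative: weight scale plus the extra doubling bit

    fn vector_like(acc: i32, px: u16, w: i16) -> i32 {
        acc.saturating_add(2 * i32::from(px) * i32::from(w)) // vqdmlal-style
    }

    fn scalar_tail(acc: i32, px: u16, w: i16) -> i32 {
        acc + 2 * i32::from(px) * i32::from(w) // the `store0 += 2 * ...` lines
    }

    fn main() {
        let px: u16 = 1000;
        let w: i16 = 1 << 14; // 0.5 under a hypothetical Q15 weight scale
        assert_eq!(vector_like(0, px, w) >> SHIFT, scalar_tail(0, px, w) >> SHIFT);
        assert_eq!(scalar_tail(0, px, w) >> SHIFT, 500); // 1000 * 0.5
    }
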
+ store0 += 2 * src_ptr2[0] as i32 * weight2 as i32; } else if bounds_size == 4 { let weights = weight.get_unchecked(0..4); let weight0 = weights[0]; @@ -451,17 +446,17 @@ pub(crate) fn convolve_column_lb_u16( let offset3 = src_stride * (py + 3) + v_px; let src_ptr3 = src.get_unchecked(offset3..(offset3 + 1)); - store0 += src_ptr0[0] as i32 * weight0 as i32; - store0 += src_ptr1[0] as i32 * weight1 as i32; - store0 += src_ptr2[0] as i32 * weight2 as i32; - store0 += src_ptr3[0] as i32 * weight3 as i32; + store0 += 2 * src_ptr0[0] as i32 * weight0 as i32; + store0 += 2 * src_ptr1[0] as i32 * weight1 as i32; + store0 += 2 * src_ptr2[0] as i32 * weight2 as i32; + store0 += 2 * src_ptr3[0] as i32 * weight3 as i32; } else { for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { let py = bounds.start + j; let offset = src_stride * py + v_px; let src_ptr = src.get_unchecked(offset..(offset + 1)); - store0 += src_ptr[0] as i32 * k_weight as i32; + store0 += 2 * src_ptr[0] as i32 * k_weight as i32; } } diff --git a/src/neon/vertical_u8.rs b/src/neon/vertical_u8.rs index a803116..ce3cc35 100644 --- a/src/neon/vertical_u8.rs +++ b/src/neon/vertical_u8.rs @@ -27,56 +27,71 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ use crate::filter_weights::FilterBounds; -use crate::neon::utils::{expand8_to_14, xvld1q_u8_x2, xvld1q_u8_x4, xvst1q_u8_x2, xvst1q_u8_x4}; -use crate::support::{PRECISION, ROUNDING_CONST}; +use crate::neon::utils::{ + vxmlal_high_lane_s16, vxmlal_high_s16, vxmlal_lane_s16, vxmlal_s16, xvld1q_u8_x2, xvld1q_u8_x4, + xvst1q_u8_x2, xvst1q_u8_x4, +}; use std::arch::aarch64::*; -macro_rules! pack_weights { - ($store_0: expr, $store_1: expr, $store_2: expr, $store_3: expr) => {{ - let low_u16 = vcombine_u16( - vqshrun_n_s32::($store_0), - vqshrun_n_s32::($store_1), - ); - let high_u16 = vcombine_u16( - vqshrun_n_s32::($store_2), - vqshrun_n_s32::($store_3), - ); - vcombine_u8(vqmovn_u16(low_u16), vqmovn_u16(high_u16)) - }}; +#[inline(always)] +unsafe fn pack_weights( + store_0: int32x4_t, + store_1: int32x4_t, + store_2: int32x4_t, + store_3: int32x4_t, +) -> uint8x16_t { + let low_u16 = vcombine_u16( + vqshrun_n_s32::(store_0), + vqshrun_n_s32::(store_1), + ); + let high_u16 = vcombine_u16( + vqshrun_n_s32::(store_2), + vqshrun_n_s32::(store_3), + ); + vcombine_u8(vqmovn_u16(low_u16), vqmovn_u16(high_u16)) } -macro_rules! 
accumulate_4_into { - ($item: expr,$store_0: expr, $store_1: expr, $store_2: expr, $store_3: expr, $weight: expr) => {{ - let low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8($item))); - let high = vreinterpretq_s16_u16(vmovl_high_u8($item)); - - $store_0 = vmlal_s16($store_0, vget_low_s16(low), vget_low_s16($weight)); - $store_1 = vmlal_high_s16($store_1, low, $weight); - $store_2 = vmlal_s16($store_2, vget_low_s16(high), vget_low_s16($weight)); - $store_3 = vmlal_high_s16($store_3, high, $weight); - }}; +#[must_use] +#[inline(always)] +unsafe fn accumulate_4_into( + item: uint8x16_t, + store_0: int32x4_t, + store_1: int32x4_t, + store_2: int32x4_t, + store_3: int32x4_t, + weight: int16x8_t, +) -> (int32x4_t, int32x4_t, int32x4_t, int32x4_t) { + let low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(item))); + let high = vreinterpretq_s16_u16(vmovl_high_u8(item)); + + let store_0 = vxmlal_s16::(store_0, vget_low_s16(low), vget_low_s16(weight)); + let store_1 = vxmlal_high_s16::(store_1, low, weight); + let store_2 = vxmlal_s16::(store_2, vget_low_s16(high), vget_low_s16(weight)); + let store_3 = vxmlal_high_s16::(store_3, high, weight); + (store_0, store_1, store_2, store_3) } -macro_rules! accumulate_4_into_lane { - ($item: expr,$store_0: expr, $store_1: expr, $store_2: expr, $store_3: expr, $weight: expr, $weight_pos: expr) => {{ - let low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8($item))); - let high = vreinterpretq_s16_u16(vmovl_high_u8($item)); - - $store_0 = vmlal_lane_s16::<$weight_pos>($store_0, vget_low_s16(low), $weight); - $store_1 = vmlal_high_lane_s16::<$weight_pos>($store_1, low, $weight); - $store_2 = vmlal_lane_s16::<$weight_pos>($store_2, vget_low_s16(high), $weight); - $store_3 = vmlal_high_lane_s16::<$weight_pos>($store_3, high, $weight); - }}; +#[must_use] +#[inline(always)] +unsafe fn accumulate_4_into_lane( + item: uint8x16_t, + store_0: int32x4_t, + store_1: int32x4_t, + store_2: int32x4_t, + store_3: int32x4_t, + weight: int16x4_t, +) -> (int32x4_t, int32x4_t, int32x4_t, int32x4_t) { + let low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(item))); + let high = vreinterpretq_s16_u16(vmovl_high_u8(item)); + + let store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low), weight); + let store_1 = vxmlal_high_lane_s16::(store_1, low, weight); + let store_2 = vxmlal_lane_s16::(store_2, vget_low_s16(high), weight); + let store_3 = vxmlal_high_lane_s16::(store_3, high, weight); + (store_0, store_1, store_2, store_3) } -/// Checking NEON `rdm` availability is required before a call. -/// -/// RDM feature has slightly lower precision and won't work really well on huge kernel which -/// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. -/// -/// # Safety -/// - Check `rdm` availability before the call. 
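
Before the removed `rdm` row variant below, a note on the `pack_weights` helper introduced above: the four i32 accumulators are brought back to bytes with a saturating shift-right-and-narrow into u16 (`vqshrun_n_s32`) followed by a saturating narrow to u8 (`vqmovn_u16`); the macro it replaces did the same computation. A scalar model of the two-stage clamp (the shift amount below is illustrative; the const parameter the real helper receives is not visible in this rendering):

    // Scalar model of one lane of pack_weights: shift out the fractional bits,
    // clamp into u16 (vqshrun_n_s32), then clamp into u8 (vqmovn_u16).
    fn pack_one<const PRECISION: i32>(acc: i32) -> u8 {
        let shifted = acc >> PRECISION;
        let as_u16 = shifted.clamp(0, u16::MAX as i32) as u16;
        as_u16.min(u8::MAX as u16) as u8
    }

    fn main() {
        assert_eq!(pack_one::<15>(200 << 15), 200);
        assert_eq!(pack_one::<15>(-5), 0);          // negative ringing clamps to 0
        assert_eq!(pack_one::<15>(300 << 15), 255); // overshoot clamps to 255
    }
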
-pub(crate) fn convolve_vertical_neon_i16_precision( +pub(crate) fn convolve_vertical_neon_i32_precision( width: usize, bounds: &FilterBounds, src: &[u8], @@ -84,12 +99,12 @@ pub(crate) fn convolve_vertical_neon_i16_precision( src_stride: usize, weight: &[i16], ) { - unsafe { - convolve_vertical_neon_row_upper(width, bounds, src, dst, src_stride, weight); - } + convolve_vertical_neon_row_full::( + width, bounds, src, dst, src_stride, weight, + ); } -pub(crate) fn convolve_vertical_neon_i32_precision( +pub(crate) fn convolve_vertical_neon_i32_precision_d( width: usize, bounds: &FilterBounds, src: &[u8], @@ -97,553 +112,10 @@ pub(crate) fn convolve_vertical_neon_i32_precision( src_stride: usize, weight: &[i16], ) { - convolve_vertical_neon_row_full(width, bounds, src, dst, src_stride, weight); + convolve_vertical_neon_row_full::(width, bounds, src, dst, src_stride, weight); } -#[must_use] -#[inline(always)] -unsafe fn vdot( - store0: int16x8_t, - store1: int16x8_t, - row: uint8x16_t, - weight: int16x8_t, -) -> (int16x8_t, int16x8_t) { - let lo0 = vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip1q_u8(row, row)))); - let store0 = vqrdmlahq_s16(store0, lo0, weight); - let hi0 = vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip2q_u8(row, row)))); - let store1 = vqrdmlahq_s16(store1, hi0, weight); - (store0, store1) -} - -#[must_use] -#[inline(always)] -unsafe fn vdot_lane( - store0: int16x8_t, - store1: int16x8_t, - row: uint8x16_t, - weight: int16x4_t, -) -> (int16x8_t, int16x8_t) { - let lo0 = vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip1q_u8(row, row)))); - let store0 = vqrdmlahq_lane_s16::(store0, lo0, weight); - let hi0 = vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip2q_u8(row, row)))); - let store1 = vqrdmlahq_lane_s16::(store1, hi0, weight); - (store0, store1) -} - -#[target_feature(enable = "rdm")] -unsafe fn convolve_vertical_neon_row_upper( - _: usize, - bounds: &FilterBounds, - src: &[u8], - dst: &mut [u8], - src_stride: usize, - weight: &[i16], -) { - let mut cx = 0usize; - - let iter_64 = dst.chunks_exact_mut(64); - - let bounds_size = bounds.size; - const SCALE: i32 = 6; - const R_SHR_SCALE: i32 = SCALE; - const ROUNDING: i16 = 1 << (SCALE - 1); - - for dst in iter_64 { - let vld = vdupq_n_s16(ROUNDING); - - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let mut store_4 = vld; - let mut store_5 = vld; - let mut store_6 = vld; - let mut store_7 = vld; - - let px = cx; - - if bounds_size == 2 { - let py = bounds.start; - let weight = weight.get_unchecked(0..2); - let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - - let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); - (store_4, store_5) = vdot_lane::(store_4, store_5, items0.2, v_weight); - (store_6, store_7) = vdot_lane::(store_6, store_7, items0.3, v_weight); - - let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); - (store_4, store_5) = vdot_lane::(store_4, store_5, items1.2, v_weight); - (store_6, store_7) = vdot_lane::(store_6, store_7, items1.3, v_weight); - } else 
if bounds_size == 3 { - let py = bounds.start; - let weight = weight.get_unchecked(0..3); - let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - - let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); - (store_4, store_5) = vdot_lane::(store_4, store_5, items0.2, v_weight); - (store_6, store_7) = vdot_lane::(store_6, store_7, items0.3, v_weight); - - let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); - (store_4, store_5) = vdot_lane::(store_4, store_5, items1.2, v_weight); - (store_6, store_7) = vdot_lane::(store_6, store_7, items1.3, v_weight); - - let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); - (store_4, store_5) = vdot_lane::(store_4, store_5, items2.2, v_weight); - (store_6, store_7) = vdot_lane::(store_6, store_7, items2.3, v_weight); - } else if bounds_size == 4 { - let py = bounds.start; - let weight = weight.get_unchecked(0..4); - let v_weight = vld1_s16(weight.as_ptr()); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - - let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); - (store_4, store_5) = vdot_lane::(store_4, store_5, items0.2, v_weight); - (store_6, store_7) = vdot_lane::(store_6, store_7, items0.3, v_weight); - - let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); - (store_4, store_5) = vdot_lane::(store_4, store_5, items1.2, v_weight); - (store_6, store_7) = vdot_lane::(store_6, store_7, items1.3, v_weight); - - let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); - (store_4, store_5) = vdot_lane::(store_4, store_5, items2.2, v_weight); - (store_6, store_7) = vdot_lane::(store_6, store_7, items2.3, v_weight); - - let items3 = xvld1q_u8_x4(src_ptr3.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items3.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items3.1, v_weight); - (store_4, store_5) = vdot_lane::(store_4, store_5, items3.2, v_weight); - (store_6, store_7) = vdot_lane::(store_6, store_7, items3.3, v_weight); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..); - let v_weight = vld1q_dup_s16(weight.as_ptr()); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - let items = xvld1q_u8_x4(src_ptr.as_ptr()); 
- - (store_0, store_1) = vdot::(store_0, store_1, items.0, v_weight); - (store_2, store_3) = vdot::(store_2, store_3, items.1, v_weight); - (store_4, store_5) = vdot::(store_4, store_5, items.2, v_weight); - (store_6, store_7) = vdot::(store_6, store_7, items.3, v_weight); - } - } - - let item00 = vqshrun_n_s16::(store_0); - let item01 = vqshrun_n_s16::(store_1); - let item10 = vqshrun_n_s16::(store_2); - let item11 = vqshrun_n_s16::(store_3); - let item20 = vqshrun_n_s16::(store_4); - let item21 = vqshrun_n_s16::(store_5); - let item30 = vqshrun_n_s16::(store_6); - let item31 = vqshrun_n_s16::(store_7); - let item0 = vcombine_u8(item00, item01); - let item1 = vcombine_u8(item10, item11); - let item2 = vcombine_u8(item20, item21); - let item3 = vcombine_u8(item30, item31); - - let dst_items = uint8x16x4_t(item0, item1, item2, item3); - xvst1q_u8_x4(dst.as_mut_ptr(), dst_items); - - cx += 64; - } - - let mut rem = dst.chunks_exact_mut(64).into_remainder(); - let iter_32 = rem.chunks_exact_mut(32); - - for dst in iter_32 { - let vld = vdupq_n_s16(ROUNDING); - let mut store_0 = vld; - let mut store_1 = vld; - let mut store_2 = vld; - let mut store_3 = vld; - - let px = cx; - - if bounds_size == 2 { - let py = bounds.start; - let weight = weight.get_unchecked(0..2); - let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - - let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); - - let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); - } else if bounds_size == 3 { - let py = bounds.start; - let weight = weight.get_unchecked(0..3); - let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - - let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); - - let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); - - let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); - } else if bounds_size == 4 { - let py = bounds.start; - let weight = weight.get_unchecked(0..4); - let v_weight = vld1_s16(weight.as_ptr()); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - - let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, 
v_weight); - - let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); - - let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); - - let items3 = xvld1q_u8_x2(src_ptr3.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, items3.0, v_weight); - (store_2, store_3) = vdot_lane::(store_2, store_3, items3.1, v_weight); - } else { - for j in 0..bounds.size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..); - let v_weight = vld1q_dup_s16(weight.as_ptr()); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - let items = xvld1q_u8_x2(src_ptr.as_ptr()); - - (store_0, store_1) = vdot::(store_0, store_1, items.0, v_weight); - (store_2, store_3) = vdot::(store_2, store_3, items.1, v_weight); - } - } - - let item00 = vqshrun_n_s16::(store_0); - let item01 = vqshrun_n_s16::(store_1); - let item10 = vqshrun_n_s16::(store_2); - let item11 = vqshrun_n_s16::(store_3); - let item0 = vcombine_u8(item00, item01); - let item1 = vcombine_u8(item10, item11); - - let dst_items = uint8x16x2_t(item0, item1); - xvst1q_u8_x2(dst.as_mut_ptr(), dst_items); - - cx += 32; - } - - rem = rem.chunks_exact_mut(32).into_remainder(); - let iter_16 = rem.chunks_exact_mut(16); - - for dst in iter_16 { - let vld = vdupq_n_s16(ROUNDING); - let mut store_0 = vld; - let mut store_1 = vld; - - let px = cx; - - if bounds_size == 2 { - let py = bounds.start; - let weight = weight.get_unchecked(0..2); - let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - - let item0 = vld1q_u8(src_ptr0.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, item0, v_weight); - - let item1 = vld1q_u8(src_ptr1.as_ptr()); - (store_0, store_1) = vdot_lane::(store_0, store_1, item1, v_weight); - } else if bounds_size == 3 { - let py = bounds.start; - let weight = weight.get_unchecked(0..3); - let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - - let item0 = vld1q_u8(src_ptr0.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, item0, v_weight); - - let item1 = vld1q_u8(src_ptr1.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, item1, v_weight); - - let item2 = vld1q_u8(src_ptr2.as_ptr()); - - (store_0, store_1) = vdot_lane::(store_0, store_1, item2, v_weight); - } else if bounds_size == 4 { - let py = bounds.start; - let weight = weight.get_unchecked(0..4); - let v_weight = vld1_s16(weight.as_ptr()); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - - let item0 = vld1q_u8(src_ptr0.as_ptr()); - (store_0, store_1) = vdot_lane::(store_0, store_1, item0, v_weight); - - let item1 = vld1q_u8(src_ptr1.as_ptr()); - (store_0, store_1) = 
vdot_lane::(store_0, store_1, item1, v_weight); - - let item2 = vld1q_u8(src_ptr2.as_ptr()); - (store_0, store_1) = vdot_lane::(store_0, store_1, item2, v_weight); - - let item3 = vld1q_u8(src_ptr3.as_ptr()); - (store_0, store_1) = vdot_lane::(store_0, store_1, item3, v_weight); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..); - let v_weight = vld1q_dup_s16(weight.as_ptr()); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - let item_row = vld1q_u8(src_ptr.as_ptr()); - - (store_0, store_1) = vdot::(store_0, store_1, item_row, v_weight); - } - } - - let item0 = vqshrun_n_s16::(store_0); - let item1 = vqshrun_n_s16::(store_1); - - vst1q_u8(dst.as_mut_ptr(), vcombine_u8(item0, item1)); - - cx += 16; - } - - rem = rem.chunks_exact_mut(16).into_remainder(); - let iter_8 = rem.chunks_exact_mut(8); - - for dst in iter_8 { - let vld = vdupq_n_s16(ROUNDING); - let mut store_0 = vld; - - let px = cx; - - if bounds_size == 2 { - let py = bounds.start; - let weight = weight.get_unchecked(0..2); - let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - - let item0 = vld1_u8(src_ptr0.as_ptr()); - let low0 = expand8_to_14(item0); - store_0 = vqrdmlahq_lane_s16::<0>(store_0, low0, v_weight); - - let item1 = vld1_u8(src_ptr1.as_ptr()); - let low1 = expand8_to_14(item1); - store_0 = vqrdmlahq_lane_s16::<1>(store_0, low1, v_weight); - } else if bounds_size == 3 { - let py = bounds.start; - let weight = weight.get_unchecked(0..3); - let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - - let item0 = vld1_u8(src_ptr0.as_ptr()); - let low0 = expand8_to_14(item0); - store_0 = vqrdmlahq_lane_s16::<0>(store_0, low0, v_weight); - - let item1 = vld1_u8(src_ptr1.as_ptr()); - let low1 = expand8_to_14(item1); - store_0 = vqrdmlahq_lane_s16::<1>(store_0, low1, v_weight); - - let item2 = vld1_u8(src_ptr2.as_ptr()); - let low2 = expand8_to_14(item2); - store_0 = vqrdmlahq_lane_s16::<2>(store_0, low2, v_weight); - } else if bounds_size == 4 { - let py = bounds.start; - let weight = weight.get_unchecked(0..4); - let v_weight = vld1_s16(weight.as_ptr()); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - - let item0 = vld1_u8(src_ptr0.as_ptr()); - let low0 = expand8_to_14(item0); - store_0 = vqrdmlahq_lane_s16::<0>(store_0, low0, v_weight); - - let item1 = vld1_u8(src_ptr1.as_ptr()); - let low1 = expand8_to_14(item1); - store_0 = vqrdmlahq_lane_s16::<1>(store_0, low1, v_weight); - - let item2 = vld1_u8(src_ptr2.as_ptr()); - let low2 = expand8_to_14(item2); - store_0 = vqrdmlahq_lane_s16::<2>(store_0, low2, v_weight); - - let item3 = vld1_u8(src_ptr3.as_ptr()); - let low3 = expand8_to_14(item3); - store_0 = vqrdmlahq_lane_s16::<3>(store_0, low3, v_weight); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..); - let v_weight = 
vld1q_dup_s16(weight.as_ptr()); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - let item_row = vld1_u8(src_ptr.as_ptr()); - - let low = expand8_to_14(item_row); - store_0 = vqrdmlahq_s16(store_0, low, v_weight); - } - } - - let item = vqshrun_n_s16::(store_0); - vst1_u8(dst.as_mut_ptr(), item); - - cx += 8; - } - - rem = rem.chunks_exact_mut(8).into_remainder(); - let iter_1 = rem.iter_mut(); - - for dst in iter_1 { - let vld = vdupq_n_s16(ROUNDING); - let mut store = vld; - - let px = cx; - - if bounds_size == 2 { - let py = bounds.start; - let weight = weight.get_unchecked(0..2); - let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - - let items0 = vld1_dup_u8(src_ptr0.as_ptr()); - let low0 = expand8_to_14(items0); - store = vqrdmlahq_lane_s16::<0>(store, low0, v_weight); - - let items1 = vld1_dup_u8(src_ptr1.as_ptr()); - let low1 = expand8_to_14(items1); - store = vqrdmlahq_lane_s16::<1>(store, low1, v_weight); - } else if bounds_size == 3 { - let py = bounds.start; - let weight = weight.get_unchecked(0..3); - let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); - v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - - let items0 = vld1_dup_u8(src_ptr0.as_ptr()); - let low0 = expand8_to_14(items0); - store = vqrdmlahq_lane_s16::<0>(store, low0, v_weight); - - let items1 = vld1_dup_u8(src_ptr1.as_ptr()); - let low1 = expand8_to_14(items1); - store = vqrdmlahq_lane_s16::<1>(store, low1, v_weight); - - let items2 = vld1_dup_u8(src_ptr2.as_ptr()); - let low2 = expand8_to_14(items2); - store = vqrdmlahq_lane_s16::<2>(store, low2, v_weight); - } else if bounds_size == 4 { - let py = bounds.start; - let weight = weight.get_unchecked(0..4); - let v_weight = vld1_s16(weight.as_ptr()); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - - let items0 = vld1_dup_u8(src_ptr0.as_ptr()); - let low0 = expand8_to_14(items0); - store = vqrdmlahq_lane_s16::<0>(store, low0, v_weight); - - let items1 = vld1_dup_u8(src_ptr1.as_ptr()); - let low1 = expand8_to_14(items1); - store = vqrdmlahq_lane_s16::<1>(store, low1, v_weight); - - let items2 = vld1_dup_u8(src_ptr2.as_ptr()); - let low2 = expand8_to_14(items2); - store = vqrdmlahq_lane_s16::<2>(store, low2, v_weight); - - let items3 = vld1_dup_u8(src_ptr3.as_ptr()); - let low3 = expand8_to_14(items3); - store = vqrdmlahq_lane_s16::<3>(store, low3, v_weight); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..); - let v_weight = vld1q_dup_s16(weight.as_ptr()); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - let item_row = vld1_dup_u8(src_ptr.as_ptr()); - - let low = expand8_to_14(item_row); - store = vqrdmlahq_s16(store, low, v_weight); - } - } - - let shrinked_store = vqshrun_n_s16::(store); - let value = vget_lane_u8::<0>(shrinked_store); - *dst = value; - cx += 1; - } -} - -fn convolve_vertical_neon_row_full( +fn convolve_vertical_neon_row_full( _: usize, bounds: 
&FilterBounds, src: &[u8], @@ -652,6 +124,7 @@ fn convolve_vertical_neon_row_full( weight: &[i16], ) { let mut cx = 0usize; + let rnd_const: i32 = (1 << (PRECISION - 1)) - 1; unsafe { let iter_64 = dst.chunks_exact_mut(64); @@ -659,7 +132,7 @@ fn convolve_vertical_neon_row_full( let bounds_size = bounds.size; for dst in iter_64 { - let vld = vdupq_n_s32(ROUNDING_CONST); + let vld = vdupq_n_s32(rnd_const); let mut store_0 = vld; let mut store_1 = vld; let mut store_2 = vld; @@ -691,24 +164,32 @@ fn convolve_vertical_neon_row_full( let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); - accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); - accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); - accumulate_4_into_lane!( - items0.2, store_8, store_9, store_10, store_11, v_weight, 0 + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items0.0, store_0, store_1, store_2, store_3, v_weight, ); - accumulate_4_into_lane!( - items0.3, store_12, store_13, store_14, store_15, v_weight, 0 + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items0.1, store_4, store_5, store_6, store_7, v_weight, + ); + (store_8, store_9, store_10, store_11) = accumulate_4_into_lane::( + items0.2, store_8, store_9, store_10, store_11, v_weight, + ); + (store_12, store_13, store_14, store_15) = accumulate_4_into_lane::( + items0.3, store_12, store_13, store_14, store_15, v_weight, ); let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); - accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); - accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); - accumulate_4_into_lane!( - items1.2, store_8, store_9, store_10, store_11, v_weight, 1 + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items1.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items1.1, store_4, store_5, store_6, store_7, v_weight, ); - accumulate_4_into_lane!( - items1.3, store_12, store_13, store_14, store_15, v_weight, 1 + (store_8, store_9, store_10, store_11) = accumulate_4_into_lane::( + items1.2, store_8, store_9, store_10, store_11, v_weight, + ); + (store_12, store_13, store_14, store_15) = accumulate_4_into_lane::( + items1.3, store_12, store_13, store_14, store_15, v_weight, ); } else if bounds_size == 3 { let py = bounds.start; @@ -722,35 +203,47 @@ fn convolve_vertical_neon_row_full( let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); - accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); - accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); - accumulate_4_into_lane!( - items0.2, store_8, store_9, store_10, store_11, v_weight, 0 + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items0.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items0.1, store_4, store_5, store_6, store_7, v_weight, ); - accumulate_4_into_lane!( - items0.3, store_12, store_13, store_14, store_15, v_weight, 0 + (store_8, store_9, store_10, store_11) = accumulate_4_into_lane::( + items0.2, store_8, store_9, store_10, store_11, v_weight, + ); + (store_12, store_13, store_14, store_15) = accumulate_4_into_lane::( + items0.3, store_12, store_13, store_14, store_15, v_weight, ); let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); - accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 
1); - accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); - accumulate_4_into_lane!( - items1.2, store_8, store_9, store_10, store_11, v_weight, 1 + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items1.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items1.1, store_4, store_5, store_6, store_7, v_weight, ); - accumulate_4_into_lane!( - items1.3, store_12, store_13, store_14, store_15, v_weight, 1 + (store_8, store_9, store_10, store_11) = accumulate_4_into_lane::( + items1.2, store_8, store_9, store_10, store_11, v_weight, + ); + (store_12, store_13, store_14, store_15) = accumulate_4_into_lane::( + items1.3, store_12, store_13, store_14, store_15, v_weight, ); let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); - accumulate_4_into_lane!(items2.0, store_0, store_1, store_2, store_3, v_weight, 2); - accumulate_4_into_lane!(items2.1, store_4, store_5, store_6, store_7, v_weight, 2); - accumulate_4_into_lane!( - items2.2, store_8, store_9, store_10, store_11, v_weight, 2 + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items2.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items2.1, store_4, store_5, store_6, store_7, v_weight, ); - accumulate_4_into_lane!( - items2.3, store_12, store_13, store_14, store_15, v_weight, 2 + (store_8, store_9, store_10, store_11) = accumulate_4_into_lane::( + items2.2, store_8, store_9, store_10, store_11, v_weight, + ); + (store_12, store_13, store_14, store_15) = accumulate_4_into_lane::( + items2.3, store_12, store_13, store_14, store_15, v_weight, ); } else if bounds_size == 4 { let py = bounds.start; @@ -763,46 +256,62 @@ fn convolve_vertical_neon_row_full( let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); - accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); - accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); - accumulate_4_into_lane!( - items0.2, store_8, store_9, store_10, store_11, v_weight, 0 + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items0.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items0.1, store_4, store_5, store_6, store_7, v_weight, + ); + (store_8, store_9, store_10, store_11) = accumulate_4_into_lane::( + items0.2, store_8, store_9, store_10, store_11, v_weight, ); - accumulate_4_into_lane!( - items0.3, store_12, store_13, store_14, store_15, v_weight, 0 + (store_12, store_13, store_14, store_15) = accumulate_4_into_lane::( + items0.3, store_12, store_13, store_14, store_15, v_weight, ); let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); - accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); - accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); - accumulate_4_into_lane!( - items1.2, store_8, store_9, store_10, store_11, v_weight, 1 + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items1.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items1.1, store_4, store_5, store_6, store_7, v_weight, + ); + (store_8, store_9, store_10, store_11) = accumulate_4_into_lane::( + items1.2, store_8, store_9, store_10, store_11, v_weight, ); - accumulate_4_into_lane!( - items1.3, store_12, store_13, store_14, store_15, v_weight, 1 + 
(store_12, store_13, store_14, store_15) = accumulate_4_into_lane::( + items1.3, store_12, store_13, store_14, store_15, v_weight, ); let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); - accumulate_4_into_lane!(items2.0, store_0, store_1, store_2, store_3, v_weight, 2); - accumulate_4_into_lane!(items2.1, store_4, store_5, store_6, store_7, v_weight, 2); - accumulate_4_into_lane!( - items2.2, store_8, store_9, store_10, store_11, v_weight, 2 + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items2.0, store_0, store_1, store_2, store_3, v_weight, ); - accumulate_4_into_lane!( - items2.3, store_12, store_13, store_14, store_15, v_weight, 2 + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items2.1, store_4, store_5, store_6, store_7, v_weight, + ); + (store_8, store_9, store_10, store_11) = accumulate_4_into_lane::( + items2.2, store_8, store_9, store_10, store_11, v_weight, + ); + (store_12, store_13, store_14, store_15) = accumulate_4_into_lane::( + items2.3, store_12, store_13, store_14, store_15, v_weight, ); let items3 = xvld1q_u8_x4(src_ptr3.as_ptr()); - accumulate_4_into_lane!(items3.0, store_0, store_1, store_2, store_3, v_weight, 3); - accumulate_4_into_lane!(items3.1, store_4, store_5, store_6, store_7, v_weight, 3); - accumulate_4_into_lane!( - items3.2, store_8, store_9, store_10, store_11, v_weight, 3 + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items3.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items3.1, store_4, store_5, store_6, store_7, v_weight, + ); + (store_8, store_9, store_10, store_11) = accumulate_4_into_lane::( + items3.2, store_8, store_9, store_10, store_11, v_weight, ); - accumulate_4_into_lane!( - items3.3, store_12, store_13, store_14, store_15, v_weight, 3 + (store_12, store_13, store_14, store_15) = accumulate_4_into_lane::( + items3.3, store_12, store_13, store_14, store_15, v_weight, ); } else { for j in 0..bounds_size { @@ -812,17 +321,25 @@ fn convolve_vertical_neon_row_full( let src_ptr = src.get_unchecked((src_stride * py + px)..); let items = xvld1q_u8_x4(src_ptr.as_ptr()); - accumulate_4_into!(items.0, store_0, store_1, store_2, store_3, v_weight); - accumulate_4_into!(items.1, store_4, store_5, store_6, store_7, v_weight); - accumulate_4_into!(items.2, store_8, store_9, store_10, store_11, v_weight); - accumulate_4_into!(items.3, store_12, store_13, store_14, store_15, v_weight); + (store_0, store_1, store_2, store_3) = accumulate_4_into::( + items.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into::( + items.1, store_4, store_5, store_6, store_7, v_weight, + ); + (store_8, store_9, store_10, store_11) = accumulate_4_into::( + items.2, store_8, store_9, store_10, store_11, v_weight, + ); + (store_12, store_13, store_14, store_15) = accumulate_4_into::( + items.3, store_12, store_13, store_14, store_15, v_weight, + ); } } - let item_0 = pack_weights!(store_0, store_1, store_2, store_3); - let item_1 = pack_weights!(store_4, store_5, store_6, store_7); - let item_2 = pack_weights!(store_8, store_9, store_10, store_11); - let item_3 = pack_weights!(store_12, store_13, store_14, store_15); + let item_0 = pack_weights::(store_0, store_1, store_2, store_3); + let item_1 = pack_weights::(store_4, store_5, store_6, store_7); + let item_2 = pack_weights::(store_8, store_9, store_10, store_11); + let item_3 = pack_weights::(store_12, store_13, store_14, store_15); 
let dst_items = uint8x16x4_t(item_0, item_1, item_2, item_3); xvst1q_u8_x4(dst.as_mut_ptr(), dst_items); @@ -834,7 +351,7 @@ fn convolve_vertical_neon_row_full( let iter_32 = rem.chunks_exact_mut(32); for dst in iter_32 { - let vld = vdupq_n_s32(ROUNDING_CONST); + let vld = vdupq_n_s32(rnd_const); let mut store_0 = vld; let mut store_1 = vld; let mut store_2 = vld; @@ -854,13 +371,21 @@ fn convolve_vertical_neon_row_full( let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); - accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items0.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items0.1, store_4, store_5, store_6, store_7, v_weight, + ); let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); - accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items1.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items1.1, store_4, store_5, store_6, store_7, v_weight, + ); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); @@ -872,18 +397,30 @@ fn convolve_vertical_neon_row_full( let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); - accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items0.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items0.1, store_4, store_5, store_6, store_7, v_weight, + ); let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); - accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items1.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items1.1, store_4, store_5, store_6, store_7, v_weight, + ); let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); - accumulate_4_into_lane!(items2.0, store_0, store_1, store_2, store_3, v_weight, 2); - accumulate_4_into_lane!(items2.1, store_4, store_5, store_6, store_7, v_weight, 2); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items2.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items2.1, store_4, store_5, store_6, store_7, v_weight, + ); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); @@ -894,23 +431,39 @@ fn convolve_vertical_neon_row_full( let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); - accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); + (store_0, store_1, 
store_2, store_3) = accumulate_4_into_lane::( + items0.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items0.1, store_4, store_5, store_6, store_7, v_weight, + ); let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); - accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items1.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items1.1, store_4, store_5, store_6, store_7, v_weight, + ); let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); - accumulate_4_into_lane!(items2.0, store_0, store_1, store_2, store_3, v_weight, 2); - accumulate_4_into_lane!(items2.1, store_4, store_5, store_6, store_7, v_weight, 2); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items2.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items2.1, store_4, store_5, store_6, store_7, v_weight, + ); let items3 = xvld1q_u8_x2(src_ptr3.as_ptr()); - accumulate_4_into_lane!(items3.0, store_0, store_1, store_2, store_3, v_weight, 3); - accumulate_4_into_lane!(items3.1, store_4, store_5, store_6, store_7, v_weight, 3); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + items3.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into_lane::( + items3.1, store_4, store_5, store_6, store_7, v_weight, + ); } else { for j in 0..bounds.size { let py = bounds.start + j; @@ -919,13 +472,17 @@ fn convolve_vertical_neon_row_full( let src_ptr = src.get_unchecked((src_stride * py + px)..); let items = xvld1q_u8_x2(src_ptr.as_ptr()); - accumulate_4_into!(items.0, store_0, store_1, store_2, store_3, v_weight); - accumulate_4_into!(items.1, store_4, store_5, store_6, store_7, v_weight); + (store_0, store_1, store_2, store_3) = accumulate_4_into::( + items.0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_4, store_5, store_6, store_7) = accumulate_4_into::( + items.1, store_4, store_5, store_6, store_7, v_weight, + ); } } - let item_0 = pack_weights!(store_0, store_1, store_2, store_3); - let item_1 = pack_weights!(store_4, store_5, store_6, store_7); + let item_0 = pack_weights::(store_0, store_1, store_2, store_3); + let item_1 = pack_weights::(store_4, store_5, store_6, store_7); let dst_items = uint8x16x2_t(item_0, item_1); xvst1q_u8_x2(dst.as_mut_ptr(), dst_items); @@ -937,7 +494,7 @@ fn convolve_vertical_neon_row_full( let iter_16 = rem.chunks_exact_mut(16); for dst in iter_16 { - let vld = vdupq_n_s32(ROUNDING_CONST); + let vld = vdupq_n_s32(rnd_const); let mut store_0 = vld; let mut store_1 = vld; let mut store_2 = vld; @@ -953,8 +510,12 @@ fn convolve_vertical_neon_row_full( let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let item_row0 = vld1q_u8(src_ptr0.as_ptr()); let item_row1 = vld1q_u8(src_ptr1.as_ptr()); - accumulate_4_into_lane!(item_row0, store_0, store_1, store_2, store_3, v_weight, 0); - accumulate_4_into_lane!(item_row1, store_0, store_1, store_2, store_3, v_weight, 1); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + item_row0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + item_row1, store_0, store_1, store_2, store_3, 
v_weight, + ); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); @@ -967,9 +528,15 @@ fn convolve_vertical_neon_row_full( let item_row0 = vld1q_u8(src_ptr0.as_ptr()); let item_row1 = vld1q_u8(src_ptr1.as_ptr()); let item_row2 = vld1q_u8(src_ptr2.as_ptr()); - accumulate_4_into_lane!(item_row0, store_0, store_1, store_2, store_3, v_weight, 0); - accumulate_4_into_lane!(item_row1, store_0, store_1, store_2, store_3, v_weight, 1); - accumulate_4_into_lane!(item_row2, store_0, store_1, store_2, store_3, v_weight, 2); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + item_row0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + item_row1, store_0, store_1, store_2, store_3, v_weight, + ); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + item_row2, store_0, store_1, store_2, store_3, v_weight, + ); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); @@ -982,10 +549,18 @@ fn convolve_vertical_neon_row_full( let item_row1 = vld1q_u8(src_ptr1.as_ptr()); let item_row2 = vld1q_u8(src_ptr2.as_ptr()); let item_row3 = vld1q_u8(src_ptr3.as_ptr()); - accumulate_4_into_lane!(item_row0, store_0, store_1, store_2, store_3, v_weight, 0); - accumulate_4_into_lane!(item_row1, store_0, store_1, store_2, store_3, v_weight, 1); - accumulate_4_into_lane!(item_row2, store_0, store_1, store_2, store_3, v_weight, 2); - accumulate_4_into_lane!(item_row3, store_0, store_1, store_2, store_3, v_weight, 3); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + item_row0, store_0, store_1, store_2, store_3, v_weight, + ); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + item_row1, store_0, store_1, store_2, store_3, v_weight, + ); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + item_row2, store_0, store_1, store_2, store_3, v_weight, + ); + (store_0, store_1, store_2, store_3) = accumulate_4_into_lane::( + item_row3, store_0, store_1, store_2, store_3, v_weight, + ); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -993,11 +568,13 @@ fn convolve_vertical_neon_row_full( let v_weight = vld1q_dup_s16(weight.as_ptr()); let src_ptr = src.get_unchecked((src_stride * py + px)..); let item_row = vld1q_u8(src_ptr.as_ptr()); - accumulate_4_into!(item_row, store_0, store_1, store_2, store_3, v_weight); + (store_0, store_1, store_2, store_3) = accumulate_4_into::( + item_row, store_0, store_1, store_2, store_3, v_weight, + ); } } - let item = pack_weights!(store_0, store_1, store_2, store_3); + let item = pack_weights::(store_0, store_1, store_2, store_3); vst1q_u8(dst.as_mut_ptr(), item); @@ -1008,7 +585,7 @@ fn convolve_vertical_neon_row_full( let iter_8 = rem.chunks_exact_mut(8); for dst in iter_8 { - let vld = vdupq_n_s32(ROUNDING_CONST); + let vld = vdupq_n_s32(rnd_const); let mut store_0 = vld; let mut store_1 = vld; @@ -1025,10 +602,10 @@ fn convolve_vertical_neon_row_full( let low0 = vreinterpretq_s16_u16(vmovl_u8(item_row0)); let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); - store_0 = vmlal_lane_s16::<0>(store_0, vget_low_s16(low0), v_weight); - store_1 = vmlal_high_lane_s16::<0>(store_1, low0, v_weight); - store_0 = vmlal_lane_s16::<1>(store_0, vget_low_s16(low1), v_weight); - store_1 = vmlal_high_lane_s16::<1>(store_1, low1, v_weight); + store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low0), v_weight); + store_1 = vxmlal_high_lane_s16::(store_1, low0, 
v_weight); + store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low1), v_weight); + store_1 = vxmlal_high_lane_s16::(store_1, low1, v_weight); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); @@ -1045,12 +622,12 @@ fn convolve_vertical_neon_row_full( let low0 = vreinterpretq_s16_u16(vmovl_u8(item_row0)); let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); let low2 = vreinterpretq_s16_u16(vmovl_u8(item_row2)); - store_0 = vmlal_lane_s16::<0>(store_0, vget_low_s16(low0), v_weight); - store_1 = vmlal_high_lane_s16::<0>(store_1, low0, v_weight); - store_0 = vmlal_lane_s16::<1>(store_0, vget_low_s16(low1), v_weight); - store_1 = vmlal_high_lane_s16::<1>(store_1, low1, v_weight); - store_0 = vmlal_lane_s16::<2>(store_0, vget_low_s16(low2), v_weight); - store_1 = vmlal_high_lane_s16::<3>(store_1, low2, v_weight); + store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low0), v_weight); + store_1 = vxmlal_high_lane_s16::(store_1, low0, v_weight); + store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low1), v_weight); + store_1 = vxmlal_high_lane_s16::(store_1, low1, v_weight); + store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low2), v_weight); + store_1 = vxmlal_high_lane_s16::(store_1, low2, v_weight); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); @@ -1068,14 +645,14 @@ fn convolve_vertical_neon_row_full( let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); let low2 = vreinterpretq_s16_u16(vmovl_u8(item_row2)); let low3 = vreinterpretq_s16_u16(vmovl_u8(item_row3)); - store_0 = vmlal_lane_s16::<0>(store_0, vget_low_s16(low0), v_weight); - store_1 = vmlal_high_lane_s16::<0>(store_1, low0, v_weight); - store_0 = vmlal_lane_s16::<1>(store_0, vget_low_s16(low1), v_weight); - store_1 = vmlal_high_lane_s16::<1>(store_1, low1, v_weight); - store_0 = vmlal_lane_s16::<2>(store_0, vget_low_s16(low2), v_weight); - store_1 = vmlal_high_lane_s16::<2>(store_1, low2, v_weight); - store_0 = vmlal_lane_s16::<3>(store_0, vget_low_s16(low3), v_weight); - store_1 = vmlal_high_lane_s16::<3>(store_1, low3, v_weight); + store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low0), v_weight); + store_1 = vxmlal_high_lane_s16::(store_1, low0, v_weight); + store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low1), v_weight); + store_1 = vxmlal_high_lane_s16::(store_1, low1, v_weight); + store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low2), v_weight); + store_1 = vxmlal_high_lane_s16::(store_1, low2, v_weight); + store_0 = vxmlal_lane_s16::(store_0, vget_low_s16(low3), v_weight); + store_1 = vxmlal_high_lane_s16::(store_1, low3, v_weight); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -1085,8 +662,8 @@ fn convolve_vertical_neon_row_full( let item_row = vld1_u8(src_ptr.as_ptr()); let low = vreinterpretq_s16_u16(vmovl_u8(item_row)); - store_0 = vmlal_s16(store_0, vget_low_s16(low), vget_low_s16(v_weight)); - store_1 = vmlal_high_s16(store_1, low, v_weight); + store_0 = vxmlal_s16::(store_0, vget_low_s16(low), vget_low_s16(v_weight)); + store_1 = vxmlal_high_s16::(store_1, low, v_weight); } } @@ -1106,7 +683,7 @@ fn convolve_vertical_neon_row_full( let iter_1 = rem.iter_mut(); for dst in iter_1 { - let vld = vdupq_n_s32(ROUNDING_CONST); + let vld = vdupq_n_s32(rnd_const); let mut store = vld; let px = cx; @@ -1122,8 +699,8 @@ fn convolve_vertical_neon_row_full( let low0 = vreinterpretq_s16_u16(vmovl_u8(item_row0)); let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); - store = vmlal_lane_s16::<0>(store, 
vget_low_s16(low0), v_weight); - store = vmlal_lane_s16::<1>(store, vget_low_s16(low1), v_weight); + store = vxmlal_lane_s16::(store, vget_low_s16(low0), v_weight); + store = vxmlal_lane_s16::(store, vget_low_s16(low1), v_weight); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); @@ -1140,9 +717,9 @@ fn convolve_vertical_neon_row_full( let low0 = vreinterpretq_s16_u16(vmovl_u8(item_row0)); let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); let low2 = vreinterpretq_s16_u16(vmovl_u8(item_row2)); - store = vmlal_lane_s16::<0>(store, vget_low_s16(low0), v_weight); - store = vmlal_lane_s16::<1>(store, vget_low_s16(low1), v_weight); - store = vmlal_lane_s16::<2>(store, vget_low_s16(low2), v_weight); + store = vxmlal_lane_s16::(store, vget_low_s16(low0), v_weight); + store = vxmlal_lane_s16::(store, vget_low_s16(low1), v_weight); + store = vxmlal_lane_s16::(store, vget_low_s16(low2), v_weight); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); @@ -1160,10 +737,10 @@ fn convolve_vertical_neon_row_full( let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); let low2 = vreinterpretq_s16_u16(vmovl_u8(item_row2)); let low3 = vreinterpretq_s16_u16(vmovl_u8(item_row3)); - store = vmlal_lane_s16::<0>(store, vget_low_s16(low0), v_weight); - store = vmlal_lane_s16::<1>(store, vget_low_s16(low1), v_weight); - store = vmlal_lane_s16::<2>(store, vget_low_s16(low2), v_weight); - store = vmlal_lane_s16::<3>(store, vget_low_s16(low3), v_weight); + store = vxmlal_lane_s16::(store, vget_low_s16(low0), v_weight); + store = vxmlal_lane_s16::(store, vget_low_s16(low1), v_weight); + store = vxmlal_lane_s16::(store, vget_low_s16(low2), v_weight); + store = vxmlal_lane_s16::(store, vget_low_s16(low3), v_weight); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -1173,7 +750,7 @@ fn convolve_vertical_neon_row_full( let item_row = vld1_dup_u8(src_ptr.as_ptr()); let low = vreinterpretq_s16_u16(vmovl_u8(item_row)); - store = vmlal_s16(store, vget_low_s16(low), vget_low_s16(v_weight)); + store = vxmlal_s16::(store, vget_low_s16(low), vget_low_s16(v_weight)); } } diff --git a/src/neon/vertical_u8_rdm.rs b/src/neon/vertical_u8_rdm.rs new file mode 100644 index 0000000..aeabdc3 --- /dev/null +++ b/src/neon/vertical_u8_rdm.rs @@ -0,0 +1,594 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +use crate::filter_weights::FilterBounds; +use crate::neon::utils::{expand8_to_14, xvld1q_u8_x2, xvld1q_u8_x4, xvst1q_u8_x2, xvst1q_u8_x4}; +use std::arch::aarch64::*; + +/// Checking NEON `rdm` availability is required before a call. +/// +/// RDM feature has slightly lower precision and won't work really well on huge kernel which +/// edges fades out fast. Therefore, it would be reasonable to avoid using feature for huge downscaling. +/// +/// # Safety +/// - Check `rdm` availability before the call. +pub(crate) fn convolve_vertical_neon_i16_precision( + width: usize, + bounds: &FilterBounds, + src: &[u8], + dst: &mut [u8], + src_stride: usize, + weight: &[i16], +) { + unsafe { + convolve_vertical_neon_row_upper(width, bounds, src, dst, src_stride, weight); + } +} + +#[must_use] +#[inline(always)] +unsafe fn vdot( + store0: int16x8_t, + store1: int16x8_t, + row: uint8x16_t, + weight: int16x8_t, +) -> (int16x8_t, int16x8_t) { + let lo0 = vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip1q_u8(row, row)))); + let store0 = vqrdmlahq_s16(store0, lo0, weight); + let hi0 = vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip2q_u8(row, row)))); + let store1 = vqrdmlahq_s16(store1, hi0, weight); + (store0, store1) +} + +#[must_use] +#[inline(always)] +unsafe fn vdot_lane( + store0: int16x8_t, + store1: int16x8_t, + row: uint8x16_t, + weight: int16x4_t, +) -> (int16x8_t, int16x8_t) { + let lo0 = vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip1q_u8(row, row)))); + let store0 = vqrdmlahq_lane_s16::(store0, lo0, weight); + let hi0 = vreinterpretq_s16_u16(vshrq_n_u16::<2>(vreinterpretq_u16_u8(vzip2q_u8(row, row)))); + let store1 = vqrdmlahq_lane_s16::(store1, hi0, weight); + (store0, store1) +} + +#[target_feature(enable = "rdm")] +unsafe fn convolve_vertical_neon_row_upper( + _: usize, + bounds: &FilterBounds, + src: &[u8], + dst: &mut [u8], + src_stride: usize, + weight: &[i16], +) { + let mut cx = 0usize; + + let iter_64 = dst.chunks_exact_mut(64); + + let bounds_size = bounds.size; + const SCALE: i32 = 6; + const R_SHR_SCALE: i32 = SCALE; + const ROUNDING: i16 = 1 << (SCALE - 1); + + for dst in iter_64 { + let vld = vdupq_n_s16(ROUNDING); + + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + let mut store_4 = vld; + let mut store_5 = vld; + let mut store_6 = vld; + let mut store_7 = vld; + + let px = cx; + + if bounds_size == 2 { + let py = bounds.start; + let weight = weight.get_unchecked(0..2); + let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + + let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items0.2, 
v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items0.3, v_weight); + + let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items1.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items1.3, v_weight); + } else if bounds_size == 3 { + let py = bounds.start; + let weight = weight.get_unchecked(0..3); + let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + + let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items0.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items0.3, v_weight); + + let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items1.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items1.3, v_weight); + + let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items2.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items2.3, v_weight); + } else if bounds_size == 4 { + let py = bounds.start; + let weight = weight.get_unchecked(0..4); + let v_weight = vld1_s16(weight.as_ptr()); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); + + let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items0.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items0.3, v_weight); + + let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items1.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items1.3, v_weight); + + let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items2.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items2.3, v_weight); + + let items3 = xvld1q_u8_x4(src_ptr3.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items3.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, 
store_3, items3.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items3.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items3.3, v_weight); + } else { + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..); + let v_weight = vld1q_dup_s16(weight.as_ptr()); + let src_ptr = src.get_unchecked((src_stride * py + px)..); + let items = xvld1q_u8_x4(src_ptr.as_ptr()); + + (store_0, store_1) = vdot::(store_0, store_1, items.0, v_weight); + (store_2, store_3) = vdot::(store_2, store_3, items.1, v_weight); + (store_4, store_5) = vdot::(store_4, store_5, items.2, v_weight); + (store_6, store_7) = vdot::(store_6, store_7, items.3, v_weight); + } + } + + let item00 = vqshrun_n_s16::(store_0); + let item01 = vqshrun_n_s16::(store_1); + let item10 = vqshrun_n_s16::(store_2); + let item11 = vqshrun_n_s16::(store_3); + let item20 = vqshrun_n_s16::(store_4); + let item21 = vqshrun_n_s16::(store_5); + let item30 = vqshrun_n_s16::(store_6); + let item31 = vqshrun_n_s16::(store_7); + let item0 = vcombine_u8(item00, item01); + let item1 = vcombine_u8(item10, item11); + let item2 = vcombine_u8(item20, item21); + let item3 = vcombine_u8(item30, item31); + + let dst_items = uint8x16x4_t(item0, item1, item2, item3); + xvst1q_u8_x4(dst.as_mut_ptr(), dst_items); + + cx += 64; + } + + let mut rem = dst.chunks_exact_mut(64).into_remainder(); + let iter_32 = rem.chunks_exact_mut(32); + + for dst in iter_32 { + let vld = vdupq_n_s16(ROUNDING); + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + let px = cx; + + if bounds_size == 2 { + let py = bounds.start; + let weight = weight.get_unchecked(0..2); + let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + + let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); + + let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); + } else if bounds_size == 3 { + let py = bounds.start; + let weight = weight.get_unchecked(0..3); + let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + + let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); + + let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); + + let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); + } else if bounds_size == 4 { + let py = bounds.start; + let weight = weight.get_unchecked(0..4); + let v_weight = vld1_s16(weight.as_ptr()); + let src_ptr0 = 
src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); + + let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); + + let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); + + let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); + + let items3 = xvld1q_u8_x2(src_ptr3.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, items3.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items3.1, v_weight); + } else { + for j in 0..bounds.size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..); + let v_weight = vld1q_dup_s16(weight.as_ptr()); + let src_ptr = src.get_unchecked((src_stride * py + px)..); + let items = xvld1q_u8_x2(src_ptr.as_ptr()); + + (store_0, store_1) = vdot::(store_0, store_1, items.0, v_weight); + (store_2, store_3) = vdot::(store_2, store_3, items.1, v_weight); + } + } + + let item00 = vqshrun_n_s16::(store_0); + let item01 = vqshrun_n_s16::(store_1); + let item10 = vqshrun_n_s16::(store_2); + let item11 = vqshrun_n_s16::(store_3); + let item0 = vcombine_u8(item00, item01); + let item1 = vcombine_u8(item10, item11); + + let dst_items = uint8x16x2_t(item0, item1); + xvst1q_u8_x2(dst.as_mut_ptr(), dst_items); + + cx += 32; + } + + rem = rem.chunks_exact_mut(32).into_remainder(); + let iter_16 = rem.chunks_exact_mut(16); + + for dst in iter_16 { + let vld = vdupq_n_s16(ROUNDING); + let mut store_0 = vld; + let mut store_1 = vld; + + let px = cx; + + if bounds_size == 2 { + let py = bounds.start; + let weight = weight.get_unchecked(0..2); + let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + + let item0 = vld1q_u8(src_ptr0.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, item0, v_weight); + + let item1 = vld1q_u8(src_ptr1.as_ptr()); + (store_0, store_1) = vdot_lane::(store_0, store_1, item1, v_weight); + } else if bounds_size == 3 { + let py = bounds.start; + let weight = weight.get_unchecked(0..3); + let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + + let item0 = vld1q_u8(src_ptr0.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, item0, v_weight); + + let item1 = vld1q_u8(src_ptr1.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, item1, v_weight); + + let item2 = vld1q_u8(src_ptr2.as_ptr()); + + (store_0, store_1) = vdot_lane::(store_0, store_1, item2, v_weight); + } else if bounds_size == 4 { + let py = bounds.start; + let weight = weight.get_unchecked(0..4); + let v_weight = vld1_s16(weight.as_ptr()); + let src_ptr0 = 
src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); + + let item0 = vld1q_u8(src_ptr0.as_ptr()); + (store_0, store_1) = vdot_lane::(store_0, store_1, item0, v_weight); + + let item1 = vld1q_u8(src_ptr1.as_ptr()); + (store_0, store_1) = vdot_lane::(store_0, store_1, item1, v_weight); + + let item2 = vld1q_u8(src_ptr2.as_ptr()); + (store_0, store_1) = vdot_lane::(store_0, store_1, item2, v_weight); + + let item3 = vld1q_u8(src_ptr3.as_ptr()); + (store_0, store_1) = vdot_lane::(store_0, store_1, item3, v_weight); + } else { + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..); + let v_weight = vld1q_dup_s16(weight.as_ptr()); + let src_ptr = src.get_unchecked((src_stride * py + px)..); + let item_row = vld1q_u8(src_ptr.as_ptr()); + + (store_0, store_1) = vdot::(store_0, store_1, item_row, v_weight); + } + } + + let item0 = vqshrun_n_s16::(store_0); + let item1 = vqshrun_n_s16::(store_1); + + vst1q_u8(dst.as_mut_ptr(), vcombine_u8(item0, item1)); + + cx += 16; + } + + rem = rem.chunks_exact_mut(16).into_remainder(); + let iter_8 = rem.chunks_exact_mut(8); + + for dst in iter_8 { + let vld = vdupq_n_s16(ROUNDING); + let mut store_0 = vld; + + let px = cx; + + if bounds_size == 2 { + let py = bounds.start; + let weight = weight.get_unchecked(0..2); + let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + + let item0 = vld1_u8(src_ptr0.as_ptr()); + let low0 = expand8_to_14(item0); + store_0 = vqrdmlahq_lane_s16::<0>(store_0, low0, v_weight); + + let item1 = vld1_u8(src_ptr1.as_ptr()); + let low1 = expand8_to_14(item1); + store_0 = vqrdmlahq_lane_s16::<1>(store_0, low1, v_weight); + } else if bounds_size == 3 { + let py = bounds.start; + let weight = weight.get_unchecked(0..3); + let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + + let item0 = vld1_u8(src_ptr0.as_ptr()); + let low0 = expand8_to_14(item0); + store_0 = vqrdmlahq_lane_s16::<0>(store_0, low0, v_weight); + + let item1 = vld1_u8(src_ptr1.as_ptr()); + let low1 = expand8_to_14(item1); + store_0 = vqrdmlahq_lane_s16::<1>(store_0, low1, v_weight); + + let item2 = vld1_u8(src_ptr2.as_ptr()); + let low2 = expand8_to_14(item2); + store_0 = vqrdmlahq_lane_s16::<2>(store_0, low2, v_weight); + } else if bounds_size == 4 { + let py = bounds.start; + let weight = weight.get_unchecked(0..4); + let v_weight = vld1_s16(weight.as_ptr()); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); + + let item0 = vld1_u8(src_ptr0.as_ptr()); + let low0 = expand8_to_14(item0); + store_0 = vqrdmlahq_lane_s16::<0>(store_0, low0, v_weight); + + let item1 = vld1_u8(src_ptr1.as_ptr()); + let low1 = expand8_to_14(item1); + store_0 = vqrdmlahq_lane_s16::<1>(store_0, low1, 
v_weight); + + let item2 = vld1_u8(src_ptr2.as_ptr()); + let low2 = expand8_to_14(item2); + store_0 = vqrdmlahq_lane_s16::<2>(store_0, low2, v_weight); + + let item3 = vld1_u8(src_ptr3.as_ptr()); + let low3 = expand8_to_14(item3); + store_0 = vqrdmlahq_lane_s16::<3>(store_0, low3, v_weight); + } else { + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..); + let v_weight = vld1q_dup_s16(weight.as_ptr()); + let src_ptr = src.get_unchecked((src_stride * py + px)..); + let item_row = vld1_u8(src_ptr.as_ptr()); + + let low = expand8_to_14(item_row); + store_0 = vqrdmlahq_s16(store_0, low, v_weight); + } + } + + let item = vqshrun_n_s16::(store_0); + vst1_u8(dst.as_mut_ptr(), item); + + cx += 8; + } + + rem = rem.chunks_exact_mut(8).into_remainder(); + let iter_1 = rem.iter_mut(); + + for dst in iter_1 { + let vld = vdupq_n_s16(ROUNDING); + let mut store = vld; + + let px = cx; + + if bounds_size == 2 { + let py = bounds.start; + let weight = weight.get_unchecked(0..2); + let v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + + let items0 = vld1_dup_u8(src_ptr0.as_ptr()); + let low0 = expand8_to_14(items0); + store = vqrdmlahq_lane_s16::<0>(store, low0, v_weight); + + let items1 = vld1_dup_u8(src_ptr1.as_ptr()); + let low1 = expand8_to_14(items1); + store = vqrdmlahq_lane_s16::<1>(store, low1, v_weight); + } else if bounds_size == 3 { + let py = bounds.start; + let weight = weight.get_unchecked(0..3); + let mut v_weight = vreinterpret_s16_s32(vld1_dup_s32(weight.as_ptr() as *const i32)); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + + let items0 = vld1_dup_u8(src_ptr0.as_ptr()); + let low0 = expand8_to_14(items0); + store = vqrdmlahq_lane_s16::<0>(store, low0, v_weight); + + let items1 = vld1_dup_u8(src_ptr1.as_ptr()); + let low1 = expand8_to_14(items1); + store = vqrdmlahq_lane_s16::<1>(store, low1, v_weight); + + let items2 = vld1_dup_u8(src_ptr2.as_ptr()); + let low2 = expand8_to_14(items2); + store = vqrdmlahq_lane_s16::<2>(store, low2, v_weight); + } else if bounds_size == 4 { + let py = bounds.start; + let weight = weight.get_unchecked(0..4); + let v_weight = vld1_s16(weight.as_ptr()); + let src_ptr0 = src.get_unchecked((src_stride * py + px)..); + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); + + let items0 = vld1_dup_u8(src_ptr0.as_ptr()); + let low0 = expand8_to_14(items0); + store = vqrdmlahq_lane_s16::<0>(store, low0, v_weight); + + let items1 = vld1_dup_u8(src_ptr1.as_ptr()); + let low1 = expand8_to_14(items1); + store = vqrdmlahq_lane_s16::<1>(store, low1, v_weight); + + let items2 = vld1_dup_u8(src_ptr2.as_ptr()); + let low2 = expand8_to_14(items2); + store = vqrdmlahq_lane_s16::<2>(store, low2, v_weight); + + let items3 = vld1_dup_u8(src_ptr3.as_ptr()); + let low3 = expand8_to_14(items3); + store = vqrdmlahq_lane_s16::<3>(store, low3, v_weight); + } else { + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..); + let v_weight = vld1q_dup_s16(weight.as_ptr()); + let 
src_ptr = src.get_unchecked((src_stride * py + px)..); + let item_row = vld1_dup_u8(src_ptr.as_ptr()); + + let low = expand8_to_14(item_row); + store = vqrdmlahq_s16(store, low, v_weight); + } + } + + let shrinked_store = vqshrun_n_s16::(store); + let value = vget_lane_u8::<0>(shrinked_store); + *dst = value; + cx += 1; + } +} diff --git a/src/neon/weights.rs b/src/neon/weights.rs index 2f7e390..dbae834 100644 --- a/src/neon/weights.rs +++ b/src/neon/weights.rs @@ -32,12 +32,12 @@ use crate::neon::{xreinterpret_u16_f16, xreinterpretq_u16_f16}; use std::arch::aarch64::*; pub(crate) fn convert_weights_to_f16(weights: &[f32]) -> Vec { - unsafe { convert_weights_to_f16_impl(weights) } + unsafe { convert_weights_to_f16_impl::(weights) } } #[target_feature(enable = "fp16")] -unsafe fn convert_weights_to_f16_impl(weights: &[f32]) -> Vec { - let mut new_weights = vec![0i16; weights.len()]; +unsafe fn convert_weights_to_f16_impl(weights: &[f32]) -> Vec { + let mut new_weights = vec![J::default(); weights.len()]; for (dst, src) in new_weights.chunks_exact_mut(8).zip(weights.chunks_exact(8)) { let j = xvld1q_f32_x2(src.as_ptr()); @@ -68,3 +68,11 @@ unsafe fn convert_weights_to_f16_impl(weights: &[f32]) -> Vec { new_weights } + +#[cfg(feature = "nightly_f16")] +use core::f16; + +#[cfg(feature = "nightly_f16")] +pub(crate) fn convert_weights_to_f16_fhm(weights: &[f32]) -> Vec { + unsafe { convert_weights_to_f16_impl(weights) } +} diff --git a/src/plane_f32.rs b/src/plane_f32.rs index d9cad85..49e1e93 100644 --- a/src/plane_f32.rs +++ b/src/plane_f32.rs @@ -28,7 +28,7 @@ */ #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::avx2::convolve_vertical_avx_row_f32; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::convolve_naive_f32::{ convolve_horizontal_rgb_native_row, convolve_horizontal_rgba_4_row_f32, }; @@ -57,6 +57,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 1> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { let mut _dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, &[f32], usize, &mut [f32], usize), @@ -96,6 +97,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 1> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = diff --git a/src/plane_u16.rs b/src/plane_u16.rs index bb86038..6cc8e7c 100644 --- a/src/plane_u16.rs +++ b/src/plane_u16.rs @@ -27,7 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
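The `vqrdmlahq_lane_s16` path above keeps the whole vertical pass in 16-bit registers: each 8-bit sample is widened by `expand8_to_14` and multiplied against a fixed-point weight with a rounding-doubling multiply-accumulate. Below is a scalar sketch of that arithmetic; the expansion shift, the Q15 weight scale, and the final narrowing shift are assumptions, since the const generics are not visible in this flattened diff.

```rust
// Scalar model of the NEON "RDM" vertical path, assuming `expand8_to_14`
// is a left shift by 6 and weights are Q15. Not the crate's code.

/// Scalar equivalent of one lane of `vqrdmlahq_s16`:
/// acc + ((2 * a * b + (1 << 15)) >> 16), saturating.
fn qrdmlah(acc: i16, a: i16, b: i16) -> i16 {
    let product = 2 * (a as i32) * (b as i32) + (1 << 15);
    acc.saturating_add((product >> 16) as i16)
}

/// One output pixel of a vertical convolution over `rows.len()` source rows.
fn vertical_pixel(rows: &[&[u8]], x: usize, weights: &[i16], rounding: i16) -> u8 {
    let mut store = rounding;
    for (row, &w) in rows.iter().zip(weights) {
        let expanded = (row[x] as i16) << 6; // "expand8_to_14"
        store = qrdmlah(store, expanded, w);
    }
    // The SIMD code narrows with `vqshrun_n_s16`; the shift amount there is a
    // const generic the flattened diff does not show, so 6 is a placeholder.
    const NARROW_SHIFT: u32 = 6;
    (store >> NARROW_SHIFT).clamp(0, 255) as u8
}

fn main() {
    let rows: [&[u8]; 2] = [&[10, 200, 255], &[30, 180, 0]];
    let weights = [16384i16, 16384]; // two 0.5 weights in Q15
    let px = vertical_pixel(&rows, 1, &weights, 1 << 5);
    println!("blended: {px}"); // ~ (200 + 180) / 2
}
```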
*/ -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u16::{convolve_horizontal_dispatch_u16, convolve_vertical_dispatch_u16}; use crate::filter_weights::FilterWeights; use crate::image_store::ImageStoreMut; @@ -41,6 +41,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u16, 1> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, _pool: &Option, + _: ConvolutionOptions, ) { convolve_horizontal_dispatch_u16(self, filter_weights, destination, _pool); } @@ -52,7 +53,8 @@ impl VerticalConvolutionPass for ImageStore<'_, u16, 1> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + options: ConvolutionOptions, ) { - convolve_vertical_dispatch_u16(self, filter_weights, destination, pool); + convolve_vertical_dispatch_u16(self, filter_weights, destination, pool, options); } } diff --git a/src/plane_u8.rs b/src/plane_u8.rs index c8074a1..1ca91de 100644 --- a/src/plane_u8.rs +++ b/src/plane_u8.rs @@ -28,7 +28,7 @@ */ #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::avx2::{convolve_vertical_avx_row, convolve_vertical_avx_row_lp}; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; use crate::filter_weights::{DefaultWeightsConverter, FilterBounds, FilterWeights}; use crate::handler_provider::{ @@ -37,8 +37,6 @@ use crate::handler_provider::{ use crate::image_store::ImageStoreMut; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::neon::{convolve_horizontal_plane_neon_row, convolve_horizontal_plane_neon_rows_4_u8}; -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon::{convolve_vertical_neon_i16_precision, convolve_vertical_neon_i32_precision}; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::sse::{ convolve_horizontal_plane_sse_row, convolve_horizontal_plane_sse_rows_4_u8, @@ -56,6 +54,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 1> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, _pool: &Option, + _options: ConvolutionOptions, ) { let _scale_factor = self.height as f32 / destination.height as f32; let mut _dispatcher_4_rows: Option< @@ -65,15 +64,31 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 1> { handle_fixed_row_u8::<1>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - _dispatcher_4_rows = Some(convolve_horizontal_plane_neon_rows_4_u8); - _dispatcher_1_row = convolve_horizontal_plane_neon_row; - if _scale_factor < 8. 
&& crate::cpu_features::is_aarch_rdm_supported() { - use crate::neon::{ - convolve_horizontal_plane_neon_rdm_row, - convolve_horizontal_plane_neon_rows_rdm_4_u8, - }; - _dispatcher_4_rows = Some(convolve_horizontal_plane_neon_rows_rdm_4_u8); - _dispatcher_1_row = convolve_horizontal_plane_neon_rdm_row; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::neon::{ + convolve_horizontal_plane_neon_row_q, + convolve_horizontal_plane_neon_rows_4_u8_q, + }; + _dispatcher_4_rows = Some(convolve_horizontal_plane_neon_rows_4_u8_q); + _dispatcher_1_row = convolve_horizontal_plane_neon_row_q; + } + crate::WorkloadStrategy::PreferSpeed => { + _dispatcher_4_rows = Some(convolve_horizontal_plane_neon_rows_4_u8); + _dispatcher_1_row = convolve_horizontal_plane_neon_row; + #[cfg(feature = "rdm")] + if _scale_factor < 8. + && crate::cpu_features::is_aarch_rdm_supported() + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::neon::{ + convolve_horizontal_plane_neon_rdm_row, + convolve_horizontal_plane_neon_rows_rdm_4_u8, + }; + _dispatcher_4_rows = Some(convolve_horizontal_plane_neon_rows_rdm_4_u8); + _dispatcher_1_row = convolve_horizontal_plane_neon_rdm_row; + } + } } } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -81,7 +96,9 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 1> { if is_x86_feature_detected!("sse4.1") { _dispatcher_4_rows = Some(convolve_horizontal_plane_sse_rows_4_u8); _dispatcher_1_row = convolve_horizontal_plane_sse_row; - if _scale_factor < 8. { + if _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { use crate::sse::{ convolve_horizontal_plane_sse_row_hrs, convolve_horizontal_plane_sse_rows_hrs_4_u8, @@ -109,6 +126,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 1> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { let _scale_factor = self.height as f32 / destination.height as f32; #[allow(clippy::type_complexity)] @@ -116,35 +134,56 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 1> { handle_fixed_column_u8; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - // For more downscaling better to use more precise version - if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { - _dispatcher = convolve_vertical_neon_i16_precision; - } else { - _dispatcher = convolve_vertical_neon_i32_precision; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::neon::convolve_vertical_neon_i32_precision_d; + _dispatcher = convolve_vertical_neon_i32_precision_d; + } + crate::WorkloadStrategy::PreferSpeed => { + // For more downscaling better to use more precise version + #[cfg(feature = "rdm")] + if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { + use crate::neon::convolve_vertical_neon_i16_precision; + _dispatcher = convolve_vertical_neon_i16_precision; + } else { + use crate::neon::convolve_vertical_neon_i32_precision; + _dispatcher = convolve_vertical_neon_i32_precision; + } + #[cfg(not(feature = "rdm"))] + { + use crate::neon::convolve_vertical_neon_i32_precision; + _dispatcher = convolve_vertical_neon_i32_precision; + } + } } } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { if is_x86_feature_detected!("sse4.1") { - if _scale_factor < 8. { + if _scale_factor < 8. 
+ && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher = convolve_vertical_sse_row_lp; } else { _dispatcher = convolve_vertical_sse_row; } } if is_x86_feature_detected!("avx2") { - if _scale_factor < 8. { + if _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher = convolve_vertical_avx_row_lp; } else { _dispatcher = convolve_vertical_avx_row; } } #[cfg(feature = "nightly_avx512")] - if std::arch::is_x86_feature_detected!("avx512bw") { - if _scale_factor < 8. { - use crate::avx512::convolve_vertical_avx512_row_lp; - _dispatcher = convolve_vertical_avx512_row_lp; - } + if std::arch::is_x86_feature_detected!("avx512bw") + && _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::avx512::convolve_vertical_avx512_row_lp; + _dispatcher = convolve_vertical_avx512_row_lp; } } #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] diff --git a/src/resize_ar30.rs b/src/resize_ar30.rs index bf36dd2..281abf2 100644 --- a/src/resize_ar30.rs +++ b/src/resize_ar30.rs @@ -26,18 +26,21 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +use crate::convolution::ConvolutionOptions; use crate::dispatch_group_ar30::{ convolve_horizontal_dispatch_ar30, convolve_vertical_dispatch_ar30, }; use crate::nearest_sampler::resize_nearest; use crate::pic_scale_error::PicScaleError; use crate::support::check_image_size_overflow; -use crate::{ImageSize, ResamplingFunction, Scaler}; +use crate::{ImageSize, PicScaleBufferMismatch, ResamplingFunction, Scaler}; pub(crate) fn resize_ar30_impl( - src: &[u32], + src: &[u8], + src_stride: usize, src_size: ImageSize, - dst: &mut [u32], + dst: &mut [u8], + dst_stride: usize, dst_size: ImageSize, scaler: &Scaler, ) -> Result<(), PicScaleError> { @@ -45,14 +48,40 @@ pub(crate) fn resize_ar30_impl( return Err(PicScaleError::ZeroImageDimensions); } - if check_image_size_overflow(src_size.width, src_size.height, 1) { + if check_image_size_overflow(src_size.width, src_size.height, 4) { return Err(PicScaleError::SourceImageIsTooLarge); } - if check_image_size_overflow(dst_size.width, dst_size.height, 1) { + if check_image_size_overflow(dst_size.width, dst_size.height, 4) { return Err(PicScaleError::DestinationImageIsTooLarge); } + if src.len() != src_stride * src_size.height { + return Err(PicScaleError::BufferMismatch(PicScaleBufferMismatch { + expected: src_stride * src_size.height, + width: src_size.width, + height: src_size.height, + channels: 4, + slice_len: src.len(), + })); + } + if src_stride < src_size.width * 4 { + return Err(PicScaleError::InvalidStride(src_size.width * 4, src_stride)); + } + + if dst.len() != dst_stride * dst_size.height { + return Err(PicScaleError::BufferMismatch(PicScaleBufferMismatch { + expected: dst_stride * dst_size.height, + width: dst_size.width, + height: dst_size.height, + channels: 4, + slice_len: dst.len(), + })); + } + if dst_stride < dst_size.width * 4 { + return Err(PicScaleError::InvalidStride(dst_size.width * 4, dst_stride)); + } + if src_size.width == dst_size.width && src_size.height == dst_size.height { for (src, dst) in src.iter().zip(dst.iter_mut()) { *dst = *src; @@ -65,12 +94,12 @@ pub(crate) fn resize_ar30_impl( .get_pool(ImageSize::new(dst_size.width, dst_size.height)); if scaler.function == ResamplingFunction::Nearest { - resize_nearest::( + resize_nearest::( src, - src_size.width, 
+ src_stride, src_size.height, dst, - dst_size.width, + dst_stride, dst_size.height, &pool, ); @@ -81,46 +110,56 @@ pub(crate) fn resize_ar30_impl( let should_do_vertical = src_size.height != dst_size.height; assert!(should_do_horizontal || should_do_vertical); + let options = ConvolutionOptions::new(scaler.workload_strategy); + if should_do_vertical && !should_do_horizontal { let vertical_filters = scaler.generate_weights(src_size.height, dst_size.height); convolve_vertical_dispatch_ar30::( src, - src_size.width, + src_stride, vertical_filters, dst, - src_size.width, + src_stride, &pool, + src_size.width, + options, ); return Ok(()); - } - - let working_store = if should_do_vertical { - let mut target = vec![0u32; src_size.width * dst_size.height]; + } else if should_do_horizontal && should_do_vertical { + let mut target = vec![0u8; src_size.width * dst_size.height * 4]; let vertical_filters = scaler.generate_weights(src_size.height, dst_size.height); convolve_vertical_dispatch_ar30::( src, - src_size.width, + src_stride, vertical_filters, &mut target, - src_size.width, + src_size.width * 4, &pool, + src_size.width, + options, ); - std::borrow::Cow::Owned(target) + let horizontal_filters = scaler.generate_weights(src_size.width, dst_size.width); + convolve_horizontal_dispatch_ar30::( + &target, + src_size.width * 4, + horizontal_filters, + dst, + dst_stride, + &pool, + options, + ); } else { - std::borrow::Cow::Borrowed(src) - }; - - if should_do_horizontal { let horizontal_filters = scaler.generate_weights(src_size.width, dst_size.width); convolve_horizontal_dispatch_ar30::( - working_store.as_ref(), - src_size.width, + src, + src_stride, horizontal_filters, dst, - dst_size.width, + dst_stride, &pool, + options, ); } diff --git a/src/rgb_f32.rs b/src/rgb_f32.rs index d02a216..8bab908 100644 --- a/src/rgb_f32.rs +++ b/src/rgb_f32.rs @@ -28,7 +28,7 @@ */ #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::avx2::convolve_vertical_avx_row_f32; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::convolve_naive_f32::*; use crate::dispatch_group_f32::{convolve_horizontal_dispatch_f32, convolve_vertical_dispatch_f32}; use crate::filter_weights::{FilterBounds, FilterWeights}; @@ -58,6 +58,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 3> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { let mut _dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, &[f32], usize, &mut [f32], usize), @@ -97,6 +98,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 3> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = diff --git a/src/rgb_u16.rs b/src/rgb_u16.rs index d420454..b44b093 100644 --- a/src/rgb_u16.rs +++ b/src/rgb_u16.rs @@ -27,7 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
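The new `resize_ar30_impl` signature takes byte slices plus explicit strides, so the patch validates the buffers up front before dispatching. A minimal sketch of that check, assuming 4 bytes per packed 10-bit pixel and using local stand-in error types rather than the crate's `PicScaleError`:

```rust
// Hypothetical stand-ins for the crate's error variants.
#[derive(Debug)]
enum ValidationError {
    BufferMismatch { expected: usize, got: usize },
    InvalidStride { min: usize, got: usize },
}

fn check_ar30_plane(
    data_len: usize,
    stride: usize,
    width: usize,
    height: usize,
) -> Result<(), ValidationError> {
    // The patch requires an exact match between slice length and stride * height,
    // not merely "at least that long".
    let expected = stride * height;
    if data_len != expected {
        return Err(ValidationError::BufferMismatch { expected, got: data_len });
    }
    // AR30/RA30 packs one pixel into a u32, i.e. 4 bytes per pixel.
    let min_stride = width * 4;
    if stride < min_stride {
        return Err(ValidationError::InvalidStride { min: min_stride, got: stride });
    }
    Ok(())
}

fn main() {
    // 100x50 image stored with a 16-byte padded row stride.
    let stride = 100 * 4 + 16;
    assert!(check_ar30_plane(stride * 50, stride, 100, 50).is_ok());
    assert!(check_ar30_plane(stride * 50 - 1, stride, 100, 50).is_err());
}
```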
*/ -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u16::{convolve_horizontal_dispatch_u16, convolve_vertical_dispatch_u16}; use crate::filter_weights::FilterWeights; use crate::image_store::ImageStoreMut; @@ -41,6 +41,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u16, 3> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, _pool: &Option, + _: ConvolutionOptions, ) { convolve_horizontal_dispatch_u16(self, filter_weights, destination, _pool); } @@ -52,7 +53,8 @@ impl VerticalConvolutionPass for ImageStore<'_, u16, 3> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + options: ConvolutionOptions, ) { - convolve_vertical_dispatch_u16(self, filter_weights, destination, pool); + convolve_vertical_dispatch_u16(self, filter_weights, destination, pool, options); } } diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs index 5f541fc..1c73732 100644 --- a/src/rgb_u8.rs +++ b/src/rgb_u8.rs @@ -28,7 +28,7 @@ */ #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::avx2::{convolve_vertical_avx_row, convolve_vertical_avx_row_lp}; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; use crate::filter_weights::{DefaultWeightsConverter, FilterBounds, FilterWeights}; use crate::handler_provider::{ @@ -53,6 +53,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 3> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { let _scale_factor = self.width as f32 / destination.width as f32; let mut _dispatcher_4_rows: Option< @@ -63,15 +64,28 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 3> { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - _dispatcher_4_rows = Some(convolve_horizontal_rgb_neon_rows_4); - _dispatcher_1_row = convolve_horizontal_rgb_neon_row_one; - if _scale_factor < 8.0 && crate::cpu_features::is_aarch_rdm_supported() { - use crate::neon::{ - convolve_horizontal_rgb_neon_rdm_row_one, - convolve_horizontal_rgb_neon_rdm_rows_4, - }; - _dispatcher_4_rows = Some(convolve_horizontal_rgb_neon_rdm_rows_4); - _dispatcher_1_row = convolve_horizontal_rgb_neon_rdm_row_one; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::neon::{ + convolve_horizontal_rgb_neon_row_one_q, + convolve_horizontal_rgb_neon_rows_4_q, + }; + _dispatcher_4_rows = Some(convolve_horizontal_rgb_neon_rows_4_q); + _dispatcher_1_row = convolve_horizontal_rgb_neon_row_one_q; + } + crate::WorkloadStrategy::PreferSpeed => { + _dispatcher_4_rows = Some(convolve_horizontal_rgb_neon_rows_4); + _dispatcher_1_row = convolve_horizontal_rgb_neon_row_one; + #[cfg(feature = "rdm")] + if _scale_factor < 8.0 && crate::cpu_features::is_aarch_rdm_supported() { + use crate::neon::{ + convolve_horizontal_rgb_neon_rdm_row_one, + convolve_horizontal_rgb_neon_rdm_rows_4, + }; + _dispatcher_4_rows = Some(convolve_horizontal_rgb_neon_rdm_rows_4); + _dispatcher_1_row = convolve_horizontal_rgb_neon_rdm_row_one; + } + } } } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -106,6 +120,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 3> { filter_weights: FilterWeights, destination: 
&mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { let _scale_factor = self.height as f32 / destination.height as f32; #[allow(clippy::type_complexity)] @@ -113,35 +128,56 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 3> { handle_fixed_column_u8; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - // For more downscaling better to use more precise version - if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { - _dispatcher = convolve_vertical_neon_i16_precision; - } else { - _dispatcher = convolve_vertical_neon_i32_precision; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::neon::convolve_vertical_neon_i32_precision_d; + _dispatcher = convolve_vertical_neon_i32_precision_d; + } + crate::WorkloadStrategy::PreferSpeed => { + // For more downscaling better to use more precise version + #[cfg(feature = "rdm")] + if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { + use crate::neon::convolve_vertical_neon_i16_precision; + _dispatcher = convolve_vertical_neon_i16_precision; + } else { + use crate::neon::convolve_vertical_neon_i32_precision; + _dispatcher = convolve_vertical_neon_i32_precision; + } + #[cfg(not(feature = "rdm"))] + { + use crate::neon::convolve_vertical_neon_i32_precision; + _dispatcher = convolve_vertical_neon_i32_precision; + } + } } } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { if is_x86_feature_detected!("sse4.1") { - if _scale_factor < 8. { + if _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher = convolve_vertical_sse_row_lp; } else { _dispatcher = convolve_vertical_sse_row; } } if is_x86_feature_detected!("avx2") { - if _scale_factor < 8. { + if _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher = convolve_vertical_avx_row_lp; } else { _dispatcher = convolve_vertical_avx_row; } } #[cfg(feature = "nightly_avx512")] - if std::arch::is_x86_feature_detected!("avx512bw") { - if _scale_factor < 8. { - use crate::avx512::convolve_vertical_avx512_row_lp; - _dispatcher = convolve_vertical_avx512_row_lp; - } + if std::arch::is_x86_feature_detected!("avx512bw") + && _scale_factor < 8. 
+ && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::avx512::convolve_vertical_avx512_row_lp; + _dispatcher = convolve_vertical_avx512_row_lp; } } #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] diff --git a/src/rgba_f32.rs b/src/rgba_f32.rs index d3cff0c..4a78f4d 100644 --- a/src/rgba_f32.rs +++ b/src/rgba_f32.rs @@ -31,7 +31,7 @@ use crate::avx2::{ convolve_horizontal_rgba_avx_row_one_f32, convolve_horizontal_rgba_avx_rows_4_f32, convolve_vertical_avx_row_f32, }; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::convolve_naive_f32::{ convolve_horizontal_rgb_native_row, convolve_horizontal_rgba_4_row_f32, }; @@ -53,6 +53,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 4> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { let mut _dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, &[f32], usize, &mut [f32], usize), @@ -100,6 +101,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 4> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _: ConvolutionOptions, ) { #[allow(clippy::type_complexity)] let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = diff --git a/src/rgba_u16.rs b/src/rgba_u16.rs index 613bc19..2bfc7cd 100644 --- a/src/rgba_u16.rs +++ b/src/rgba_u16.rs @@ -27,7 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #![forbid(unsafe_code)] -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u16::{convolve_horizontal_dispatch_u16, convolve_vertical_dispatch_u16}; use crate::filter_weights::FilterWeights; use crate::image_store::ImageStoreMut; @@ -41,6 +41,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u16, 4> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, _pool: &Option, + _: ConvolutionOptions, ) { convolve_horizontal_dispatch_u16(self, filter_weights, destination, _pool); } @@ -52,7 +53,8 @@ impl VerticalConvolutionPass for ImageStore<'_, u16, 4> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + options: ConvolutionOptions, ) { - convolve_vertical_dispatch_u16(self, filter_weights, destination, pool); + convolve_vertical_dispatch_u16(self, filter_weights, destination, pool, options); } } diff --git a/src/rgba_u8.rs b/src/rgba_u8.rs index 0c57593..fa57723 100644 --- a/src/rgba_u8.rs +++ b/src/rgba_u8.rs @@ -33,7 +33,7 @@ use crate::avx2::{ convolve_horizontal_rgba_avx_rows_4_lb, convolve_horizontal_rgba_avx_rows_one_lb, convolve_vertical_avx_row, convolve_vertical_avx_row_lp, }; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; use crate::filter_weights::{DefaultWeightsConverter, FilterBounds, FilterWeights}; use crate::handler_provider::{ @@ -60,6 +60,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 4> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, _pool: &Option, + _options: ConvolutionOptions, ) { let _scale_factor = self.width as f32 / destination.width as 
f32; let mut _dispatcher_4_rows: Option< @@ -69,18 +70,38 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 4> { handle_fixed_row_u8::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { - _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4_u8_i16); - _dispatcher_1_row = convolve_horizontal_rgba_neon_row_i16; - } else { - _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4_u8); - _dispatcher_1_row = convolve_horizontal_rgba_neon_row; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::neon::{ + convolve_horizontal_rgba_neon_row_q, + convolve_horizontal_rgba_neon_rows_4_u8_q, + }; + _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4_u8_q); + _dispatcher_1_row = convolve_horizontal_rgba_neon_row_q; + } + crate::WorkloadStrategy::PreferSpeed => { + #[cfg(feature = "rdm")] + if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { + _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4_u8_i16); + _dispatcher_1_row = convolve_horizontal_rgba_neon_row_i16; + } else { + _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4_u8); + _dispatcher_1_row = convolve_horizontal_rgba_neon_row; + } + #[cfg(not(feature = "rdm"))] + { + _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4_u8); + _dispatcher_1_row = convolve_horizontal_rgba_neon_row; + } + } } } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { if std::arch::is_x86_feature_detected!("sse4.1") { - if _scale_factor < 8. { + if _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher_4_rows = Some(convolve_horizontal_rgba_sse_rows_4_lb); _dispatcher_1_row = convolve_horizontal_rgba_sse_rows_one_lb; } else { @@ -88,7 +109,9 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 4> { _dispatcher_1_row = convolve_horizontal_rgba_sse_rows_one; } } - if std::arch::is_x86_feature_detected!("avx2") || _scale_factor < 8. { + if (std::arch::is_x86_feature_detected!("avx2") || _scale_factor < 8.) + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher_4_rows = Some(convolve_horizontal_rgba_avx_rows_4_lb); _dispatcher_1_row = convolve_horizontal_rgba_avx_rows_one_lb; } @@ -121,6 +144,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 4> { filter_weights: FilterWeights, destination: &mut ImageStoreMut, pool: &Option, + _options: ConvolutionOptions, ) { let _scale_factor = self.height as f32 / destination.height as f32; #[allow(clippy::type_complexity)] @@ -128,35 +152,56 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 4> { handle_fixed_column_u8; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - // For more downscaling better to use more precise version - if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { - _dispatcher = convolve_vertical_neon_i16_precision; - } else { - _dispatcher = convolve_vertical_neon_i32_precision; + match _options.workload_strategy { + crate::WorkloadStrategy::PreferQuality => { + use crate::neon::convolve_vertical_neon_i32_precision_d; + _dispatcher = convolve_vertical_neon_i32_precision_d; + } + crate::WorkloadStrategy::PreferSpeed => { + // For more downscaling better to use more precise version + #[cfg(feature = "rdm")] + if _scale_factor < 8. 
&& crate::cpu_features::is_aarch_rdm_supported() { + use crate::neon::convolve_vertical_neon_i16_precision; + _dispatcher = convolve_vertical_neon_i16_precision; + } else { + use crate::neon::convolve_vertical_neon_i32_precision; + _dispatcher = convolve_vertical_neon_i32_precision; + } + #[cfg(not(feature = "rdm"))] + { + use crate::neon::convolve_vertical_neon_i32_precision; + _dispatcher = convolve_vertical_neon_i32_precision; + } + } } } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { if std::arch::is_x86_feature_detected!("sse4.1") { - if _scale_factor < 8. { + if _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher = convolve_vertical_sse_row_lp; } else { _dispatcher = convolve_vertical_sse_row; } } if std::arch::is_x86_feature_detected!("avx2") { - if _scale_factor < 8. { + if _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { _dispatcher = convolve_vertical_avx_row_lp; } else { _dispatcher = convolve_vertical_avx_row; } } #[cfg(feature = "nightly_avx512")] - if std::arch::is_x86_feature_detected!("avx512bw") { - if _scale_factor < 8. { - use crate::avx512::convolve_vertical_avx512_row_lp; - _dispatcher = convolve_vertical_avx512_row_lp; - } + if std::arch::is_x86_feature_detected!("avx512bw") + && _scale_factor < 8. + && _options.workload_strategy == crate::WorkloadStrategy::PreferSpeed + { + use crate::avx512::convolve_vertical_avx512_row_lp; + _dispatcher = convolve_vertical_avx512_row_lp; } } #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] diff --git a/src/scaler.rs b/src/scaler.rs index ea0523b..1b4d94f 100644 --- a/src/scaler.rs +++ b/src/scaler.rs @@ -27,7 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
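The recurring pattern in these hunks is that kernel selection now branches on `WorkloadStrategy` first, and only then on CPU features and the `scale_factor < 8.` heuristic. A condensed model of that dispatch is below, with placeholder kernels standing in for the real NEON/SSE rows and a boolean standing in for the `rdm` feature plus `is_aarch_rdm_supported()` check.

```rust
#[derive(Copy, Clone, PartialEq, Eq)]
enum WorkloadStrategy {
    PreferQuality,
    PreferSpeed,
}

type Kernel = fn(&[u8], &mut [u8]);

fn quality_kernel(_src: &[u8], _dst: &mut [u8]) { /* higher-precision path */ }
fn speed_kernel(_src: &[u8], _dst: &mut [u8]) { /* low-precision (e.g. RDM) path */ }
fn default_kernel(_src: &[u8], _dst: &mut [u8]) { /* baseline path */ }

fn pick_kernel(strategy: WorkloadStrategy, scale_factor: f32, fast_path_ok: bool) -> Kernel {
    match strategy {
        WorkloadStrategy::PreferQuality => quality_kernel,
        WorkloadStrategy::PreferSpeed => {
            // The low-precision path is only taken for moderate downscaling;
            // the patch uses `scale_factor < 8.` as the cut-off.
            if scale_factor < 8.0 && fast_path_ok {
                speed_kernel
            } else {
                default_kernel
            }
        }
    }
}

fn main() {
    let mut dst = [0u8; 4];
    for strategy in [WorkloadStrategy::PreferQuality, WorkloadStrategy::PreferSpeed] {
        let k = pick_kernel(strategy, 2.0, true);
        k(&[1, 2, 3, 4], &mut dst);
    }
}
```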
*/ use crate::ar30::{Ar30ByteOrder, Rgb30}; -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; +use crate::convolution::{ConvolutionOptions, HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::image_size::ImageSize; use crate::image_store::{ @@ -44,7 +44,7 @@ use crate::{ Rgb16ImageStore, Rgb8ImageStore, RgbF32ImageStore, Rgba16ImageStore, Rgba8ImageStore, RgbaF32ImageStore, }; -use num_traits::{AsPrimitive, Float, FromPrimitive, Signed}; +use num_traits::{AsPrimitive, Float, Signed}; use rayon::ThreadPool; use std::fmt::Debug; use std::ops::{AddAssign, MulAssign, Neg}; @@ -54,6 +54,7 @@ use std::ops::{AddAssign, MulAssign, Neg}; pub struct Scaler { pub(crate) function: ResamplingFunction, pub(crate) threading_policy: ThreadingPolicy, + pub workload_strategy: WorkloadStrategy, } pub trait Scaling { @@ -196,6 +197,15 @@ pub trait ScalingF32 { ) -> Result<(), PicScaleError>; } +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Default)] +pub enum WorkloadStrategy { + /// Prefers quality to speed + PreferQuality, + /// Prefers speed to quality + #[default] + PreferSpeed, +} + pub trait ScalingU16 { /// Performs rescaling for Planar u16 /// @@ -332,9 +342,14 @@ impl Scaler { Scaler { function: filter, threading_policy: ThreadingPolicy::Single, + workload_strategy: WorkloadStrategy::default(), } } + pub fn set_workload_strategy(&mut self, workload_strategy: WorkloadStrategy) { + self.workload_strategy = workload_strategy; + } + pub(crate) fn generate_weights(&self, in_size: usize, out_size: usize) -> FilterWeights where T: Copy @@ -555,7 +570,7 @@ impl Scaler { impl Scaler { pub(crate) fn generic_resize< 'a, - T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + T: Clone + Copy + Debug + Send + Sync + Default + 'static, const N: usize, >( &self, @@ -621,7 +636,8 @@ impl Scaler { )?; new_image_vertical.bit_depth = into.bit_depth; let vertical_filters = self.generate_weights(store.height, new_size.height); - store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); + let options = ConvolutionOptions::new(self.workload_strategy); + store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool, options); let new_immutable_store = ImageStore:: { buffer: std::borrow::Cow::Owned(target_vertical), @@ -632,23 +648,26 @@ impl Scaler { bit_depth: into.bit_depth, }; let horizontal_filters = self.generate_weights(store.width, new_size.width); - new_immutable_store.convolve_horizontal(horizontal_filters, into, &pool); + let options = ConvolutionOptions::new(self.workload_strategy); + new_immutable_store.convolve_horizontal(horizontal_filters, into, &pool, options); Ok(()) } else if should_do_vertical { let vertical_filters = self.generate_weights(store.height, new_size.height); - store.convolve_vertical(vertical_filters, into, &pool); + let options = ConvolutionOptions::new(self.workload_strategy); + store.convolve_vertical(vertical_filters, into, &pool, options); Ok(()) } else { assert!(should_do_horizontal); let horizontal_filters = self.generate_weights(store.width, new_size.width); - store.convolve_horizontal(horizontal_filters, into, &pool); + let options = ConvolutionOptions::new(self.workload_strategy); + store.convolve_horizontal(horizontal_filters, into, &pool, options); Ok(()) } } fn forward_resize_with_alpha< 'a, - T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + T: Clone + Copy + Debug + Send + Sync + Default + 
'static, const N: usize, >( &self, @@ -702,7 +721,8 @@ impl Scaler { )?; new_image_vertical.bit_depth = into.bit_depth; let vertical_filters = self.generate_weights(src_store.height, new_size.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, pool); + let options = ConvolutionOptions::new(self.workload_strategy); + src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, pool, options); let new_immutable_store = ImageStore:: { buffer: std::borrow::Cow::Owned(target_vertical), @@ -713,7 +733,8 @@ impl Scaler { bit_depth: into.bit_depth, }; let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - new_immutable_store.convolve_horizontal(horizontal_filters, into, pool); + let options = ConvolutionOptions::new(self.workload_strategy); + new_immutable_store.convolve_horizontal(horizontal_filters, into, pool, options); if premultiply_alpha_requested && has_alpha_premultiplied { into.unpremultiply_alpha(pool); @@ -724,7 +745,7 @@ impl Scaler { fn forward_resize_vertical_with_alpha< 'a, - T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + T: Clone + Copy + Debug + Send + Sync + Default + 'static, const N: usize, >( &self, @@ -769,7 +790,8 @@ impl Scaler { } let vertical_filters = self.generate_weights(src_store.height, new_size.height); - src_store.convolve_vertical(vertical_filters, into, pool); + let options = ConvolutionOptions::new(self.workload_strategy); + src_store.convolve_vertical(vertical_filters, into, pool, options); if premultiply_alpha_requested && has_alpha_premultiplied { into.unpremultiply_alpha(pool); @@ -780,7 +802,7 @@ impl Scaler { fn forward_resize_horizontal_with_alpha< 'a, - T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + T: Clone + Copy + Debug + Send + Sync + Default + 'static, const N: usize, >( &self, @@ -825,7 +847,8 @@ impl Scaler { } let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - src_store.convolve_horizontal(horizontal_filters, into, pool); + let options = ConvolutionOptions::new(self.workload_strategy); + src_store.convolve_horizontal(horizontal_filters, into, pool, options); if premultiply_alpha_requested && has_alpha_premultiplied { into.unpremultiply_alpha(pool); @@ -836,7 +859,7 @@ impl Scaler { pub(crate) fn generic_resize_with_alpha< 'a, - T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + T: Clone + Copy + Debug + Send + Sync + Default + 'static, const N: usize, >( &self, @@ -1097,6 +1120,8 @@ impl ScalingU16 for Scaler { impl Scaler { /// Resizes RGBA2101010 image /// + /// This method ignores alpha scaling. 
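From the caller's side the strategy is plumbed through `ConvolutionOptions` automatically; user code only touches the new setter. A minimal usage sketch based on the API added in this patch (the resize call itself is omitted because it depends on image-store setup not shown in this hunk):

```rust
// Assumes `WorkloadStrategy` is re-exported at the crate root, as the
// `crate::WorkloadStrategy` references in this patch suggest.
use pic_scale::{ResamplingFunction, Scaler, WorkloadStrategy};

fn main() {
    let mut scaler = Scaler::new(ResamplingFunction::Bilinear);
    // The default is PreferSpeed; opt into the higher-precision kernels.
    scaler.set_workload_strategy(WorkloadStrategy::PreferQuality);
    // ... build ImageStore / ImageStoreMut and call a resize_* method here.
}
```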
+ /// /// # Arguments /// `src` - source slice /// `src_size` - Source Image size @@ -1105,26 +1130,32 @@ impl Scaler { /// pub fn resize_ar30( &self, - src: &[u32], + src: &[u8], + src_stride: usize, src_size: ImageSize, - dst: &mut [u32], + dst: &mut [u8], + dst_stride: usize, new_size: ImageSize, order: Ar30ByteOrder, ) -> Result<(), PicScaleError> { match order { - Ar30ByteOrder::Host => resize_ar30_impl::< - { Rgb30::Ar30 as usize }, - { Ar30ByteOrder::Host as usize }, - >(src, src_size, dst, new_size, self), - Ar30ByteOrder::Network => resize_ar30_impl::< - { Rgb30::Ar30 as usize }, - { Ar30ByteOrder::Network as usize }, - >(src, src_size, dst, new_size, self), + Ar30ByteOrder::Host => { + resize_ar30_impl::<{ Rgb30::Ar30 as usize }, { Ar30ByteOrder::Host as usize }>( + src, src_stride, src_size, dst, dst_stride, new_size, self, + ) + } + Ar30ByteOrder::Network => { + resize_ar30_impl::<{ Rgb30::Ar30 as usize }, { Ar30ByteOrder::Network as usize }>( + src, src_stride, src_size, dst, dst_stride, new_size, self, + ) + } } } /// Resizes RGBA1010102 image /// + /// This method ignores alpha scaling. + /// /// # Arguments /// `src` - source slice /// `src_size` - Source Image size @@ -1133,21 +1164,25 @@ impl Scaler { /// pub fn resize_ra30( &self, - src: &[u32], + src: &[u8], + src_stride: usize, src_size: ImageSize, - dst: &mut [u32], + dst: &mut [u8], + dst_stride: usize, new_size: ImageSize, order: Ar30ByteOrder, ) -> Result<(), PicScaleError> { match order { - Ar30ByteOrder::Host => resize_ar30_impl::< - { Rgb30::Ra30 as usize }, - { Ar30ByteOrder::Host as usize }, - >(src, src_size, dst, new_size, self), - Ar30ByteOrder::Network => resize_ar30_impl::< - { Rgb30::Ra30 as usize }, - { Ar30ByteOrder::Network as usize }, - >(src, src_size, dst, new_size, self), + Ar30ByteOrder::Host => { + resize_ar30_impl::<{ Rgb30::Ra30 as usize }, { Ar30ByteOrder::Host as usize }>( + src, src_stride, src_size, dst, dst_stride, new_size, self, + ) + } + Ar30ByteOrder::Network => { + resize_ar30_impl::<{ Rgb30::Ra30 as usize }, { Ar30ByteOrder::Network as usize }>( + src, src_stride, src_size, dst, dst_stride, new_size, self, + ) + } } } } @@ -1162,7 +1197,7 @@ pub struct ScalingOptions { pub trait ImageStoreScaling<'b, T, const N: usize> where - T: FromPrimitive + Clone + Copy + Debug, + T: Clone + Copy + Debug, { fn scale( &self, diff --git a/src/scaler_f16.rs b/src/scaler_f16.rs index 8c6d1f1..eafe342 100644 --- a/src/scaler_f16.rs +++ b/src/scaler_f16.rs @@ -34,7 +34,7 @@ use crate::{ CbCrF16ImageStore, ImageStore, ImageStoreScaling, PlanarF16ImageStore, RgbF16ImageStore, RgbaF16ImageStore, Scaler, Scaling, ThreadingPolicy, }; -use half::f16; +use core::f16; /// Implements `f16` type support impl Scaler { @@ -56,7 +56,7 @@ impl Scaler { /// use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler}; /// let mut scaler = Scaler::new(ResamplingFunction::Bilinear); /// let src_store = ImageStore::alloc(100, 100); - /// let mut dst_store = ImageStoreMut::::alloc_with_depth(50, 50, 10); + /// let mut dst_store = ImageStoreMut::::alloc_with_depth(50, 50, 10); /// scaler.resize_rgba_f16(&src_store, &mut dst_store, false).unwrap(); /// ``` pub fn resize_rgba_f16<'a>( @@ -84,7 +84,7 @@ impl Scaler { /// use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler}; /// let mut scaler = Scaler::new(ResamplingFunction::Bilinear); /// let src_store = ImageStore::alloc(100, 100); - /// let mut dst_store = ImageStoreMut::::alloc_with_depth(50, 50, 10); + /// let mut dst_store = 
ImageStoreMut::::alloc_with_depth(50, 50, 10); /// scaler.resize_rgb_f16(&src_store, &mut dst_store).unwrap(); /// ``` pub fn resize_rgb_f16<'a>( @@ -111,7 +111,7 @@ impl Scaler { /// use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler}; /// let mut scaler = Scaler::new(ResamplingFunction::Bilinear); /// let src_store = ImageStore::alloc(100, 100); - /// let mut dst_store = ImageStoreMut::::alloc_with_depth(50, 50, 10); + /// let mut dst_store = ImageStoreMut::::alloc_with_depth(50, 50, 10); /// scaler.resize_cbcr_f16(&src_store, &mut dst_store).unwrap(); /// ``` pub fn resize_cbcr_f16<'a>( @@ -138,7 +138,7 @@ impl Scaler { /// use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler}; /// let mut scaler = Scaler::new(ResamplingFunction::Bilinear); /// let src_store = ImageStore::alloc(100, 100); - /// let mut dst_store = ImageStoreMut::::alloc_with_depth(50, 50, 10); + /// let mut dst_store = ImageStoreMut::::alloc_with_depth(50, 50, 10); /// scaler.resize_plane_f16(&src_store, &mut dst_store).unwrap(); /// ``` /// diff --git a/src/sse/alpha_f16.rs b/src/sse/alpha_f16.rs index a8ff24d..e0c6195 100644 --- a/src/sse/alpha_f16.rs +++ b/src/sse/alpha_f16.rs @@ -39,9 +39,9 @@ use std::arch::x86::*; use std::arch::x86_64::*; pub(crate) fn sse_premultiply_alpha_rgba_f16( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, height: usize, @@ -60,9 +60,9 @@ pub(crate) fn sse_premultiply_alpha_rgba_f16( #[target_feature(enable = "sse4.1")] unsafe fn sse_premultiply_alpha_rgba_f16_regular( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, height: usize, @@ -75,9 +75,9 @@ unsafe fn sse_premultiply_alpha_rgba_f16_regular( #[target_feature(enable = "sse4.1", enable = "f16c")] unsafe fn sse_premultiply_alpha_rgba_f16c( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, height: usize, @@ -89,10 +89,7 @@ unsafe fn sse_premultiply_alpha_rgba_f16c( } #[inline(always)] -unsafe fn sse_premultiply_alpha_rgba_row_f16_impl( - dst: &mut [half::f16], - src: &[half::f16], -) { +unsafe fn sse_premultiply_alpha_rgba_row_f16_impl(dst: &mut [f16], src: &[f16]) { let mut rem = dst; let mut src_rem = src; @@ -145,9 +142,9 @@ unsafe fn sse_premultiply_alpha_rgba_row_f16_impl( #[inline(always)] unsafe fn sse_premultiply_alpha_rgba_f16_impl( - dst: &mut [half::f16], + dst: &mut [f16], dst_stride: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, width: usize, _: usize, @@ -177,7 +174,7 @@ unsafe fn sse_premultiply_alpha_rgba_f16_impl( } pub(crate) fn sse_unpremultiply_alpha_rgba_f16( - in_place: &mut [half::f16], + in_place: &mut [f16], stride: usize, width: usize, height: usize, @@ -194,7 +191,7 @@ pub(crate) fn sse_unpremultiply_alpha_rgba_f16( #[target_feature(enable = "sse4.1")] unsafe fn sse_unpremultiply_alpha_rgba_f16_regular( - in_place: &mut [half::f16], + in_place: &mut [f16], stride: usize, width: usize, height: usize, @@ -205,7 +202,7 @@ unsafe fn sse_unpremultiply_alpha_rgba_f16_regular( #[target_feature(enable = "sse4.1", enable = "f16c")] unsafe fn sse_unpremultiply_alpha_rgba_f16c( - in_place: &mut [half::f16], + in_place: &mut [f16], stride: usize, width: usize, height: usize, @@ -215,7 +212,7 @@ unsafe fn sse_unpremultiply_alpha_rgba_f16c( } #[inline(always)] -unsafe fn sse_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut 
[half::f16]) { +unsafe fn sse_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [f16]) { let mut rem = in_place; for dst in rem.chunks_exact_mut(8 * 4) { @@ -293,7 +290,7 @@ unsafe fn sse_unpremultiply_alpha_rgba_f16_row_impl(in_place: #[inline(always)] unsafe fn sse_unpremultiply_alpha_rgba_f16_impl( - in_place: &mut [half::f16], + in_place: &mut [f16], stride: usize, width: usize, _: usize, diff --git a/src/sse/f16_utils.rs b/src/sse/f16_utils.rs index 7f7a8e1..f0e78e2 100644 --- a/src/sse/f16_utils.rs +++ b/src/sse/f16_utils.rs @@ -211,7 +211,7 @@ pub(crate) unsafe fn _mm_cvtph_psx(x: __m128i) -> __m128 { #[cfg(test)] mod tests { use super::*; - use half::f16; + use f16; #[test] fn test_conversion_into_f16() { diff --git a/src/sse/mod.rs b/src/sse/mod.rs index 5f7a083..1ef68b2 100644 --- a/src/sse/mod.rs +++ b/src/sse/mod.rs @@ -27,23 +27,23 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod alpha_f16; mod alpha_f32; mod alpha_u16; mod alpha_u8; mod cbcr8_hrs; mod check_alpha; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod f16_utils; mod plane_f32; mod plane_u8; mod plane_u8_hrs; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod rgb_f16; mod rgb_f32; mod rgb_u8; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod rgba_f16; mod rgba_f32; mod rgba_u16; @@ -53,7 +53,7 @@ mod rgba_u8_lb; mod routines; mod u8_utils; mod utils; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] mod vertical_f16; mod vertical_f32; mod vertical_u16; @@ -61,7 +61,7 @@ mod vertical_u16_lb; mod vertical_u8; mod vertical_u8_lp; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use alpha_f16::{sse_premultiply_alpha_rgba_f16, sse_unpremultiply_alpha_rgba_f16}; pub(crate) use alpha_f32::sse_premultiply_alpha_rgba_f32; pub(crate) use alpha_f32::sse_unpremultiply_alpha_rgba_f32; @@ -81,7 +81,7 @@ pub(crate) use plane_u8::{ pub(crate) use plane_u8_hrs::{ convolve_horizontal_plane_sse_row_hrs, convolve_horizontal_plane_sse_rows_hrs_4_u8, }; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use rgb_f16::{ convolve_horizontal_rgb_sse_row_one_f16, convolve_horizontal_rgb_sse_rows_4_f16, }; @@ -89,7 +89,7 @@ pub(crate) use rgb_f32::{ convolve_horizontal_rgb_sse_row_one_f32, convolve_horizontal_rgb_sse_rows_4_f32, }; pub(crate) use rgb_u8::*; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use rgba_f16::{ convolve_horizontal_rgba_sse_row_one_f16, convolve_horizontal_rgba_sse_rows_4_f16, }; @@ -111,7 +111,7 @@ pub(crate) use rgba_u8_lb::{ pub(crate) use routines::{load_4_weights, load_4_weights_group_2_avx, load_8_weights_group_4_avx}; pub(crate) use u8_utils::*; pub(crate) use utils::*; -#[cfg(feature = "half")] +#[cfg(feature = "nightly_f16")] pub(crate) use vertical_f16::convolve_vertical_sse_row_f16; pub(crate) use vertical_f32::convolve_vertical_rgb_sse_row_f32; pub(crate) use vertical_u16::convolve_column_sse_u16; diff --git a/src/sse/rgb_f16.rs b/src/sse/rgb_f16.rs index 637b774..66335b6 100644 --- a/src/sse/rgb_f16.rs +++ b/src/sse/rgb_f16.rs @@ -30,7 +30,7 @@ use crate::filter_weights::FilterWeights; use crate::sse::f16_utils::{_mm_cvtph_psx, _mm_cvtps_phx}; use crate::sse::{_mm_prefer_fma_ps, load_4_weights, shuffle}; -use half::f16; +use core::f16; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] diff --git a/src/sse/rgba_f16.rs b/src/sse/rgba_f16.rs index 5e367d2..0f234b1 100644 --- 
a/src/sse/rgba_f16.rs +++ b/src/sse/rgba_f16.rs @@ -32,7 +32,7 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -use half::f16; +use core::f16; use crate::filter_weights::FilterWeights; use crate::sse::f16_utils::{_mm_cvtph_psx, _mm_cvtps_phx}; diff --git a/src/sse/vertical_f16.rs b/src/sse/vertical_f16.rs index dfa8782..9807205 100644 --- a/src/sse/vertical_f16.rs +++ b/src/sse/vertical_f16.rs @@ -26,6 +26,7 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +use core::f16; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] @@ -39,9 +40,9 @@ use crate::sse::f16_utils::{_mm_cvtph_psx, _mm_cvtps_phx}; pub(crate) unsafe fn convolve_vertical_part_sse_f16( start_y: usize, start_x: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -71,9 +72,9 @@ pub(crate) unsafe fn convolve_vertical_part_sse_f16( start_y: usize, start_x: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -102,9 +103,9 @@ pub(crate) unsafe fn convolve_vertical_part_sse_4_f16( start_y: usize, start_x: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -155,9 +156,9 @@ pub(crate) unsafe fn convolve_vertical_part_sse_16_16( start_y: usize, start_x: usize, - src: &[half::f16], + src: &[f16], src_stride: usize, - dst: &mut [half::f16], + dst: &mut [f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -192,8 +193,8 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8_f16( width: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -217,8 +218,8 @@ pub(crate) fn convolve_vertical_sse_row_f16( unsafe fn convolve_vertical_sse_row_f16_regular( width: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -234,8 +235,8 @@ unsafe fn convolve_vertical_sse_row_f16_regular( unsafe fn convolve_vertical_sse_row_f16c_fma( width: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -251,8 +252,8 @@ unsafe fn convolve_vertical_sse_row_f16c_fma( unsafe fn convolve_vertical_sse_row_f16c( width: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -265,8 +266,8 @@ unsafe fn convolve_vertical_sse_row_f16c( unsafe fn convolve_vertical_sse_row_f16_impl( _: usize, bounds: &FilterBounds, - src: &[half::f16], - dst: &mut [half::f16], + src: &[f16], + dst: &mut [f16], src_stride: usize, weight_ptr: &[f32], ) {
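The remaining hunks swap `half::f16` for the nightly `core::f16` primitive behind the `nightly_f16` feature; the kernels still widen to `f32` for the actual multiply-accumulate and only narrow back to half precision on store. A scalar outline of that pattern, nightly-only to mirror the feature gate (the SIMD versions use F16C/FP16 conversions instead of these casts):

```rust
#![feature(f16)] // requires a nightly toolchain, like the crate's nightly_f16 feature

fn convolve_column_f16(column: &[f16], weights: &[f32]) -> f16 {
    let mut acc = 0.0f32;
    for (&v, &w) in column.iter().zip(weights) {
        // Widen each half-precision sample before the multiply-accumulate,
        // so precision is lost only once, at the final store.
        acc += (v as f32) * w;
    }
    acc as f16
}

fn main() {
    let column = [1.0f16, 2.0, 3.0];
    let weights = [0.25f32, 0.5, 0.25];
    println!("{}", convolve_column_f16(&column, &weights) as f32); // 2.0
}
```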