diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml index 156cf3b..3e8fb85 100644 --- a/.github/workflows/build_push.yml +++ b/.github/workflows/build_push.yml @@ -7,14 +7,14 @@ concurrency: on: push: branches: - - '*' + - 'master' + - 'dev' - '!ci_test_*' tags-ignore: - '*' pull_request: branches: - - '*' - - '!ci_test_*' + - 'master' jobs: build: @@ -29,9 +29,60 @@ jobs: - run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target i686-unknown-linux-gnu - run: cargo build --target powerpc-unknown-linux-gnu - run: cargo build --target riscv64gc-unknown-linux-gnu - - run: cargo clippy - - run: cargo clippy --target aarch64-unknown-linux-gnu - run: RUSTFLAGS="-C target-feature=+sse4.1" cargo build --target x86_64-unknown-linux-gnu - run: RUSTFLAGS="-C target-feature=+sse4.1,+f16c" cargo build --features half --target x86_64-unknown-linux-gnu - run: RUSTFLAGS="-C target-feature=+avx2,+f16c" cargo build --features half --target x86_64-unknown-linux-gnu - - run: RUSTFLAGS="-C target-feature=+avx2" cargo build --target x86_64-unknown-linux-gnu \ No newline at end of file + - run: RUSTFLAGS="-C target-feature=+avx2" cargo build --target x86_64-unknown-linux-gnu + + clippy: + name: Clippy + strategy: + matrix: + os: [ ubuntu-latest, macos-latest ] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - run: cargo clippy + + fuzz_rgba_8bit: + name: Fuzzing 8bit + strategy: + matrix: + os: [ ubuntu-latest, macos-latest ] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@nightly + - run: cargo install cargo-fuzz + - run: cargo fuzz run resize_rgba -- -max_total_time=30 + - run: cargo fuzz run resize_rgb -- -max_total_time=30 + - run: cargo fuzz run resize_plane -- -max_total_time=30 + + fuzz_rgba_high_bit: + name: Fuzzing High bit-depth + strategy: + matrix: + os: [ ubuntu-latest, macos-latest ] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@nightly + - run: cargo install cargo-fuzz + - run: cargo fuzz run resize_rgba_u16 -- -max_total_time=30 + - run: cargo fuzz run resize_rgb_u16 -- -max_total_time=30 + - run: cargo fuzz run resize_plane_u16 -- -max_total_time=30 + + fuzz_rgba_f32: + name: Fuzzing floating point + strategy: + matrix: + os: [ ubuntu-latest, macos-latest ] + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@nightly + - run: cargo install cargo-fuzz + - run: cargo fuzz run resize_rgba_f32 -- -max_total_time=30 + - run: cargo fuzz run resize_rgb_f32 -- -max_total_time=30 + - run: cargo fuzz run resize_plane_f32 -- -max_total_time=30 \ No newline at end of file diff --git a/.github/workflows/no-response.yml b/.github/workflows/no-response.yml new file mode 100644 index 0000000..fc5cbc5 --- /dev/null +++ b/.github/workflows/no-response.yml @@ -0,0 +1,26 @@ +name: no-response + +on: + schedule: + - cron: '0 0 * * *' # Runs daily at midnight + workflow_dispatch: + +jobs: + noResponse: + permissions: + issues: write + pull-requests: write + runs-on: ubuntu-latest + steps: + - uses: actions/stale@v9 + with: + repo-token: ${{ github.token }} + days-before-stale: -1 + days-before-close: 14 + only-labels: 'waiting for author' + stale-issue-label: 'waiting for author' + stale-pr-label: 'waiting for author' + remove-stale-when-updated: true + ignore-updates: false + close-issue-message: This issue has been automatically closed due to inactivity. 
We requested additional information but have not received a response from the original author. Without the requested details, we cannot proceed. If you have or find the information needed, please comment so we can reopen the issue. + close-pr-message: This pull request has been automatically closed due to inactivity. We requested additional information but have not received a response from the original author. Without the requested details, we cannot proceed. If you have the needed information or updates, please reopen the PR or comment so we can continue the review. \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 50c2826..9eda492 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,12 +2,6 @@ # It is not intended for manual editing. version = 4 -[[package]] -name = "adler" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" - [[package]] name = "adler2" version = "2.0.0" @@ -37,15 +31,15 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstyle" -version = "1.0.8" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" [[package]] name = "anyhow" -version = "1.0.90" +version = "1.0.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37bf3594c4c988a53154954629820791dde498571819ae4ca50ca811e060cc95" +checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" [[package]] name = "app" @@ -56,13 +50,22 @@ dependencies = [ "half", "image", "pic-scale", + "yuvutils-rs", +] + +[[package]] +name = "app-fuzz" +version = "0.0.0" +dependencies = [ + "libfuzzer-sys", + "pic-scale", ] [[package]] name = "arbitrary" -version = "1.3.2" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110" +checksum = "dde20b3d026af13f561bdd0f15edf01fc734f0dafcedbaf42bba506a9517f223" [[package]] name = "arg_enum_proc_macro" @@ -124,9 +127,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitstream-io" -version = "2.5.3" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b81e1519b0d82120d2fd469d5bfb2919a9361c48b02d82d04befc1cdd2002452" +checksum = "6099cdc01846bc367c4e7dd630dc5966dccf36b652fae7a74e17b640411a91b2" [[package]] name = "built" @@ -142,9 +145,9 @@ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "bytemuck" -version = "1.19.0" +version = "1.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d" +checksum = "8b37c88a63ffd85d15b406896cc343916d7cf57838a847b3a6f2ca5d39a5695a" [[package]] name = "byteorder" @@ -166,9 +169,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.1.31" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2e7962b54006dcfcc61cb72735f4d89bb97061dd6a7ed882ec6b8ee53714c6f" +checksum = "fd9de9f2205d5ef3fd67e685b0df337994ddd4495e2a28d185500d0e1edfea47" dependencies = [ "jobserver", "libc", @@ -226,18 +229,18 @@ dependencies = [ 
[[package]] name = "clap" -version = "4.5.20" +version = "4.5.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8" +checksum = "fb3b4b9e5a7c7514dfa52869339ee98b3156b0bfb4e8a77c4ff4babb64b1604f" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.20" +version = "4.5.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54" +checksum = "b17a95aa67cc7b5ebd32aa5370189aa0d79069ef1c64ce893bd30fb24bff20ec" dependencies = [ "anstyle", "clap_lex", @@ -245,9 +248,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" +checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" [[package]] name = "color_quant" @@ -257,9 +260,9 @@ checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" [[package]] name = "colorutils-rs" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e1632390e4314e1ce4b4060fbbb36f0d576b994399088bfdfe4216eef0aa54b" +checksum = "31ca2cc8ed986672b15bfd3e416014e40cada05196bdfaa51168985f3c2e81f1" dependencies = [ "erydanos", "half", @@ -385,15 +388,14 @@ dependencies = [ [[package]] name = "exr" -version = "1.72.0" +version = "1.73.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "887d93f60543e9a9362ef8a21beedd0a833c5d9610e18c67abe15a5963dcb1a4" +checksum = "f83197f59927b46c04a183a619b7c29df34e63e63c7869320862268c0ef687e0" dependencies = [ "bit_field", - "flume", "half", "lebe", - "miniz_oxide 0.7.4", + "miniz_oxide", "rayon-core", "smallvec", "zune-inflate", @@ -413,30 +415,21 @@ dependencies = [ [[package]] name = "fdeflate" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8090f921a24b04994d9929e204f50b498a33ea6ba559ffaa05e04f7ee7fb5ab" +checksum = "07c6f4c64c1d33a3111c4466f7365ebdcc37c5bd1ea0d62aae2e3d722aacbedb" dependencies = [ "simd-adler32", ] [[package]] name = "flate2" -version = "1.0.34" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" +checksum = "c936bfdafb507ebbf50b8074c54fa31c5be9a1e7e5f467dd659697041407d07c" dependencies = [ "crc32fast", - "miniz_oxide 0.8.0", -] - -[[package]] -name = "flume" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0e4dd2a88388a1f4ccc7c9ce104604dab68d9f408dc34cd45823d5a9069095" -dependencies = [ - "spin", + "miniz_oxide", ] [[package]] @@ -473,9 +466,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.0" +version = "0.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" +checksum = "3a9bfc1af68b1726ea47d3d5109de126281def866b33970e10fbab11b5dafab3" [[package]] name = "heck" @@ -580,9 +573,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.11" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +checksum = 
"540654e97a3f4470a492cd30ff187bc95d89557a903a2bbf112e2fae98104ef2" [[package]] name = "jobserver" @@ -616,26 +609,25 @@ checksum = "03087c2bad5e1034e8cace5926dec053fb3790248370865f5117a7d0213354c8" [[package]] name = "libc" -version = "0.2.161" +version = "0.2.164" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" +checksum = "433bfe06b8c75da9b2e3fbea6e5329ff87748f0b144ef75306e674c3f6f7c13f" [[package]] name = "libfuzzer-sys" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7" +checksum = "9b9569d2f74e257076d8c6bfa73fb505b46b851e51ddaecc825944aa3bed17fa" dependencies = [ "arbitrary", "cc", - "once_cell", ] [[package]] name = "libm" -version = "0.2.8" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" [[package]] name = "litrs" @@ -643,16 +635,6 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4ce301924b7887e9d637144fdade93f9dfff9b60981d4ac161db09720d39aa5" -[[package]] -name = "lock_api" -version = "0.4.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" -dependencies = [ - "autocfg", - "scopeguard", -] - [[package]] name = "log" version = "0.4.22" @@ -696,15 +678,6 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" -[[package]] -name = "miniz_oxide" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" -dependencies = [ - "adler", -] - [[package]] name = "miniz_oxide" version = "0.8.0" @@ -808,7 +781,7 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pic-scale" -version = "0.3.5" +version = "0.3.6" dependencies = [ "colorutils-rs", "half", @@ -861,7 +834,7 @@ dependencies = [ "crc32fast", "fdeflate", "flate2", - "miniz_oxide 0.8.0", + "miniz_oxide", ] [[package]] @@ -875,9 +848,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.88" +version = "1.0.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c3a7fc5db1e57d5a779a352c8cdb57b29aa4c40cc69c3a68a7fedc815fbf2f9" +checksum = "307e3004becf10f5a6e0d59d20f3cd28231b0e0827a96cd3e0ce6d14bc1e4bb3" dependencies = [ "unicode-ident", ] @@ -1027,9 +1000,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", @@ -1039,9 +1012,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.8" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ 
-1075,26 +1048,20 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "scopeguard" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" - [[package]] name = "serde" -version = "1.0.210" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" +checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.210" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" +checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" dependencies = [ "proc-macro2", "quote", @@ -1103,9 +1070,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.132" +version = "1.0.133" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" +checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" dependencies = [ "itoa", "memchr", @@ -1149,20 +1116,11 @@ version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" -[[package]] -name = "spin" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" -dependencies = [ - "lock_api", -] - [[package]] name = "syn" -version = "2.0.81" +version = "2.0.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "198514704ca887dd5a1e408c6c6cdcba43672f9b4062e1b24aa34e74e6d7faae" +checksum = "44d46482f1c1c87acd84dea20c1bf5ebff4c757009ed6bf19cfd36fb10e92c4e" dependencies = [ "proc-macro2", "quote", @@ -1190,18 +1148,18 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "thiserror" -version = "1.0.64" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.64" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", @@ -1265,9 +1223,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" +checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" [[package]] name = "v_frame" @@ -1519,6 +1477,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "yuvutils-rs" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d0d13bb8e3921f9d76ef4bcb348108578df0402c6cf2695ef0690b4b64c7d9" +dependencies = [ + "num-traits", +] + [[package]] name = "zerocopy" version = "0.7.35" diff --git a/Cargo.toml b/Cargo.toml 
index 9c091c7..68371ce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,8 @@ -workspace = { members = ["app", "wasm"] } +workspace = { members = ["app", "wasm", "fuzz"] } [package] name = "pic-scale" -version = "0.3.5" +version = "0.3.6" edition = "2021" description = "High performance image scaling" readme = "README.md" @@ -14,6 +14,7 @@ categories = ["multimedia::images", "multimedia::video"] homepage = "https://github.com/awxkee/pic-scale" repository = "https://github.com/awxkee/pic-scale" exclude = ["*.jpg", "/assets", "*.png", "*.sh", "/assets/*"] +rust-version = "1.73.0" [dependencies] colorutils-rs = {version = "0.7.0", optional = true} diff --git a/README.md b/README.md index 645438a..6b24ff1 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ Despite all implementation are fast, not all the paths are implemented using SIM | RGBA (f16) | x | x | x | - | | RGB (f16) | x | ~ | ~ | - | | Plane (f16) | ~ | ~ | ~ | - | +| AR30/RA30 | x | - | - | - | #### Features @@ -85,16 +86,9 @@ Even when `half` feature activated but platform do not support or features not e Example comparison with `fast-image-resize` time for downscale RGB 4928x3279 image in 4 times. -| Lanczos3 | SSE | AVX | NEON | |-----------|:-----:|:-----:|:-----:| -| pic-scale | 43.84 | 28.46 | 8.56 | -| fir | 45.36 | 32.07 | 32.77 | - -Example comparison with `fast-image-resize` time for downscale RGB 4928x3279 image in 4 times. - | Lanczos3 | AVX | NEON | |-----------|:-----:|:-----:| -| pic-scale | 16.67 | 10.88 | +| pic-scale | 16.67 | 8.54 | | fir | 22.83 | 24.97 | Example comparison time for downscale RGBA 4928x3279 image in two times with premultiplying alpha. diff --git a/app/Cargo.toml b/app/Cargo.toml index 1c09c6d..aa48512 100644 --- a/app/Cargo.toml +++ b/app/Cargo.toml @@ -9,6 +9,7 @@ image = { version = "0.25.5", features = ["default"] } pic-scale = { path = "..", features = ["half"], default-features = true } fast_image_resize = { version = "5.0.0", features = [] } half = { version = "2.4.1", default-features = true } +yuvutils-rs = "0.5.5" [dev-dependencies] criterion = "0.5.1" diff --git a/app/src/main.rs b/app/src/main.rs index 0e2b15a..3b4cb82 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -11,12 +11,44 @@ use fast_image_resize::{ }; use image::{EncodableLayout, GenericImageView, ImageReader}; use pic_scale::{ - ImageSize, ImageStore, LinearApproxScaler, ResamplingFunction, Scaler, Scaling, ScalingU16, - ThreadingPolicy, + Ar30ByteOrder, ImageSize, ImageStore, LinearApproxScaler, ResamplingFunction, Scaler, Scaling, + ScalingU16, ThreadingPolicy, }; +use yuvutils_rs::{ + ar30_to_rgba8, ra30_to_rgba8, rgb8_to_ar30, rgba8_to_ar30, rgba8_to_ra30, Rgb30ByteOrder, +}; + +fn resize_plane( + src_width: usize, + src_height: usize, + dst_width: usize, + dst_height: usize, + sampler: ResamplingFunction, +) { + if src_width == 0 + || src_width > 2000 + || src_height == 0 + || src_height > 2000 + || dst_width == 0 + || dst_width > 512 + || dst_height == 0 + || dst_height > 512 + { + return; + } + + let mut src_data = vec![15u8; src_width * src_height * 1]; + + let store = ImageStore::<u8, 1>::from_slice(&mut src_data, src_width, src_height).unwrap(); + let scaler = Scaler::new(sampler); + _ = scaler + .resize_plane(ImageSize::new(dst_width, dst_height), store) + .unwrap(); +} + fn main() { - test_fast_image(); + // test_fast_image(); let img = ImageReader::open("./assets/nasa-4928x3279-rgba.png") .unwrap() .decode() @@ -25,21 +57,35 @@ fn main() { .unwrap(); let transient = img.to_rgba8(); let mut bytes =
Vec::from(transient.as_bytes()); - let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); + let mut scaler = Scaler::new(ResamplingFunction::Bilinear); scaler.set_threading_policy(ThreadingPolicy::Single); - let mut choke: Vec<u16> = bytes.iter().map(|&x| (x as u16) << 2).collect(); + resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear); + + // let mut choke: Vec<u16> = bytes.iter().map(|&x| (x as u16) << 2).collect(); // let store = - ImageStore::<u16, 4>::from_slice(&mut choke, dimensions.0 as usize, dimensions.1 as usize) + ImageStore::<u8, 4>::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize) .unwrap(); + + let dst_size = ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2); + // let mut resized_ar = vec![0u32; dst_size.width * dst_size.height]; let start_time = Instant::now(); + // scaler + // .resize_ra30( + // &ar30_src, + // ImageSize::new(dimensions.0 as usize, dimensions.1 as usize), + // &mut resized_ar, + // dst_size, + // Ar30ByteOrder::Host, + // ) + // .unwrap(); + let resized = scaler - .resize_rgba_u16( + .resize_rgba( ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), store, - 10, - true, + false, ) .unwrap(); @@ -47,6 +93,18 @@ fn main() { // Print the elapsed time in milliseconds println!("Scaler: {:.2?}", elapsed_time); + // let mut resized = vec![0u8; dst_size.width * dst_size.height * 4]; + // ra30_to_rgba8( + // &resized_ar, + // dst_size.width as u32, + // Rgb30ByteOrder::Host, + // &mut resized, + // dst_size.width as u32 * 4, + // dst_size.width as u32, + // dst_size.height as u32, + // ) + // .unwrap(); + // let dst: Vec<u8> = resized.as_bytes().iter().map(|&x| x).collect::<Vec<u8>>(); // println!("f1 {}, f2 {}, f3 {}, f4 {}", dst[0], dst[1], dst[2], dst[3]); // let dst: Vec<u8> = resized @@ -106,9 +164,18 @@ fn main() { // .map(|&x| (x * 255f32) as u8) // .collect(); - let dst: Vec<u8> = resized.as_bytes().iter().map(|&x| (x >> 2) as u8).collect(); // - // let dst = resized.as_bytes(); + // let dst: Vec<u8> = resized.as_bytes().iter().map(|&x| (x >> 2) as u8).collect(); // + let dst = resized.as_bytes(); + // let dst = resized; + // image::save_buffer( + // "converted.png", + // &dst, + // dst_size.width as u32, + // dst_size.height as u32, + // image::ColorType::Rgba8, + // ) + // .unwrap(); if resized.channels == 4 { image::save_buffer( diff --git a/fuzz/.gitignore b/fuzz/.gitignore new file mode 100644 index 0000000..1a45eee --- /dev/null +++ b/fuzz/.gitignore @@ -0,0 +1,4 @@ +target +corpus +artifacts +coverage diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml new file mode 100644 index 0000000..3db86e5 --- /dev/null +++ b/fuzz/Cargo.toml @@ -0,0 +1,75 @@ +[package] +name = "app-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +pic-scale = {path = "../"} + +[[bin]] +name = "resize_rgba" +path = "resize_rgba/resize_rgba.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "resize_rgb" +path = "resize_rgb/resize_rgb.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "resize_plane" +path = "resize_plane/resize_plane.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "resize_rgba_u16" +path = "resize_rgba_u16/resize_rgba_u16.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "resize_rgb_u16" +path = "resize_rgb_u16/resize_rgb_u16.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "resize_plane_u16" +path = "resize_plane_u16/resize_plane_u16.rs" +test = false +doc = false +bench = false +
+[[bin]] +name = "resize_rgba_f32" +path = "resize_rgba_f32/resize_rgba_f32.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "resize_rgb_f32" +path = "resize_rgb_f32/resize_rgb_f32.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "resize_plane_f32" +path = "resize_plane_f32/resize_plane_f32.rs" +test = false +doc = false +bench = false diff --git a/fuzz/resize_plane/resize_plane.rs b/fuzz/resize_plane/resize_plane.rs new file mode 100644 index 0000000..829cca4 --- /dev/null +++ b/fuzz/resize_plane/resize_plane.rs @@ -0,0 +1,71 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler}; + +fuzz_target!(|data: (u16, u16, u16, u16)| { + resize_plane( + data.0 as usize, + data.1 as usize, + data.2 as usize, + data.3 as usize, + ResamplingFunction::Bilinear, + ) +}); + +fn resize_plane( + src_width: usize, + src_height: usize, + dst_width: usize, + dst_height: usize, + sampler: ResamplingFunction, +) { + if src_width == 0 + || src_width > 2000 + || src_height == 0 + || src_height > 2000 + || dst_width == 0 + || dst_width > 512 + || dst_height == 0 + || dst_height > 512 + { + return; + } + + let mut src_data = vec![15u8; src_width * src_height]; + + let store = ImageStore::<u8, 1>::from_slice(&mut src_data, src_width, src_height).unwrap(); + let scaler = Scaler::new(sampler); + _ = scaler + .resize_plane(ImageSize::new(dst_width, dst_height), store) + .unwrap(); +} diff --git a/fuzz/resize_plane_f32/resize_plane_f32.rs b/fuzz/resize_plane_f32/resize_plane_f32.rs new file mode 100644 index 0000000..bb128e0 --- /dev/null +++ b/fuzz/resize_plane_f32/resize_plane_f32.rs @@ -0,0 +1,75 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1.
Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler}; + +fuzz_target!(|data: (u16, u16, u16, u16)| { + resize_plane( + data.0 as usize, + data.1 as usize, + data.2 as usize, + data.3 as usize, + ResamplingFunction::Bilinear, + ) +}); + +fn resize_plane( + src_width: usize, + src_height: usize, + dst_width: usize, + dst_height: usize, + sampler: ResamplingFunction, +) { + if src_width == 0 + || src_width > 2000 + || src_height == 0 + || src_height > 2000 + || dst_width == 0 + || dst_width > 512 + || dst_height == 0 + || dst_height > 512 + { + return; + } + + let mut src_data = vec![0f32; src_width * src_height]; + + let store = ImageStore::<f32, 1>::from_slice(&mut src_data, src_width, src_height).unwrap(); + let scaler = Scaler::new(sampler); + _ = scaler + .resize_plane_f32(ImageSize::new(dst_width, dst_height), store) + .unwrap(); + let store = ImageStore::<f32, 1>::from_slice(&mut src_data, src_width, src_height).unwrap(); + _ = scaler + .resize_plane_f32(ImageSize::new(dst_width, dst_height), store) + .unwrap(); +} diff --git a/fuzz/resize_plane_u16/resize_plane_u16.rs b/fuzz/resize_plane_u16/resize_plane_u16.rs new file mode 100644 index 0000000..8a59c96 --- /dev/null +++ b/fuzz/resize_plane_u16/resize_plane_u16.rs @@ -0,0 +1,76 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingU16}; + +fuzz_target!(|data: (u16, u16, u16, u16)| { + resize_rgb( + data.0 as usize, + data.1 as usize, + data.2 as usize, + data.3 as usize, + ResamplingFunction::Lanczos3, + ) +}); + +fn resize_rgb( + src_width: usize, + src_height: usize, + dst_width: usize, + dst_height: usize, + sampler: ResamplingFunction, +) { + if src_width == 0 + || src_width > 2000 + || src_height == 0 + || src_height > 2000 + || dst_width == 0 + || dst_width > 512 + || dst_height == 0 + || dst_height > 512 + { + return; + } + + let mut src_data = vec![1u16; src_width * src_height]; + + let store = ImageStore::<u16, 1>::from_slice(&mut src_data, src_width, src_height).unwrap(); + let scaler = Scaler::new(sampler); + _ = scaler + .resize_plane_u16(ImageSize::new(dst_width, dst_height), store, 10) + .unwrap(); + + let store = ImageStore::<u16, 1>::from_slice(&mut src_data, src_width, src_height).unwrap(); + _ = scaler + .resize_plane_u16(ImageSize::new(dst_width, dst_height), store, 16) + .unwrap(); +} diff --git a/fuzz/resize_rgb/resize_rgb.rs b/fuzz/resize_rgb/resize_rgb.rs new file mode 100644 index 0000000..ecc74d3 --- /dev/null +++ b/fuzz/resize_rgb/resize_rgb.rs @@ -0,0 +1,71 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling}; + +fuzz_target!(|data: (u16, u16, u16, u16)| { + resize_rgb( + data.0 as usize, + data.1 as usize, + data.2 as usize, + data.3 as usize, + ResamplingFunction::Bilinear, + ) +}); + +fn resize_rgb( + src_width: usize, + src_height: usize, + dst_width: usize, + dst_height: usize, + sampler: ResamplingFunction, +) { + if src_width == 0 + || src_width > 2000 + || src_height == 0 + || src_height > 2000 + || dst_width == 0 + || dst_width > 512 + || dst_height == 0 + || dst_height > 512 + { + return; + } + + let mut src_data = vec![0u8; src_width * src_height * 3]; + + let store = ImageStore::<u8, 3>::from_slice(&mut src_data, src_width, src_height).unwrap(); + let scaler = Scaler::new(sampler); + _ = scaler + .resize_rgb(ImageSize::new(dst_width, dst_height), store) + .unwrap(); +} diff --git a/fuzz/resize_rgb_f32/resize_rgb_f32.rs b/fuzz/resize_rgb_f32/resize_rgb_f32.rs new file mode 100644 index 0000000..f2d4773 --- /dev/null +++ b/fuzz/resize_rgb_f32/resize_rgb_f32.rs @@ -0,0 +1,75 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingF32}; + +fuzz_target!(|data: (u16, u16, u16, u16)| { + resize_rgb( + data.0 as usize, + data.1 as usize, + data.2 as usize, + data.3 as usize, + ResamplingFunction::Bilinear, + ) +}); + +fn resize_rgb( + src_width: usize, + src_height: usize, + dst_width: usize, + dst_height: usize, + sampler: ResamplingFunction, +) { + if src_width == 0 + || src_width > 2000 + || src_height == 0 + || src_height > 2000 + || dst_width == 0 + || dst_width > 512 + || dst_height == 0 + || dst_height > 512 + { + return; + } + + let mut src_data = vec![0f32; src_width * src_height * 3]; + + let store = ImageStore::<f32, 3>::from_slice(&mut src_data, src_width, src_height).unwrap(); + let scaler = Scaler::new(sampler); + _ = scaler + .resize_rgb_f32(ImageSize::new(dst_width, dst_height), store) + .unwrap(); + let store = ImageStore::<f32, 3>::from_slice(&mut src_data, src_width, src_height).unwrap(); + _ = scaler + .resize_rgb_f32(ImageSize::new(dst_width, dst_height), store) + .unwrap(); +} diff --git a/fuzz/resize_rgb_u16/resize_rgb_u16.rs b/fuzz/resize_rgb_u16/resize_rgb_u16.rs new file mode 100644 index 0000000..47e48fd --- /dev/null +++ b/fuzz/resize_rgb_u16/resize_rgb_u16.rs @@ -0,0 +1,76 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingU16}; + +fuzz_target!(|data: (u16, u16, u16, u16)| { + resize_rgb( + data.0 as usize, + data.1 as usize, + data.2 as usize, + data.3 as usize, + ResamplingFunction::Lanczos3, + ) +}); + +fn resize_rgb( + src_width: usize, + src_height: usize, + dst_width: usize, + dst_height: usize, + sampler: ResamplingFunction, +) { + if src_width == 0 + || src_width > 2000 + || src_height == 0 + || src_height > 2000 + || dst_width == 0 + || dst_width > 512 + || dst_height == 0 + || dst_height > 512 + { + return; + } + + let mut src_data = vec![1u16; src_width * src_height * 3]; + + let store = ImageStore::<u16, 3>::from_slice(&mut src_data, src_width, src_height).unwrap(); + let scaler = Scaler::new(sampler); + _ = scaler + .resize_rgb_u16(ImageSize::new(dst_width, dst_height), store, 10) + .unwrap(); + + let store = ImageStore::<u16, 3>::from_slice(&mut src_data, src_width, src_height).unwrap(); + _ = scaler + .resize_rgb_u16(ImageSize::new(dst_width, dst_height), store, 16) + .unwrap(); +} diff --git a/fuzz/resize_rgba/resize_rgba.rs b/fuzz/resize_rgba/resize_rgba.rs new file mode 100644 index 0000000..dab34f5 --- /dev/null +++ b/fuzz/resize_rgba/resize_rgba.rs @@ -0,0 +1,75 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling}; + +fuzz_target!(|data: (u16, u16, u16, u16)| { + resize_rgba( + data.0 as usize, + data.1 as usize, + data.2 as usize, + data.3 as usize, + ResamplingFunction::Bilinear, + ) +}); + +fn resize_rgba( + src_width: usize, + src_height: usize, + dst_width: usize, + dst_height: usize, + sampler: ResamplingFunction, +) { + if src_width == 0 + || src_width > 2000 + || src_height == 0 + || src_height > 2000 + || dst_width == 0 + || dst_width > 512 + || dst_height == 0 + || dst_height > 512 + { + return; + } + + let mut src_data = vec![0u8; src_width * src_height * 4]; + + let store = ImageStore::<u8, 4>::from_slice(&mut src_data, src_width, src_height).unwrap(); + let scaler = Scaler::new(sampler); + _ = scaler + .resize_rgba(ImageSize::new(dst_width, dst_height), store, false) + .unwrap(); + let store = ImageStore::<u8, 4>::from_slice(&mut src_data, src_width, src_height).unwrap(); + _ = scaler + .resize_rgba(ImageSize::new(dst_width, dst_height), store, true) + .unwrap(); +} diff --git a/fuzz/resize_rgba_f32/resize_rgba_f32.rs b/fuzz/resize_rgba_f32/resize_rgba_f32.rs new file mode 100644 index 0000000..8c08146 --- /dev/null +++ b/fuzz/resize_rgba_f32/resize_rgba_f32.rs @@ -0,0 +1,75 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingF32}; + +fuzz_target!(|data: (u16, u16, u16, u16)| { + resize_rgba( + data.0 as usize, + data.1 as usize, + data.2 as usize, + data.3 as usize, + ResamplingFunction::Bilinear, + ) +}); + +fn resize_rgba( + src_width: usize, + src_height: usize, + dst_width: usize, + dst_height: usize, + sampler: ResamplingFunction, +) { + if src_width == 0 + || src_width > 2000 + || src_height == 0 + || src_height > 2000 + || dst_width == 0 + || dst_width > 512 + || dst_height == 0 + || dst_height > 512 + { + return; + } + + let mut src_data = vec![0f32; src_width * src_height * 4]; + + let store = ImageStore::<f32, 4>::from_slice(&mut src_data, src_width, src_height).unwrap(); + let scaler = Scaler::new(sampler); + _ = scaler + .resize_rgba_f32(ImageSize::new(dst_width, dst_height), store, false) + .unwrap(); + let store = ImageStore::<f32, 4>::from_slice(&mut src_data, src_width, src_height).unwrap(); + _ = scaler + .resize_rgba_f32(ImageSize::new(dst_width, dst_height), store, true) + .unwrap(); +} diff --git a/fuzz/resize_rgba_u16/resize_rgba_u16.rs b/fuzz/resize_rgba_u16/resize_rgba_u16.rs new file mode 100644 index 0000000..494da64 --- /dev/null +++ b/fuzz/resize_rgba_u16/resize_rgba_u16.rs @@ -0,0 +1,95 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use pic_scale::{Ar30ByteOrder, ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingU16}; + +fuzz_target!(|data: (u16, u16, u16, u16)| { + resize_rgba( + data.0 as usize, + data.1 as usize, + data.2 as usize, + data.3 as usize, + ResamplingFunction::Lanczos3, + ) +}); + +fn resize_rgba( + src_width: usize, + src_height: usize, + dst_width: usize, + dst_height: usize, + sampler: ResamplingFunction, +) { + if src_width == 0 + || src_width > 2000 + || src_height == 0 + || src_height > 2000 + || dst_width == 0 + || dst_width > 512 + || dst_height == 0 + || dst_height > 512 + { + return; + } + + let mut src_data = vec![1u16; src_width * src_height * 4]; + + let store = ImageStore::<u16, 4>::from_slice(&mut src_data, src_width, src_height).unwrap(); + let scaler = Scaler::new(sampler); + _ = scaler + .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 10, false) + .unwrap(); + let store = ImageStore::<u16, 4>::from_slice(&mut src_data, src_width, src_height).unwrap(); + _ = scaler + .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 10, true) + .unwrap(); + + let store = ImageStore::<u16, 4>::from_slice(&mut src_data, src_width, src_height).unwrap(); + _ = scaler + .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 16, false) + .unwrap(); + + let store = ImageStore::<u16, 4>::from_slice(&mut src_data, src_width, src_height).unwrap(); + _ = scaler + .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 16, true) + .unwrap(); + + let src_data_ar30 = vec![1u32; src_width * src_height]; + let mut dst_data_ar30 = vec![1u32; dst_width * dst_height]; + _ = scaler.resize_ar30( + &src_data_ar30, + ImageSize::new(src_width, src_height), + &mut dst_data_ar30, + ImageSize::new(dst_width, dst_height), + Ar30ByteOrder::Host, + ); +} diff --git a/src/alpha_handle_f16.rs b/src/alpha_handle_f16.rs index 15e3f4c..74ac366 100644 --- a/src/alpha_handle_f16.rs +++ b/src/alpha_handle_f16.rs @@ -123,7 +123,7 @@ fn unpremultiply_alpha_rgba_impl_f16( } } -pub fn premultiply_alpha_rgba_f16( +pub(crate) fn premultiply_alpha_rgba_f16( dst: &mut [half::f16], src: &[half::f16], width: usize, @@ -154,7 +154,7 @@ pub fn premultiply_alpha_rgba_f16( _dispatcher(dst, src, width, height, pool); } -pub fn unpremultiply_alpha_rgba_f16( +pub(crate) fn unpremultiply_alpha_rgba_f16( in_place: &mut [half::f16], width: usize, height: usize, diff --git a/src/alpha_handle_f32.rs b/src/alpha_handle_f32.rs index 132c53a..5a46f5e 100644 --- a/src/alpha_handle_f32.rs +++ b/src/alpha_handle_f32.rs @@ -119,7 +119,7 @@ fn unpremultiply_alpha_rgba_impl_f32( } } -pub fn premultiply_alpha_rgba_f32( +pub(crate) fn premultiply_alpha_rgba_f32( dst: &mut [f32], src: &[f32], width: usize, @@ -147,7 +147,7 @@ pub fn premultiply_alpha_rgba_f32( _dispatcher(dst, src, width, height, pool); } -pub fn unpremultiply_alpha_rgba_f32( +pub(crate) fn unpremultiply_alpha_rgba_f32( in_place: &mut [f32], width: usize, height: usize, diff --git a/src/alpha_handle_u16.rs b/src/alpha_handle_u16.rs index a7f89d4..a9ecce7 100644 --- a/src/alpha_handle_u16.rs +++ b/src/alpha_handle_u16.rs @@ -39,21 +39,21 @@ use rayon::slice::ParallelSliceMut; use rayon::ThreadPool; #[inline] -pub fn div_by_1023(v: u32) -> u16 { +pub(crate) fn div_by_1023(v: u32) -> u16 { let round = 1 << 9; let v = v + round; (((v >> 10) + v) >> 10) as u16 } #[inline] -pub fn div_by_4095(v: u32) -> u16 { +pub(crate) fn div_by_4095(v: u32) -> u16 { let round = 1 << 11; let v = v + round; (((v >> 12) + v) >> 12) as u16 } #[inline] -pub
fn div_by_65535(v: u32) -> u16 { +pub(crate) fn div_by_65535(v: u32) -> u16 { let round = 1 << 15; let v_expand = v; let v = v_expand + round; @@ -101,7 +101,7 @@ pub(crate) fn premultiply_alpha_rgba_row(dst: &mut [u16], src: &[u16], max_color } } -pub fn unpremultiply_alpha_rgba_row(in_place: &mut [u16], max_colors: u32) { +pub(crate) fn unpremultiply_alpha_rgba_row(in_place: &mut [u16], max_colors: u32) { for dst in in_place.chunks_exact_mut(4) { let a = dst[3] as u32; if a != 0 { @@ -161,7 +161,7 @@ fn unpremultiply_alpha_rgba_impl( } } -pub fn premultiply_alpha_rgba_u16( +pub(crate) fn premultiply_alpha_rgba_u16( dst: &mut [u16], src: &[u16], width: usize, @@ -191,7 +191,7 @@ pub fn premultiply_alpha_rgba_u16( _dispatcher(dst, src, width, height, bit_depth, pool); } -pub fn unpremultiply_alpha_rgba_u16( +pub(crate) fn unpremultiply_alpha_rgba_u16( in_place: &mut [u16], width: usize, height: usize, diff --git a/src/alpha_handle_u8.rs b/src/alpha_handle_u8.rs index 162754a..a357e51 100644 --- a/src/alpha_handle_u8.rs +++ b/src/alpha_handle_u8.rs @@ -40,7 +40,7 @@ use rayon::slice::ParallelSliceMut; use rayon::ThreadPool; #[inline] -pub fn div_by_255(v: u16) -> u8 { +pub(crate) fn div_by_255(v: u16) -> u8 { ((((v + 0x80) >> 8) + v + 0x80) >> 8).min(255) as u8 } @@ -112,7 +112,7 @@ fn unpremultiply_alpha_rgba_impl( } } -pub fn premultiply_alpha_rgba( +pub(crate) fn premultiply_alpha_rgba( dst: &mut [u8], src: &[u8], width: usize, @@ -144,7 +144,7 @@ pub fn premultiply_alpha_rgba( _dispatcher(dst, src, width, height, pool); } -pub fn unpremultiply_alpha_rgba( +pub(crate) fn unpremultiply_alpha_rgba( in_place: &mut [u8], width: usize, height: usize, diff --git a/src/ar30.rs b/src/ar30.rs new file mode 100644 index 0000000..cc4fcc6 --- /dev/null +++ b/src/ar30.rs @@ -0,0 +1,115 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub(crate) enum Rgb30 { + Ar30 = 0, + Ra30 = 1, +} + +impl From<usize> for Rgb30 { + fn from(value: usize) -> Self { + match value { + 0 => Rgb30::Ar30, + 1 => Rgb30::Ra30, + _ => { + unimplemented!("Rgb30 is not implemented for value {}", value) + } + } + } +} + +/// Converts a value from host byte order to network byte order. +#[inline] +const fn htonl(hostlong: u32) -> u32 { + hostlong.to_be() +} + +/// Converts a value from network byte order to host byte order. +#[inline] +const fn ntohl(netlong: u32) -> u32 { + u32::from_be(netlong) +} + +impl Rgb30 { + #[inline] + pub(crate) const fn pack_w_a<const STORE: usize>(self, r: i32, g: i32, b: i32, a: i32) -> u32 { + let value: u32 = match self { + Rgb30::Ar30 => (a << 30 | (b << 20) | (g << 10) | r) as u32, + Rgb30::Ra30 => ((r << 22) | (g << 12) | (b << 2) | a) as u32, + }; + if STORE == 0 { + value + } else { + htonl(value) + } + } + + #[inline(always)] + pub(crate) const fn unpack<const STORE: usize>(self, value: u32) -> (u32, u32, u32, u32) { + let pixel = if STORE == 0 { value } else { ntohl(value) }; + match self { + Rgb30::Ar30 => { + let r10 = pixel & 0x3ff; + let g10 = (pixel >> 10) & 0x3ff; + let b10 = (pixel >> 20) & 0x3ff; + let a10 = pixel >> 30; + (r10, g10, b10, a10) + } + Rgb30::Ra30 => { + let a2 = pixel & 0x3; + let r10 = (pixel >> 22) & 0x3ff; + let g10 = (pixel >> 12) & 0x3ff; + let b10 = (pixel >> 2) & 0x3ff; + (r10, g10, b10, a2) + } + } + } +} + +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] +/// Defines storage byte order for RGBA1010102 or RGBA2101010 +/// +/// Some systems require the bytes to be in network byte order instead of host byte order. +pub enum Ar30ByteOrder { + Host = 0, + Network = 1, +} + +impl From<usize> for Ar30ByteOrder { + fn from(value: usize) -> Self { + match value { + 0 => Ar30ByteOrder::Host, + 1 => Ar30ByteOrder::Network, + _ => { + unimplemented!("Ar30ByteOrder is not implemented for value {}", value) + } + } + } +} diff --git a/src/avx2/alpha_f16.rs b/src/avx2/alpha_f16.rs index 6d21ef8..ac2bf6e 100644 --- a/src/avx2/alpha_f16.rs +++ b/src/avx2/alpha_f16.rs @@ -37,7 +37,7 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -pub fn avx_premultiply_alpha_rgba_f16( +pub(crate) fn avx_premultiply_alpha_rgba_f16( dst: &mut [half::f16], src: &[half::f16], width: usize, @@ -138,7 +138,7 @@ unsafe fn avx_premultiply_alpha_rgba_f16_impl( } } -pub fn avx_unpremultiply_alpha_rgba_f16( +pub(crate) fn avx_unpremultiply_alpha_rgba_f16( in_place: &mut [half::f16], width: usize, height: usize, diff --git a/src/avx2/alpha_f32.rs b/src/avx2/alpha_f32.rs index b7c97b0..d24ca33 100644 --- a/src/avx2/alpha_f32.rs +++ b/src/avx2/alpha_f32.rs @@ -40,13 +40,13 @@ use rayon::slice::ParallelSliceMut; use rayon::ThreadPool; #[inline(always)] -pub unsafe fn avx_unpremultiply_row_f32(x: __m256, a: __m256) -> __m256 { +pub(crate) unsafe fn avx_unpremultiply_row_f32(x: __m256, a: __m256) -> __m256 { let is_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(a, _mm256_setzero_ps()); let rs = _mm256_div_ps(x, a); _mm256_blendv_ps(rs, _mm256_setzero_ps(), is_zero_mask) } -pub fn avx_unpremultiply_alpha_rgba_f32( +pub(crate) fn avx_unpremultiply_alpha_rgba_f32( in_place: &mut [f32], width: usize, height: usize, @@ -111,7 +111,7 @@ unsafe fn avx_unpremultiply_alpha_rgba_f32_impl( } } -pub fn avx_premultiply_alpha_rgba_f32( +pub(crate) fn avx_premultiply_alpha_rgba_f32( dst: &mut [f32], src: &[f32], width: usize, diff --git a/src/avx2/alpha_u16.rs b/src/avx2/alpha_u16.rs index 35f1a24..f190f68 100644
diff --git a/src/avx2/alpha_f16.rs b/src/avx2/alpha_f16.rs
index 6d21ef8..ac2bf6e 100644
--- a/src/avx2/alpha_f16.rs
+++ b/src/avx2/alpha_f16.rs
@@ -37,7 +37,7 @@ use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64::*;
 
-pub fn avx_premultiply_alpha_rgba_f16(
+pub(crate) fn avx_premultiply_alpha_rgba_f16(
     dst: &mut [half::f16],
     src: &[half::f16],
     width: usize,
@@ -138,7 +138,7 @@ unsafe fn avx_premultiply_alpha_rgba_f16_impl(
     }
 }
 
-pub fn avx_unpremultiply_alpha_rgba_f16(
+pub(crate) fn avx_unpremultiply_alpha_rgba_f16(
     in_place: &mut [half::f16],
     width: usize,
     height: usize,
diff --git a/src/avx2/alpha_f32.rs b/src/avx2/alpha_f32.rs
index b7c97b0..d24ca33 100644
--- a/src/avx2/alpha_f32.rs
+++ b/src/avx2/alpha_f32.rs
@@ -40,13 +40,13 @@ use rayon::slice::ParallelSliceMut;
 use rayon::ThreadPool;
 
 #[inline(always)]
-pub unsafe fn avx_unpremultiply_row_f32(x: __m256, a: __m256) -> __m256 {
+pub(crate) unsafe fn avx_unpremultiply_row_f32(x: __m256, a: __m256) -> __m256 {
     let is_zero_mask = _mm256_cmp_ps::<_CMP_EQ_OS>(a, _mm256_setzero_ps());
     let rs = _mm256_div_ps(x, a);
     _mm256_blendv_ps(rs, _mm256_setzero_ps(), is_zero_mask)
 }
 
-pub fn avx_unpremultiply_alpha_rgba_f32(
+pub(crate) fn avx_unpremultiply_alpha_rgba_f32(
     in_place: &mut [f32],
     width: usize,
     height: usize,
@@ -111,7 +111,7 @@ unsafe fn avx_unpremultiply_alpha_rgba_f32_impl(
     }
 }
 
-pub fn avx_premultiply_alpha_rgba_f32(
+pub(crate) fn avx_premultiply_alpha_rgba_f32(
     dst: &mut [f32],
     src: &[f32],
     width: usize,
diff --git a/src/avx2/alpha_u16.rs b/src/avx2/alpha_u16.rs
index 35f1a24..f190f68 100644
--- a/src/avx2/alpha_u16.rs
+++ b/src/avx2/alpha_u16.rs
@@ -51,7 +51,7 @@ unsafe fn _mm256_scale_by_alpha(px: __m256i, low_low_a: __m256, low_high_a: __m2
 }
 
 #[inline(always)]
-pub unsafe fn _mm256_div_by_1023_epi32(v: __m256i) -> __m256i {
+pub(crate) unsafe fn _mm256_div_by_1023_epi32(v: __m256i) -> __m256i {
     const DIVIDING_BY: i32 = 10;
     let addition = _mm256_set1_epi32(1 << (DIVIDING_BY - 1));
     let v = _mm256_add_epi32(v, addition);
@@ -59,7 +59,7 @@ pub unsafe fn _mm256_div_by_1023_epi32(v: __m256i) -> __m256i {
 }
 
 #[inline(always)]
-pub unsafe fn _mm256_div_by_4095_epi32(v: __m256i) -> __m256i {
+pub(crate) unsafe fn _mm256_div_by_4095_epi32(v: __m256i) -> __m256i {
     const DIVIDING_BY: i32 = 12;
     let addition = _mm256_set1_epi32(1 << (DIVIDING_BY - 1));
     let v = _mm256_add_epi32(v, addition);
@@ -67,14 +67,14 @@ pub unsafe fn _mm256_div_by_4095_epi32(v: __m256i) -> __m256i {
 }
 
 #[inline(always)]
-pub unsafe fn _mm256_div_by_65535_epi32(v: __m256i) -> __m256i {
+pub(crate) unsafe fn _mm256_div_by_65535_epi32(v: __m256i) -> __m256i {
     const DIVIDING_BY: i32 = 16;
     let addition = _mm256_set1_epi32(1 << (DIVIDING_BY - 1));
     let v = _mm256_add_epi32(v, addition);
     _mm256_srli_epi32::<DIVIDING_BY>(_mm256_add_epi32(v, _mm256_srli_epi32::<DIVIDING_BY>(v)))
 }
 
-pub fn avx_premultiply_alpha_rgba_u16(
+pub(crate) fn avx_premultiply_alpha_rgba_u16(
     dst: &mut [u16],
     src: &[u16],
     width: usize,
@@ -339,7 +339,7 @@ unsafe fn avx_premultiply_alpha_rgba_u16_impl(
     }
 }
 
-pub fn avx_unpremultiply_alpha_rgba_u16(
+pub(crate) fn avx_unpremultiply_alpha_rgba_u16(
     in_place: &mut [u16],
     width: usize,
     height: usize,
diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs
index 934f7b8..291c4b4 100644
--- a/src/avx2/alpha_u8.rs
+++ b/src/avx2/alpha_u8.rs
@@ -99,7 +99,7 @@ unsafe fn avx2_unpremultiply_row(x: __m256i, a: __m256i) -> __m256i {
     )
 }
 
-pub fn avx_premultiply_alpha_rgba(
+pub(crate) fn avx_premultiply_alpha_rgba(
     dst: &mut [u8],
     src: &[u8],
     width: usize,
@@ -239,7 +239,7 @@ unsafe fn avx_premultiply_alpha_rgba_impl(
     }
 }
 
-pub fn avx_unpremultiply_alpha_rgba(
+pub(crate) fn avx_unpremultiply_alpha_rgba(
     in_place: &mut [u8],
     width: usize,
     height: usize,
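All of the `div_by_*` helpers in these alpha hunks (including the scalar `div_by_255` earlier in this patch) rely on the same identity: division by `2^n - 1` can be done exactly with an add of `2^(n-1)` followed by `(v + (v >> n)) >> n`. A standalone check of the 8-bit case, verifiable by brute force:

```rust
// (((v + 128) >> 8) + v + 128) >> 8 equals round(v / 255) for all v in 0..=255*255,
// which is the full range produced when premultiplying one u8 channel by u8 alpha.
fn div_by_255(v: u16) -> u8 {
    ((((v + 0x80) >> 8) + v + 0x80) >> 8).min(255) as u8
}

fn main() {
    for v in 0u32..=255 * 255 {
        let exact = ((v as f64) / 255.0).round() as u32;
        assert_eq!(div_by_255(v as u16) as u32, exact);
    }
    println!("rounded division by 255 verified");
}
```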
diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs
index 9affca8..8a1806a 100644
--- a/src/avx2/mod.rs
+++ b/src/avx2/mod.rs
@@ -35,7 +35,7 @@ mod alpha_u8;
 #[cfg(feature = "half")]
 mod rgba_f16;
 mod rgba_f32;
-pub mod utils;
+pub(crate) mod utils;
 #[cfg(feature = "half")]
 mod vertical_f16;
 mod vertical_f32;
@@ -43,21 +43,21 @@ mod vertical_u8;
 mod vertical_u8_lp;
 
 #[cfg(feature = "half")]
-pub use alpha_f16::{avx_premultiply_alpha_rgba_f16, avx_unpremultiply_alpha_rgba_f16};
-pub use alpha_f32::avx_premultiply_alpha_rgba_f32;
-pub use alpha_f32::avx_unpremultiply_alpha_rgba_f32;
-pub use alpha_u16::{avx_premultiply_alpha_rgba_u16, avx_unpremultiply_alpha_rgba_u16};
-pub use alpha_u8::avx_premultiply_alpha_rgba;
-pub use alpha_u8::avx_unpremultiply_alpha_rgba;
+pub(crate) use alpha_f16::{avx_premultiply_alpha_rgba_f16, avx_unpremultiply_alpha_rgba_f16};
+pub(crate) use alpha_f32::avx_premultiply_alpha_rgba_f32;
+pub(crate) use alpha_f32::avx_unpremultiply_alpha_rgba_f32;
+pub(crate) use alpha_u16::{avx_premultiply_alpha_rgba_u16, avx_unpremultiply_alpha_rgba_u16};
+pub(crate) use alpha_u8::avx_premultiply_alpha_rgba;
+pub(crate) use alpha_u8::avx_unpremultiply_alpha_rgba;
 #[cfg(feature = "half")]
-pub use rgba_f16::{
+pub(crate) use rgba_f16::{
     convolve_horizontal_rgba_avx_row_one_f16, convolve_horizontal_rgba_avx_rows_4_f16,
 };
-pub use rgba_f32::{
+pub(crate) use rgba_f32::{
     convolve_horizontal_rgba_avx_row_one_f32, convolve_horizontal_rgba_avx_rows_4_f32,
 };
 #[cfg(feature = "half")]
-pub use vertical_f16::convolve_vertical_avx_row_f16;
-pub use vertical_f32::convolve_vertical_avx_row_f32;
-pub use vertical_u8::convolve_vertical_avx_row;
-pub use vertical_u8_lp::convolve_vertical_avx_row_lp;
+pub(crate) use vertical_f16::convolve_vertical_avx_row_f16;
+pub(crate) use vertical_f32::convolve_vertical_avx_row_f32;
+pub(crate) use vertical_u8::convolve_vertical_avx_row;
+pub(crate) use vertical_u8_lp::convolve_vertical_avx_row_lp;
diff --git a/src/avx2/rgba_f16.rs b/src/avx2/rgba_f16.rs
index f54af44..e9d645d 100644
--- a/src/avx2/rgba_f16.rs
+++ b/src/avx2/rgba_f16.rs
@@ -116,7 +116,7 @@ unsafe fn convolve_horizontal_parts_2_rgba_f16(
     acc
 }
 
-pub fn convolve_horizontal_rgba_avx_row_one_f16(
+pub(crate) fn convolve_horizontal_rgba_avx_row_one_f16(
     dst_width: usize,
     src_width: usize,
     filter_weights: &FilterWeights<f32>,
@@ -271,7 +271,7 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_impl(
     }
 }
 
-pub fn convolve_horizontal_rgba_avx_rows_4_f16(
+pub(crate) fn convolve_horizontal_rgba_avx_rows_4_f16(
     dst_width: usize,
     src_width: usize,
     filter_weights: &FilterWeights<f32>,
diff --git a/src/avx2/rgba_f32.rs b/src/avx2/rgba_f32.rs
index f2a2726..f82923a 100644
--- a/src/avx2/rgba_f32.rs
+++ b/src/avx2/rgba_f32.rs
@@ -112,7 +112,7 @@ unsafe fn convolve_horizontal_parts_2_rgba_f32(
     _mm256_fma_ps::<FMA>(store_0, rgb_pixel, weight0)
 }
 
-pub fn convolve_horizontal_rgba_avx_rows_4_f32(
+pub(crate) fn convolve_horizontal_rgba_avx_rows_4_f32(
     dst_width: usize,
     src_width: usize,
     filter_weights: &FilterWeights<f32>,
@@ -396,7 +396,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl(
     }
 }
 
-pub fn convolve_horizontal_rgba_avx_row_one_f32(
+pub(crate) fn convolve_horizontal_rgba_avx_row_one_f32(
     dst_width: usize,
     src_width: usize,
     filter_weights: &FilterWeights<f32>,
diff --git a/src/avx2/utils.rs b/src/avx2/utils.rs
index c9cdf93..cd11c57 100644
--- a/src/avx2/utils.rs
+++ b/src/avx2/utils.rs
@@ -33,7 +33,7 @@ use std::arch::x86::*;
 use std::arch::x86_64::*;
 
 #[inline]
-pub unsafe fn _mm256_fma_ps<const FMA: bool>(a: __m256, b: __m256, c: __m256) -> __m256 {
+pub(crate) unsafe fn _mm256_fma_ps<const FMA: bool>(a: __m256, b: __m256, c: __m256) -> __m256 {
     if FMA {
         _mm256_fma_psx(a, b, c)
     } else {
@@ -47,12 +47,12 @@ unsafe fn _mm256_fma_psx(a: __m256, b: __m256, c: __m256) -> __m256 {
 }
 
 #[inline(always)]
-pub const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 {
+pub(crate) const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 {
     ((z << 6) | (y << 4) | (x << 2) | w) as i32
 }
 
 #[inline(always)]
-pub unsafe fn _mm256_select_si256(
+pub(crate) unsafe fn _mm256_select_si256(
     mask: __m256i,
     true_vals: __m256i,
     false_vals: __m256i,
@@ -64,12 +64,16 @@ pub unsafe fn _mm256_select_si256(
 }
 
 #[inline(always)]
-pub unsafe fn _mm256_selecti_ps(mask: __m256i, true_vals: __m256, false_vals: __m256) -> __m256 {
+pub(crate) unsafe fn _mm256_selecti_ps(
+    mask: __m256i,
+    true_vals: __m256,
+    false_vals: __m256,
+) -> __m256 {
     _mm256_blendv_ps(false_vals, true_vals, _mm256_castsi256_ps(mask))
 }
 
 #[inline(always)]
-pub unsafe fn avx2_div_by255(v: __m256i) -> __m256i {
+pub(crate) unsafe fn avx2_div_by255(v: __m256i) -> __m256i {
     let addition = _mm256_set1_epi16(127);
     _mm256_srli_epi16::<8>(_mm256_add_epi16(
         _mm256_add_epi16(v, addition),
@@ -78,7 +82,7 @@ pub unsafe fn avx2_div_by255(v: __m256i) -> __m256i {
 }
 
 #[inline(always)]
-pub unsafe fn avx2_deinterleave_rgba(
+pub(crate) unsafe fn avx2_deinterleave_rgba(
     rgba0: __m256i,
     rgba1: __m256i,
     rgba2: __m256i,
@@ -118,7 +122,7 @@ pub unsafe fn avx2_deinterleave_rgba(
 }
 
 #[inline(always)]
-pub unsafe fn avx_deinterleave_rgba_epi32(
+pub(crate) unsafe fn avx_deinterleave_rgba_epi32(
     p0: __m256i,
     p1: __m256i,
     p2: __m256i,
@@ -142,7 +146,7 @@ pub unsafe fn avx_deinterleave_rgba_epi32(
 }
 
 #[inline(always)]
-pub unsafe fn avx_interleave_rgba_epi32(
+pub(crate) unsafe fn avx_interleave_rgba_epi32(
     p0: __m256i,
     p1: __m256i,
     p2: __m256i,
@@ -167,7 +171,7 @@ pub unsafe fn avx_interleave_rgba_epi32(
 }
 
 #[inline(always)]
-pub unsafe fn avx_interleave_rgba_epi16(
+pub(crate) unsafe fn avx_interleave_rgba_epi16(
     a: __m256i,
     b: __m256i,
     c: __m256i,
@@ -191,7 +195,7 @@ pub unsafe fn avx_interleave_rgba_epi16(
 }
 
 #[inline(always)]
-pub unsafe fn avx_deinterleave_rgba_epi16(
+pub(crate) unsafe fn avx_deinterleave_rgba_epi16(
     a: __m256i,
     b: __m256i,
     c: __m256i,
@@ -224,7 +228,7 @@ pub unsafe fn avx_deinterleave_rgba_epi16(
 }
 
 #[inline(always)]
-pub unsafe fn avx_deinterleave_rgba_ps(
+pub(crate) unsafe fn avx_deinterleave_rgba_ps(
     p0: __m256,
     p1: __m256,
     p2: __m256,
@@ -245,7 +249,7 @@ pub unsafe fn avx_deinterleave_rgba_ps(
 }
 
 #[inline(always)]
-pub unsafe fn avx_interleave_rgba_ps(
+pub(crate) unsafe fn avx_interleave_rgba_ps(
     p0: __m256,
     p1: __m256,
     p2: __m256,
@@ -266,7 +270,7 @@ pub unsafe fn avx_interleave_rgba_ps(
 }
 
 #[inline(always)]
-pub unsafe fn avx2_interleave_rgba(
+pub(crate) unsafe fn avx2_interleave_rgba(
     r: __m256i,
     g: __m256i,
     b: __m256i,
@@ -290,7 +294,7 @@ pub unsafe fn avx2_interleave_rgba(
 }
 
 #[inline(always)]
-pub unsafe fn avx2_pack_u16(s_1: __m256i, s_2: __m256i) -> __m256i {
+pub(crate) unsafe fn avx2_pack_u16(s_1: __m256i, s_2: __m256i) -> __m256i {
     let packed = _mm256_packus_epi16(s_1, s_2);
     const MASK: i32 = shuffle(3, 1, 2, 0);
     _mm256_permute4x64_epi64::<MASK>(packed)
@@ -298,7 +302,12 @@ pub unsafe fn avx2_pack_u16(s_1: __m256i, s_2: __m256i) -> __m256i {
 
 #[inline]
 #[target_feature(enable = "avx2")]
-pub unsafe fn _mm256_packus_four_epi32(a: __m256i, b: __m256i, c: __m256i, d: __m256i) -> __m256i {
+pub(crate) unsafe fn _mm256_packus_four_epi32(
+    a: __m256i,
+    b: __m256i,
+    c: __m256i,
+    d: __m256i,
+) -> __m256i {
     let ab = _mm256_packs_epi32(a, b);
     let cd = _mm256_packs_epi32(c, d);
 
@@ -309,7 +318,7 @@ pub unsafe fn _mm256_packus_four_epi32(a: __m256i, b: __m256i, c: __m256i, d: __
 }
 
 #[inline(always)]
-pub unsafe fn avx2_pack_u32(s_1: __m256i, s_2: __m256i) -> __m256i {
+pub(crate) unsafe fn avx2_pack_u32(s_1: __m256i, s_2: __m256i) -> __m256i {
     let packed = _mm256_packus_epi32(s_1, s_2);
     const MASK: i32 = shuffle(3, 1, 2, 0);
     _mm256_permute4x64_epi64::<MASK>(packed)
@@ -317,13 +326,13 @@ pub unsafe fn avx2_pack_u32(s_1: __m256i, s_2: __m256i) -> __m256i {
 
 #[inline(always)]
 #[allow(dead_code)]
-pub unsafe fn avx_combine_ps(lo: __m128, hi: __m128) -> __m256 {
+pub(crate) unsafe fn avx_combine_ps(lo: __m128, hi: __m128) -> __m256 {
     _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(lo), hi)
 }
 
 #[inline(always)]
 #[allow(dead_code)]
-pub unsafe fn avx_combine_epi(lo: __m128i, hi: __m128i) -> __m256i {
+pub(crate) unsafe fn avx_combine_epi(lo: __m128i, hi: __m128i) -> __m256i {
     _mm256_castps_si256(_mm256_insertf128_ps::<1>(
         _mm256_castps128_ps256(_mm_castsi128_ps(lo)),
         _mm_castsi128_ps(hi),
@@ -332,7 +341,7 @@ pub unsafe fn avx_combine_epi(lo: __m128i, hi: __m128i) -> __m256i {
 
 #[inline]
 /// Arithmetic shift for i64, shifting with sign bits
-pub unsafe fn _mm256_srai_epi64x<const BITS: i32>(a: __m256i) -> __m256i {
+pub(crate) unsafe fn _mm256_srai_epi64x<const BITS: i32>(a: __m256i) -> __m256i {
     let m = _mm256_set1_epi64x(1 << (64 - 1));
     let x = _mm256_srli_epi64::<BITS>(a);
     _mm256_sub_epi64(_mm256_xor_si256(x, m), m)
@@ -340,7 +349,7 @@ pub unsafe fn _mm256_srai_epi64x(a: __m256i) -> __m256i {
 
 #[inline]
 /// Pack 64bytes integers into 32 bytes using truncation
-pub unsafe fn _mm256_packts_epi64(a: __m256i, b: __m256i) -> __m256i {
+pub(crate) unsafe fn _mm256_packts_epi64(a: __m256i, b: __m256i) -> __m256i {
     const SHUFFLE_1: i32 = shuffle(2, 0, 2, 0);
     let combined = _mm256_shuffle_ps::<SHUFFLE_1>(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b));
     const SHUFFLE_2: i32 = shuffle(3, 1, 2, 0);
@@ -351,7 +360,7 @@ pub unsafe fn _mm256_packts_epi64(a: __m256i, b: __m256i) -> __m256i {
 
 #[inline]
 #[allow(dead_code)]
 /// Pack 64bytes integers into 32 bytes
-pub unsafe fn _mm256_cvtepi64_epi32x(v: __m256i) -> __m128i {
+pub(crate) unsafe fn _mm256_cvtepi64_epi32x(v: __m256i) -> __m128i {
     let vf = _mm256_castsi256_ps(v);
     let hi = _mm256_extractf128_ps::<1>(vf);
     let lo = _mm256_castps256_ps128(vf);
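The restored `<const FMA: bool>` parameter on `_mm256_fma_ps` is resolved at compile time, so each monomorphized kernel carries no runtime branch. A minimal scalar analogue of that pattern (illustrative, not the crate's code):

```rust
// Compile-time selection of a fused-multiply-add path via a const generic.
fn mul_add<const FMA: bool>(a: f32, b: f32, c: f32) -> f32 {
    if FMA {
        a.mul_add(b, c) // one rounding step; maps to an fma instruction where available
    } else {
        a * b + c // two rounding steps, but needs no fma support
    }
}

fn main() {
    let fused = mul_add::<true>(0.1, 0.2, 0.3);
    let split = mul_add::<false>(0.1, 0.2, 0.3);
    println!("fma: {fused}, mul+add: {split}");
}
```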
diff --git a/src/avx2/vertical_f16.rs b/src/avx2/vertical_f16.rs
index fc2b2e4..63b2871 100644
--- a/src/avx2/vertical_f16.rs
+++ b/src/avx2/vertical_f16.rs
@@ -201,7 +201,7 @@ unsafe fn convolve_vertical_part_avx_16_f16(
     _mm256_storeu_si256(dst_ptr as *mut __m256i, acc0);
 }
 
-pub fn convolve_vertical_avx_row_f16(
+pub(crate) fn convolve_vertical_avx_row_f16(
     width: usize,
     bounds: &FilterBounds,
     unsafe_source_ptr_0: *const half::f16,
@@ -273,7 +273,7 @@ unsafe fn convolve_vertical_avx_row_f16_fma(
 }
 
 #[inline(always)]
-pub fn convolve_vertical_avx_row_f16_impl(
+pub(crate) fn convolve_vertical_avx_row_f16_impl(
     width: usize,
     bounds: &FilterBounds,
     unsafe_source_ptr_0: *const half::f16,
diff --git a/src/avx2/vertical_f32.rs b/src/avx2/vertical_f32.rs
index b480b36..ddd673a 100644
--- a/src/avx2/vertical_f32.rs
+++ b/src/avx2/vertical_f32.rs
@@ -171,7 +171,7 @@ pub(crate) unsafe fn convolve_vertical_part_avx_f32(
 }
 
 #[inline]
-pub fn convolve_vertical_avx_row_f32(
+pub(crate) fn convolve_vertical_avx_row_f32(
     width: usize,
     bounds: &FilterBounds,
     unsafe_source_ptr_0: *const f32,
diff --git a/src/avx2/vertical_u8.rs b/src/avx2/vertical_u8.rs
index 03feaad..b8e3ee8 100644
--- a/src/avx2/vertical_u8.rs
+++ b/src/avx2/vertical_u8.rs
@@ -520,7 +520,7 @@ unsafe fn convolve_vertical_part_avx(
     *dst_ptr = _mm256_extract_epi8::<0>(item) as u8;
 }
 
-pub fn convolve_vertical_avx_row(
+pub(crate) fn convolve_vertical_avx_row(
     dst_width: usize,
     bounds: &FilterBounds,
     src: &[u8],
diff --git a/src/avx2/vertical_u8_lp.rs b/src/avx2/vertical_u8_lp.rs
index 7ad792d..51b7134 100644
--- a/src/avx2/vertical_u8_lp.rs
+++ b/src/avx2/vertical_u8_lp.rs
@@ -33,7 +33,7 @@ use std::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use std::arch::x86_64::*;
 
-pub fn convolve_vertical_avx_row_lp(
+pub(crate) fn convolve_vertical_avx_row_lp(
     dst_width: usize,
     bounds: &FilterBounds,
     src: &[u8],
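The `convolve_vertical_avx_row*` functions all implement one column pass of the separable resampler: each output row is a weighted sum of `bounds.size` source rows. Stripped of SIMD, the loop structure is roughly the following sketch (buffer layout and the Q12 precision are assumptions for illustration):

```rust
// Scalar shape of one output row of the vertical pass, in Q12 fixed point.
const PRECISION: i32 = 12;

fn convolve_vertical_row(
    src: &[u8],          // planar source, row-major
    stride: usize,       // pixels per row
    bounds_start: usize, // first contributing source row
    weights: &[i16],     // one Q12 weight per contributing row
    dst: &mut [u8],      // one output row
) {
    for (x, out) in dst.iter_mut().enumerate() {
        let mut acc: i32 = 1 << (PRECISION - 1); // rounding bias
        for (k, &w) in weights.iter().enumerate() {
            acc += src[(bounds_start + k) * stride + x] as i32 * w as i32;
        }
        *out = (acc >> PRECISION).clamp(0, 255) as u8;
    }
}
```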
diff --git a/src/color_group.rs b/src/color_group.rs
index abb7cd8..942deb5 100644
--- a/src/color_group.rs
+++ b/src/color_group.rs
@@ -26,10 +26,9 @@
  * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
-#![allow(dead_code)]
 use crate::mlaf::mlaf;
 use crate::saturate_narrow::SaturateNarrow;
-use num_traits::{AsPrimitive, FromPrimitive, MulAdd, Num};
+use num_traits::{AsPrimitive, FromPrimitive, MulAdd};
 use std::ops::{Add, AddAssign, Mul, Shr, ShrAssign, Sub, SubAssign};
 
 #[repr(C)]
@@ -46,7 +45,7 @@ where
     J: Copy + Default,
 {
     #[inline(always)]
-    pub fn new() -> ColorGroup<COMPS, J> {
+    pub(crate) fn new() -> ColorGroup<COMPS, J> {
         ColorGroup {
             r: J::default(),
             g: J::default(),
@@ -56,12 +55,12 @@ where
     }
 
     #[inline(always)]
-    pub fn from_components(r: J, g: J, b: J, a: J) -> ColorGroup<COMPS, J> {
+    pub(crate) fn from_components(r: J, g: J, b: J, a: J) -> ColorGroup<COMPS, J> {
         ColorGroup { r, g, b, a }
     }
 
     #[inline(always)]
-    pub fn dup(v: J) -> ColorGroup<COMPS, J> {
+    pub(crate) fn dup(v: J) -> ColorGroup<COMPS, J> {
         ColorGroup {
             r: v,
             g: v,
@@ -76,47 +75,7 @@ where
     J: Copy + Default + 'static,
 {
     #[inline(always)]
-    pub fn from_slice<T>(store: &[T], offset: usize) -> ColorGroup<COMPS, J>
-    where
-        T: AsPrimitive<J>,
-    {
-        unsafe {
-            if COMPS == 1 {
-                ColorGroup {
-                    r: (*store.get_unchecked(offset)).as_(),
-                    g: J::default(),
-                    b: J::default(),
-                    a: J::default(),
-                }
-            } else if COMPS == 2 {
-                ColorGroup {
-                    r: (*store.get_unchecked(offset)).as_(),
-                    g: (*store.get_unchecked(offset + 1)).as_(),
-                    b: J::default(),
-                    a: J::default(),
-                }
-            } else if COMPS == 3 {
-                ColorGroup {
-                    r: (*store.get_unchecked(offset)).as_(),
-                    g: (*store.get_unchecked(offset + 1)).as_(),
-                    b: (*store.get_unchecked(offset + 2)).as_(),
-                    a: J::default(),
-                }
-            } else if COMPS == 4 {
-                ColorGroup {
-                    r: (*store.get_unchecked(offset)).as_(),
-                    g: (*store.get_unchecked(offset + 1)).as_(),
-                    b: (*store.get_unchecked(offset + 2)).as_(),
-                    a: (*store.get_unchecked(offset + 3)).as_(),
-                }
-            } else {
-                panic!("Not implemented.")
-            }
-        }
-    }
-
-    #[inline(always)]
-    pub fn from_ptr<T>(store: *const T, offset: usize) -> ColorGroup<COMPS, J>
+    pub(crate) fn from_ptr<T>(store: *const T, offset: usize) -> ColorGroup<COMPS, J>
     where
         T: AsPrimitive<J>,
     {
@@ -151,30 +110,13 @@ where
                 a: l_ptr.add(3).read_unaligned().as_(),
             }
         } else {
-            panic!("Not implemented.")
+            unimplemented!("Not implemented.")
         }
     }
 
     #[inline(always)]
-    pub fn to_ptr(self, ptr: *mut J, offset: usize) {
-        unsafe {
-            let s_ptr = ptr.add(offset);
-            s_ptr.write_unaligned(self.r);
-            if COMPS > 1 {
-                s_ptr.add(1).write_unaligned(self.g);
-            }
-            if COMPS > 2 {
-                s_ptr.add(2).write_unaligned(self.b);
-            }
-            if COMPS == 4 {
-                s_ptr.add(3).write_unaligned(self.a);
-            }
-        }
-    }
-
-    #[inline(always)]
-    pub fn as_ptr<V>(self, ptr: *mut V, offset: usize)
+    pub(crate) fn as_ptr<V>(self, ptr: *mut V, offset: usize)
     where
         J: Copy + AsPrimitive<V>,
     {
@@ -194,67 +136,6 @@ where
     }
 }
 
-impl<const COMPS: usize, J> ColorGroup<COMPS, J>
-where
-    J: Copy + Default + 'static + Num + Ord,
-{
-    #[inline(always)]
-    pub fn min_scalar(&self, other: J) -> ColorGroup<COMPS, J> {
-        if COMPS == 1 {
-            ColorGroup::from_components(self.r.min(other), J::default(), J::default(), J::default())
-        } else if COMPS == 2 {
-            ColorGroup::from_components(
-                self.r.min(other),
-                self.g.min(other),
-                J::default(),
-                J::default(),
-            )
-        } else if COMPS == 3 {
-            ColorGroup::from_components(
-                self.r.min(other),
-                self.g.min(other),
-                self.b.min(other),
-                J::default(),
-            )
-        } else {
-            ColorGroup::from_components(
-                self.r.min(other),
-                self.g.min(other),
-                self.b.min(other),
-                self.a.min(other),
-            )
-        }
-    }
-
-    #[inline(always)]
-    pub(crate) fn max_scalar(&self, other: J) -> ColorGroup<COMPS, J> {
-        if COMPS == 1 {
-            ColorGroup::from_components(self.r.max(other), J::default(), J::default(), J::default())
-        } else if COMPS == 2 {
-            ColorGroup::from_components(
-                self.r.max(other),
-                self.g.max(other),
-                J::default(),
-                J::default(),
-            )
-        } else if COMPS == 3 {
-            ColorGroup::from_components(
-                self.r.max(other),
-                self.g.max(other),
-                self.b.max(other),
-                J::default(),
-            )
-        } else {
-            ColorGroup::from_components(
-                self.r.max(other),
-                self.g.max(other),
-                self.b.max(other),
-                self.a.max(other),
-            )
-        }
-    }
-}
-
 impl<const COMPS: usize, J> Mul<J> for ColorGroup<COMPS, J>
 where
     J: Copy + Mul<Output = J> + Default + 'static,
@@ -272,7 +153,7 @@ where
         } else if COMPS == 4 {
             ColorGroup::from_components(self.r * rhs, self.g * rhs, self.b * rhs, self.a * rhs)
         } else {
-            panic!("Not implemented.");
+            unimplemented!("Not implemented.");
         }
     }
 }
@@ -282,7 +163,7 @@ where
     J: Copy + Default + 'static,
 {
     #[inline(always)]
-    pub fn saturate_narrow<V>(&self, bit_depth: u32) -> ColorGroup<COMPS, V>
+    pub(crate) fn saturate_narrow<V>(&self, bit_depth: u32) -> ColorGroup<COMPS, V>
     where
         V: Copy + Default,
         J: SaturateNarrow<V>,
@@ -341,11 +222,30 @@ where
                 self.a * rhs.b,
             )
         } else {
-            panic!("Not implemented.");
+            unimplemented!("Not implemented.");
         }
     }
 }
 
+impl ColorGroup<4, i32> {
+    #[inline(always)]
+    #[allow(clippy::manual_clamp)]
+    pub(crate) fn saturate_ar30(&self) -> ColorGroup<4, i32> {
+        ColorGroup::from_components(
+            (self.r >> PRECISION).min(1023).max(0),
+            (self.g >> PRECISION).min(1023).max(0),
+            (self.b >> PRECISION).min(1023).max(0),
+            (self.a >> PRECISION).min(3).max(0),
+        )
+    }
+
+    #[inline(always)]
+    pub(crate) fn to_ar30<const AR30_TYPE: usize, const AR30_ORDER: usize>(self) -> u32 {
+        let ar30_type: Rgb30 = AR30_TYPE.into();
+        ar30_type.pack_w_a::<AR30_ORDER>(self.r, self.g, self.b, self.a)
+    }
+}
+
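`saturate_ar30` and `to_ar30` close the fixed-point pipeline: shift the accumulator right by `PRECISION`, clamp to the 10-bit/2-bit ranges, then pack. The concrete values of `PRECISION` and `ROUNDING_CONST` live in `src/support.rs`, which is not part of this diff, so the constants below are assumed for illustration:

```rust
// Illustrative narrowing of a fixed-point accumulator to AR30 channel ranges.
// PRECISION and ROUNDING_CONST are assumed values, not necessarily the crate's.
const PRECISION: i32 = 15;
const ROUNDING_CONST: i32 = 1 << (PRECISION - 1);

fn saturate_channel(acc: i32, max: i32) -> i32 {
    (acc >> PRECISION).clamp(0, max)
}

fn main() {
    // a weight summing to 1.0 applied to a mid-range 10-bit sample
    let acc = 512 * (1 << PRECISION) + ROUNDING_CONST;
    assert_eq!(saturate_channel(acc, 1023), 512);
    assert_eq!(saturate_channel(i32::MAX / 2, 1023), 1023); // overshoot clamps
}
```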
 impl<const COMPS: usize, J> Sub<J> for ColorGroup<COMPS, J>
 where
     J: Copy + Sub<Output = J> + Default + 'static,
@@ -363,7 +263,7 @@ where
         } else if COMPS == 4 {
             ColorGroup::from_components(self.r - rhs, self.g - rhs, self.b - rhs, self.a - rhs)
         } else {
-            panic!("Not implemented.");
+            unimplemented!("Not implemented.");
         }
     }
 }
@@ -390,7 +290,7 @@ where
                 self.a - rhs.a,
             )
         } else {
-            panic!("Not implemented.");
+            unimplemented!("Not implemented.");
         }
     }
 }
@@ -417,7 +317,7 @@ where
                 self.a + rhs.a,
             )
         } else {
-            panic!("Not implemented.");
+            unimplemented!("Not implemented.");
         }
     }
 }
@@ -439,7 +339,7 @@ where
         } else if COMPS == 4 {
             ColorGroup::from_components(self.r + rhs, self.g + rhs, self.b + rhs, self.a + rhs)
         } else {
-            panic!("Not implemented.");
+            unimplemented!("Not implemented.");
         }
     }
 }
@@ -461,7 +361,7 @@ where
         } else if COMPS == 4 {
             ColorGroup::from_components(self.r >> rhs, self.g >> rhs, self.b >> rhs, self.a >> rhs)
         } else {
-            panic!("Not implemented.");
+            unimplemented!("Not implemented.");
         }
     }
 }
@@ -517,7 +417,7 @@ where
                 mlaf(self.a, a.a, b),
             )
         } else {
-            panic!("Not implemented.");
+            unimplemented!("Not implemented.");
         }
     }
 }
@@ -580,7 +480,52 @@ where
     }
 }
 
-macro_rules! fast_load_color_group {
+macro_rules! load_ar30 {
+    ($store: expr, $ar_type: expr, $ar_order: ty) => {{
+        let ar_type: crate::ar30::Rgb30 = $ar_type.into();
+        let unpacked = ar_type.unpack::<$ar_order>($store[0]);
+        ColorGroup::<4, i32> {
+            r: unpacked.0 as i32,
+            g: unpacked.1 as i32,
+            b: unpacked.2 as i32,
+            a: unpacked.3 as i32,
+        }
+    }};
+}
+
+pub(crate) use load_ar30;
+
+macro_rules! load_ar30_p {
+    ($store: expr, $ar_type: expr, $ar_order: ty) => {{
+        let ar_type: crate::ar30::Rgb30 = $ar_type.into();
+        let unpacked = ar_type.unpack::<$ar_order>(*$store);
+        ColorGroup::<4, i32> {
+            r: unpacked.0 as i32,
+            g: unpacked.1 as i32,
+            b: unpacked.2 as i32,
+            a: unpacked.3 as i32,
+        }
+    }};
+}
+
+pub(crate) use load_ar30_p;
+
+macro_rules!
load_ar30_with_offset { + ($store: expr, $ar_type: expr, $ar_order: ty, $offset: expr) => {{ + let ar_type: crate::ar30::Rgb30 = $ar_type.into(); + let unpacked = ar_type.unpack::<$ar_order>($store[$offset]); + ColorGroup::<4, i32> { + r: unpacked.0 as i32, + g: unpacked.1 as i32, + b: unpacked.2 as i32, + a: unpacked.3 as i32, + } + }}; +} + +pub(crate) use load_ar30_with_offset; + +macro_rules! load_color_group { ($store: expr, $channels: expr, $vtype: ty) => {{ if $channels == 1 { ColorGroup::<$channels, $vtype> { @@ -611,14 +556,14 @@ macro_rules! fast_load_color_group { a: $store.get_unchecked(3).as_(), } } else { - panic!("Not implemented.") + unimplemented!("Not implemented.") } }}; } -pub(crate) use fast_load_color_group; +pub(crate) use load_color_group; -macro_rules! fast_load_color_group_with_offset { +macro_rules! load_color_group_with_offset { ($store: expr, $channels: expr, $offset: expr, $vtype: ty) => {{ if $channels == 1 { ColorGroup::<$channels, $vtype> { @@ -654,7 +599,7 @@ macro_rules! fast_load_color_group_with_offset { }}; } -pub(crate) use fast_load_color_group_with_offset; +pub(crate) use load_color_group_with_offset; macro_rules! fast_store_color_group { ($color_group: expr, $store: expr, $components: expr) => {{ @@ -688,4 +633,6 @@ macro_rules! fast_mixed_store_color_group { }}; } +use crate::ar30::Rgb30; +use crate::support::PRECISION; pub(crate) use fast_mixed_store_color_group; diff --git a/src/convolution.rs b/src/convolution.rs index c71c2a0..944b4ae 100644 --- a/src/convolution.rs +++ b/src/convolution.rs @@ -34,7 +34,7 @@ use std::fmt::Debug; use crate::filter_weights::FilterWeights; use crate::ImageStore; -pub trait HorizontalConvolutionPass +pub(crate) trait HorizontalConvolutionPass where T: FromPrimitive + Clone + Copy + Debug, { @@ -46,7 +46,7 @@ where ); } -pub trait VerticalConvolutionPass +pub(crate) trait VerticalConvolutionPass where T: FromPrimitive + Clone + Copy + Debug, { diff --git a/src/cpu_features.rs b/src/cpu_features.rs index dcd0497..975fd4a 100644 --- a/src/cpu_features.rs +++ b/src/cpu_features.rs @@ -64,8 +64,8 @@ fn apple_has_cpu_feature(_feature_name: &str) -> bool { /// Test aarch64 cpu with *fp16* check, /// on *Apple* platform [libc](https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics) be used -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -pub fn is_aarch_f16_supported() -> bool { +#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "half"))] +pub(crate) fn is_aarch_f16_supported() -> bool { #[cfg(any(target_os = "macos", target_os = "ios"))] { apple_has_cpu_feature("hw.optional.arm.FEAT_FP16") @@ -80,8 +80,8 @@ pub fn is_aarch_f16_supported() -> bool { /// on *Apple* platform [libc](https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics) be used /// otherwise consider it is always available #[allow(clippy::too_long_first_doc_paragraph)] -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -pub fn is_aarch_f16c_supported() -> bool { +#[cfg(all(target_arch = "aarch64", target_feature = "neon", feature = "half"))] +pub(crate) fn is_aarch_f16c_supported() -> bool { #[cfg(any(target_os = "macos", target_os = "ios"))] { apple_has_cpu_feature("hw.optional.AdvSIMD_HPFPCvt") @@ -91,3 +91,18 @@ pub fn is_aarch_f16c_supported() -> bool { true } } + +/// Test aarch64 cpu with *RDM* check +/// +/// on *Apple* platform 
[libc](https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics) will be used
+#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+pub(crate) fn is_aarch_rdm_supported() -> bool {
+    #[cfg(any(target_os = "macos", target_os = "ios"))]
+    {
+        apple_has_cpu_feature("hw.optional.arm.FEAT_RDM")
+    }
+    #[cfg(not(any(target_os = "macos", target_os = "ios")))]
+    {
+        std::arch::is_aarch64_feature_detected!("rdm")
+    }
+}
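`is_aarch_rdm_supported` probes for the rounding-doubling-multiply extension the same way the existing f16 checks do: `sysctlbyname` on Apple platforms, `is_aarch64_feature_detected!` elsewhere. A typical caller-side shape for such a probe (function names here are hypothetical, not this crate's API):

```rust
// Illustrative runtime-dispatch pattern around a CPU feature probe.
fn resize_row_scalar(row: &[u32]) -> u32 {
    row.iter().copied().fold(0, u32::wrapping_add)
}

#[cfg(target_arch = "aarch64")]
fn resize_row_rdm(row: &[u32]) -> u32 {
    // a real kernel would live behind #[target_feature(enable = "rdm")]
    resize_row_scalar(row)
}

fn pick_kernel() -> fn(&[u32]) -> u32 {
    #[cfg(target_arch = "aarch64")]
    if std::arch::is_aarch64_feature_detected!("rdm") {
        return resize_row_rdm;
    }
    resize_row_scalar
}

fn main() {
    let kernel = pick_kernel();
    println!("{}", kernel(&[1, 2, 3]));
}
```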
diff --git a/src/dispatch_group_ar30.rs b/src/dispatch_group_ar30.rs
new file mode 100644
index 0000000..95de42b
--- /dev/null
+++ b/src/dispatch_group_ar30.rs
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) Radzivon Bartoshyk. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+use crate::filter_weights::{FilterBounds, FilterWeights};
+use crate::fixed_point_horizontal_ar30::{
+    convolve_row_handler_fixed_point_4_ar30, convolve_row_handler_fixed_point_ar30,
+};
+use crate::fixed_point_vertical_ar30::column_handler_fixed_point_ar30;
+#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+use crate::neon::{
+    neon_column_handler_fixed_point_ar30, neon_convolve_horizontal_rgba_rows_4_ar30,
+};
+use crate::support::PRECISION;
+use rayon::iter::{IndexedParallelIterator, ParallelIterator};
+use rayon::prelude::{ParallelSlice, ParallelSliceMut};
+use rayon::ThreadPool;
+
+#[allow(clippy::type_complexity)]
+pub(crate) fn convolve_horizontal_dispatch_ar30<const AR30_TYPE: usize, const AR30_ORDER: usize>(
+    src: &[u32],
+    src_stride: usize,
+    filter_weights: FilterWeights<f32>,
+    dst: &mut [u32],
+    dst_stride: usize,
+    pool: &Option<ThreadPool>,
+) {
+    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+    let is_rdm_available = std::arch::is_aarch64_feature_detected!("rdm");
+    if let Some(pool) = pool {
+        pool.install(|| {
+            let approx = filter_weights.numerical_approximation_i16::<PRECISION>(0);
+            dst.par_chunks_exact_mut(dst_stride * 4)
+                .zip(src.par_chunks_exact(src_stride * 4))
+                .for_each(|(dst, src)| {
+                    let mut _dispatch: fn(&[u32], usize, &mut [u32], usize, &FilterWeights<i16>) =
+                        convolve_row_handler_fixed_point_4_ar30::<AR30_TYPE, AR30_ORDER>;
+                    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+                    if is_rdm_available {
+                        _dispatch =
+                            neon_convolve_horizontal_rgba_rows_4_ar30::<AR30_TYPE, AR30_ORDER>;
+                    }
+                    _dispatch(src, src_stride, dst, dst_stride, &approx);
+                });
+
+            let remainder = dst.chunks_exact_mut(dst_stride * 4).into_remainder();
+            let src_remainder = src.chunks_exact(src_stride * 4).remainder();
+
+            remainder
+                .par_chunks_exact_mut(dst_stride)
+                .zip(src_remainder.par_chunks_exact(src_stride))
+                .for_each(|(dst, src)| {
+                    convolve_row_handler_fixed_point_ar30::<AR30_TYPE, AR30_ORDER>(
+                        src, dst, &approx,
+                    );
+                });
+        });
+    } else {
+        let approx = filter_weights.numerical_approximation_i16::<PRECISION>(0);
+        dst.chunks_exact_mut(dst_stride * 4)
+            .zip(src.chunks_exact(src_stride * 4))
+            .for_each(|(dst, src)| {
+                let mut _dispatch: fn(&[u32], usize, &mut [u32], usize, &FilterWeights<i16>) =
+                    convolve_row_handler_fixed_point_4_ar30::<AR30_TYPE, AR30_ORDER>;
+                #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+                if is_rdm_available {
+                    _dispatch = neon_convolve_horizontal_rgba_rows_4_ar30::<AR30_TYPE, AR30_ORDER>;
+                }
+                _dispatch(src, src_stride, dst, dst_stride, &approx);
+            });
+
+        let remainder = dst.chunks_exact_mut(dst_stride * 4).into_remainder();
+        let src_remainder = src.chunks_exact(src_stride * 4).remainder();
+
+        remainder
+            .chunks_exact_mut(dst_stride)
+            .zip(src_remainder.chunks_exact(src_stride))
+            .for_each(|(dst, src)| {
+                convolve_row_handler_fixed_point_ar30::<AR30_TYPE, AR30_ORDER>(src, dst, &approx);
+            });
+    }
+}
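`convolve_horizontal_dispatch_ar30` walks the image in groups of four rows, pushes the tail rows through the single-row handler, and only swaps the chunk iterators for rayon's parallel versions when a pool is supplied. The same split, reduced to a runnable skeleton with stub kernels:

```rust
use rayon::prelude::*;

// Skeleton of the 4-rows-at-a-time split used by the dispatcher above.
fn dispatch_rows(dst: &mut [u32], dst_stride: usize) {
    dst.par_chunks_exact_mut(dst_stride * 4)
        .for_each(|quad| quad.fill(0)); // stand-in for the 4-row kernel

    dst.chunks_exact_mut(dst_stride * 4)
        .into_remainder()
        .chunks_exact_mut(dst_stride)
        .for_each(|row| row.fill(0)); // stand-in for the 1-row kernel
}

fn main() {
    let mut dst = vec![1u32; 6 * 8]; // 6 rows, stride 8: one quad plus 2 tail rows
    dispatch_rows(&mut dst, 8);
    assert!(dst.iter().all(|&v| v == 0));
}
```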
+
+pub(crate) fn convolve_vertical_dispatch_ar30<const AR30_TYPE: usize, const AR30_ORDER: usize>(
+    src: &[u32],
+    src_stride: usize,
+    filter_weights: FilterWeights<f32>,
+    dst: &mut [u32],
+    dst_stride: usize,
+    pool: &Option<ThreadPool>,
+) {
+    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+    let is_rdm_available = std::arch::is_aarch64_feature_detected!("rdm");
+    if let Some(pool) = pool {
+        pool.install(|| {
+            let approx = filter_weights.numerical_approximation_i16::<PRECISION>(0);
+            dst.par_chunks_exact_mut(dst_stride)
+                .enumerate()
+                .for_each(|(y, row)| {
+                    let bounds = approx.bounds[y];
+                    let filter_offset = y * approx.aligned_size;
+                    let weights = &approx.weights[filter_offset..];
+                    let mut _dispatch: fn(&FilterBounds, &[u32], &mut [u32], usize, &[i16]) =
+                        column_handler_fixed_point_ar30::<AR30_TYPE, AR30_ORDER>;
+                    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+                    if is_rdm_available {
+                        _dispatch = neon_column_handler_fixed_point_ar30::<AR30_TYPE, AR30_ORDER>;
+                    }
+
+                    _dispatch(&bounds, src, row, src_stride, weights);
+                });
+        });
+    } else {
+        let approx = filter_weights.numerical_approximation_i16::<PRECISION>(0);
+        dst.chunks_exact_mut(dst_stride)
+            .enumerate()
+            .for_each(|(y, row)| {
+                let bounds = approx.bounds[y];
+                let filter_offset = y * approx.aligned_size;
+                let weights = &approx.weights[filter_offset..];
+
+                let mut _dispatch: fn(&FilterBounds, &[u32], &mut [u32], usize, &[i16]) =
+                    column_handler_fixed_point_ar30::<AR30_TYPE, AR30_ORDER>;
+                #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
+                if is_rdm_available {
+                    _dispatch = neon_column_handler_fixed_point_ar30::<AR30_TYPE, AR30_ORDER>;
+                }
+
+                _dispatch(&bounds, src, row, src_stride, weights);
+            });
+    }
+}
diff --git a/src/filter_weights.rs b/src/filter_weights.rs
index b3491ae..6e2823d 100644
--- a/src/filter_weights.rs
+++ b/src/filter_weights.rs
@@ -28,7 +28,7 @@
  */
 
 #[derive(Debug, Clone)]
-pub struct FilterWeights<T> {
+pub(crate) struct FilterWeights<T> {
     pub weights: Vec<T>,
     pub bounds: Vec<FilterBounds>,
     pub kernel_size: usize,
@@ -38,19 +38,19 @@ pub struct FilterWeights<T> {
 }
 
 #[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)]
-pub struct FilterBounds {
+pub(crate) struct FilterBounds {
     pub start: usize,
     pub size: usize,
 }
 
 impl FilterBounds {
-    pub fn new(start: usize, size: usize) -> FilterBounds {
+    pub(crate) fn new(start: usize, size: usize) -> FilterBounds {
         FilterBounds { start, size }
     }
 }
 
 impl<T> FilterWeights<T> {
-    pub fn new(
+    pub(crate) fn new(
         slice_ref: Vec<T>,
         kernel_size: usize,
         aligned_size: usize,
@@ -70,7 +70,7 @@ impl<T> FilterWeights<T> {
 }
 
 impl FilterWeights<f32> {
-    pub fn numerical_approximation_i16<const PRECISION: i32>(
+    pub(crate) fn numerical_approximation_i16<const PRECISION: i32>(
         &self,
         alignment: usize,
     ) -> FilterWeights<i16> {
diff --git a/src/fixed_point_horizontal.rs b/src/fixed_point_horizontal.rs
index 7baa2d3..06103e5 100644
--- a/src/fixed_point_horizontal.rs
+++ b/src/fixed_point_horizontal.rs
@@ -27,7 +27,7 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ use crate::color_group::{ - fast_load_color_group, fast_load_color_group_with_offset, fast_store_color_group, ColorGroup, + fast_store_color_group, load_color_group, load_color_group_with_offset, ColorGroup, }; use crate::filter_weights::FilterWeights; use crate::saturate_narrow::SaturateNarrow; @@ -78,18 +78,17 @@ pub(crate) fn convolve_row_handler_fixed_point< let sliced_weights = &weights[0..2]; let weight0 = sliced_weights[0].as_(); let weight1 = sliced_weights[1].as_(); - sums += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1; + sums += load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1; } else if bounds_size == 3 { let src_ptr0 = &src[px..(px + 3 * CHANNELS)]; let sliced_weights = &weights[0..3]; let weight0 = sliced_weights[0].as_(); let weight1 = sliced_weights[1].as_(); let weight2 = sliced_weights[2].as_(); - sums += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight2; + sums += load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2; } else if bounds_size == 4 { let src_ptr0 = &src[px..(px + 4 * CHANNELS)]; let sliced_weights = &weights[0..4]; @@ -97,12 +96,10 @@ pub(crate) fn convolve_row_handler_fixed_point< let weight1 = sliced_weights[1].as_(); let weight2 = sliced_weights[2].as_(); let weight3 = sliced_weights[3].as_(); - sums += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) - * weight3; + sums += load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight3; } else if bounds_size == 6 { let src_ptr0 = &src[px..(px + 6 * CHANNELS)]; @@ -113,16 +110,12 @@ pub(crate) fn convolve_row_handler_fixed_point< let weight3 = sliced_weights[3].as_(); let weight4 = sliced_weights[4].as_(); let weight5 = sliced_weights[5].as_(); - sums += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) - * weight3 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J) - * weight4 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J) - * weight5; + sums += load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight3 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J) * weight4 + + 
load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J) * weight5; } else { let src_ptr0 = &src[px..(px + bounds_size * CHANNELS)]; for (&k_weight, src) in weights @@ -131,7 +124,7 @@ pub(crate) fn convolve_row_handler_fixed_point< .take(bounds.size) { let weight: J = k_weight.as_(); - let new_px = fast_load_color_group!(src, CHANNELS, J); + let new_px = load_color_group!(src, CHANNELS, J); sums += new_px * weight; } } @@ -205,14 +198,14 @@ pub(crate) fn convolve_row_handler_fixed_point_4< let sliced_weights = &weights[0..2]; let weight0 = sliced_weights[0].as_(); let weight1 = sliced_weights[1].as_(); - sums0 += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1; - sums1 += fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1; - sums2 += fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1; - sums3 += fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1; + sums0 += load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1; + sums1 += load_color_group!(src_ptr1, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1; + sums2 += load_color_group!(src_ptr2, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1; + sums3 += load_color_group!(src_ptr3, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1; } else if bounds_size == 3 { let src_ptr0 = &src[px..(px + 3 * CHANNELS)]; let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 3 * CHANNELS)]; @@ -223,22 +216,18 @@ pub(crate) fn convolve_row_handler_fixed_point_4< let weight0 = sliced_weights[0].as_(); let weight1 = sliced_weights[1].as_(); let weight2 = sliced_weights[2].as_(); - sums0 += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight2; - sums1 += fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J) - * weight2; - sums2 += fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J) - * weight2; - sums3 += fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) - * weight2; + sums0 += load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2; + sums1 += load_color_group!(src_ptr1, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J) * weight2; + sums2 += load_color_group!(src_ptr2, 
CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J) * weight2; + sums3 += load_color_group!(src_ptr3, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) * weight2; } else if bounds_size == 4 { let src_ptr0 = &src[px..(px + 4 * CHANNELS)]; let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 4 * CHANNELS)]; @@ -250,30 +239,22 @@ pub(crate) fn convolve_row_handler_fixed_point_4< let weight1 = sliced_weights[1].as_(); let weight2 = sliced_weights[2].as_(); let weight3 = sliced_weights[3].as_(); - sums0 += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) - * weight3; - sums1 += fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J) - * weight3; - sums2 += fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J) - * weight3; - sums3 += fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J) - * weight3; + sums0 += load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight3; + sums1 += load_color_group!(src_ptr1, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J) * weight3; + sums2 += load_color_group!(src_ptr2, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J) * weight3; + sums3 += load_color_group!(src_ptr3, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J) * weight3; } else if bounds_size == 6 { let src_ptr0 = &src[px..(px + 6 * CHANNELS)]; let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 6 * CHANNELS)]; @@ -287,46 +268,30 @@ pub(crate) fn convolve_row_handler_fixed_point_4< let weight3 = sliced_weights[3].as_(); let weight4 = sliced_weights[4].as_(); 
let weight5 = sliced_weights[5].as_(); - sums0 += fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) - * weight3 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J) - * weight4 - + fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J) - * weight5; - sums1 += fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J) - * weight3 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 4, J) - * weight4 - + fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 5, J) - * weight5; - sums2 += fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J) - * weight3 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 4, J) - * weight4 - + fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 5, J) - * weight5; - sums3 += fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J) * weight1 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) - * weight2 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J) - * weight3 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 4, J) - * weight4 - + fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 5, J) - * weight5; + sums0 += load_color_group!(src_ptr0, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight3 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J) * weight4 + + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J) * weight5; + sums1 += load_color_group!(src_ptr1, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J) * weight3 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 4, J) * weight4 + + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 5, J) * weight5; + sums2 += load_color_group!(src_ptr2, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J) * weight3 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 4, J) * weight4 + + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 5, J) * weight5; + sums3 += load_color_group!(src_ptr3, CHANNELS, J) * weight0 + + load_color_group_with_offset!(src_ptr3, CHANNELS, 
CHANNELS, J) * weight1 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J) * weight2 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J) * weight3 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 4, J) * weight4 + + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 5, J) * weight5; } else { let src_ptr0 = &src[px..(px + bounds_size * CHANNELS)]; let src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds_size * CHANNELS)]; @@ -345,10 +310,10 @@ pub(crate) fn convolve_row_handler_fixed_point_4< { let weight: J = k_weight.as_(); - let new_px0 = fast_load_color_group!(src0, CHANNELS, J); - let new_px1 = fast_load_color_group!(src1, CHANNELS, J); - let new_px2 = fast_load_color_group!(src2, CHANNELS, J); - let new_px3 = fast_load_color_group!(src3, CHANNELS, J); + let new_px0 = load_color_group!(src0, CHANNELS, J); + let new_px1 = load_color_group!(src1, CHANNELS, J); + let new_px2 = load_color_group!(src2, CHANNELS, J); + let new_px3 = load_color_group!(src3, CHANNELS, J); sums0 += new_px0 * weight; sums1 += new_px1 * weight; diff --git a/src/fixed_point_horizontal_ar30.rs b/src/fixed_point_horizontal_ar30.rs new file mode 100644 index 0000000..b46a7cb --- /dev/null +++ b/src/fixed_point_horizontal_ar30.rs @@ -0,0 +1,295 @@ +/* + * Copyright (c) Radzivon Bartoshyk, 10/2024. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+#![forbid(unsafe_code)]
+use crate::color_group::{load_ar30, load_ar30_p, load_ar30_with_offset, ColorGroup};
+use crate::filter_weights::FilterWeights;
+use crate::support::ROUNDING_CONST;
+use num_traits::AsPrimitive;
+
+#[inline(always)]
+pub(crate) fn convolve_row_handler_fixed_point_ar30<
+    const AR30_TYPE: usize,
+    const AR30_ORDER: usize,
+>(
+    src: &[u32],
+    dst: &mut [u32],
+    filter_weights: &FilterWeights<i16>,
+) {
+    for ((chunk, &bounds), weights) in dst.iter_mut().zip(filter_weights.bounds.iter()).zip(
+        filter_weights
+            .weights
+            .chunks_exact(filter_weights.aligned_size),
+    ) {
+        let mut sums = ColorGroup::<4, i32>::dup(ROUNDING_CONST.as_());
+
+        let start_x = bounds.start;
+        let bounds_size = bounds.size;
+
+        let px = start_x;
+
+        if bounds_size == 2 {
+            let src_ptr0 = &src[px..(px + 2)];
+            let sliced_weights = &weights[0..2];
+            let weight0 = sliced_weights[0] as i32;
+            let weight1 = sliced_weights[1] as i32;
+            sums += load_ar30!(src_ptr0, AR30_TYPE, AR30_ORDER) * weight0
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1;
+        } else if bounds_size == 3 {
+            let src_ptr0 = &src[px..(px + 3)];
+            let sliced_weights = &weights[0..3];
+            let weight0 = sliced_weights[0] as i32;
+            let weight1 = sliced_weights[1] as i32;
+            let weight2 = sliced_weights[2] as i32;
+            sums += load_ar30!(src_ptr0, AR30_TYPE, AR30_ORDER) * weight0
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 2) * weight2;
+        } else if bounds_size == 4 {
+            let src_ptr0 = &src[px..(px + 4)];
+            let sliced_weights = &weights[0..4];
+            let weight0 = sliced_weights[0] as i32;
+            let weight1 = sliced_weights[1] as i32;
+            let weight2 = sliced_weights[2] as i32;
+            let weight3 = sliced_weights[3] as i32;
+            sums += load_ar30!(src_ptr0, AR30_TYPE, AR30_ORDER) * weight0
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 2) * weight2
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 3) * weight3;
+        } else if bounds_size == 6 {
+            let src_ptr0 = &src[px..(px + 6)];
+
+            let sliced_weights = &weights[0..6];
+            let weight0 = sliced_weights[0] as i32;
+            let weight1 = sliced_weights[1] as i32;
+            let weight2 = sliced_weights[2] as i32;
+            let weight3 = sliced_weights[3] as i32;
+            let weight4 = sliced_weights[4] as i32;
+            let weight5 = sliced_weights[5] as i32;
+            sums += load_ar30!(src_ptr0, AR30_TYPE, AR30_ORDER) * weight0
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 2) * weight2
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 3) * weight3
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 4) * weight4
+                + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 5) * weight5;
+        } else {
+            let src_ptr0 = &src[px..(px + bounds_size)];
+            for (&k_weight, src) in weights.iter().zip(src_ptr0.iter()).take(bounds.size) {
+                let weight: i32 = k_weight as i32;
+                let new_px = load_ar30_p!(src, AR30_TYPE, AR30_ORDER);
+                sums += new_px * weight;
+            }
+        }
+
+        let narrowed = sums.saturate_ar30();
+        *chunk = narrowed.to_ar30::<AR30_TYPE, AR30_ORDER>();
+    }
+}
+
+#[inline(always)]
+pub(crate) fn convolve_row_handler_fixed_point_4_ar30<
+    const AR30_TYPE: usize,
+    const AR30_ORDER: usize,
+>(
+    src: &[u32],
+    src_stride: usize,
+    dst: &mut [u32],
+    dst_stride: usize,
+    filter_weights: &FilterWeights<i16>,
+) {
+    let (row0_ref, rest) = dst.split_at_mut(dst_stride);
+    let (row1_ref, rest) =
rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.iter_mut(); + let iter_row1 = row1_ref.iter_mut(); + let iter_row2 = row2_ref.iter_mut(); + let iter_row3 = row3_ref.iter_mut(); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut sums0 = ColorGroup::<4, i32>::dup(ROUNDING_CONST.as_()); + let mut sums1 = ColorGroup::<4, i32>::dup(ROUNDING_CONST.as_()); + let mut sums2 = ColorGroup::<4, i32>::dup(ROUNDING_CONST.as_()); + let mut sums3 = ColorGroup::<4, i32>::dup(ROUNDING_CONST.as_()); + + let start_x = bounds.start; + + let px = start_x; + let bounds_size = bounds.size; + + if bounds_size == 2 { + let src_ptr0 = &src[px..(px + 2)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 2)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 2)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 2)]; + + let sliced_weights = &weights[0..2]; + let weight0 = sliced_weights[0] as i32; + let weight1 = sliced_weights[1] as i32; + sums0 += load_ar30!(src_ptr0, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1; + sums1 += load_ar30!(src_ptr1, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 1) * weight1; + sums2 += load_ar30!(src_ptr2, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 1) * weight1; + sums3 += load_ar30!(src_ptr3, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 1) * weight1; + } else if bounds_size == 3 { + let src_ptr0 = &src[px..(px + 3)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 3)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 3)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 3)]; + + let sliced_weights = &weights[0..3]; + let weight0 = sliced_weights[0] as i32; + let weight1 = sliced_weights[1] as i32; + let weight2 = sliced_weights[2] as i32; + sums0 += load_ar30!(src_ptr0, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 2) * weight2; + sums1 += load_ar30!(src_ptr1, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 2) * weight2; + sums2 += load_ar30!(src_ptr2, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 2) * weight2; + sums3 += load_ar30!(src_ptr3, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 2) * weight2; + } else if bounds_size == 4 { + let src_ptr0 = &src[px..(px + 4)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 4)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 4)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 4)]; + + let sliced_weights = &weights[0..4]; + let weight0 = sliced_weights[0] as i32; + let weight1 = sliced_weights[1] as i32; + let weight2 = sliced_weights[2] as 
i32; + let weight3 = sliced_weights[3] as i32; + sums0 += load_ar30!(src_ptr0, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 2) * weight2 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 3) * weight3; + sums1 += load_ar30!(src_ptr1, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 2) * weight2 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 3) * weight3; + sums2 += load_ar30!(src_ptr2, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 2) * weight2 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 3) * weight3; + sums3 += load_ar30!(src_ptr3, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 2) * weight2 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 3) * weight3; + } else if bounds_size == 6 { + let src_ptr0 = &src[px..(px + 6)]; + let src_ptr1 = &src[(px + src_stride)..(px + src_stride + 6)]; + let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + 6)]; + let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + 6)]; + + let sliced_weights = &weights[0..6]; + let weight0 = sliced_weights[0] as i32; + let weight1 = sliced_weights[1] as i32; + let weight2 = sliced_weights[2] as i32; + let weight3 = sliced_weights[3] as i32; + let weight4 = sliced_weights[4] as i32; + let weight5 = sliced_weights[5] as i32; + sums0 += load_ar30!(src_ptr0, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 2) * weight2 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 3) * weight3 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 4) * weight4 + + load_ar30_with_offset!(src_ptr0, AR30_TYPE, AR30_ORDER, 5) * weight5; + sums1 += load_ar30!(src_ptr1, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 2) * weight2 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 3) * weight3 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 4) * weight4 + + load_ar30_with_offset!(src_ptr1, AR30_TYPE, AR30_ORDER, 5) * weight5; + sums2 += load_ar30!(src_ptr2, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 2) * weight2 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 3) * weight3 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 4) * weight4 + + load_ar30_with_offset!(src_ptr2, AR30_TYPE, AR30_ORDER, 5) * weight5; + sums3 += load_ar30!(src_ptr3, AR30_TYPE, AR30_ORDER) * weight0 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 1) * weight1 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 2) * weight2 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 3) * weight3 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 4) * weight4 + + load_ar30_with_offset!(src_ptr3, AR30_TYPE, AR30_ORDER, 5) * weight5; + } else { + let src_ptr0 = &src[px..(px + bounds_size)]; + let 
src_ptr1 = &src[(px + src_stride)..(px + src_stride + bounds_size)];
+            let src_ptr2 = &src[(px + src_stride * 2)..(px + src_stride * 2 + bounds_size)];
+            let src_ptr3 = &src[(px + src_stride * 3)..(px + src_stride * 3 + bounds_size)];
+
+            for ((((&k_weight, src0), src1), src2), src3) in weights
+                .iter()
+                .zip(src_ptr0.iter())
+                .zip(src_ptr1.iter())
+                .zip(src_ptr2.iter())
+                .zip(src_ptr3.iter())
+                .take(bounds.size)
+            {
+                let weight: i32 = k_weight as i32;
+
+                let new_px0 = load_ar30_p!(src0, AR30_TYPE, AR30_ORDER);
+                let new_px1 = load_ar30_p!(src1, AR30_TYPE, AR30_ORDER);
+                let new_px2 = load_ar30_p!(src2, AR30_TYPE, AR30_ORDER);
+                let new_px3 = load_ar30_p!(src3, AR30_TYPE, AR30_ORDER);
+
+                sums0 += new_px0 * weight;
+                sums1 += new_px1 * weight;
+                sums2 += new_px2 * weight;
+                sums3 += new_px3 * weight;
+            }
+        }
+
+        let narrowed0 = sums0.saturate_ar30();
+        let narrowed1 = sums1.saturate_ar30();
+        let narrowed2 = sums2.saturate_ar30();
+        let narrowed3 = sums3.saturate_ar30();
+
+        *chunk0 = narrowed0.to_ar30::<AR30_TYPE, AR30_ORDER>();
+        *chunk1 = narrowed1.to_ar30::<AR30_TYPE, AR30_ORDER>();
+        *chunk2 = narrowed2.to_ar30::<AR30_TYPE, AR30_ORDER>();
+        *chunk3 = narrowed3.to_ar30::<AR30_TYPE, AR30_ORDER>();
+    }
+}
diff --git a/src/fixed_point_vertical_ar30.rs b/src/fixed_point_vertical_ar30.rs
new file mode 100644
index 0000000..86a84f7
--- /dev/null
+++ b/src/fixed_point_vertical_ar30.rs
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) Radzivon Bartoshyk, 10/2024. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
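For orientation before the vertical pass: an AR30 pixel packs three 10-bit color channels and 2 bits of alpha into one `u32`. A minimal scalar sketch of the `Ar30` layout, with field positions inferred from the NEON unpack later in this patch (helper names are illustrative, not crate API); `Ra30` is the same idea with alpha in the low two bits and R in the top ten:

```rust
fn unpack_ar30(v: u32) -> (u16, u16, u16, u8) {
    let r = (v & 0x3ff) as u16; // bits 0..10
    let g = ((v >> 10) & 0x3ff) as u16; // bits 10..20
    let b = ((v >> 20) & 0x3ff) as u16; // bits 20..30
    let a = (v >> 30) as u8; // bits 30..32: only 2 bits of alpha
    (r, g, b, a)
}

fn pack_ar30(r: u16, g: u16, b: u16, a: u8) -> u32 {
    ((a as u32 & 0x3) << 30)
        | ((b as u32 & 0x3ff) << 20)
        | ((g as u32 & 0x3ff) << 10)
        | (r as u32 & 0x3ff)
}
```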
+use crate::color_group::{load_ar30_p, ColorGroup};
+use crate::filter_weights::FilterBounds;
+use crate::support::ROUNDING_CONST;
+
+#[inline(always)]
+/// # Generics
+/// `AR30_TYPE` - the 30-bit pixel layout (`Ar30` or `Ra30`)
+/// `AR30_ORDER` - stored byte order (0 = host, 1 = reversed)
+/// `BUFFER_SIZE` - width of the processed strip, in pixels
+pub(crate) fn convolve_column_handler_fip_db_ar30<
+    const AR30_TYPE: usize,
+    const AR30_ORDER: usize,
+    const BUFFER_SIZE: usize,
+>(
+    src: &[u32],
+    src_stride: usize,
+    dst: &mut [u32],
+    filter: &[i16],
+    bounds: &FilterBounds,
+    x: usize,
+) {
+    if filter.is_empty() {
+        return;
+    }
+    let mut direct_store: [ColorGroup<4, i32>; BUFFER_SIZE] =
+        [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE];
+
+    let v_start_px = x;
+
+    let py = bounds.start;
+    let weight = filter[0] as i32;
+    let offset = src_stride * py + v_start_px;
+    let src_ptr = &src[offset..(offset + BUFFER_SIZE)];
+
+    for (dst, src) in direct_store.iter_mut().zip(src_ptr) {
+        *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+    }
+
+    for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() {
+        // Add 1: `enumerate` restarts at 0 after `skip(1)`, so `j` lags the tap index by one
+        let py = bounds.start + j + 1;
+        let weight = k_weight as i32;
+        let offset = src_stride * py + v_start_px;
+        let src_ptr = &src[offset..(offset + BUFFER_SIZE)];
+
+        for (dst, src) in direct_store.iter_mut().zip(src_ptr.iter()) {
+            *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+        }
+    }
+
+    let v_dst = &mut dst[v_start_px..(v_start_px + BUFFER_SIZE)];
+    for (dst, src) in v_dst.iter_mut().zip(direct_store) {
+        let saturated = src.saturate_ar30().to_ar30::<AR30_TYPE, AR30_ORDER>();
+        *dst = saturated;
+    }
+}
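The `_double` and `_four` variants that follow repeat this body over two and four independent `BUFFER_SIZE`-pixel strips, and the dispatcher at the end of this file picks the widest variant that still fits the remaining row. The duplication buys instruction-level parallelism: each strip owns its own accumulator array, so the adds form separate dependency chains. A scalar analogue of the same unrolling pattern (illustrative sketch, not crate code):

```rust
// Two independent accumulators per iteration: the adds on `acc0` and `acc1`
// do not depend on each other, so the CPU can issue them in parallel.
fn column_pass_two_strips(rows: &[&[f32]], weights: &[f32], out: &mut [f32; 2]) {
    let (mut acc0, mut acc1) = (0.0f32, 0.0f32);
    for (row, &w) in rows.iter().zip(weights) {
        acc0 += row[0] * w; // strip 0
        acc1 += row[1] * w; // strip 1, separate dependency chain
    }
    *out = [acc0, acc1];
}
```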
+#[inline(always)]
+/// # Generics
+/// `AR30_TYPE` - the 30-bit pixel layout (`Ar30` or `Ra30`)
+/// `AR30_ORDER` - stored byte order (0 = host, 1 = reversed)
+/// `BUFFER_SIZE` - width in pixels of each processed strip
+fn convolve_column_handler_fixed_point_direct_buffer_double<
+    const AR30_TYPE: usize,
+    const AR30_ORDER: usize,
+    const BUFFER_SIZE: usize,
+>(
+    src: &[u32],
+    src_stride: usize,
+    dst: &mut [u32],
+    filter: &[i16],
+    bounds: &FilterBounds,
+    x: usize,
+) {
+    if filter.is_empty() {
+        return;
+    }
+    let mut direct_store0: [ColorGroup<4, i32>; BUFFER_SIZE] =
+        [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE];
+    let mut direct_store1: [ColorGroup<4, i32>; BUFFER_SIZE] =
+        [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE];
+
+    let v_start_px = x;
+
+    let py = bounds.start;
+    let weight = filter[0] as i32;
+    let offset = src_stride * py + v_start_px;
+    let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)];
+    let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)];
+
+    for (dst, src) in direct_store0.iter_mut().zip(src_ptr0) {
+        *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+    }
+
+    for (dst, src) in direct_store1.iter_mut().zip(src_ptr1) {
+        *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+    }
+
+    for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() {
+        // Add 1: `enumerate` restarts at 0 after `skip(1)`, so `j` lags the tap index by one
+        let py = bounds.start + j + 1;
+        let weight = k_weight as i32;
+        let offset = src_stride * py + v_start_px;
+        let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)];
+        let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)];
+
+        for (dst, src) in direct_store0.iter_mut().zip(src_ptr0.iter()) {
+            *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+        }
+        for (dst, src) in direct_store1.iter_mut().zip(src_ptr1.iter()) {
+            *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+        }
+    }
+
+    let v_dst0 = &mut dst[v_start_px..(v_start_px + BUFFER_SIZE)];
+    for (dst, src) in v_dst0.iter_mut().zip(direct_store0) {
+        let saturated = src.saturate_ar30().to_ar30::<AR30_TYPE, AR30_ORDER>();
+        *dst = saturated;
+    }
+
+    let v_dst1 = &mut dst[(v_start_px + BUFFER_SIZE)..(v_start_px + BUFFER_SIZE * 2)];
+    for (dst, src) in v_dst1.iter_mut().zip(direct_store1) {
+        let saturated = src.saturate_ar30().to_ar30::<AR30_TYPE, AR30_ORDER>();
+        *dst = saturated;
+    }
+}
+
+#[inline(always)]
+/// # Generics
+/// `AR30_TYPE` - the 30-bit pixel layout (`Ar30` or `Ra30`)
+/// `AR30_ORDER` - stored byte order (0 = host, 1 = reversed)
+/// `BUFFER_SIZE` - width in pixels of each processed strip
+fn convolve_column_handler_fixed_point_direct_buffer_four<
+    const AR30_TYPE: usize,
+    const AR30_ORDER: usize,
+    const BUFFER_SIZE: usize,
+>(
+    src: &[u32],
+    src_stride: usize,
+    dst: &mut [u32],
+    filter: &[i16],
+    bounds: &FilterBounds,
+    x: usize,
+) {
+    if filter.is_empty() {
+        return;
+    }
+    let mut direct_store0: [ColorGroup<4, i32>; BUFFER_SIZE] =
+        [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE];
+    let mut direct_store1: [ColorGroup<4, i32>; BUFFER_SIZE] =
+        [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE];
+    let mut direct_store2: [ColorGroup<4, i32>; BUFFER_SIZE] =
+        [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE];
+    let mut direct_store3: [ColorGroup<4, i32>; BUFFER_SIZE] =
+        [ColorGroup::<4, i32>::dup(ROUNDING_CONST); BUFFER_SIZE];
+
+    let v_start_px = x;
+
+    let py = bounds.start;
+    let weight = filter[0] as i32;
+    let offset = src_stride * py + v_start_px;
+    let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)];
+    let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)];
+    let src_ptr2 = &src[(offset + BUFFER_SIZE * 2)..(offset + BUFFER_SIZE * 3)];
+    let src_ptr3 = &src[(offset + BUFFER_SIZE * 3)..(offset + BUFFER_SIZE * 4)];
+
+    for (dst, src) in direct_store0.iter_mut().zip(src_ptr0) {
+        *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+    }
+
+    for (dst, src) in direct_store1.iter_mut().zip(src_ptr1) {
+        *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+    }
+
+    for (dst, src) in direct_store2.iter_mut().zip(src_ptr2) {
+        *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+    }
+
+    for (dst, src) in direct_store3.iter_mut().zip(src_ptr3) {
+        *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+    }
+
+    for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() {
+        // Add 1: `enumerate` restarts at 0 after `skip(1)`, so `j` lags the tap index by one
+        let py = bounds.start + j + 1;
+        let weight = k_weight as i32;
+        let offset = src_stride * py + v_start_px;
+        let src_ptr0 = &src[offset..(offset + BUFFER_SIZE)];
+        let src_ptr1 = &src[(offset + BUFFER_SIZE)..(offset + BUFFER_SIZE * 2)];
+        let src_ptr2 = &src[(offset + BUFFER_SIZE * 2)..(offset + BUFFER_SIZE * 3)];
+        let src_ptr3 = &src[(offset + BUFFER_SIZE * 3)..(offset + BUFFER_SIZE * 4)];
+
+        for (dst, src) in direct_store0.iter_mut().zip(src_ptr0.iter()) {
+            *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+        }
+        for (dst, src) in direct_store1.iter_mut().zip(src_ptr1.iter()) {
+            *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+        }
+        for (dst, src) in direct_store2.iter_mut().zip(src_ptr2.iter()) {
+            *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+        }
+        for (dst, src) in direct_store3.iter_mut().zip(src_ptr3.iter()) {
+            *dst += load_ar30_p!(src, AR30_TYPE, AR30_ORDER) * weight;
+        }
+    }
+
+    let v_dst0 = &mut dst[v_start_px..(v_start_px + BUFFER_SIZE)];
+    for (dst, src) in v_dst0.iter_mut().zip(direct_store0) {
+        let saturated = src.saturate_ar30().to_ar30::<AR30_TYPE, AR30_ORDER>();
+        *dst = saturated;
+    }
+
+    let v_dst1 = &mut dst[(v_start_px + BUFFER_SIZE)..(v_start_px + BUFFER_SIZE * 2)];
+    for (dst, src) in v_dst1.iter_mut().zip(direct_store1) {
+        let saturated = src.saturate_ar30().to_ar30::<AR30_TYPE, AR30_ORDER>();
+        *dst = saturated;
+    }
+
+    let v_dst2 = &mut dst[(v_start_px + BUFFER_SIZE * 2)..(v_start_px + BUFFER_SIZE * 3)];
+    for (dst, src) in v_dst2.iter_mut().zip(direct_store2) {
+        let saturated = src.saturate_ar30().to_ar30::<AR30_TYPE, AR30_ORDER>();
+        *dst = saturated;
+    }
+
+    let v_dst3 = &mut dst[(v_start_px + BUFFER_SIZE * 3)..(v_start_px + BUFFER_SIZE * 4)];
+    for (dst, src) in v_dst3.iter_mut().zip(direct_store3) {
+        let saturated = src.saturate_ar30().to_ar30::<AR30_TYPE, AR30_ORDER>();
+        *dst = saturated;
+    }
+}
+
+pub(crate) fn column_handler_fixed_point_ar30<const AR30_TYPE: usize, const AR30_ORDER: usize>(
+    bounds: &FilterBounds,
+    src: &[u32],
+    dst: &mut [u32],
+    src_stride: usize,
+    weight: &[i16],
+) {
+    let mut cx = 0usize;
+
+    let total_width = dst.len();
+
+    while cx + 64 < total_width {
+        convolve_column_handler_fixed_point_direct_buffer_four::<AR30_TYPE, AR30_ORDER, 16>(
+            src, src_stride, dst, weight, bounds, cx,
+        );
+
+        cx += 64;
+    }
+
+    while cx + 32 < total_width {
+        convolve_column_handler_fixed_point_direct_buffer_double::<AR30_TYPE, AR30_ORDER, 16>(
+            src, src_stride, dst, weight, bounds, cx,
+        );
+
+        cx += 32;
+    }
+
+    while cx + 16 < total_width {
+        convolve_column_handler_fip_db_ar30::<AR30_TYPE, AR30_ORDER, 16>(
+            src, src_stride, dst, weight, bounds, cx,
+        );
+
+        cx += 16;
+    }
+
+    while cx + 8 < total_width {
+        convolve_column_handler_fip_db_ar30::<AR30_TYPE, AR30_ORDER, 8>(
+            src, src_stride, dst, weight, bounds, cx,
+        );
+
+        cx += 8;
+    }
+
+    while cx < total_width {
+        convolve_column_handler_fip_db_ar30::<AR30_TYPE, AR30_ORDER, 1>(
+            src, src_stride, dst, weight, bounds, cx,
+        );
+
+        cx += 1;
+    }
+}
diff --git a/src/floating_point_horizontal.rs b/src/floating_point_horizontal.rs
index aa7fd99..9fc80c4 100644
--- a/src/floating_point_horizontal.rs
+++ b/src/floating_point_horizontal.rs
@@ -27,8 +27,7 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ use crate::color_group::{ - fast_load_color_group, fast_load_color_group_with_offset, fast_mixed_store_color_group, - ColorGroup, + fast_mixed_store_color_group, load_color_group, load_color_group_with_offset, ColorGroup, }; use crate::filter_weights::FilterWeights; use crate::mixed_storage::MixedStorage; @@ -83,8 +82,8 @@ pub(crate) fn convolve_row_handler_floating_point< let sliced_weights = &weights[0..2]; let weight0 = sliced_weights[0].as_(); let weight1 = sliced_weights[1].as_(); - sums = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0).mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + sums = (load_color_group!(src_ptr0, CHANNELS, J) * weight0).mul_add( + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), weight1, ); } else if bounds_size == 3 { @@ -94,13 +93,13 @@ pub(crate) fn convolve_row_handler_floating_point< let weight0 = sliced_weights[0].as_(); let weight1 = sliced_weights[1].as_(); let weight2 = sliced_weights[2].as_(); - sums = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + sums = (load_color_group!(src_ptr0, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), weight2, ); } else if bounds_size == 4 { @@ -111,17 +110,17 @@ pub(crate) fn convolve_row_handler_floating_point< let weight1 = sliced_weights[1].as_(); let weight2 = sliced_weights[2].as_(); let weight3 = sliced_weights[3].as_(); - sums = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + sums = (load_color_group!(src_ptr0, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), weight3, ); } else if bounds_size == 6 { @@ -134,25 +133,25 @@ pub(crate) fn convolve_row_handler_floating_point< let weight3 = sliced_weights[3].as_(); let weight4 = sliced_weights[4].as_(); let weight5 = sliced_weights[5].as_(); - sums = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + sums = (load_color_group!(src_ptr0, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), weight3, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J), weight4, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J), weight5, ); } else { @@ -163,7 +162,7 @@ pub(crate) fn convolve_row_handler_floating_point< 
.take(bounds.size) { let weight: J = k_weight.as_(); - let new_px = fast_load_color_group!(src, CHANNELS, J); + let new_px = load_color_group!(src, CHANNELS, J); sums = sums.mul_add(new_px, weight); } } @@ -239,20 +238,20 @@ pub(crate) fn convolve_row_handler_floating_point_4< let sliced_weights = &weights[0..2]; let weight0 = sliced_weights[0].as_(); let weight1 = sliced_weights[1].as_(); - sums0 = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0).mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + sums0 = (load_color_group!(src_ptr0, CHANNELS, J) * weight0).mul_add( + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), weight1, ); - sums1 = (fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0).mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + sums1 = (load_color_group!(src_ptr1, CHANNELS, J) * weight0).mul_add( + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), weight1, ); - sums2 = (fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0).mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + sums2 = (load_color_group!(src_ptr2, CHANNELS, J) * weight0).mul_add( + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), weight1, ); - sums3 = (fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0).mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + sums3 = (load_color_group!(src_ptr3, CHANNELS, J) * weight0).mul_add( + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), weight1, ); } else if bounds_size == 3 { @@ -265,40 +264,40 @@ pub(crate) fn convolve_row_handler_floating_point_4< let weight0 = sliced_weights[0].as_(); let weight1 = sliced_weights[1].as_(); let weight2 = sliced_weights[2].as_(); - sums0 = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + sums0 = (load_color_group!(src_ptr0, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), weight2, ); - sums1 = (fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0) + sums1 = (load_color_group!(src_ptr1, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), weight2, ); - sums2 = (fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0) + sums2 = (load_color_group!(src_ptr2, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), weight2, ); - sums3 = (fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0) + sums3 = (load_color_group!(src_ptr3, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + 
load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), weight2, ); } else if bounds_size == 4 { @@ -312,56 +311,56 @@ pub(crate) fn convolve_row_handler_floating_point_4< let weight1 = sliced_weights[1].as_(); let weight2 = sliced_weights[2].as_(); let weight3 = sliced_weights[3].as_(); - sums0 = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + sums0 = (load_color_group!(src_ptr0, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), weight3, ); - sums1 = (fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0) + sums1 = (load_color_group!(src_ptr1, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), weight3, ); - sums2 = (fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0) + sums2 = (load_color_group!(src_ptr2, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), weight3, ); - sums3 = (fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0) + sums3 = (load_color_group!(src_ptr3, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), weight3, ); } else if bounds_size == 6 { @@ -377,88 +376,88 @@ pub(crate) fn convolve_row_handler_floating_point_4< let weight3 = sliced_weights[3].as_(); let weight4 = sliced_weights[4].as_(); let weight5 = sliced_weights[5].as_(); - sums0 = (fast_load_color_group!(src_ptr0, CHANNELS, J) * weight0) + sums0 = (load_color_group!(src_ptr0, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), + 
load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J), weight3, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 4, J), weight4, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J), + load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 5, J), weight5, ); - sums1 = (fast_load_color_group!(src_ptr1, CHANNELS, J) * weight0) + sums1 = (load_color_group!(src_ptr1, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), weight3, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 4, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 4, J), weight4, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 5, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 5, J), weight5, ); - sums2 = (fast_load_color_group!(src_ptr2, CHANNELS, J) * weight0) + sums2 = (load_color_group!(src_ptr2, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), weight3, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 4, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 4, J), weight4, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 5, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 5, J), weight5, ); - sums3 = (fast_load_color_group!(src_ptr3, CHANNELS, J) * weight0) + sums3 = (load_color_group!(src_ptr3, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), weight3, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 4, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 4, J), weight4, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 5, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 5, J), weight5, ); } else { @@ -479,10 +478,10 @@ pub(crate) fn convolve_row_handler_floating_point_4< { let weight: J = k_weight.as_(); - let new_px0 = fast_load_color_group!(src0, CHANNELS, J); - let new_px1 = fast_load_color_group!(src1, CHANNELS, J); - let new_px2 = fast_load_color_group!(src2, 
CHANNELS, J); - let new_px3 = fast_load_color_group!(src3, CHANNELS, J); + let new_px0 = load_color_group!(src0, CHANNELS, J); + let new_px1 = load_color_group!(src1, CHANNELS, J); + let new_px2 = load_color_group!(src2, CHANNELS, J); + let new_px3 = load_color_group!(src3, CHANNELS, J); sums0 = sums0.mul_add(new_px0, weight); sums1 = sums1.mul_add(new_px1, weight); diff --git a/src/floating_point_vertical.rs b/src/floating_point_vertical.rs index 4d82044..9f38b38 100644 --- a/src/floating_point_vertical.rs +++ b/src/floating_point_vertical.rs @@ -27,8 +27,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ use crate::color_group::{ - fast_load_color_group, fast_load_color_group_with_offset, fast_mixed_store_color_group, - ColorGroup, + fast_mixed_store_color_group, load_color_group, load_color_group_with_offset, ColorGroup, }; use crate::filter_weights::FilterBounds; use crate::mixed_storage::MixedStorage; @@ -83,26 +82,23 @@ pub(crate) fn convolve_column_handler_floating_point_4< let src_ptr0 = &src[offset0..(offset0 + CHANNELS * 4)]; let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; - sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + sums0 = (load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0).mul_add( + load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ); + sums1 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), weight1, ); - sums1 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) + sums2 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), weight1, ); - sums2 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight0) + sums3 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), - weight1, - ); - sums3 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) - * weight0) - .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), weight1, ); } else if bounds_size == 3 { @@ -117,45 +113,43 @@ pub(crate) fn convolve_column_handler_floating_point_4< let src_ptr1 = &src[offset1..(offset1 + CHANNELS * 4)]; let src_ptr2 = &src[offset2..(offset2 + CHANNELS * 4)]; - sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + sums0 = (load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), weight2, ); - sums1 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) + sums1 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - 
fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), weight2, ); - sums2 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight0) + sums2 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), weight2, ); - sums3 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) - * weight0) + sums3 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), weight2, ); } else if bounds_size == 4 { @@ -173,61 +167,59 @@ pub(crate) fn convolve_column_handler_floating_point_4< let src_ptr2 = &src[offset2..(offset2 + CHANNELS * 4)]; let src_ptr3 = &src[offset3..(offset3 + CHANNELS * 4)]; - sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + sums0 = (load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), weight3, ); - sums1 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) + sums1 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), weight3, ); - sums2 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight0) + sums2 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), weight3, ); - sums3 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) - * weight0) + sums3 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight0) .mul_add( - 
fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), weight3, ); } else if bounds_size == 6 { @@ -251,93 +243,91 @@ pub(crate) fn convolve_column_handler_floating_point_4< let src_ptr4 = &src[offset4..(offset4 + CHANNELS * 4)]; let src_ptr5 = &src[offset5..(offset5 + CHANNELS * 4)]; - sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + sums0 = (load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J), weight3, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr4, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr4, CHANNELS, 0, J), weight4, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr5, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr5, CHANNELS, 0, J), weight5, ); - sums1 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) + sums1 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS, J), weight3, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr4, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr4, CHANNELS, CHANNELS, J), weight4, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr5, CHANNELS, CHANNELS, J), + load_color_group_with_offset!(src_ptr5, CHANNELS, CHANNELS, J), weight5, ); - sums2 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) - * weight0) + sums2 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 2, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 2, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 2, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 2, J), weight3, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr4, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr4, CHANNELS, CHANNELS * 2, J), weight4, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr5, CHANNELS, CHANNELS * 2, J), + load_color_group_with_offset!(src_ptr5, CHANNELS, CHANNELS * 2, 
J), weight5, ); - sums3 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) - * weight0) + sums3 = (load_color_group_with_offset!(src_ptr0, CHANNELS, CHANNELS * 3, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, CHANNELS * 3, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, CHANNELS * 3, J), weight2, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr3, CHANNELS, CHANNELS * 3, J), weight3, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr4, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr4, CHANNELS, CHANNELS * 3, J), weight4, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr5, CHANNELS, CHANNELS * 3, J), + load_color_group_with_offset!(src_ptr5, CHANNELS, CHANNELS * 3, J), weight5, ); } else { @@ -347,12 +337,10 @@ pub(crate) fn convolve_column_handler_floating_point_4< let offset = src_stride * py + v_start_px; let src_ptr = &src[offset..(offset + CHANNELS * 4)]; - let new_px0 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, 0, J); - let new_px1 = fast_load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS, J); - let new_px2 = - fast_load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS * 2, J); - let new_px3 = - fast_load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS * 3, J); + let new_px0 = load_color_group_with_offset!(src_ptr, CHANNELS, 0, J); + let new_px1 = load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS, J); + let new_px2 = load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS * 2, J); + let new_px3 = load_color_group_with_offset!(src_ptr, CHANNELS, CHANNELS * 3, J); sums0 = sums0.mul_add(new_px0, weight); sums1 = sums1.mul_add(new_px1, weight); @@ -435,11 +423,10 @@ pub(crate) fn convolve_column_handler_floating_point< let src_ptr0 = &src[offset0..(offset0 + CHANNELS)]; let src_ptr1 = &src[offset1..(offset1 + CHANNELS)]; - sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) - .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), - weight1, - ); + sums0 = (load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0).mul_add( + load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + weight1, + ); } else if bounds_size == 3 { let weights = &filter[0..3]; let weight0 = weights[0].as_(); @@ -452,13 +439,13 @@ pub(crate) fn convolve_column_handler_floating_point< let src_ptr1 = &src[offset1..(offset1 + CHANNELS)]; let src_ptr2 = &src[offset2..(offset2 + CHANNELS)]; - sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + sums0 = (load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) .mul_add( - fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J), weight1, ) .mul_add( - fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), + load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J), weight2, ); } else if bounds_size == 4 { @@ -476,17 +463,17 @@ pub(crate) fn convolve_column_handler_floating_point< let src_ptr2 = &src[offset2..(offset2 + CHANNELS)]; let src_ptr3 = &src[offset3..(offset3 + CHANNELS)]; - sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0) + sums0 = (load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * 
weight0)
             .mul_add(
-                fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J),
+                load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J),
                 weight1,
             )
             .mul_add(
-                fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J),
+                load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J),
                 weight2,
             )
             .mul_add(
-                fast_load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J),
+                load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J),
                 weight3,
             );
     } else if bounds_size == 6 {
@@ -510,25 +497,25 @@ pub(crate) fn convolve_column_handler_floating_point<
         let src_ptr4 = &src[offset4..(offset4 + CHANNELS)];
         let src_ptr5 = &src[offset5..(offset5 + CHANNELS)];
 
-        sums0 = (fast_load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0)
+        sums0 = (load_color_group_with_offset!(src_ptr0, CHANNELS, 0, J) * weight0)
             .mul_add(
-                fast_load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J),
+                load_color_group_with_offset!(src_ptr1, CHANNELS, 0, J),
                 weight1,
             )
             .mul_add(
-                fast_load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J),
+                load_color_group_with_offset!(src_ptr2, CHANNELS, 0, J),
                 weight2,
             )
             .mul_add(
-                fast_load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J),
+                load_color_group_with_offset!(src_ptr3, CHANNELS, 0, J),
                 weight3,
             )
             .mul_add(
-                fast_load_color_group_with_offset!(src_ptr4, CHANNELS, 0, J),
+                load_color_group_with_offset!(src_ptr4, CHANNELS, 0, J),
                 weight4,
             )
             .mul_add(
-                fast_load_color_group_with_offset!(src_ptr5, CHANNELS, 0, J),
+                load_color_group_with_offset!(src_ptr5, CHANNELS, 0, J),
                 weight5,
             );
     } else {
@@ -538,7 +525,7 @@ pub(crate) fn convolve_column_handler_floating_point<
             let offset = src_stride * py + v_start_px;
             let src_ptr = &src[offset..(offset + CHANNELS)];
 
-            let new_px0 = fast_load_color_group!(src_ptr, CHANNELS, J);
+            let new_px0 = load_color_group!(src_ptr, CHANNELS, J);
 
             sums0 = sums0.mul_add(new_px0, weight);
         }
diff --git a/src/handler_provider.rs b/src/handler_provider.rs
index c4f2d14..f725b70 100644
--- a/src/handler_provider.rs
+++ b/src/handler_provider.rs
@@ -38,7 +38,7 @@ use crate::floating_point_vertical::column_handler_floating_point;
 use crate::mixed_storage::MixedStorage;
 #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
 use crate::neon::{
-    convolve_column_lb_u16, convolve_column_u16, convolve_horizontal_rgba_neon_rows_4_lb_u8,
+    convolve_column_lb_u16, convolve_column_u16, convolve_horizontal_rgba_neon_rows_4_lb_u16,
     convolve_horizontal_rgba_neon_u16_lb_row,
 };
 use crate::saturate_narrow::SaturateNarrow;
@@ -51,7 +51,7 @@ use crate::sse::{
 use num_traits::{AsPrimitive, Float, MulAdd};
 use std::ops::{Add, AddAssign, Mul};
 
-pub trait ColumnHandlerFloatingPoint<T, J>
+pub(crate) trait ColumnHandlerFloatingPoint<T, J>
 where
     T: Copy + 'static + AsPrimitive<J> + Default,
     J: Copy + 'static + AsPrimitive<T> + MulAdd<J, Output = J> + Default + MixedStorage<T>,
@@ -153,7 +153,7 @@ impl ColumnHandlerFloatingPoint<u16, f32> for u16 {
 default_floating_column_handler!(u8);
 default_floating_column_handler!(f32);
 
-pub trait RowHandlerFloatingPoint<T, J>
+pub(crate) trait RowHandlerFloatingPoint<T, J>
 where
     T: Copy + 'static + AsPrimitive<J> + Default,
     J: Copy + 'static + AsPrimitive<T> + MulAdd<J, Output = J> + Default + MixedStorage<T>,
@@ -263,7 +263,7 @@ impl RowHandlerFloatingPoint<u16, f32> for u16 {
     }
 }
 
-pub trait ColumnHandlerFixedPoint<T> {
+pub(crate) trait ColumnHandlerFixedPoint<T> {
     fn handle_fixed_column(
         dst_width: usize,
         bounds: &FilterBounds,
@@ -285,7 +285,7 @@ pub trait ColumnHandlerFixedPoint<T> {
         i16: AsPrimitive<J>;
 }
 
-pub trait RowHandlerFixedPoint<T> {
+pub(crate) trait RowHandlerFixedPoint<T> {
     fn handle_fixed_row_4(
         src: &[T],
        src_stride: usize,
@@ -382,7 +382,7 @@ impl RowHandlerFixedPoint<u16> for u16 {
         u16: AsPrimitive<J>,
     {
         if COMPONENTS == 4 {
-            convolve_horizontal_rgba_neon_rows_4_lb_u8(
+            convolve_horizontal_rgba_neon_rows_4_lb_u16(
                 src,
                 src_stride,
                 dst,
diff --git a/src/image_store.rs b/src/image_store.rs
index a3a68ad..16bd36f 100644
--- a/src/image_store.rs
+++ b/src/image_store.rs
@@ -69,14 +69,14 @@ pub(crate) enum BufferStore<'a, T: Copy + Debug> {
 }
 
 impl<T: Copy + Debug> BufferStore<'_, T> {
-    pub fn borrow(&self) -> &[T] {
+    pub(crate) fn borrow(&self) -> &[T] {
         match self {
             Self::Borrowed(p_ref) => p_ref,
             Self::Owned(vec) => vec,
         }
     }
 
-    pub fn borrow_mut(&mut self) -> &mut [T] {
+    pub(crate) fn borrow_mut(&mut self) -> &mut [T] {
         match self {
             Self::Borrowed(p_ref) => p_ref,
             Self::Owned(vec) => vec,
diff --git a/src/lib.rs b/src/lib.rs
index 1942a7d..929b771 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -27,6 +27,7 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #![deny(deprecated)]
+#![deny(unreachable_code, unused)]
 #![allow(clippy::too_many_arguments)]
 mod alpha_check;
 #[cfg(feature = "half")]
@@ -34,6 +35,7 @@ mod alpha_handle_f16;
 mod alpha_handle_f32;
 mod alpha_handle_u16;
 mod alpha_handle_u8;
+mod ar30;
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 mod avx2;
 mod color_group;
@@ -43,6 +45,7 @@ mod convolution;
 mod convolve_naive_f32;
 mod convolve_naive_u16;
 mod cpu_features;
+mod dispatch_group_ar30;
 #[cfg(feature = "half")]
 mod dispatch_group_f16;
 mod dispatch_group_f32;
@@ -52,7 +55,9 @@ mod dispatch_group_u8;
 mod f16;
 mod filter_weights;
 mod fixed_point_horizontal;
+mod fixed_point_horizontal_ar30;
 mod fixed_point_vertical;
+mod fixed_point_vertical_ar30;
 mod floating_point_horizontal;
 mod floating_point_vertical;
 mod handler_provider;
@@ -62,12 +67,13 @@ mod math;
 mod mixed_storage;
 mod mlaf;
 mod nearest_sampler;
-#[cfg(all(target_arch = "aarch64", target_feature = "neon",))]
+#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
 mod neon;
 mod pic_scale_error;
 mod plane_f32;
 mod plane_u16;
 mod plane_u8;
+mod resize_ar30;
 mod rgb_f32;
 mod rgb_u16;
 mod rgb_u8;
@@ -87,12 +93,11 @@ mod unsafe_slice;
 #[cfg(all(target_arch = "wasm32", target_feature = "simd128",))]
 mod wasm32;
 
+pub use ar30::Ar30ByteOrder;
 #[cfg(feature = "colorspaces")]
 pub use colors::*;
 #[cfg(feature = "colorspaces")]
 pub use colorutils_rs::TransferFunction;
-#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
-pub use cpu_features::{is_aarch_f16_supported, is_aarch_f16c_supported};
 pub use image_size::ImageSize;
 pub use image_store::ImageStore;
 pub use math::*;
diff --git a/src/mixed_storage.rs b/src/mixed_storage.rs
index 3591604..594b8c4 100644
--- a/src/mixed_storage.rs
+++ b/src/mixed_storage.rs
@@ -27,7 +27,7 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
-pub trait MixedStorage<T> {
+pub(crate) trait MixedStorage<T> {
     fn to_mixed(self, bit_depth: u32) -> T;
 }
diff --git a/src/mlaf.rs b/src/mlaf.rs
index f72d2bd..1cf3f2b 100644
--- a/src/mlaf.rs
+++ b/src/mlaf.rs
@@ -37,7 +37,7 @@ use std::ops::{Add, Mul};
     all(target_arch = "aarch64", target_feature = "neon")
 ))]
 #[inline(always)]
-pub fn mlaf<T: Copy + Mul<T, Output = T> + Add<T, Output = T> + MulAdd<T, T, Output = T>>(
+pub(crate) fn mlaf<T: Copy + Mul<T, Output = T> + Add<T, Output = T> + MulAdd<T, T, Output = T>>(
     acc: T,
     a: T,
     b: T,
@@ -53,7 +53,7 @@ pub fn mlaf<T: Copy + Mul<T, Output = T> + Add<T, Output = T> + MulAdd<T, T, Ou
-pub fn mlaf<T: Copy + Mul<T, Output = T> + Add<T, Output = T> + MulAdd<T, T, Output = T>>(
+pub(crate) fn mlaf<T: Copy + Mul<T, Output = T> + Add<T, Output = T> + MulAdd<T, T, Output = T>>(
     acc: T,
     a: T,
     b: T,
diff --git a/src/nearest_sampler.rs b/src/nearest_sampler.rs
index b1aa650..945d5bb 100644
--- a/src/nearest_sampler.rs
+++ b/src/nearest_sampler.rs
@@ -31,7 +31,7 @@ use rayon::iter::{IndexedParallelIterator, ParallelIterator};
 use rayon::prelude::ParallelSliceMut;
 use rayon::ThreadPool;
 
-pub fn resize_nearest<T: Copy + Send + Sync, const CHANNELS: usize>(
+pub(crate) fn resize_nearest<T: Copy + Send + Sync, const CHANNELS: usize>(
     src: &[T],
     src_width: usize,
     src_height: usize,
diff --git a/src/neon/ar30.rs b/src/neon/ar30.rs
new file mode 100644
index 0000000..d846e11
--- /dev/null
+++ b/src/neon/ar30.rs
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) Radzivon Bartoshyk. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
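A note on the `AR30_ORDER` parameter used throughout this file: when it is 1, the pixel was stored with reversed byte order, and `vrev128_u32` below swaps the bytes of every 32-bit lane before unpacking (and again after repacking). A scalar model of the same step, for plain `u32` pixels:

```rust
// `vrev128_u32` is a per-lane byte swap, i.e. `u32::swap_bytes`
// applied to each of the four pixels in the vector.
fn to_host_order(px: u32, reversed: bool) -> u32 {
    if reversed { px.swap_bytes() } else { px }
}
```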
+use crate::ar30::Rgb30;
+use std::arch::aarch64::*;
+
+#[inline(always)]
+pub(crate) unsafe fn vrev128_u32(v: uint32x4_t) -> uint32x4_t {
+    vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(v)))
+}
+
+#[inline(always)]
+pub(crate) unsafe fn vunzips_4_ar30<const AR30_TYPE: usize, const AR30_ORDER: usize>(
+    v: uint32x4_t,
+) -> int16x4x4_t {
+    let mask = vdupq_n_u32(0x3ff);
+    let ar_type: Rgb30 = AR30_TYPE.into();
+
+    let v = if AR30_ORDER == 0 { v } else { vrev128_u32(v) };
+
+    match ar_type {
+        Rgb30::Ar30 => {
+            let r = vmovn_u32(vandq_u32(v, mask));
+            let g = vmovn_u32(vandq_u32(vshrq_n_u32::<10>(v), mask));
+            let b = vmovn_u32(vandq_u32(vshrq_n_u32::<20>(v), mask));
+            let va = vmovn_u32(vshrq_n_u32::<30>(v));
+            let a = vorr_u16(
+                vorr_u16(
+                    vorr_u16(
+                        vorr_u16(vshl_n_u16::<8>(va), vshl_n_u16::<6>(va)),
+                        vshl_n_u16::<4>(va),
+                    ),
+                    vshl_n_u16::<2>(va),
+                ),
+                va,
+            );
+            int16x4x4_t(
+                vreinterpret_s16_u16(r),
+                vreinterpret_s16_u16(g),
+                vreinterpret_s16_u16(b),
+                vreinterpret_s16_u16(a),
+            )
+        }
+        Rgb30::Ra30 => {
+            let a_mask = vdupq_n_u32(0x3);
+            let va = vmovn_u32(vandq_u32(v, a_mask));
+
+            let a = vorr_u16(
+                vorr_u16(
+                    vorr_u16(
+                        vorr_u16(vshl_n_u16::<8>(va), vshl_n_u16::<6>(va)),
+                        vshl_n_u16::<4>(va),
+                    ),
+                    vshl_n_u16::<2>(va),
+                ),
+                va,
+            );
+
+            let r = vmovn_u32(vandq_u32(vshrq_n_u32::<22>(v), mask));
+            let g = vmovn_u32(vandq_u32(vshrq_n_u32::<12>(v), mask));
+            let b = vmovn_u32(vandq_u32(vshrq_n_u32::<2>(v), mask));
+            int16x4x4_t(
+                vreinterpret_s16_u16(r),
+                vreinterpret_s16_u16(g),
+                vreinterpret_s16_u16(b),
+                vreinterpret_s16_u16(a),
+            )
+        }
+    }
+}
+
+#[inline(always)]
+pub(crate) unsafe fn vunzip_4_ar30<const AR30_TYPE: usize, const AR30_ORDER: usize>(
+    v: uint32x4x2_t,
+) -> int16x8x4_t {
+    let mask = vdupq_n_u32(0x3ff);
+    let ar_type: Rgb30 = AR30_TYPE.into();
+
+    let v = if AR30_ORDER == 0 {
+        v
+    } else {
+        uint32x4x2_t(vrev128_u32(v.0), vrev128_u32(v.1))
+    };
+
+    match ar_type {
+        Rgb30::Ar30 => {
+            let r = vcombine_u16(
+                vmovn_u32(vandq_u32(v.0, mask)),
+                vmovn_u32(vandq_u32(v.1, mask)),
+            );
+            let g = vcombine_u16(
+                vmovn_u32(vandq_u32(vshrq_n_u32::<10>(v.0), mask)),
+                vmovn_u32(vandq_u32(vshrq_n_u32::<10>(v.1), mask)),
+            );
+            let b = vcombine_u16(
+                vmovn_u32(vandq_u32(vshrq_n_u32::<20>(v.0), mask)),
+                vmovn_u32(vandq_u32(vshrq_n_u32::<20>(v.1), mask)),
+            );
+            let va = vcombine_u16(
+                vmovn_u32(vshrq_n_u32::<30>(v.0)),
+                vmovn_u32(vshrq_n_u32::<30>(v.1)),
+            );
+            let a = vorrq_u16(
+                vorrq_u16(
+                    vorrq_u16(
+                        vorrq_u16(vshlq_n_u16::<8>(va), vshlq_n_u16::<6>(va)),
+                        vshlq_n_u16::<4>(va),
+                    ),
+                    vshlq_n_u16::<2>(va),
+                ),
+                va,
+            );
+            int16x8x4_t(
+                vreinterpretq_s16_u16(r),
+                vreinterpretq_s16_u16(g),
+                vreinterpretq_s16_u16(b),
+                vreinterpretq_s16_u16(a),
+            )
+        }
+        Rgb30::Ra30 => {
+            let a_mask = vdupq_n_u32(0x3);
+            let va = vcombine_u16(
+                vmovn_u32(vandq_u32(v.0, a_mask)),
+                vmovn_u32(vandq_u32(v.1, a_mask)),
+            );
+
+            let a = vorrq_u16(
+                vorrq_u16(
+                    vorrq_u16(
+                        vorrq_u16(vshlq_n_u16::<8>(va), vshlq_n_u16::<6>(va)),
+                        vshlq_n_u16::<4>(va),
+                    ),
+                    vshlq_n_u16::<2>(va),
+                ),
+                va,
+            );
+
+            let r = vcombine_u16(
+                vmovn_u32(vandq_u32(vshrq_n_u32::<22>(v.0), mask)),
+                vmovn_u32(vandq_u32(vshrq_n_u32::<22>(v.1), mask)),
+            );
+            let g = vcombine_u16(
+                vmovn_u32(vandq_u32(vshrq_n_u32::<12>(v.0), mask)),
+                vmovn_u32(vandq_u32(vshrq_n_u32::<12>(v.1), mask)),
+            );
+            let b = vcombine_u16(
+                vmovn_u32(vandq_u32(vshrq_n_u32::<2>(v.0), mask)),
+                vmovn_u32(vandq_u32(vshrq_n_u32::<2>(v.1), mask)),
+            );
+            int16x8x4_t(
+                vreinterpretq_s16_u16(r),
+                vreinterpretq_s16_u16(g),
+                vreinterpretq_s16_u16(b),
+                vreinterpretq_s16_u16(a),
+            )
+        }
+    }
+}
+
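The shift/OR cascades above replicate the 2-bit alpha across 10 bits so alpha can be filtered on the same scale as the color channels; bit replication maps 0, 1, 2, 3 onto 0, 341, 682, 1023. A scalar sketch of the same expansion:

```rust
/// Replicate 2-bit alpha across 10 bits: 0, 1, 2, 3 -> 0, 341, 682, 1023.
fn expand_alpha_2_to_10(a2: u16) -> u16 {
    (a2 << 8) | (a2 << 6) | (a2 << 4) | (a2 << 2) | a2
}

#[test]
fn alpha_expansion_covers_full_range() {
    assert_eq!(expand_alpha_2_to_10(0), 0);
    assert_eq!(expand_alpha_2_to_10(1), 341);
    assert_eq!(expand_alpha_2_to_10(3), 1023);
}
```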
+#[inline(always)]
+pub(crate) unsafe fn vunzip_4_ar30_separate<const AR30_TYPE: usize, const AR30_ORDER: usize>(
+    v: uint32x4x2_t,
+) -> int16x8x4_t {
+    let values = vunzip_4_ar30::<AR30_TYPE, AR30_ORDER>(v);
+    let a0 = vtrnq_s16(values.0, values.1);
+    let a1 = vtrnq_s16(values.2, values.3);
+    let v1 = vtrnq_s32(vreinterpretq_s32_s16(a0.0), vreinterpretq_s32_s16(a1.0));
+    let v2 = vtrnq_s32(vreinterpretq_s32_s16(a0.1), vreinterpretq_s32_s16(a1.1));
+    let k0 = vreinterpretq_s16_s32(v1.0);
+    let k1 = vreinterpretq_s16_s32(v2.0);
+    let k2 = vreinterpretq_s16_s32(v1.1);
+    let k3 = vreinterpretq_s16_s32(v2.1);
+    int16x8x4_t(k0, k1, k2, k3)
+}
+
+#[inline(always)]
+pub(crate) unsafe fn vunzips_4_ar30_separate<const AR30_TYPE: usize, const AR30_ORDER: usize>(
+    v: uint32x4_t,
+) -> int16x8x2_t {
+    let values = vunzips_4_ar30::<AR30_TYPE, AR30_ORDER>(v);
+    let a0 = vtrn_s16(values.0, values.1);
+    let a1 = vtrn_s16(values.2, values.3);
+    let v1 = vtrn_s32(vreinterpret_s32_s16(a0.0), vreinterpret_s32_s16(a1.0));
+    let v2 = vtrn_s32(vreinterpret_s32_s16(a0.1), vreinterpret_s32_s16(a1.1));
+    let k0 = vreinterpret_s16_s32(v1.0);
+    let k1 = vreinterpret_s16_s32(v2.0);
+    let k2 = vreinterpret_s16_s32(v1.1);
+    let k3 = vreinterpret_s16_s32(v2.1);
+    int16x8x2_t(vcombine_s16(k0, k1), vcombine_s16(k2, k3))
+}
+
+#[inline(always)]
+pub(crate) unsafe fn vzip_4_ar30<const AR30_TYPE: usize, const AR30_ORDER: usize>(
+    v: int16x8x4_t,
+) -> uint32x4x2_t {
+    let ar_type: Rgb30 = AR30_TYPE.into();
+    let a_max = vdupq_n_s16(3);
+    match ar_type {
+        Rgb30::Ar30 => {
+            let v3 = vminq_s16(vrshrq_n_s16::<8>(v.3), a_max);
+            let mut a0 = vshlq_n_u32::<30>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v3))));
+            let mut a1 = vshlq_n_u32::<30>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v3))));
+
+            let r0 = vshlq_n_u32::<20>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v.2))));
+            let r1 = vshlq_n_u32::<20>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v.2))));
+
+            a0 = vorrq_u32(a0, r0);
+            a1 = vorrq_u32(a1, r1);
+
+            let g0 = vshlq_n_u32::<10>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v.1))));
+            let g1 = vshlq_n_u32::<10>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v.1))));
+
+            a0 = vorrq_u32(a0, g0);
+            a1 = vorrq_u32(a1, g1);
+
+            a0 = vorrq_u32(a0, vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v.0))));
+            a1 = vorrq_u32(a1, vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v.0))));
+
+            if AR30_ORDER == 0 {
+                uint32x4x2_t(a0, a1)
+            } else {
+                uint32x4x2_t(vrev128_u32(a0), vrev128_u32(a1))
+            }
+        }
+        Rgb30::Ra30 => {
+            let v3 = vminq_s16(vrshrq_n_s16::<8>(v.3), a_max);
+            let mut a0 = vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v3)));
+            let mut a1 = vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v3)));
+
+            let r0 = vshlq_n_u32::<22>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v.0))));
+            let r1 = vshlq_n_u32::<22>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v.0))));
+
+            a0 = vorrq_u32(a0, r0);
+            a1 = vorrq_u32(a1, r1);
+
+            let g0 = vshlq_n_u32::<12>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v.1))));
+            let g1 = vshlq_n_u32::<12>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v.1))));
+
+            a0 = vorrq_u32(a0, g0);
+            a1 = vorrq_u32(a1, g1);
+
+            a0 = vorrq_u32(
+                a0,
+                vshlq_n_u32::<2>(vmovl_u16(vreinterpret_u16_s16(vget_low_s16(v.2)))),
+            );
+            a1 = vorrq_u32(
+                a1,
+                vshlq_n_u32::<2>(vmovl_u16(vreinterpret_u16_s16(vget_high_s16(v.2)))),
+            );
+
+            if AR30_ORDER == 0 {
+                uint32x4x2_t(a0, a1)
+            } else {
+                uint32x4x2_t(vrev128_u32(a0), vrev128_u32(a1))
+            }
+        }
+    }
+}
+
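`vzip_4_ar30` above is the inverse of the unpack: it collapses the widened alpha back to 2 bits with a rounding shift (`vrshrq_n_s16::<8>`) and a clamp to 3 before repacking. A scalar model of that narrowing for the `Ar30` layout, illustrative only:

```rust
/// Round the ~10-bit alpha down to 2 bits, clamp to 3, then repack.
fn pack_ar30_from_wide(r: u16, g: u16, b: u16, a10: u16) -> u32 {
    let a2 = (((a10 as u32) + 128) >> 8).min(3); // vrshrq_n_s16::<8> + clamp
    (a2 << 30)
        | ((b as u32 & 0x3ff) << 20)
        | ((g as u32 & 0x3ff) << 10)
        | (r as u32 & 0x3ff)
}
```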
+#[inline(always)]
+pub(crate) unsafe fn vld1_ar30_s16<const AR30_TYPE: usize, const AR30_ORDER: usize>(
+    arr: &[u32],
+) -> int16x4_t {
+    let item = *arr.get_unchecked(0);
+    let ar_type: Rgb30 = AR30_TYPE.into();
+    let vl = ar_type.unpack::<AR30_ORDER>(item);
+    let a_rep = (vl.3 as i16) << 8;
+    let temp = [vl.0 as i16, vl.1 as i16, vl.2 as i16, a_rep];
+    vld1_s16(temp.as_ptr())
+}
+
+#[inline(always)]
+pub(crate) unsafe fn vextract_ar30<const AR30_TYPE: usize, const AR30_ORDER: usize>(
+    v: uint16x4_t,
+) -> u32 {
+    let v0 = vreinterpret_u64_u16(v);
+    let a_mask = vdup_n_u64(0x3);
+    let v_mask = vdup_n_u64(0x3ff);
+    let mut a = vand_u64(vshr_n_u64::<48>(v0), a_mask);
+    let r = vand_u64(v0, v_mask);
+    let g = vand_u64(vshr_n_u64::<16>(v0), v_mask);
+    let b = vand_u64(vshr_n_u64::<32>(v0), v_mask);
+
+    let ar_type: Rgb30 = AR30_TYPE.into();
+
+    match ar_type {
+        Rgb30::Ar30 => {
+            a = vshl_n_u64::<30>(a);
+            a = vorr_u64(a, vshl_n_u64::<20>(b));
+            a = vorr_u64(a, vshl_n_u64::<10>(g));
+            a = vorr_u64(a, r);
+        }
+        Rgb30::Ra30 => {
+            a = vorr_u64(a, vshl_n_u64::<2>(b));
+            a = vorr_u64(a, vshl_n_u64::<12>(g));
+            a = vorr_u64(a, vshl_n_u64::<22>(r));
+        }
+    }
+
+    if AR30_ORDER == 1 {
+        a = vreinterpret_u64_u8(vrev32_u8(vreinterpret_u8_u64(a)));
+    }
+    let pairs = vreinterpret_u32_u64(a);
+    vget_lane_u32::<0>(pairs)
+}
diff --git a/src/neon/horizontal_ar30.rs b/src/neon/horizontal_ar30.rs
new file mode 100644
index 0000000..ea489ba
--- /dev/null
+++ b/src/neon/horizontal_ar30.rs
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) Radzivon Bartoshyk. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
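The horizontal kernels in this file weight pixels with `vqrdmulh`/`vqrdmlah`, the saturating rounding doubling multiply that returns the high half of the product. A scalar model of the semantics assumed here (the real `vqrdmlah` fuses the accumulate with a single saturation; it is split into two steps below for clarity):

```rust
/// Scalar model of vqrdmulh: (2*a*b + 2^15) >> 16, saturated to i16.
fn qrdmulh(a: i16, b: i16) -> i16 {
    let p = (2 * (a as i64) * (b as i64) + (1 << 15)) >> 16;
    p.clamp(i16::MIN as i64, i16::MAX as i64) as i16
}

/// Approximate model of vqrdmlah: accumulate the rounded product.
fn qrdmlah(acc: i16, a: i16, b: i16) -> i16 {
    acc.saturating_add(qrdmulh(a, b))
}
```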
diff --git a/src/neon/horizontal_ar30.rs b/src/neon/horizontal_ar30.rs
new file mode 100644
index 0000000..ea489ba
--- /dev/null
+++ b/src/neon/horizontal_ar30.rs
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) Radzivon Bartoshyk. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+use crate::filter_weights::FilterWeights;
+use crate::neon::ar30::{
+    vextract_ar30, vld1_ar30_s16, vunzip_4_ar30_separate, vunzips_4_ar30_separate,
+};
+use std::arch::aarch64::*;
+
+#[inline]
+unsafe fn conv_horiz_rgba_1_u8_i16<
+    const SCALE: i32,
+    const AR_TYPE: usize,
+    const AR_ORDER: usize,
+>(
+    start_x: usize,
+    src: &[u32],
+    w0: int16x4_t,
+    store: int16x4_t,
+) -> int16x4_t {
+    let src_ptr = src.get_unchecked(start_x..);
+    let ld = vld1_ar30_s16::<AR_TYPE, AR_ORDER>(src_ptr);
+    let rgba_pixel = vshl_n_s16::<SCALE>(ld);
+    vqrdmlah_s16(store, rgba_pixel, w0)
+}
+
+#[inline(always)]
+unsafe fn conv_horiz_rgba_8_u8_i16<
+    const SCALE: i32,
+    const AR_TYPE: usize,
+    const AR_ORDER: usize,
+>(
+    start_x: usize,
+    src: &[u32],
+    set1: (int16x4_t, int16x4_t, int16x4_t, int16x4_t),
+    set2: (int16x4_t, int16x4_t, int16x4_t, int16x4_t),
+    store: int16x4_t,
+) -> int16x4_t {
+    let src_ptr = src.get_unchecked(start_x..);
+
+    let rgba_pixel = vunzip_4_ar30_separate::<AR_TYPE, AR_ORDER>(vld1q_u32_x2(src_ptr.as_ptr()));
+
+    let hi0 = vshlq_n_s16::<SCALE>(rgba_pixel.1);
+    let lo0 = vshlq_n_s16::<SCALE>(rgba_pixel.0);
+    let hi1 = vshlq_n_s16::<SCALE>(rgba_pixel.3);
+    let lo1 = vshlq_n_s16::<SCALE>(rgba_pixel.2);
+
+    let hi_v = vqrdmulhq_s16(hi0, vcombine_s16(set1.2, set1.3));
+    let mut product = vqrdmlahq_s16(hi_v, lo0, vcombine_s16(set1.0, set1.1));
+    product = vqrdmlahq_s16(product, hi1, vcombine_s16(set2.2, set2.3));
+    product = vqrdmlahq_s16(product, lo1, vcombine_s16(set2.0, set2.1));
+
+    vadd_s16(
+        vadd_s16(store, vget_low_s16(product)),
+        vget_high_s16(product),
+    )
+}
+
+#[inline]
+unsafe fn conv_horiz_rgba_4_u8_i16<
+    const SCALE: i32,
+    const AR_TYPE: usize,
+    const AR_ORDER: usize,
+>(
+    start_x: usize,
+    src: &[u32],
+    w0: int16x4_t,
+    w1: int16x4_t,
+    w2: int16x4_t,
+    w3: int16x4_t,
+    store: int16x4_t,
+) -> int16x4_t {
+    let src_ptr = src.get_unchecked(start_x..);
+
+    let rgba_pixel = vunzips_4_ar30_separate::<AR_TYPE, AR_ORDER>(vld1q_u32(src_ptr.as_ptr()));
+
+    let hi = vshlq_n_s16::<SCALE>(rgba_pixel.1);
+    let lo = vshlq_n_s16::<SCALE>(rgba_pixel.0);
+
+    let hi_v = vqrdmulhq_s16(hi, vcombine_s16(w2, w3));
+    let product = vqrdmlahq_s16(hi_v, lo, vcombine_s16(w0, w1));
+
+    vadd_s16(
+        vadd_s16(store, vget_low_s16(product)),
+        vget_high_s16(product),
+    )
+}
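
These helpers lean on vqrdmulh/vqrdmlah (the "rdm" target feature): a saturating, rounding, doubling multiply that returns the high half of the product. That is also why pixels are pre-shifted left by SCALE first; without the shift, small 10-bit values would lose most of their significant bits in the high-half product. A per-lane scalar model, ignoring the exact intermediate widths the hardware uses:

// Per-lane model of vqrdmulh_s16 / vqrdmlah_s16 in Q15 arithmetic (sketch).
fn qrdmulh(a: i16, b: i16) -> i16 {
    let prod = 2 * (a as i32) * (b as i32) + (1 << 15); // double, then round
    (prod >> 16).clamp(i16::MIN as i32, i16::MAX as i32) as i16
}

fn qrdmlah(acc: i16, a: i16, b: i16) -> i16 {
    (acc as i32 + qrdmulh(a, b) as i32).clamp(i16::MIN as i32, i16::MAX as i32) as i16
}
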
+
+pub(crate) fn neon_convolve_horizontal_rgba_rows_4_ar30<
+    const AR_TYPE: usize,
+    const AR_ORDER: usize,
+>(
+    src: &[u32],
+    src_stride: usize,
+    dst: &mut [u32],
+    dst_stride: usize,
+    filter_weights: &FilterWeights<i16>,
+) {
+    unsafe {
+        neon_convolve_horizontal_rgba_rows_4_impl::<AR_TYPE, AR_ORDER>(
+            src,
+            src_stride,
+            dst,
+            dst_stride,
+            filter_weights,
+        );
+    }
+}
+
+#[target_feature(enable = "rdm")]
+unsafe fn neon_convolve_horizontal_rgba_rows_4_impl<const AR_TYPE: usize, const AR_ORDER: usize>(
+    src: &[u32],
+    src_stride: usize,
+    dst: &mut [u32],
+    dst_stride: usize,
+    filter_weights: &FilterWeights<i16>,
+) {
+    unsafe {
+        const SCALE: i32 = 4;
+        const ROUNDING: i16 = 1 << (SCALE - 1);
+        let zeros = vdup_n_s16(0i16);
+        const ALPHA_ROUNDING: i16 = 1 << (SCALE as i16 + 7);
+        let init = vld1_s16([ROUNDING, ROUNDING, ROUNDING, ALPHA_ROUNDING].as_ptr());
+
+        let v_cut_off = vld1_s16([1023, 1023, 1023, 3].as_ptr());
+
+        let (row0_ref, rest) = dst.split_at_mut(dst_stride);
+        let (row1_ref, rest) = rest.split_at_mut(dst_stride);
+        let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride);
+
+        let iter_row0 = row0_ref.iter_mut();
+        let iter_row1 = row1_ref.iter_mut();
+        let iter_row2 = row2_ref.iter_mut();
+        let iter_row3 = row3_ref.iter_mut();
+
+        let v_shl_back = vld1_s16(
+            [
+                -SCALE as i16,
+                -SCALE as i16,
+                -SCALE as i16,
+                -(SCALE as i16 + 8),
+            ]
+            .as_ptr(),
+        );
+
+        for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0
+            .zip(iter_row1)
+            .zip(iter_row2)
+            .zip(iter_row3)
+            .zip(filter_weights.bounds.iter())
+            .zip(
+                filter_weights
+                    .weights
+                    .chunks_exact(filter_weights.aligned_size),
+            )
+        {
+            let mut jx = 0usize;
+
+            let bounds_size = bounds.size;
+
+            let mut store_0 = init;
+            let mut store_1 = init;
+            let mut store_2 = init;
+            let mut store_3 = init;
+
+            let src0 = src;
+            let src1 = src0.get_unchecked(src_stride..);
+            let src2 = src1.get_unchecked(src_stride..);
+            let src3 = src2.get_unchecked(src_stride..);
+
+            while jx + 8 < bounds_size {
+                let bounds_start = bounds.start + jx;
+                let w_ptr = weights.get_unchecked(jx..(jx + 8));
+                let weights_set = vld1q_s16(w_ptr.as_ptr());
+                let w0 = vdup_laneq_s16::<0>(weights_set);
+                let w1 = vdup_laneq_s16::<1>(weights_set);
+                let w2 = vdup_laneq_s16::<2>(weights_set);
+                let w3 = vdup_laneq_s16::<3>(weights_set);
+                let w4 = vdup_laneq_s16::<4>(weights_set);
+                let w5 = vdup_laneq_s16::<5>(weights_set);
+                let w6 = vdup_laneq_s16::<6>(weights_set);
+                let w7 = vdup_laneq_s16::<7>(weights_set);
+                let set1 = (w0, w1, w2, w3);
+                let set2 = (w4, w5, w6, w7);
+                store_0 = conv_horiz_rgba_8_u8_i16::<SCALE, AR_TYPE, AR_ORDER>(
+                    bounds_start,
+                    src0,
+                    set1,
+                    set2,
+                    store_0,
+                );
+                store_1 = conv_horiz_rgba_8_u8_i16::<SCALE, AR_TYPE, AR_ORDER>(
+                    bounds_start,
+                    src1,
+                    set1,
+                    set2,
+                    store_1,
+                );
+                store_2 = conv_horiz_rgba_8_u8_i16::<SCALE, AR_TYPE, AR_ORDER>(
+                    bounds_start,
+                    src2,
+                    set1,
+                    set2,
+                    store_2,
+                );
+                store_3 = conv_horiz_rgba_8_u8_i16::<SCALE, AR_TYPE, AR_ORDER>(
+                    bounds_start,
+                    src3,
+                    set1,
+                    set2,
+                    store_3,
+                );
+                jx += 8;
+            }
+
+            while jx + 4 < bounds_size {
+                let bounds_start = bounds.start + jx;
+                let w_ptr = weights.get_unchecked(jx..(jx + 4));
+                let weights = vld1_s16(w_ptr.as_ptr());
+                let w0 = vdup_lane_s16::<0>(weights);
+                let w1 = vdup_lane_s16::<1>(weights);
+                let w2 = vdup_lane_s16::<2>(weights);
+                let w3 = vdup_lane_s16::<3>(weights);
+                store_0 = conv_horiz_rgba_4_u8_i16::<SCALE, AR_TYPE, AR_ORDER>(
+                    bounds_start,
+                    src0,
+                    w0,
+                    w1,
+                    w2,
+                    w3,
+                    store_0,
+                );
+                store_1 = conv_horiz_rgba_4_u8_i16::<SCALE, AR_TYPE, AR_ORDER>(
+                    bounds_start,
+                    src1,
+                    w0,
+                    w1,
+                    w2,
+                    w3,
+                    store_1,
+                );
+                store_2 = conv_horiz_rgba_4_u8_i16::<SCALE, AR_TYPE, AR_ORDER>(
+                    bounds_start,
+                    src2,
+                    w0,
+                    w1,
+                    w2,
+                    w3,
+                    store_2,
+                );
+                store_3 = conv_horiz_rgba_4_u8_i16::<SCALE, AR_TYPE, AR_ORDER>(
+                    bounds_start,
+                    src3,
+                    w0,
+                    w1,
+                    w2,
+                    w3,
+                    store_3,
+                );
+                jx += 4;
+            }
+
+            while jx < bounds_size {
+                let w_ptr = weights.get_unchecked(jx..(jx + 1));
+                let bounds_start = bounds.start + jx;
+                let weight0 = vld1_dup_s16(w_ptr.as_ptr());
+                store_0 = conv_horiz_rgba_1_u8_i16::<SCALE, AR_TYPE, AR_ORDER>(
+                    bounds_start,
+                    src0,
+                    weight0,
+                    store_0,
+                );
+                store_1 = conv_horiz_rgba_1_u8_i16::<SCALE, AR_TYPE, AR_ORDER>(
+                    bounds_start,
+                    src1,
+                    weight0,
+                    store_1,
+                );
+                store_2 = conv_horiz_rgba_1_u8_i16::<SCALE, AR_TYPE, AR_ORDER>(
+                    bounds_start,
+                    src2,
+                    weight0,
+                    store_2,
+                );
+                store_3 = conv_horiz_rgba_1_u8_i16::<SCALE, AR_TYPE, AR_ORDER>(
+                    bounds_start,
+                    src3,
+                    weight0,
+                    store_3,
+                );
+                jx += 1;
+            }
+
+            let store_16_0 = vreinterpret_u16_s16(vmin_s16(
+                vshl_s16(vmax_s16(store_0, zeros), v_shl_back),
+                v_cut_off,
+            ));
+            let store_16_1 = vreinterpret_u16_s16(vmin_s16(
+                vshl_s16(vmax_s16(store_1, zeros), v_shl_back),
+                v_cut_off,
+            ));
+            let store_16_2 = vreinterpret_u16_s16(vmin_s16(
+                vshl_s16(vmax_s16(store_2, zeros), v_shl_back),
+                v_cut_off,
+            ));
+            let store_16_3 = vreinterpret_u16_s16(vmin_s16(
+                vshl_s16(vmax_s16(store_3, zeros), v_shl_back),
+                v_cut_off,
+            ));
+
+            let packed0 = vextract_ar30::<AR_TYPE, AR_ORDER>(store_16_0);
+            *chunk0 = packed0;
+            let packed1 = vextract_ar30::<AR_TYPE, AR_ORDER>(store_16_1);
+            *chunk1 = packed1;
+            let packed2 = vextract_ar30::<AR_TYPE, AR_ORDER>(store_16_2);
+            *chunk2 = packed2;
+            let packed3 = vextract_ar30::<AR_TYPE, AR_ORDER>(store_16_3);
+            *chunk3 = packed3;
+        }
+    }
+}
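
The wrapper/impl split keeps #[target_feature(enable = "rdm")] on a private function while exposing a safe entry point. A minimal sketch of the same pattern (names are hypothetical; the crate's dispatcher is assumed to select this path only when RDM is present, e.g. via std::arch::is_aarch64_feature_detected!("rdm")):

// Minimal sketch of the safe-wrapper pattern used by the AR30 paths above.
fn convolve_row(data: &mut [u32]) {
    // Sound only if the caller has verified that "rdm" is available.
    unsafe { convolve_row_impl(data) }
}

#[target_feature(enable = "rdm")]
unsafe fn convolve_row_impl(data: &mut [u32]) {
    // The body may freely use the vqrdmlah*_s16 intrinsics here.
    let _ = data;
}
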
diff --git a/src/neon/mod.rs b/src/neon/mod.rs
index a7ca013..f937e95 100644
--- a/src/neon/mod.rs
+++ b/src/neon/mod.rs
@@ -33,10 +33,12 @@ mod alpha_f16_full;
 mod alpha_f32;
 mod alpha_u16;
 mod alpha_u8;
+mod ar30;
 #[cfg(feature = "half")]
 mod convolve_f16;
 #[cfg(feature = "half")]
 mod f16_utils;
+mod horizontal_ar30;
 mod plane_f32;
 mod plane_u8;
 #[cfg(feature = "half")]
@@ -53,6 +55,7 @@ mod rgba_f32;
 mod rgba_u16_lb;
 mod rgba_u8;
 mod utils;
+mod vertical_ar30;
 #[cfg(feature = "half")]
 mod vertical_f16;
 #[cfg(feature = "half")]
@@ -75,6 +78,7 @@ pub use alpha_u8::neon_premultiply_alpha_rgba;
 pub use alpha_u8::neon_unpremultiply_alpha_rgba;
 #[cfg(feature = "half")]
 pub use f16_utils::*;
+pub(crate) use horizontal_ar30::neon_convolve_horizontal_rgba_rows_4_ar30;
 pub use plane_f32::convolve_horizontal_plane_neon_row_one;
 pub use plane_f32::convolve_horizontal_plane_neon_rows_4;
 pub use plane_u8::{convolve_horizontal_plane_neon_row, convolve_horizontal_plane_neon_rows_4_u8};
@@ -98,9 +102,10 @@ pub use rgba_f16_full::{
 };
 pub use rgba_f32::*;
 pub use rgba_u16_lb::{
-    convolve_horizontal_rgba_neon_rows_4_lb_u8, convolve_horizontal_rgba_neon_u16_lb_row,
+    convolve_horizontal_rgba_neon_rows_4_lb_u16, convolve_horizontal_rgba_neon_u16_lb_row,
 };
 pub use rgba_u8::*;
+pub(crate) use vertical_ar30::neon_column_handler_fixed_point_ar30;
 #[cfg(feature = "half")]
 pub use vertical_f16::convolve_vertical_rgb_neon_row_f16;
 #[cfg(feature = "half")]
diff --git a/src/neon/plane_f32.rs b/src/neon/plane_f32.rs
index 576a8e0..e13e5b3 100644
--- a/src/neon/plane_f32.rs
+++ b/src/neon/plane_f32.rs
@@ -28,14 +28,14 @@
  */
 use crate::filter_weights::FilterWeights;
-use crate::neon::utils::prefer_vfmaq_f32;
+use crate::neon::utils::{prefer_vfmaq_f32, xvld1q_f32_x4};
 use std::arch::aarch64::*;
 
 macro_rules! conv_horiz_plane_16_f32 {
     ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{
         let src_ptr = $src.add($start_x);
 
-        let rgb_pixel = vld1q_f32_x4(src_ptr);
+        let rgb_pixel = xvld1q_f32_x4(src_ptr);
 
         let mut acc = prefer_vfmaq_f32($store, rgb_pixel.0, $set.0);
         acc = prefer_vfmaq_f32(acc, rgb_pixel.1, $set.1);
@@ -87,13 +87,6 @@ macro_rules! conv_horiz_plane_1_f32 {
     }};
 }
 
-macro_rules!
vfullq_sum_f32 { - ($reg: expr) => {{ - let acc = vadd_f32(vget_low_f32($reg), vget_high_f32($reg)); - vpadds_f32(acc) - }}; -} - pub fn convolve_horizontal_plane_neon_row_one( dst_width: usize, _: usize, @@ -113,7 +106,7 @@ pub fn convolve_horizontal_plane_neon_row_one( while jx + 16 < bounds.size { let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = vld1q_f32_x4(ptr); + let read_weights = xvld1q_f32_x4(ptr); store = conv_horiz_plane_16_f32!( bounds_start, unsafe_source_ptr_0, @@ -165,7 +158,7 @@ pub fn convolve_horizontal_plane_neon_row_one( let px = x; let dest_ptr = unsafe_destination_ptr_0.add(px); - dest_ptr.write_unaligned(vfullq_sum_f32!(store)); + dest_ptr.write_unaligned(vaddvq_f32(store)); filter_offset += filter_weights.aligned_size; } @@ -196,7 +189,7 @@ pub fn convolve_horizontal_plane_neon_rows_4( while jx + 16 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = vld1q_f32_x4(ptr); + let read_weights = xvld1q_f32_x4(ptr); let bounds_start = bounds.start + jx; store_0 = conv_horiz_plane_16_f32!( bounds_start, @@ -303,16 +296,16 @@ pub fn convolve_horizontal_plane_neon_rows_4( let px = x; let dest_ptr = unsafe_destination_ptr_0.add(px); - dest_ptr.write_unaligned(vfullq_sum_f32!(store_0)); + dest_ptr.write_unaligned(vaddvq_f32(store_0)); let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); - dest_ptr.write_unaligned(vfullq_sum_f32!(store_1)); + dest_ptr.write_unaligned(vaddvq_f32(store_1)); let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); - dest_ptr.write_unaligned(vfullq_sum_f32!(store_2)); + dest_ptr.write_unaligned(vaddvq_f32(store_2)); let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); - dest_ptr.write_unaligned(vfullq_sum_f32!(store_3)); + dest_ptr.write_unaligned(vaddvq_f32(store_3)); filter_offset += filter_weights.aligned_size; } diff --git a/src/neon/plane_u8.rs b/src/neon/plane_u8.rs index 427494b..724b4b0 100644 --- a/src/neon/plane_u8.rs +++ b/src/neon/plane_u8.rs @@ -30,13 +30,6 @@ use crate::filter_weights::FilterWeights; use crate::support::{PRECISION, ROUNDING_CONST}; use std::arch::aarch64::*; -macro_rules! vfullq_sum_s32 { - ($reg: expr) => {{ - let acc = vadd_s32(vget_low_s32($reg), vget_high_s32($reg)); - vget_lane_s32::<0>(vpadd_s32(acc, acc)) - }}; -} - macro_rules! 
accumulate_16_horiz { ($store: expr, $ptr: expr, $weights: expr) => {{ let pixel_colors = vld1q_u8($ptr); @@ -190,7 +183,7 @@ pub fn convolve_horizontal_plane_neon_rows_4_u8( } while jx < bounds.size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let w_ptr = weights.get_unchecked(jx..(jx + 1)); let weight = vld1_lane_s16::<0>(w_ptr.as_ptr(), vdup_n_s16(0)); let bounds_start = bounds.start + jx; @@ -209,22 +202,22 @@ pub fn convolve_horizontal_plane_neon_rows_4_u8( jx += 1; } - let sums = vfullq_sum_s32!(store0).max(0); + let sums = vaddvq_s32(store0).max(0); let shifted = sums >> PRECISION; let value = shifted.min(255) as u8; *chunk0 = value; - let sums = vfullq_sum_s32!(store1).max(0); + let sums = vaddvq_s32(store1).max(0); let shifted = sums >> PRECISION; let value = shifted.min(255) as u8; *chunk1 = value; - let sums = vfullq_sum_s32!(store2).max(0); + let sums = vaddvq_s32(store2).max(0); let shifted = sums >> PRECISION; let value = shifted.min(255) as u8; *chunk2 = value; - let sums = vfullq_sum_s32!(store3).max(0); + let sums = vaddvq_s32(store3).max(0); let shifted = sums >> PRECISION; let value = shifted.min(255) as u8; *chunk3 = value; @@ -291,7 +284,7 @@ pub fn convolve_horizontal_plane_neon_row( jx += 1; } - let sums = vfullq_sum_s32!(store).max(0); + let sums = vaddvq_s32(store).max(0); let shifted = sums >> PRECISION; let value = shifted.min(255) as u8; *dst = value; diff --git a/src/neon/rgb_f32.rs b/src/neon/rgb_f32.rs index 6df1b0c..6934bbb 100644 --- a/src/neon/rgb_f32.rs +++ b/src/neon/rgb_f32.rs @@ -30,6 +30,7 @@ use std::arch::aarch64::*; use crate::filter_weights::FilterWeights; +use crate::neon::utils::xvld1q_f32_x4; use crate::neon::utils::{prefer_vfmaq_f32, vsplit_rgb_5}; macro_rules! write_rgb_f32 { @@ -46,7 +47,7 @@ macro_rules! 
conv_horiz_5_rgb_f32 { const COMPONENTS: usize = 3; let src_ptr = $src.add($start_x * COMPONENTS); - let full_pixel = vld1q_f32_x4(src_ptr); + let full_pixel = xvld1q_f32_x4(src_ptr); let splat = vsplit_rgb_5(full_pixel); let mut acc = prefer_vfmaq_f32($store, splat.0, $set.0); diff --git a/src/neon/rgb_u8.rs b/src/neon/rgb_u8.rs index 7bd7fcd..8192ec2 100644 --- a/src/neon/rgb_u8.rs +++ b/src/neon/rgb_u8.rs @@ -32,14 +32,11 @@ use crate::neon::utils::load_3b_as_u16x4; use crate::support::{PRECISION, ROUNDING_CONST}; use std::arch::aarch64::*; -#[inline] +#[inline(always)] unsafe fn conv_horiz_rgba_4_u8( start_x: usize, src: &[u8], - w0: int16x4_t, - w1: int16x8_t, - w2: int16x4_t, - w3: int16x8_t, + weights: int16x4_t, store: int32x4_t, shuffle: uint8x16_t, ) -> int32x4_t { @@ -57,18 +54,17 @@ unsafe fn conv_horiz_rgba_4_u8( let hi = vreinterpretq_s16_u16(vmovl_high_u8(rgb_pixel)); let lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgb_pixel))); - let acc = vmlal_high_s16(store, hi, w3); - let acc = vmlal_s16(acc, vget_low_s16(hi), w2); - let acc = vmlal_high_s16(acc, lo, w1); - vmlal_s16(acc, vget_low_s16(lo), w0) + let acc = vmlal_high_lane_s16::<3>(store, hi, weights); + let acc = vmlal_lane_s16::<2>(acc, vget_low_s16(hi), weights); + let acc = vmlal_high_lane_s16::<1>(acc, lo, weights); + vmlal_lane_s16::<0>(acc, vget_low_s16(lo), weights) } -#[inline] +#[inline(always)] unsafe fn conv_horiz_rgba_2_u8( start_x: usize, src: &[u8], - w0: int16x4_t, - w1: int16x8_t, + weights: int16x4_t, store: int32x4_t, shuffle: uint8x8_t, ) -> int32x4_t { @@ -84,11 +80,11 @@ unsafe fn conv_horiz_rgba_2_u8( let wide = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(rgb_pixel))); - let acc = vmlal_high_s16(store, wide, w1); - vmlal_s16(acc, vget_low_s16(wide), w0) + let acc = vmlal_high_lane_s16::<1>(store, wide, weights); + vmlal_lane_s16::<0>(acc, vget_low_s16(wide), weights) } -#[inline] +#[inline(always)] unsafe fn conv_horiz_rgba_1_u8( start_x: usize, src: &[u8], @@ -102,10 +98,9 @@ unsafe fn conv_horiz_rgba_1_u8( vmlal_s16(store, lo, w0) } -#[inline] +#[inline(always)] unsafe fn write_accumulator_u8(store: int32x4_t, dst: &mut [u8]) { - let zeros = vdupq_n_s32(0i32); - let store_16 = vqshrun_n_s32::(vmaxq_s32(store, zeros)); + let store_16 = vqshrun_n_s32::(store); let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); let bytes = pixel.to_le_bytes(); @@ -167,30 +162,22 @@ pub fn convolve_horizontal_rgb_neon_rows_4( let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 4)); let weights = vld1_s16(w_ptr.as_ptr()); - let w0 = vdup_lane_s16::<0>(weights); - let w1 = vdupq_lane_s16::<1>(weights); - let w2 = vdup_lane_s16::<2>(weights); - let w3 = vdupq_lane_s16::<3>(weights); - store_0 = - conv_horiz_rgba_4_u8(bounds_start, src0, w0, w1, w2, w3, store_0, shuffle); - store_1 = - conv_horiz_rgba_4_u8(bounds_start, src1, w0, w1, w2, w3, store_1, shuffle); - store_2 = - conv_horiz_rgba_4_u8(bounds_start, src2, w0, w1, w2, w3, store_2, shuffle); - store_3 = - conv_horiz_rgba_4_u8(bounds_start, src3, w0, w1, w2, w3, store_3, shuffle); + store_0 = conv_horiz_rgba_4_u8(bounds_start, src0, weights, store_0, shuffle); + store_1 = conv_horiz_rgba_4_u8(bounds_start, src1, weights, store_1, shuffle); + store_2 = conv_horiz_rgba_4_u8(bounds_start, src2, weights, store_2, shuffle); + store_3 = conv_horiz_rgba_4_u8(bounds_start, src3, weights, store_3, shuffle); jx += 4; } while jx + 2 < bounds.size { let w_ptr = 
weights.get_unchecked(jx..(jx + 2)); let bnds = bounds.start + jx; - let w0 = vld1_dup_s16(w_ptr.as_ptr()); - let w1 = vld1q_dup_s16(w_ptr.get_unchecked(1..).as_ptr()); - store_0 = conv_horiz_rgba_2_u8(bnds, src0, w0, w1, store_0, shuffle_1); - store_1 = conv_horiz_rgba_2_u8(bnds, src1, w0, w1, store_1, shuffle_1); - store_2 = conv_horiz_rgba_2_u8(bnds, src2, w0, w1, store_2, shuffle_1); - store_3 = conv_horiz_rgba_2_u8(bnds, src3, w0, w1, store_3, shuffle_1); + let mut v_weight = vld1_dup_s16(w_ptr.as_ptr()); + v_weight = vld1_lane_s16::<1>(w_ptr.as_ptr().add(1), v_weight); + store_0 = conv_horiz_rgba_2_u8(bnds, src0, v_weight, store_0, shuffle_1); + store_1 = conv_horiz_rgba_2_u8(bnds, src1, v_weight, store_1, shuffle_1); + store_2 = conv_horiz_rgba_2_u8(bnds, src2, v_weight, store_2, shuffle_1); + store_3 = conv_horiz_rgba_2_u8(bnds, src3, v_weight, store_3, shuffle_1); jx += 2; } @@ -245,20 +232,16 @@ pub fn convolve_horizontal_rgb_neon_row_one( let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 4)); let weights = vld1_s16(w_ptr.as_ptr()); - let w0 = vdup_lane_s16::<0>(weights); - let w1 = vdupq_lane_s16::<1>(weights); - let w2 = vdup_lane_s16::<2>(weights); - let w3 = vdupq_lane_s16::<3>(weights); - store = conv_horiz_rgba_4_u8(bounds_start, src, w0, w1, w2, w3, store, shuffle); + store = conv_horiz_rgba_4_u8(bounds_start, src, weights, store, shuffle); jx += 4; } while jx + 2 < bounds_size { let w_ptr = weights.get_unchecked(jx..(jx + 2)); let bounds_start = bounds.start + jx; - let weight0 = vld1_dup_s16(w_ptr.as_ptr()); - let weight1 = vld1q_dup_s16(w_ptr.get_unchecked(1..).as_ptr()); - store = conv_horiz_rgba_2_u8(bounds_start, src, weight0, weight1, store, shuffle_1); + let mut v_weight = vld1_dup_s16(w_ptr.as_ptr()); + v_weight = vld1_lane_s16::<1>(w_ptr.as_ptr().add(1), v_weight); + store = conv_horiz_rgba_2_u8(bounds_start, src, v_weight, store, shuffle_1); jx += 2; } diff --git a/src/neon/rgba_f32.rs b/src/neon/rgba_f32.rs index d087703..0f60df9 100644 --- a/src/neon/rgba_f32.rs +++ b/src/neon/rgba_f32.rs @@ -29,6 +29,7 @@ use crate::filter_weights::FilterWeights; use crate::neon::utils::prefer_vfmaq_f32; +use crate::neon::utils::xvld1q_f32_x4; use std::arch::aarch64::*; macro_rules! conv_horiz_rgba_8_f32 { @@ -36,8 +37,8 @@ macro_rules! conv_horiz_rgba_8_f32 { const COMPONENTS: usize = 4; let src_ptr = $src.add($start_x * COMPONENTS); - let rgb_pixel0 = vld1q_f32_x4(src_ptr); - let rgb_pixel1 = vld1q_f32_x4(src_ptr.add(16)); + let rgb_pixel0 = xvld1q_f32_x4(src_ptr); + let rgb_pixel1 = xvld1q_f32_x4(src_ptr.add(16)); let mut acc = prefer_vfmaq_f32($store, rgb_pixel0.0, $set1.0); acc = prefer_vfmaq_f32(acc, rgb_pixel0.1, $set1.1); @@ -56,7 +57,7 @@ macro_rules! conv_horiz_rgba_4_f32 { const COMPONENTS: usize = 4; let src_ptr = $src.add($start_x * COMPONENTS); - let rgb_pixel = vld1q_f32_x4(src_ptr); + let rgb_pixel = xvld1q_f32_x4(src_ptr); let acc = prefer_vfmaq_f32($store, rgb_pixel.0, $set1.0); let acc = prefer_vfmaq_f32(acc, rgb_pixel.1, $set1.1); diff --git a/src/neon/rgba_u16_lb.rs b/src/neon/rgba_u16_lb.rs index ba8285d..36f2d91 100644 --- a/src/neon/rgba_u16_lb.rs +++ b/src/neon/rgba_u16_lb.rs @@ -27,10 +27,11 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ use crate::filter_weights::FilterWeights; +use crate::neon::utils::{xvld1q_u16_x2, xvld1q_u16_x4}; use crate::support::{PRECISION, ROUNDING_CONST}; use std::arch::aarch64::*; -#[inline] +#[inline(always)] unsafe fn conv_horiz_rgba_1_u16( start_x: usize, src: &[u16], @@ -44,7 +45,7 @@ unsafe fn conv_horiz_rgba_1_u16( vmlal_s16(store, lo, w0) } -#[inline] +#[inline(always)] unsafe fn conv_horiz_rgba_2_u16( start_x: usize, src: &[u16], @@ -62,61 +63,57 @@ unsafe fn conv_horiz_rgba_2_u16( vmlal_s16(acc, vget_low_s16(wide), w0) } -#[inline] +#[inline(always)] unsafe fn conv_horiz_rgba_4_u16( start_x: usize, src: &[u16], - w0: int16x4_t, - w1: int16x8_t, - w2: int16x4_t, - w3: int16x8_t, + weights: int16x4_t, store: int32x4_t, ) -> int32x4_t { const COMPONENTS: usize = 4; let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - let rgba_pixel = vld1q_u16_x2(src_ptr.as_ptr()); + let rgba_pixel = xvld1q_u16_x2(src_ptr.as_ptr()); let hi = vreinterpretq_s16_u16(rgba_pixel.1); let lo = vreinterpretq_s16_u16(rgba_pixel.0); - let acc = vmlal_high_s16(store, hi, w3); - let acc = vmlal_s16(acc, vget_low_s16(hi), w2); - let acc = vmlal_high_s16(acc, lo, w1); - vmlal_s16(acc, vget_low_s16(lo), w0) + let acc = vmlal_high_lane_s16::<3>(store, hi, weights); + let acc = vmlal_lane_s16::<2>(acc, vget_low_s16(hi), weights); + let acc = vmlal_high_lane_s16::<1>(acc, lo, weights); + vmlal_lane_s16::<0>(acc, vget_low_s16(lo), weights) } #[inline(always)] unsafe fn conv_horiz_rgba_8_u16( start_x: usize, src: &[u16], - set1: (int16x8_t, int16x8_t, int16x8_t, int16x8_t), - set2: (int16x8_t, int16x8_t, int16x8_t, int16x8_t), + weights: int16x8_t, store: int32x4_t, ) -> int32x4_t { const COMPONENTS: usize = 4; let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - let rgba_pixel = vld1q_u16_x4(src_ptr.as_ptr()); + let rgba_pixel = xvld1q_u16_x4(src_ptr.as_ptr()); let hi0 = vreinterpretq_s16_u16(rgba_pixel.1); let lo0 = vreinterpretq_s16_u16(rgba_pixel.0); let hi1 = vreinterpretq_s16_u16(rgba_pixel.3); let lo1 = vreinterpretq_s16_u16(rgba_pixel.2); - let mut acc = vmlal_high_s16(store, hi0, set1.3); - acc = vmlal_s16(acc, vget_low_s16(hi0), vget_low_s16(set1.2)); - acc = vmlal_high_s16(acc, lo0, set1.1); - acc = vmlal_s16(acc, vget_low_s16(lo0), vget_low_s16(set1.0)); + let mut acc = vmlal_high_laneq_s16::<3>(store, hi0, weights); + acc = vmlal_laneq_s16::<2>(acc, vget_low_s16(hi0), weights); + acc = vmlal_high_laneq_s16::<1>(acc, lo0, weights); + acc = vmlal_laneq_s16::<0>(acc, vget_low_s16(lo0), weights); - acc = vmlal_high_s16(acc, hi1, set2.3); - acc = vmlal_s16(acc, vget_low_s16(hi1), vget_low_s16(set2.2)); - acc = vmlal_high_s16(acc, lo1, set2.1); - acc = vmlal_s16(acc, vget_low_s16(lo1), vget_low_s16(set2.0)); + acc = vmlal_high_laneq_s16::<7>(acc, hi1, weights); + acc = vmlal_laneq_s16::<6>(acc, vget_low_s16(hi1), weights); + acc = vmlal_high_laneq_s16::<5>(acc, lo1, weights); + acc = vmlal_laneq_s16::<4>(acc, vget_low_s16(lo1), weights); acc } -pub fn convolve_horizontal_rgba_neon_rows_4_lb_u8( +pub fn convolve_horizontal_rgba_neon_rows_4_lb_u16( src: &[u16], src_stride: usize, dst: &mut [u16], @@ -126,7 +123,6 @@ pub fn convolve_horizontal_rgba_neon_rows_4_lb_u8( ) { unsafe { const CHANNELS: usize = 4; - let zeros = vdupq_n_s32(0i32); let init = vdupq_n_s32(ROUNDING_CONST); let v_max_colors = vdup_n_u16((1 << bit_depth) - 1); @@ -168,20 +164,10 @@ pub fn convolve_horizontal_rgba_neon_rows_4_lb_u8( let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 8)); let weights_set 
= vld1q_s16(w_ptr.as_ptr()); - let w0 = vdupq_laneq_s16::<0>(weights_set); - let w1 = vdupq_laneq_s16::<1>(weights_set); - let w2 = vdupq_laneq_s16::<2>(weights_set); - let w3 = vdupq_laneq_s16::<3>(weights_set); - let w4 = vdupq_laneq_s16::<4>(weights_set); - let w5 = vdupq_laneq_s16::<5>(weights_set); - let w6 = vdupq_laneq_s16::<6>(weights_set); - let w7 = vdupq_laneq_s16::<7>(weights_set); - let set1 = (w0, w1, w2, w3); - let set2 = (w4, w5, w6, w7); - store_0 = conv_horiz_rgba_8_u16(bounds_start, src0, set1, set2, store_0); - store_1 = conv_horiz_rgba_8_u16(bounds_start, src1, set1, set2, store_1); - store_2 = conv_horiz_rgba_8_u16(bounds_start, src2, set1, set2, store_2); - store_3 = conv_horiz_rgba_8_u16(bounds_start, src3, set1, set2, store_3); + store_0 = conv_horiz_rgba_8_u16(bounds_start, src0, weights_set, store_0); + store_1 = conv_horiz_rgba_8_u16(bounds_start, src1, weights_set, store_1); + store_2 = conv_horiz_rgba_8_u16(bounds_start, src2, weights_set, store_2); + store_3 = conv_horiz_rgba_8_u16(bounds_start, src3, weights_set, store_3); jx += 8; } @@ -189,14 +175,10 @@ pub fn convolve_horizontal_rgba_neon_rows_4_lb_u8( let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 4)); let weights = vld1_s16(w_ptr.as_ptr()); - let w0 = vdup_lane_s16::<0>(weights); - let w1 = vdupq_lane_s16::<1>(weights); - let w2 = vdup_lane_s16::<2>(weights); - let w3 = vdupq_lane_s16::<3>(weights); - store_0 = conv_horiz_rgba_4_u16(bounds_start, src0, w0, w1, w2, w3, store_0); - store_1 = conv_horiz_rgba_4_u16(bounds_start, src1, w0, w1, w2, w3, store_1); - store_2 = conv_horiz_rgba_4_u16(bounds_start, src2, w0, w1, w2, w3, store_2); - store_3 = conv_horiz_rgba_4_u16(bounds_start, src3, w0, w1, w2, w3, store_3); + store_0 = conv_horiz_rgba_4_u16(bounds_start, src0, weights, store_0); + store_1 = conv_horiz_rgba_4_u16(bounds_start, src1, weights, store_1); + store_2 = conv_horiz_rgba_4_u16(bounds_start, src2, weights, store_2); + store_3 = conv_horiz_rgba_4_u16(bounds_start, src3, weights, store_3); jx += 4; } @@ -223,22 +205,10 @@ pub fn convolve_horizontal_rgba_neon_rows_4_lb_u8( jx += 1; } - let store_16_0 = vmin_u16( - vqshrun_n_s32::(vmaxq_s32(store_0, zeros)), - v_max_colors, - ); - let store_16_1 = vmin_u16( - vqshrun_n_s32::(vmaxq_s32(store_1, zeros)), - v_max_colors, - ); - let store_16_2 = vmin_u16( - vqshrun_n_s32::(vmaxq_s32(store_2, zeros)), - v_max_colors, - ); - let store_16_3 = vmin_u16( - vqshrun_n_s32::(vmaxq_s32(store_3, zeros)), - v_max_colors, - ); + let store_16_0 = vmin_u16(vqshrun_n_s32::(store_0), v_max_colors); + let store_16_1 = vmin_u16(vqshrun_n_s32::(store_1), v_max_colors); + let store_16_2 = vmin_u16(vqshrun_n_s32::(store_2), v_max_colors); + let store_16_3 = vmin_u16(vqshrun_n_s32::(store_3), v_max_colors); vst1_u16(chunk0.as_mut_ptr(), store_16_0); vst1_u16(chunk1.as_mut_ptr(), store_16_1); @@ -257,7 +227,6 @@ pub fn convolve_horizontal_rgba_neon_u16_lb_row( unsafe { const CHANNELS: usize = 4; - let zeros = vdupq_n_s32(0i32); let v_max_colors = vdup_n_u16((1 << bit_depth) - 1); for ((dst, bounds), weights) in dst @@ -277,37 +246,15 @@ pub fn convolve_horizontal_rgba_neon_u16_lb_row( let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 8)); let weights_set = vld1q_s16(w_ptr.as_ptr()); - let w0 = vdupq_laneq_s16::<0>(weights_set); - let w1 = vdupq_laneq_s16::<1>(weights_set); - let w2 = vdupq_laneq_s16::<2>(weights_set); - let w3 = vdupq_laneq_s16::<3>(weights_set); - let w4 = 
vdupq_laneq_s16::<4>(weights_set); - let w5 = vdupq_laneq_s16::<5>(weights_set); - let w6 = vdupq_laneq_s16::<6>(weights_set); - let w7 = vdupq_laneq_s16::<7>(weights_set); - let set1 = (w0, w1, w2, w3); - let set2 = (w4, w5, w6, w7); - store = conv_horiz_rgba_8_u16(bounds_start, src, set1, set2, store); + store = conv_horiz_rgba_8_u16(bounds_start, src, weights_set, store); jx += 8; } while jx + 4 < bounds_size { let w_ptr = weights.get_unchecked(jx..(jx + 4)); let weights = vld1_s16(w_ptr.as_ptr()); - let weight0 = vdup_lane_s16::<0>(weights); - let weight1 = vdupq_lane_s16::<1>(weights); - let weight2 = vdup_lane_s16::<2>(weights); - let weight3 = vdupq_lane_s16::<3>(weights); let bounds_start = bounds.start + jx; - store = conv_horiz_rgba_4_u16( - bounds_start, - src, - weight0, - weight1, - weight2, - weight3, - store, - ); + store = conv_horiz_rgba_4_u16(bounds_start, src, weights, store); jx += 4; } @@ -328,10 +275,7 @@ pub fn convolve_horizontal_rgba_neon_u16_lb_row( jx += 1; } - let store_16_0 = vmin_u16( - vqshrun_n_s32::(vmaxq_s32(store, zeros)), - v_max_colors, - ); + let store_16_0 = vmin_u16(vqshrun_n_s32::(store), v_max_colors); vst1_u16(dst.as_mut_ptr(), store_16_0); } diff --git a/src/neon/rgba_u8.rs b/src/neon/rgba_u8.rs index 7f0b6ce..62373f0 100644 --- a/src/neon/rgba_u8.rs +++ b/src/neon/rgba_u8.rs @@ -28,7 +28,7 @@ */ use crate::filter_weights::FilterWeights; -use crate::neon::utils::load_4b_as_u16x4; +use crate::neon::utils::{load_4b_as_u16x4, xvld1q_u8_x2}; use crate::support::PRECISION; use crate::support::ROUNDING_CONST; use std::arch::aarch64::*; @@ -37,29 +37,28 @@ use std::arch::aarch64::*; unsafe fn conv_horiz_rgba_8_u8( start_x: usize, src: &[u8], - set1: (int16x8_t, int16x8_t, int16x8_t, int16x8_t), - set2: (int16x8_t, int16x8_t, int16x8_t, int16x8_t), + weights: int16x8_t, store: int32x4_t, ) -> int32x4_t { const COMPONENTS: usize = 4; let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - let rgba_pixel = vld1q_u8_x2(src_ptr.as_ptr()); + let rgba_pixel = xvld1q_u8_x2(src_ptr.as_ptr()); let hi0 = vreinterpretq_s16_u16(vmovl_high_u8(rgba_pixel.0)); let lo0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgba_pixel.0))); let hi1 = vreinterpretq_s16_u16(vmovl_high_u8(rgba_pixel.1)); let lo1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgba_pixel.1))); - let mut acc = vmlal_high_s16(store, hi0, set1.3); - acc = vmlal_s16(acc, vget_low_s16(hi0), vget_low_s16(set1.2)); - acc = vmlal_high_s16(acc, lo0, set1.1); - acc = vmlal_s16(acc, vget_low_s16(lo0), vget_low_s16(set1.0)); + let mut acc = vmlal_high_laneq_s16::<3>(store, hi0, weights); + acc = vmlal_laneq_s16::<2>(acc, vget_low_s16(hi0), weights); + acc = vmlal_high_laneq_s16::<1>(acc, lo0, weights); + acc = vmlal_laneq_s16::<0>(acc, vget_low_s16(lo0), weights); - acc = vmlal_high_s16(acc, hi1, set2.3); - acc = vmlal_s16(acc, vget_low_s16(hi1), vget_low_s16(set2.2)); - acc = vmlal_high_s16(acc, lo1, set2.1); - acc = vmlal_s16(acc, vget_low_s16(lo1), vget_low_s16(set2.0)); + acc = vmlal_high_laneq_s16::<7>(acc, hi1, weights); + acc = vmlal_laneq_s16::<6>(acc, vget_low_s16(hi1), weights); + acc = vmlal_high_laneq_s16::<5>(acc, lo1, weights); + acc = vmlal_laneq_s16::<4>(acc, vget_low_s16(lo1), weights); acc } @@ -67,37 +66,35 @@ unsafe fn conv_horiz_rgba_8_u8( unsafe fn conv_horiz_rgba_8_u8_i16( start_x: usize, src: &[u8], - set1: (int16x4_t, int16x4_t, int16x4_t, int16x4_t), - set2: (int16x4_t, int16x4_t, int16x4_t, int16x4_t), + weights: int16x8_t, store: int16x4_t, ) -> int16x4_t { const COMPONENTS: usize = 
4; let src_ptr = src.get_unchecked((start_x * COMPONENTS)..); - let rgba_pixel = vld1q_u8_x2(src_ptr.as_ptr()); + let rgba_pixel = xvld1q_u8_x2(src_ptr.as_ptr()); let hi0 = vshlq_n_s16::(vreinterpretq_s16_u16(vmovl_high_u8(rgba_pixel.0))); let lo0 = vshlq_n_s16::(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgba_pixel.0)))); let hi1 = vshlq_n_s16::(vreinterpretq_s16_u16(vmovl_high_u8(rgba_pixel.1))); let lo1 = vshlq_n_s16::(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgba_pixel.1)))); - let hi_v = vqrdmulhq_s16(hi0, vcombine_s16(set1.2, set1.3)); - let mut product = vqrdmlahq_s16(hi_v, lo0, vcombine_s16(set1.0, set1.1)); - product = vqrdmlahq_s16(product, hi1, vcombine_s16(set2.2, set2.3)); - product = vqrdmlahq_s16(product, lo1, vcombine_s16(set2.0, set2.1)); - - vadd_s16( - vadd_s16(store, vget_low_s16(product)), - vget_high_s16(product), - ) + let mut product = vqrdmlah_laneq_s16::<3>(store, vget_high_s16(hi0), weights); + product = vqrdmlah_laneq_s16::<2>(product, vget_low_s16(hi0), weights); + product = vqrdmlah_laneq_s16::<1>(product, vget_high_s16(lo0), weights); + product = vqrdmlah_laneq_s16::<0>(product, vget_low_s16(lo0), weights); + product = vqrdmlah_laneq_s16::<7>(product, vget_high_s16(hi1), weights); + product = vqrdmlah_laneq_s16::<6>(product, vget_low_s16(hi1), weights); + product = vqrdmlah_laneq_s16::<5>(product, vget_high_s16(lo1), weights); + product = vqrdmlah_laneq_s16::<4>(product, vget_low_s16(lo1), weights); + product } -#[inline] +#[inline(always)] unsafe fn conv_horiz_rgba_2_u8( start_x: usize, src: &[u8], - w0: int16x4_t, - w1: int16x8_t, + weights: int16x4_t, store: int32x4_t, ) -> int32x4_t { const COMPONENTS: usize = 4; @@ -106,16 +103,15 @@ unsafe fn conv_horiz_rgba_2_u8( let rgb_pixel = vld1_u8(src_ptr.as_ptr()); let wide = vreinterpretq_s16_u16(vmovl_u8(rgb_pixel)); - let acc = vmlal_high_s16(store, wide, w1); - vmlal_s16(acc, vget_low_s16(wide), w0) + let acc = vmlal_high_lane_s16::<1>(store, wide, weights); + vmlal_lane_s16::<0>(acc, vget_low_s16(wide), weights) } -#[inline] +#[inline(always)] unsafe fn conv_horiz_rgba_2_u8_i16( start_x: usize, src: &[u8], - w0: int16x4_t, - w1: int16x4_t, + weights: int16x4_t, store: int16x4_t, ) -> int16x4_t { const COMPONENTS: usize = 4; @@ -124,22 +120,15 @@ unsafe fn conv_horiz_rgba_2_u8_i16( let rgb_pixel = vld1_u8(src_ptr.as_ptr()); let wide = vshlq_n_s16::(vreinterpretq_s16_u16(vmovl_u8(rgb_pixel))); - let product = vqrdmulhq_s16(wide, vcombine_s16(w0, w1)); - - vadd_s16( - vadd_s16(store, vget_low_s16(product)), - vget_high_s16(product), - ) + let product = vqrdmlah_lane_s16::<0>(store, vget_low_s16(wide), weights); + vqrdmlah_lane_s16::<1>(product, vget_high_s16(wide), weights) } -#[inline] +#[inline(always)] unsafe fn conv_horiz_rgba_4_u8( start_x: usize, src: &[u8], - w0: int16x4_t, - w1: int16x8_t, - w2: int16x4_t, - w3: int16x8_t, + weights: int16x4_t, store: int32x4_t, ) -> int32x4_t { const COMPONENTS: usize = 4; @@ -150,20 +139,17 @@ unsafe fn conv_horiz_rgba_4_u8( let hi = vreinterpretq_s16_u16(vmovl_high_u8(rgba_pixel)); let lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgba_pixel))); - let acc = vmlal_high_s16(store, hi, w3); - let acc = vmlal_s16(acc, vget_low_s16(hi), w2); - let acc = vmlal_high_s16(acc, lo, w1); - vmlal_s16(acc, vget_low_s16(lo), w0) + let acc = vmlal_high_lane_s16::<3>(store, hi, weights); + let acc = vmlal_lane_s16::<2>(acc, vget_low_s16(hi), weights); + let acc = vmlal_high_lane_s16::<1>(acc, lo, weights); + vmlal_lane_s16::<0>(acc, vget_low_s16(lo), weights) } -#[inline] 
+#[inline(always)] unsafe fn conv_horiz_rgba_4_u8_i16( start_x: usize, src: &[u8], - w0: int16x4_t, - w1: int16x4_t, - w2: int16x4_t, - w3: int16x4_t, + weights: int16x4_t, store: int16x4_t, ) -> int16x4_t { const COMPONENTS: usize = 4; @@ -174,16 +160,14 @@ unsafe fn conv_horiz_rgba_4_u8_i16( let hi = vshlq_n_s16::(vreinterpretq_s16_u16(vmovl_high_u8(rgba_pixel))); let lo = vshlq_n_s16::(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(rgba_pixel)))); - let hi_v = vqrdmulhq_s16(hi, vcombine_s16(w2, w3)); - let product = vqrdmlahq_s16(hi_v, lo, vcombine_s16(w0, w1)); - - vadd_s16( - vadd_s16(store, vget_low_s16(product)), - vget_high_s16(product), - ) + let mut product = vqrdmlah_lane_s16::<3>(store, vget_high_s16(hi), weights); + product = vqrdmlah_lane_s16::<2>(product, vget_low_s16(hi), weights); + product = vqrdmlah_lane_s16::<1>(product, vget_high_s16(lo), weights); + product = vqrdmlah_lane_s16::<0>(product, vget_low_s16(lo), weights); + product } -#[inline] +#[inline(always)] unsafe fn conv_horiz_rgba_1_u8( start_x: usize, src: &[u8], @@ -197,7 +181,7 @@ unsafe fn conv_horiz_rgba_1_u8( vmlal_s16(store, lo, w0) } -#[inline] +#[inline(always)] unsafe fn conv_horiz_rgba_1_u8_i16( start_x: usize, src: &[u8], @@ -211,7 +195,6 @@ unsafe fn conv_horiz_rgba_1_u8_i16( vqrdmlah_s16(store, lo, w0) } -/// Slightly lower precision scale option pub fn convolve_horizontal_rgba_neon_rows_4_u8_i16( src: &[u8], src_stride: usize, @@ -220,139 +203,134 @@ pub fn convolve_horizontal_rgba_neon_rows_4_u8_i16( filter_weights: &FilterWeights, ) { unsafe { - const CHANNELS: usize = 4; - const SCALE: i32 = 6; - const ROUNDING: i16 = 1 << (SCALE - 1); - let zeros = vdup_n_s16(0i16); - let init = vdup_n_s16(ROUNDING); - - let (row0_ref, rest) = dst.split_at_mut(dst_stride); - let (row1_ref, rest) = rest.split_at_mut(dst_stride); - let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); - - let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); - let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); - let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); - let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); - - for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 - .zip(iter_row1) - .zip(iter_row2) - .zip(iter_row3) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let mut jx = 0usize; - - let bounds_size = bounds.size; - - let mut store_0 = init; - let mut store_1 = init; - let mut store_2 = init; - let mut store_3 = init; - - let src0 = src; - let src1 = src0.get_unchecked(src_stride..); - let src2 = src1.get_unchecked(src_stride..); - let src3 = src2.get_unchecked(src_stride..); + convolve_horizontal_rgba_neon_rows_4_u8_i16_impl( + src, + src_stride, + dst, + dst_stride, + filter_weights, + ); + } +} - while jx + 8 < bounds_size { - let bounds_start = bounds.start + jx; - let w_ptr = weights.get_unchecked(jx..(jx + 8)); - let weights_set = vld1q_s16(w_ptr.as_ptr()); - let w0 = vdup_laneq_s16::<0>(weights_set); - let w1 = vdup_laneq_s16::<1>(weights_set); - let w2 = vdup_laneq_s16::<2>(weights_set); - let w3 = vdup_laneq_s16::<3>(weights_set); - let w4 = vdup_laneq_s16::<4>(weights_set); - let w5 = vdup_laneq_s16::<5>(weights_set); - let w6 = vdup_laneq_s16::<6>(weights_set); - let w7 = vdup_laneq_s16::<7>(weights_set); - let set1 = (w0, w1, w2, w3); - let set2 = (w4, w5, w6, w7); - store_0 = - conv_horiz_rgba_8_u8_i16::(bounds_start, src0, set1, set2, store_0); - store_1 = - conv_horiz_rgba_8_u8_i16::(bounds_start, 
src1, set1, set2, store_1); - store_2 = - conv_horiz_rgba_8_u8_i16::(bounds_start, src2, set1, set2, store_2); - store_3 = - conv_horiz_rgba_8_u8_i16::(bounds_start, src3, set1, set2, store_3); - jx += 8; - } +/// Slightly lower precision scale option +#[target_feature(enable = "rdm")] +unsafe fn convolve_horizontal_rgba_neon_rows_4_u8_i16_impl( + src: &[u8], + src_stride: usize, + dst: &mut [u8], + dst_stride: usize, + filter_weights: &FilterWeights, +) { + const CHANNELS: usize = 4; + const SCALE: i32 = 6; + const ROUNDING: i16 = 1 << (SCALE - 1); + let init = vdup_n_s16(ROUNDING); + + let (row0_ref, rest) = dst.split_at_mut(dst_stride); + let (row1_ref, rest) = rest.split_at_mut(dst_stride); + let (row2_ref, row3_ref) = rest.split_at_mut(dst_stride); + + let iter_row0 = row0_ref.chunks_exact_mut(CHANNELS); + let iter_row1 = row1_ref.chunks_exact_mut(CHANNELS); + let iter_row2 = row2_ref.chunks_exact_mut(CHANNELS); + let iter_row3 = row3_ref.chunks_exact_mut(CHANNELS); + + for (((((chunk0, chunk1), chunk2), chunk3), &bounds), weights) in iter_row0 + .zip(iter_row1) + .zip(iter_row2) + .zip(iter_row3) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let mut jx = 0usize; + + let bounds_size = bounds.size; + + let mut store_0 = init; + let mut store_1 = init; + let mut store_2 = init; + let mut store_3 = init; + + let src0 = src; + let src1 = src0.get_unchecked(src_stride..); + let src2 = src1.get_unchecked(src_stride..); + let src3 = src2.get_unchecked(src_stride..); + + while jx + 8 < bounds_size { + let bounds_start = bounds.start + jx; + let w_ptr = weights.get_unchecked(jx..(jx + 8)); + let weights_set = vld1q_s16(w_ptr.as_ptr()); + store_0 = conv_horiz_rgba_8_u8_i16::(bounds_start, src0, weights_set, store_0); + store_1 = conv_horiz_rgba_8_u8_i16::(bounds_start, src1, weights_set, store_1); + store_2 = conv_horiz_rgba_8_u8_i16::(bounds_start, src2, weights_set, store_2); + store_3 = conv_horiz_rgba_8_u8_i16::(bounds_start, src3, weights_set, store_3); + jx += 8; + } - while jx + 4 < bounds_size { - let bounds_start = bounds.start + jx; - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = vld1_s16(w_ptr.as_ptr()); - let w0 = vdup_lane_s16::<0>(weights); - let w1 = vdup_lane_s16::<1>(weights); - let w2 = vdup_lane_s16::<2>(weights); - let w3 = vdup_lane_s16::<3>(weights); - store_0 = - conv_horiz_rgba_4_u8_i16::(bounds_start, src0, w0, w1, w2, w3, store_0); - store_1 = - conv_horiz_rgba_4_u8_i16::(bounds_start, src1, w0, w1, w2, w3, store_1); - store_2 = - conv_horiz_rgba_4_u8_i16::(bounds_start, src2, w0, w1, w2, w3, store_2); - store_3 = - conv_horiz_rgba_4_u8_i16::(bounds_start, src3, w0, w1, w2, w3, store_3); - jx += 4; - } + while jx + 4 < bounds_size { + let bounds_start = bounds.start + jx; + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = vld1_s16(w_ptr.as_ptr()); + store_0 = conv_horiz_rgba_4_u8_i16::(bounds_start, src0, weights, store_0); + store_1 = conv_horiz_rgba_4_u8_i16::(bounds_start, src1, weights, store_1); + store_2 = conv_horiz_rgba_4_u8_i16::(bounds_start, src2, weights, store_2); + store_3 = conv_horiz_rgba_4_u8_i16::(bounds_start, src3, weights, store_3); + jx += 4; + } - while jx + 2 < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - let w0 = vld1_dup_s16(w_ptr.as_ptr()); - let w1 = vld1_dup_s16(w_ptr.get_unchecked(1..).as_ptr()); - store_0 = conv_horiz_rgba_2_u8_i16::(bounds_start, src0, w0, w1, 
store_0); - store_1 = conv_horiz_rgba_2_u8_i16::(bounds_start, src1, w0, w1, store_1); - store_2 = conv_horiz_rgba_2_u8_i16::(bounds_start, src2, w0, w1, store_2); - store_3 = conv_horiz_rgba_2_u8_i16::(bounds_start, src3, w0, w1, store_3); - jx += 2; - } + while jx + 2 < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = bounds.start + jx; + let mut v_weight = vld1_dup_s16(w_ptr.as_ptr()); + v_weight = vld1_lane_s16::<1>(w_ptr.as_ptr().add(1), v_weight); + store_0 = conv_horiz_rgba_2_u8_i16::(bounds_start, src0, v_weight, store_0); + store_1 = conv_horiz_rgba_2_u8_i16::(bounds_start, src1, v_weight, store_1); + store_2 = conv_horiz_rgba_2_u8_i16::(bounds_start, src2, v_weight, store_2); + store_3 = conv_horiz_rgba_2_u8_i16::(bounds_start, src3, v_weight, store_3); + jx += 2; + } - while jx < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let bounds_start = bounds.start + jx; - let weight0 = vld1_dup_s16(w_ptr.as_ptr()); - store_0 = conv_horiz_rgba_1_u8_i16::(bounds_start, src0, weight0, store_0); - store_1 = conv_horiz_rgba_1_u8_i16::(bounds_start, src1, weight0, store_1); - store_2 = conv_horiz_rgba_1_u8_i16::(bounds_start, src2, weight0, store_2); - store_3 = conv_horiz_rgba_1_u8_i16::(bounds_start, src3, weight0, store_3); - jx += 1; - } + while jx < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + let bounds_start = bounds.start + jx; + let weight0 = vld1_dup_s16(w_ptr.as_ptr()); + store_0 = conv_horiz_rgba_1_u8_i16::(bounds_start, src0, weight0, store_0); + store_1 = conv_horiz_rgba_1_u8_i16::(bounds_start, src1, weight0, store_1); + store_2 = conv_horiz_rgba_1_u8_i16::(bounds_start, src2, weight0, store_2); + store_3 = conv_horiz_rgba_1_u8_i16::(bounds_start, src3, weight0, store_3); + jx += 1; + } - let store_16_0 = vreinterpret_u16_s16(vshr_n_s16::(vmax_s16(store_0, zeros))); - let store_16_1 = vreinterpret_u16_s16(vshr_n_s16::(vmax_s16(store_1, zeros))); - let store_16_2 = vreinterpret_u16_s16(vshr_n_s16::(vmax_s16(store_2, zeros))); - let store_16_3 = vreinterpret_u16_s16(vshr_n_s16::(vmax_s16(store_3, zeros))); + let store_16_0 = vshr_n_s16::(store_0); + let store_16_1 = vshr_n_s16::(store_1); + let store_16_2 = vshr_n_s16::(store_2); + let store_16_3 = vshr_n_s16::(store_3); - let store_16_8_0 = vqmovn_u16(vcombine_u16(store_16_0, store_16_0)); - let store_16_8_1 = vqmovn_u16(vcombine_u16(store_16_1, store_16_1)); - let store_16_8_2 = vqmovn_u16(vcombine_u16(store_16_2, store_16_2)); - let store_16_8 = vqmovn_u16(vcombine_u16(store_16_3, store_16_3)); + let store_16_8_0 = vqmovun_s16(vcombine_s16(store_16_0, store_16_0)); + let store_16_8_1 = vqmovun_s16(vcombine_s16(store_16_1, store_16_1)); + let store_16_8_2 = vqmovun_s16(vcombine_s16(store_16_2, store_16_2)); + let store_16_8 = vqmovun_s16(vcombine_s16(store_16_3, store_16_3)); - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_0)); - let dest_ptr_32 = chunk0.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); + let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_0)); + let dest_ptr_32 = chunk0.as_mut_ptr() as *mut u32; + dest_ptr_32.write_unaligned(pixel); - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_1)); - let dest_ptr_32 = chunk1.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); + let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_1)); + let dest_ptr_32 = chunk1.as_mut_ptr() as *mut u32; + dest_ptr_32.write_unaligned(pixel); - let pixel = 
vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_2)); - let dest_ptr_32 = chunk2.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); + let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_2)); + let dest_ptr_32 = chunk2.as_mut_ptr() as *mut u32; + dest_ptr_32.write_unaligned(pixel); - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); - let dest_ptr_32 = chunk3.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); - } + let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); + let dest_ptr_32 = chunk3.as_mut_ptr() as *mut u32; + dest_ptr_32.write_unaligned(pixel); } } @@ -365,7 +343,6 @@ pub fn convolve_horizontal_rgba_neon_rows_4_u8( ) { unsafe { const CHANNELS: usize = 4; - let zeros = vdupq_n_s32(0i32); let init = vdupq_n_s32(ROUNDING_CONST); let (row0_ref, rest) = dst.split_at_mut(dst_stride); @@ -405,20 +382,10 @@ pub fn convolve_horizontal_rgba_neon_rows_4_u8( let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 8)); let weights_set = vld1q_s16(w_ptr.as_ptr()); - let w0 = vdupq_laneq_s16::<0>(weights_set); - let w1 = vdupq_laneq_s16::<1>(weights_set); - let w2 = vdupq_laneq_s16::<2>(weights_set); - let w3 = vdupq_laneq_s16::<3>(weights_set); - let w4 = vdupq_laneq_s16::<4>(weights_set); - let w5 = vdupq_laneq_s16::<5>(weights_set); - let w6 = vdupq_laneq_s16::<6>(weights_set); - let w7 = vdupq_laneq_s16::<7>(weights_set); - let set1 = (w0, w1, w2, w3); - let set2 = (w4, w5, w6, w7); - store_0 = conv_horiz_rgba_8_u8(bounds_start, src0, set1, set2, store_0); - store_1 = conv_horiz_rgba_8_u8(bounds_start, src1, set1, set2, store_1); - store_2 = conv_horiz_rgba_8_u8(bounds_start, src2, set1, set2, store_2); - store_3 = conv_horiz_rgba_8_u8(bounds_start, src3, set1, set2, store_3); + store_0 = conv_horiz_rgba_8_u8(bounds_start, src0, weights_set, store_0); + store_1 = conv_horiz_rgba_8_u8(bounds_start, src1, weights_set, store_1); + store_2 = conv_horiz_rgba_8_u8(bounds_start, src2, weights_set, store_2); + store_3 = conv_horiz_rgba_8_u8(bounds_start, src3, weights_set, store_3); jx += 8; } @@ -426,26 +393,22 @@ pub fn convolve_horizontal_rgba_neon_rows_4_u8( let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 4)); let weights = vld1_s16(w_ptr.as_ptr()); - let w0 = vdup_lane_s16::<0>(weights); - let w1 = vdupq_lane_s16::<1>(weights); - let w2 = vdup_lane_s16::<2>(weights); - let w3 = vdupq_lane_s16::<3>(weights); - store_0 = conv_horiz_rgba_4_u8(bounds_start, src0, w0, w1, w2, w3, store_0); - store_1 = conv_horiz_rgba_4_u8(bounds_start, src1, w0, w1, w2, w3, store_1); - store_2 = conv_horiz_rgba_4_u8(bounds_start, src2, w0, w1, w2, w3, store_2); - store_3 = conv_horiz_rgba_4_u8(bounds_start, src3, w0, w1, w2, w3, store_3); + store_0 = conv_horiz_rgba_4_u8(bounds_start, src0, weights, store_0); + store_1 = conv_horiz_rgba_4_u8(bounds_start, src1, weights, store_1); + store_2 = conv_horiz_rgba_4_u8(bounds_start, src2, weights, store_2); + store_3 = conv_horiz_rgba_4_u8(bounds_start, src3, weights, store_3); jx += 4; } while jx + 2 < bounds_size { let w_ptr = weights.get_unchecked(jx..(jx + 2)); let bounds_start = bounds.start + jx; - let w0 = vld1_dup_s16(w_ptr.as_ptr()); - let w1 = vld1q_dup_s16(w_ptr.get_unchecked(1..).as_ptr()); - store_0 = conv_horiz_rgba_2_u8(bounds_start, src0, w0, w1, store_0); - store_1 = conv_horiz_rgba_2_u8(bounds_start, src1, w0, w1, store_1); - store_2 = conv_horiz_rgba_2_u8(bounds_start, src2, w0, w1, store_2); - store_3 = 
conv_horiz_rgba_2_u8(bounds_start, src3, w0, w1, store_3); + let mut v_weight = vld1_dup_s16(w_ptr.as_ptr()); + v_weight = vld1_lane_s16::<1>(w_ptr.as_ptr().add(1), v_weight); + store_0 = conv_horiz_rgba_2_u8(bounds_start, src0, v_weight, store_0); + store_1 = conv_horiz_rgba_2_u8(bounds_start, src1, v_weight, store_1); + store_2 = conv_horiz_rgba_2_u8(bounds_start, src2, v_weight, store_2); + store_3 = conv_horiz_rgba_2_u8(bounds_start, src3, v_weight, store_3); jx += 2; } @@ -460,10 +423,10 @@ pub fn convolve_horizontal_rgba_neon_rows_4_u8( jx += 1; } - let store_16_0 = vqshrun_n_s32::(vmaxq_s32(store_0, zeros)); - let store_16_1 = vqshrun_n_s32::(vmaxq_s32(store_1, zeros)); - let store_16_2 = vqshrun_n_s32::(vmaxq_s32(store_2, zeros)); - let store_16_3 = vqshrun_n_s32::(vmaxq_s32(store_3, zeros)); + let store_16_0 = vqshrun_n_s32::(store_0); + let store_16_1 = vqshrun_n_s32::(store_1); + let store_16_2 = vqshrun_n_s32::(store_2); + let store_16_3 = vqshrun_n_s32::(store_3); let store_16_8_0 = vqmovn_u16(vcombine_u16(store_16_0, store_16_0)); let store_16_8_1 = vqmovn_u16(vcombine_u16(store_16_1, store_16_1)); @@ -514,46 +477,24 @@ pub fn convolve_horizontal_rgba_neon_row( let bounds_start = bounds.start + jx; let w_ptr = weights.get_unchecked(jx..(jx + 8)); let weights_set = vld1q_s16(w_ptr.as_ptr()); - let w0 = vdupq_laneq_s16::<0>(weights_set); - let w1 = vdupq_laneq_s16::<1>(weights_set); - let w2 = vdupq_laneq_s16::<2>(weights_set); - let w3 = vdupq_laneq_s16::<3>(weights_set); - let w4 = vdupq_laneq_s16::<4>(weights_set); - let w5 = vdupq_laneq_s16::<5>(weights_set); - let w6 = vdupq_laneq_s16::<6>(weights_set); - let w7 = vdupq_laneq_s16::<7>(weights_set); - let set1 = (w0, w1, w2, w3); - let set2 = (w4, w5, w6, w7); - store = conv_horiz_rgba_8_u8(bounds_start, src, set1, set2, store); + store = conv_horiz_rgba_8_u8(bounds_start, src, weights_set, store); jx += 8; } while jx + 4 < bounds_size { let w_ptr = weights.get_unchecked(jx..(jx + 4)); let weights = vld1_s16(w_ptr.as_ptr()); - let weight0 = vdup_lane_s16::<0>(weights); - let weight1 = vdupq_lane_s16::<1>(weights); - let weight2 = vdup_lane_s16::<2>(weights); - let weight3 = vdupq_lane_s16::<3>(weights); let bounds_start = bounds.start + jx; - store = conv_horiz_rgba_4_u8( - bounds_start, - src, - weight0, - weight1, - weight2, - weight3, - store, - ); + store = conv_horiz_rgba_4_u8(bounds_start, src, weights, store); jx += 4; } while jx + 2 < bounds_size { let w_ptr = weights.get_unchecked(jx..(jx + 2)); let bounds_start = bounds.start + jx; - let weight0 = vld1_dup_s16(w_ptr.as_ptr()); - let weight1 = vld1q_dup_s16(w_ptr.get_unchecked(1..).as_ptr()); - store = conv_horiz_rgba_2_u8(bounds_start, src, weight0, weight1, store); + let mut v_weight = vld1_dup_s16(w_ptr.as_ptr()); + v_weight = vld1_lane_s16::<1>(w_ptr.as_ptr().add(1), v_weight); + store = conv_horiz_rgba_2_u8(bounds_start, src, v_weight, store); jx += 2; } @@ -565,7 +506,7 @@ pub fn convolve_horizontal_rgba_neon_row( jx += 1; } - let store_16 = vqshrun_n_s32::(vmaxq_s32(store, vdupq_n_s32(0i32))); + let store_16 = vqshrun_n_s32::(store); let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); let value = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); @@ -581,86 +522,71 @@ pub fn convolve_horizontal_rgba_neon_row_i16( filter_weights: &FilterWeights, ) { unsafe { - const SCALE: i32 = 6; - const ROUNDING: i16 = 1 << (SCALE - 1); - let zeros = vdup_n_s16(0i16); - const CHANNELS: usize = 4; - - for ((dst, bounds), weights) in dst - 
.chunks_exact_mut(CHANNELS) - .zip(filter_weights.bounds.iter()) - .zip( - filter_weights - .weights - .chunks_exact(filter_weights.aligned_size), - ) - { - let bounds_size = bounds.size; - let mut jx = 0usize; - let mut store = vdup_n_s16(ROUNDING); + convolve_horizontal_rgba_neon_row_i16_impl(src, dst, filter_weights); + } +} - while jx + 8 < bounds_size { - let bounds_start = bounds.start + jx; - let w_ptr = weights.get_unchecked(jx..(jx + 8)); - let weights_set = vld1q_s16(w_ptr.as_ptr()); - let w0 = vdup_laneq_s16::<0>(weights_set); - let w1 = vdup_laneq_s16::<1>(weights_set); - let w2 = vdup_laneq_s16::<2>(weights_set); - let w3 = vdup_laneq_s16::<3>(weights_set); - let w4 = vdup_laneq_s16::<4>(weights_set); - let w5 = vdup_laneq_s16::<5>(weights_set); - let w6 = vdup_laneq_s16::<6>(weights_set); - let w7 = vdup_laneq_s16::<7>(weights_set); - let set1 = (w0, w1, w2, w3); - let set2 = (w4, w5, w6, w7); - store = conv_horiz_rgba_8_u8_i16::(bounds_start, src, set1, set2, store); - jx += 8; - } +#[target_feature(enable = "rdm")] +unsafe fn convolve_horizontal_rgba_neon_row_i16_impl( + src: &[u8], + dst: &mut [u8], + filter_weights: &FilterWeights, +) { + const SCALE: i32 = 6; + const ROUNDING: i16 = 1 << (SCALE - 1); + const CHANNELS: usize = 4; + + for ((dst, bounds), weights) in dst + .chunks_exact_mut(CHANNELS) + .zip(filter_weights.bounds.iter()) + .zip( + filter_weights + .weights + .chunks_exact(filter_weights.aligned_size), + ) + { + let bounds_size = bounds.size; + let mut jx = 0usize; + let mut store = vdup_n_s16(ROUNDING); + + while jx + 8 < bounds_size { + let bounds_start = bounds.start + jx; + let w_ptr = weights.get_unchecked(jx..(jx + 8)); + let weights_set = vld1q_s16(w_ptr.as_ptr()); + store = conv_horiz_rgba_8_u8_i16::(bounds_start, src, weights_set, store); + jx += 8; + } - while jx + 4 < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 4)); - let weights = vld1_s16(w_ptr.as_ptr()); - let weight0 = vdup_lane_s16::<0>(weights); - let weight1 = vdup_lane_s16::<1>(weights); - let weight2 = vdup_lane_s16::<2>(weights); - let weight3 = vdup_lane_s16::<3>(weights); - let bounds_start = bounds.start + jx; - store = conv_horiz_rgba_4_u8_i16::( - bounds_start, - src, - weight0, - weight1, - weight2, - weight3, - store, - ); - jx += 4; - } + while jx + 4 < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 4)); + let weights = vld1_s16(w_ptr.as_ptr()); + let bounds_start = bounds.start + jx; + store = conv_horiz_rgba_4_u8_i16::(bounds_start, src, weights, store); + jx += 4; + } - while jx + 2 < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 2)); - let bounds_start = bounds.start + jx; - let weight0 = vld1_dup_s16(w_ptr.as_ptr()); - let weight1 = vld1_dup_s16(w_ptr.get_unchecked(1..).as_ptr()); - store = - conv_horiz_rgba_2_u8_i16::(bounds_start, src, weight0, weight1, store); - jx += 2; - } + while jx + 2 < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 2)); + let bounds_start = bounds.start + jx; + let mut v_weight = vld1_dup_s16(w_ptr.as_ptr()); + v_weight = vld1_lane_s16::<1>(w_ptr.as_ptr().add(1), v_weight); + store = conv_horiz_rgba_2_u8_i16::(bounds_start, src, v_weight, store); + jx += 2; + } - while jx < bounds_size { - let w_ptr = weights.get_unchecked(jx..(jx + 1)); - let weight0 = vld1_dup_s16(w_ptr.as_ptr()); - let bounds_start = bounds.start + jx; - store = conv_horiz_rgba_1_u8_i16::(bounds_start, src, weight0, store); - jx += 1; - } + while jx < bounds_size { + let w_ptr = weights.get_unchecked(jx..(jx + 1)); + let 
weight0 = vld1_dup_s16(w_ptr.as_ptr()); + let bounds_start = bounds.start + jx; + store = conv_horiz_rgba_1_u8_i16::(bounds_start, src, weight0, store); + jx += 1; + } - let store_16 = vreinterpret_u16_s16(vshr_n_s16::(vmax_s16(store, zeros))); - let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); + let store_16 = vshr_n_s16::(store); + let store_16_8 = vqmovun_s16(vcombine_s16(store_16, store_16)); - let value = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); - let dest_ptr_32 = dst.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(value); - } + let value = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); + let dest_ptr_32 = dst.as_mut_ptr() as *mut u32; + dest_ptr_32.write_unaligned(value); } } diff --git a/src/neon/utils.rs b/src/neon/utils.rs index fe2e131..e21a1d8 100644 --- a/src/neon/utils.rs +++ b/src/neon/utils.rs @@ -29,6 +29,60 @@ use std::arch::aarch64::*; +#[inline(always)] +pub(crate) unsafe fn xvld1q_u8_x2(ptr: *const u8) -> uint8x16x2_t { + uint8x16x2_t(vld1q_u8(ptr), vld1q_u8(ptr.add(16))) +} + +#[inline(always)] +pub(crate) unsafe fn xvld1q_u8_x4(ptr: *const u8) -> uint8x16x4_t { + uint8x16x4_t( + vld1q_u8(ptr), + vld1q_u8(ptr.add(16)), + vld1q_u8(ptr.add(32)), + vld1q_u8(ptr.add(48)), + ) +} + +#[inline(always)] +pub(crate) unsafe fn xvld1q_u16_x4(a: *const u16) -> uint16x8x4_t { + uint16x8x4_t( + vld1q_u16(a), + vld1q_u16(a.add(8)), + vld1q_u16(a.add(16)), + vld1q_u16(a.add(24)), + ) +} + +#[inline(always)] +pub(crate) unsafe fn xvld1q_u16_x2(a: *const u16) -> uint16x8x2_t { + uint16x8x2_t(vld1q_u16(a), vld1q_u16(a.add(8))) +} + +#[inline(always)] +pub(crate) unsafe fn xvld1q_f32_x4(a: *const f32) -> float32x4x4_t { + float32x4x4_t( + vld1q_f32(a), + vld1q_f32(a.add(4)), + vld1q_f32(a.add(8)), + vld1q_f32(a.add(12)), + ) +} + +#[inline(always)] +pub(crate) unsafe fn xvst1q_u8_x2(ptr: *mut u8, b: uint8x16x2_t) { + vst1q_u8(ptr, b.0); + vst1q_u8(ptr.add(16), b.1); +} + +#[inline(always)] +pub(crate) unsafe fn xvst1q_u8_x4(ptr: *mut u8, b: uint8x16x4_t) { + vst1q_u8(ptr, b.0); + vst1q_u8(ptr.add(16), b.1); + vst1q_u8(ptr.add(32), b.2); + vst1q_u8(ptr.add(48), b.3); +} + #[inline(always)] pub(crate) unsafe fn prefer_vfmaq_f32( a: float32x4_t, @@ -74,9 +128,9 @@ pub(crate) unsafe fn vsplit_rgb_5(px: float32x4x4_t) -> Float32x5T { } pub(crate) struct Float32x5T( - pub float32x4_t, - pub float32x4_t, - pub float32x4_t, - pub float32x4_t, - pub float32x4_t, + pub(crate) float32x4_t, + pub(crate) float32x4_t, + pub(crate) float32x4_t, + pub(crate) float32x4_t, + pub(crate) float32x4_t, ); diff --git a/src/neon/vertical_ar30.rs b/src/neon/vertical_ar30.rs new file mode 100644 index 0000000..b5c3f2f --- /dev/null +++ b/src/neon/vertical_ar30.rs @@ -0,0 +1,195 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +use crate::filter_weights::FilterBounds; +use crate::fixed_point_vertical_ar30::convolve_column_handler_fip_db_ar30; +use crate::neon::ar30::{vunzip_4_ar30, vzip_4_ar30}; +use std::arch::aarch64::{ + int16x8x4_t, vdupq_n_s16, vld1q_u32_x2, vmaxq_s16, vminq_s16, vqrdmlahq_s16, vqrdmulhq_s16, + vrshrq_n_s16, vshlq_n_s16, vst1q_u32_x2, +}; + +#[inline(always)] +pub(crate) fn neon_column_handler_fixed_point_ar30< + const AR30_TYPE: usize, + const AR30_ORDER: usize, +>( + bounds: &FilterBounds, + src: &[u32], + dst: &mut [u32], + src_stride: usize, + weight: &[i16], +) { + unsafe { + neon_column_handler_fixed_point_ar30_impl::( + bounds, src, dst, src_stride, weight, + ); + } +} + +#[target_feature(enable = "rdm")] +unsafe fn neon_column_handler_fixed_point_ar30_impl< + const AR30_TYPE: usize, + const AR30_ORDER: usize, +>( + bounds: &FilterBounds, + src: &[u32], + dst: &mut [u32], + src_stride: usize, + weight: &[i16], +) { + let mut cx = 0usize; + + let total_width = dst.len(); + + const PREC: i32 = 5; + const BACK: i32 = 5; + + let bounds_size = bounds.size; + + while cx + 8 < total_width { + unsafe { + let v_max = vdupq_n_s16(1023); + let zeros = vdupq_n_s16(0); + let filter = weight; + let v_start_px = cx; + + let py = bounds.start; + let weight = vdupq_n_s16(filter[0]); + let offset = src_stride * py + v_start_px; + let src_ptr = src.get_unchecked(offset..(offset + 8)); + + let ps = vunzip_4_ar30::(vld1q_u32_x2(src_ptr.as_ptr())); + let mut v0 = vqrdmulhq_s16(vshlq_n_s16::(ps.0), weight); + let mut v1 = vqrdmulhq_s16(vshlq_n_s16::(ps.1), weight); + let mut v2 = vqrdmulhq_s16(vshlq_n_s16::(ps.2), weight); + let mut v3 = vqrdmulhq_s16(vshlq_n_s16::(ps.3), weight); + + if bounds_size == 2 { + let weights = filter.get_unchecked(0..2); + let py = bounds.start; + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_start_px)..); + + let v_weight1 = vdupq_n_s16(weights[1]); + + let ps1 = vunzip_4_ar30::(vld1q_u32_x2(src_ptr1.as_ptr())); + v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps1.0), v_weight1); + v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps1.1), v_weight1); + v2 = vqrdmlahq_s16(v2, vshlq_n_s16::(ps1.2), v_weight1); + v3 = vqrdmlahq_s16(v3, vshlq_n_s16::(ps1.3), v_weight1); + } else if bounds_size == 3 { + let weights = filter.get_unchecked(0..3); + let py = bounds.start; + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_start_px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_start_px)..); + + let v_weight1 = vdupq_n_s16(weights[1]); + let v_weight2 = vdupq_n_s16(weights[2]); + + let ps1 = vunzip_4_ar30::(vld1q_u32_x2(src_ptr1.as_ptr())); + v0 = vqrdmlahq_s16(v0, vshlq_n_s16::(ps1.0), v_weight1); + v1 = vqrdmlahq_s16(v1, vshlq_n_s16::(ps1.1), v_weight1); + v2 = 
vqrdmlahq_s16(v2, vshlq_n_s16::<BACK>(ps1.2), v_weight1); + v3 = vqrdmlahq_s16(v3, vshlq_n_s16::<BACK>(ps1.3), v_weight1); + let ps2 = vunzip_4_ar30::<AR30_TYPE, AR30_ORDER>(vld1q_u32_x2(src_ptr2.as_ptr())); + v0 = vqrdmlahq_s16(v0, vshlq_n_s16::<BACK>(ps2.0), v_weight2); + v1 = vqrdmlahq_s16(v1, vshlq_n_s16::<BACK>(ps2.1), v_weight2); + v2 = vqrdmlahq_s16(v2, vshlq_n_s16::<BACK>(ps2.2), v_weight2); + v3 = vqrdmlahq_s16(v3, vshlq_n_s16::<BACK>(ps2.3), v_weight2); + } else if bounds_size == 4 { + let weights = filter.get_unchecked(0..4); + let py = bounds.start; + let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_start_px)..); + let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_start_px)..); + let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + v_start_px)..); + + let v_weight1 = vdupq_n_s16(weights[1]); + let v_weight2 = vdupq_n_s16(weights[2]); + let v_weight3 = vdupq_n_s16(weights[3]); + + let ps1 = vunzip_4_ar30::<AR30_TYPE, AR30_ORDER>(vld1q_u32_x2(src_ptr1.as_ptr())); + v0 = vqrdmlahq_s16(v0, vshlq_n_s16::<BACK>(ps1.0), v_weight1); + v1 = vqrdmlahq_s16(v1, vshlq_n_s16::<BACK>(ps1.1), v_weight1); + v2 = vqrdmlahq_s16(v2, vshlq_n_s16::<BACK>(ps1.2), v_weight1); + v3 = vqrdmlahq_s16(v3, vshlq_n_s16::<BACK>(ps1.3), v_weight1); + let ps2 = vunzip_4_ar30::<AR30_TYPE, AR30_ORDER>(vld1q_u32_x2(src_ptr2.as_ptr())); + v0 = vqrdmlahq_s16(v0, vshlq_n_s16::<BACK>(ps2.0), v_weight2); + v1 = vqrdmlahq_s16(v1, vshlq_n_s16::<BACK>(ps2.1), v_weight2); + v2 = vqrdmlahq_s16(v2, vshlq_n_s16::<BACK>(ps2.2), v_weight2); + v3 = vqrdmlahq_s16(v3, vshlq_n_s16::<BACK>(ps2.3), v_weight2); + let ps3 = vunzip_4_ar30::<AR30_TYPE, AR30_ORDER>(vld1q_u32_x2(src_ptr3.as_ptr())); + v0 = vqrdmlahq_s16(v0, vshlq_n_s16::<BACK>(ps3.0), v_weight3); + v1 = vqrdmlahq_s16(v1, vshlq_n_s16::<BACK>(ps3.1), v_weight3); + v2 = vqrdmlahq_s16(v2, vshlq_n_s16::<BACK>(ps3.2), v_weight3); + v3 = vqrdmlahq_s16(v3, vshlq_n_s16::<BACK>(ps3.3), v_weight3); + } else { + for (j, &k_weight) in filter.iter().take(bounds.size).skip(1).enumerate() { + // Add 1 to the row index: enumerate() restarts at zero after skip(1), so j lags the real row by one. + let py = bounds.start + j + 1; + let weight = vdupq_n_s16(k_weight); + let offset = src_stride * py + v_start_px; + let src_ptr = src.get_unchecked(offset..(offset + 8)); + + let ps = vunzip_4_ar30::<AR30_TYPE, AR30_ORDER>(vld1q_u32_x2(src_ptr.as_ptr())); + v0 = vqrdmlahq_s16(v0, vshlq_n_s16::<BACK>(ps.0), weight); + v1 = vqrdmlahq_s16(v1, vshlq_n_s16::<BACK>(ps.1), weight); + v2 = vqrdmlahq_s16(v2, vshlq_n_s16::<BACK>(ps.2), weight); + v3 = vqrdmlahq_s16(v3, vshlq_n_s16::<BACK>(ps.3), weight); + } + } + + let v_dst = dst.get_unchecked_mut(v_start_px..(v_start_px + 8)); + + v0 = vmaxq_s16(vminq_s16(vrshrq_n_s16::<PREC>(v0), v_max), zeros); + v1 = vmaxq_s16(vminq_s16(vrshrq_n_s16::<PREC>(v1), v_max), zeros); + v2 = vmaxq_s16(vminq_s16(vrshrq_n_s16::<PREC>(v2), v_max), zeros); + v3 = vmaxq_s16(vrshrq_n_s16::<PREC>(v3), zeros); + + let vals = vzip_4_ar30::<AR30_TYPE, AR30_ORDER>(int16x8x4_t(v0, v1, v2, v3)); + vst1q_u32_x2(v_dst.as_mut_ptr(), vals); + } + + cx += 8; + } + + while cx + 4 < total_width { + convolve_column_handler_fip_db_ar30::<AR30_TYPE, AR30_ORDER, 4>( + src, src_stride, dst, weight, bounds, cx, + ); + + cx += 4; + } + + while cx < total_width { + convolve_column_handler_fip_db_ar30::<AR30_TYPE, AR30_ORDER, 1>( + src, src_stride, dst, weight, bounds, cx, + ); + + cx += 1; + } +} diff --git a/src/neon/vertical_f32.rs b/src/neon/vertical_f32.rs index 915c0b5..d1a241f 100644 --- a/src/neon/vertical_f32.rs +++ b/src/neon/vertical_f32.rs @@ -28,6 +28,7 @@ */ use crate::filter_weights::FilterBounds; use crate::neon::utils::prefer_vfmaq_f32; +use crate::neon::utils::xvld1q_f32_x4; use std::arch::aarch64::*; macro_rules! conv_vertical_part_neon_16_f32 { @@ -46,7 +47,7 @@ macro_rules!
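For reference on the arithmetic in the AR30 column handler: AR30 packs three 10-bit channels plus 2-bit alpha into one u32, and after vunzip_4_ar30 each channel occupies an int16x8_t. The channel is pre-shifted left by BACK = 5 so the Q15 rounding-doubling multiply (vqrdmulhq_s16) keeps useful precision, partial sums accumulate with vqrdmlahq_s16, and vrshrq_n_s16::<PREC> rounds back before the clamp to 1023. A scalar model of one lane, approximate in two ways noted in the comments and assuming the weights are Q15:

    // vqrdmulhq_s16 per lane: saturate((2*a*b + (1 << 15)) >> 16).
    fn qrdmulh(a: i16, b: i16) -> i16 {
        let p = (2 * (a as i64) * (b as i64) + (1 << 15)) >> 16;
        p.clamp(i16::MIN as i64, i16::MAX as i64) as i16
    }

    // One output lane: pre-shift the 10-bit sample by BACK, multiply-
    // accumulate against the weights, round back by PREC, clamp to 10 bits.
    // Approximate: the real vqrdmlahq_s16 saturates the whole fused op.
    fn ar30_lane(samples: &[i16], weights: &[i16]) -> u16 {
        const BACK: i32 = 5;
        const PREC: i32 = 5;
        let mut acc = 0i16;
        for (&s, &w) in samples.iter().zip(weights.iter()) {
            acc = acc.saturating_add(qrdmulh(s << BACK, w));
        }
        // vrshrq_n_s16::<PREC> is a rounding shift right.
        let rounded = ((acc as i32) + (1 << (PREC - 1))) >> PREC;
        rounded.clamp(0, 1023) as u16
    }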
conv_vertical_part_neon_16_f32 { let src_ptr = $src.add($src_stride * py); let s_ptr = src_ptr.add(px); - let item_row = vld1q_f32_x4(s_ptr); + let item_row = xvld1q_f32_x4(s_ptr); store_0 = prefer_vfmaq_f32(store_0, item_row.0, v_weight); store_1 = prefer_vfmaq_f32(store_1, item_row.1, v_weight); @@ -81,8 +82,8 @@ macro_rules! conv_vertical_part_neon_32_f32 { let src_ptr = $src.add($src_stride * py); let s_ptr = src_ptr.add(px); - let item_row_0 = vld1q_f32_x4(s_ptr); - let item_row_1 = vld1q_f32_x4(s_ptr.add(16)); + let item_row_0 = xvld1q_f32_x4(s_ptr); + let item_row_1 = xvld1q_f32_x4(s_ptr.add(16)); store_0 = prefer_vfmaq_f32(store_0, item_row_0.0, v_weight); store_1 = prefer_vfmaq_f32(store_1, item_row_0.1, v_weight); @@ -131,9 +132,9 @@ macro_rules! conv_vertical_part_neon_48_f32 { let src_ptr = $src.add($src_stride * py); let s_ptr = src_ptr.add(px); - let item_row_0 = vld1q_f32_x4(s_ptr); - let item_row_1 = vld1q_f32_x4(s_ptr.add(16)); - let item_row_2 = vld1q_f32_x4(s_ptr.add(32)); + let item_row_0 = xvld1q_f32_x4(s_ptr); + let item_row_1 = xvld1q_f32_x4(s_ptr.add(16)); + let item_row_2 = xvld1q_f32_x4(s_ptr.add(32)); store_0 = prefer_vfmaq_f32(store_0, item_row_0.0, v_weight); store_1 = prefer_vfmaq_f32(store_1, item_row_0.1, v_weight); diff --git a/src/neon/vertical_u16_lb.rs b/src/neon/vertical_u16_lb.rs index 602e5dd..7dc925d 100644 --- a/src/neon/vertical_u16_lb.rs +++ b/src/neon/vertical_u16_lb.rs @@ -46,7 +46,6 @@ pub fn convolve_column_lb_u16( let bounds_size = bounds.size; - let zeros = vdupq_n_s32(0); let initial_store = vdupq_n_s32(ROUNDING_CONST); let v_max_colors = vdupq_n_u16(max_colors); @@ -65,9 +64,8 @@ pub fn convolve_column_lb_u16( if bounds_size == 2 { let weights = weight.get_unchecked(0..2); - - let v_weight0 = vdupq_n_s16(weights[0]); - let v_weight1 = vdupq_n_s16(weights[1]); + let mut v_weight = vld1_dup_s16(weights.as_ptr()); + v_weight = vld1_lane_s16::<1>(weights.as_ptr().add(1), v_weight); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -76,24 +74,23 @@ pub fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr().add(8))); - store0 = vmlal_s16(store0, vget_low_s16(item_row0), vget_low_s16(v_weight0)); - store1 = vmlal_high_s16(store1, item_row0, v_weight0); - store2 = vmlal_s16(store2, vget_low_s16(item_row1), vget_low_s16(v_weight0)); - store3 = vmlal_high_s16(store3, item_row1, v_weight0); + store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); + store2 = vmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), v_weight); + store3 = vmlal_high_lane_s16::<0>(store3, item_row1, v_weight); let item_row10 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); let item_row11 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr().add(8))); - store0 = vmlal_s16(store0, vget_low_s16(item_row10), vget_low_s16(v_weight1)); - store1 = vmlal_high_s16(store1, item_row10, v_weight1); - store2 = vmlal_s16(store2, vget_low_s16(item_row11), vget_low_s16(v_weight1)); - store3 = vmlal_high_s16(store3, item_row11, v_weight1); + store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row10), v_weight); + store1 = vmlal_high_lane_s16::<1>(store1, item_row10, v_weight); + store2 = vmlal_lane_s16::<1>(store2, vget_low_s16(item_row11), v_weight); + store3 = vmlal_high_lane_s16::<1>(store3, item_row11, v_weight); } else if bounds_size == 3 { let weights = 
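The f32 macros in this file accumulate 16, 32, or 48 floats per step from xvld1q_f32_x4 blocks through prefer_vfmaq_f32. Roughly what one 16-wide step expands to; names are illustrative and vfmaq_f32 stands in for the crate's prefer_vfmaq_f32 wrapper, whose exact definition is not shown in this patch:

    use std::arch::aarch64::*;

    unsafe fn xvld1q_f32_x4(a: *const f32) -> float32x4x4_t {
        float32x4x4_t(
            vld1q_f32(a),
            vld1q_f32(a.add(4)),
            vld1q_f32(a.add(8)),
            vld1q_f32(a.add(12)),
        )
    }

    // Accumulate 16 floats of one source row into four q-registers with FMA.
    unsafe fn accumulate_16_f32(store: &mut [float32x4_t; 4], src: *const f32, weight: f32) {
        let w = vdupq_n_f32(weight);
        let rows = xvld1q_f32_x4(src);
        store[0] = vfmaq_f32(store[0], rows.0, w);
        store[1] = vfmaq_f32(store[1], rows.1, w);
        store[2] = vfmaq_f32(store[2], rows.2, w);
        store[3] = vfmaq_f32(store[3], rows.3, w);
    }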
weight.get_unchecked(0..3); - - let v_weight0 = vdupq_n_s16(weights[0]); - let v_weight1 = vdupq_n_s16(weights[1]); - let v_weight2 = vdupq_n_s16(weights[2]); + let mut v_weight = vld1_dup_s16(weights.as_ptr()); + v_weight = vld1_lane_s16::<1>(weights.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weights.as_ptr().add(2), v_weight); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -103,33 +100,30 @@ pub fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr().add(8))); - store0 = vmlal_s16(store0, vget_low_s16(item_row0), vget_low_s16(v_weight0)); - store1 = vmlal_high_s16(store1, item_row0, v_weight0); - store2 = vmlal_s16(store2, vget_low_s16(item_row1), vget_low_s16(v_weight0)); - store3 = vmlal_high_s16(store3, item_row1, v_weight0); + store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); + store2 = vmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), v_weight); + store3 = vmlal_high_lane_s16::<0>(store3, item_row1, v_weight); let item_row10 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); let item_row11 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr().add(8))); - store0 = vmlal_s16(store0, vget_low_s16(item_row10), vget_low_s16(v_weight1)); - store1 = vmlal_high_s16(store1, item_row10, v_weight1); - store2 = vmlal_s16(store2, vget_low_s16(item_row11), vget_low_s16(v_weight1)); - store3 = vmlal_high_s16(store3, item_row11, v_weight1); + store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row10), v_weight); + store1 = vmlal_high_lane_s16::<1>(store1, item_row10, v_weight); + store2 = vmlal_lane_s16::<1>(store2, vget_low_s16(item_row11), v_weight); + store3 = vmlal_high_lane_s16::<1>(store3, item_row11, v_weight); let item_row20 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr())); let item_row21 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr().add(8))); - store0 = vmlal_s16(store0, vget_low_s16(item_row20), vget_low_s16(v_weight2)); - store1 = vmlal_high_s16(store1, item_row20, v_weight2); - store2 = vmlal_s16(store2, vget_low_s16(item_row21), vget_low_s16(v_weight2)); - store3 = vmlal_high_s16(store3, item_row21, v_weight2); + store0 = vmlal_lane_s16::<2>(store0, vget_low_s16(item_row20), v_weight); + store1 = vmlal_high_lane_s16::<2>(store1, item_row20, v_weight); + store2 = vmlal_lane_s16::<2>(store2, vget_low_s16(item_row21), v_weight); + store3 = vmlal_high_lane_s16::<2>(store3, item_row21, v_weight); } else if bounds_size == 4 { let weights = weight.get_unchecked(0..4); - let v_weight0 = vdupq_n_s16(weights[0]); - let v_weight1 = vdupq_n_s16(weights[1]); - let v_weight2 = vdupq_n_s16(weights[2]); - let v_weight3 = vdupq_n_s16(weights[3]); + let v_weight = vld1_s16(weights.as_ptr()); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -140,34 +134,34 @@ pub fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr().add(8))); - store0 = vmlal_s16(store0, vget_low_s16(item_row0), vget_low_s16(v_weight0)); - store1 = vmlal_high_s16(store1, item_row0, v_weight0); - store2 = vmlal_s16(store2, vget_low_s16(item_row1), vget_low_s16(v_weight0)); - store3 = vmlal_high_s16(store3, item_row1, v_weight0); + store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = 
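The recurring change in vertical_u16_lb.rs: instead of splatting every weight into its own register with vdupq_n_s16, up to four weights load once into an int16x4_t and vmlal_lane_s16::<L>/vmlal_high_lane_s16::<L> select the lane at compile time, saving the dup instructions and freeing registers. Isolated as a free function (illustrative, not the crate's), the four-tap step looks like this:

    use std::arch::aarch64::*;

    // Four widening multiply-accumulates driven by one 4-lane weight vector.
    unsafe fn four_tap_widen_mla(
        rows: [int16x8_t; 4],
        weights: &[i16; 4],
        mut store_lo: int32x4_t,
        mut store_hi: int32x4_t,
    ) -> (int32x4_t, int32x4_t) {
        let w = vld1_s16(weights.as_ptr());
        store_lo = vmlal_lane_s16::<0>(store_lo, vget_low_s16(rows[0]), w);
        store_hi = vmlal_high_lane_s16::<0>(store_hi, rows[0], w);
        store_lo = vmlal_lane_s16::<1>(store_lo, vget_low_s16(rows[1]), w);
        store_hi = vmlal_high_lane_s16::<1>(store_hi, rows[1], w);
        store_lo = vmlal_lane_s16::<2>(store_lo, vget_low_s16(rows[2]), w);
        store_hi = vmlal_high_lane_s16::<2>(store_hi, rows[2], w);
        store_lo = vmlal_lane_s16::<3>(store_lo, vget_low_s16(rows[3]), w);
        store_hi = vmlal_high_lane_s16::<3>(store_hi, rows[3], w);
        (store_lo, store_hi)
    }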
vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); + store2 = vmlal_lane_s16::<0>(store2, vget_low_s16(item_row1), v_weight); + store3 = vmlal_high_lane_s16::<0>(store3, item_row1, v_weight); let item_row10 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); let item_row11 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr().add(8))); - store0 = vmlal_s16(store0, vget_low_s16(item_row10), vget_low_s16(v_weight1)); - store1 = vmlal_high_s16(store1, item_row10, v_weight1); - store2 = vmlal_s16(store2, vget_low_s16(item_row11), vget_low_s16(v_weight1)); - store3 = vmlal_high_s16(store3, item_row11, v_weight1); + store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row10), v_weight); + store1 = vmlal_high_lane_s16::<1>(store1, item_row10, v_weight); + store2 = vmlal_lane_s16::<1>(store2, vget_low_s16(item_row11), v_weight); + store3 = vmlal_high_lane_s16::<1>(store3, item_row11, v_weight); let item_row20 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr())); let item_row21 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr().add(8))); - store0 = vmlal_s16(store0, vget_low_s16(item_row20), vget_low_s16(v_weight2)); - store1 = vmlal_high_s16(store1, item_row20, v_weight2); - store2 = vmlal_s16(store2, vget_low_s16(item_row21), vget_low_s16(v_weight2)); - store3 = vmlal_high_s16(store3, item_row21, v_weight2); + store0 = vmlal_lane_s16::<2>(store0, vget_low_s16(item_row20), v_weight); + store1 = vmlal_high_lane_s16::<2>(store1, item_row20, v_weight); + store2 = vmlal_lane_s16::<2>(store2, vget_low_s16(item_row21), v_weight); + store3 = vmlal_high_lane_s16::<2>(store3, item_row21, v_weight); let item_row30 = vreinterpretq_s16_u16(vld1q_u16(src_ptr3.as_ptr())); let item_row31 = vreinterpretq_s16_u16(vld1q_u16(src_ptr3.as_ptr().add(8))); - store0 = vmlal_s16(store0, vget_low_s16(item_row30), vget_low_s16(v_weight3)); - store1 = vmlal_high_s16(store1, item_row30, v_weight3); - store2 = vmlal_s16(store2, vget_low_s16(item_row31), vget_low_s16(v_weight3)); - store3 = vmlal_high_s16(store3, item_row31, v_weight3); + store0 = vmlal_lane_s16::<3>(store0, vget_low_s16(item_row30), v_weight); + store1 = vmlal_high_lane_s16::<3>(store1, item_row30, v_weight); + store2 = vmlal_lane_s16::<3>(store2, vget_low_s16(item_row31), v_weight); + store3 = vmlal_high_lane_s16::<3>(store3, item_row31, v_weight); } else { for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { let py = bounds.start + j; @@ -187,15 +181,15 @@ pub fn convolve_column_lb_u16( let item0 = vminq_u16( vcombine_u16( - vqshrun_n_s32::(vmaxq_s32(store0, zeros)), - vqshrun_n_s32::(vmaxq_s32(store1, zeros)), + vqshrun_n_s32::(store0), + vqshrun_n_s32::(store1), ), v_max_colors, ); let item1 = vminq_u16( vcombine_u16( - vqshrun_n_s32::(vmaxq_s32(store2, zeros)), - vqshrun_n_s32::(vmaxq_s32(store3, zeros)), + vqshrun_n_s32::(store2), + vqshrun_n_s32::(store3), ), v_max_colors, ); @@ -219,9 +213,8 @@ pub fn convolve_column_lb_u16( if bounds_size == 2 { let weights = weight.get_unchecked(0..2); - - let v_weight0 = vdupq_n_s16(weights[0]); - let v_weight1 = vdupq_n_s16(weights[1]); + let mut v_weight = vld1_dup_s16(weights.as_ptr()); + v_weight = vld1_lane_s16::<1>(weights.as_ptr().add(1), v_weight); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -229,19 +222,18 @@ pub fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); - store0 = vmlal_s16(store0, vget_low_s16(item_row0), vget_low_s16(v_weight0)); - store1 = vmlal_high_s16(store1, item_row0, 
v_weight0); + store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); - store0 = vmlal_s16(store0, vget_low_s16(item_row1), vget_low_s16(v_weight1)); - store1 = vmlal_high_s16(store1, item_row1, v_weight1); + store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row1), v_weight); + store1 = vmlal_high_lane_s16::<1>(store1, item_row1, v_weight); } else if bounds_size == 3 { let weights = weight.get_unchecked(0..3); - - let v_weight0 = vdupq_n_s16(weights[0]); - let v_weight1 = vdupq_n_s16(weights[1]); - let v_weight2 = vdupq_n_s16(weights[2]); + let mut v_weight = vld1_dup_s16(weights.as_ptr()); + v_weight = vld1_lane_s16::<1>(weights.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weights.as_ptr().add(2), v_weight); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -250,25 +242,21 @@ pub fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); - store0 = vmlal_s16(store0, vget_low_s16(item_row0), vget_low_s16(v_weight0)); - store1 = vmlal_high_s16(store1, item_row0, v_weight0); + store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); - store0 = vmlal_s16(store0, vget_low_s16(item_row1), vget_low_s16(v_weight1)); - store1 = vmlal_high_s16(store1, item_row1, v_weight1); + store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row1), v_weight); + store1 = vmlal_high_lane_s16::<1>(store1, item_row1, v_weight); let item_row2 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr())); - store0 = vmlal_s16(store0, vget_low_s16(item_row2), vget_low_s16(v_weight2)); - store1 = vmlal_high_s16(store1, item_row2, v_weight2); + store0 = vmlal_lane_s16::<2>(store0, vget_low_s16(item_row2), v_weight); + store1 = vmlal_high_lane_s16::<2>(store1, item_row2, v_weight); } else if bounds_size == 4 { let weights = weight.get_unchecked(0..4); - - let v_weight0 = vdupq_n_s16(weights[0]); - let v_weight1 = vdupq_n_s16(weights[1]); - let v_weight2 = vdupq_n_s16(weights[2]); - let v_weight3 = vdupq_n_s16(weights[3]); + let v_weight = vld1_s16(weights.as_ptr()); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -278,23 +266,23 @@ pub fn convolve_column_lb_u16( let item_row0 = vreinterpretq_s16_u16(vld1q_u16(src_ptr0.as_ptr())); - store0 = vmlal_s16(store0, vget_low_s16(item_row0), vget_low_s16(v_weight0)); - store1 = vmlal_high_s16(store1, item_row0, v_weight0); + store0 = vmlal_lane_s16::<0>(store0, vget_low_s16(item_row0), v_weight); + store1 = vmlal_high_lane_s16::<0>(store1, item_row0, v_weight); let item_row1 = vreinterpretq_s16_u16(vld1q_u16(src_ptr1.as_ptr())); - store0 = vmlal_s16(store0, vget_low_s16(item_row1), vget_low_s16(v_weight1)); - store1 = vmlal_high_s16(store1, item_row1, v_weight1); + store0 = vmlal_lane_s16::<1>(store0, vget_low_s16(item_row1), v_weight); + store1 = vmlal_high_lane_s16::<1>(store1, item_row1, v_weight); let item_row2 = vreinterpretq_s16_u16(vld1q_u16(src_ptr2.as_ptr())); - store0 = vmlal_s16(store0, vget_low_s16(item_row2), vget_low_s16(v_weight2)); - store1 = vmlal_high_s16(store1, item_row2, v_weight2); + store0 = vmlal_lane_s16::<2>(store0, vget_low_s16(item_row2), v_weight); + store1 = vmlal_high_lane_s16::<2>(store1, item_row2, v_weight); let item_row3 = 
vreinterpretq_s16_u16(vld1q_u16(src_ptr3.as_ptr())); - store0 = vmlal_s16(store0, vget_low_s16(item_row3), vget_low_s16(v_weight3)); - store1 = vmlal_high_s16(store1, item_row3, v_weight3); + store0 = vmlal_lane_s16::<3>(store0, vget_low_s16(item_row3), v_weight); + store1 = vmlal_high_lane_s16::<3>(store1, item_row3, v_weight); } else { for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { let py = bounds.start + j; @@ -311,8 +299,8 @@ pub fn convolve_column_lb_u16( let item = vminq_u16( vcombine_u16( - vqshrun_n_s32::(vmaxq_s32(store0, zeros)), - vqshrun_n_s32::(vmaxq_s32(store1, zeros)), + vqshrun_n_s32::(store0), + vqshrun_n_s32::(store1), ), v_max_colors, ); @@ -333,25 +321,23 @@ pub fn convolve_column_lb_u16( if bounds_size == 2 { let weights = weight.get_unchecked(0..2); - - let v_weight0 = vdup_n_s16(weights[0]); - let v_weight1 = vdup_n_s16(weights[1]); + let mut v_weight = vld1_dup_s16(weights.as_ptr()); + v_weight = vld1_lane_s16::<1>(weights.as_ptr().add(1), v_weight); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); let item_row0 = vreinterpret_s16_u16(vld1_u16(src_ptr0.as_ptr())); - store0 = vmlal_s16(store0, item_row0, v_weight0); + store0 = vmlal_lane_s16::<0>(store0, item_row0, v_weight); let item_row1 = vreinterpret_s16_u16(vld1_u16(src_ptr1.as_ptr())); - store0 = vmlal_s16(store0, item_row1, v_weight1); + store0 = vmlal_lane_s16::<1>(store0, item_row1, v_weight); } else if bounds_size == 3 { let weights = weight.get_unchecked(0..3); - - let v_weight0 = vdup_n_s16(weights[0]); - let v_weight1 = vdup_n_s16(weights[1]); - let v_weight2 = vdup_n_s16(weights[2]); + let mut v_weight = vld1_dup_s16(weights.as_ptr()); + v_weight = vld1_lane_s16::<1>(weights.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weights.as_ptr().add(2), v_weight); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -359,20 +345,16 @@ pub fn convolve_column_lb_u16( let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_dx)..); let item_row0 = vreinterpret_s16_u16(vld1_u16(src_ptr0.as_ptr())); - store0 = vmlal_s16(store0, item_row0, v_weight0); + store0 = vmlal_lane_s16::<0>(store0, item_row0, v_weight); let item_row1 = vreinterpret_s16_u16(vld1_u16(src_ptr1.as_ptr())); - store0 = vmlal_s16(store0, item_row1, v_weight1); + store0 = vmlal_lane_s16::<1>(store0, item_row1, v_weight); let item_row2 = vreinterpret_s16_u16(vld1_u16(src_ptr2.as_ptr())); - store0 = vmlal_s16(store0, item_row2, v_weight2); + store0 = vmlal_lane_s16::<2>(store0, item_row2, v_weight); } else if bounds_size == 4 { let weights = weight.get_unchecked(0..4); - - let v_weight0 = vdup_n_s16(weights[0]); - let v_weight1 = vdup_n_s16(weights[1]); - let v_weight2 = vdup_n_s16(weights[2]); - let v_weight3 = vdup_n_s16(weights[3]); + let v_weight = vld1_s16(weights.as_ptr()); let py = bounds.start; let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); @@ -381,16 +363,16 @@ pub fn convolve_column_lb_u16( let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + v_dx)..); let item_row0 = vreinterpret_s16_u16(vld1_u16(src_ptr0.as_ptr())); - store0 = vmlal_s16(store0, item_row0, v_weight0); + store0 = vmlal_lane_s16::<0>(store0, item_row0, v_weight); let item_row1 = vreinterpret_s16_u16(vld1_u16(src_ptr1.as_ptr())); - store0 = vmlal_s16(store0, item_row1, v_weight1); + store0 = vmlal_lane_s16::<1>(store0, item_row1, v_weight); let item_row2 = 
vreinterpret_s16_u16(vld1_u16(src_ptr2.as_ptr())); - store0 = vmlal_s16(store0, item_row2, v_weight2); + store0 = vmlal_lane_s16::<2>(store0, item_row2, v_weight); let item_row3 = vreinterpret_s16_u16(vld1_u16(src_ptr3.as_ptr())); - store0 = vmlal_s16(store0, item_row3, v_weight3); + store0 = vmlal_lane_s16::<3>(store0, item_row3, v_weight); } else { for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { let py = bounds.start + j; @@ -405,7 +387,7 @@ pub fn convolve_column_lb_u16( } let u_store0 = vmin_u16( - vqshrun_n_s32::(vmaxq_s32(store0, zeros)), + vqshrun_n_s32::(store0), vget_low_u16(v_max_colors), ); vst1_u16(dst.as_mut_ptr(), u_store0); diff --git a/src/neon/vertical_u8.rs b/src/neon/vertical_u8.rs index 32eccf5..667fb39 100644 --- a/src/neon/vertical_u8.rs +++ b/src/neon/vertical_u8.rs @@ -27,23 +27,21 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ use crate::filter_weights::FilterBounds; +use crate::neon::utils::{xvld1q_u8_x2, xvld1q_u8_x4, xvst1q_u8_x2, xvst1q_u8_x4}; use crate::support::{PRECISION, ROUNDING_CONST}; use std::arch::aarch64::*; macro_rules! pack_weights { ($store_0: expr, $store_1: expr, $store_2: expr, $store_3: expr) => {{ - let zeros = vdupq_n_s16(0); - let low_s16 = vcombine_s16( - vqshrn_n_s32::($store_0), - vqshrn_n_s32::($store_1), + let low_u16 = vcombine_u16( + vqshrun_n_s32::($store_0), + vqshrun_n_s32::($store_1), ); - let high_s16 = vcombine_s16( - vqshrn_n_s32::($store_2), - vqshrn_n_s32::($store_3), + let high_u16 = vcombine_u16( + vqshrun_n_s32::($store_2), + vqshrun_n_s32::($store_3), ); - let low_16 = vreinterpretq_u16_s16(vmaxq_s16(low_s16, zeros)); - let high_16 = vreinterpretq_u16_s16(vmaxq_s16(high_s16, zeros)); - vcombine_u8(vqmovn_u16(low_16), vqmovn_u16(high_16)) + vcombine_u8(vqmovn_u16(low_u16), vqmovn_u16(high_u16)) }}; } @@ -59,6 +57,18 @@ macro_rules! accumulate_4_into { }}; } +macro_rules! 
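The other recurring change is dropping the explicit clamp before narrowing: vqshrun_n_s32 already saturates negative accumulators to zero on the way to u16, so vmaxq_s32(..., zeros) and the zeros register are redundant. The two forms below return identical results for every input; the shift amount is illustrative rather than the crate's constant:

    use std::arch::aarch64::*;

    unsafe fn narrow_old(v: int32x4_t) -> uint16x4_t {
        const PRECISION: i32 = 15; // illustrative
        let zeros = vdupq_n_s32(0);
        vqshrun_n_s32::<PRECISION>(vmaxq_s32(v, zeros))
    }

    unsafe fn narrow_new(v: int32x4_t) -> uint16x4_t {
        const PRECISION: i32 = 15;
        vqshrun_n_s32::<PRECISION>(v) // negatives saturate to 0 on their own
    }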
accumulate_4_into_lane { + ($item: expr,$store_0: expr, $store_1: expr, $store_2: expr, $store_3: expr, $weight: expr, $weight_pos: expr) => {{ + let low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8($item))); + let high = vreinterpretq_s16_u16(vmovl_high_u8($item)); + + $store_0 = vmlal_lane_s16::<$weight_pos>($store_0, vget_low_s16(low), $weight); + $store_1 = vmlal_high_lane_s16::<$weight_pos>($store_1, low, $weight); + $store_2 = vmlal_lane_s16::<$weight_pos>($store_2, vget_low_s16(high), $weight); + $store_3 = vmlal_high_lane_s16::<$weight_pos>($store_3, high, $weight); + }}; +} + pub fn convolve_vertical_neon_i16_precision( width: usize, bounds: &FilterBounds, @@ -67,7 +77,9 @@ pub fn convolve_vertical_neon_i16_precision( src_stride: usize, weight: &[i16], ) { - convolve_vertical_neon_row_upper(width, bounds, src, dst, src_stride, weight); + unsafe { + convolve_vertical_neon_row_upper(width, bounds, src, dst, src_stride, weight); + } } pub fn convolve_vertical_neon_i32_precision( @@ -95,7 +107,22 @@ unsafe fn vdot<const SCALE: i32>( (store0, store1) } -fn convolve_vertical_neon_row_upper( +#[inline(always)] +unsafe fn vdot_lane<const SCALE: i32, const LANE: i32>( + store0: int16x8_t, + store1: int16x8_t, + row: uint8x16_t, + weight: int16x4_t, +) -> (int16x8_t, int16x8_t) { + let lo0 = vreinterpretq_s16_u16(vshll_n_u8::<SCALE>(vget_low_u8(row))); + let store0 = vqrdmlahq_lane_s16::<LANE>(store0, lo0, weight); + let hi0 = vreinterpretq_s16_u16(vshll_high_n_u8::<SCALE>(row)); + let store1 = vqrdmlahq_lane_s16::<LANE>(store1, hi0, weight); + (store0, store1) +} + +#[target_feature(enable = "rdm")] +unsafe fn convolve_vertical_neon_row_upper( _: usize, bounds: &FilterBounds, src: &[u8], @@ -132,100 +159,97 @@ fn convolve_vertical_neon_row_upper( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let items0 = vld1q_u8_x4(src_ptr0.as_ptr()); + let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); - (store_0, store_1) = vdot::<SCALE>(store_0, store_1, items0.0, v_weight0); - (store_2, store_3) = vdot::<SCALE>(store_2, store_3, items0.1, v_weight0); - (store_4, store_5) = vdot::<SCALE>(store_4, store_5, items0.2, v_weight0); - (store_6, store_7) = vdot::<SCALE>(store_6, store_7, items0.3, v_weight0); + (store_0, store_1) = vdot_lane::<SCALE, 0>(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::<SCALE, 0>(store_2, store_3, items0.1, v_weight); + (store_4, store_5) = vdot_lane::<SCALE, 0>(store_4, store_5, items0.2, v_weight); + (store_6, store_7) = vdot_lane::<SCALE, 0>(store_6, store_7, items0.3, v_weight); - let items1 = vld1q_u8_x4(src_ptr1.as_ptr()); + let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); - (store_0, store_1) = vdot::<SCALE>(store_0, store_1, items1.0, v_weight1); - (store_2, store_3) = vdot::<SCALE>(store_2, store_3, items1.1, v_weight1); - (store_4, store_5) = vdot::<SCALE>(store_4, store_5, items1.2, v_weight1); - (store_6, store_7) = vdot::<SCALE>(store_6, store_7, items1.3, v_weight1); + (store_0, store_1) = vdot_lane::<SCALE, 1>(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::<SCALE, 1>(store_2, store_3, items1.1, v_weight); + (store_4, store_5) = vdot_lane::<SCALE, 1>(store_4, store_5, items1.2, v_weight); + (store_6, store_7) = vdot_lane::<SCALE, 1>(store_6, store_7, items1.3, v_weight); } else if bounds_size == 3 { let py = bounds.start; let weight =
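vdot_lane widens 16 pixels to two s16 vectors with vshll_n_u8::<SCALE> (widen and shift left, placing the bytes in a high fixed-point position) and accumulates through the Q15 rounding-doubling MLA, selecting the weight lane at compile time. Per element it behaves approximately like this scalar model; approximate because the SIMD op saturates the fused add as one step, and SCALE = 6 is assumed here to match the horizontal i16 path:

    fn vdot_lane_element(acc: i16, px: u8, weight: i16) -> i16 {
        const SCALE: i32 = 6; // assumed
        let widened = ((px as i16) << SCALE) as i64; // vshll_n_u8::<SCALE>
        // Core of vqrdmlahq_lane_s16: rounding doubling high multiply, then add.
        let product = (2 * widened * (weight as i64) + (1 << 15)) >> 16;
        acc.saturating_add(product.clamp(i16::MIN as i64, i16::MAX as i64) as i16)
    }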
weight.get_unchecked(0..3); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let items0 = vld1q_u8_x4(src_ptr0.as_ptr()); + let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items0.0, v_weight0); - (store_2, store_3) = vdot::(store_2, store_3, items0.1, v_weight0); - (store_4, store_5) = vdot::(store_4, store_5, items0.2, v_weight0); - (store_6, store_7) = vdot::(store_6, store_7, items0.3, v_weight0); + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items0.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items0.3, v_weight); - let items1 = vld1q_u8_x4(src_ptr1.as_ptr()); + let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items1.0, v_weight1); - (store_2, store_3) = vdot::(store_2, store_3, items1.1, v_weight1); - (store_4, store_5) = vdot::(store_4, store_5, items1.2, v_weight1); - (store_6, store_7) = vdot::(store_6, store_7, items1.3, v_weight1); + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items1.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items1.3, v_weight); - let items2 = vld1q_u8_x4(src_ptr2.as_ptr()); + let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items2.0, v_weight2); - (store_2, store_3) = vdot::(store_2, store_3, items2.1, v_weight2); - (store_4, store_5) = vdot::(store_4, store_5, items2.2, v_weight2); - (store_6, store_7) = vdot::(store_6, store_7, items2.3, v_weight2); + (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items2.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items2.3, v_weight); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); - let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3)); + let v_weight = vld1_s16(weight.as_ptr()); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - let items0 = vld1q_u8_x4(src_ptr0.as_ptr()); + let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items0.0, v_weight0); - (store_2, store_3) = vdot::(store_2, store_3, items0.1, v_weight0); - (store_4, store_5) = vdot::(store_4, store_5, items0.2, v_weight0); - (store_6, store_7) = vdot::(store_6, store_7, 
items0.3, v_weight0); + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items0.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items0.3, v_weight); - let items1 = vld1q_u8_x4(src_ptr1.as_ptr()); + let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items1.0, v_weight1); - (store_2, store_3) = vdot::(store_2, store_3, items1.1, v_weight1); - (store_4, store_5) = vdot::(store_4, store_5, items1.2, v_weight1); - (store_6, store_7) = vdot::(store_6, store_7, items1.3, v_weight1); + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items1.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items1.3, v_weight); - let items2 = vld1q_u8_x4(src_ptr2.as_ptr()); + let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items2.0, v_weight2); - (store_2, store_3) = vdot::(store_2, store_3, items2.1, v_weight2); - (store_4, store_5) = vdot::(store_4, store_5, items2.2, v_weight2); - (store_6, store_7) = vdot::(store_6, store_7, items2.3, v_weight2); + (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items2.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items2.3, v_weight); - let items3 = vld1q_u8_x4(src_ptr3.as_ptr()); + let items3 = xvld1q_u8_x4(src_ptr3.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items3.0, v_weight3); - (store_2, store_3) = vdot::(store_2, store_3, items3.1, v_weight3); - (store_4, store_5) = vdot::(store_4, store_5, items3.2, v_weight3); - (store_6, store_7) = vdot::(store_6, store_7, items3.3, v_weight3); + (store_0, store_1) = vdot_lane::(store_0, store_1, items3.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items3.1, v_weight); + (store_4, store_5) = vdot_lane::(store_4, store_5, items3.2, v_weight); + (store_6, store_7) = vdot_lane::(store_6, store_7, items3.3, v_weight); } else { for j in 0..bounds_size { let py = bounds.start + j; let weight = weight.get_unchecked(j..); let v_weight = vld1q_dup_s16(weight.as_ptr()); let src_ptr = src.get_unchecked((src_stride * py + px)..); - let items = vld1q_u8_x4(src_ptr.as_ptr()); + let items = xvld1q_u8_x4(src_ptr.as_ptr()); (store_0, store_1) = vdot::(store_0, store_1, items.0, v_weight); (store_2, store_3) = vdot::(store_2, store_3, items.1, v_weight); @@ -257,7 +281,7 @@ fn convolve_vertical_neon_row_upper( let item3 = vcombine_u8(item30, item31); let dst_items = uint8x16x4_t(item0, item1, item2, item3); - vst1q_u8_x4(dst.as_mut_ptr(), dst_items); + xvst1q_u8_x4(dst.as_mut_ptr(), dst_items); cx += 64; } @@ -277,82 +301,79 @@ fn convolve_vertical_neon_row_upper( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let items0 = 
vld1q_u8_x2(src_ptr0.as_ptr()); + let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items0.0, v_weight0); - (store_2, store_3) = vdot::(store_2, store_3, items0.1, v_weight0); + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); - let items1 = vld1q_u8_x2(src_ptr1.as_ptr()); + let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items1.0, v_weight1); - (store_2, store_3) = vdot::(store_2, store_3, items1.1, v_weight1); + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let items0 = vld1q_u8_x2(src_ptr0.as_ptr()); + let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items0.0, v_weight0); - (store_2, store_3) = vdot::(store_2, store_3, items0.1, v_weight0); + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); - let items1 = vld1q_u8_x2(src_ptr1.as_ptr()); + let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items1.0, v_weight1); - (store_2, store_3) = vdot::(store_2, store_3, items1.1, v_weight1); + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); - let items2 = vld1q_u8_x2(src_ptr2.as_ptr()); + let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items2.0, v_weight2); - (store_2, store_3) = vdot::(store_2, store_3, items2.1, v_weight2); + (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); - let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3)); + let v_weight = vld1_s16(weight.as_ptr()); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - let items0 = vld1q_u8_x2(src_ptr0.as_ptr()); + let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items0.0, v_weight0); - (store_2, store_3) = vdot::(store_2, store_3, items0.1, v_weight0); + (store_0, store_1) = vdot_lane::(store_0, store_1, items0.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items0.1, v_weight); - let items1 = 
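Every width block in these row functions follows the same shape: straight-line special cases for 2-, 3-, and 4-tap filters, which are the common kernel sizes, with a per-tap loop as the general fallback. Condensed to scalars, the dispatch is:

    // `taps` holds the weight run for one output row; `row(j)` reads the
    // j-th contributing input row. Hot sizes get straight-line code.
    fn vertical_taps(taps: &[i16], row: impl Fn(usize) -> i32) -> i32 {
        match taps.len() {
            2 => taps[0] as i32 * row(0) + taps[1] as i32 * row(1),
            3 => taps[0] as i32 * row(0) + taps[1] as i32 * row(1) + taps[2] as i32 * row(2),
            4 => {
                taps[0] as i32 * row(0)
                    + taps[1] as i32 * row(1)
                    + taps[2] as i32 * row(2)
                    + taps[3] as i32 * row(3)
            }
            _ => taps.iter().enumerate().map(|(j, &w)| w as i32 * row(j)).sum(),
        }
    }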
vld1q_u8_x2(src_ptr1.as_ptr()); + let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items1.0, v_weight1); - (store_2, store_3) = vdot::(store_2, store_3, items1.1, v_weight1); + (store_0, store_1) = vdot_lane::(store_0, store_1, items1.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items1.1, v_weight); - let items2 = vld1q_u8_x2(src_ptr2.as_ptr()); + let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items2.0, v_weight2); - (store_2, store_3) = vdot::(store_2, store_3, items2.1, v_weight2); + (store_0, store_1) = vdot_lane::(store_0, store_1, items2.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items2.1, v_weight); - let items3 = vld1q_u8_x2(src_ptr3.as_ptr()); + let items3 = xvld1q_u8_x2(src_ptr3.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, items3.0, v_weight3); - (store_2, store_3) = vdot::(store_2, store_3, items3.1, v_weight3); + (store_0, store_1) = vdot_lane::(store_0, store_1, items3.0, v_weight); + (store_2, store_3) = vdot_lane::(store_2, store_3, items3.1, v_weight); } else { for j in 0..bounds.size { let py = bounds.start + j; let weight = weight.get_unchecked(j..); let v_weight = vld1q_dup_s16(weight.as_ptr()); let src_ptr = src.get_unchecked((src_stride * py + px)..); - let items = vld1q_u8_x2(src_ptr.as_ptr()); + let items = xvld1q_u8_x2(src_ptr.as_ptr()); (store_0, store_1) = vdot::(store_0, store_1, items.0, v_weight); (store_2, store_3) = vdot::(store_2, store_3, items.1, v_weight); @@ -372,7 +393,7 @@ fn convolve_vertical_neon_row_upper( let item1 = vcombine_u8(item10, item11); let dst_items = uint8x16x2_t(item0, item1); - vst1q_u8_x2(dst.as_mut_ptr(), dst_items); + xvst1q_u8_x2(dst.as_mut_ptr(), dst_items); cx += 32; } @@ -390,62 +411,58 @@ fn convolve_vertical_neon_row_upper( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let item0 = vld1q_u8(src_ptr0.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, item0, v_weight0); + (store_0, store_1) = vdot_lane::(store_0, store_1, item0, v_weight); let item1 = vld1q_u8(src_ptr1.as_ptr()); - - (store_0, store_1) = vdot::(store_0, store_1, item1, v_weight1); + (store_0, store_1) = vdot_lane::(store_0, store_1, item1, v_weight); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let item0 = vld1q_u8(src_ptr0.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, item0, v_weight0); + (store_0, store_1) = vdot_lane::(store_0, store_1, item0, v_weight); let item1 = vld1q_u8(src_ptr1.as_ptr()); - (store_0, store_1) = 
vdot::(store_0, store_1, item1, v_weight1); + (store_0, store_1) = vdot_lane::(store_0, store_1, item1, v_weight); let item2 = vld1q_u8(src_ptr2.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, item2, v_weight2); + (store_0, store_1) = vdot_lane::(store_0, store_1, item2, v_weight); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); - let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3)); + let v_weight = vld1_s16(weight.as_ptr()); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); let item0 = vld1q_u8(src_ptr0.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, item0, v_weight0); + (store_0, store_1) = vdot_lane::(store_0, store_1, item0, v_weight); let item1 = vld1q_u8(src_ptr1.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, item1, v_weight1); + (store_0, store_1) = vdot_lane::(store_0, store_1, item1, v_weight); let item2 = vld1q_u8(src_ptr2.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, item2, v_weight2); + (store_0, store_1) = vdot_lane::(store_0, store_1, item2, v_weight); let item3 = vld1q_u8(src_ptr3.as_ptr()); - (store_0, store_1) = vdot::(store_0, store_1, item3, v_weight3); + (store_0, store_1) = vdot_lane::(store_0, store_1, item3, v_weight); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -481,46 +498,43 @@ fn convolve_vertical_neon_row_upper( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let item0 = vld1_u8(src_ptr0.as_ptr()); let low0 = vreinterpretq_s16_u16(vshll_n_u8::(item0)); - store_0 = vqrdmlahq_s16(store_0, low0, v_weight0); + store_0 = vqrdmlahq_lane_s16::<0>(store_0, low0, v_weight); let item1 = vld1_u8(src_ptr1.as_ptr()); let low1 = vreinterpretq_s16_u16(vshll_n_u8::(item1)); - store_0 = vqrdmlahq_s16(store_0, low1, v_weight1); + store_0 = vqrdmlahq_lane_s16::<1>(store_0, low1, v_weight); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let item0 = vld1_u8(src_ptr0.as_ptr()); let low0 = vreinterpretq_s16_u16(vshll_n_u8::(item0)); - store_0 = vqrdmlahq_s16(store_0, low0, v_weight0); + store_0 = vqrdmlahq_lane_s16::<0>(store_0, low0, v_weight); let item1 = vld1_u8(src_ptr1.as_ptr()); let low1 = vreinterpretq_s16_u16(vshll_n_u8::(item1)); - store_0 
= vqrdmlahq_s16(store_0, low1, v_weight1); + store_0 = vqrdmlahq_lane_s16::<1>(store_0, low1, v_weight); let item2 = vld1_u8(src_ptr2.as_ptr()); let low2 = vreinterpretq_s16_u16(vshll_n_u8::(item2)); - store_0 = vqrdmlahq_s16(store_0, low2, v_weight2); + store_0 = vqrdmlahq_lane_s16::<2>(store_0, low2, v_weight); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); - let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3)); + let v_weight = vld1_s16(weight.as_ptr()); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); @@ -528,19 +542,19 @@ fn convolve_vertical_neon_row_upper( let item0 = vld1_u8(src_ptr0.as_ptr()); let low0 = vreinterpretq_s16_u16(vshll_n_u8::(item0)); - store_0 = vqrdmlahq_s16(store_0, low0, v_weight0); + store_0 = vqrdmlahq_lane_s16::<0>(store_0, low0, v_weight); let item1 = vld1_u8(src_ptr1.as_ptr()); let low1 = vreinterpretq_s16_u16(vshll_n_u8::(item1)); - store_0 = vqrdmlahq_s16(store_0, low1, v_weight1); + store_0 = vqrdmlahq_lane_s16::<1>(store_0, low1, v_weight); let item2 = vld1_u8(src_ptr2.as_ptr()); let low2 = vreinterpretq_s16_u16(vshll_n_u8::(item2)); - store_0 = vqrdmlahq_s16(store_0, low2, v_weight2); + store_0 = vqrdmlahq_lane_s16::<2>(store_0, low2, v_weight); let item3 = vld1_u8(src_ptr3.as_ptr()); let low3 = vreinterpretq_s16_u16(vshll_n_u8::(item3)); - store_0 = vqrdmlahq_s16(store_0, low3, v_weight3); + store_0 = vqrdmlahq_lane_s16::<3>(store_0, low3, v_weight); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -574,46 +588,43 @@ fn convolve_vertical_neon_row_upper( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let items0 = vld1_dup_u8(src_ptr0.as_ptr()); let low0 = vreinterpretq_s16_u16(vshll_n_u8::(items0)); - store = vqrdmlahq_s16(store, low0, v_weight0); + store = vqrdmlahq_lane_s16::<0>(store, low0, v_weight); let items1 = vld1_dup_u8(src_ptr1.as_ptr()); let low1 = vreinterpretq_s16_u16(vshll_n_u8::(items1)); - store = vqrdmlahq_s16(store, low1, v_weight1); + store = vqrdmlahq_lane_s16::<1>(store, low1, v_weight); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let items0 = vld1_dup_u8(src_ptr0.as_ptr()); let low0 = vreinterpretq_s16_u16(vshll_n_u8::(items0)); - store = vqrdmlahq_s16(store, low0, v_weight0); + store = 
vqrdmlahq_lane_s16::<0>(store, low0, v_weight); let items1 = vld1_dup_u8(src_ptr1.as_ptr()); let low1 = vreinterpretq_s16_u16(vshll_n_u8::(items1)); - store = vqrdmlahq_s16(store, low1, v_weight1); + store = vqrdmlahq_lane_s16::<1>(store, low1, v_weight); let items2 = vld1_dup_u8(src_ptr2.as_ptr()); let low2 = vreinterpretq_s16_u16(vshll_n_u8::(items2)); - store = vqrdmlahq_s16(store, low2, v_weight2); + store = vqrdmlahq_lane_s16::<2>(store, low2, v_weight); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); - let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3)); + let v_weight = vld1_s16(weight.as_ptr()); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); @@ -621,19 +632,19 @@ fn convolve_vertical_neon_row_upper( let items0 = vld1_dup_u8(src_ptr0.as_ptr()); let low0 = vreinterpretq_s16_u16(vshll_n_u8::(items0)); - store = vqrdmlahq_s16(store, low0, v_weight0); + store = vqrdmlahq_lane_s16::<0>(store, low0, v_weight); let items1 = vld1_dup_u8(src_ptr1.as_ptr()); let low1 = vreinterpretq_s16_u16(vshll_n_u8::(items1)); - store = vqrdmlahq_s16(store, low1, v_weight1); + store = vqrdmlahq_lane_s16::<1>(store, low1, v_weight); let items2 = vld1_dup_u8(src_ptr2.as_ptr()); let low2 = vreinterpretq_s16_u16(vshll_n_u8::(items2)); - store = vqrdmlahq_s16(store, low2, v_weight2); + store = vqrdmlahq_lane_s16::<2>(store, low2, v_weight); let items3 = vld1_dup_u8(src_ptr3.as_ptr()); let low3 = vreinterpretq_s16_u16(vshll_n_u8::(items3)); - store = vqrdmlahq_s16(store, low3, v_weight3); + store = vqrdmlahq_lane_s16::<3>(store, low3, v_weight); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -699,100 +710,133 @@ fn convolve_vertical_neon_row_full( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let items0 = vld1q_u8_x4(src_ptr0.as_ptr()); - - accumulate_4_into!(items0.0, store_0, store_1, store_2, store_3, v_weight0); - accumulate_4_into!(items0.1, store_4, store_5, store_6, store_7, v_weight0); - accumulate_4_into!(items0.2, store_8, store_9, store_10, store_11, v_weight0); - accumulate_4_into!(items0.3, store_12, store_13, store_14, store_15, v_weight0); - - let items1 = vld1q_u8_x4(src_ptr1.as_ptr()); - - accumulate_4_into!(items1.0, store_0, store_1, store_2, store_3, v_weight1); - accumulate_4_into!(items1.1, store_4, store_5, store_6, store_7, v_weight1); - accumulate_4_into!(items1.2, store_8, store_9, store_10, store_11, v_weight1); - accumulate_4_into!(items1.3, store_12, store_13, store_14, store_15, v_weight1); + let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); + + accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); + accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); + accumulate_4_into_lane!( + items0.2, store_8, store_9, store_10, store_11, v_weight, 0 + ); + accumulate_4_into_lane!( + 
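The accumulate_4_into_lane! macro used by the full-precision path below is the i32-accumulator counterpart of vdot_lane: vmovl_u8/vmovl_high_u8 widen without any shift, because vmlal already accumulates exactly into i32. Written as a function generic over the lane, the macro body reads:

    use std::arch::aarch64::*;

    // One 16-byte register: widen u8 to i16, then four lane-indexed
    // widening multiply-accumulates into the i32 stores.
    unsafe fn acc4_lane<const LANE: i32>(
        item: uint8x16_t,
        store: &mut [int32x4_t; 4],
        weight: int16x4_t,
    ) {
        let low = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(item)));
        let high = vreinterpretq_s16_u16(vmovl_high_u8(item));
        store[0] = vmlal_lane_s16::<LANE>(store[0], vget_low_s16(low), weight);
        store[1] = vmlal_high_lane_s16::<LANE>(store[1], low, weight);
        store[2] = vmlal_lane_s16::<LANE>(store[2], vget_low_s16(high), weight);
        store[3] = vmlal_high_lane_s16::<LANE>(store[3], high, weight);
    }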
items0.3, store_12, store_13, store_14, store_15, v_weight, 0 + ); + + let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); + + accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); + accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); + accumulate_4_into_lane!( + items1.2, store_8, store_9, store_10, store_11, v_weight, 1 + ); + accumulate_4_into_lane!( + items1.3, store_12, store_13, store_14, store_15, v_weight, 1 + ); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let items0 = vld1q_u8_x4(src_ptr0.as_ptr()); - - accumulate_4_into!(items0.0, store_0, store_1, store_2, store_3, v_weight0); - accumulate_4_into!(items0.1, store_4, store_5, store_6, store_7, v_weight0); - accumulate_4_into!(items0.2, store_8, store_9, store_10, store_11, v_weight0); - accumulate_4_into!(items0.3, store_12, store_13, store_14, store_15, v_weight0); - - let items1 = vld1q_u8_x4(src_ptr1.as_ptr()); - - accumulate_4_into!(items1.0, store_0, store_1, store_2, store_3, v_weight1); - accumulate_4_into!(items1.1, store_4, store_5, store_6, store_7, v_weight1); - accumulate_4_into!(items1.2, store_8, store_9, store_10, store_11, v_weight1); - accumulate_4_into!(items1.3, store_12, store_13, store_14, store_15, v_weight1); - - let items2 = vld1q_u8_x4(src_ptr2.as_ptr()); - - accumulate_4_into!(items2.0, store_0, store_1, store_2, store_3, v_weight2); - accumulate_4_into!(items2.1, store_4, store_5, store_6, store_7, v_weight2); - accumulate_4_into!(items2.2, store_8, store_9, store_10, store_11, v_weight2); - accumulate_4_into!(items2.3, store_12, store_13, store_14, store_15, v_weight2); + let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); + + accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); + accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); + accumulate_4_into_lane!( + items0.2, store_8, store_9, store_10, store_11, v_weight, 0 + ); + accumulate_4_into_lane!( + items0.3, store_12, store_13, store_14, store_15, v_weight, 0 + ); + + let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); + + accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); + accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); + accumulate_4_into_lane!( + items1.2, store_8, store_9, store_10, store_11, v_weight, 1 + ); + accumulate_4_into_lane!( + items1.3, store_12, store_13, store_14, store_15, v_weight, 1 + ); + + let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); + + accumulate_4_into_lane!(items2.0, store_0, store_1, store_2, store_3, v_weight, 2); + accumulate_4_into_lane!(items2.1, store_4, store_5, store_6, store_7, v_weight, 2); + accumulate_4_into_lane!( + items2.2, store_8, store_9, store_10, store_11, v_weight, 2 + ); + accumulate_4_into_lane!( + items2.3, store_12, store_13, store_14, store_15, v_weight, 2 + ); } else if bounds_size == 4 { let py = bounds.start; let weight = 
weight.get_unchecked(0..4); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); - let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3)); + let v_weight = vld1_s16(weight.as_ptr()); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - let items0 = vld1q_u8_x4(src_ptr0.as_ptr()); - - accumulate_4_into!(items0.0, store_0, store_1, store_2, store_3, v_weight0); - accumulate_4_into!(items0.1, store_4, store_5, store_6, store_7, v_weight0); - accumulate_4_into!(items0.2, store_8, store_9, store_10, store_11, v_weight0); - accumulate_4_into!(items0.3, store_12, store_13, store_14, store_15, v_weight0); - - let items1 = vld1q_u8_x4(src_ptr1.as_ptr()); - - accumulate_4_into!(items1.0, store_0, store_1, store_2, store_3, v_weight1); - accumulate_4_into!(items1.1, store_4, store_5, store_6, store_7, v_weight1); - accumulate_4_into!(items1.2, store_8, store_9, store_10, store_11, v_weight1); - accumulate_4_into!(items1.3, store_12, store_13, store_14, store_15, v_weight1); - - let items2 = vld1q_u8_x4(src_ptr2.as_ptr()); - - accumulate_4_into!(items2.0, store_0, store_1, store_2, store_3, v_weight2); - accumulate_4_into!(items2.1, store_4, store_5, store_6, store_7, v_weight2); - accumulate_4_into!(items2.2, store_8, store_9, store_10, store_11, v_weight2); - accumulate_4_into!(items2.3, store_12, store_13, store_14, store_15, v_weight2); - - let items3 = vld1q_u8_x4(src_ptr3.as_ptr()); - - accumulate_4_into!(items3.0, store_0, store_1, store_2, store_3, v_weight3); - accumulate_4_into!(items3.1, store_4, store_5, store_6, store_7, v_weight3); - accumulate_4_into!(items3.2, store_8, store_9, store_10, store_11, v_weight3); - accumulate_4_into!(items3.3, store_12, store_13, store_14, store_15, v_weight3); + let items0 = xvld1q_u8_x4(src_ptr0.as_ptr()); + + accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); + accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); + accumulate_4_into_lane!( + items0.2, store_8, store_9, store_10, store_11, v_weight, 0 + ); + accumulate_4_into_lane!( + items0.3, store_12, store_13, store_14, store_15, v_weight, 0 + ); + + let items1 = xvld1q_u8_x4(src_ptr1.as_ptr()); + + accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); + accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); + accumulate_4_into_lane!( + items1.2, store_8, store_9, store_10, store_11, v_weight, 1 + ); + accumulate_4_into_lane!( + items1.3, store_12, store_13, store_14, store_15, v_weight, 1 + ); + + let items2 = xvld1q_u8_x4(src_ptr2.as_ptr()); + + accumulate_4_into_lane!(items2.0, store_0, store_1, store_2, store_3, v_weight, 2); + accumulate_4_into_lane!(items2.1, store_4, store_5, store_6, store_7, v_weight, 2); + accumulate_4_into_lane!( + items2.2, store_8, store_9, store_10, store_11, v_weight, 2 + ); + accumulate_4_into_lane!( + items2.3, store_12, store_13, store_14, store_15, v_weight, 2 + ); + + let items3 = xvld1q_u8_x4(src_ptr3.as_ptr()); + + accumulate_4_into_lane!(items3.0, store_0, store_1, store_2, store_3, v_weight, 3); + accumulate_4_into_lane!(items3.1, store_4, store_5, store_6, store_7, v_weight, 3); + accumulate_4_into_lane!( + items3.2, store_8, 
store_9, store_10, store_11, v_weight, 3 + ); + accumulate_4_into_lane!( + items3.3, store_12, store_13, store_14, store_15, v_weight, 3 + ); } else { for j in 0..bounds_size { let py = bounds.start + j; let weight = weight.get_unchecked(j..); let v_weight = vld1q_dup_s16(weight.as_ptr()); let src_ptr = src.get_unchecked((src_stride * py + px)..); - let items = vld1q_u8_x4(src_ptr.as_ptr()); + let items = xvld1q_u8_x4(src_ptr.as_ptr()); accumulate_4_into!(items.0, store_0, store_1, store_2, store_3, v_weight); accumulate_4_into!(items.1, store_4, store_5, store_6, store_7, v_weight); @@ -807,7 +851,7 @@ fn convolve_vertical_neon_row_full( let item_3 = pack_weights!(store_12, store_13, store_14, store_15); let dst_items = uint8x16x4_t(item_0, item_1, item_2, item_3); - vst1q_u8_x4(dst.as_mut_ptr(), dst_items); + xvst1q_u8_x4(dst.as_mut_ptr(), dst_items); cx += 64; } @@ -831,79 +875,76 @@ fn convolve_vertical_neon_row_full( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let items0 = vld1q_u8_x2(src_ptr0.as_ptr()); + let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - accumulate_4_into!(items0.0, store_0, store_1, store_2, store_3, v_weight0); - accumulate_4_into!(items0.1, store_4, store_5, store_6, store_7, v_weight0); + accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); + accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); - let items1 = vld1q_u8_x2(src_ptr1.as_ptr()); + let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - accumulate_4_into!(items1.0, store_0, store_1, store_2, store_3, v_weight1); - accumulate_4_into!(items1.1, store_4, store_5, store_6, store_7, v_weight1); + accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); + accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let items0 = vld1q_u8_x2(src_ptr0.as_ptr()); + let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - accumulate_4_into!(items0.0, store_0, store_1, store_2, store_3, v_weight0); - accumulate_4_into!(items0.1, store_4, store_5, store_6, store_7, v_weight0); + accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); + accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); - let items1 = vld1q_u8_x2(src_ptr1.as_ptr()); + let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - accumulate_4_into!(items1.0, store_0, store_1, store_2, store_3, v_weight1); - accumulate_4_into!(items1.1, store_4, store_5, store_6, store_7, v_weight1); + 
accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); + accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); - let items2 = vld1q_u8_x2(src_ptr2.as_ptr()); + let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); - accumulate_4_into!(items2.0, store_0, store_1, store_2, store_3, v_weight2); - accumulate_4_into!(items2.1, store_4, store_5, store_6, store_7, v_weight2); + accumulate_4_into_lane!(items2.0, store_0, store_1, store_2, store_3, v_weight, 2); + accumulate_4_into_lane!(items2.1, store_4, store_5, store_6, store_7, v_weight, 2); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); - let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3)); + let v_weight = vld1_s16(weight.as_ptr()); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - let items0 = vld1q_u8_x2(src_ptr0.as_ptr()); + let items0 = xvld1q_u8_x2(src_ptr0.as_ptr()); - accumulate_4_into!(items0.0, store_0, store_1, store_2, store_3, v_weight0); - accumulate_4_into!(items0.1, store_4, store_5, store_6, store_7, v_weight0); + accumulate_4_into_lane!(items0.0, store_0, store_1, store_2, store_3, v_weight, 0); + accumulate_4_into_lane!(items0.1, store_4, store_5, store_6, store_7, v_weight, 0); - let items1 = vld1q_u8_x2(src_ptr1.as_ptr()); + let items1 = xvld1q_u8_x2(src_ptr1.as_ptr()); - accumulate_4_into!(items1.0, store_0, store_1, store_2, store_3, v_weight1); - accumulate_4_into!(items1.1, store_4, store_5, store_6, store_7, v_weight1); + accumulate_4_into_lane!(items1.0, store_0, store_1, store_2, store_3, v_weight, 1); + accumulate_4_into_lane!(items1.1, store_4, store_5, store_6, store_7, v_weight, 1); - let items2 = vld1q_u8_x2(src_ptr2.as_ptr()); + let items2 = xvld1q_u8_x2(src_ptr2.as_ptr()); - accumulate_4_into!(items2.0, store_0, store_1, store_2, store_3, v_weight2); - accumulate_4_into!(items2.1, store_4, store_5, store_6, store_7, v_weight2); + accumulate_4_into_lane!(items2.0, store_0, store_1, store_2, store_3, v_weight, 2); + accumulate_4_into_lane!(items2.1, store_4, store_5, store_6, store_7, v_weight, 2); - let items3 = vld1q_u8_x2(src_ptr3.as_ptr()); + let items3 = xvld1q_u8_x2(src_ptr3.as_ptr()); - accumulate_4_into!(items3.0, store_0, store_1, store_2, store_3, v_weight3); - accumulate_4_into!(items3.1, store_4, store_5, store_6, store_7, v_weight3); + accumulate_4_into_lane!(items3.0, store_0, store_1, store_2, store_3, v_weight, 3); + accumulate_4_into_lane!(items3.1, store_4, store_5, store_6, store_7, v_weight, 3); } else { for j in 0..bounds.size { let py = bounds.start + j; let weight = weight.get_unchecked(j..); let v_weight = vld1q_dup_s16(weight.as_ptr()); let src_ptr = src.get_unchecked((src_stride * py + px)..); - let items = vld1q_u8_x2(src_ptr.as_ptr()); + let items = xvld1q_u8_x2(src_ptr.as_ptr()); accumulate_4_into!(items.0, store_0, store_1, store_2, store_3, v_weight); accumulate_4_into!(items.1, store_4, store_5, store_6, store_7, v_weight); @@ -914,7 +955,7 @@ fn convolve_vertical_neon_row_full( let item_1 = pack_weights!(store_4, store_5, store_6, store_7); let dst_items = uint8x16x2_t(item_0, item_1); - 
vst1q_u8_x2(dst.as_mut_ptr(), dst_items); + xvst1q_u8_x2(dst.as_mut_ptr(), dst_items); cx += 32; } @@ -934,36 +975,33 @@ fn convolve_vertical_neon_row_full( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let item_row0 = vld1q_u8(src_ptr0.as_ptr()); let item_row1 = vld1q_u8(src_ptr1.as_ptr()); - accumulate_4_into!(item_row0, store_0, store_1, store_2, store_3, v_weight0); - accumulate_4_into!(item_row1, store_0, store_1, store_2, store_3, v_weight1); + accumulate_4_into_lane!(item_row0, store_0, store_1, store_2, store_3, v_weight, 0); + accumulate_4_into_lane!(item_row1, store_0, store_1, store_2, store_3, v_weight, 1); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); let item_row0 = vld1q_u8(src_ptr0.as_ptr()); let item_row1 = vld1q_u8(src_ptr1.as_ptr()); let item_row2 = vld1q_u8(src_ptr2.as_ptr()); - accumulate_4_into!(item_row0, store_0, store_1, store_2, store_3, v_weight0); - accumulate_4_into!(item_row1, store_0, store_1, store_2, store_3, v_weight1); - accumulate_4_into!(item_row2, store_0, store_1, store_2, store_3, v_weight2); + accumulate_4_into_lane!(item_row0, store_0, store_1, store_2, store_3, v_weight, 0); + accumulate_4_into_lane!(item_row1, store_0, store_1, store_2, store_3, v_weight, 1); + accumulate_4_into_lane!(item_row2, store_0, store_1, store_2, store_3, v_weight, 2); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); - let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3)); + let v_weight = vld1_s16(weight.as_ptr()); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); @@ -972,10 +1010,10 @@ fn convolve_vertical_neon_row_full( let item_row1 = vld1q_u8(src_ptr1.as_ptr()); let item_row2 = vld1q_u8(src_ptr2.as_ptr()); let item_row3 = vld1q_u8(src_ptr3.as_ptr()); - accumulate_4_into!(item_row0, store_0, store_1, store_2, store_3, v_weight0); - accumulate_4_into!(item_row1, store_0, store_1, store_2, store_3, v_weight1); - accumulate_4_into!(item_row2, store_0, store_1, store_2, store_3, v_weight2); - accumulate_4_into!(item_row3, store_0, store_1, store_2, store_3, v_weight3); + accumulate_4_into_lane!(item_row0, store_0, store_1, store_2, store_3, v_weight, 0); + accumulate_4_into_lane!(item_row1, store_0, store_1, store_2, store_3, v_weight, 1); + 
accumulate_4_into_lane!(item_row2, store_0, store_1, store_2, store_3, v_weight, 2); + accumulate_4_into_lane!(item_row3, store_0, store_1, store_2, store_3, v_weight, 3); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -1007,8 +1045,8 @@ fn convolve_vertical_neon_row_full( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let item_row0 = vld1_u8(src_ptr0.as_ptr()); @@ -1016,16 +1054,16 @@ fn convolve_vertical_neon_row_full( let item_row1 = vld1_u8(src_ptr1.as_ptr()); let low0 = vreinterpretq_s16_u16(vmovl_u8(item_row0)); let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); - store_0 = vmlal_s16(store_0, vget_low_s16(low0), vget_low_s16(v_weight0)); - store_1 = vmlal_high_s16(store_1, low0, v_weight0); - store_0 = vmlal_s16(store_0, vget_low_s16(low1), vget_low_s16(v_weight1)); - store_1 = vmlal_high_s16(store_1, low1, v_weight1); + store_0 = vmlal_lane_s16::<0>(store_0, vget_low_s16(low0), v_weight); + store_1 = vmlal_high_lane_s16::<0>(store_1, low0, v_weight); + store_0 = vmlal_lane_s16::<1>(store_0, vget_low_s16(low1), v_weight); + store_1 = vmlal_high_lane_s16::<1>(store_1, low1, v_weight); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); @@ -1036,19 +1074,16 @@ fn convolve_vertical_neon_row_full( let low0 = vreinterpretq_s16_u16(vmovl_u8(item_row0)); let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); let low2 = vreinterpretq_s16_u16(vmovl_u8(item_row2)); - store_0 = vmlal_s16(store_0, vget_low_s16(low0), vget_low_s16(v_weight0)); - store_1 = vmlal_high_s16(store_1, low0, v_weight0); - store_0 = vmlal_s16(store_0, vget_low_s16(low1), vget_low_s16(v_weight1)); - store_1 = vmlal_high_s16(store_1, low1, v_weight1); - store_0 = vmlal_s16(store_0, vget_low_s16(low2), vget_low_s16(v_weight2)); - store_1 = vmlal_high_s16(store_1, low2, v_weight2); + store_0 = vmlal_lane_s16::<0>(store_0, vget_low_s16(low0), v_weight); + store_1 = vmlal_high_lane_s16::<0>(store_1, low0, v_weight); + store_0 = vmlal_lane_s16::<1>(store_0, vget_low_s16(low1), v_weight); + store_1 = vmlal_high_lane_s16::<1>(store_1, low1, v_weight); + store_0 = vmlal_lane_s16::<2>(store_0, vget_low_s16(low2), v_weight); + store_1 = vmlal_high_lane_s16::<2>(store_1, low2, v_weight); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); - let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3)); + let v_weight = vld1_s16(weight.as_ptr()); let src_ptr0 = src.get_unchecked((src_stride * py + px)..);
let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); @@ -1062,14 +1097,14 @@ fn convolve_vertical_neon_row_full( let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); let low2 = vreinterpretq_s16_u16(vmovl_u8(item_row2)); let low3 = vreinterpretq_s16_u16(vmovl_u8(item_row3)); - store_0 = vmlal_s16(store_0, vget_low_s16(low0), vget_low_s16(v_weight0)); - store_1 = vmlal_high_s16(store_1, low0, v_weight0); - store_0 = vmlal_s16(store_0, vget_low_s16(low1), vget_low_s16(v_weight1)); - store_1 = vmlal_high_s16(store_1, low1, v_weight1); - store_0 = vmlal_s16(store_0, vget_low_s16(low2), vget_low_s16(v_weight2)); - store_1 = vmlal_high_s16(store_1, low2, v_weight2); - store_0 = vmlal_s16(store_0, vget_low_s16(low3), vget_low_s16(v_weight3)); - store_1 = vmlal_high_s16(store_1, low3, v_weight3); + store_0 = vmlal_lane_s16::<0>(store_0, vget_low_s16(low0), v_weight); + store_1 = vmlal_high_lane_s16::<0>(store_1, low0, v_weight); + store_0 = vmlal_lane_s16::<1>(store_0, vget_low_s16(low1), v_weight); + store_1 = vmlal_high_lane_s16::<1>(store_1, low1, v_weight); + store_0 = vmlal_lane_s16::<2>(store_0, vget_low_s16(low2), v_weight); + store_1 = vmlal_high_lane_s16::<2>(store_1, low2, v_weight); + store_0 = vmlal_lane_s16::<3>(store_0, vget_low_s16(low3), v_weight); + store_1 = vmlal_high_lane_s16::<3>(store_1, low3, v_weight); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -1084,15 +1119,12 @@ fn convolve_vertical_neon_row_full( } } - let zeros = vdupq_n_s16(0); - - let low_s16 = vcombine_s16( - vqshrn_n_s32::(store_0), - vqshrn_n_s32::(store_1), + let low_u16 = vcombine_u16( + vqshrun_n_s32::(store_0), + vqshrun_n_s32::(store_1), ); - let low_16 = vreinterpretq_u16_s16(vmaxq_s16(low_s16, zeros)); - let item = vqmovn_u16(low_16); + let item = vqmovn_u16(low_u16); vst1_u8(dst.as_mut_ptr(), item); @@ -1111,8 +1143,8 @@ fn convolve_vertical_neon_row_full( if bounds_size == 2 { let py = bounds.start; let weight = weight.get_unchecked(0..2); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let item_row0 = vld1_dup_u8(src_ptr0.as_ptr()); @@ -1120,14 +1152,14 @@ fn convolve_vertical_neon_row_full( let low0 = vreinterpretq_s16_u16(vmovl_u8(item_row0)); let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); - store = vmlal_s16(store, vget_low_s16(low0), vget_low_s16(v_weight0)); - store = vmlal_s16(store, vget_low_s16(low1), vget_low_s16(v_weight1)); + store = vmlal_lane_s16::<0>(store, vget_low_s16(low0), v_weight); + store = vmlal_lane_s16::<1>(store, vget_low_s16(low1), v_weight); } else if bounds_size == 3 { let py = bounds.start; let weight = weight.get_unchecked(0..3); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); + let mut v_weight = vld1_dup_s16(weight.as_ptr()); + v_weight = vld1_lane_s16::<1>(weight.as_ptr().add(1), v_weight); + v_weight = vld1_lane_s16::<2>(weight.as_ptr().add(2), v_weight); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); @@ 
-1138,16 +1170,13 @@ fn convolve_vertical_neon_row_full( let low0 = vreinterpretq_s16_u16(vmovl_u8(item_row0)); let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); let low2 = vreinterpretq_s16_u16(vmovl_u8(item_row2)); - store = vmlal_s16(store, vget_low_s16(low0), vget_low_s16(v_weight0)); - store = vmlal_s16(store, vget_low_s16(low1), vget_low_s16(v_weight1)); - store = vmlal_s16(store, vget_low_s16(low2), vget_low_s16(v_weight2)); + store = vmlal_lane_s16::<0>(store, vget_low_s16(low0), v_weight); + store = vmlal_lane_s16::<1>(store, vget_low_s16(low1), v_weight); + store = vmlal_lane_s16::<2>(store, vget_low_s16(low2), v_weight); } else if bounds_size == 4 { let py = bounds.start; let weight = weight.get_unchecked(0..4); - let v_weight0 = vld1q_dup_s16(weight.as_ptr()); - let v_weight1 = vld1q_dup_s16(weight.as_ptr().add(1)); - let v_weight2 = vld1q_dup_s16(weight.as_ptr().add(2)); - let v_weight3 = vld1q_dup_s16(weight.as_ptr().add(3)); + let v_weight = vld1_s16(weight.as_ptr()); let src_ptr0 = src.get_unchecked((src_stride * py + px)..); let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); @@ -1161,10 +1190,10 @@ fn convolve_vertical_neon_row_full( let low1 = vreinterpretq_s16_u16(vmovl_u8(item_row1)); let low2 = vreinterpretq_s16_u16(vmovl_u8(item_row2)); let low3 = vreinterpretq_s16_u16(vmovl_u8(item_row3)); - store = vmlal_s16(store, vget_low_s16(low0), vget_low_s16(v_weight0)); - store = vmlal_s16(store, vget_low_s16(low1), vget_low_s16(v_weight1)); - store = vmlal_s16(store, vget_low_s16(low2), vget_low_s16(v_weight2)); - store = vmlal_s16(store, vget_low_s16(low3), vget_low_s16(v_weight3)); + store = vmlal_lane_s16::<0>(store, vget_low_s16(low0), v_weight); + store = vmlal_lane_s16::<1>(store, vget_low_s16(low1), v_weight); + store = vmlal_lane_s16::<2>(store, vget_low_s16(low2), v_weight); + store = vmlal_lane_s16::<3>(store, vget_low_s16(low3), v_weight); } else { for j in 0..bounds_size { let py = bounds.start + j; diff --git a/src/plane_u8.rs b/src/plane_u8.rs index b448b41..0cd736e 100644 --- a/src/plane_u8.rs +++ b/src/plane_u8.rs @@ -96,7 +96,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 1> { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { // For more downscaling better to use more precise version - if _scale_factor < 8. && std::arch::is_aarch64_feature_detected!("rdm") { + if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { _dispatcher = convolve_vertical_neon_i16_precision; } else { _dispatcher = convolve_vertical_neon_i32_precision; diff --git a/src/resize_ar30.rs b/src/resize_ar30.rs new file mode 100644 index 0000000..bf36dd2 --- /dev/null +++ b/src/resize_ar30.rs @@ -0,0 +1,128 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +use crate::dispatch_group_ar30::{ + convolve_horizontal_dispatch_ar30, convolve_vertical_dispatch_ar30, +}; +use crate::nearest_sampler::resize_nearest; +use crate::pic_scale_error::PicScaleError; +use crate::support::check_image_size_overflow; +use crate::{ImageSize, ResamplingFunction, Scaler}; + +pub(crate) fn resize_ar30_impl( + src: &[u32], + src_size: ImageSize, + dst: &mut [u32], + dst_size: ImageSize, + scaler: &Scaler, +) -> Result<(), PicScaleError> { + if src_size.width == 0 || src_size.height == 0 || dst_size.width == 0 || dst_size.height == 0 { + return Err(PicScaleError::ZeroImageDimensions); + } + + if check_image_size_overflow(src_size.width, src_size.height, 1) { + return Err(PicScaleError::SourceImageIsTooLarge); + } + + if check_image_size_overflow(dst_size.width, dst_size.height, 1) { + return Err(PicScaleError::DestinationImageIsTooLarge); + } + + if src_size.width == dst_size.width && src_size.height == dst_size.height { + for (src, dst) in src.iter().zip(dst.iter_mut()) { + *dst = *src; + } + return Ok(()); + } + + let pool = scaler + .threading_policy + .get_pool(ImageSize::new(dst_size.width, dst_size.height)); + + if scaler.function == ResamplingFunction::Nearest { + resize_nearest::( + src, + src_size.width, + src_size.height, + dst, + dst_size.width, + dst_size.height, + &pool, + ); + return Ok(()); + } + + let should_do_horizontal = src_size.width != dst_size.width; + let should_do_vertical = src_size.height != dst_size.height; + assert!(should_do_horizontal || should_do_vertical); + + if should_do_vertical && !should_do_horizontal { + let vertical_filters = scaler.generate_weights(src_size.height, dst_size.height); + convolve_vertical_dispatch_ar30::( + src, + src_size.width, + vertical_filters, + dst, + src_size.width, + &pool, + ); + return Ok(()); + } + + let working_store = if should_do_vertical { + let mut target = vec![0u32; src_size.width * dst_size.height]; + + let vertical_filters = scaler.generate_weights(src_size.height, dst_size.height); + convolve_vertical_dispatch_ar30::( + src, + src_size.width, + vertical_filters, + &mut target, + src_size.width, + &pool, + ); + + std::borrow::Cow::Owned(target) + } else { + std::borrow::Cow::Borrowed(src) + }; + + if should_do_horizontal { + let horizontal_filters = scaler.generate_weights(src_size.width, dst_size.width); + convolve_horizontal_dispatch_ar30::( + working_store.as_ref(), + src_size.width, + horizontal_filters, + dst, + dst_size.width, + &pool, + ); + } + + Ok(()) +} diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs 
index 1ec37e7..85ce1e5 100644 --- a/src/rgb_u8.rs +++ b/src/rgb_u8.rs @@ -96,7 +96,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 3> { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { // For more downscaling better to use more precise version - if _scale_factor < 8. && std::arch::is_aarch64_feature_detected!("rdm") { + if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { _dispatcher = convolve_vertical_neon_i16_precision; } else { _dispatcher = convolve_vertical_neon_i32_precision; diff --git a/src/rgba_u8.rs b/src/rgba_u8.rs index ac570d2..597ffe4 100644 --- a/src/rgba_u8.rs +++ b/src/rgba_u8.rs @@ -65,7 +65,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 4> { handle_fixed_row_u8::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - if _scale_factor < 8. && std::arch::is_aarch64_feature_detected!("rdm") { + if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4_u8_i16); _dispatcher_1_row = convolve_horizontal_rgba_neon_row_i16; } else { @@ -110,7 +110,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 4> { #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { // For more downscaling better to use more precise version - if _scale_factor < 8. && std::arch::is_aarch64_feature_detected!("rdm") { + if _scale_factor < 8. && crate::cpu_features::is_aarch_rdm_supported() { _dispatcher = convolve_vertical_neon_i16_precision; } else { _dispatcher = convolve_vertical_neon_i32_precision; diff --git a/src/saturate_narrow.rs b/src/saturate_narrow.rs index c856cc1..74949d3 100644 --- a/src/saturate_narrow.rs +++ b/src/saturate_narrow.rs @@ -29,7 +29,7 @@ use crate::support::PRECISION; -pub trait SaturateNarrow { +pub(crate) trait SaturateNarrow { fn saturate_narrow(self, bit_depth: u32) -> J; } diff --git a/src/scaler.rs b/src/scaler.rs index 2f57e69..513183a 100644 --- a/src/scaler.rs +++ b/src/scaler.rs @@ -30,12 +30,14 @@ use crate::alpha_check::{ has_non_constant_cap_alpha_rgba16, has_non_constant_cap_alpha_rgba8, has_non_constant_cap_alpha_rgba_f32, }; +use crate::ar30::{Ar30ByteOrder, Rgb30}; use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::image_size::ImageSize; use crate::image_store::ImageStore; use crate::nearest_sampler::resize_nearest; use crate::pic_scale_error::PicScaleError; +use crate::resize_ar30::resize_ar30_impl; use crate::support::check_image_size_overflow; use crate::threading_policy::ThreadingPolicy; use crate::{ConstPI, ConstSqrt2, Jinc, ResamplingFunction}; @@ -907,6 +909,10 @@ impl ScalingU16 for Scaler { return Err(PicScaleError::UnsupportedBitDepth(bit_depth)); } + if store.width == new_size.width && store.height == new_size.height { + return Ok(store.copied()); + } + let should_do_horizontal = store.width != new_size.width; let should_do_vertical = store.height != new_size.height; assert!(should_do_horizontal || should_do_vertical); @@ -1148,3 +1154,61 @@ impl ScalingU16 for Scaler { Ok(src_store) } } + +impl Scaler { + /// Resizes RGBA2101010 image + /// + /// # Arguments + /// `src` - source slice + /// `src_size` - Source Image size + /// `dst` - destination slice + /// `new_size` - New image size + /// + pub fn resize_ar30( + &self, + src: &[u32], + src_size: ImageSize, + dst: &mut [u32], + new_size: ImageSize, + order: Ar30ByteOrder, + ) -> Result<(), PicScaleError> { + match order { + Ar30ByteOrder::Host 
=> resize_ar30_impl::< + { Rgb30::Ar30 as usize }, + { Ar30ByteOrder::Host as usize }, + >(src, src_size, dst, new_size, self), + Ar30ByteOrder::Network => resize_ar30_impl::< + { Rgb30::Ar30 as usize }, + { Ar30ByteOrder::Network as usize }, + >(src, src_size, dst, new_size, self), + } + } + + /// Resizes RGBA1010102 image + /// + /// # Arguments + /// `src` - source slice + /// `src_size` - Source Image size + /// `dst` - destination slice + /// `new_size` - New image size + /// + pub fn resize_ra30( + &self, + src: &[u32], + src_size: ImageSize, + dst: &mut [u32], + new_size: ImageSize, + order: Ar30ByteOrder, + ) -> Result<(), PicScaleError> { + match order { + Ar30ByteOrder::Host => resize_ar30_impl::< + { Rgb30::Ra30 as usize }, + { Ar30ByteOrder::Host as usize }, + >(src, src_size, dst, new_size, self), + Ar30ByteOrder::Network => resize_ar30_impl::< + { Rgb30::Ra30 as usize }, + { Ar30ByteOrder::Network as usize }, + >(src, src_size, dst, new_size, self), + } + } +} diff --git a/src/sse/alpha_f16.rs b/src/sse/alpha_f16.rs index ebe54eb..b5ed21d 100644 --- a/src/sse/alpha_f16.rs +++ b/src/sse/alpha_f16.rs @@ -38,7 +38,7 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -pub fn sse_premultiply_alpha_rgba_f16( +pub(crate) fn sse_premultiply_alpha_rgba_f16( dst: &mut [half::f16], src: &[half::f16], width: usize, @@ -156,7 +156,7 @@ unsafe fn sse_premultiply_alpha_rgba_f16_impl( } } -pub fn sse_unpremultiply_alpha_rgba_f16( +pub(crate) fn sse_unpremultiply_alpha_rgba_f16( in_place: &mut [half::f16], width: usize, height: usize, diff --git a/src/sse/alpha_f32.rs b/src/sse/alpha_f32.rs index 2a75274..def96a0 100644 --- a/src/sse/alpha_f32.rs +++ b/src/sse/alpha_f32.rs @@ -44,7 +44,7 @@ unsafe fn sse_unpremultiply_row_f32(x: __m128, a: __m128) -> __m128 { _mm_blendv_ps(rs, _mm_setzero_ps(), is_zero_mask) } -pub fn sse_unpremultiply_alpha_rgba_f32( +pub(crate) fn sse_unpremultiply_alpha_rgba_f32( in_place: &mut [f32], width: usize, height: usize, @@ -106,7 +106,7 @@ unsafe fn sse_unpremultiply_alpha_rgba_f32_impl( } } -pub fn sse_premultiply_alpha_rgba_f32( +pub(crate) fn sse_premultiply_alpha_rgba_f32( dst: &mut [f32], src: &[f32], width: usize, diff --git a/src/sse/alpha_u16.rs b/src/sse/alpha_u16.rs index b8d0836..9cde8aa 100644 --- a/src/sse/alpha_u16.rs +++ b/src/sse/alpha_u16.rs @@ -65,7 +65,7 @@ unsafe fn sse_unpremultiply_row_u16( } #[inline(always)] -pub unsafe fn _mm_div_by_1023_epi32(v: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_div_by_1023_epi32(v: __m128i) -> __m128i { const DIVIDING_BY: i32 = 10; let addition = _mm_set1_epi32(1 << (DIVIDING_BY - 1)); let v = _mm_add_epi32(v, addition); @@ -73,7 +73,7 @@ pub unsafe fn _mm_div_by_1023_epi32(v: __m128i) -> __m128i { } #[inline(always)] -pub unsafe fn _mm_div_by_4095_epi32(v: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_div_by_4095_epi32(v: __m128i) -> __m128i { const DIVIDING_BY: i32 = 12; let addition = _mm_set1_epi32(1 << (DIVIDING_BY - 1)); let v = _mm_add_epi32(v, addition); @@ -81,14 +81,14 @@ pub unsafe fn _mm_div_by_4095_epi32(v: __m128i) -> __m128i { } #[inline(always)] -pub unsafe fn _mm_div_by_65535_epi32(v: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_div_by_65535_epi32(v: __m128i) -> __m128i { const DIVIDING_BY: i32 = 16; let addition = _mm_set1_epi32(1 << (DIVIDING_BY - 1)); let v = _mm_add_epi32(v, addition); _mm_srli_epi32::(_mm_add_epi32(v, _mm_srli_epi32::(v))) } -pub fn unpremultiply_alpha_sse_rgba_u16( +pub(crate) fn unpremultiply_alpha_sse_rgba_u16( in_place: &mut 
[u16], width: usize, height: usize, @@ -198,7 +198,7 @@ unsafe fn sse_premultiply_row_u16( _mm_packs_epi32(new_lo, new_hi) } -pub fn premultiply_alpha_sse_rgba_u16( +pub(crate) fn premultiply_alpha_sse_rgba_u16( dst: &mut [u16], src: &[u16], width: usize, diff --git a/src/sse/alpha_u8.rs b/src/sse/alpha_u8.rs index 8c3c3f6..f194299 100644 --- a/src/sse/alpha_u8.rs +++ b/src/sse/alpha_u8.rs @@ -38,7 +38,11 @@ use std::arch::x86::*; use std::arch::x86_64::*; #[inline(always)] -pub unsafe fn _mm_select_si128(mask: __m128i, true_vals: __m128i, false_vals: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_select_si128( + mask: __m128i, + true_vals: __m128i, + false_vals: __m128i, +) -> __m128i { _mm_or_si128( _mm_and_si128(mask, true_vals), _mm_andnot_si128(mask, false_vals), @@ -46,7 +50,7 @@ pub unsafe fn _mm_select_si128(mask: __m128i, true_vals: __m128i, false_vals: __ } #[inline(always)] -pub unsafe fn _mm_div_by_255_epi16(v: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_div_by_255_epi16(v: __m128i) -> __m128i { let addition = _mm_set1_epi16(127); _mm_srli_epi16::<8>(_mm_add_epi16( _mm_add_epi16(v, addition), @@ -55,7 +59,7 @@ pub unsafe fn _mm_div_by_255_epi16(v: __m128i) -> __m128i { } #[inline(always)] -pub unsafe fn sse_unpremultiply_row(x: __m128i, a: __m128i) -> __m128i { +pub(crate) unsafe fn sse_unpremultiply_row(x: __m128i, a: __m128i) -> __m128i { let zeros = _mm_setzero_si128(); let lo = _mm_cvtepu8_epi16(x); let hi = _mm_unpackhi_epi8(x, zeros); @@ -90,7 +94,7 @@ pub unsafe fn sse_unpremultiply_row(x: __m128i, a: __m128i) -> __m128i { _mm_select_si128(is_zero_mask, _mm_setzero_si128(), _mm_packus_epi16(lo, hi)) } -pub fn sse_premultiply_alpha_rgba( +pub(crate) fn sse_premultiply_alpha_rgba( dst: &mut [u8], src: &[u8], width: usize, @@ -185,7 +189,7 @@ unsafe fn sse_premultiply_alpha_rgba_impl( } } -pub fn sse_unpremultiply_alpha_rgba( +pub(crate) fn sse_unpremultiply_alpha_rgba( in_place: &mut [u8], width: usize, height: usize, diff --git a/src/sse/f16_utils.rs b/src/sse/f16_utils.rs index 6c1d905..7f7a8e1 100644 --- a/src/sse/f16_utils.rs +++ b/src/sse/f16_utils.rs @@ -34,13 +34,13 @@ use std::arch::x86_64::*; #[inline] #[cfg(target_feature = "avx2")] -pub unsafe fn _mm_srlv_epi32x(c: __m128i, n: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_srlv_epi32x(c: __m128i, n: __m128i) -> __m128i { _mm_srlv_epi32(c, n) } #[inline] #[cfg(not(target_feature = "avx2"))] -pub unsafe fn _mm_srlv_epi32x(c: __m128i, n: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_srlv_epi32x(c: __m128i, n: __m128i) -> __m128i { _mm_setr_epi32( _mm_extract_epi32::<0>(c).wrapping_shr(_mm_extract_epi32::<0>(n) as u32), _mm_extract_epi32::<1>(c).wrapping_shr(_mm_extract_epi32::<1>(n) as u32), @@ -51,13 +51,13 @@ pub unsafe fn _mm_srlv_epi32x(c: __m128i, n: __m128i) -> __m128i { #[inline] #[cfg(target_feature = "avx2")] -pub unsafe fn _mm_sllv_epi32x(c: __m128i, n: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_sllv_epi32x(c: __m128i, n: __m128i) -> __m128i { _mm_sllv_epi32(c, n) } #[inline] #[cfg(not(target_feature = "avx2"))] -pub unsafe fn _mm_sllv_epi32x(c: __m128i, n: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_sllv_epi32x(c: __m128i, n: __m128i) -> __m128i { _mm_setr_epi32( _mm_extract_epi32::<0>(c).wrapping_shl(_mm_extract_epi32::<0>(n) as u32), _mm_extract_epi32::<1>(c).wrapping_shl(_mm_extract_epi32::<1>(n) as u32), @@ -67,7 +67,7 @@ pub unsafe fn _mm_sllv_epi32x(c: __m128i, n: __m128i) -> __m128i { } #[inline(always)] -pub unsafe fn _mm_blendv_epi32(xmm0: __m128i, xmm1: __m128i, mask: __m128i) 
-> __m128i { +pub(crate) unsafe fn _mm_blendv_epi32(xmm0: __m128i, xmm1: __m128i, mask: __m128i) -> __m128i { _mm_castps_si128(_mm_blendv_ps( _mm_castsi128_ps(xmm0), _mm_castsi128_ps(xmm1), @@ -77,7 +77,11 @@ pub unsafe fn _mm_blendv_epi32(xmm0: __m128i, xmm1: __m128i, mask: __m128i) -> _ #[inline(always)] /// If mask then `true_vals` otherwise `false_val` -pub unsafe fn _mm_select_epi32(mask: __m128i, true_vals: __m128i, false_vals: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_select_epi32( + mask: __m128i, + true_vals: __m128i, + false_vals: __m128i, +) -> __m128i { _mm_blendv_epi32(false_vals, true_vals, mask) } @@ -181,7 +185,7 @@ unsafe fn _mm_cvtps_phdx(x: __m128) -> __m128i { } #[inline] -pub unsafe fn _mm_cvtps_phx(x: __m128) -> __m128i { +pub(crate) unsafe fn _mm_cvtps_phx(x: __m128) -> __m128i { if F16C { _mm_cvtps_phdx(x) } else { @@ -196,7 +200,7 @@ unsafe fn _mm_cvtph_psdx(x: __m128i) -> __m128 { } #[inline] -pub unsafe fn _mm_cvtph_psx(x: __m128i) -> __m128 { +pub(crate) unsafe fn _mm_cvtph_psx(x: __m128i) -> __m128 { if F16C { _mm_cvtph_ps(x) } else { diff --git a/src/sse/mod.rs b/src/sse/mod.rs index ab0dcc0..6726e82 100644 --- a/src/sse/mod.rs +++ b/src/sse/mod.rs @@ -59,51 +59,57 @@ mod vertical_u8; mod vertical_u8_lp; #[cfg(feature = "half")] -pub use alpha_f16::{sse_premultiply_alpha_rgba_f16, sse_unpremultiply_alpha_rgba_f16}; -pub use alpha_f32::sse_premultiply_alpha_rgba_f32; -pub use alpha_f32::sse_unpremultiply_alpha_rgba_f32; -pub use alpha_u16::{premultiply_alpha_sse_rgba_u16, unpremultiply_alpha_sse_rgba_u16}; -pub use alpha_u8::{ +pub(crate) use alpha_f16::{sse_premultiply_alpha_rgba_f16, sse_unpremultiply_alpha_rgba_f16}; +pub(crate) use alpha_f32::sse_premultiply_alpha_rgba_f32; +pub(crate) use alpha_f32::sse_unpremultiply_alpha_rgba_f32; +pub(crate) use alpha_u16::{premultiply_alpha_sse_rgba_u16, unpremultiply_alpha_sse_rgba_u16}; +pub(crate) use alpha_u8::{ _mm_div_by_255_epi16, sse_premultiply_alpha_rgba, sse_unpremultiply_alpha_rgba, sse_unpremultiply_row, }; -pub use plane_f32::convolve_horizontal_plane_sse_row_one; -pub use plane_f32::convolve_horizontal_plane_sse_rows_4; -pub use plane_u8::{convolve_horizontal_plane_sse_row, convolve_horizontal_plane_sse_rows_4_u8}; +pub(crate) use plane_f32::convolve_horizontal_plane_sse_row_one; +pub(crate) use plane_f32::convolve_horizontal_plane_sse_rows_4; +pub(crate) use plane_u8::{ + convolve_horizontal_plane_sse_row, convolve_horizontal_plane_sse_rows_4_u8, +}; #[cfg(feature = "half")] -pub use rgb_f16::{ +pub(crate) use rgb_f16::{ convolve_horizontal_rgb_sse_row_one_f16, convolve_horizontal_rgb_sse_rows_4_f16, }; -pub use rgb_f32::{ +pub(crate) use rgb_f32::{ convolve_horizontal_rgb_sse_row_one_f32, convolve_horizontal_rgb_sse_rows_4_f32, }; -pub use rgb_u8::*; +pub(crate) use rgb_u8::*; #[cfg(feature = "half")] -pub use rgba_f16::{ +pub(crate) use rgba_f16::{ convolve_horizontal_rgba_sse_row_one_f16, convolve_horizontal_rgba_sse_rows_4_f16, }; -pub use rgba_f32::{ +pub(crate) use rgba_f32::{ convolve_horizontal_rgba_sse_row_one_f32, convolve_horizontal_rgba_sse_rows_4_f32, }; -pub use rgba_u16::{convolve_horizontal_rgba_sse_rows_4_u16, convolve_horizontal_rgba_sse_u16_row}; -pub use rgba_u16_lb::{ +pub(crate) use rgba_u16::{ + convolve_horizontal_rgba_sse_rows_4_u16, convolve_horizontal_rgba_sse_u16_row, +}; +pub(crate) use rgba_u16_lb::{ convolve_horizontal_rgba_sse_rows_4_lb_u8, convolve_horizontal_rgba_sse_u16_lb_row, }; -pub use rgba_u8::{convolve_horizontal_rgba_sse_rows_4, 
convolve_horizontal_rgba_sse_rows_one}; -pub use rgba_u8_lb::{ +pub(crate) use rgba_u8::{ + convolve_horizontal_rgba_sse_rows_4, convolve_horizontal_rgba_sse_rows_one, +}; +pub(crate) use rgba_u8_lb::{ convolve_horizontal_rgba_sse_rows_4_lb, convolve_horizontal_rgba_sse_rows_one_lb, }; pub(crate) use routines::{load_4_weights, load_4_weights_group_2_avx, load_8_weights_group_4_avx}; -pub use u8_utils::*; -pub use utils::*; +pub(crate) use u8_utils::*; +pub(crate) use utils::*; #[cfg(feature = "half")] -pub use vertical_f16::convolve_vertical_sse_row_f16; -pub use vertical_f32::convolve_vertical_rgb_sse_row_f32; -pub use vertical_u16::convolve_column_sse_u16; -pub use vertical_u16_lb::convolve_column_lb_sse_u16; -pub use vertical_u8::convolve_vertical_sse_row; -pub use vertical_u8_lp::convolve_vertical_sse_row_lp; +pub(crate) use vertical_f16::convolve_vertical_sse_row_f16; +pub(crate) use vertical_f32::convolve_vertical_rgb_sse_row_f32; +pub(crate) use vertical_u16::convolve_column_sse_u16; +pub(crate) use vertical_u16_lb::convolve_column_lb_sse_u16; +pub(crate) use vertical_u8::convolve_vertical_sse_row; +pub(crate) use vertical_u8_lp::convolve_vertical_sse_row_lp; -pub const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 { +pub(crate) const fn shuffle(z: u32, y: u32, x: u32, w: u32) -> i32 { ((z << 6) | (y << 4) | (x << 2) | w) as i32 } diff --git a/src/sse/plane_f32.rs b/src/sse/plane_f32.rs index 55f32c7..7b619eb 100644 --- a/src/sse/plane_f32.rs +++ b/src/sse/plane_f32.rs @@ -97,7 +97,7 @@ macro_rules! conv_horiz_plane_1_f32 { }}; } -pub fn convolve_horizontal_plane_sse_row_one( +pub(crate) fn convolve_horizontal_plane_sse_row_one( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, @@ -245,7 +245,7 @@ unsafe fn convolve_horizontal_plane_sse_row_one_impl( } } -pub fn convolve_horizontal_plane_sse_rows_4( +pub(crate) fn convolve_horizontal_plane_sse_rows_4( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, diff --git a/src/sse/plane_u8.rs b/src/sse/plane_u8.rs index 58fc4ea..6f275f2 100644 --- a/src/sse/plane_u8.rs +++ b/src/sse/plane_u8.rs @@ -67,7 +67,7 @@ macro_rules! 
s_accumulate_1_horiz { }}; } -pub fn convolve_horizontal_plane_sse_rows_4_u8( +pub(crate) fn convolve_horizontal_plane_sse_rows_4_u8( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -213,7 +213,7 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_u8_impl( } } -pub fn convolve_horizontal_plane_sse_row( +pub(crate) fn convolve_horizontal_plane_sse_row( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, diff --git a/src/sse/rgb_f16.rs b/src/sse/rgb_f16.rs index 1ebb02c..18ba209 100644 --- a/src/sse/rgb_f16.rs +++ b/src/sse/rgb_f16.rs @@ -146,7 +146,7 @@ unsafe fn convolve_horizontal_parts_one_rgb_f16( +pub(crate) fn convolve_horizontal_rgb_sse_row_one_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, @@ -317,7 +317,7 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f16_impl( +pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, diff --git a/src/sse/rgb_f32.rs b/src/sse/rgb_f32.rs index 53bf992..918daed 100644 --- a/src/sse/rgb_f32.rs +++ b/src/sse/rgb_f32.rs @@ -106,7 +106,7 @@ unsafe fn convolve_horizontal_parts_one_rgb_f32( _mm_prefer_fma_ps::(store_0, rgb_pixel, weight0) } -pub fn convolve_horizontal_rgb_sse_row_one_f32( +pub(crate) fn convolve_horizontal_rgb_sse_row_one_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, @@ -247,7 +247,7 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f32_impl( } } -pub fn convolve_horizontal_rgb_sse_rows_4_f32( +pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, diff --git a/src/sse/rgb_u8.rs b/src/sse/rgb_u8.rs index a79e7b5..ee6e357 100644 --- a/src/sse/rgb_u8.rs +++ b/src/sse/rgb_u8.rs @@ -36,7 +36,7 @@ use crate::filter_weights::FilterWeights; use crate::sse::{compress_i32, convolve_horizontal_parts_one_sse_rgb, shuffle}; use crate::support::ROUNDING_CONST; -pub fn convolve_horizontal_rgb_sse_rows_4( +pub(crate) fn convolve_horizontal_rgb_sse_rows_4( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -245,7 +245,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_impl( } } -pub fn convolve_horizontal_rgb_sse_row_one( +pub(crate) fn convolve_horizontal_rgb_sse_row_one( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, diff --git a/src/sse/rgba_f16.rs b/src/sse/rgba_f16.rs index f186de0..6b7b1e7 100644 --- a/src/sse/rgba_f16.rs +++ b/src/sse/rgba_f16.rs @@ -103,7 +103,7 @@ unsafe fn convolve_horizontal_parts_2_rgba_f16( +pub(crate) fn convolve_horizontal_rgba_sse_row_one_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, @@ -276,7 +276,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f16_impl( +pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, diff --git a/src/sse/rgba_f32.rs b/src/sse/rgba_f32.rs index 8a3ab2d..9b4b244 100644 --- a/src/sse/rgba_f32.rs +++ b/src/sse/rgba_f32.rs @@ -47,7 +47,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_f32( _mm_prefer_fma_ps::(store_0, rgb_pixel, weight0) } -pub fn convolve_horizontal_rgba_sse_row_one_f32( +pub(crate) fn convolve_horizontal_rgba_sse_row_one_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, @@ -229,7 +229,7 @@ unsafe fn convolve_horizontal_parts_2_rgba_f32( _mm_prefer_fma_ps::(acc, rgb_pixel_1, weight1) } -pub fn convolve_horizontal_rgba_sse_rows_4_f32( +pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, diff 
--git a/src/sse/rgba_u16.rs b/src/sse/rgba_u16.rs index 2e499e3..79ec664 100644 --- a/src/sse/rgba_u16.rs +++ b/src/sse/rgba_u16.rs @@ -175,7 +175,7 @@ unsafe fn conv_horiz_rgba_8_u16( acc } -pub fn convolve_horizontal_rgba_sse_rows_4_u16( +pub(crate) fn convolve_horizontal_rgba_sse_rows_4_u16( src: &[u16], src_stride: usize, dst: &mut [u16], @@ -392,7 +392,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_u16_impl( } } -pub fn convolve_horizontal_rgba_sse_u16_row( +pub(crate) fn convolve_horizontal_rgba_sse_u16_row( src: &[u16], dst: &mut [u16], filter_weights: &FilterWeights, diff --git a/src/sse/rgba_u16_lb.rs b/src/sse/rgba_u16_lb.rs index 6a5c715..f2d5974 100644 --- a/src/sse/rgba_u16_lb.rs +++ b/src/sse/rgba_u16_lb.rs @@ -162,7 +162,7 @@ unsafe fn conv_horiz_rgba_8_u16( acc } -pub fn convolve_horizontal_rgba_sse_rows_4_lb_u8( +pub(crate) fn convolve_horizontal_rgba_sse_rows_4_lb_u8( src: &[u16], src_stride: usize, dst: &mut [u16], @@ -320,7 +320,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_lb_u8_impl( } } -pub fn convolve_horizontal_rgba_sse_u16_lb_row( +pub(crate) fn convolve_horizontal_rgba_sse_u16_lb_row( src: &[u16], dst: &mut [u16], filter_weights: &FilterWeights, diff --git a/src/sse/rgba_u8.rs b/src/sse/rgba_u8.rs index c5c34ba..e41d35f 100644 --- a/src/sse/rgba_u8.rs +++ b/src/sse/rgba_u8.rs @@ -52,7 +52,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_sse( _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(lo), weight0)) } -pub fn convolve_horizontal_rgba_sse_rows_4( +pub(crate) fn convolve_horizontal_rgba_sse_rows_4( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -244,7 +244,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( } } -pub fn convolve_horizontal_rgba_sse_rows_one( +pub(crate) fn convolve_horizontal_rgba_sse_rows_one( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, diff --git a/src/sse/rgba_u8_lb.rs b/src/sse/rgba_u8_lb.rs index 2c339e7..1cef21a 100644 --- a/src/sse/rgba_u8_lb.rs +++ b/src/sse/rgba_u8_lb.rs @@ -50,7 +50,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_sse( _mm_add_epi16(store_0, _mm_mulhi_epi16(lo, weight0)) } -pub fn convolve_horizontal_rgba_sse_rows_4_lb( +pub(crate) fn convolve_horizontal_rgba_sse_rows_4_lb( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -385,7 +385,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( } } -pub fn convolve_horizontal_rgba_sse_rows_one_lb( +pub(crate) fn convolve_horizontal_rgba_sse_rows_one_lb( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, diff --git a/src/sse/u8_utils.rs b/src/sse/u8_utils.rs index 706776d..3605d2a 100644 --- a/src/sse/u8_utils.rs +++ b/src/sse/u8_utils.rs @@ -35,7 +35,7 @@ use std::arch::x86_64::*; use crate::support::PRECISION; #[inline(always)] -pub fn compress_i32(x: __m128i) -> __m128i { +pub(crate) fn compress_i32(x: __m128i) -> __m128i { let store_32 = unsafe { _mm_srai_epi32::(_mm_max_epi32(x, _mm_setzero_si128())) }; let store_16 = unsafe { _mm_packus_epi32(store_32, store_32) }; unsafe { _mm_packus_epi16(store_16, store_16) } diff --git a/src/sse/utils.rs b/src/sse/utils.rs index 023aa0b..e4f50c9 100644 --- a/src/sse/utils.rs +++ b/src/sse/utils.rs @@ -34,8 +34,7 @@ use std::arch::x86::*; use std::arch::x86_64::*; #[inline] -#[target_feature(enable = "sse4.1")] -pub unsafe fn _mm_prefer_fma_ps(a: __m128, b: __m128, c: __m128) -> __m128 { +pub(crate) unsafe fn _mm_prefer_fma_ps(a: __m128, b: __m128, c: __m128) -> __m128 { if FMA { _mm_fma_psx(a, b, c) } else { @@ -44,13 +43,12 @@ pub unsafe fn _mm_prefer_fma_ps(a: __m128, b: 
__m128, c: __m128 } #[inline] -#[target_feature(enable = "sse4.1,fma")] unsafe fn _mm_fma_psx(a: __m128, b: __m128, c: __m128) -> __m128 { _mm_fmadd_ps(b, c, a) } #[inline(always)] -pub unsafe fn sse_deinterleave_rgba_ps( +pub(crate) unsafe fn sse_deinterleave_rgba_ps( v0: __m128, v1: __m128, v2: __m128, @@ -68,7 +66,7 @@ pub unsafe fn sse_deinterleave_rgba_ps( } #[inline(always)] -pub unsafe fn sse_interleave_rgba_ps( +pub(crate) unsafe fn sse_interleave_rgba_ps( v0: __m128, v1: __m128, v2: __m128, @@ -87,7 +85,7 @@ pub unsafe fn sse_interleave_rgba_ps( } #[inline(always)] -pub unsafe fn sse_deinterleave_rgba( +pub(crate) unsafe fn sse_deinterleave_rgba( rgba0: __m128i, rgba1: __m128i, rgba2: __m128i, @@ -124,7 +122,7 @@ pub unsafe fn sse_deinterleave_rgba( } #[inline(always)] -pub unsafe fn sse_interleave_rgba( +pub(crate) unsafe fn sse_interleave_rgba( r: __m128i, g: __m128i, b: __m128i, @@ -144,7 +142,7 @@ pub unsafe fn sse_interleave_rgba( /// Sums all lanes in float32 #[inline(always)] -pub unsafe fn _mm_hsum_ps(v: __m128) -> f32 { +pub(crate) unsafe fn _mm_hsum_ps(v: __m128) -> f32 { let mut shuf = _mm_movehdup_ps(v); let mut sums = _mm_add_ps(v, shuf); shuf = _mm_movehl_ps(shuf, sums); @@ -154,7 +152,7 @@ pub unsafe fn _mm_hsum_ps(v: __m128) -> f32 { #[inline(always)] #[allow(dead_code)] -pub unsafe fn sse_deinterleave_rgba_epi16( +pub(crate) unsafe fn sse_deinterleave_rgba_epi16( rgba0: __m128i, rgba1: __m128i, rgba2: __m128i, @@ -179,7 +177,7 @@ pub unsafe fn sse_deinterleave_rgba_epi16( #[inline(always)] #[allow(dead_code)] -pub unsafe fn sse_interleave_rgba_epi16( +pub(crate) unsafe fn sse_interleave_rgba_epi16( a: __m128i, b: __m128i, c: __m128i, @@ -218,7 +216,7 @@ pub(crate) unsafe fn _mm_muladd_wide_epi16(a: __m128i, b: __m128i, c: __m128i) - #[inline] /// Arithmetic shift for i64, shifting with sign bits -pub unsafe fn _mm_srai_epi64x(a: __m128i) -> __m128i { +pub(crate) unsafe fn _mm_srai_epi64x(a: __m128i) -> __m128i { let m = _mm_set1_epi64x(1 << (64 - 1)); let x = _mm_srli_epi64::(a); _mm_sub_epi64(_mm_xor_si128(x, m), m) @@ -235,7 +233,7 @@ pub(crate) unsafe fn _mm_packus_epi64(a: __m128i, b: __m128i) -> __m128i { #[inline(always)] /// Extracts i64 value -pub unsafe fn _mm_extract_epi64x(d: __m128i) -> i64 { +pub(crate) unsafe fn _mm_extract_epi64x(d: __m128i) -> i64 { #[cfg(target_arch = "x86_64")] { if IMM == 0 { @@ -259,7 +257,7 @@ pub unsafe fn _mm_extract_epi64x(d: __m128i) -> i64 { } #[inline] -pub unsafe fn _mm_store3_u16(ptr: *mut u16, a: __m128i) { +pub(crate) unsafe fn _mm_store3_u16(ptr: *mut u16, a: __m128i) { let low_pixel = _mm_extract_epi32::<0>(a); (ptr as *mut i32).write_unaligned(low_pixel); (ptr as *mut i16) diff --git a/src/sse/vertical_f16.rs b/src/sse/vertical_f16.rs index 9e400ef..50e0ede 100644 --- a/src/sse/vertical_f16.rs +++ b/src/sse/vertical_f16.rs @@ -189,7 +189,11 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8_f16( +pub(crate) fn convolve_vertical_sse_row_f16< + const CHANNELS: usize, + const F16C: bool, + const FMA: bool, +>( width: usize, bounds: &FilterBounds, unsafe_source_ptr_0: *const half::f16, diff --git a/src/sse/vertical_f32.rs b/src/sse/vertical_f32.rs index 3b513ab..d00dfda 100644 --- a/src/sse/vertical_f32.rs +++ b/src/sse/vertical_f32.rs @@ -219,7 +219,7 @@ pub(crate) unsafe fn convolve_vertical_part_sse_f32( (dst_ptr as *mut i32).write_unaligned(_mm_extract_ps::<0>(store_0)); } -pub fn convolve_vertical_rgb_sse_row_f32( +pub(crate) fn convolve_vertical_rgb_sse_row_f32( width: usize, bounds: &FilterBounds, 
unsafe_source_ptr_0: *const f32, diff --git a/src/sse/vertical_u16.rs b/src/sse/vertical_u16.rs index b9100b2..2fde19b 100644 --- a/src/sse/vertical_u16.rs +++ b/src/sse/vertical_u16.rs @@ -37,7 +37,7 @@ use std::arch::x86_64::*; const ROUNDING: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; -pub fn convolve_column_sse_u16( +pub(crate) fn convolve_column_sse_u16( _: usize, bounds: &FilterBounds, src: &[u16], diff --git a/src/sse/vertical_u16_lb.rs b/src/sse/vertical_u16_lb.rs index a35f950..9715cd7 100644 --- a/src/sse/vertical_u16_lb.rs +++ b/src/sse/vertical_u16_lb.rs @@ -34,7 +34,7 @@ use std::arch::x86::*; use std::arch::x86_64::*; #[inline(always)] -pub fn convolve_column_lb_sse_u16( +pub(crate) fn convolve_column_lb_sse_u16( _: usize, bounds: &FilterBounds, src: &[u16], diff --git a/src/sse/vertical_u8.rs b/src/sse/vertical_u8.rs index d5726a0..5b66bf4 100644 --- a/src/sse/vertical_u8.rs +++ b/src/sse/vertical_u8.rs @@ -631,7 +631,7 @@ pub(crate) unsafe fn convolve_vertical_part_sse( *dst_ptr = _mm_extract_epi8::<0>(item) as u8; } -pub fn convolve_vertical_sse_row( +pub(crate) fn convolve_vertical_sse_row( dst_width: usize, bounds: &FilterBounds, src: &[u8], diff --git a/src/sse/vertical_u8_lp.rs b/src/sse/vertical_u8_lp.rs index ca8621f..d507857 100644 --- a/src/sse/vertical_u8_lp.rs +++ b/src/sse/vertical_u8_lp.rs @@ -32,7 +32,7 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -pub fn convolve_vertical_sse_row_lp( +pub(crate) fn convolve_vertical_sse_row_lp( dst_width: usize, bounds: &FilterBounds, src: &[u8], diff --git a/src/support.rs b/src/support.rs index 430544c..4735c62 100644 --- a/src/support.rs +++ b/src/support.rs @@ -27,8 +27,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #![forbid(unsafe_code)] -pub const PRECISION: i32 = 15; -pub const ROUNDING_CONST: i32 = 1 << (PRECISION - 1); +pub(crate) const PRECISION: i32 = 15; +pub(crate) const ROUNDING_CONST: i32 = 1 << (PRECISION - 1); pub(crate) fn check_image_size_overflow(width: usize, height: usize, chan: usize) -> bool { let (stride, is_overflowed) = width.overflowing_mul(chan); diff --git a/src/threading_policy.rs b/src/threading_policy.rs index ba70111..2f3b9a4 100644 --- a/src/threading_policy.rs +++ b/src/threading_policy.rs @@ -47,20 +47,20 @@ pub enum ThreadingPolicy { impl ThreadingPolicy { #[cfg(not(target_arch = "wasm32"))] - pub fn get_threads_count(&self, for_size: ImageSize) -> usize { + pub fn thread_count(&self, for_size: ImageSize) -> usize { match self { ThreadingPolicy::Single => 1, ThreadingPolicy::Fixed(thread_count) => (*thread_count).max(1), ThreadingPolicy::Adaptive => { let box_size = 256 * 256; let new_box_size = for_size.height * for_size.width; - (new_box_size / box_size).clamp(1, 16) + (new_box_size / box_size).clamp(1, 12) } } } #[cfg(target_arch = "wasm32")] - pub fn get_threads_count(&self, _: ImageSize) -> usize { + pub fn thread_count(&self, _: ImageSize) -> usize { 1 } } @@ -71,9 +71,9 @@ impl ThreadingPolicy { if *self == ThreadingPolicy::Single { return None; } - let threads_count = self.get_threads_count(for_size); + let thread_count = self.thread_count(for_size); match rayon::ThreadPoolBuilder::new() - .num_threads(threads_count) + .num_threads(thread_count) .build() { Ok(pool) => Some(pool), diff --git a/src/unsafe_slice.rs b/src/unsafe_slice.rs index 21339a7..52b4352 100644 --- a/src/unsafe_slice.rs +++ b/src/unsafe_slice.rs @@ -30,7 +30,7 @@ use std::cell::UnsafeCell; #[derive(Copy, Clone)] -pub struct UnsafeSlice<'a, T> { +pub(crate) struct UnsafeSlice<'a, T> { pub slice: &'a [UnsafeCell<T>], } @@ -39,31 +39,31 @@ unsafe impl<T> Send for UnsafeSlice<'_, T> {} unsafe impl<T> Sync for UnsafeSlice<'_, T> {} impl<'a, T> UnsafeSlice<'a, T> { - pub fn new(slice: &'a mut [T]) -> Self { + pub(crate) fn new(slice: &'a mut [T]) -> Self { let ptr = slice as *mut [T] as *const [UnsafeCell<T>]; Self { slice: unsafe { &*ptr }, } } - pub fn mut_ptr(&self) -> *mut T { + pub(crate) fn mut_ptr(&self) -> *mut T { self.slice.as_ptr() as *const T as *mut T } /// SAFETY: It is UB if two threads write to the same index without /// synchronization. #[allow(dead_code)] - pub unsafe fn write(&self, i: usize, value: T) { + pub(crate) unsafe fn write(&self, i: usize, value: T) { let ptr = self.slice[i].get(); *ptr = value; } #[allow(dead_code)] - pub fn get(&self, i: usize) -> &T { + pub(crate) fn get(&self, i: usize) -> &T { let ptr = self.slice[i].get(); unsafe { &*ptr } } #[allow(dead_code)] - pub fn len(&self) -> usize { + pub(crate) fn len(&self) -> usize { self.slice.len() } }
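The dominant change across the NEON hunks is mechanical: instead of materializing one broadcast vector per filter tap with vld1q_dup_s16, the fixed-size paths (bounds_size 2, 3 and 4) keep all taps in a single int16x4_t and select them at the multiply site with lane-indexed multiply-accumulate intrinsics. A minimal standalone sketch of the pattern, with illustrative names that are not the crate's own:

#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;

// Weighted sum of four rows of widened pixels, one fused step per row.
// The lane index is a const generic, so the tap broadcast is folded into
// the MLAL instruction instead of needing its own register and load.
#[cfg(target_arch = "aarch64")]
unsafe fn weighted_sum_4(rows: [int16x4_t; 4], taps: &[i16; 4]) -> int32x4_t {
    let w = vld1_s16(taps.as_ptr()); // one load replaces four vld1q_dup_s16
    let mut acc = vdupq_n_s32(0);
    acc = vmlal_lane_s16::<0>(acc, rows[0], w);
    acc = vmlal_lane_s16::<1>(acc, rows[1], w);
    acc = vmlal_lane_s16::<2>(acc, rows[2], w);
    acc = vmlal_lane_s16::<3>(acc, rows[3], w);
    acc
}

For bounds_size of 2 or 3 the diff fills the vector incrementally with vld1_dup_s16 followed by vld1_lane_s16, which keeps every lane initialized without reading past the end of the weight slice.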
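The multi-register loads and stores (vld1q_u8_x2/x4, vst1q_u8_x2/x4) are swapped for crate-local xv-prefixed wrappers whose definitions are not part of this diff. A plausible minimal equivalent, assumed here to be composed from plain 16-byte operations:

#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;

#[cfg(target_arch = "aarch64")]
unsafe fn xvld1q_u8_x2(ptr: *const u8) -> uint8x16x2_t {
    // Two independent 16-byte loads standing in for the x2 intrinsic.
    uint8x16x2_t(vld1q_u8(ptr), vld1q_u8(ptr.add(16)))
}

#[cfg(target_arch = "aarch64")]
unsafe fn xvst1q_u8_x2(ptr: *mut u8, v: uint8x16x2_t) {
    vst1q_u8(ptr, v.0);
    vst1q_u8(ptr.add(16), v.1);
}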
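In the narrowing epilogue of the eight-pixel tail, the old sequence shifted with vqshrn_n_s32, clamped negatives against a zeros vector with vmaxq_s16, and only then narrowed to u8. vqshrun_n_s32 takes signed input and saturates directly into the unsigned u16 range, so the shift and the clamp-to-zero collapse into one instruction. A standalone model, assuming the elided shift amount is the crate's PRECISION constant (15, per src/support.rs):

#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;

#[cfg(target_arch = "aarch64")]
unsafe fn narrow_accumulators_to_u8(store_0: int32x4_t, store_1: int32x4_t) -> uint8x8_t {
    const PRECISION: i32 = 15; // fixed-point scale, see src/support.rs
    let low_u16 = vcombine_u16(
        vqshrun_n_s32::<PRECISION>(store_0), // shift right, saturate into [0, 65535]
        vqshrun_n_s32::<PRECISION>(store_1),
    );
    vqmovn_u16(low_u16) // saturating narrow u16 -> u8
}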
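The aarch64 dispatchers now consult crate::cpu_features::is_aarch_rdm_supported() instead of expanding std::arch::is_aarch64_feature_detected!("rdm") at every call site; RDM is the extension that gates the vqrdmlahq family used by the i16-precision kernels. The helper's body is outside this diff, so the following is an assumption rather than the crate's actual code: a minimal implementation that caches the probe.

#[cfg(target_arch = "aarch64")]
pub(crate) fn is_aarch_rdm_supported() -> bool {
    use std::sync::OnceLock;
    static RDM_SUPPORTED: OnceLock<bool> = OnceLock::new();
    // Detection runs once; subsequent calls are a single atomic load.
    *RDM_SUPPORTED.get_or_init(|| std::arch::is_aarch64_feature_detected!("rdm"))
}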
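The new resize_ar30/resize_ra30 entry points route packed 10-bit pixels (one u32 word per pixel) through resize_ar30_impl, which reuses the same split of vertical-only, horizontal-only and combined passes as the other formats. A usage sketch; the Scaler constructor and the crate-root re-exports are assumptions based on the crate's public API rather than anything shown in this diff:

use pic_scale::{Ar30ByteOrder, ImageSize, PicScaleError, ResamplingFunction, Scaler};

fn downscale_ar30(src: &[u32], dst: &mut [u32]) -> Result<(), PicScaleError> {
    let scaler = Scaler::new(ResamplingFunction::Lanczos3);
    scaler.resize_ar30(
        src,
        ImageSize::new(1920, 1080), // src must hold 1920 * 1080 u32 words
        dst,
        ImageSize::new(960, 540),
        Ar30ByteOrder::Host, // Network presumably means big-endian words
    )
}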
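The _mm_div_by_255_epi16 and _mm_div_by_1023_epi32 helpers made pub(crate) in this diff all encode the same trick: after an alpha multiply, division by 2^n - 1 is replaced by a rounding bias, a shift-add correction and a final shift. A scalar model of the 8-bit case, with an exhaustive check that it matches round-to-nearest over the whole product range:

fn div_by_255(v: u32) -> u32 {
    let v = v + 128; // rounding bias, 1 << (8 - 1)
    (v + (v >> 8)) >> 8 // the v/256 correction makes the /256 behave like /255
}

fn main() {
    for product in 0u32..=255 * 255 {
        // (x + 127) / 255 is round-to-nearest here; exact ties cannot occur
        // because 255 is odd.
        assert_eq!(div_by_255(product), (product + 127) / 255);
    }
    println!("exact on the full 8-bit premultiply range");
}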
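src/sse/utils.rs keeps the const-generic selection in _mm_prefer_fma_ps between fused and unfused multiply-add (the listing above elides its <const FMA: bool> parameter) while dropping the #[target_feature] attributes, presumably so the function can inline into callers that are already compiled with the matching features. Reconstructed as a standalone sketch:

#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

// Monomorphization makes FMA a compile-time constant, so each instantiation
// contains exactly one of the two paths and no runtime branch.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn prefer_fma_ps<const FMA: bool>(a: __m128, b: __m128, c: __m128) -> __m128 {
    if FMA {
        _mm_fmadd_ps(b, c, a) // fused a + b * c; caller guarantees FMA support
    } else {
        _mm_add_ps(_mm_mul_ps(b, c), a)
    }
}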
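Finally, ThreadingPolicy::Adaptive sizes the rayon pool from the target area measured in 256 x 256 tiles, and the rename to thread_count arrives together with a lower cap of 12 threads instead of 16. A standalone model of the arithmetic:

fn adaptive_thread_count(width: usize, height: usize) -> usize {
    let tile = 256 * 256;
    (width * height / tile).clamp(1, 12)
}

fn main() {
    assert_eq!(adaptive_thread_count(640, 480), 4); // 307200 / 65536 tiles
    assert_eq!(adaptive_thread_count(256, 256), 1); // a single tile
    assert_eq!(adaptive_thread_count(3840, 2160), 12); // 126 tiles, capped
}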