diff --git a/README.md b/README.md
index c101397..f748326 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,27 @@ Supported only NEON and SSE.
 
 This library provides for you some conveniences to scale in different color spaces.
 
+#### Example integration with the `image` crate
+
+```rust
+let img = ImageReader::open("./assets/asset.png")
+    .unwrap()
+    .decode()
+    .unwrap();
+let dimensions = img.dimensions();
+let mut bytes = Vec::from(img.as_bytes());
+
+let mut scaler = LinearScaler::new(ResamplingFunction::Lanczos3);
+scaler.set_threading_policy(ThreadingPolicy::Adaptive);
+let store =
+    ImageStore::<u8, 4>::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize);
+let resized = scaler.resize_rgba(
+    ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2),
+    store,
+    true
+);
+```
+
 ### Performance
 
 Faster or comparable to `fast-image-resize`, when implemented equal SIMD and pixel type.
@@ -54,27 +75,6 @@ M3 Pro. NEON
 | pic-scale |  38.75   |
 | fir sse   |  45.79   |
 
-#### Example integration with `image` crate
-
-```rust
-let img = ImageReader::open("./assets/asset.png")
-    .unwrap()
-    .decode()
-    .unwrap();
-let dimensions = img.dimensions();
-let mut bytes = Vec::from(img.as_bytes());
-
-let mut scaler = LinearScaler::new(ResamplingFunction::Lanczos3);
-scaler.set_threading_policy(ThreadingPolicy::Adaptive);
-let store =
-    ImageStore::<u8, 4>::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize);
-let resized = scaler.resize_rgba(
-    ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2),
-    store,
-    true
-);
-```
-
 #### Example in sRGB
 
 In common, you should not downsize an image in sRGB colorspace, however if speed is more preferable than more proper scale you may omit linearizing 
diff --git a/src/rgba_f32.rs b/src/rgba_f32.rs
index c7da6aa..f153d03 100644
--- a/src/rgba_f32.rs
+++ b/src/rgba_f32.rs
@@ -300,6 +300,7 @@ fn convolve_horizontal_rgba_f32_native(
     }
 }
 
+#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
 #[inline(always)]
 fn convolve_vertical_rgb_native_row(
     total_width: usize,
diff --git a/src/rgba_u8.rs b/src/rgba_u8.rs
index 4a23edc..a1308d5 100644
--- a/src/rgba_u8.rs
+++ b/src/rgba_u8.rs
@@ -1,7 +1,5 @@
 #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
 use std::arch::aarch64::*;
-#[cfg(target_arch = "x86")]
-use std::arch::x86::*;
 use std::sync::Arc;
 
 use rayon::ThreadPool;
@@ -14,9 +12,9 @@ use crate::neon_simd_u8::*;
 use crate::rgb_u8::*;
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 use crate::sse_rgb_u8::sse_rgb::*;
+use crate::support::{PRECISION, ROUNDING_APPROX};
 use crate::unsafe_slice::UnsafeSlice;
 use crate::ImageStore;
-use crate::support::ROUNDING_APPROX;
 
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 fn convolve_horizontal_rgba_sse(
@@ -280,10 +278,10 @@ fn convolve_horizontal_rgba_native(
             let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) };
 
             unsafe {
-                *dest_ptr = (sum_r >> 12).min(255).max(0) as u8;
-                *dest_ptr.add(1) = (sum_g >> 12).min(255).max(0) as u8;
-                *dest_ptr.add(2) = (sum_b >> 12).min(255).max(0) as u8;
-                *dest_ptr.add(3) = (sum_a >> 12).min(255).max(0) as u8;
+                *dest_ptr = (sum_r >> PRECISION).min(255).max(0) as u8;
+                *dest_ptr.add(1) = (sum_g >> PRECISION).min(255).max(0) as u8;
+                *dest_ptr.add(2) = (sum_b >> PRECISION).min(255).max(0) as u8;
+                *dest_ptr.add(3) = (sum_a >> PRECISION).min(255).max(0) as u8;
             }
 
             filter_offset += approx_weights.aligned_size;