Merge pull request #14 from awxkee/dev

Added RGBA8 AVX2 fast path
awxkee · Dec 29, 2024 · ae8436e · ae8436e
2 parents ab25ef6 + 3f494d3
commit ae8436e
Show file tree

Hide file tree

Showing 39 changed files with 799 additions and 151 deletions.
diff --git a/README.md b/README.md
@@ -43,37 +43,36 @@ Despite all implementation are fast, not all the paths are implemented using SIM
 
 `~` - Partially implemented
 
-|                | NEON | SSE | AVX | WASM | 
-|----------------|------|-----|-----|------| 
-| RGBA (8 bit)   | x    | x   | ~   | ~    | 
-| RGB (8 bit)    | x    | x   | ~   | ~    | 
-| Plane (8 bit)  | x    | x   | ~   | ~    | 
-| RGBA (8+ bit)  | x    | x   | ~   | -    | 
-| RGB (8+ bit)   | x    | x   | ~   | -    | 
-| Plane (8+ bit) | ~    | ~   | ~   | -    | 
-| RGBA (f32)     | x    | x   | x   | -    | 
-| RGB (f32)      | x    | x   | ~   | -    | 
-| Plane (f32)    | x    | x   | ~   | -    | 
-| RGBA (f16)     | x    | x   | x   | -    | 
-| RGB (f16)      | x    | ~   | ~   | -    | 
-| Plane (f16)    | ~    | ~   | ~   | -    |
-| AR30/RA30      | x    | -   | -   | -    |
+|                | NEON | SSE | AVX2 | WASM | 
+|----------------|------|-----|------|------| 
+| RGBA (8 bit)   | x    | x   | x    | ~    | 
+| RGB (8 bit)    | x    | x   | ~    | ~    | 
+| Plane (8 bit)  | x    | x   | ~    | ~    | 
+| RGBA (8+ bit)  | x    | x   | ~    | -    | 
+| RGB (8+ bit)   | x    | x   | ~    | -    | 
+| Plane (8+ bit) | ~    | ~   | ~    | -    | 
+| RGBA (f32)     | x    | x   | x    | -    | 
+| RGB (f32)      | x    | x   | ~    | -    | 
+| Plane (f32)    | x    | x   | ~    | -    | 
+| RGBA (f16)     | x    | x   | x    | -    | 
+| RGB (f16)      | x    | ~   | ~    | -    | 
+| Plane (f16)    | ~    | ~   | ~    | -    |
+| AR30/RA30      | x    | -   | -    | -    |
 
 #### Features
 
-For RISC-V `riscv` feature should be implicitly enabled, nightly compiler channel is required
-
 To enable support of `f16` the feature `half` should be activated.
 
 #### Target features
 
-`neon` optional target features are available, enable it when compiling on supported platform to get full features
+The optional `neon` target feature is available; enable it when compiling on a supported platform to get full features.
 
-`avx2`, `fma`, `sse4.1`, `f16c` will be detected automatically if available, and called the best path
+`avx2`, `fma`, `sse4.1`, `f16c` will be detected automatically if available, and the best path will be called.
+For x86 and aarch64 NEON, runtime dispatch is used.
 
 `fullfp16` NEON target detection is performed at runtime; when available, the best paths for *f16* images are used on ARM.
 
-WASM `simd128` target feature activating is mandatory in build flags
+Activating the WASM `simd128` target feature in build flags is mandatory.
 
 ##### About f16
 

diff --git a/app/benches/resize_rgb/main.rs b/app/benches/resize_rgb/main.rs
@@ -17,16 +17,16 @@ pub fn criterion_benchmark(c: &mut Criterion) {
     let src_bytes = binding.as_bytes();
 
     c.bench_function("Pic scale RGB: Lanczos 3", |b| {
+        let mut copied: Vec<u8> = Vec::from(src_bytes);
+        let store = ImageStore::<u8, 3>::from_slice(
+            &mut copied,
+            dimensions.0 as usize,
+            dimensions.1 as usize,
+        )
+        .unwrap();
         b.iter(|| {
             let mut scaler = Scaler::new(ResamplingFunction::Lanczos3);
             scaler.set_threading_policy(ThreadingPolicy::Single);
-            let mut copied: Vec<u8> = Vec::from(src_bytes);
-            let store = ImageStore::<u8, 3>::from_slice(
-                &mut copied,
-                dimensions.0 as usize,
-                dimensions.1 as usize,
-            )
-            .unwrap();
             let mut target =
                 ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4);
             scaler.resize_rgb(&store, &mut target).unwrap();
@@ -36,28 +36,29 @@ pub fn criterion_benchmark(c: &mut Criterion) {
     let f32_image: Vec<f32> = src_bytes.iter().map(|&x| x as f32 / 255f32).collect();
 
     c.bench_function("Pic scale RGB f32: Lanczos 3", |b| {
+        let mut copied: Vec<f32> = Vec::from(f32_image.clone());
+        let store = ImageStore::<f32, 3>::from_slice(
+            &mut copied,
+            dimensions.0 as usize,
+            dimensions.1 as usize,
+        )
+        .unwrap();
         b.iter(|| {
             let mut scaler = Scaler::new(ResamplingFunction::Lanczos3);
             scaler.set_threading_policy(ThreadingPolicy::Single);
-            let mut copied: Vec<f32> = Vec::from(f32_image.clone());
-            let store = ImageStore::<f32, 3>::from_slice(
-                &mut copied,
-                dimensions.0 as usize,
-                dimensions.1 as usize,
-            )
-            .unwrap();
             let mut target =
                 ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4);
             scaler.resize_rgb_f32(&store, &mut target).unwrap();
         })
     });
 
     c.bench_function("Fast image resize RGB: Lanczos 3", |b| {
+        let mut vc = Vec::from(img.as_bytes());
+        let pixel_type: PixelType = PixelType::U8x3;
+
+        let src_image =
+            Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap();
         b.iter(|| {
-            let mut vc = Vec::from(img.as_bytes());
-            let pixel_type: PixelType = PixelType::U8x3;
-            let src_image =
-                Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap();
             let mut dst_image = Image::new(dimensions.0 / 4, dimensions.1 / 4, pixel_type);
 
             let mut resizer = Resizer::new();

diff --git a/app/src/main.rs b/app/src/main.rs
@@ -53,7 +53,7 @@ fn main() {
     let transient = img.to_rgba8();
     let mut bytes = Vec::from(transient.as_bytes());
 
-    let mut scaler = LinearScaler::new(ResamplingFunction::Bilinear);
+    let mut scaler = Scaler::new(ResamplingFunction::Lanczos3);
     scaler.set_threading_policy(ThreadingPolicy::Single);
 
     // resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear);
@@ -64,7 +64,7 @@ fn main() {
         ImageStore::<u8, 4>::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize)
             .unwrap();
 
-    let dst_size = ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2);
+    let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4);
     // let mut resized_ar = vec![0u32; dst_size.width * dst_size.height];
     let start_time = Instant::now();
     // scaler
@@ -78,7 +78,7 @@ fn main() {
     //     .unwrap();
 
     let mut dst_store =
-        ImageStoreMut::<u8, 4>::alloc(dimensions.0 as usize / 2, dimensions.1 as usize / 2);
+        ImageStoreMut::<u8, 4>::alloc(dimensions.0 as usize / 3, dimensions.1 as usize / 3);
 
     scaler.resize_rgba(&store, &mut dst_store, false).unwrap();
 

diff --git a/src/alpha_check.rs b/src/alpha_check.rs
@@ -42,6 +42,7 @@ pub(crate) fn has_non_constant_cap_alpha_rgba_f32(store: &[f32], width: usize) -
     has_non_constant_cap_alpha_f32_impl::<3, 4>(store, width)
 }
 
+/// Scans an image to check if alpha is not constant
 pub(crate) fn has_non_constant_cap_alpha<
     V: Copy + PartialEq + BitXor<V, Output = V> + 'static + AsPrimitive<J> + 'static,
     J: Copy + AddAssign + Default + 'static + Eq + Ord,
@@ -76,6 +77,7 @@ where
     row_sums.ne(&zeros)
 }
 
+/// Scans an `f32` image to check if alpha is not constant
 fn has_non_constant_cap_alpha_f32_impl<const ALPHA_CHANNEL_INDEX: usize, const CHANNELS: usize>(
     store: &[f32],
     width: usize,

diff --git a/src/alpha_handle_u16.rs b/src/alpha_handle_u16.rs
@@ -39,20 +39,23 @@ use rayon::slice::ParallelSliceMut;
 use rayon::ThreadPool;
 
 #[inline]
+/// Divides value by 1023 with rounding to nearest
 pub(crate) fn div_by_1023(v: u32) -> u16 {
     let round = 1 << 9;
     let v = v + round;
     (((v >> 10) + v) >> 10) as u16
 }
 
 #[inline]
+/// Divides value by 4095 with rounding to nearest
 pub(crate) fn div_by_4095(v: u32) -> u16 {
     let round = 1 << 11;
     let v = v + round;
     (((v >> 12) + v) >> 12) as u16
 }
 
 #[inline]
+/// Divides value by 65535 with rounding to nearest
 pub(crate) fn div_by_65535(v: u32) -> u16 {
     let round = 1 << 15;
     let v_expand = v;
@@ -174,13 +177,13 @@ pub(crate) fn premultiply_alpha_rgba_u16(
         premultiply_alpha_rgba_impl;
     #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
     {
-        if is_x86_feature_detected!("sse4.1") {
+        if std::is_x86_feature_detected!("sse4.1") {
             _dispatcher = premultiply_alpha_sse_rgba_u16;
         }
     }
     #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
     {
-        if is_x86_feature_detected!("avx2") {
+        if std::is_x86_feature_detected!("avx2") {
             _dispatcher = avx_premultiply_alpha_rgba_u16;
         }
     }
@@ -203,13 +206,13 @@ pub(crate) fn unpremultiply_alpha_rgba_u16(
         unpremultiply_alpha_rgba_impl;
     #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
     {
-        if is_x86_feature_detected!("sse4.1") {
+        if std::is_x86_feature_detected!("sse4.1") {
             _dispatcher = unpremultiply_alpha_sse_rgba_u16;
         }
     }
     #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
     {
-        if is_x86_feature_detected!("avx2") {
+        if std::is_x86_feature_detected!("avx2") {
             _dispatcher = avx_unpremultiply_alpha_rgba_u16;
         }
     }

diff --git a/src/alpha_handle_u8.rs b/src/alpha_handle_u8.rs
@@ -40,6 +40,7 @@ use rayon::slice::ParallelSliceMut;
 use rayon::ThreadPool;
 
 #[inline]
+/// Divides value by 255 with rounding to nearest
 pub(crate) fn div_by_255(v: u16) -> u8 {
     ((((v + 0x80) >> 8) + v + 0x80) >> 8).min(255) as u8
 }

diff --git a/src/ar30.rs b/src/ar30.rs
@@ -61,8 +61,8 @@ impl Rgb30 {
     #[inline]
     pub(crate) const fn pack_w_a<const STORE: usize>(self, r: i32, g: i32, b: i32, a: i32) -> u32 {
         let value: u32 = match self {
-            Rgb30::Ar30 => (a << 30 | (b << 20) | (g << 10) | r) as u32,
-            Rgb30::Ra30 => ((r << 22) | (g << 12) | (b << 2) | a) as u32,
+            Rgb30::Ar30 => (((a << 30) | (b << 20)) | ((g << 10) | r)) as u32,
+            Rgb30::Ra30 => (((r << 22) | (g << 12)) | ((b << 2) | a)) as u32,
         };
         if STORE == 0 {
             value

diff --git a/src/avx2/alpha_f16.rs b/src/avx2/alpha_f16.rs
@@ -50,6 +50,7 @@ pub(crate) fn avx_premultiply_alpha_rgba_f16(
 }
 
 #[target_feature(enable = "avx2", enable = "f16c")]
+/// This inlining is required to activate all features for runtime dispatch
 unsafe fn avx_premultiply_alpha_rgba_f16_row_impl(dst: &mut [half::f16], src: &[half::f16]) {
     let mut rem = dst;
     let mut src_rem = src;
@@ -112,8 +113,8 @@ unsafe fn avx_premultiply_alpha_rgba_f16_row_impl(dst: &mut [half::f16], src: &[
     premultiply_pixel_f16_row(rem, src_rem);
 }
 
-#[inline]
 #[target_feature(enable = "avx2", enable = "f16c")]
+/// This inlining is required to activate all features for runtime dispatch
 unsafe fn avx_premultiply_alpha_rgba_f16_impl(
     dst: &mut [half::f16],
     src: &[half::f16],
@@ -150,6 +151,7 @@ pub(crate) fn avx_unpremultiply_alpha_rgba_f16(
 }
 
 #[target_feature(enable = "avx2", enable = "f16c")]
+/// This inlining is required to activate all features for runtime dispatch
 unsafe fn avx_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [half::f16]) {
     let mut rem = in_place;
 
@@ -234,8 +236,8 @@ unsafe fn avx_unpremultiply_alpha_rgba_f16_row_impl(in_place: &mut [half::f16])
     unpremultiply_pixel_f16_row(rem);
 }
 
-#[inline]
 #[target_feature(enable = "avx2", enable = "f16c")]
+/// This inlining is required to activate all features for runtime dispatch
 unsafe fn avx_unpremultiply_alpha_rgba_f16_impl(
     in_place: &mut [half::f16],
     width: usize,

diff --git a/src/avx2/alpha_f32.rs b/src/avx2/alpha_f32.rs
@@ -88,7 +88,6 @@ unsafe fn avx_unpremultiply_alpha_rgba_f32_row_impl(in_place: &mut [f32]) {
     unpremultiply_pixel_f32_row(rem);
 }
 
-#[inline]
 #[target_feature(enable = "avx2")]
 unsafe fn avx_unpremultiply_alpha_rgba_f32_impl(
     in_place: &mut [f32],

diff --git a/src/avx2/alpha_u16.rs b/src/avx2/alpha_u16.rs
@@ -89,6 +89,7 @@ pub(crate) fn avx_premultiply_alpha_rgba_u16(
 }
 
 #[target_feature(enable = "avx2")]
+/// This inlining is required to activate all features for runtime dispatch
 unsafe fn avx_premultiply_alpha_rgba_u16_row(dst: &mut [u16], src: &[u16], bit_depth: usize) {
     let max_colors = (1 << bit_depth) - 1;
 
@@ -356,6 +357,7 @@ pub(crate) fn avx_unpremultiply_alpha_rgba_u16(
 }
 
 #[target_feature(enable = "avx2")]
+/// This inlining is required to activate all features for runtime dispatch
 unsafe fn avx_unpremultiply_alpha_rgba_u16_row(in_place: &mut [u16], bit_depth: usize) {
     let max_colors = (1 << bit_depth) - 1;
 
@@ -408,7 +410,6 @@ unsafe fn avx_unpremultiply_alpha_rgba_u16_row(in_place: &mut [u16], bit_depth:
     unpremultiply_alpha_rgba_row(rem, max_colors);
 }
 
-#[inline]
 #[target_feature(enable = "avx2")]
 unsafe fn avx_unpremultiply_alpha_rgba_u16_impl(
     in_place: &mut [u16],

diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs
@@ -209,7 +209,6 @@ unsafe fn avx_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) {
     premultiply_alpha_rgba_row_impl(rem, src_rem);
 }
 
-#[inline]
 #[target_feature(enable = "avx2")]
 unsafe fn avx_premultiply_alpha_rgba_impl(
     dst: &mut [u8],
@@ -301,7 +300,6 @@ unsafe fn avx_unpremultiply_alpha_rgba_impl_row(in_place: &mut [u8]) {
     unpremultiply_alpha_rgba_row_impl(rem);
 }
 
-#[inline]
 #[target_feature(enable = "avx2")]
 unsafe fn avx_unpremultiply_alpha_rgba_impl(
     in_place: &mut [u8],

diff --git a/src/avx2/mod.rs b/src/avx2/mod.rs
@@ -35,6 +35,7 @@ mod alpha_u8;
 #[cfg(feature = "half")]
 mod rgba_f16;
 mod rgba_f32;
+mod rgba_u8_lb;
 pub(crate) mod utils;
 #[cfg(feature = "half")]
 mod vertical_f16;
@@ -56,6 +57,9 @@ pub(crate) use rgba_f16::{
 pub(crate) use rgba_f32::{
     convolve_horizontal_rgba_avx_row_one_f32, convolve_horizontal_rgba_avx_rows_4_f32,
 };
+pub(crate) use rgba_u8_lb::{
+    convolve_horizontal_rgba_avx_rows_4_lb, convolve_horizontal_rgba_avx_rows_one_lb,
+};
 #[cfg(feature = "half")]
 pub(crate) use vertical_f16::convolve_vertical_avx_row_f16;
 pub(crate) use vertical_f32::convolve_vertical_avx_row_f32;

diff --git a/src/avx2/rgba_f16.rs b/src/avx2/rgba_f16.rs
@@ -144,8 +144,8 @@ pub(crate) fn convolve_horizontal_rgba_avx_row_one_f16<const FMA: bool>(
     }
 }
 
-#[inline]
-#[target_feature(enable = "avx2,f16c,fma")]
+#[target_feature(enable = "avx2", enable = "f16c", enable = "fma")]
+/// This inlining is required to activate all features for runtime dispatch
 unsafe fn convolve_horizontal_rgba_avx_row_one_f16_fma(
     dst_width: usize,
     src_width: usize,
@@ -162,8 +162,8 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_fma(
     );
 }
 
-#[inline]
-#[target_feature(enable = "avx2,f16c")]
+#[target_feature(enable = "avx2", enable = "f16c")]
+/// This inlining is required to activate all features for runtime dispatch
 unsafe fn convolve_horizontal_rgba_avx_row_one_f16_regular(
     dst_width: usize,
     src_width: usize,
@@ -301,8 +301,8 @@ pub(crate) fn convolve_horizontal_rgba_avx_rows_4_f16<const FMA: bool>(
     }
 }
 
-#[inline]
 #[target_feature(enable = "avx2", enable = "f16c")]
+/// This inlining is required to activate all features for runtime dispatch
 unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_regular(
     dst_width: usize,
     src_width: usize,
@@ -323,8 +323,8 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_regular(
     );
 }
 
-#[inline]
 #[target_feature(enable = "avx2", enable = "f16c", enable = "fma")]
+/// This inlining is required to activate all features for runtime dispatch
 unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_fma(
     dst_width: usize,
     src_width: usize,