Merge pull request #26 from awxkee/dev

Stride f32, masked loads AVX-512, Vertical AVX-512 u16
awxkee · Jan 5, 2025 · 3de50d1
2 parents 6c172f0 + 918c682
commit 3de50d1
Show file tree

Hide file tree

Showing 16 changed files with 766 additions and 267 deletions.
diff --git a/README.md b/README.md
@@ -53,8 +53,8 @@ Despite all implementation are fast, not all the paths are implemented using SIM
 | RGB (8 bit)    | x    | x   | ~    | ~          | ~    | 
 | Plane (8 bit)  | x    | x   | ~    | ~          | ~    | 
 | RGBA (8+ bit)  | x    | x   | ~    | x(avxvnni) | -    | 
-| RGB (8+ bit)   | x    | x   | ~    | -          | -    | 
-| Plane (8+ bit) | ~    | ~   | ~    | -          | -    | 
+| RGB (8+ bit)   | x    | x   | ~    | ~          | -    | 
+| Plane (8+ bit) | ~    | ~   | ~    | ~          | -    | 
 | RGBA (f32)     | x    | x   | x    | -          | -    | 
 | RGB (f32)      | x    | x   | ~    | -          | -    | 
 | Plane (f32)    | x    | x   | ~    | -          | -    | 

diff --git a/src/alpha_handle_f16.rs b/src/alpha_handle_f16.rs
@@ -82,24 +82,26 @@ pub(crate) fn premultiply_pixel_f16_row(dst: &mut [half::f16], src: &[half::f16]
 
 fn premultiply_alpha_rgba_impl_f16(
     dst: &mut [half::f16],
+    dst_stride: usize,
     src: &[half::f16],
+    src_stride: usize,
     width: usize,
     _: usize,
     pool: &Option<ThreadPool>,
 ) {
     if let Some(pool) = pool {
         pool.install(|| {
-            dst.par_chunks_exact_mut(width * 4)
-                .zip(src.par_chunks_exact(width * 4))
+            dst.par_chunks_exact_mut(dst_stride)
+                .zip(src.par_chunks_exact(src_stride))
                 .for_each(|(dst, src)| {
-                    premultiply_pixel_f16_row(dst, src);
+                    premultiply_pixel_f16_row(&mut dst[..width * 4], &src[..width * 4]);
                 });
         });
     } else {
-        dst.chunks_exact_mut(width * 4)
-            .zip(src.chunks_exact(width * 4))
+        dst.chunks_exact_mut(dst_stride)
+            .zip(src.chunks_exact(src_stride))
             .for_each(|(dst, src)| {
-                premultiply_pixel_f16_row(dst, src);
+                premultiply_pixel_f16_row(&mut dst[..width * 4], &src[..width * 4]);
             });
     }
 }
@@ -126,13 +128,22 @@ fn unpremultiply_alpha_rgba_impl_f16(
 
 pub(crate) fn premultiply_alpha_rgba_f16(
     dst: &mut [half::f16],
+    dst_stride: usize,
     src: &[half::f16],
+    src_stride: usize,
     width: usize,
     height: usize,
     pool: &Option<ThreadPool>,
 ) {
-    let mut _dispatcher: fn(&mut [half::f16], &[half::f16], usize, usize, &Option<ThreadPool>) =
-        premultiply_alpha_rgba_impl_f16;
+    let mut _dispatcher: fn(
+        &mut [half::f16],
+        usize,
+        &[half::f16],
+        usize,
+        usize,
+        usize,
+        &Option<ThreadPool>,
+    ) = premultiply_alpha_rgba_impl_f16;
     #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
     {
         _dispatcher = neon_premultiply_alpha_rgba_f16;
@@ -152,7 +163,7 @@ pub(crate) fn premultiply_alpha_rgba_f16(
             _dispatcher = avx_premultiply_alpha_rgba_f16;
         }
     }
-    _dispatcher(dst, src, width, height, pool);
+    _dispatcher(dst, dst_stride, src, src_stride, width, height, pool);
 }
 
 pub(crate) fn unpremultiply_alpha_rgba_f16(

diff --git a/src/alpha_handle_f32.rs b/src/alpha_handle_f32.rs
@@ -78,24 +78,26 @@ pub(crate) fn premultiply_pixel_f32_row(dst: &mut [f32], src: &[f32]) {
 
 fn premultiply_alpha_rgba_impl_f32(
     dst: &mut [f32],
+    dst_stride: usize,
     src: &[f32],
+    src_stride: usize,
     width: usize,
     _: usize,
     pool: &Option<ThreadPool>,
 ) {
     if let Some(pool) = pool {
         pool.install(|| {
-            dst.par_chunks_exact_mut(width * 4)
-                .zip(src.par_chunks_exact(width * 4))
+            dst.par_chunks_exact_mut(dst_stride)
+                .zip(src.par_chunks_exact(src_stride))
                 .for_each(|(dst, src)| {
-                    premultiply_pixel_f32_row(dst, src);
+                    premultiply_pixel_f32_row(&mut dst[..width * 4], &src[..width * 4]);
                 });
         });
     } else {
-        dst.chunks_exact_mut(width * 4)
-            .zip(src.chunks_exact(width * 4))
+        dst.chunks_exact_mut(dst_stride)
+            .zip(src.chunks_exact(src_stride))
             .for_each(|(dst, src)| {
-                premultiply_pixel_f32_row(dst, src);
+                premultiply_pixel_f32_row(&mut dst[..width * 4], &src[..width * 4]);
             });
     }
 }
@@ -122,12 +124,14 @@ fn unpremultiply_alpha_rgba_impl_f32(
 
 pub(crate) fn premultiply_alpha_rgba_f32(
     dst: &mut [f32],
+    dst_stride: usize,
     src: &[f32],
+    src_stride: usize,
     width: usize,
     height: usize,
     pool: &Option<ThreadPool>,
 ) {
-    let mut _dispatcher: fn(&mut [f32], &[f32], usize, usize, &Option<ThreadPool>) =
+    let mut _dispatcher: fn(&mut [f32], usize, &[f32], usize, usize, usize, &Option<ThreadPool>) =
         premultiply_alpha_rgba_impl_f32;
     #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
     {
@@ -145,7 +149,7 @@ pub(crate) fn premultiply_alpha_rgba_f32(
             _dispatcher = avx_premultiply_alpha_rgba_f32;
         }
     }
-    _dispatcher(dst, src, width, height, pool);
+    _dispatcher(dst, dst_stride, src, src_stride, width, height, pool);
 }
 
 pub(crate) fn unpremultiply_alpha_rgba_f32(

diff --git a/src/avx2/alpha_f16.rs b/src/avx2/alpha_f16.rs
@@ -39,13 +39,15 @@ use std::arch::x86_64::*;
 
 pub(crate) fn avx_premultiply_alpha_rgba_f16(
     dst: &mut [half::f16],
+    dst_stride: usize,
     src: &[half::f16],
+    src_stride: usize,
     width: usize,
     height: usize,
     pool: &Option<ThreadPool>,
 ) {
     unsafe {
-        avx_premultiply_alpha_rgba_f16_impl(dst, src, width, height, pool);
+        avx_premultiply_alpha_rgba_f16_impl(dst, dst_stride, src, src_stride, width, height, pool);
     }
 }
 
@@ -115,24 +117,29 @@ unsafe fn avx_premultiply_alpha_rgba_f16_row_impl(dst: &mut [half::f16], src: &[
 /// This inlining is required to activate all features for runtime dispatch
 unsafe fn avx_premultiply_alpha_rgba_f16_impl(
     dst: &mut [half::f16],
+    dst_stride: usize,
     src: &[half::f16],
+    src_stride: usize,
     width: usize,
     _: usize,
     pool: &Option<ThreadPool>,
 ) {
     if let Some(pool) = pool {
         pool.install(|| {
-            dst.par_chunks_exact_mut(width * 4)
-                .zip(src.par_chunks_exact(width * 4))
+            dst.par_chunks_exact_mut(dst_stride)
+                .zip(src.par_chunks_exact(src_stride))
                 .for_each(|(dst, src)| unsafe {
-                    avx_premultiply_alpha_rgba_f16_row_impl(dst, src);
+                    avx_premultiply_alpha_rgba_f16_row_impl(
+                        &mut dst[..width * 4],
+                        &src[..width * 4],
+                    );
                 });
         });
     } else {
-        dst.chunks_exact_mut(width * 4)
-            .zip(src.chunks_exact(width * 4))
+        dst.chunks_exact_mut(dst_stride)
+            .zip(src.chunks_exact(src_stride))
             .for_each(|(dst, src)| unsafe {
-                avx_premultiply_alpha_rgba_f16_row_impl(dst, src);
+                avx_premultiply_alpha_rgba_f16_row_impl(&mut dst[..width * 4], &src[..width * 4]);
             });
     }
 }

diff --git a/src/avx2/alpha_f32.rs b/src/avx2/alpha_f32.rs
@@ -114,13 +114,15 @@ unsafe fn avx_unpremultiply_alpha_rgba_f32_impl(
 
 pub(crate) fn avx_premultiply_alpha_rgba_f32(
     dst: &mut [f32],
+    dst_stride: usize,
     src: &[f32],
+    src_stride: usize,
     width: usize,
     height: usize,
     pool: &Option<ThreadPool>,
 ) {
     unsafe {
-        avx_premultiply_alpha_rgba_f32_impl(dst, src, width, height, pool);
+        avx_premultiply_alpha_rgba_f32_impl(dst, dst_stride, src, src_stride, width, height, pool);
     }
 }
 
@@ -163,24 +165,29 @@ unsafe fn avx_premultiply_alpha_rgba_f32_row_impl(dst: &mut [f32], src: &[f32])
 #[target_feature(enable = "avx2")]
 unsafe fn avx_premultiply_alpha_rgba_f32_impl(
     dst: &mut [f32],
+    dst_stride: usize,
     src: &[f32],
+    src_stride: usize,
     width: usize,
     _: usize,
     pool: &Option<ThreadPool>,
 ) {
     if let Some(pool) = pool {
         pool.install(|| {
-            dst.par_chunks_exact_mut(width * 4)
-                .zip(src.par_chunks_exact(width * 4))
+            dst.par_chunks_exact_mut(dst_stride)
+                .zip(src.par_chunks_exact(src_stride))
                 .for_each(|(dst, src)| unsafe {
-                    avx_premultiply_alpha_rgba_f32_row_impl(dst, src);
+                    avx_premultiply_alpha_rgba_f32_row_impl(
+                        &mut dst[..width * 4],
+                        &src[..width * 4],
+                    );
                 });
         });
     } else {
-        dst.chunks_exact_mut(width * 4)
-            .zip(src.chunks_exact(width * 4))
+        dst.chunks_exact_mut(dst_stride)
+            .zip(src.chunks_exact(src_stride))
             .for_each(|(dst, src)| unsafe {
-                avx_premultiply_alpha_rgba_f32_row_impl(dst, src);
+                avx_premultiply_alpha_rgba_f32_row_impl(&mut dst[..width * 4], &src[..width * 4]);
             });
     }
 }