Merge pull request #31 from awxkee/dev

NEON faster u16 pre-multiplication if `fp16` available
awxkee · Jan 7, 2025 · 217b994 · 217b994
2 parents 69cf1f1 + 6684e73
commit 217b994
Show file tree

Hide file tree

Showing 4 changed files with 108 additions and 13 deletions.
diff --git a/app/Cargo.toml b/app/Cargo.toml
@@ -6,7 +6,7 @@ edition = "2021"
 [dependencies]
 image = { version = "0.25.5", features = ["default"] }
 #image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] }
-pic-scale = { path = "..", features = ["half"], default-features = true }
+pic-scale = { path = "..", features = ["half", "nightly_i8mm"], default-features = true }
 fast_image_resize = { version = "5.0.0", features = [] }
 half = { version = "2.4.1", default-features = true }
 

diff --git a/app/src/main.rs b/app/src/main.rs
@@ -60,7 +60,7 @@ fn main() {
 
     //
     let store =
-        ImageStore::<u8, 4>::from_slice(&bytes, dimensions.0 as usize, dimensions.1 as usize)
+        ImageStore::<u16, 4>::from_slice(&choke, dimensions.0 as usize, dimensions.1 as usize)
             .unwrap();
 
     let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4);
@@ -75,15 +75,17 @@ fn main() {
     //     )
     //     .unwrap();
 
-    let mut dst_store = ImageStoreMut::<u8, 4>::alloc_with_depth(
+    let mut dst_store = ImageStoreMut::<u16, 4>::alloc_with_depth(
         dimensions.0 as usize / 4,
         dimensions.1 as usize / 4,
         10,
     );
 
     // for i in 0..25 {
     let start_time = Instant::now();
-    scaler.resize_rgba(&store, &mut dst_store, false).unwrap();
+    scaler
+        .resize_rgba_u16(&store, &mut dst_store, true)
+        .unwrap();
 
     let elapsed_time = start_time.elapsed();
     // Print the elapsed time in milliseconds
@@ -161,13 +163,13 @@ fn main() {
     //     .map(|&x| (x * 255f32) as u8)
     //     .collect();
 
-    // let dst: Vec<u8> = dst_store
-    //     .as_bytes()
-    //     .iter()
-    //     .map(|&x| (x >> 2) as u8)
-    //     .collect();
+    let dst: Vec<u8> = dst_store
+        .as_bytes()
+        .iter()
+        .map(|&x| (x >> 2) as u8)
+        .collect();
 
-    let dst = dst_store.as_bytes();
+    // let dst = dst_store.as_bytes();
     // let dst = resized;
     // image::save_buffer(
     //     "converted.png",

diff --git a/src/alpha_handle_f32.rs b/src/alpha_handle_f32.rs
@@ -131,8 +131,16 @@ pub(crate) fn premultiply_alpha_rgba_f32(
     height: usize,
     pool: &Option<ThreadPool>,
 ) {
-    let mut _dispatcher: fn(&mut [f32], usize, &[f32], usize, usize, usize, &Option<ThreadPool>) =
-        premultiply_alpha_rgba_impl_f32;
+    #[allow(clippy::type_complexity)]
+    let mut _dispatcher: fn(
+        &mut [f32],
+        usize,
+        &[f32],
+        usize,
+        usize,
+        usize,
+        &Option<ThreadPool>,
+    ) = premultiply_alpha_rgba_impl_f32;
     #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
     {
         _dispatcher = neon_premultiply_alpha_rgba_f32;

diff --git a/src/neon/alpha_u16.rs b/src/neon/alpha_u16.rs
@@ -27,7 +27,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 use crate::cpu_features::is_aarch_f16_supported;
-use crate::neon::f16_utils::{xvcvtaq_u16_f16, xvcvtq_f16_u16, xvdivq_f16, xvmulq_f16};
+use crate::neon::f16_utils::{
+    xvcvtaq_u16_f16, xvcvtq_f16_u16, xvdivq_f16, xvmulq_f16, xvrecpeq_f16,
+};
 use crate::neon::{x_float16x8_t, xreinterpretq_f16_u16};
 use rayon::iter::{IndexedParallelIterator, ParallelIterator};
 use rayon::prelude::{ParallelSlice, ParallelSliceMut};
@@ -137,6 +139,69 @@ impl<const BIT_DEPTH: usize> NeonPremultiplyExecutor for NeonPremultiplyExecutor
     }
 }
 
+#[derive(Default)]
+struct NeonPremultiplyExecutorFloat16<const BIT_DEPTH: usize> {}
+
+impl<const BIT_DEPTH: usize> NeonPremultiplyExecutorFloat16<BIT_DEPTH> {
+    #[inline]
+    #[target_feature(enable = "fp16")]
+    unsafe fn premultiply_chunk(
+        &self,
+        dst: &mut [u16],
+        src: &[u16],
+        recip_bit_depth: x_float16x8_t,
+    ) {
+        let pixel = vld4q_u16(src.as_ptr());
+
+        let a_values = xvmulq_f16(xvcvtq_f16_u16(pixel.3), recip_bit_depth);
+
+        let new_r = xvcvtaq_u16_f16(xvmulq_f16(xvcvtq_f16_u16(pixel.0), a_values));
+        let new_g = xvcvtaq_u16_f16(xvmulq_f16(xvcvtq_f16_u16(pixel.1), a_values));
+        let new_b = xvcvtaq_u16_f16(xvmulq_f16(xvcvtq_f16_u16(pixel.2), a_values));
+
+        let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3);
+
+        vst4q_u16(dst.as_mut_ptr(), new_px);
+    }
+}
+
+impl<const BIT_DEPTH: usize> NeonPremultiplyExecutor for NeonPremultiplyExecutorFloat16<BIT_DEPTH> {
+    #[target_feature(enable = "fp16")]
+    unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) {
+        assert_ne!(bit_depth, 0, "Something goes wrong!");
+        assert!((1..=16).contains(&bit_depth));
+
+        let recip_bit_depth = xvrecpeq_f16(xvcvtq_f16_u16(vdupq_n_u16((1 << bit_depth) - 1)));
+
+        let mut rem = dst;
+        let mut src_rem = src;
+        for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) {
+            self.premultiply_chunk(dst, src, recip_bit_depth);
+        }
+
+        rem = rem.chunks_exact_mut(8 * 4).into_remainder();
+        src_rem = src_rem.chunks_exact(8 * 4).remainder();
+
+        if !rem.is_empty() {
+            assert!(src_rem.len() < 8 * 4);
+            assert!(rem.len() < 8 * 4);
+            assert_eq!(src_rem.len(), rem.len());
+
+            let mut buffer: [u16; 8 * 4] = [0u16; 8 * 4];
+            let mut dst_buffer: [u16; 8 * 4] = [0u16; 8 * 4];
+            std::ptr::copy_nonoverlapping(src_rem.as_ptr(), buffer.as_mut_ptr(), src_rem.len());
+
+            self.premultiply_chunk(
+                dst_buffer.as_mut_slice(),
+                buffer.as_slice(),
+                recip_bit_depth,
+            );
+
+            std::ptr::copy_nonoverlapping(dst_buffer.as_ptr(), rem.as_mut_ptr(), rem.len());
+        }
+    }
+}
+
 #[derive(Default)]
 struct NeonPremultiplyExecutorAnyBitDepth {}
 
@@ -206,6 +271,26 @@ impl NeonPremultiplyExecutor for NeonPremultiplyExecutorAnyBitDepth {
 fn neon_premultiply_alpha_rgba_row_u16(dst: &mut [u16], src: &[u16], bit_depth: usize) {
     assert_ne!(bit_depth, 0, "Something goes wrong!");
 
+    if std::arch::is_aarch64_feature_detected!("fp16") {
+        if bit_depth == 10 {
+            neon_pa_dispatch(
+                dst,
+                src,
+                bit_depth,
+                NeonPremultiplyExecutorFloat16::<10>::default(),
+            );
+            return;
+        } else if bit_depth == 12 {
+            neon_pa_dispatch(
+                dst,
+                src,
+                bit_depth,
+                NeonPremultiplyExecutorFloat16::<12>::default(),
+            );
+            return;
+        }
+    }
+
     if bit_depth == 10 {
         neon_pa_dispatch(
             dst,