diff --git a/app/Cargo.toml b/app/Cargo.toml
index 1c09c6d..4b7e008 100644
--- a/app/Cargo.toml
+++ b/app/Cargo.toml
@@ -6,7 +6,7 @@ edition = "2021"
 [dependencies]
 image = { version = "0.25.5", features = ["default"] }
 #image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] }
-pic-scale = { path = "..", features = ["half"], default-features = true }
+pic-scale = { path = "..", features = ["half", "nightly_i8mm"], default-features = true }
 fast_image_resize = { version = "5.0.0", features = [] }
 half = { version = "2.4.1", default-features = true }
 
diff --git a/app/src/main.rs b/app/src/main.rs
index 445e76d..935f996 100644
--- a/app/src/main.rs
+++ b/app/src/main.rs
@@ -60,7 +60,7 @@ fn main() {
     //
 
     let store =
-        ImageStore::<u8, 4>::from_slice(&bytes, dimensions.0 as usize, dimensions.1 as usize)
+        ImageStore::<u16, 4>::from_slice(&choke, dimensions.0 as usize, dimensions.1 as usize)
             .unwrap();
 
     let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4);
@@ -75,7 +75,7 @@ fn main() {
     // )
     // .unwrap();
 
-    let mut dst_store = ImageStoreMut::<u8, 4>::alloc_with_depth(
+    let mut dst_store = ImageStoreMut::<u16, 4>::alloc_with_depth(
         dimensions.0 as usize / 4,
         dimensions.1 as usize / 4,
         10,
@@ -83,7 +83,9 @@ fn main() {
     // for i in 0..25 {
     let start_time = Instant::now();
 
-    scaler.resize_rgba(&store, &mut dst_store, false).unwrap();
+    scaler
+        .resize_rgba_u16(&store, &mut dst_store, true)
+        .unwrap();
 
     let elapsed_time = start_time.elapsed();
     // Print the elapsed time in milliseconds
@@ -161,13 +163,13 @@ fn main() {
     //     .map(|&x| (x * 255f32) as u8)
     //     .collect();
 
-    // let dst: Vec<u8> = dst_store
-    //     .as_bytes()
-    //     .iter()
-    //     .map(|&x| (x >> 2) as u8)
-    //     .collect();
+    let dst: Vec<u8> = dst_store
+        .as_bytes()
+        .iter()
+        .map(|&x| (x >> 2) as u8)
+        .collect();
 
-    let dst = dst_store.as_bytes();
+    // let dst = dst_store.as_bytes();
     // let dst = resized;
     // image::save_buffer(
     //     "converted.png",
diff --git a/src/alpha_handle_f32.rs b/src/alpha_handle_f32.rs
index e1e05bd..1b3b03a 100644
--- a/src/alpha_handle_f32.rs
+++ b/src/alpha_handle_f32.rs
@@ -131,8 +131,16 @@ pub(crate) fn premultiply_alpha_rgba_f32(
     height: usize,
     pool: &Option<ThreadPool>,
 ) {
-    let mut _dispatcher: fn(&mut [f32], usize, &[f32], usize, usize, usize, &Option<ThreadPool>) =
-        premultiply_alpha_rgba_impl_f32;
+    #[allow(clippy::type_complexity)]
+    let mut _dispatcher: fn(
+        &mut [f32],
+        usize,
+        &[f32],
+        usize,
+        usize,
+        usize,
+        &Option<ThreadPool>,
+    ) = premultiply_alpha_rgba_impl_f32;
     #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
     {
         _dispatcher = neon_premultiply_alpha_rgba_f32;
diff --git a/src/neon/alpha_u16.rs b/src/neon/alpha_u16.rs
index 020565e..c98225b 100644
--- a/src/neon/alpha_u16.rs
+++ b/src/neon/alpha_u16.rs
@@ -27,7 +27,9 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 use crate::cpu_features::is_aarch_f16_supported;
-use crate::neon::f16_utils::{xvcvtaq_u16_f16, xvcvtq_f16_u16, xvdivq_f16, xvmulq_f16};
+use crate::neon::f16_utils::{
+    xvcvtaq_u16_f16, xvcvtq_f16_u16, xvdivq_f16, xvmulq_f16, xvrecpeq_f16,
+};
 use crate::neon::{x_float16x8_t, xreinterpretq_f16_u16};
 use rayon::iter::{IndexedParallelIterator, ParallelIterator};
 use rayon::prelude::{ParallelSlice, ParallelSliceMut};
@@ -137,6 +139,69 @@ impl NeonPremultiplyExecutor for NeonPremultiplyExecutor
     }
 }
 
+#[derive(Default)]
+struct NeonPremultiplyExecutorFloat16<const BIT_DEPTH: usize> {}
+
+impl<const BIT_DEPTH: usize> NeonPremultiplyExecutorFloat16<BIT_DEPTH> {
+    #[inline]
+    #[target_feature(enable = "fp16")]
+    unsafe fn premultiply_chunk(
+        &self,
+        dst: &mut [u16],
+        src: &[u16],
+        recip_bit_depth: x_float16x8_t,
+    ) {
+        let pixel = vld4q_u16(src.as_ptr());
+
+        let a_values = xvmulq_f16(xvcvtq_f16_u16(pixel.3), recip_bit_depth);
+
+        let new_r = xvcvtaq_u16_f16(xvmulq_f16(xvcvtq_f16_u16(pixel.0), a_values));
+        let new_g = xvcvtaq_u16_f16(xvmulq_f16(xvcvtq_f16_u16(pixel.1), a_values));
+        let new_b = xvcvtaq_u16_f16(xvmulq_f16(xvcvtq_f16_u16(pixel.2), a_values));
+
+        let new_px = uint16x8x4_t(new_r, new_g, new_b, pixel.3);
+
+        vst4q_u16(dst.as_mut_ptr(), new_px);
+    }
+}
+
+impl<const BIT_DEPTH: usize> NeonPremultiplyExecutor
+    for NeonPremultiplyExecutorFloat16<BIT_DEPTH>
+{
+    #[target_feature(enable = "fp16")]
+    unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) {
+        assert_ne!(bit_depth, 0, "Something goes wrong!");
+        assert!((1..=16).contains(&bit_depth));
+
+        let recip_bit_depth = xvrecpeq_f16(xvcvtq_f16_u16(vdupq_n_u16((1 << bit_depth) - 1)));
+
+        let mut rem = dst;
+        let mut src_rem = src;
+
+        for (dst, src) in rem.chunks_exact_mut(8 * 4).zip(src_rem.chunks_exact(8 * 4)) {
+            self.premultiply_chunk(dst, src, recip_bit_depth);
+        }
+
+        rem = rem.chunks_exact_mut(8 * 4).into_remainder();
+        src_rem = src_rem.chunks_exact(8 * 4).remainder();
+
+        if !rem.is_empty() {
+            assert!(src_rem.len() < 8 * 4);
+            assert!(rem.len() < 8 * 4);
+            assert_eq!(src_rem.len(), rem.len());
+
+            let mut buffer: [u16; 8 * 4] = [0u16; 8 * 4];
+            let mut dst_buffer: [u16; 8 * 4] = [0u16; 8 * 4];
+            std::ptr::copy_nonoverlapping(src_rem.as_ptr(), buffer.as_mut_ptr(), src_rem.len());
+
+            self.premultiply_chunk(
+                dst_buffer.as_mut_slice(),
+                buffer.as_slice(),
+                recip_bit_depth,
+            );
+
+            std::ptr::copy_nonoverlapping(dst_buffer.as_ptr(), rem.as_mut_ptr(), rem.len());
+        }
+    }
+}
+
 #[derive(Default)]
 struct NeonPremultiplyExecutorAnyBitDepth {}
 
@@ -206,6 +271,26 @@ impl NeonPremultiplyExecutor for NeonPremultiplyExecutorAnyBitDepth {
 fn neon_premultiply_alpha_rgba_row_u16(dst: &mut [u16], src: &[u16], bit_depth: usize) {
     assert_ne!(bit_depth, 0, "Something goes wrong!");
 
+    if std::arch::is_aarch64_feature_detected!("fp16") {
+        if bit_depth == 10 {
+            neon_pa_dispatch(
+                dst,
+                src,
+                bit_depth,
+                NeonPremultiplyExecutorFloat16::<10>::default(),
+            );
+            return;
+        } else if bit_depth == 12 {
+            neon_pa_dispatch(
+                dst,
+                src,
+                bit_depth,
+                NeonPremultiplyExecutorFloat16::<12>::default(),
+            );
+            return;
+        }
+    }
+
     if bit_depth == 10 {
         neon_pa_dispatch(
             dst,
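
For reference, a scalar sketch of the arithmetic the new fp16 path implements (a hypothetical helper for illustration, not part of the patch): each color channel is scaled by `alpha / max_value`, where `max_value = (1 << bit_depth) - 1`, and the division is replaced by the fp16 reciprocal estimate `xvrecpeq_f16` (NEON FRECPE), trading a little precision for speed; the dispatcher enables this path only for 10- and 12-bit input and keeps the existing executors as the fallback.

    // Hypothetical scalar reference for the vectorized fp16 premultiply path.
    // Assumes interleaved RGBA pixels with `bit_depth`-bit values held in u16.
    fn premultiply_rgba_u16_reference(dst: &mut [u16], src: &[u16], bit_depth: usize) {
        let max_value = (1u32 << bit_depth) - 1;
        for (dst, src) in dst.chunks_exact_mut(4).zip(src.chunks_exact(4)) {
            let a = src[3] as u32;
            // channel' = round(channel * alpha / max_value); the NEON code
            // approximates 1 / max_value with a reciprocal estimate and rounds
            // through xvcvtaq_u16_f16 (to nearest, ties away from zero).
            for i in 0..3 {
                dst[i] = ((src[i] as u32 * a + max_value / 2) / max_value) as u16;
            }
            // Alpha passes through unchanged, as in the SIMD path.
            dst[3] = src[3];
        }
    }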