Skip to content

Commit

Permalink
Merge pull request #31 from awxkee/dev
Browse files Browse the repository at this point in the history
NEON faster u16 pre-multiplication if `fp16` available
  • Loading branch information
awxkee authored Jan 7, 2025
2 parents 69cf1f1 + 6684e73 commit 217b994
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 13 deletions.
2 changes: 1 addition & 1 deletion app/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ edition = "2021"
[dependencies]
image = { version = "0.25.5", features = ["default"] }
#image = { path= "../../../RustroverProjects/image", features = ["default", "avif", "avif-native"] }
pic-scale = { path = "..", features = ["half"], default-features = true }
pic-scale = { path = "..", features = ["half", "nightly_i8mm"], default-features = true }
fast_image_resize = { version = "5.0.0", features = [] }
half = { version = "2.4.1", default-features = true }

Expand Down
20 changes: 11 additions & 9 deletions app/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ fn main() {

//
let store =
ImageStore::<u8, 4>::from_slice(&bytes, dimensions.0 as usize, dimensions.1 as usize)
ImageStore::<u16, 4>::from_slice(&choke, dimensions.0 as usize, dimensions.1 as usize)
.unwrap();

let dst_size = ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4);
Expand All @@ -75,15 +75,17 @@ fn main() {
// )
// .unwrap();

let mut dst_store = ImageStoreMut::<u8, 4>::alloc_with_depth(
let mut dst_store = ImageStoreMut::<u16, 4>::alloc_with_depth(
dimensions.0 as usize / 4,
dimensions.1 as usize / 4,
10,
);

// for i in 0..25 {
let start_time = Instant::now();
scaler.resize_rgba(&store, &mut dst_store, false).unwrap();
scaler
.resize_rgba_u16(&store, &mut dst_store, true)
.unwrap();

let elapsed_time = start_time.elapsed();
// Print the elapsed time in milliseconds
Expand Down Expand Up @@ -161,13 +163,13 @@ fn main() {
// .map(|&x| (x * 255f32) as u8)
// .collect();

// let dst: Vec<u8> = dst_store
// .as_bytes()
// .iter()
// .map(|&x| (x >> 2) as u8)
// .collect();
let dst: Vec<u8> = dst_store
.as_bytes()
.iter()
.map(|&x| (x >> 2) as u8)
.collect();

let dst = dst_store.as_bytes();
// let dst = dst_store.as_bytes();
// let dst = resized;
// image::save_buffer(
// "converted.png",
Expand Down
12 changes: 10 additions & 2 deletions src/alpha_handle_f32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,16 @@ pub(crate) fn premultiply_alpha_rgba_f32(
height: usize,
pool: &Option<ThreadPool>,
) {
let mut _dispatcher: fn(&mut [f32], usize, &[f32], usize, usize, usize, &Option<ThreadPool>) =
premultiply_alpha_rgba_impl_f32;
#[allow(clippy::type_complexity)]
let mut _dispatcher: fn(
&mut [f32],
usize,
&[f32],
usize,
usize,
usize,
&Option<ThreadPool>,
) = premultiply_alpha_rgba_impl_f32;
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
{
_dispatcher = neon_premultiply_alpha_rgba_f32;
Expand Down
87 changes: 86 additions & 1 deletion src/neon/alpha_u16.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
use crate::cpu_features::is_aarch_f16_supported;
use crate::neon::f16_utils::{xvcvtaq_u16_f16, xvcvtq_f16_u16, xvdivq_f16, xvmulq_f16};
use crate::neon::f16_utils::{
xvcvtaq_u16_f16, xvcvtq_f16_u16, xvdivq_f16, xvmulq_f16, xvrecpeq_f16,
};
use crate::neon::{x_float16x8_t, xreinterpretq_f16_u16};
use rayon::iter::{IndexedParallelIterator, ParallelIterator};
use rayon::prelude::{ParallelSlice, ParallelSliceMut};
Expand Down Expand Up @@ -137,6 +139,69 @@ impl<const BIT_DEPTH: usize> NeonPremultiplyExecutor for NeonPremultiplyExecutor
}
}

/// Alpha pre-multiplication executor for RGBA `u16` rows that does its
/// arithmetic in half precision (`f16`) NEON lanes.
///
/// The type carries no state; it exists so the `fp16` code path can be
/// selected through the `NeonPremultiplyExecutor` trait. Its methods are
/// compiled with `#[target_feature(enable = "fp16")]`, so an instance must
/// only be used after the `fp16` CPU feature has been detected.
///
/// NOTE(review): `BIT_DEPTH` is not read by the methods of this type — the
/// bit depth is taken from the runtime `bit_depth` argument of `premultiply`.
/// Presumably the parameter is kept for symmetry with the other executors;
/// confirm before relying on it.
#[derive(Default)]
struct NeonPremultiplyExecutorFloat16<const BIT_DEPTH: usize> {}

impl<const BIT_DEPTH: usize> NeonPremultiplyExecutorFloat16<BIT_DEPTH> {
    /// Premultiplies one block of eight interleaved RGBA pixels.
    ///
    /// Loads 32 `u16` values from `src`, scales the three colour channels by
    /// `alpha * recip_bit_depth` in `f16` lanes, and stores 32 values to
    /// `dst`. The alpha channel is written back unchanged.
    ///
    /// `recip_bit_depth` is the per-lane factor applied to alpha before it is
    /// multiplied into the colour channels; the caller supplies it so it is
    /// computed once per row rather than once per block.
    ///
    /// # Safety
    ///
    /// * The CPU must support the `fp16` target feature.
    /// * `src` must be readable and `dst` writable for at least `8 * 4`
    ///   elements: the load and store go through raw pointers and perform no
    ///   length check.
    #[inline]
    #[target_feature(enable = "fp16")]
    unsafe fn premultiply_chunk(
        &self,
        dst: &mut [u16],
        src: &[u16],
        recip_bit_depth: x_float16x8_t,
    ) {
        // De-interleaving load: each vector holds one channel of 8 pixels.
        let uint16x8x4_t(red, green, blue, alpha) = vld4q_u16(src.as_ptr());

        // Alpha scaled by the supplied reciprocal factor, shared by all
        // three colour channels.
        let alpha_f16 = xvcvtq_f16_u16(alpha);
        let alpha_scale = xvmulq_f16(alpha_f16, recip_bit_depth);

        let red_scaled = xvmulq_f16(xvcvtq_f16_u16(red), alpha_scale);
        let green_scaled = xvmulq_f16(xvcvtq_f16_u16(green), alpha_scale);
        let blue_scaled = xvmulq_f16(xvcvtq_f16_u16(blue), alpha_scale);

        // Back to integers; alpha is passed through untouched.
        let premultiplied = uint16x8x4_t(
            xvcvtaq_u16_f16(red_scaled),
            xvcvtaq_u16_f16(green_scaled),
            xvcvtaq_u16_f16(blue_scaled),
            alpha,
        );

        // Re-interleaving store of the 8 RGBA pixels.
        vst4q_u16(dst.as_mut_ptr(), premultiplied);
    }
}

impl<const BIT_DEPTH: usize> NeonPremultiplyExecutor for NeonPremultiplyExecutorFloat16<BIT_DEPTH> {
    /// Premultiplies a row of interleaved RGBA `u16` pixels from `src` into
    /// `dst` using half-precision NEON arithmetic.
    ///
    /// Full blocks of eight pixels are processed directly. A trailing partial
    /// block is copied into a zero-padded stack buffer, processed there, and
    /// only the valid prefix is copied back to `dst`.
    ///
    /// # Panics
    ///
    /// * If `bit_depth` is outside `1..=15`. A depth of 16 is rejected
    ///   because the channel maximum is built as a `u16` (`1 << 16` overflows
    ///   that type) and, independently, 65535 is above the largest finite
    ///   `f16` value, so this code path cannot produce meaningful results
    ///   for it.
    /// * If the trailing partial blocks of `dst` and `src` differ in length.
    ///
    /// # Safety
    ///
    /// The CPU must support the `fp16` target feature.
    #[target_feature(enable = "fp16")]
    unsafe fn premultiply(&self, dst: &mut [u16], src: &[u16], bit_depth: usize) {
        // Eight RGBA pixels, four `u16` channels each, per NEON iteration.
        const CHUNK_LEN: usize = 8 * 4;

        assert_ne!(bit_depth, 0, "Something goes wrong!");
        assert!(
            (1..=15).contains(&bit_depth),
            "f16 premultiplication supports bit depths 1..=15, got {bit_depth}"
        );

        // Largest channel value for this depth; cannot overflow `u16` thanks
        // to the range check above.
        let max_value: u16 = (1 << bit_depth) - 1;
        // NOTE(review): `xvrecpeq_f16` is presumably the NEON reciprocal
        // *estimate*, so this factor is only an approximation of
        // `1 / max_value` — confirm the precision is acceptable.
        let recip_bit_depth = xvrecpeq_f16(xvcvtq_f16_u16(vdupq_n_u16(max_value)));

        let mut dst_chunks = dst.chunks_exact_mut(CHUNK_LEN);
        let src_chunks = src.chunks_exact(CHUNK_LEN);
        // The remainder is fixed when the iterator is created, so it can be
        // taken before the iterator is consumed by `zip`.
        let src_tail = src_chunks.remainder();

        for (dst_block, src_block) in dst_chunks.by_ref().zip(src_chunks) {
            // SAFETY: `chunks_exact*` yields slices of exactly `CHUNK_LEN`
            // elements, which is what `premultiply_chunk` loads and stores.
            self.premultiply_chunk(dst_block, src_block, recip_bit_depth);
        }

        let dst_tail = dst_chunks.into_remainder();

        if !dst_tail.is_empty() {
            assert_eq!(src_tail.len(), dst_tail.len());

            // Pad the partial block to a full one so the vector kernel never
            // touches memory outside the caller's slices.
            let mut buffer: [u16; CHUNK_LEN] = [0u16; CHUNK_LEN];
            let mut dst_buffer: [u16; CHUNK_LEN] = [0u16; CHUNK_LEN];
            buffer[..src_tail.len()].copy_from_slice(src_tail);

            // SAFETY: both stack buffers hold exactly `CHUNK_LEN` elements.
            self.premultiply_chunk(
                dst_buffer.as_mut_slice(),
                buffer.as_slice(),
                recip_bit_depth,
            );

            let tail_len = dst_tail.len();
            dst_tail.copy_from_slice(&dst_buffer[..tail_len]);
        }
    }
}

#[derive(Default)]
struct NeonPremultiplyExecutorAnyBitDepth {}

Expand Down Expand Up @@ -206,6 +271,26 @@ impl NeonPremultiplyExecutor for NeonPremultiplyExecutorAnyBitDepth {
fn neon_premultiply_alpha_rgba_row_u16(dst: &mut [u16], src: &[u16], bit_depth: usize) {
assert_ne!(bit_depth, 0, "Something goes wrong!");

if std::arch::is_aarch64_feature_detected!("fp16") {
if bit_depth == 10 {
neon_pa_dispatch(
dst,
src,
bit_depth,
NeonPremultiplyExecutorFloat16::<10>::default(),
);
return;
} else if bit_depth == 12 {
neon_pa_dispatch(
dst,
src,
bit_depth,
NeonPremultiplyExecutorFloat16::<12>::default(),
);
return;
}
}

if bit_depth == 10 {
neon_pa_dispatch(
dst,
Expand Down

0 comments on commit 217b994

Please sign in to comment.