Thread improvements, a lot of reworking

awxkee · Jun 4, 2024 · 90d6e5b
1 parent adc785d
commit 90d6e5b
Show file tree

Hide file tree

Showing 14 changed files with 233 additions and 130 deletions.
diff --git a/README.md b/README.md
@@ -19,20 +19,41 @@ Example comparison time for downscale RGB 4928x3279 image in two times for x86_6
 | pic-scale |  26.13   |
 | fir sse   |  26.84   |
 
+M3 Pro. NEON
+
+|           | Lanczos3 |
+|-----------|:--------:|
+| pic-scale |  23.04   |
+| fir sse   |  37.00   |
+
 Example comparison time for downscale RGBA 4928x3279 image in two times for x86_64 SSE with premultiplying alpha.
 
 |           | Lanczos3 |
 |-----------|:--------:|
 | pic-scale |  42.35   |
 | fir sse   |  42.96   |
 
+M3 Pro. NEON
+
+|           | Lanczos3 |
+|-----------|:--------:|
+| pic-scale |  47.45   |
+| fir sse   |  53.81   |
+
 Example comparison time for downscale RGBA 4928x3279 image in two times for x86_64 SSE without premultiplying alpha.
 
 |           | Lanczos3 |
 |-----------|:--------:|
 | pic-scale |  26.92   |
 | fir sse   |  38.30   |
 
+M3 Pro. NEON
+
+|           | Lanczos3 |
+|-----------|:--------:|
+| pic-scale |  38.75   |
+| fir sse   |  45.79   |
+
 #### Example integration with `image` crate
 
 ```rust

diff --git a/app/src/main.rs b/app/src/main.rs
@@ -8,12 +8,15 @@ use fast_image_resize::{
 use image::io::Reader as ImageReader;
 use image::{EncodableLayout, GenericImageView};
 
-use pic_scale::{ImageSize, ImageStore, LabScaler, LinearScaler, LuvScaler, ResamplingFunction, Scaler, Scaling, ThreadingPolicy};
+use pic_scale::{
+    ImageSize, ImageStore, LabScaler, LinearScaler, LuvScaler, ResamplingFunction, Scaler, Scaling,
+    ThreadingPolicy,
+};
 
 fn main() {
-    test_fast_image();
+    // test_fast_image();
 
-    let img = ImageReader::open("./assets/nasa-4928x3279.png")
+    let img = ImageReader::open("./assets/asset_5.png")
         .unwrap()
         .decode()
         .unwrap();
@@ -22,13 +25,14 @@ fn main() {
 
     let start_time = Instant::now();
 
-    let mut scaler = Scaler::new(ResamplingFunction::Lanczos3);
-    scaler.set_threading_policy(ThreadingPolicy::Single);
+    let mut scaler = LabScaler::new(ResamplingFunction::Lanczos3);
+    scaler.set_threading_policy(ThreadingPolicy::Adaptive);
     let store =
-        ImageStore::<u8, 3>::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize);
-    let resized = scaler.resize_rgb(
+        ImageStore::<u8, 4>::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize);
+    let resized = scaler.resize_rgba(
         ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2),
         store,
+        false,
     );
 
     let elapsed_time = start_time.elapsed();
@@ -57,7 +61,7 @@ fn main() {
 }
 
 fn test_fast_image() {
-    let img = ImageReader::open("./assets/nasa-4928x3279.png")
+    let img = ImageReader::open("./assets/asset_5.png")
         .unwrap()
         .decode()
         .unwrap();
@@ -67,7 +71,7 @@ fn test_fast_image() {
 
     let start_time = Instant::now();
 
-    let pixel_type: PixelType = PixelType::U8x3;
+    let pixel_type: PixelType = PixelType::U8x4;
 
     let src_image = Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap();
 
@@ -86,7 +90,9 @@ fn test_fast_image() {
         .resize(
             &src_image,
             &mut dst_image,
-            &ResizeOptions::new().resize_alg(ResizeAlg::Convolution(Lanczos3)).use_alpha(true),
+            &ResizeOptions::new()
+                .resize_alg(ResizeAlg::Convolution(Lanczos3))
+                .use_alpha(false),
         )
         .unwrap();
 

diff --git a/src/alpha_handle.rs b/src/alpha_handle.rs
@@ -56,7 +56,11 @@ pub unsafe fn neon_umpremultiply_alpha(v: uint8x16_t, a_values: uint8x16_t) -> u
     let hi_hi = vcvtaq_u32_f32(vmulq_f32(hi_hi, a_hi_ho));
     let lo = vcombine_u16(vmovn_u32(lo_lo), vmovn_u32(lo_hi));
     let hi = vcombine_u16(vmovn_u32(hi_lo), vmovn_u32(hi_hi));
-    vbslq_u8(zero_mask, vdupq_n_u8(0), vcombine_u8(vqmovn_u16(lo), vqmovn_u16(hi)))
+    vbslq_u8(
+        zero_mask,
+        vdupq_n_u8(0),
+        vcombine_u8(vqmovn_u16(lo), vqmovn_u16(hi)),
+    )
 }
 
 #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
@@ -189,7 +193,9 @@ pub fn premultiply_alpha_rgba(dst: &mut [u8], src: &[u8], width: usize, height:
                 let px = _cx * 4;
                 let src_ptr = src.as_ptr().add(offset + px);
                 let mut pixel = vld4q_u8(src_ptr);
-
+                pixel.0 = neon_premultiply_alpha(pixel.0, pixel.3);
+                pixel.1 = neon_premultiply_alpha(pixel.1, pixel.3);
+                pixel.2 = neon_premultiply_alpha(pixel.2, pixel.3);
                 let dst_ptr = dst.as_mut_ptr().add(offset + px);
                 vst4q_u8(dst_ptr, pixel);
                 _cx += 16;

diff --git a/src/convolve_u8.rs b/src/convolve_u8.rs
@@ -1,4 +1,5 @@
 use crate::filter_weights::FilterBounds;
+use crate::support::{PRECISION, ROUNDING_APPROX};
 
 #[inline(always)]
 #[allow(unused)]
@@ -11,7 +12,7 @@ pub(crate) unsafe fn convolve_vertical_part<const PART: usize, const CHANNELS: u
     filter: *const i16,
     bounds: &FilterBounds,
 ) {
-    let mut store: [[i32; CHANNELS]; PART] = [[0; CHANNELS]; PART];
+    let mut store: [[i32; CHANNELS]; PART] = [[ROUNDING_APPROX; CHANNELS]; PART];
 
     for j in 0..bounds.size {
         let py = start_y + j;
@@ -33,7 +34,7 @@ pub(crate) unsafe fn convolve_vertical_part<const PART: usize, const CHANNELS: u
         let dst_ptr = dst.add(px);
         for c in 0..CHANNELS {
             let vl = *(*store.get_unchecked_mut(x)).get_unchecked_mut(c);
-            let ck = vl >> 12;
+            let ck = vl >> PRECISION;
             *dst_ptr.add(c) = ck.max(0).min(255) as u8;
         }
     }

diff --git a/src/lib.rs b/src/lib.rs
@@ -27,6 +27,7 @@ mod luv_scaler;
 mod sse_rgb_f32;
 mod alpha_handle;
 mod sse_utils;
+mod support;
 
 pub use image_size::ImageSize;
 pub use image_store::ImageStore;

diff --git a/src/neon_rgb_u8.rs b/src/neon_rgb_u8.rs
@@ -3,6 +3,7 @@ pub mod neon_rgb {
     use crate::filter_weights::{FilterBounds, FilterWeights};
     use crate::neon_simd_u8::neon_convolve_u8;
     use std::arch::aarch64::*;
+    use crate::support::ROUNDING_APPROX;
 
     pub unsafe fn convolve_horizontal_rgb_neon_rows_4(
         dst_width: usize,
@@ -23,13 +24,14 @@ pub mod neon_rgb {
         let weights_ptr = approx_weights.weights.as_ptr();
         const CHANNELS: usize = 3;
         let zeros = vdupq_n_s32(0i32);
+        let init = vdupq_n_s32(ROUNDING_APPROX);
         for x in 0..dst_width {
             let bounds = unsafe { approx_weights.bounds.get_unchecked(x) };
             let mut jx = 0usize;
-            let mut store_0 = zeros;
-            let mut store_1 = zeros;
-            let mut store_2 = zeros;
-            let mut store_3 = zeros;
+            let mut store_0 = init;
+            let mut store_1 = init;
+            let mut store_2 = init;
+            let mut store_3 = init;
 
             while jx + 4 < bounds.size && x + 6 < src_width {
                 let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
@@ -235,7 +237,7 @@ pub mod neon_rgb {
         for x in 0..dst_width {
             let bounds = unsafe { approx_weights.bounds.get_unchecked(x) };
             let mut jx = 0usize;
-            let mut store = zeros;
+            let mut store = vdupq_n_s32(ROUNDING_APPROX);
 
             while jx + 4 < bounds.size && x + 6 < src_width {
                 let ptr = unsafe { weights_ptr.add(jx + filter_offset) };

diff --git a/src/neon_simd_u8.rs b/src/neon_simd_u8.rs
@@ -2,6 +2,7 @@
 pub mod neon_convolve_u8 {
     use crate::filter_weights::FilterBounds;
     use std::arch::aarch64::*;
+    use crate::support::ROUNDING_APPROX;
 
     #[inline(always)]
     pub(crate) unsafe fn convolve_horizontal_parts_one_rgba(
@@ -102,14 +103,15 @@ pub mod neon_convolve_u8 {
         filter: *const i16,
         bounds: &FilterBounds,
     ) {
-        let mut store_0 = vdupq_n_s32(0i32);
-        let mut store_1 = vdupq_n_s32(0i32);
-        let mut store_2 = vdupq_n_s32(0i32);
-        let mut store_3 = vdupq_n_s32(0i32);
-        let mut store_4 = vdupq_n_s32(0i32);
-        let mut store_5 = vdupq_n_s32(0i32);
-        let mut store_6 = vdupq_n_s32(0i32);
-        let mut store_7 = vdupq_n_s32(0i32);
+        let vld = vdupq_n_s32(ROUNDING_APPROX);
+        let mut store_0 = vld;
+        let mut store_1 = vld;
+        let mut store_2 = vld;
+        let mut store_3 = vld;
+        let mut store_4 = vld;
+        let mut store_5 = vld;
+        let mut store_6 = vld;
+        let mut store_7 = vld;
 
         let px = start_x;
 
@@ -178,10 +180,11 @@ pub mod neon_convolve_u8 {
         filter: *const i16,
         bounds: &FilterBounds,
     ) {
-        let mut store_0 = vdupq_n_s32(0i32);
-        let mut store_1 = vdupq_n_s32(0i32);
-        let mut store_2 = vdupq_n_s32(0i32);
-        let mut store_3 = vdupq_n_s32(0i32);
+        let vld = vdupq_n_s32(ROUNDING_APPROX);
+        let mut store_0 = vld;
+        let mut store_1 = vld;
+        let mut store_2 = vld;
+        let mut store_3 = vld;
 
         let px = start_x;
 
@@ -230,8 +233,9 @@ pub mod neon_convolve_u8 {
         bounds: &FilterBounds,
         blend_length: usize,
     ) {
-        let mut store_0 = vdupq_n_s32(0i32);
-        let mut store_1 = vdupq_n_s32(0i32);
+        let vld = vdupq_n_s32(ROUNDING_APPROX);
+        let mut store_0 = vld;
+        let mut store_1 = vld;
 
         let px = start_x;
 

diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs
@@ -10,6 +10,7 @@ use crate::image_store::ImageStore;
 use crate::neon_rgb_u8::neon_rgb::*;
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
 use crate::sse_rgb_u8::sse_rgb::*;
+use crate::support::ROUNDING_APPROX;
 use crate::unsafe_slice::UnsafeSlice;
 
 #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
@@ -221,9 +222,9 @@ fn convolve_horizontal_rgb_native_row(
     let mut filter_offset = 0usize;
     let weights_ptr = filter_weights.weights.as_ptr();
     for x in 0..dst_width {
-        let mut sum_r = 0i32;
-        let mut sum_g = 0i32;
-        let mut sum_b = 0i32;
+        let mut sum_r = ROUNDING_APPROX;
+        let mut sum_g = ROUNDING_APPROX;
+        let mut sum_b = ROUNDING_APPROX;
 
         let bounds = unsafe { filter_weights.bounds.get_unchecked(x) };
         let start_x = bounds.start;