Horizontal convolution SSE fixes

awxkee · Jun 20, 2024 · 5468125 · 5468125
1 parent 16793ca
commit 5468125
Show file tree

Hide file tree

Showing 4 changed files with 48 additions and 39 deletions.
diff --git a/app/src/main.rs b/app/src/main.rs
@@ -20,7 +20,7 @@ fn main() {
     let dimensions = img.dimensions();
     let mut bytes = Vec::from(img.as_bytes());
 
-    let mut scaler = XYZScaler::new(ResamplingFunction::Hann);
+    let mut scaler = XYZScaler::new(ResamplingFunction::Lanczos3);
     scaler.set_threading_policy(ThreadingPolicy::Single);
     // let store =
     //     ImageStore::<u8, 4>::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize);
@@ -39,7 +39,7 @@ fn main() {
     let resized = scaler.resize_rgba(
         ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2),
         store,
-        true,
+        false,
     );
 
     let elapsed_time = start_time.elapsed();
@@ -111,7 +111,7 @@ fn main() {
 }
 
 fn test_fast_image() {
-    let img = ImageReader::open("./assets/nasa-4928x3279.png")
+    let img = ImageReader::open("./assets/asset_5.png")
         .unwrap()
         .decode()
         .unwrap();
@@ -121,7 +121,7 @@ fn test_fast_image() {
 
     let start_time = Instant::now();
 
-    let pixel_type: PixelType = PixelType::U8x3;
+    let pixel_type: PixelType = PixelType::U8x4;
 
     let src_image = Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap();
 

diff --git a/src/scaler.rs b/src/scaler.rs
@@ -63,7 +63,7 @@ impl Scaler {
         let base_size = (filter_base_size * filter_scale_cutoff).round() as usize;
         // Kernel size must be always odd
         let kernel_size = base_size * 2 + 1usize;
-        let filter_radius = base_size as i32;
+        let filter_radius = base_size as f32;
         let filter_scale = 1f32 / filter_scale_cutoff;
         let mut weights: Vec<f32> = vec![0f32; kernel_size * out_size];
         let mut local_filters = vec![0f32; kernel_size];
@@ -85,9 +85,8 @@ impl Scaler {
             let mut weights_sum: f32 = 0f32;
             let mut local_filter_iteration = 0usize;
 
-            let start = (center_x - filter_radius as f32).floor().max(0f32) as usize;
-            let end = ((center_x + filter_radius as f32).ceil().min(in_size as f32) as usize)
-                .min(start + kernel_size);
+            let start = (center_x - filter_radius).floor().max(0f32) as usize;
+            let end = ((center_x + filter_radius).ceil().min(in_size as f32) as usize).min(start + kernel_size);
 
             let center = center_x - 0.5f32;
 
@@ -115,6 +114,7 @@ impl Scaler {
                     };
                     weight = window * resampling_function(x_kernel_scaled);
                 } else {
+                    let dx = dx.abs();
                     weight = resampling_function(dx * filter_scale);
                 }
                 weights_sum += weight;
@@ -161,7 +161,7 @@ impl Scaler {
             kernel_size,
             kernel_size,
             out_size,
-            filter_radius,
+            filter_radius as i32,
             bounds,
         );
     }
@@ -273,14 +273,14 @@ impl Scaling for Scaler {
         allocated_store_vertical.resize(store.width * 4 * new_size.height, 0f32);
         let mut new_image_vertical =
             ImageStore::<f32, 4>::new(allocated_store_vertical, store.width, new_size.height);
+        let horizontal_filters = self.generate_weights(store.width, new_size.width);
         let vertical_filters = self.generate_weights(store.height, new_image_vertical.height);
         store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool);
 
         let mut allocated_store_horizontal: Vec<f32> = vec![];
         allocated_store_horizontal.resize(new_size.width * 4 * new_size.height, 0f32);
         let mut new_image_horizontal =
             ImageStore::<f32, 4>::new(allocated_store_horizontal, new_size.width, new_size.height);
-        let horizontal_filters = self.generate_weights(store.width, new_size.width);
         new_image_vertical.convolve_horizontal(
             horizontal_filters,
             &mut new_image_horizontal,
@@ -326,11 +326,11 @@ impl Scaling for Scaler {
             .get_pool(ImageSize::new(new_size.width, new_size.height));
 
         let mut new_image_vertical = ImageStore::<u8, 4>::alloc(src_store.width, new_size.height);
+        let horizontal_filters = self.generate_weights(src_store.width, new_size.width);
         let vertical_filters = self.generate_weights(src_store.height, new_image_vertical.height);
         src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool);
 
         let mut new_image_horizontal = ImageStore::<u8, 4>::alloc(new_size.width, new_size.height);
-        let horizontal_filters = self.generate_weights(src_store.width, new_size.width);
         new_image_vertical.convolve_horizontal(
             horizontal_filters,
             &mut new_image_horizontal,

diff --git a/src/sse/rgb_f32.rs b/src/sse/rgb_f32.rs
@@ -112,8 +112,9 @@ pub fn convolve_horizontal_rgb_sse_row_one_f32(
                 let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned());
                 let weight2 = _mm_set1_ps(ptr.add(2).read_unaligned());
                 let weight3 = _mm_set1_ps(ptr.add(3).read_unaligned());
+                let filter_start = jx + bounds.start;
                 store = convolve_horizontal_parts_4_rgb_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0,
                     weight0,
                     weight1,
@@ -128,8 +129,9 @@ pub fn convolve_horizontal_rgb_sse_row_one_f32(
                 let ptr = weights_ptr.add(jx + filter_offset);
                 let weight0 = _mm_set1_ps(ptr.read_unaligned());
                 let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned());
+                let filter_start = jx + bounds.start;
                 store = convolve_horizontal_parts_2_rgb_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0,
                     weight0,
                     weight1,
@@ -141,8 +143,9 @@ pub fn convolve_horizontal_rgb_sse_row_one_f32(
             while jx < bounds.size {
                 let ptr = weights_ptr.add(jx + filter_offset);
                 let weight0 = _mm_set1_ps(ptr.read_unaligned());
+                let filter_start = jx + bounds.start;
                 store = convolve_horizontal_parts_one_rgb_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0,
                     weight0,
                     store,
@@ -194,8 +197,9 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32(
                 let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned());
                 let weight2 = _mm_set1_ps(ptr.add(2).read_unaligned());
                 let weight3 = _mm_set1_ps(ptr.add(3).read_unaligned());
+                let filter_start = jx + bounds.start;
                 store_0 = convolve_horizontal_parts_4_rgb_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0,
                     weight0,
                     weight1,
@@ -204,7 +208,7 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32(
                     store_0,
                 );
                 store_1 = convolve_horizontal_parts_4_rgb_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0.add(src_stride),
                     weight0,
                     weight1,
@@ -213,7 +217,7 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32(
                     store_1,
                 );
                 store_2 = convolve_horizontal_parts_4_rgb_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0.add(src_stride * 2),
                     weight0,
                     weight1,
@@ -222,7 +226,7 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32(
                     store_2,
                 );
                 store_3 = convolve_horizontal_parts_4_rgb_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0.add(src_stride * 3),
                     weight0,
                     weight1,
@@ -237,29 +241,30 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32(
                 let ptr = weights_ptr.add(jx + filter_offset);
                 let weight0 = _mm_set1_ps(ptr.read_unaligned());
                 let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned());
+                let filter_start = jx + bounds.start;
                 store_0 = convolve_horizontal_parts_2_rgb_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0,
                     weight0,
                     weight1,
                     store_0,
                 );
                 store_1 = convolve_horizontal_parts_2_rgb_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0.add(src_stride),
                     weight0,
                     weight1,
                     store_1,
                 );
                 store_2 = convolve_horizontal_parts_2_rgb_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0.add(src_stride * 2),
                     weight0,
                     weight1,
                     store_2,
                 );
                 store_3 = convolve_horizontal_parts_2_rgb_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0.add(src_stride * 3),
                     weight0,
                     weight1,
@@ -271,26 +276,27 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32(
             while jx < bounds.size {
                 let ptr = weights_ptr.add(jx + filter_offset);
                 let weight0 = _mm_set1_ps(ptr.read_unaligned());
+                let filter_start = jx + bounds.start;
                 store_0 = convolve_horizontal_parts_one_rgb_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0,
                     weight0,
                     store_0,
                 );
                 store_1 = convolve_horizontal_parts_one_rgb_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0.add(src_stride),
                     weight0,
                     store_1,
                 );
                 store_2 = convolve_horizontal_parts_one_rgb_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0.add(src_stride * 2),
                     weight0,
                     store_2,
                 );
                 store_3 = convolve_horizontal_parts_one_rgb_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0.add(src_stride * 3),
                     weight0,
                     store_3,

diff --git a/src/sse/rgba_f32.rs b/src/sse/rgba_f32.rs
@@ -147,15 +147,16 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32(
             let mut store_1 = zeros;
             let mut store_2 = zeros;
             let mut store_3 = zeros;
-
             while jx + 4 < bounds.size {
                 let ptr = weights_ptr.add(jx + filter_offset);
                 let weight0 = _mm_set1_ps(ptr.read_unaligned());
                 let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned());
                 let weight2 = _mm_set1_ps(ptr.add(2).read_unaligned());
                 let weight3 = _mm_set1_ps(ptr.add(3).read_unaligned());
+                let filter_start = jx + bounds.start;
+
                 store_0 = convolve_horizontal_parts_4_rgba_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0,
                     weight0,
                     weight1,
@@ -164,7 +165,7 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32(
                     store_0,
                 );
                 store_1 = convolve_horizontal_parts_4_rgba_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0.add(src_stride),
                     weight0,
                     weight1,
@@ -173,7 +174,7 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32(
                     store_1,
                 );
                 store_2 = convolve_horizontal_parts_4_rgba_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0.add(src_stride * 2),
                     weight0,
                     weight1,
@@ -182,7 +183,7 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32(
                     store_2,
                 );
                 store_3 = convolve_horizontal_parts_4_rgba_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0.add(src_stride * 3),
                     weight0,
                     weight1,
@@ -197,29 +198,30 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32(
                 let ptr = weights_ptr.add(jx + filter_offset);
                 let weight0 = _mm_set1_ps(ptr.read_unaligned());
                 let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned());
+                let filter_start = jx + bounds.start;
                 store_0 = convolve_horizontal_parts_2_rgba_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0,
                     weight0,
                     weight1,
                     store_0,
                 );
                 store_1 = convolve_horizontal_parts_2_rgba_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0.add(src_stride),
                     weight0,
                     weight1,
                     store_1,
                 );
                 store_2 = convolve_horizontal_parts_2_rgba_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0.add(src_stride * 2),
                     weight0,
                     weight1,
                     store_2,
                 );
                 store_3 = convolve_horizontal_parts_2_rgba_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0.add(src_stride * 3),
                     weight0,
                     weight1,
@@ -230,27 +232,28 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32(
 
             while jx < bounds.size {
                 let ptr = weights_ptr.add(jx + filter_offset);
+                let filter_start = jx + bounds.start;
                 let weight0 = _mm_set1_ps(ptr.read_unaligned());
                 store_0 = convolve_horizontal_parts_one_rgba_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0,
                     weight0,
                     store_0,
                 );
                 store_1 = convolve_horizontal_parts_one_rgba_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0.add(src_stride),
                     weight0,
                     store_1,
                 );
                 store_2 = convolve_horizontal_parts_one_rgba_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0.add(src_stride * 2),
                     weight0,
                     store_2,
                 );
                 store_3 = convolve_horizontal_parts_one_rgba_f32(
-                    bounds.start,
+                    filter_start,
                     unsafe_source_ptr_0.add(src_stride * 3),
                     weight0,
                     store_3,