diff --git a/app/src/main.rs b/app/src/main.rs index 1374351..967baf2 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -20,7 +20,7 @@ fn main() { let dimensions = img.dimensions(); let mut bytes = Vec::from(img.as_bytes()); - let mut scaler = XYZScaler::new(ResamplingFunction::Hann); + let mut scaler = XYZScaler::new(ResamplingFunction::Lanczos3); scaler.set_threading_policy(ThreadingPolicy::Single); // let store = // ImageStore::::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize); @@ -39,7 +39,7 @@ fn main() { let resized = scaler.resize_rgba( ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), store, - true, + false, ); let elapsed_time = start_time.elapsed(); @@ -111,7 +111,7 @@ fn main() { } fn test_fast_image() { - let img = ImageReader::open("./assets/nasa-4928x3279.png") + let img = ImageReader::open("./assets/asset_5.png") .unwrap() .decode() .unwrap(); @@ -121,7 +121,7 @@ fn test_fast_image() { let start_time = Instant::now(); - let pixel_type: PixelType = PixelType::U8x3; + let pixel_type: PixelType = PixelType::U8x4; let src_image = Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap(); diff --git a/src/scaler.rs b/src/scaler.rs index 75ea329..572695b 100644 --- a/src/scaler.rs +++ b/src/scaler.rs @@ -63,7 +63,7 @@ impl Scaler { let base_size = (filter_base_size * filter_scale_cutoff).round() as usize; // Kernel size must be always odd let kernel_size = base_size * 2 + 1usize; - let filter_radius = base_size as i32; + let filter_radius = base_size as f32; let filter_scale = 1f32 / filter_scale_cutoff; let mut weights: Vec = vec![0f32; kernel_size * out_size]; let mut local_filters = vec![0f32; kernel_size]; @@ -85,9 +85,8 @@ impl Scaler { let mut weights_sum: f32 = 0f32; let mut local_filter_iteration = 0usize; - let start = (center_x - filter_radius as f32).floor().max(0f32) as usize; - let end = ((center_x + filter_radius as f32).ceil().min(in_size as f32) as usize) - .min(start + kernel_size); + let start = (center_x - filter_radius).floor().max(0f32) as usize; + let end = ((center_x + filter_radius).ceil().min(in_size as f32) as usize).min(start + kernel_size); let center = center_x - 0.5f32; @@ -115,6 +114,7 @@ impl Scaler { }; weight = window * resampling_function(x_kernel_scaled); } else { + let dx = dx.abs(); weight = resampling_function(dx * filter_scale); } weights_sum += weight; @@ -161,7 +161,7 @@ impl Scaler { kernel_size, kernel_size, out_size, - filter_radius, + filter_radius as i32, bounds, ); } @@ -273,6 +273,7 @@ impl Scaling for Scaler { allocated_store_vertical.resize(store.width * 4 * new_size.height, 0f32); let mut new_image_vertical = ImageStore::::new(allocated_store_vertical, store.width, new_size.height); + let horizontal_filters = self.generate_weights(store.width, new_size.width); let vertical_filters = self.generate_weights(store.height, new_image_vertical.height); store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); @@ -280,7 +281,6 @@ impl Scaling for Scaler { allocated_store_horizontal.resize(new_size.width * 4 * new_size.height, 0f32); let mut new_image_horizontal = ImageStore::::new(allocated_store_horizontal, new_size.width, new_size.height); - let horizontal_filters = self.generate_weights(store.width, new_size.width); new_image_vertical.convolve_horizontal( horizontal_filters, &mut new_image_horizontal, @@ -326,11 +326,11 @@ impl Scaling for Scaler { .get_pool(ImageSize::new(new_size.width, new_size.height)); let mut new_image_vertical = ImageStore::::alloc(src_store.width, new_size.height); + let horizontal_filters = self.generate_weights(src_store.width, new_size.width); let vertical_filters = self.generate_weights(src_store.height, new_image_vertical.height); src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); let mut new_image_horizontal = ImageStore::::alloc(new_size.width, new_size.height); - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); new_image_vertical.convolve_horizontal( horizontal_filters, &mut new_image_horizontal, diff --git a/src/sse/rgb_f32.rs b/src/sse/rgb_f32.rs index b7a09b9..41f74c3 100644 --- a/src/sse/rgb_f32.rs +++ b/src/sse/rgb_f32.rs @@ -112,8 +112,9 @@ pub fn convolve_horizontal_rgb_sse_row_one_f32( let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned()); let weight2 = _mm_set1_ps(ptr.add(2).read_unaligned()); let weight3 = _mm_set1_ps(ptr.add(3).read_unaligned()); + let filter_start = jx + bounds.start; store = convolve_horizontal_parts_4_rgb_f32( - bounds.start, + filter_start, unsafe_source_ptr_0, weight0, weight1, @@ -128,8 +129,9 @@ pub fn convolve_horizontal_rgb_sse_row_one_f32( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = _mm_set1_ps(ptr.read_unaligned()); let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned()); + let filter_start = jx + bounds.start; store = convolve_horizontal_parts_2_rgb_f32( - bounds.start, + filter_start, unsafe_source_ptr_0, weight0, weight1, @@ -141,8 +143,9 @@ pub fn convolve_horizontal_rgb_sse_row_one_f32( while jx < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); let weight0 = _mm_set1_ps(ptr.read_unaligned()); + let filter_start = jx + bounds.start; store = convolve_horizontal_parts_one_rgb_f32( - bounds.start, + filter_start, unsafe_source_ptr_0, weight0, store, @@ -194,8 +197,9 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32( let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned()); let weight2 = _mm_set1_ps(ptr.add(2).read_unaligned()); let weight3 = _mm_set1_ps(ptr.add(3).read_unaligned()); + let filter_start = jx + bounds.start; store_0 = convolve_horizontal_parts_4_rgb_f32( - bounds.start, + filter_start, unsafe_source_ptr_0, weight0, weight1, @@ -204,7 +208,7 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32( store_0, ); store_1 = convolve_horizontal_parts_4_rgb_f32( - bounds.start, + filter_start, unsafe_source_ptr_0.add(src_stride), weight0, weight1, @@ -213,7 +217,7 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32( store_1, ); store_2 = convolve_horizontal_parts_4_rgb_f32( - bounds.start, + filter_start, unsafe_source_ptr_0.add(src_stride * 2), weight0, weight1, @@ -222,7 +226,7 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32( store_2, ); store_3 = convolve_horizontal_parts_4_rgb_f32( - bounds.start, + filter_start, unsafe_source_ptr_0.add(src_stride * 3), weight0, weight1, @@ -237,29 +241,30 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = _mm_set1_ps(ptr.read_unaligned()); let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned()); + let filter_start = jx + bounds.start; store_0 = convolve_horizontal_parts_2_rgb_f32( - bounds.start, + filter_start, unsafe_source_ptr_0, weight0, weight1, store_0, ); store_1 = convolve_horizontal_parts_2_rgb_f32( - bounds.start, + filter_start, unsafe_source_ptr_0.add(src_stride), weight0, weight1, store_1, ); store_2 = convolve_horizontal_parts_2_rgb_f32( - bounds.start, + filter_start, unsafe_source_ptr_0.add(src_stride * 2), weight0, weight1, store_2, ); store_3 = convolve_horizontal_parts_2_rgb_f32( - bounds.start, + filter_start, unsafe_source_ptr_0.add(src_stride * 3), weight0, weight1, @@ -271,26 +276,27 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32( while jx < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); let weight0 = _mm_set1_ps(ptr.read_unaligned()); + let filter_start = jx + bounds.start; store_0 = convolve_horizontal_parts_one_rgb_f32( - bounds.start, + filter_start, unsafe_source_ptr_0, weight0, store_0, ); store_1 = convolve_horizontal_parts_one_rgb_f32( - bounds.start, + filter_start, unsafe_source_ptr_0.add(src_stride), weight0, store_1, ); store_2 = convolve_horizontal_parts_one_rgb_f32( - bounds.start, + filter_start, unsafe_source_ptr_0.add(src_stride * 2), weight0, store_2, ); store_3 = convolve_horizontal_parts_one_rgb_f32( - bounds.start, + filter_start, unsafe_source_ptr_0.add(src_stride * 3), weight0, store_3, diff --git a/src/sse/rgba_f32.rs b/src/sse/rgba_f32.rs index eabdeb5..bab50d1 100644 --- a/src/sse/rgba_f32.rs +++ b/src/sse/rgba_f32.rs @@ -147,15 +147,16 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32( let mut store_1 = zeros; let mut store_2 = zeros; let mut store_3 = zeros; - while jx + 4 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); let weight0 = _mm_set1_ps(ptr.read_unaligned()); let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned()); let weight2 = _mm_set1_ps(ptr.add(2).read_unaligned()); let weight3 = _mm_set1_ps(ptr.add(3).read_unaligned()); + let filter_start = jx + bounds.start; + store_0 = convolve_horizontal_parts_4_rgba_f32( - bounds.start, + filter_start, unsafe_source_ptr_0, weight0, weight1, @@ -164,7 +165,7 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32( store_0, ); store_1 = convolve_horizontal_parts_4_rgba_f32( - bounds.start, + filter_start, unsafe_source_ptr_0.add(src_stride), weight0, weight1, @@ -173,7 +174,7 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32( store_1, ); store_2 = convolve_horizontal_parts_4_rgba_f32( - bounds.start, + filter_start, unsafe_source_ptr_0.add(src_stride * 2), weight0, weight1, @@ -182,7 +183,7 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32( store_2, ); store_3 = convolve_horizontal_parts_4_rgba_f32( - bounds.start, + filter_start, unsafe_source_ptr_0.add(src_stride * 3), weight0, weight1, @@ -197,29 +198,30 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = _mm_set1_ps(ptr.read_unaligned()); let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned()); + let filter_start = jx + bounds.start; store_0 = convolve_horizontal_parts_2_rgba_f32( - bounds.start, + filter_start, unsafe_source_ptr_0, weight0, weight1, store_0, ); store_1 = convolve_horizontal_parts_2_rgba_f32( - bounds.start, + filter_start, unsafe_source_ptr_0.add(src_stride), weight0, weight1, store_1, ); store_2 = convolve_horizontal_parts_2_rgba_f32( - bounds.start, + filter_start, unsafe_source_ptr_0.add(src_stride * 2), weight0, weight1, store_2, ); store_3 = convolve_horizontal_parts_2_rgba_f32( - bounds.start, + filter_start, unsafe_source_ptr_0.add(src_stride * 3), weight0, weight1, @@ -230,27 +232,28 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32( while jx < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); + let filter_start = jx + bounds.start; let weight0 = _mm_set1_ps(ptr.read_unaligned()); store_0 = convolve_horizontal_parts_one_rgba_f32( - bounds.start, + filter_start, unsafe_source_ptr_0, weight0, store_0, ); store_1 = convolve_horizontal_parts_one_rgba_f32( - bounds.start, + filter_start, unsafe_source_ptr_0.add(src_stride), weight0, store_1, ); store_2 = convolve_horizontal_parts_one_rgba_f32( - bounds.start, + filter_start, unsafe_source_ptr_0.add(src_stride * 2), weight0, store_2, ); store_3 = convolve_horizontal_parts_one_rgba_f32( - bounds.start, + filter_start, unsafe_source_ptr_0.add(src_stride * 3), weight0, store_3,