From 898c096dfb37ae9dfeeb4b1b61b18efeab968865 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Fri, 20 Dec 2024 15:25:07 +0000 Subject: [PATCH 1/9] x86 improvements --- src/avx2/rgba_f32.rs | 167 +++++++++++++--------------- src/avx2/vertical_f32.rs | 130 +++++++++------------- src/color_group.rs | 68 +----------- src/convolve_naive_f32.rs | 223 ++++---------------------------------- src/dispatch_group_f32.rs | 216 ++++++++++++++++++------------------ src/lib.rs | 1 - src/neon/plane_f32.rs | 82 ++++++-------- src/neon/rgb_f32.rs | 53 ++++----- src/neon/rgba_f32.rs | 67 +++++------- src/neon/vertical_f32.rs | 108 +++++++----------- src/plane_f32.rs | 12 +- src/rgb_f32.rs | 63 +++-------- src/rgba_f32.rs | 22 ++-- src/sse/plane_f32.rs | 181 +++++++++++++------------------ src/sse/rgb_f32.rs | 137 +++++++++++------------ src/sse/rgba_f32.rs | 138 +++++++++++------------ src/sse/vertical_f32.rs | 138 ++++++++++------------- src/unsafe_slice.rs | 69 ------------ 18 files changed, 674 insertions(+), 1201 deletions(-) delete mode 100644 src/unsafe_slice.rs diff --git a/src/avx2/rgba_f32.rs b/src/avx2/rgba_f32.rs index f82923a..0aba5bd 100644 --- a/src/avx2/rgba_f32.rs +++ b/src/avx2/rgba_f32.rs @@ -39,13 +39,13 @@ use crate::sse::{load_4_weights_group_2_avx, load_8_weights_group_4_avx, shuffle #[inline(always)] unsafe fn convolve_horizontal_parts_one_rgba_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m256, store_0: __m256, ) -> __m256 { const COMPONENTS: usize = 4; - let src_ptr = src.add(start_x * COMPONENTS); - let rgb_pixel = _mm_loadu_ps(src_ptr); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..); + let rgb_pixel = _mm_loadu_ps(src_ptr.as_ptr()); _mm256_fma_ps::( store_0, avx_combine_ps(rgb_pixel, _mm_setzero_ps()), @@ -56,13 +56,13 @@ unsafe fn convolve_horizontal_parts_one_rgba_f32( #[inline(always)] unsafe fn convolve_horizontal_parts_4_rgba_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m256, weight1: __m256, store_0: __m256, ) -> __m256 { const COMPONENTS: usize = 4; - let src_ptr = src.add(start_x * COMPONENTS); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); let rgb_pixel_0 = _mm256_loadu_ps(src_ptr); let rgb_pixel_1 = _mm256_loadu_ps(src_ptr.add(8)); @@ -75,7 +75,7 @@ unsafe fn convolve_horizontal_parts_4_rgba_f32( #[inline(always)] unsafe fn convolve_horizontal_parts_8_rgba_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m256, weight1: __m256, weight2: __m256, @@ -83,7 +83,7 @@ unsafe fn convolve_horizontal_parts_8_rgba_f32( store_0: __m256, ) -> __m256 { const COMPONENTS: usize = 4; - let src_ptr = src.add(start_x * COMPONENTS); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); let rgb_pixel_0 = _mm256_loadu_ps(src_ptr); let rgb_pixel_1 = _mm256_loadu_ps(src_ptr.add(8)); @@ -100,14 +100,14 @@ unsafe fn convolve_horizontal_parts_8_rgba_f32( #[inline(always)] unsafe fn convolve_horizontal_parts_2_rgba_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m256, store_0: __m256, ) -> __m256 { const COMPONENTS: usize = 4; - let src_ptr = src.add(start_x * COMPONENTS); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..); - let rgb_pixel = _mm256_loadu_ps(src_ptr); + let rgb_pixel = _mm256_loadu_ps(src_ptr.as_ptr()); _mm256_fma_ps::(store_0, rgb_pixel, weight0) } @@ -116,9 +116,9 @@ pub(crate) fn convolve_horizontal_rgba_avx_rows_4_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: 
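// Reviewer note — illustrative sketch only, not part of the patch. The change repeated across
// these AVX2 helpers is replacing raw `*const f32` parameters with `&[f32]`, taking the pixel
// offset with `get_unchecked(..)` and only converting to a pointer at the SIMD load itself.
// The helper name below is hypothetical; bounds remain the caller's responsibility exactly as
// in the patched code.
use std::arch::x86_64::*;

#[inline(always)]
unsafe fn load_rgba_pixel(src: &[f32], start_x: usize) -> __m128 {
    const COMPONENTS: usize = 4; // RGBA
    // Offset inside the slice, then take a pointer only for the unaligned load.
    let chunk = src.get_unchecked(start_x * COMPONENTS..);
    _mm_loadu_ps(chunk.as_ptr())
}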
usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { unsafe { @@ -127,9 +127,9 @@ pub(crate) fn convolve_horizontal_rgba_avx_rows_4_f32( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } else { @@ -137,9 +137,9 @@ pub(crate) fn convolve_horizontal_rgba_avx_rows_4_f32( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -151,18 +151,18 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { convolve_horizontal_rgba_avx_rows_4_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -172,18 +172,18 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { convolve_horizontal_rgba_avx_rows_4_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -193,9 +193,9 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { const CHANNELS: usize = 4; @@ -218,7 +218,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl( store_0 = convolve_horizontal_parts_8_rgba_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, weight2, @@ -227,7 +227,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl( ); store_1 = convolve_horizontal_parts_8_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight0, weight1, weight2, @@ -236,7 +236,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl( ); store_2 = convolve_horizontal_parts_8_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight0, weight1, weight2, @@ -245,7 +245,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl( ); store_3 = convolve_horizontal_parts_8_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight0, weight1, weight2, @@ -262,28 +262,28 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl( store_0 = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, store_0, ); store_1 = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight0, weight1, store_1, ); store_2 = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight0, weight1, store_2, ); store_3 = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight0, weight1, store_3, @@ -297,27 +297,23 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl( let weight1 = 
_mm_set1_ps(ptr.add(1).read_unaligned()); let weight = avx_combine_ps(weight0, weight1); let filter_start = jx + bounds.start; - store_0 = convolve_horizontal_parts_2_rgba_f32::( - filter_start, - unsafe_source_ptr_0, - weight, - store_0, - ); + store_0 = + convolve_horizontal_parts_2_rgba_f32::(filter_start, src, weight, store_0); store_1 = convolve_horizontal_parts_2_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight, store_1, ); store_2 = convolve_horizontal_parts_2_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight, store_2, ); store_3 = convolve_horizontal_parts_2_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight, store_3, ); @@ -328,27 +324,23 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl( let ptr = weights_ptr.add(jx + filter_offset); let filter_start = jx + bounds.start; let weight0 = _mm256_set1_ps(ptr.read_unaligned()); - store_0 = convolve_horizontal_parts_one_rgba_f32::( - filter_start, - unsafe_source_ptr_0, - weight0, - store_0, - ); + store_0 = + convolve_horizontal_parts_one_rgba_f32::(filter_start, src, weight0, store_0); store_1 = convolve_horizontal_parts_one_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight0, store_1, ); store_2 = convolve_horizontal_parts_one_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight0, store_2, ); store_3 = convolve_horizontal_parts_one_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight0, store_3, ); @@ -356,36 +348,36 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f32_impl( } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..); _mm_storeu_ps( - dest_ptr, + dest_ptr.as_mut_ptr(), _mm_add_ps( _mm256_castps256_ps128(store_0), _mm256_extractf128_ps::<1>(store_0), ), ); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride..); _mm_storeu_ps( - dest_ptr, + dest_ptr.as_mut_ptr(), _mm_add_ps( _mm256_castps256_ps128(store_1), _mm256_extractf128_ps::<1>(store_1), ), ); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..); _mm_storeu_ps( - dest_ptr, + dest_ptr.as_mut_ptr(), _mm_add_ps( _mm256_castps256_ps128(store_2), _mm256_extractf128_ps::<1>(store_2), ), ); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..); _mm_storeu_ps( - dest_ptr, + dest_ptr.as_mut_ptr(), _mm_add_ps( _mm256_castps256_ps128(store_3), _mm256_extractf128_ps::<1>(store_3), @@ -400,8 +392,8 @@ pub(crate) fn convolve_horizontal_rgba_avx_row_one_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { unsafe { if FMA { @@ -409,16 +401,16 @@ pub(crate) fn convolve_horizontal_rgba_avx_row_one_f32( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } else { convolve_horizontal_rgba_avx_row_one_f32_regular( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } } @@ -429,15 +421,15 @@ unsafe fn 
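// Reviewer note — sketch, not part of the patch. The row-one / rows-4 pairs in this file follow
// one dispatch shape: a const-generic entry point selects an FMA or non-FMA monomorphization,
// and both thin wrappers forward to a shared #[inline(always)] implementation. All names below
// are illustrative, and the target_feature attributes are an assumption about how such wrappers
// are typically declared.
use std::arch::x86_64::*;

#[inline(always)]
unsafe fn mla8_impl<const FMA: bool>(dst: &mut [f32], src: &[f32], weight: f32) {
    let acc = _mm256_loadu_ps(dst.as_ptr());
    let row = _mm256_loadu_ps(src.as_ptr());
    let w = _mm256_set1_ps(weight);
    let r = if FMA {
        _mm256_fmadd_ps(row, w, acc) // fused multiply-add
    } else {
        _mm256_add_ps(acc, _mm256_mul_ps(row, w)) // separate mul + add
    };
    _mm256_storeu_ps(dst.as_mut_ptr(), r);
}

#[target_feature(enable = "avx2,fma")]
unsafe fn mla8_fma(dst: &mut [f32], src: &[f32], weight: f32) {
    mla8_impl::<true>(dst, src, weight)
}

#[target_feature(enable = "avx2")]
unsafe fn mla8_regular(dst: &mut [f32], src: &[f32], weight: f32) {
    mla8_impl::<false>(dst, src, weight)
}

pub fn mla8<const FMA: bool>(dst: &mut [f32], src: &[f32], weight: f32) {
    assert!(dst.len() >= 8 && src.len() >= 8);
    unsafe {
        if FMA {
            mla8_fma(dst, src, weight)
        } else {
            mla8_regular(dst, src, weight)
        }
    }
}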
convolve_horizontal_rgba_avx_row_one_f32_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { convolve_horizontal_rgba_avx_row_one_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -446,15 +438,15 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f32_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { convolve_horizontal_rgba_avx_row_one_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -463,8 +455,8 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f32_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { const CHANNELS: usize = 4; let mut filter_offset = 0usize; @@ -482,7 +474,7 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f32_impl( store = convolve_horizontal_parts_8_rgba_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, weight2, @@ -498,7 +490,7 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f32_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, store, @@ -512,12 +504,7 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f32_impl( let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned()); let weight = avx_combine_ps(weight0, weight1); let filter_start = jx + bounds.start; - store = convolve_horizontal_parts_2_rgba_f32::( - filter_start, - unsafe_source_ptr_0, - weight, - store, - ); + store = convolve_horizontal_parts_2_rgba_f32::(filter_start, src, weight, store); jx += 2 } @@ -525,19 +512,15 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f32_impl( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = _mm256_set1_ps(ptr.read_unaligned()); let filter_start = jx + bounds.start; - store = convolve_horizontal_parts_one_rgba_f32::( - filter_start, - unsafe_source_ptr_0, - weight0, - store, - ); + store = + convolve_horizontal_parts_one_rgba_f32::(filter_start, src, weight0, store); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..); _mm_storeu_ps( - dest_ptr, + dest_ptr.as_mut_ptr(), _mm_add_ps( _mm256_castps256_ps128(store), _mm256_extractf128_ps::<1>(store), diff --git a/src/avx2/vertical_f32.rs b/src/avx2/vertical_f32.rs index ddd673a..6e88617 100644 --- a/src/avx2/vertical_f32.rs +++ b/src/avx2/vertical_f32.rs @@ -38,9 +38,9 @@ use std::arch::x86_64::*; pub(crate) unsafe fn convolve_vertical_part_avx_32_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -55,13 +55,11 @@ pub(crate) unsafe fn convolve_vertical_part_avx_32_f32( let py = start_y + j; let weight = *filter.get_unchecked(j); let v_weight = _mm256_set1_ps(weight); - let src_ptr = src.add(src_stride * py); - - let s_ptr = src_ptr.add(px); - let item_row_0 = _mm256_loadu_ps(s_ptr); - let item_row_1 = _mm256_loadu_ps(s_ptr.add(8)); - let item_row_2 = _mm256_loadu_ps(s_ptr.add(16)); - let item_row_3 = _mm256_loadu_ps(s_ptr.add(24)); + let src_ptr = src.get_unchecked(src_stride 
* py + px..).as_ptr(); + let item_row_0 = _mm256_loadu_ps(src_ptr); + let item_row_1 = _mm256_loadu_ps(src_ptr.add(8)); + let item_row_2 = _mm256_loadu_ps(src_ptr.add(16)); + let item_row_3 = _mm256_loadu_ps(src_ptr.add(24)); store_0 = _mm256_fma_ps::(store_0, item_row_0, v_weight); store_1 = _mm256_fma_ps::(store_1, item_row_1, v_weight); @@ -69,7 +67,7 @@ pub(crate) unsafe fn convolve_vertical_part_avx_32_f32( store_3 = _mm256_fma_ps::(store_3, item_row_3, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); _mm256_storeu_ps(dst_ptr, store_0); _mm256_storeu_ps(dst_ptr.add(8), store_1); _mm256_storeu_ps(dst_ptr.add(16), store_2); @@ -80,9 +78,9 @@ pub(crate) unsafe fn convolve_vertical_part_avx_32_f32( pub(crate) unsafe fn convolve_vertical_part_avx_16_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -95,17 +93,16 @@ pub(crate) unsafe fn convolve_vertical_part_avx_16_f32( let py = start_y + j; let weight = *filter.get_unchecked(j); let v_weight = _mm256_set1_ps(weight); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py + px..).as_ptr(); - let s_ptr = src_ptr.add(px); - let item_row_0 = _mm256_loadu_ps(s_ptr); - let item_row_1 = _mm256_loadu_ps(s_ptr.add(8)); + let item_row_0 = _mm256_loadu_ps(src_ptr); + let item_row_1 = _mm256_loadu_ps(src_ptr.add(8)); store_0 = _mm256_fma_ps::(store_0, item_row_0, v_weight); store_1 = _mm256_fma_ps::(store_1, item_row_1, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); _mm256_storeu_ps(dst_ptr, store_0); _mm256_storeu_ps(dst_ptr.add(8), store_1); } @@ -114,9 +111,9 @@ pub(crate) unsafe fn convolve_vertical_part_avx_16_f32( pub(crate) unsafe fn convolve_vertical_part_avx_8_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -128,15 +125,13 @@ pub(crate) unsafe fn convolve_vertical_part_avx_8_f32( let py = start_y + j; let weight = *filter.get_unchecked(j); let v_weight = _mm256_set1_ps(weight); - let src_ptr = src.add(src_stride * py); - - let s_ptr = src_ptr.add(px); - let item_row_0 = _mm256_loadu_ps(s_ptr); + let src_ptr = src.get_unchecked(src_stride * py + px..).as_ptr(); + let item_row_0 = _mm256_loadu_ps(src_ptr); store_0 = _mm256_fma_ps::(store_0, item_row_0, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); _mm256_storeu_ps(dst_ptr, store_0); } @@ -144,9 +139,9 @@ pub(crate) unsafe fn convolve_vertical_part_avx_8_f32( pub(crate) unsafe fn convolve_vertical_part_avx_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -158,15 +153,14 @@ pub(crate) unsafe fn convolve_vertical_part_avx_f32( let py = start_y + j; let weight = *filter.get_unchecked(j); let v_weight = _mm256_set1_ps(weight); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py + px..).as_ptr(); - let s_ptr = src_ptr.add(px); - let item_row_0 = _mm256_set1_ps(s_ptr.read_unaligned()); + let item_row_0 = _mm256_set1_ps(src_ptr.read_unaligned()); store_0 = _mm256_fma_ps::(store_0, item_row_0, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); (dst_ptr as *mut 
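// Reviewer note — sketch, not part of the patch. Every vertical kernel above computes the same
// per-column sum, dst[x] = sum_j filter[j] * src[(start_y + j) * src_stride + x]; the 32/16/8/1
// variants only differ in how many columns they advance per iteration. A scalar reference for
// that contract:
fn convolve_vertical_scalar(
    dst: &mut [f32],
    src: &[f32],
    src_stride: usize,
    start_y: usize,
    filter: &[f32],
) {
    for (x, out) in dst.iter_mut().enumerate() {
        let mut sum = 0f32;
        for (j, &weight) in filter.iter().enumerate() {
            sum += weight * src[(start_y + j) * src_stride + x];
        }
        *out = sum;
    }
}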
i32).write_unaligned(_mm256_extract_epi32::<0>(_mm256_castps_si256(store_0))); } @@ -174,29 +168,19 @@ pub(crate) unsafe fn convolve_vertical_part_avx_f32( pub(crate) fn convolve_vertical_avx_row_f32( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight_ptr: &[f32], ) { unsafe { if FMA { convolve_vertical_avx_row_f32_fma::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } else { convolve_vertical_avx_row_f32_regular::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } } @@ -206,18 +190,13 @@ pub(crate) fn convolve_vertical_avx_row_f32( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight_ptr: &[f32], ) { convolve_vertical_avx_row_f32_impl::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } @@ -225,18 +204,13 @@ unsafe fn convolve_vertical_avx_row_f32_regular( unsafe fn convolve_vertical_avx_row_f32_fma( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight_ptr: &[f32], ) { convolve_vertical_avx_row_f32_impl::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } @@ -244,8 +218,8 @@ unsafe fn convolve_vertical_avx_row_f32_fma( unsafe fn convolve_vertical_avx_row_f32_impl( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight_ptr: &[f32], ) { @@ -256,9 +230,9 @@ unsafe fn convolve_vertical_avx_row_f32_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -267,17 +241,15 @@ unsafe fn convolve_vertical_avx_row_f32_impl( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - ); - } + convolve_vertical_part_avx_16_f32::( + bounds.start, + cx, + src, + src_stride, + dst, + weight_ptr, + bounds, + ); cx += 16; } @@ -286,9 +258,9 @@ unsafe fn convolve_vertical_avx_row_f32_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -300,9 +272,9 @@ unsafe fn convolve_vertical_avx_row_f32_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); diff --git a/src/color_group.rs b/src/color_group.rs index 942deb5..3c8982b 100644 --- a/src/color_group.rs +++ b/src/color_group.rs @@ -28,7 +28,7 @@ */ use crate::mlaf::mlaf; use crate::saturate_narrow::SaturateNarrow; -use num_traits::{AsPrimitive, FromPrimitive, MulAdd}; +use num_traits::{FromPrimitive, MulAdd}; use std::ops::{Add, AddAssign, Mul, Shr, ShrAssign, Sub, SubAssign}; #[repr(C)] @@ -70,72 +70,6 @@ where } } -impl ColorGroup -where - J: Copy + Default + 'static, -{ - #[inline(always)] - pub(crate) fn from_ptr(store: *const T, offset: usize) -> ColorGroup - where - T: AsPrimitive, - { - unsafe { - let l_ptr = 
store.add(offset); - if COMPS == 1 { - ColorGroup { - r: l_ptr.read_unaligned().as_(), - g: J::default(), - b: J::default(), - a: J::default(), - } - } else if COMPS == 2 { - ColorGroup { - r: l_ptr.read_unaligned().as_(), - g: l_ptr.add(1).read_unaligned().as_(), - b: J::default(), - a: J::default(), - } - } else if COMPS == 3 { - ColorGroup { - r: l_ptr.read_unaligned().as_(), - g: l_ptr.add(1).read_unaligned().as_(), - b: l_ptr.add(2).read_unaligned().as_(), - a: J::default(), - } - } else if COMPS == 4 { - ColorGroup { - r: l_ptr.read_unaligned().as_(), - g: l_ptr.add(1).read_unaligned().as_(), - b: l_ptr.add(2).read_unaligned().as_(), - a: l_ptr.add(3).read_unaligned().as_(), - } - } else { - unimplemented!("Not implemented.") - } - } - } - - #[inline(always)] - pub(crate) fn as_ptr(self, ptr: *mut V, offset: usize) - where - J: Copy + AsPrimitive, - { - unsafe { - let s_ptr = ptr.add(offset); - s_ptr.write_unaligned(self.r.as_()); - if COMPS > 1 { - s_ptr.add(1).write_unaligned(self.g.as_()); - } - if COMPS > 2 { - s_ptr.add(2).write_unaligned(self.b.as_()); - } - if COMPS == 4 { - s_ptr.add(3).write_unaligned(self.a.as_()); - } - } - } -} - impl Mul for ColorGroup where J: Copy + Mul + Default + 'static, diff --git a/src/convolve_naive_f32.rs b/src/convolve_naive_f32.rs index 4952b8e..9bfe8f1 100644 --- a/src/convolve_naive_f32.rs +++ b/src/convolve_naive_f32.rs @@ -26,214 +26,37 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use crate::color_group::ColorGroup; -use crate::filter_weights::{FilterBounds, FilterWeights}; -use num_traits::{AsPrimitive, MulAdd}; -use std::ops::{Add, Mul}; - -pub(crate) unsafe fn convolve_vertical_part_f32< - T: Copy + 'static + AsPrimitive, - I: Copy - + 'static - + AsPrimitive - + Default - + MulAdd - + Mul - + Add, - const CHANNELS: usize, ->( - start_y: usize, - start_x: usize, - src: *const T, - src_stride: usize, - dst: *mut T, - filter: &[f32], - bounds: &FilterBounds, -) where - f32: AsPrimitive, -{ - let mut sums0 = ColorGroup::::dup(I::default()); - - let v_start_px = start_x * CHANNELS; - - for j in 0..bounds.size { - let py = start_y + j; - let weight: I = filter.get_unchecked(j).as_(); - let src_ptr = src.add(src_stride * py); - - let new_px0 = ColorGroup::::from_ptr(src_ptr, v_start_px); - - sums0 = sums0.mul_add(new_px0, weight); - } - - sums0.as_ptr(dst, v_start_px); -} - -pub(crate) unsafe fn convolve_vertical_part_4_f32< - T: Copy + 'static + AsPrimitive, - I: Copy - + 'static - + AsPrimitive - + Default - + MulAdd - + Mul - + Add, - const CHANNELS: usize, ->( - start_y: usize, - start_x: usize, - src: *const T, - src_stride: usize, - dst: *mut T, - filter: &[f32], - bounds: &FilterBounds, -) where - f32: AsPrimitive, -{ - let mut sums0 = ColorGroup::::dup(I::default()); - let mut sums1 = ColorGroup::::dup(I::default()); - let mut sums2 = ColorGroup::::dup(I::default()); - let mut sums3 = ColorGroup::::dup(I::default()); - - let v_start_px = start_x * CHANNELS; - - for j in 0..bounds.size { - let py = start_y + j; - let weight: I = filter.get_unchecked(j).as_(); - let src_ptr = src.add(src_stride * py); - - let new_px0 = ColorGroup::::from_ptr(src_ptr, v_start_px); - let new_px1 = ColorGroup::::from_ptr(src_ptr, v_start_px + CHANNELS); - let new_px2 = ColorGroup::::from_ptr(src_ptr, v_start_px + CHANNELS * 2); - let new_px3 = ColorGroup::::from_ptr(src_ptr, v_start_px + CHANNELS * 3); - - sums0 = sums0.mul_add(new_px0, weight); 
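// Reviewer note — sketch, not part of the patch. ColorGroup::from_ptr/as_ptr and the
// pointer-based vertical parts in this hunk are deleted; a slice-based equivalent of the pixel
// load/store they provided would look roughly like the following. The type and method names are
// illustrative and are not the crate's actual API.
#[derive(Copy, Clone, Default)]
struct Rgba {
    r: f32,
    g: f32,
    b: f32,
    a: f32,
}

impl Rgba {
    #[inline(always)]
    fn from_slice(store: &[f32], offset: usize) -> Self {
        // One interleaved RGBA pixel; the caller guarantees offset + 4 <= store.len().
        Rgba {
            r: store[offset],
            g: store[offset + 1],
            b: store[offset + 2],
            a: store[offset + 3],
        }
    }

    #[inline(always)]
    fn write_to(self, store: &mut [f32], offset: usize) {
        store[offset] = self.r;
        store[offset + 1] = self.g;
        store[offset + 2] = self.b;
        store[offset + 3] = self.a;
    }
}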
- sums1 = sums1.mul_add(new_px1, weight); - sums2 = sums2.mul_add(new_px2, weight); - sums3 = sums3.mul_add(new_px3, weight); - } - - sums0.as_ptr(dst, v_start_px); - sums0.as_ptr(dst, v_start_px + CHANNELS); - sums0.as_ptr(dst, v_start_px + CHANNELS * 2); - sums0.as_ptr(dst, v_start_px + CHANNELS * 3); -} +use crate::filter_weights::FilterWeights; +use crate::floating_point_horizontal::{ + convolve_row_handler_floating_point, convolve_row_handler_floating_point_4, +}; #[inline] -pub(crate) fn convolve_horizontal_rgb_native_row< - T: Copy + 'static + AsPrimitive, - I: Copy - + 'static - + Default - + MulAdd - + AsPrimitive - + Mul - + Add, - const CHANNELS: usize, ->( - dst_width: usize, +pub(crate) fn convolve_horizontal_rgb_native_row( + _: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const T, - unsafe_destination_ptr_0: *mut T, -) where - f32: AsPrimitive + AsPrimitive, -{ - unsafe { - let weights_ptr = &filter_weights.weights; - let mut filter_offset = 0usize; - - for x in 0..dst_width { - let mut sums = ColorGroup::::dup(0f32.as_()); - - let bounds = filter_weights.bounds.get_unchecked(x); - let start_x = bounds.start; - for j in 0..bounds.size { - let px = (start_x + j) * CHANNELS; - let weight = *weights_ptr.get_unchecked(j + filter_offset); - - let new_px = ColorGroup::::from_ptr(unsafe_source_ptr_0, px); - - sums = sums.mul_add(new_px, weight.as_()); - } - - let px = x * CHANNELS; - - sums.as_ptr(unsafe_destination_ptr_0, px); - - filter_offset += filter_weights.aligned_size; - } - } + src: &[f32], + dst: &mut [f32], +) { + convolve_row_handler_floating_point::(src, dst, filter_weights, 8) } -pub(crate) fn convolve_horizontal_rgba_4_row_f32< - T: Copy + 'static + AsPrimitive, - I: Copy - + 'static - + Default - + MulAdd - + AsPrimitive - + Mul - + Add, - const CHANNELS: usize, ->( - dst_width: usize, +pub(crate) fn convolve_horizontal_rgba_4_row_f32( + _: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const T, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut T, + dst: &mut [f32], dst_stride: usize, -) where - f32: AsPrimitive + AsPrimitive, -{ - unsafe { - let mut filter_offset = 0usize; - let weights = &filter_weights.weights; - - let src_row0 = unsafe_source_ptr_0; - let src_row1 = unsafe_source_ptr_0.add(src_stride); - let src_row2 = unsafe_source_ptr_0.add(src_stride * 2); - let src_row3 = unsafe_source_ptr_0.add(src_stride * 3); - - let dst_row0 = unsafe_destination_ptr_0; - let dst_row1 = unsafe_destination_ptr_0.add(dst_stride); - let dst_row2 = unsafe_destination_ptr_0.add(dst_stride * 2); - let dst_row3 = unsafe_destination_ptr_0.add(dst_stride * 3); - - for x in 0..dst_width { - let mut sums0 = ColorGroup::::dup(0f32.as_()); - let mut sums1 = ColorGroup::::dup(0f32.as_()); - let mut sums2 = ColorGroup::::dup(0f32.as_()); - let mut sums3 = ColorGroup::::dup(0f32.as_()); - - let bounds = filter_weights.bounds.get_unchecked(x); - let start_x = bounds.start; - for j in 0..bounds.size { - let px = (start_x + j) * CHANNELS; - let weight = *weights.get_unchecked(j + filter_offset); - - let new_px0 = ColorGroup::::from_ptr(src_row0, px); - sums0 = sums0.mul_add(new_px0, weight.as_()); - - let new_px1 = ColorGroup::::from_ptr(src_row1, px); - sums1 = sums1.mul_add(new_px1, weight.as_()); - - let new_px2 = ColorGroup::::from_ptr(src_row2, px); - sums2 = sums2.mul_add(new_px2, weight.as_()); - - let new_px3 = ColorGroup::::from_ptr(src_row3, px); - sums3 = sums3.mul_add(new_px3, weight.as_()); - } - - let px = 
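// Reviewer note — sketch, not part of the patch. The generic pointer-based rows in this hunk
// are replaced by the shared handlers in floating_point_horizontal; per output pixel they
// perform the sum sketched here in scalar form. FilterWeights is flattened into plain slices
// for the example, so the parameter names are illustrative.
fn convolve_row_scalar<const CHANNELS: usize>(
    src: &[f32],
    dst: &mut [f32],
    bounds_start: &[usize],
    bounds_size: &[usize],
    weights: &[f32],
    aligned_size: usize,
) {
    for (x, out_px) in dst.chunks_exact_mut(CHANNELS).enumerate() {
        let mut sums = [0f32; CHANNELS];
        let start_x = bounds_start[x];
        for j in 0..bounds_size[x] {
            let weight = weights[x * aligned_size + j];
            let px = (start_x + j) * CHANNELS;
            for (c, sum) in sums.iter_mut().enumerate() {
                *sum += weight * src[px + c];
            }
        }
        out_px.copy_from_slice(&sums);
    }
}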
x * CHANNELS; - - sums0.as_ptr(dst_row0, px); - sums1.as_ptr(dst_row1, px); - sums2.as_ptr(dst_row2, px); - sums3.as_ptr(dst_row3, px); - - filter_offset += filter_weights.aligned_size; - } - } +) { + convolve_row_handler_floating_point_4::( + src, + src_stride, + dst, + dst_stride, + filter_weights, + 8, + ) } diff --git a/src/dispatch_group_f32.rs b/src/dispatch_group_f32.rs index ce639d4..f4b87e9 100644 --- a/src/dispatch_group_f32.rs +++ b/src/dispatch_group_f32.rs @@ -28,70 +28,51 @@ */ use crate::filter_weights::{FilterBounds, FilterWeights}; -use crate::unsafe_slice::UnsafeSlice; use crate::ImageStore; +use rayon::iter::{IndexedParallelIterator, ParallelIterator}; +use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; -use std::sync::Arc; pub(crate) fn convolve_vertical_dispatch_f32( image_store: &ImageStore, filter_weights: FilterWeights, destination: &mut ImageStore, pool: &Option, - dispatcher: fn(usize, &FilterBounds, *const f32, *mut f32, usize, &[f32]), + dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]), ) { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let src_stride = image_store.width * image_store.channels; - - let mut filter_offset = 0usize; - let dst_stride = destination.width * image_store.channels; + let dst_width = destination.width; if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let bounds = unsafe { weights.bounds.get_unchecked(y) }; - let weight_ptr = - unsafe { weights.weights.get_unchecked((weights.aligned_size * y)..) }; - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - dispatcher( - dst_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); + pool.install(|| { + destination + .buffer + .borrow_mut() + .par_chunks_exact_mut(dst_stride) + .enumerate() + .for_each(|(y, row)| { + let bounds = filter_weights.bounds[y]; + let filter_offset = y * filter_weights.aligned_size; + let weights = &filter_weights.weights[filter_offset..]; + let source_buffer = image_store.buffer.borrow(); + dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); }); - } }); } else { - for y in 0..destination.height { - let bounds = unsafe { filter_weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { filter_weights.weights.get_unchecked(filter_offset..) 
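// Reviewer note — sketch, not part of the patch. The UnsafeSlice + per-row scope spawning is
// replaced by splitting the destination into disjoint row chunks that rayon hands out safely;
// in the patch this runs under pool.install(..) so it uses the scaler's own thread pool.
// A minimal standalone version of that shape, with the dispatcher signature simplified:
use rayon::prelude::*;

fn convolve_rows_parallel(
    dst: &mut [f32],
    dst_stride: usize,
    weights: &[f32],
    aligned_size: usize,
    dispatcher: fn(&mut [f32], &[f32]),
) {
    dst.par_chunks_exact_mut(dst_stride)
        .enumerate()
        .for_each(|(y, row)| {
            // Each closure owns a disjoint &mut row, so no shared-mutability wrapper is needed.
            let row_weights = &weights[y * aligned_size..];
            dispatcher(row, row_weights);
        });
}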
}; - - dispatcher( - dst_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - - filter_offset += filter_weights.aligned_size; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } + destination + .buffer + .borrow_mut() + .chunks_exact_mut(dst_stride) + .enumerate() + .for_each(|(y, row)| { + let bounds = filter_weights.bounds[y]; + let filter_offset = y * filter_weights.aligned_size; + let weights = &filter_weights.weights[filter_offset..]; + let source_buffer = image_store.buffer.borrow(); + dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); + }); } } @@ -102,95 +83,120 @@ pub(crate) fn convolve_horizontal_dispatch_f32( destination: &mut ImageStore, pool: &Option, dispatcher_4_rows: Option< - fn(usize, usize, &FilterWeights, *const f32, usize, *mut f32, usize), + fn(usize, usize, &FilterWeights, &[f32], usize, &mut [f32], usize), >, - dispatcher_row: fn(usize, usize, &FilterWeights, *const f32, *mut f32), + dispatcher_row: fn(usize, usize, &FilterWeights, &[f32], &mut [f32]), ) { - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let src_stride = image_store.width * image_store.channels; let dst_stride = destination.width * image_store.channels; let dst_width = destination.width; let src_width = image_store.width; if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - let mut yy = 0usize; + pool.install(|| { + let mut processed_4 = false; + if let Some(dispatcher) = dispatcher_4_rows { - for y in (0..destination.height.saturating_sub(4)).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; + image_store + .buffer + .borrow() + .par_chunks_exact(src_stride * 4) + .zip( + destination + .buffer + .borrow_mut() + .par_chunks_exact_mut(dst_stride * 4), + ) + .for_each(|(src, dst)| { dispatcher( dst_width, src_width, - &weights, - unsafe_source_ptr_0, + &filter_weights, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); }); - yy = y; - } + processed_4 = true; } - for y in yy..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - dispatcher_row( - dst_width, - src_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); + + let left_src_rows = if processed_4 { + image_store + .buffer + .borrow() + .chunks_exact(src_stride * 4) + .remainder() + } else { + image_store.buffer.borrow() + }; + let left_dst_rows = if processed_4 { + destination + .buffer + .borrow_mut() + .chunks_exact_mut(dst_stride * 4) + .into_remainder() + } else { + destination.buffer.borrow_mut() + }; + + left_src_rows + .par_chunks_exact(src_stride) + .zip(left_dst_rows.par_chunks_exact_mut(dst_stride)) + .for_each(|(src, dst)| { + dispatcher_row(dst_width, src_width, &filter_weights, src, dst); }); - } }); } else { - let mut yy = 0usize; - + let mut processed_4 = false; if let 
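// Reviewer note — sketch, not part of the patch. The horizontal path processes the image in
// blocks of four rows and then finishes leftover rows individually; chunks_exact keeps the
// block loop and its remainder explicitly separated. Standalone shape of that control flow,
// with hypothetical kernel names:
fn process_rows(src: &[f32], dst: &mut [f32], src_stride: usize, dst_stride: usize) {
    // Main loop: four source rows paired with four destination rows.
    for (src4, dst4) in src
        .chunks_exact(src_stride * 4)
        .zip(dst.chunks_exact_mut(dst_stride * 4))
    {
        process_four_rows(src4, dst4);
    }
    // Remainder: rows that did not fill a complete 4-row block.
    let src_rem = src.chunks_exact(src_stride * 4).remainder();
    let dst_rem = dst.chunks_exact_mut(dst_stride * 4).into_remainder();
    for (src_row, dst_row) in src_rem
        .chunks_exact(src_stride)
        .zip(dst_rem.chunks_exact_mut(dst_stride))
    {
        process_one_row(src_row, dst_row);
    }
}

fn process_four_rows(_src4: &[f32], _dst4: &mut [f32]) { /* 4-row kernel (hypothetical) */ }
fn process_one_row(_src: &[f32], _dst: &mut [f32]) { /* single-row kernel (hypothetical) */ }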
Some(dispatcher) = dispatcher_4_rows { - while yy + 4 < destination.height { + for (src, dst) in image_store + .buffer + .borrow() + .chunks_exact(src_stride * 4) + .zip( + destination + .buffer + .borrow_mut() + .chunks_exact_mut(dst_stride * 4), + ) + { dispatcher( dst_width, src_width, &filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; - - yy += 4; } + processed_4 = true; } - for _ in yy..destination.height { - dispatcher_row( - dst_width, - src_width, - &filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; + let left_src_rows = if processed_4 { + image_store + .buffer + .borrow() + .chunks_exact(src_stride * 4) + .remainder() + } else { + image_store.buffer.borrow() + }; + let left_dst_rows = if processed_4 { + destination + .buffer + .borrow_mut() + .chunks_exact_mut(dst_stride * 4) + .into_remainder() + } else { + destination.buffer.borrow_mut() + }; + for (src, dst) in left_src_rows + .chunks_exact(src_stride) + .zip(left_dst_rows.chunks_exact_mut(dst_stride)) + { + dispatcher_row(dst_width, src_width, &filter_weights, src, dst); } } } diff --git a/src/lib.rs b/src/lib.rs index de47fb5..290b8d4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -88,7 +88,6 @@ mod scaler_f16; mod sse; mod support; mod threading_policy; -mod unsafe_slice; #[cfg(all(target_arch = "wasm32", target_feature = "simd128",))] mod wasm32; diff --git a/src/neon/plane_f32.rs b/src/neon/plane_f32.rs index 3db9629..cb8c65f 100644 --- a/src/neon/plane_f32.rs +++ b/src/neon/plane_f32.rs @@ -91,8 +91,8 @@ pub(crate) fn convolve_horizontal_plane_neon_row_one( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { unsafe { let mut filter_offset = 0usize; @@ -109,7 +109,7 @@ pub(crate) fn convolve_horizontal_plane_neon_row_one( let read_weights = xvld1q_f32_x4(ptr); store = conv_horiz_plane_16_f32!( bounds_start, - unsafe_source_ptr_0, + src.as_ptr(), read_weights, store ); @@ -122,7 +122,7 @@ pub(crate) fn convolve_horizontal_plane_neon_row_one( let read_weights = xvld1q_f32_x2(ptr); store = conv_horiz_plane_8_f32!( bounds_start, - unsafe_source_ptr_0, + src.as_ptr(), read_weights.0, read_weights.1, store @@ -135,7 +135,7 @@ pub(crate) fn convolve_horizontal_plane_neon_row_one( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); store = - conv_horiz_plane_4_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store); + conv_horiz_plane_4_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 4; } @@ -144,7 +144,7 @@ pub(crate) fn convolve_horizontal_plane_neon_row_one( let ptr = weights_ptr.add(jx + filter_offset); let weights0 = vld1_f32(ptr); let weights = vcombine_f32(weights0, vdup_n_f32(0.)); - store = conv_horiz_plane_2_f32!(bounds_start, unsafe_source_ptr_0, weights, store); + store = conv_horiz_plane_2_f32!(bounds_start, src.as_ptr(), weights, store); jx += 2; } @@ -152,12 +152,12 @@ pub(crate) fn convolve_horizontal_plane_neon_row_one( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); - store = 
conv_horiz_plane_1_f32!(bounds_start, unsafe_source_ptr_0, weight0, store); + store = conv_horiz_plane_1_f32!(bounds_start, src.as_ptr(), weight0, store); jx += 1; } let px = x; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); dest_ptr.write_unaligned(vaddvq_f32(store)); filter_offset += filter_weights.aligned_size; @@ -169,9 +169,9 @@ pub(crate) fn convolve_horizontal_plane_neon_rows_4( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { unsafe { @@ -191,17 +191,13 @@ pub(crate) fn convolve_horizontal_plane_neon_rows_4( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = xvld1q_f32_x4(ptr); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_16_f32!( - bounds_start, - unsafe_source_ptr_0, - read_weights, - store_0 - ); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = + conv_horiz_plane_16_f32!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_plane_16_f32!(bounds_start, s_ptr_1, read_weights, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_plane_16_f32!(bounds_start, s_ptr2, read_weights, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_plane_16_f32!(bounds_start, s_ptr3, read_weights, store_3); jx += 16; } @@ -212,12 +208,12 @@ pub(crate) fn convolve_horizontal_plane_neon_rows_4( let bounds_start = bounds.start + jx; store_0 = conv_horiz_plane_8_f32!( bounds_start, - unsafe_source_ptr_0, + src.as_ptr(), read_weights.0, read_weights.1, store_0 ); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_plane_8_f32!( bounds_start, s_ptr_1, @@ -225,7 +221,7 @@ pub(crate) fn convolve_horizontal_plane_neon_rows_4( read_weights.1, store_1 ); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_plane_8_f32!( bounds_start, s_ptr2, @@ -233,7 +229,7 @@ pub(crate) fn convolve_horizontal_plane_neon_rows_4( read_weights.1, store_2 ); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_plane_8_f32!( bounds_start, s_ptr3, @@ -248,17 +244,13 @@ pub(crate) fn convolve_horizontal_plane_neon_rows_4( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_4_f32!( - bounds_start, - unsafe_source_ptr_0, - read_weights, - store_0 - ); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = + conv_horiz_plane_4_f32!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_plane_4_f32!(bounds_start, s_ptr_1, read_weights, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_plane_4_f32!(bounds_start, s_ptr2, read_weights, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = 
conv_horiz_plane_4_f32!(bounds_start, s_ptr3, read_weights, store_3); jx += 4; } @@ -268,13 +260,12 @@ pub(crate) fn convolve_horizontal_plane_neon_rows_4( let weights0 = vld1_f32(ptr); let weights = vcombine_f32(weights0, vdup_n_f32(0.)); let bounds_start = bounds.start + jx; - store_0 = - conv_horiz_plane_2_f32!(bounds_start, unsafe_source_ptr_0, weights, store_0); - let ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_plane_2_f32!(bounds_start, src.as_ptr(), weights, store_0); + let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_plane_2_f32!(bounds_start, ptr_1, weights, store_1); - let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_plane_2_f32!(bounds_start, ptr_2, weights, store_2); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_plane_2_f32!(bounds_start, ptr_3, weights, store_3); jx += 2; } @@ -283,28 +274,27 @@ pub(crate) fn convolve_horizontal_plane_neon_rows_4( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); let bounds_start = bounds.start + jx; - store_0 = - conv_horiz_plane_1_f32!(bounds_start, unsafe_source_ptr_0, weight0, store_0); - let ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_plane_1_f32!(bounds_start, src.as_ptr(), weight0, store_0); + let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_plane_1_f32!(bounds_start, ptr_1, weight0, store_1); - let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_plane_1_f32!(bounds_start, ptr_2, weight0, store_2); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_plane_1_f32!(bounds_start, ptr_3, weight0, store_3); jx += 1; } let px = x; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); dest_ptr.write_unaligned(vaddvq_f32(store_0)); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); dest_ptr.write_unaligned(vaddvq_f32(store_1)); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); dest_ptr.write_unaligned(vaddvq_f32(store_2)); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); dest_ptr.write_unaligned(vaddvq_f32(store_3)); filter_offset += filter_weights.aligned_size; diff --git a/src/neon/rgb_f32.rs b/src/neon/rgb_f32.rs index 473f745..f1a5325 100644 --- a/src/neon/rgb_f32.rs +++ b/src/neon/rgb_f32.rs @@ -113,9 +113,9 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { unsafe { @@ -138,13 +138,12 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f32( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - store_0 = - conv_horiz_4_rgb_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store_0); - let s_ptr1 = unsafe_source_ptr_0.add(src_stride); + store_0 = 
conv_horiz_4_rgb_f32!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_4_rgb_f32!(bounds_start, s_ptr1, read_weights, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_4_rgb_f32!(bounds_start, s_ptr2, read_weights, store_2); - let s_ptr = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_4_rgb_f32!(bounds_start, s_ptr, read_weights, store_3); jx += 4; } @@ -153,13 +152,12 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f32( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - store_0 = - conv_horiz_2_rgb_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_2_rgb_f32!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_2_rgb_f32!(bounds_start, s_ptr_1, read_weights, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_2_rgb_f32!(bounds_start, s_ptr2, read_weights, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_2_rgb_f32!(bounds_start, s_ptr3, read_weights, store_3); jx += 2; } @@ -168,28 +166,27 @@ pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f32( let ptr = weights_ptr.add(jx + filter_offset); let bounds_start = bounds.start + jx; let weight0 = vld1q_dup_f32(ptr); - store_0 = - conv_horiz_1_rgb_f32!(bounds_start, unsafe_source_ptr_0, weight0, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_1_rgb_f32!(bounds_start, src.as_ptr(), weight0, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_1_rgb_f32!(bounds_start, s_ptr_1, weight0, store_1); - let s_ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_1_rgb_f32!(bounds_start, s_ptr_2, weight0, store_2); - let s_ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_1_rgb_f32!(bounds_start, s_ptr_3, weight0, store_3); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); write_rgb_f32!(store_0, dest_ptr); - let dest_ptr_1 = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr_1 = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); write_rgb_f32!(store_1, dest_ptr_1); - let dest_ptr_2 = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr_2 = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); write_rgb_f32!(store_2, dest_ptr_2); - let dest_ptr_3 = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr_3 = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); write_rgb_f32!(store_3, dest_ptr_3); filter_offset += filter_weights.aligned_size; @@ -201,8 +198,8 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { unsafe { const 
CHANNELS: usize = 3; @@ -218,8 +215,7 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one_f32( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - store = - conv_horiz_4_rgb_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store); + store = conv_horiz_4_rgb_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 4; } @@ -227,8 +223,7 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one_f32( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - store = - conv_horiz_2_rgb_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store); + store = conv_horiz_2_rgb_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 2; } @@ -236,12 +231,12 @@ pub(crate) fn convolve_horizontal_rgb_neon_row_one_f32( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); let bounds_start = bounds.start + jx; - store = conv_horiz_1_rgb_f32!(bounds_start, unsafe_source_ptr_0, weight0, store); + store = conv_horiz_1_rgb_f32!(bounds_start, src.as_ptr(), weight0, store); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); write_rgb_f32!(store, dest_ptr); filter_offset += filter_weights.aligned_size; diff --git a/src/neon/rgba_f32.rs b/src/neon/rgba_f32.rs index 3db6b0e..1e44dd3 100644 --- a/src/neon/rgba_f32.rs +++ b/src/neon/rgba_f32.rs @@ -94,8 +94,8 @@ pub(crate) fn convolve_horizontal_rgba_neon_row_one( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { unsafe { const CHANNELS: usize = 4; @@ -112,7 +112,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row_one( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); store = - conv_horiz_rgba_4_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store); + conv_horiz_rgba_4_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 4; } @@ -121,7 +121,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row_one( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); store = - conv_horiz_rgba_2_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store); + conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 2; } @@ -129,12 +129,12 @@ pub(crate) fn convolve_horizontal_rgba_neon_row_one( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); - store = conv_horiz_rgba_1_f32!(bounds_start, unsafe_source_ptr_0, weight0, store); + store = conv_horiz_rgba_1_f32!(bounds_start, src.as_ptr(), weight0, store); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); vst1q_f32(dest_ptr, store); filter_offset += filter_weights.aligned_size; @@ -146,9 +146,9 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { unsafe { @@ -171,12 +171,12 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4( let bounds_start = bounds.start + jx; store_0 = conv_horiz_rgba_8_f32!( bounds_start, - unsafe_source_ptr_0, + src.as_ptr(), read_weights.0, read_weights.1, store_0 ); - let 
s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_rgba_8_f32!( bounds_start, s_ptr_1, @@ -184,7 +184,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4( read_weights.1, store_1 ); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_rgba_8_f32!( bounds_start, s_ptr2, @@ -192,7 +192,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4( read_weights.1, store_2 ); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_rgba_8_f32!( bounds_start, s_ptr3, @@ -207,17 +207,12 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_4_f32!( - bounds_start, - unsafe_source_ptr_0, - read_weights, - store_0 - ); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_rgba_4_f32!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_rgba_4_f32!(bounds_start, s_ptr_1, read_weights, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_rgba_4_f32!(bounds_start, s_ptr2, read_weights, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_rgba_4_f32!(bounds_start, s_ptr3, read_weights, store_3); jx += 4; } @@ -226,17 +221,12 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_2_f32!( - bounds_start, - unsafe_source_ptr_0, - read_weights, - store_0 - ); - let ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store_0); + let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_rgba_2_f32!(bounds_start, ptr_1, read_weights, store_1); - let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_rgba_2_f32!(bounds_start, ptr_2, read_weights, store_2); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_rgba_2_f32!(bounds_start, ptr_3, read_weights, store_3); jx += 2; } @@ -245,28 +235,27 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); let bounds_start = bounds.start + jx; - store_0 = - conv_horiz_rgba_1_f32!(bounds_start, unsafe_source_ptr_0, weight0, store_0); - let ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_rgba_1_f32!(bounds_start, src.as_ptr(), weight0, store_0); + let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_rgba_1_f32!(bounds_start, ptr_1, weight0, store_1); - let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_rgba_1_f32!(bounds_start, ptr_2, weight0, store_2); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_rgba_1_f32!(bounds_start, 
ptr_3, weight0, store_3); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); vst1q_f32(dest_ptr, store_0); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); vst1q_f32(dest_ptr, store_1); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); vst1q_f32(dest_ptr, store_2); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); vst1q_f32(dest_ptr, store_3); filter_offset += filter_weights.aligned_size; diff --git a/src/neon/vertical_f32.rs b/src/neon/vertical_f32.rs index dbb1b6d..454892c 100644 --- a/src/neon/vertical_f32.rs +++ b/src/neon/vertical_f32.rs @@ -44,10 +44,9 @@ macro_rules! conv_vertical_part_neon_16_f32 { for j in 0..$bounds.size { let py = $start_y + j; let v_weight = vld1q_dup_f32($filter.get_unchecked(j..).as_ptr()); - let src_ptr = $src.add($src_stride * py); + let src_ptr = $src.get_unchecked($src_stride * py + px..); - let s_ptr = src_ptr.add(px); - let item_row = xvld1q_f32_x4(s_ptr); + let item_row = xvld1q_f32_x4(src_ptr.as_ptr()); store_0 = prefer_vfmaq_f32(store_0, item_row.0, v_weight); store_1 = prefer_vfmaq_f32(store_1, item_row.1, v_weight); @@ -55,7 +54,7 @@ macro_rules! conv_vertical_part_neon_16_f32 { store_3 = prefer_vfmaq_f32(store_3, item_row.3, v_weight); } - let dst_ptr = $dst.add(px); + let dst_ptr = $dst.get_unchecked_mut(px..).as_mut_ptr(); let f_set = float32x4x4_t(store_0, store_1, store_2, store_3); vst1q_f32_x4(dst_ptr, f_set); } @@ -79,11 +78,10 @@ macro_rules! conv_vertical_part_neon_32_f32 { for j in 0..$bounds.size { let py = $start_y + j; let v_weight = vld1q_dup_f32($filter.get_unchecked(j..).as_ptr()); - let src_ptr = $src.add($src_stride * py); + let src_ptr = $src.get_unchecked($src_stride * py + px..).as_ptr(); - let s_ptr = src_ptr.add(px); - let item_row_0 = xvld1q_f32_x4(s_ptr); - let item_row_1 = xvld1q_f32_x4(s_ptr.add(16)); + let item_row_0 = xvld1q_f32_x4(src_ptr); + let item_row_1 = xvld1q_f32_x4(src_ptr.add(16)); store_0 = prefer_vfmaq_f32(store_0, item_row_0.0, v_weight); store_1 = prefer_vfmaq_f32(store_1, item_row_0.1, v_weight); @@ -96,7 +94,7 @@ macro_rules! conv_vertical_part_neon_32_f32 { store_7 = prefer_vfmaq_f32(store_7, item_row_1.3, v_weight); } - let dst_ptr = $dst.add(px); + let dst_ptr = $dst.get_unchecked_mut(px..).as_mut_ptr(); let f_set = float32x4x4_t(store_0, store_1, store_2, store_3); vst1q_f32_x4(dst_ptr, f_set); @@ -129,12 +127,11 @@ macro_rules! conv_vertical_part_neon_48_f32 { for j in 0..$bounds.size { let py = $start_y + j; let v_weight = vld1q_dup_f32($filter.get_unchecked(j..).as_ptr()); - let src_ptr = $src.add($src_stride * py); + let src_ptr = $src.get_unchecked($src_stride * py + px..).as_ptr(); - let s_ptr = src_ptr.add(px); - let item_row_0 = xvld1q_f32_x4(s_ptr); - let item_row_1 = xvld1q_f32_x4(s_ptr.add(16)); - let item_row_2 = xvld1q_f32_x4(s_ptr.add(32)); + let item_row_0 = xvld1q_f32_x4(src_ptr); + let item_row_1 = xvld1q_f32_x4(src_ptr.add(16)); + let item_row_2 = xvld1q_f32_x4(src_ptr.add(32)); store_0 = prefer_vfmaq_f32(store_0, item_row_0.0, v_weight); store_1 = prefer_vfmaq_f32(store_1, item_row_0.1, v_weight); @@ -152,7 +149,7 @@ macro_rules! 
conv_vertical_part_neon_48_f32 { store_11 = prefer_vfmaq_f32(store_11, item_row_2.3, v_weight); } - let dst_ptr = $dst.add(px); + let dst_ptr = $dst.get_unchecked_mut(px..).as_mut_ptr(); let f_set = float32x4x4_t(store_0, store_1, store_2, store_3); vst1q_f32_x4(dst_ptr, f_set); @@ -169,9 +166,9 @@ macro_rules! conv_vertical_part_neon_48_f32 { unsafe fn convolve_vertical_part_neon_8_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -184,10 +181,8 @@ unsafe fn convolve_vertical_part_neon_8_f32( let py = start_y + j; let weight = filter.get_unchecked(j..); let v_weight = vld1q_dup_f32(weight.as_ptr()); - let src_ptr = src.add(src_stride * py); - - let s_ptr = src_ptr.add(px); - let item_row = xvld1q_f32_x2(s_ptr); + let src_ptr = src.get_unchecked(src_stride * py + px..); + let item_row = xvld1q_f32_x2(src_ptr.as_ptr()); store_0 = prefer_vfmaq_f32(store_0, item_row.0, v_weight); store_1 = prefer_vfmaq_f32(store_1, item_row.1, v_weight); @@ -195,7 +190,7 @@ unsafe fn convolve_vertical_part_neon_8_f32( let item = float32x4x2_t(store_0, store_1); - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); vst1q_f32_x2(dst_ptr, item); } @@ -203,9 +198,9 @@ unsafe fn convolve_vertical_part_neon_8_f32( unsafe fn convolve_vertical_part_neon_4_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -217,15 +212,14 @@ unsafe fn convolve_vertical_part_neon_4_f32( let py = start_y + j; let weight = filter.get_unchecked(j..); let v_weight = vld1q_dup_f32(weight.as_ptr()); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py + px..); - let s_ptr = src_ptr.add(px); - let item_row = xvld1q_f32_x2(s_ptr); + let item_row = xvld1q_f32_x2(src_ptr.as_ptr()); store_0 = prefer_vfmaq_f32(store_0, item_row.0, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); vst1q_f32(dst_ptr, store_0); } @@ -233,9 +227,9 @@ unsafe fn convolve_vertical_part_neon_4_f32( unsafe fn convolve_vertical_part_neon_1_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -247,23 +241,21 @@ unsafe fn convolve_vertical_part_neon_1_f32( let py = start_y + j; let weight = filter.get_unchecked(j..); let v_weight = vld1q_dup_f32(weight.as_ptr()); - let src_ptr = src.add(src_stride * py); - - let s_ptr = src_ptr.add(px); - let item_row = vld1q_dup_f32(s_ptr); + let src_ptr = src.get_unchecked(src_stride * py + px..); + let item_row = vld1q_dup_f32(src_ptr.as_ptr()); store_0 = prefer_vfmaq_f32(store_0, item_row, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); dst_ptr.write_unaligned(vgetq_lane_f32::<0>(store_0)); } pub(crate) fn convolve_vertical_rgb_neon_row_f32( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight_ptr: &[f32], ) { @@ -271,43 +263,19 @@ pub(crate) fn convolve_vertical_rgb_neon_row_f32( let dst_width = width * CHANNELS; while cx + 48 < dst_width { - conv_vertical_part_neon_48_f32!( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds - ); + 
conv_vertical_part_neon_48_f32!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 48; } while cx + 32 < dst_width { - conv_vertical_part_neon_32_f32!( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds - ); + conv_vertical_part_neon_32_f32!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 32; } while cx + 16 < dst_width { - conv_vertical_part_neon_16_f32!( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds - ); + conv_vertical_part_neon_16_f32!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 16; } @@ -317,9 +285,9 @@ pub(crate) fn convolve_vertical_rgb_neon_row_f32( convolve_vertical_part_neon_8_f32( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -333,9 +301,9 @@ pub(crate) fn convolve_vertical_rgb_neon_row_f32( convolve_vertical_part_neon_4_f32( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -349,9 +317,9 @@ pub(crate) fn convolve_vertical_rgb_neon_row_f32( convolve_vertical_part_neon_1_f32( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); diff --git a/src/plane_f32.rs b/src/plane_f32.rs index 742d024..61f533a 100644 --- a/src/plane_f32.rs +++ b/src/plane_f32.rs @@ -58,10 +58,10 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 1> { pool: &Option, ) { let mut _dispatcher_4_rows: Option< - fn(usize, usize, &FilterWeights, *const f32, usize, *mut f32, usize), - > = Some(convolve_horizontal_rgba_4_row_f32::); - let mut _dispatcher_row: fn(usize, usize, &FilterWeights, *const f32, *mut f32) = - convolve_horizontal_rgb_native_row::; + fn(usize, usize, &FilterWeights, &[f32], usize, &mut [f32], usize), + > = Some(convolve_horizontal_rgba_4_row_f32::<1>); + let mut _dispatcher_row: fn(usize, usize, &FilterWeights, &[f32], &mut [f32]) = + convolve_horizontal_rgb_native_row::<1>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { _dispatcher_4_rows = Some(convolve_horizontal_plane_neon_rows_4); @@ -96,8 +96,8 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 1> { destination: &mut ImageStore, pool: &Option, ) { - let mut _dispatcher: fn(usize, &FilterBounds, *const f32, *mut f32, usize, &[f32]) = - convolve_vertical_rgb_native_row_f32::; + let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = + convolve_vertical_rgb_native_row_f32::<1>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { _dispatcher = convolve_vertical_rgb_neon_row_f32::<1>; diff --git a/src/rgb_f32.rs b/src/rgb_f32.rs index f06005d..357d8ec 100644 --- a/src/rgb_f32.rs +++ b/src/rgb_f32.rs @@ -32,60 +32,23 @@ use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::convolve_naive_f32::*; use crate::dispatch_group_f32::{convolve_horizontal_dispatch_f32, convolve_vertical_dispatch_f32}; use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::floating_point_vertical::column_handler_floating_point; use crate::image_store::ImageStore; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::neon::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::sse::*; -use num_traits::AsPrimitive; use rayon::ThreadPool; -pub(crate) fn convolve_vertical_rgb_native_row_f32< - T: Copy + 'static + AsPrimitive, - const 
COMPONENTS: usize, ->( - dst_width: usize, +pub(crate) fn convolve_vertical_rgb_native_row_f32( + _: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const T, - unsafe_destination_ptr_0: *mut T, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight: &[f32], -) where - f32: AsPrimitive, -{ - let mut cx = 0usize; - - while cx + 4 < dst_width { - unsafe { - convolve_vertical_part_4_f32::( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight, - bounds, - ); - } - - cx += 4; - } - - while cx < dst_width { - unsafe { - convolve_vertical_part_f32::( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight, - bounds, - ); - } - - cx += 1; - } +) { + column_handler_floating_point::(bounds, src, dst, src_stride, weight, 8); } impl HorizontalConvolutionPass for ImageStore<'_, f32, 3> { @@ -97,10 +60,10 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 3> { pool: &Option, ) { let mut _dispatcher_4_rows: Option< - fn(usize, usize, &FilterWeights, *const f32, usize, *mut f32, usize), - > = Some(convolve_horizontal_rgba_4_row_f32::); - let mut _dispatcher_row: fn(usize, usize, &FilterWeights, *const f32, *mut f32) = - convolve_horizontal_rgb_native_row::; + fn(usize, usize, &FilterWeights, &[f32], usize, &mut [f32], usize), + > = Some(convolve_horizontal_rgba_4_row_f32::<3>); + let mut _dispatcher_row: fn(usize, usize, &FilterWeights, &[f32], &mut [f32]) = + convolve_horizontal_rgb_native_row::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { _dispatcher_4_rows = Some(convolve_horizontal_rgb_neon_rows_4_f32); @@ -135,8 +98,8 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 3> { destination: &mut ImageStore, pool: &Option, ) { - let mut _dispatcher: fn(usize, &FilterBounds, *const f32, *mut f32, usize, &[f32]) = - convolve_vertical_rgb_native_row_f32::; + let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = + convolve_vertical_rgb_native_row_f32::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { _dispatcher = convolve_vertical_rgb_neon_row_f32::<3>; diff --git a/src/rgba_f32.rs b/src/rgba_f32.rs index 09fed44..f2081a4 100644 --- a/src/rgba_f32.rs +++ b/src/rgba_f32.rs @@ -54,10 +54,10 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 4> { pool: &Option, ) { let mut _dispatcher_4_rows: Option< - fn(usize, usize, &FilterWeights, *const f32, usize, *mut f32, usize), - > = Some(convolve_horizontal_rgba_4_row_f32::); - let mut _dispatcher_row: fn(usize, usize, &FilterWeights, *const f32, *mut f32) = - convolve_horizontal_rgb_native_row::; + fn(usize, usize, &FilterWeights, &[f32], usize, &mut [f32], usize), + > = Some(convolve_horizontal_rgba_4_row_f32::<4>); + let mut _dispatcher_row: fn(usize, usize, &FilterWeights, &[f32], &mut [f32]) = + convolve_horizontal_rgb_native_row::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4); @@ -65,7 +65,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 4> { } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { - if is_x86_feature_detected!("sse4.1") { + if std::is_x86_feature_detected!("sse4.1") { _dispatcher_4_rows = Some(convolve_horizontal_rgba_sse_rows_4_f32::); _dispatcher_row = convolve_horizontal_rgba_sse_row_one_f32::; if is_x86_feature_detected!("fma") { @@ -73,7 +73,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 4> { _dispatcher_row = 
convolve_horizontal_rgba_sse_row_one_f32::; } } - if is_x86_feature_detected!("avx2") { + if std::is_x86_feature_detected!("avx2") { _dispatcher_4_rows = Some(convolve_horizontal_rgba_avx_rows_4_f32::); _dispatcher_row = convolve_horizontal_rgba_avx_row_one_f32::; if is_x86_feature_detected!("fma") { @@ -100,23 +100,23 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 4> { destination: &mut ImageStore, pool: &Option, ) { - let mut _dispatcher: fn(usize, &FilterBounds, *const f32, *mut f32, usize, &[f32]) = - convolve_vertical_rgb_native_row_f32::; + let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = + convolve_vertical_rgb_native_row_f32::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { _dispatcher = convolve_vertical_rgb_neon_row_f32::<4>; } #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] { - let has_fma = is_x86_feature_detected!("fma"); - if is_x86_feature_detected!("sse4.1") { + let has_fma = std::is_x86_feature_detected!("fma"); + if std::is_x86_feature_detected!("sse4.1") { if has_fma { _dispatcher = convolve_vertical_rgb_sse_row_f32::<4, true>; } else { _dispatcher = convolve_vertical_rgb_sse_row_f32::<4, false>; } } - if is_x86_feature_detected!("avx2") { + if std::is_x86_feature_detected!("avx2") { _dispatcher = convolve_vertical_avx_row_f32::<4, false>; if has_fma { _dispatcher = convolve_vertical_avx_row_f32::<4, true>; diff --git a/src/sse/plane_f32.rs b/src/sse/plane_f32.rs index 7b619eb..8852f17 100644 --- a/src/sse/plane_f32.rs +++ b/src/sse/plane_f32.rs @@ -36,7 +36,7 @@ use std::arch::x86_64::*; macro_rules! conv_horiz_plane_16_f32 { ($start_x: expr, $src: expr, $set: expr, $store: expr, $fma: expr) => {{ - let src_ptr = $src.add($start_x); + let src_ptr = $src.get_unchecked($start_x..).as_ptr(); let rgb_pixel0 = _mm_loadu_ps(src_ptr); let rgb_pixel1 = _mm_loadu_ps(src_ptr.add(4)); @@ -53,7 +53,7 @@ macro_rules! conv_horiz_plane_16_f32 { macro_rules! conv_horiz_plane_8_f32 { ($start_x: expr, $src: expr, $set1: expr, $set2: expr, $store: expr, $fma: expr) => {{ - let src_ptr = $src.add($start_x); + let src_ptr = $src.get_unchecked($start_x..).as_ptr(); let rgb_pixel0 = _mm_loadu_ps(src_ptr); let rgb_pixel1 = _mm_loadu_ps(src_ptr.add(4)); @@ -66,7 +66,7 @@ macro_rules! conv_horiz_plane_8_f32 { macro_rules! conv_horiz_plane_4_f32 { ($start_x: expr, $src: expr, $set1: expr, $store: expr, $fma: expr) => {{ - let src_ptr = $src.add($start_x); + let src_ptr = $src.get_unchecked($start_x..).as_ptr(); let rgb_pixel = _mm_loadu_ps(src_ptr); @@ -76,7 +76,7 @@ macro_rules! conv_horiz_plane_4_f32 { macro_rules! conv_horiz_plane_2_f32 { ($start_x: expr, $src: expr, $set: expr, $store: expr, $fma: expr) => {{ - let src_ptr = $src.add($start_x); + let src_ptr = $src.get_unchecked($start_x..).as_ptr(); let rgb_pixel = _mm_setr_ps( src_ptr.read_unaligned(), @@ -91,7 +91,7 @@ macro_rules! conv_horiz_plane_2_f32 { macro_rules! 
conv_horiz_plane_1_f32 { ($start_x: expr, $src: expr, $set: expr, $store: expr, $fma: expr) => {{ - let src_ptr = $src.add($start_x); + let src_ptr = $src.get_unchecked($start_x..).as_ptr(); let rgb_pixel = _mm_setr_ps(src_ptr.read_unaligned(), 0., 0., 0.); _mm_prefer_fma_ps::<$fma>($store, rgb_pixel, $set) }}; @@ -101,8 +101,8 @@ pub(crate) fn convolve_horizontal_plane_sse_row_one( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { unsafe { if FMA { @@ -110,64 +110,62 @@ pub(crate) fn convolve_horizontal_plane_sse_row_one( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } else { convolve_horizontal_plane_sse_row_one_regular( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } } } -#[inline] #[target_feature(enable = "sse4.1")] unsafe fn convolve_horizontal_plane_sse_row_one_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { convolve_horizontal_plane_sse_row_one_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } -#[inline] #[target_feature(enable = "sse4.1,fma")] unsafe fn convolve_horizontal_plane_sse_row_one_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { convolve_horizontal_plane_sse_row_one_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } -#[inline] +#[inline(always)] unsafe fn convolve_horizontal_plane_sse_row_one_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { let mut filter_offset = 0usize; let weights_ptr = filter_weights.weights.as_ptr(); @@ -185,8 +183,7 @@ unsafe fn convolve_horizontal_plane_sse_row_one_impl( let read_weights2 = _mm_loadu_ps(ptr.add(8)); let read_weights3 = _mm_loadu_ps(ptr.add(12)); let weights = (read_weights0, read_weights1, read_weights2, read_weights3); - store = - conv_horiz_plane_16_f32!(bounds_start, unsafe_source_ptr_0, weights, store, FMA); + store = conv_horiz_plane_16_f32!(bounds_start, src, weights, store, FMA); jx += 8; } @@ -198,7 +195,7 @@ unsafe fn convolve_horizontal_plane_sse_row_one_impl( let read_weights = (read_weights0, read_weights1); store = conv_horiz_plane_8_f32!( bounds_start, - unsafe_source_ptr_0, + src, read_weights.0, read_weights.1, store, @@ -211,13 +208,7 @@ unsafe fn convolve_horizontal_plane_sse_row_one_impl( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = _mm_loadu_ps(ptr); - store = conv_horiz_plane_4_f32!( - bounds_start, - unsafe_source_ptr_0, - read_weights, - store, - FMA - ); + store = conv_horiz_plane_4_f32!(bounds_start, src, read_weights, store, FMA); jx += 4; } @@ -225,7 +216,7 @@ unsafe fn convolve_horizontal_plane_sse_row_one_impl( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let weights = _mm_setr_ps(ptr.read_unaligned(), ptr.add(1).read_unaligned(), 0., 0.); - store = conv_horiz_plane_2_f32!(bounds_start, unsafe_source_ptr_0, weights, store, FMA); + store 
= conv_horiz_plane_2_f32!(bounds_start, src, weights, store, FMA); jx += 2; } @@ -233,13 +224,13 @@ unsafe fn convolve_horizontal_plane_sse_row_one_impl( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let weight0 = _mm_load1_ps(ptr); - store = conv_horiz_plane_1_f32!(bounds_start, unsafe_source_ptr_0, weight0, store, FMA); + store = conv_horiz_plane_1_f32!(bounds_start, src, weight0, store, FMA); jx += 1; } let px = x; - let dest_ptr = unsafe_destination_ptr_0.add(px); - dest_ptr.write_unaligned(_mm_hsum_ps(store)); + let dest_ptr = dst.get_unchecked_mut(px); + *dest_ptr = _mm_hsum_ps(store); filter_offset += filter_weights.aligned_size; } @@ -249,9 +240,9 @@ pub(crate) fn convolve_horizontal_plane_sse_rows_4( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { unsafe { @@ -260,9 +251,9 @@ pub(crate) fn convolve_horizontal_plane_sse_rows_4( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } else { @@ -270,55 +261,53 @@ pub(crate) fn convolve_horizontal_plane_sse_rows_4( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } } } -#[inline] #[target_feature(enable = "sse4.1")] unsafe fn convolve_horizontal_plane_sse_rows_4_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { convolve_horizontal_plane_sse_rows_4_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } -#[inline] #[target_feature(enable = "sse4.1,fma")] unsafe fn convolve_horizontal_plane_sse_rows_4_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { convolve_horizontal_plane_sse_rows_4_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -328,9 +317,9 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { unsafe { @@ -354,18 +343,12 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( let read_weights3 = _mm_loadu_ps(ptr.add(12)); let weights = (read_weights0, read_weights1, read_weights2, read_weights3); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_16_f32!( - bounds_start, - unsafe_source_ptr_0, - weights, - store_0, - FMA - ); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_plane_16_f32!(bounds_start, src, weights, store_0, FMA); + let s_ptr_1 = src.get_unchecked(src_stride..); store_1 = conv_horiz_plane_16_f32!(bounds_start, s_ptr_1, weights, store_1, FMA); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..); store_2 = conv_horiz_plane_16_f32!(bounds_start, s_ptr2, weights, store_2, FMA); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + let 
s_ptr3 = src.get_unchecked(src_stride * 3..); store_3 = conv_horiz_plane_16_f32!(bounds_start, s_ptr3, weights, store_3, FMA); jx += 16; } @@ -378,13 +361,13 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( let bounds_start = bounds.start + jx; store_0 = conv_horiz_plane_8_f32!( bounds_start, - unsafe_source_ptr_0, + src, read_weights.0, read_weights.1, store_0, FMA ); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + let s_ptr_1 = src.get_unchecked(src_stride..); store_1 = conv_horiz_plane_8_f32!( bounds_start, s_ptr_1, @@ -393,7 +376,7 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( store_1, FMA ); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..); store_2 = conv_horiz_plane_8_f32!( bounds_start, s_ptr2, @@ -402,7 +385,7 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( store_2, FMA ); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr3 = src.get_unchecked(src_stride * 3..); store_3 = conv_horiz_plane_8_f32!( bounds_start, s_ptr3, @@ -418,19 +401,13 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( let ptr = weights_ptr.add(jx + filter_offset); let read_weights = _mm_loadu_ps(ptr); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_4_f32!( - bounds_start, - unsafe_source_ptr_0, - read_weights, - store_0, - FMA - ); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_plane_4_f32!(bounds_start, src, read_weights, store_0, FMA); + let s_ptr_1 = src.get_unchecked(src_stride..); store_1 = conv_horiz_plane_4_f32!(bounds_start, s_ptr_1, read_weights, store_1, FMA); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr2 = src.get_unchecked(src_stride * 2..); store_2 = conv_horiz_plane_4_f32!(bounds_start, s_ptr2, read_weights, store_2, FMA); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr3 = src.get_unchecked(src_stride * 3..); store_3 = conv_horiz_plane_4_f32!(bounds_start, s_ptr3, read_weights, store_3, FMA); jx += 4; } @@ -440,18 +417,12 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( let weights = _mm_setr_ps(ptr.read_unaligned(), ptr.add(1).read_unaligned(), 0., 0.); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_2_f32!( - bounds_start, - unsafe_source_ptr_0, - weights, - store_0, - FMA - ); - let ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_plane_2_f32!(bounds_start, src, weights, store_0, FMA); + let ptr_1 = src.get_unchecked(src_stride..); store_1 = conv_horiz_plane_2_f32!(bounds_start, ptr_1, weights, store_1, FMA); - let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let ptr_2 = src.get_unchecked(src_stride * 2..); store_2 = conv_horiz_plane_2_f32!(bounds_start, ptr_2, weights, store_2, FMA); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let ptr_3 = src.get_unchecked(src_stride * 3..); store_3 = conv_horiz_plane_2_f32!(bounds_start, ptr_3, weights, store_3, FMA); jx += 2; } @@ -460,34 +431,28 @@ unsafe fn convolve_horizontal_plane_sse_rows_4_impl( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = _mm_set1_ps(ptr.read_unaligned()); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_plane_1_f32!( - bounds_start, - unsafe_source_ptr_0, - weight0, - store_0, - FMA - ); - let ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_plane_1_f32!(bounds_start, src, weight0, store_0, FMA); + let ptr_1 = src.get_unchecked(src_stride..); store_1 = conv_horiz_plane_1_f32!(bounds_start, ptr_1, weight0, store_1, FMA); - let 
ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let ptr_2 = src.get_unchecked(src_stride * 2..); store_2 = conv_horiz_plane_1_f32!(bounds_start, ptr_2, weight0, store_2, FMA); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let ptr_3 = src.get_unchecked(src_stride * 3..); store_3 = conv_horiz_plane_1_f32!(bounds_start, ptr_3, weight0, store_3, FMA); jx += 1; } let px = x; - let dest_ptr = unsafe_destination_ptr_0.add(px); - dest_ptr.write_unaligned(_mm_hsum_ps(store_0)); + let dest_ptr = dst.get_unchecked_mut(px); + *dest_ptr = _mm_hsum_ps(store_0); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); - dest_ptr.write_unaligned(_mm_hsum_ps(store_1)); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride); + *dest_ptr = _mm_hsum_ps(store_1); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); - dest_ptr.write_unaligned(_mm_hsum_ps(store_2)); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2); + *dest_ptr = _mm_hsum_ps(store_2); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); - dest_ptr.write_unaligned(_mm_hsum_ps(store_3)); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3); + *dest_ptr = _mm_hsum_ps(store_3); filter_offset += filter_weights.aligned_size; } diff --git a/src/sse/rgb_f32.rs b/src/sse/rgb_f32.rs index 918daed..26641f4 100644 --- a/src/sse/rgb_f32.rs +++ b/src/sse/rgb_f32.rs @@ -37,7 +37,7 @@ use std::arch::x86_64::*; #[inline(always)] unsafe fn convolve_horizontal_parts_4_rgb_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m128, weight1: __m128, weight2: __m128, @@ -45,7 +45,7 @@ unsafe fn convolve_horizontal_parts_4_rgb_f32( store_0: __m128, ) -> __m128 { const COMPONENTS: usize = 3; - let src_ptr = src.add(start_x * COMPONENTS); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); let rgb_pixel_0 = _mm_loadu_ps(src_ptr); let rgb_pixel_1 = _mm_loadu_ps(src_ptr.add(3)); @@ -66,13 +66,13 @@ unsafe fn convolve_horizontal_parts_4_rgb_f32( #[inline(always)] unsafe fn convolve_horizontal_parts_2_rgb_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m128, weight1: __m128, store_0: __m128, ) -> __m128 { const COMPONENTS: usize = 3; - let src_ptr = src.add(start_x * COMPONENTS); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); let orig1 = _mm_loadu_ps(src_ptr); let rgb_pixel_0 = orig1; @@ -91,12 +91,12 @@ unsafe fn convolve_horizontal_parts_2_rgb_f32( #[inline(always)] unsafe fn convolve_horizontal_parts_one_rgb_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m128, store_0: __m128, ) -> __m128 { const COMPONENTS: usize = 3; - let src_ptr = src.add(start_x * COMPONENTS); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); let rgb_pixel = _mm_setr_ps( src_ptr.add(0).read_unaligned(), src_ptr.add(1).read_unaligned(), @@ -110,8 +110,8 @@ pub(crate) fn convolve_horizontal_rgb_sse_row_one_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { unsafe { if FMA { @@ -119,54 +119,52 @@ pub(crate) fn convolve_horizontal_rgb_sse_row_one_f32( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } else { convolve_horizontal_rgb_sse_row_one_f32_regular( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } } } -#[inline] #[target_feature(enable = "sse4.1")] unsafe fn 
convolve_horizontal_rgb_sse_row_one_f32_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { convolve_horizontal_rgb_sse_row_one_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } -#[inline] -#[target_feature(enable = "sse4.1,fma")] +#[target_feature(enable = "sse4.1", enable = "fma")] unsafe fn convolve_horizontal_rgb_sse_row_one_f32_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { convolve_horizontal_rgb_sse_row_one_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -175,8 +173,8 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f32_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { const CHANNELS: usize = 3; let mut filter_offset = 0usize; @@ -193,7 +191,7 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f32_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_4_rgb_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, weight2, @@ -215,7 +213,7 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f32_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_2_rgb_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, store, @@ -227,17 +225,12 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f32_impl( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = _mm_load1_ps(ptr); let filter_start = jx + bounds.start; - store = convolve_horizontal_parts_one_rgb_f32::( - filter_start, - unsafe_source_ptr_0, - weight0, - store, - ); + store = convolve_horizontal_parts_one_rgb_f32::(filter_start, src, weight0, store); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store))); (dest_ptr as *mut i32) .add(2) @@ -251,9 +244,9 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { unsafe { @@ -262,9 +255,9 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } else { @@ -272,67 +265,65 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f32( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } } } -#[inline] #[target_feature(enable = "sse4.1")] unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { convolve_horizontal_rgb_sse_rows_4_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } -#[inline] 
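// Every SSE f32 routine touched by this patch follows the same dispatch shape: a safe entry
// point picks between two thin #[target_feature] wrappers, and both wrappers forward to one
// #[inline(always)] body parameterized by `const FMA: bool`, so only the wrappers carry ISA
// attributes. A minimal, self-contained sketch of that shape (x86/x86_64 only; `dispatch_row`,
// `row_fma`, `row_regular` and `row_impl` are illustrative names, not crate API):

fn dispatch_row(src: &[f32], dst: &mut [f32]) {
    // In the crate the sse4.1/fma checks happen once, where the row dispatcher function
    // pointer is selected; they are inlined here only to keep the sketch self-contained.
    if std::is_x86_feature_detected!("sse4.1") {
        unsafe {
            if std::is_x86_feature_detected!("fma") {
                row_fma(src, dst);
            } else {
                row_regular(src, dst);
            }
        }
    } else {
        // A scalar fallback path would go here.
        for (d, s) in dst.iter_mut().zip(src.iter()) {
            *d = *s;
        }
    }
}

#[target_feature(enable = "sse4.1", enable = "fma")]
unsafe fn row_fma(src: &[f32], dst: &mut [f32]) {
    row_impl::<true>(src, dst);
}

#[target_feature(enable = "sse4.1")]
unsafe fn row_regular(src: &[f32], dst: &mut [f32]) {
    row_impl::<false>(src, dst);
}

#[inline(always)]
unsafe fn row_impl<const FMA: bool>(src: &[f32], dst: &mut [f32]) {
    // The real bodies select _mm_fmadd_ps when FMA is true and mul+add otherwise;
    // a plain copy stands in for the convolution here.
    let _ = FMA;
    for (d, s) in dst.iter_mut().zip(src.iter()) {
        *d = *s;
    }
}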
-#[target_feature(enable = "sse4.1,fma")] +#[target_feature(enable = "sse4.1", enable = "fma")] unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { convolve_horizontal_rgb_sse_rows_4_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } -#[inline] +#[inline(always)] unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { const CHANNELS: usize = 3; @@ -354,7 +345,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_impl( let filter_start = jx + bounds.start; store_0 = convolve_horizontal_parts_4_rgb_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, weight2, @@ -363,7 +354,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_impl( ); store_1 = convolve_horizontal_parts_4_rgb_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight0, weight1, weight2, @@ -372,7 +363,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_impl( ); store_2 = convolve_horizontal_parts_4_rgb_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight0, weight1, weight2, @@ -381,7 +372,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_impl( ); store_3 = convolve_horizontal_parts_4_rgb_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight0, weight1, weight2, @@ -403,28 +394,28 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_impl( let filter_start = jx + bounds.start; store_0 = convolve_horizontal_parts_2_rgb_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, store_0, ); store_1 = convolve_horizontal_parts_2_rgb_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight0, weight1, store_1, ); store_2 = convolve_horizontal_parts_2_rgb_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight0, weight1, store_2, ); store_3 = convolve_horizontal_parts_2_rgb_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight0, weight1, store_3, @@ -436,27 +427,23 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_impl( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = _mm_load1_ps(ptr); let filter_start = jx + bounds.start; - store_0 = convolve_horizontal_parts_one_rgb_f32::( - filter_start, - unsafe_source_ptr_0, - weight0, - store_0, - ); + store_0 = + convolve_horizontal_parts_one_rgb_f32::(filter_start, src, weight0, store_0); store_1 = convolve_horizontal_parts_one_rgb_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight0, store_1, ); store_2 = convolve_horizontal_parts_one_rgb_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight0, store_2, ); store_3 = convolve_horizontal_parts_one_rgb_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight0, store_3, ); @@ -464,25 +451,25 @@ unsafe fn 
convolve_horizontal_rgb_sse_rows_4_f32_impl( } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store_0))); (dest_ptr as *mut i32) .add(2) .write_unaligned(_mm_extract_ps::<2>(store_0)); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store_1))); (dest_ptr as *mut i32) .add(2) .write_unaligned(_mm_extract_ps::<2>(store_1)); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store_2))); (dest_ptr as *mut i32) .add(2) .write_unaligned(_mm_extract_ps::<2>(store_2)); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store_3))); (dest_ptr as *mut i32) .add(2) diff --git a/src/sse/rgba_f32.rs b/src/sse/rgba_f32.rs index 9b4b244..1ba151f 100644 --- a/src/sse/rgba_f32.rs +++ b/src/sse/rgba_f32.rs @@ -37,12 +37,12 @@ use std::arch::x86_64::*; #[inline(always)] unsafe fn convolve_horizontal_parts_one_rgba_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m128, store_0: __m128, ) -> __m128 { const COMPONENTS: usize = 4; - let src_ptr = src.add(start_x * COMPONENTS); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); let rgb_pixel = _mm_loadu_ps(src_ptr); _mm_prefer_fma_ps::(store_0, rgb_pixel, weight0) } @@ -51,8 +51,8 @@ pub(crate) fn convolve_horizontal_rgba_sse_row_one_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { unsafe { if FMA { @@ -60,54 +60,52 @@ pub(crate) fn convolve_horizontal_rgba_sse_row_one_f32( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } else { convolve_horizontal_rgba_sse_row_one_f32_regular( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } } } -#[inline] #[target_feature(enable = "sse4.1,fma")] unsafe fn convolve_horizontal_rgba_sse_row_one_f32_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { convolve_horizontal_rgba_sse_row_one_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } -#[inline] #[target_feature(enable = "sse4.1")] unsafe fn convolve_horizontal_rgba_sse_row_one_f32_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], ) { convolve_horizontal_rgba_sse_row_one_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -116,8 +114,8 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f32_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: 
&[f32], + dst: &mut [f32], ) { unsafe { const CHANNELS: usize = 4; @@ -135,7 +133,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f32_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, weight2, @@ -157,7 +155,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f32_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_2_rgba_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, store, @@ -171,7 +169,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f32_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_one_rgba_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, store, ); @@ -179,8 +177,8 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f32_impl( } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); - _mm_storeu_ps(dest_ptr, store); + let dest_ptr = dst.get_unchecked_mut(px..); + _mm_storeu_ps(dest_ptr.as_mut_ptr(), store); filter_offset += filter_weights.aligned_size; } @@ -190,7 +188,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f32_impl( #[inline(always)] unsafe fn convolve_horizontal_parts_4_rgba_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m128, weight1: __m128, weight2: __m128, @@ -198,7 +196,7 @@ unsafe fn convolve_horizontal_parts_4_rgba_f32( store_0: __m128, ) -> __m128 { const COMPONENTS: usize = 4; - let src_ptr = src.add(start_x * COMPONENTS); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); let rgb_pixel_0 = _mm_loadu_ps(src_ptr); let rgb_pixel_1 = _mm_loadu_ps(src_ptr.add(4)); @@ -214,13 +212,13 @@ unsafe fn convolve_horizontal_parts_4_rgba_f32( #[inline(always)] unsafe fn convolve_horizontal_parts_2_rgba_f32( start_x: usize, - src: *const f32, + src: &[f32], weight0: __m128, weight1: __m128, store_0: __m128, ) -> __m128 { const COMPONENTS: usize = 4; - let src_ptr = src.add(start_x * COMPONENTS); + let src_ptr = src.get_unchecked(start_x * COMPONENTS..).as_ptr(); let rgb_pixel_0 = _mm_loadu_ps(src_ptr); let rgb_pixel_1 = _mm_loadu_ps(src_ptr.add(4)); @@ -233,9 +231,9 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { unsafe { @@ -244,9 +242,9 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } else { @@ -254,67 +252,65 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } } } -#[inline] #[target_feature(enable = "sse4.1,fma")] unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { convolve_horizontal_rgba_sse_rows_4_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } -#[inline] #[target_feature(enable = "sse4.1")] unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_regular( dst_width: usize, src_width: usize, filter_weights: 
&FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { convolve_horizontal_rgba_sse_rows_4_f32_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } -#[inline] +#[inline(always)] unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, + src: &[f32], src_stride: usize, - unsafe_destination_ptr_0: *mut f32, + dst: &mut [f32], dst_stride: usize, ) { const CHANNELS: usize = 4; @@ -336,7 +332,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_impl( store_0 = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, weight2, @@ -345,7 +341,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_impl( ); store_1 = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight0, weight1, weight2, @@ -354,7 +350,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_impl( ); store_2 = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight0, weight1, weight2, @@ -363,7 +359,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_impl( ); store_3 = convolve_horizontal_parts_4_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight0, weight1, weight2, @@ -385,28 +381,28 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_impl( let filter_start = jx + bounds.start; store_0 = convolve_horizontal_parts_2_rgba_f32::( filter_start, - unsafe_source_ptr_0, + src, weight0, weight1, store_0, ); store_1 = convolve_horizontal_parts_2_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight0, weight1, store_1, ); store_2 = convolve_horizontal_parts_2_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight0, weight1, store_2, ); store_3 = convolve_horizontal_parts_2_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight0, weight1, store_3, @@ -418,27 +414,23 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_impl( let ptr = weights_ptr.add(jx + filter_offset); let filter_start = jx + bounds.start; let weight0 = _mm_load1_ps(ptr); - store_0 = convolve_horizontal_parts_one_rgba_f32::( - filter_start, - unsafe_source_ptr_0, - weight0, - store_0, - ); + store_0 = + convolve_horizontal_parts_one_rgba_f32::(filter_start, src, weight0, store_0); store_1 = convolve_horizontal_parts_one_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..), weight0, store_1, ); store_2 = convolve_horizontal_parts_one_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..), weight0, store_2, ); store_3 = convolve_horizontal_parts_one_rgba_f32::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..), weight0, store_3, ); @@ -446,17 +438,17 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f32_impl( } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); - _mm_storeu_ps(dest_ptr, store_0); + let dest_ptr = dst.get_unchecked_mut(px..); + _mm_storeu_ps(dest_ptr.as_mut_ptr(), 
store_0); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); - _mm_storeu_ps(dest_ptr, store_1); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride..); + _mm_storeu_ps(dest_ptr.as_mut_ptr(), store_1); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); - _mm_storeu_ps(dest_ptr, store_2); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..); + _mm_storeu_ps(dest_ptr.as_mut_ptr(), store_2); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); - _mm_storeu_ps(dest_ptr, store_3); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..); + _mm_storeu_ps(dest_ptr.as_mut_ptr(), store_3); filter_offset += filter_weights.aligned_size; } diff --git a/src/sse/vertical_f32.rs b/src/sse/vertical_f32.rs index d00dfda..df0ccba 100644 --- a/src/sse/vertical_f32.rs +++ b/src/sse/vertical_f32.rs @@ -37,9 +37,9 @@ use std::arch::x86_64::*; unsafe fn convolve_vertical_part_sse_24_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -56,15 +56,14 @@ unsafe fn convolve_vertical_part_sse_24_f32( let py = start_y + j; let weight = filter.get_unchecked(j..); let v_weight = _mm_load1_ps(weight.as_ptr()); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py + px..).as_ptr(); - let s_ptr = src_ptr.add(px); - let item_row_0 = _mm_loadu_ps(s_ptr); - let item_row_1 = _mm_loadu_ps(s_ptr.add(4)); - let item_row_2 = _mm_loadu_ps(s_ptr.add(8)); - let item_row_3 = _mm_loadu_ps(s_ptr.add(12)); - let item_row_4 = _mm_loadu_ps(s_ptr.add(16)); - let item_row_5 = _mm_loadu_ps(s_ptr.add(20)); + let item_row_0 = _mm_loadu_ps(src_ptr); + let item_row_1 = _mm_loadu_ps(src_ptr.add(4)); + let item_row_2 = _mm_loadu_ps(src_ptr.add(8)); + let item_row_3 = _mm_loadu_ps(src_ptr.add(12)); + let item_row_4 = _mm_loadu_ps(src_ptr.add(16)); + let item_row_5 = _mm_loadu_ps(src_ptr.add(20)); store_0 = _mm_prefer_fma_ps::(store_0, item_row_0, v_weight); store_1 = _mm_prefer_fma_ps::(store_1, item_row_1, v_weight); @@ -74,7 +73,7 @@ unsafe fn convolve_vertical_part_sse_24_f32( store_5 = _mm_prefer_fma_ps::(store_5, item_row_5, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); _mm_storeu_ps(dst_ptr, store_0); _mm_storeu_ps(dst_ptr.add(4), store_1); _mm_storeu_ps(dst_ptr.add(8), store_2); @@ -87,9 +86,9 @@ unsafe fn convolve_vertical_part_sse_24_f32( unsafe fn convolve_vertical_part_sse_16_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -104,13 +103,12 @@ unsafe fn convolve_vertical_part_sse_16_f32( let py = start_y + j; let weight = filter.get_unchecked(j..); let v_weight = _mm_load1_ps(weight.as_ptr()); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py + px..).as_ptr(); - let s_ptr = src_ptr.add(px); - let item_row_0 = _mm_loadu_ps(s_ptr); - let item_row_1 = _mm_loadu_ps(s_ptr.add(4)); - let item_row_2 = _mm_loadu_ps(s_ptr.add(8)); - let item_row_3 = _mm_loadu_ps(s_ptr.add(12)); + let item_row_0 = _mm_loadu_ps(src_ptr); + let item_row_1 = _mm_loadu_ps(src_ptr.add(4)); + let item_row_2 = _mm_loadu_ps(src_ptr.add(8)); + let item_row_3 = _mm_loadu_ps(src_ptr.add(12)); store_0 = _mm_prefer_fma_ps::(store_0, item_row_0, v_weight); store_1 = _mm_prefer_fma_ps::(store_1, item_row_1, v_weight); @@ -118,7 +116,7 @@ unsafe fn 
convolve_vertical_part_sse_16_f32( store_3 = _mm_prefer_fma_ps::(store_3, item_row_3, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); _mm_storeu_ps(dst_ptr, store_0); _mm_storeu_ps(dst_ptr.add(4), store_1); _mm_storeu_ps(dst_ptr.add(8), store_2); @@ -129,9 +127,9 @@ unsafe fn convolve_vertical_part_sse_16_f32( unsafe fn convolve_vertical_part_sse_8_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -144,17 +142,16 @@ unsafe fn convolve_vertical_part_sse_8_f32( let py = start_y + j; let weight = filter.get_unchecked(j..); let v_weight = _mm_load1_ps(weight.as_ptr()); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py + px..).as_ptr(); - let s_ptr = src_ptr.add(px); - let item_row_0 = _mm_loadu_ps(s_ptr); - let item_row_1 = _mm_loadu_ps(s_ptr.add(4)); + let item_row_0 = _mm_loadu_ps(src_ptr); + let item_row_1 = _mm_loadu_ps(src_ptr.add(4)); store_0 = _mm_prefer_fma_ps::(store_0, item_row_0, v_weight); store_1 = _mm_prefer_fma_ps::(store_1, item_row_1, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); _mm_storeu_ps(dst_ptr, store_0); _mm_storeu_ps(dst_ptr.add(4), store_1); } @@ -163,9 +160,9 @@ unsafe fn convolve_vertical_part_sse_8_f32( unsafe fn convolve_vertical_part_sse_4_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -177,15 +174,15 @@ unsafe fn convolve_vertical_part_sse_4_f32( let py = start_y + j; let weight = filter.get_unchecked(j..); let v_weight = _mm_load1_ps(weight.as_ptr()); - let src_ptr = src.add(src_stride * py); - let s_ptr = src_ptr.add(px); - let item_row_0 = _mm_loadu_ps(s_ptr); + let src_ptr = src.get_unchecked(src_stride * py + px..).as_ptr(); + + let item_row_0 = _mm_loadu_ps(src_ptr); store_0 = _mm_prefer_fma_ps::(store_0, item_row_0, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); _mm_storeu_ps(dst_ptr, store_0); } @@ -193,9 +190,9 @@ unsafe fn convolve_vertical_part_sse_4_f32( pub(crate) unsafe fn convolve_vertical_part_sse_f32( start_y: usize, start_x: usize, - src: *const f32, + src: &[f32], src_stride: usize, - dst: *mut f32, + dst: &mut [f32], filter: &[f32], bounds: &FilterBounds, ) { @@ -207,44 +204,33 @@ pub(crate) unsafe fn convolve_vertical_part_sse_f32( let py = start_y + j; let weight = filter.get_unchecked(j..); let v_weight = _mm_load1_ps(weight.as_ptr()); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py + px..); - let s_ptr = src_ptr.add(px); - let item_row_0 = _mm_set1_ps(s_ptr.read_unaligned()); + let item_row_0 = _mm_set1_ps(src_ptr.as_ptr().read_unaligned()); store_0 = _mm_prefer_fma_ps::(store_0, item_row_0, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); (dst_ptr as *mut i32).write_unaligned(_mm_extract_ps::<0>(store_0)); } pub(crate) fn convolve_vertical_rgb_sse_row_f32( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight_ptr: &[f32], ) { unsafe { if FMA { convolve_vertical_rgb_sse_row_f32_fma::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, 
bounds, src, dst, src_stride, weight_ptr, ); } else { convolve_vertical_rgb_sse_row_f32_regular::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } } @@ -255,18 +241,13 @@ pub(crate) fn convolve_vertical_rgb_sse_row_f32( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight_ptr: &[f32], ) { convolve_vertical_rgb_sse_row_f32_impl::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } @@ -275,18 +256,13 @@ unsafe fn convolve_vertical_rgb_sse_row_f32_regular( unsafe fn convolve_vertical_rgb_sse_row_f32_fma( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight_ptr: &[f32], ) { convolve_vertical_rgb_sse_row_f32_impl::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } @@ -294,8 +270,8 @@ unsafe fn convolve_vertical_rgb_sse_row_f32_fma( unsafe fn convolve_vertical_rgb_sse_row_f32_impl( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, + src: &[f32], + dst: &mut [f32], src_stride: usize, weight_ptr: &[f32], ) { @@ -306,9 +282,9 @@ unsafe fn convolve_vertical_rgb_sse_row_f32_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -320,9 +296,9 @@ unsafe fn convolve_vertical_rgb_sse_row_f32_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -334,9 +310,9 @@ unsafe fn convolve_vertical_rgb_sse_row_f32_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -348,9 +324,9 @@ unsafe fn convolve_vertical_rgb_sse_row_f32_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -362,9 +338,9 @@ unsafe fn convolve_vertical_rgb_sse_row_f32_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); diff --git a/src/unsafe_slice.rs b/src/unsafe_slice.rs deleted file mode 100644 index 52b4352..0000000 --- a/src/unsafe_slice.rs +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) Radzivon Bartoshyk. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -use std::cell::UnsafeCell; - -#[derive(Copy, Clone)] -pub(crate) struct UnsafeSlice<'a, T> { - pub slice: &'a [UnsafeCell], -} - -unsafe impl Send for UnsafeSlice<'_, T> {} - -unsafe impl Sync for UnsafeSlice<'_, T> {} - -impl<'a, T> UnsafeSlice<'a, T> { - pub(crate) fn new(slice: &'a mut [T]) -> Self { - let ptr = slice as *mut [T] as *const [UnsafeCell]; - Self { - slice: unsafe { &*ptr }, - } - } - - pub(crate) fn mut_ptr(&self) -> *mut T { - self.slice.as_ptr() as *const T as *mut T - } - - /// SAFETY: It is UB if two threads write to the same index without - /// synchronization. - #[allow(dead_code)] - pub(crate) unsafe fn write(&self, i: usize, value: T) { - let ptr = self.slice[i].get(); - *ptr = value; - } - #[allow(dead_code)] - pub(crate) fn get(&self, i: usize) -> &T { - let ptr = self.slice[i].get(); - unsafe { &*ptr } - } - #[allow(dead_code)] - pub(crate) fn len(&self) -> usize { - self.slice.len() - } -} From b98f995c5a94519ffd320118b404df1e43ee17f1 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Fri, 20 Dec 2024 15:34:13 +0000 Subject: [PATCH 2/9] x86 improvements --- src/avx2/vertical_u8_lp.rs | 31 ++++++++++--------------------- src/neon/plane_f32.rs | 10 ++-------- src/neon/rgba_f32.rs | 6 ++---- 3 files changed, 14 insertions(+), 33 deletions(-) diff --git a/src/avx2/vertical_u8_lp.rs b/src/avx2/vertical_u8_lp.rs index 51b7134..af4ebc3 100644 --- a/src/avx2/vertical_u8_lp.rs +++ b/src/avx2/vertical_u8_lp.rs @@ -77,7 +77,6 @@ unsafe fn convolve_vertical_avx2_row_impl( weight: &[i16], ) { let zeros = _mm_setzero_si128(); - let zeros256 = _mm256_setzero_si256(); let bounds_size = bounds.size; const SCALE: i32 = 6; @@ -212,17 +211,14 @@ unsafe fn convolve_vertical_avx2_row_impl( } } - store0 = _mm256_max_epi16(store0, zeros256); - store1 = _mm256_max_epi16(store1, zeros256); - store2 = _mm256_max_epi16(store2, zeros256); - store3 = _mm256_max_epi16(store3, zeros256); + let rebased0 = _mm256_srai_epi16::(store0); + let rebased1 = _mm256_srai_epi16::(store1); + let rebased2 = _mm256_srai_epi16::(store2); + let rebased3 = _mm256_srai_epi16::(store3); - let rebased0 = _mm256_srli_epi16::(store0); - let rebased1 = _mm256_srli_epi16::(store1); - let rebased2 = _mm256_srli_epi16::(store2); - let rebased3 = _mm256_srli_epi16::(store3); let shrank0 = avx2_pack_u16(rebased0, rebased1); let shrank1 = avx2_pack_u16(rebased2, rebased3); + _mm256_storeu_si256(dst.as_mut_ptr() as *mut __m256i, shrank0); _mm256_storeu_si256( dst.get_unchecked_mut(32..).as_mut_ptr() as *mut __m256i, @@ -317,11 +313,9 @@ unsafe fn convolve_vertical_avx2_row_impl( } } - store0 = _mm256_max_epi16(store0, zeros256); - store1 = _mm256_max_epi16(store1, zeros256); + let rebased0 = _mm256_srai_epi16::(store0); + 
let rebased1 = _mm256_srai_epi16::(store1); - let rebased0 = _mm256_srli_epi16::(store0); - let rebased1 = _mm256_srli_epi16::(store1); let shrank0 = avx2_pack_u16(rebased0, rebased1); _mm256_storeu_si256(dst.as_mut_ptr() as *mut __m256i, shrank0); @@ -351,8 +345,7 @@ unsafe fn convolve_vertical_avx2_row_impl( ); } - store0 = _mm256_max_epi16(store0, zeros256); - store0 = _mm256_srli_epi16::(store0); + store0 = _mm256_srai_epi16::(store0); let packed = avx2_pack_u16(store0, store0); @@ -474,9 +467,7 @@ unsafe fn convolve_vertical_avx2_row_impl( } } - store = _mm_max_epi16(store, zeros); - - let rebased = _mm_srli_epi16::(store); + let rebased = _mm_srai_epi16::(store); let shrank = _mm_packus_epi16(rebased, rebased); std::ptr::copy_nonoverlapping(&shrank as *const _ as *const u8, dst.as_mut_ptr(), 8); @@ -600,9 +591,7 @@ unsafe fn convolve_vertical_avx2_row_impl( } } - store = _mm_max_epi16(store, zeros); - - let rebased = _mm_srli_epi16::(store); + let rebased = _mm_srai_epi16::(store); let value = _mm_extract_epi8::<0>(_mm_packus_epi16(rebased, rebased)); *dst = value as u8; diff --git a/src/neon/plane_f32.rs b/src/neon/plane_f32.rs index cb8c65f..24e4ca2 100644 --- a/src/neon/plane_f32.rs +++ b/src/neon/plane_f32.rs @@ -107,12 +107,7 @@ pub(crate) fn convolve_horizontal_plane_neon_row_one( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = xvld1q_f32_x4(ptr); - store = conv_horiz_plane_16_f32!( - bounds_start, - src.as_ptr(), - read_weights, - store - ); + store = conv_horiz_plane_16_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 8; } @@ -134,8 +129,7 @@ pub(crate) fn convolve_horizontal_plane_neon_row_one( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - store = - conv_horiz_plane_4_f32!(bounds_start, src.as_ptr(), read_weights, store); + store = conv_horiz_plane_4_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 4; } diff --git a/src/neon/rgba_f32.rs b/src/neon/rgba_f32.rs index 1e44dd3..2a2b088 100644 --- a/src/neon/rgba_f32.rs +++ b/src/neon/rgba_f32.rs @@ -111,8 +111,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row_one( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - store = - conv_horiz_rgba_4_f32!(bounds_start, src.as_ptr(), read_weights, store); + store = conv_horiz_rgba_4_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 4; } @@ -120,8 +119,7 @@ pub(crate) fn convolve_horizontal_rgba_neon_row_one( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - store = - conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store); + store = conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 2; } From 4d669f78d9f3de529c37600e421c630f9f256250 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Fri, 27 Dec 2024 11:46:16 +0000 Subject: [PATCH 3/9] Improvements --- app/src/main.rs | 4 +- src/avx2/alpha_u16.rs | 102 +++++---- src/avx2/alpha_u8.rs | 72 +++--- src/avx2/rgba_f16.rs | 34 +-- src/avx2/rgba_f32.rs | 4 +- src/avx2/utils.rs | 24 -- src/avx2/vertical_f16.rs | 2 +- src/avx2/vertical_f32.rs | 5 +- src/avx2/vertical_u8.rs | 285 +++++------------------ src/avx2/vertical_u8_lp.rs | 13 +- src/neon/rgb_f32.rs | 6 +- src/neon/rgb_u8.rs | 10 +- src/neon/rgba_u8.rs | 38 +-- src/neon/vertical_u8.rs | 4 +- src/sse/alpha_u16.rs | 17 +- 
src/sse/alpha_u8.rs | 20 +- src/sse/plane_u8.rs | 2 +- src/sse/rgb_f32.rs | 12 +- src/sse/rgba_f16.rs | 34 +-- src/sse/rgba_u16.rs | 31 +-- src/sse/rgba_u16_lb.rs | 43 +--- src/sse/rgba_u8.rs | 42 ++-- src/sse/rgba_u8_lb.rs | 56 +++-- src/sse/u8_utils.rs | 9 +- src/sse/vertical_f16.rs | 2 +- src/sse/vertical_u16.rs | 459 ++----------------------------------- src/sse/vertical_u16_lb.rs | 403 ++------------------------------ src/sse/vertical_u8.rs | 322 ++++++-------------------- src/sse/vertical_u8_lp.rs | 335 +++------------------------ 29 files changed, 435 insertions(+), 1955 deletions(-) diff --git a/app/src/main.rs b/app/src/main.rs index 6d8dcec..cb713ce 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -45,7 +45,7 @@ fn resize_plane( fn main() { // test_fast_image(); - let img = ImageReader::open("./assets/nasa-4928x3279-rgba.png") + let img = ImageReader::open("./assets/test_1.jpg") .unwrap() .decode() .unwrap(); @@ -53,7 +53,7 @@ fn main() { let transient = img.to_rgba8(); let mut bytes = Vec::from(transient.as_bytes()); - let mut scaler = LinearScaler::new(ResamplingFunction::Bilinear); + let mut scaler = Scaler::new(ResamplingFunction::Bilinear); scaler.set_threading_policy(ThreadingPolicy::Single); // resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear); diff --git a/src/avx2/alpha_u16.rs b/src/avx2/alpha_u16.rs index f190f68..27203d7 100644 --- a/src/avx2/alpha_u16.rs +++ b/src/avx2/alpha_u16.rs @@ -29,7 +29,7 @@ use crate::alpha_handle_u16::{premultiply_alpha_rgba_row, unpremultiply_alpha_rgba_row}; use crate::avx2::utils::{ - _mm256_select_si256, avx2_pack_u32, avx_deinterleave_rgba_epi16, avx_interleave_rgba_epi16, + _mm256_select_si256, avx_deinterleave_rgba_epi16, avx_interleave_rgba_epi16, }; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; @@ -39,15 +39,16 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; -#[inline] +#[inline(always)] unsafe fn _mm256_scale_by_alpha(px: __m256i, low_low_a: __m256, low_high_a: __m256) -> __m256i { - let low_px = _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_castsi256_si128(px))); - let high_px = _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(px))); + let zeros = _mm256_setzero_si256(); + let low_px = _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(px, zeros)); + let high_px = _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(px, zeros)); let new_ll = _mm256_cvtps_epi32(_mm256_round_ps::<0x02>(_mm256_mul_ps(low_px, low_low_a))); let new_lh = _mm256_cvtps_epi32(_mm256_round_ps::<0x02>(_mm256_mul_ps(high_px, low_high_a))); - avx2_pack_u32(new_ll, new_lh) + _mm256_packus_epi32(new_ll, new_lh) } #[inline(always)] @@ -108,36 +109,37 @@ unsafe fn avx_premultiply_alpha_rgba_u16_row(dst: &mut [u16], src: &[u16], bit_d let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - let low_alpha = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.3)); - let high_alpha = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.3)); + let zeros = _mm256_setzero_si256(); + let low_alpha = _mm256_unpacklo_epi16(pixel.3, zeros); + let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); - let new_rrr = avx2_pack_u32( + let new_rrr = _mm256_packus_epi32( _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.0)), + _mm256_unpacklo_epi16(pixel.0, zeros), low_alpha, )), _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.0)), + 
_mm256_unpackhi_epi16(pixel.0, zeros), high_alpha, )), ); - let new_ggg = avx2_pack_u32( + let new_ggg = _mm256_packus_epi32( _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.1)), + _mm256_unpacklo_epi16(pixel.1, zeros), low_alpha, )), _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.1)), + _mm256_unpackhi_epi16(pixel.1, zeros), high_alpha, )), ); - let new_bbb = avx2_pack_u32( + let new_bbb = _mm256_packus_epi32( _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.2)), + _mm256_unpacklo_epi16(pixel.2, zeros), low_alpha, )), _mm256_div_by_1023_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.2)), + _mm256_unpackhi_epi16(pixel.2, zeros), high_alpha, )), ); @@ -165,36 +167,37 @@ unsafe fn avx_premultiply_alpha_rgba_u16_row(dst: &mut [u16], src: &[u16], bit_d let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - let low_alpha = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.3)); - let high_alpha = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.3)); + let zeros = _mm256_setzero_si256(); + let low_alpha = _mm256_unpacklo_epi16(pixel.3, zeros); + let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); - let new_rrr = avx2_pack_u32( + let new_rrr = _mm256_packus_epi32( _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.0)), + _mm256_unpacklo_epi16(pixel.0, zeros), low_alpha, )), _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.0)), + _mm256_unpackhi_epi16(pixel.0, zeros), high_alpha, )), ); - let new_ggg = avx2_pack_u32( + let new_ggg = _mm256_packus_epi32( _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.1)), + _mm256_unpacklo_epi16(pixel.1, zeros), low_alpha, )), _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.1)), + _mm256_unpackhi_epi16(pixel.1, zeros), high_alpha, )), ); - let new_bbb = avx2_pack_u32( + let new_bbb = _mm256_packus_epi32( _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.2)), + _mm256_unpacklo_epi16(pixel.2, zeros), low_alpha, )), _mm256_div_by_4095_epi32(_mm256_madd_epi16( - _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.2)), + _mm256_unpackhi_epi16(pixel.2, zeros), high_alpha, )), ); @@ -222,36 +225,37 @@ unsafe fn avx_premultiply_alpha_rgba_u16_row(dst: &mut [u16], src: &[u16], bit_d let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - let low_alpha = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.3)); - let high_alpha = _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.3)); + let zeros = _mm256_setzero_si256(); + let low_alpha = _mm256_unpacklo_epi16(pixel.3, zeros); + let high_alpha = _mm256_unpackhi_epi16(pixel.3, zeros); - let new_rrr = avx2_pack_u32( + let new_rrr = _mm256_packus_epi32( _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.0)), + _mm256_unpacklo_epi16(pixel.0, zeros), low_alpha, )), _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.0)), + _mm256_unpackhi_epi16(pixel.0, zeros), high_alpha, )), ); - let new_ggg = avx2_pack_u32( + let new_ggg = _mm256_packus_epi32( _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - 
_mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.1)), + _mm256_unpacklo_epi16(pixel.1, zeros), low_alpha, )), _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.1)), + _mm256_unpackhi_epi16(pixel.1, zeros), high_alpha, )), ); - let new_bbb = avx2_pack_u32( + let new_bbb = _mm256_packus_epi32( _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.2)), + _mm256_unpacklo_epi16(pixel.2, zeros), low_alpha, )), _mm256_div_by_65535_epi32(_mm256_mullo_epi32( - _mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>(pixel.2)), + _mm256_unpackhi_epi16(pixel.2, zeros), high_alpha, )), ); @@ -280,14 +284,14 @@ unsafe fn avx_premultiply_alpha_rgba_u16_row(dst: &mut [u16], src: &[u16], bit_d let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); + let zeros = _mm256_setzero_si256(); + let low_alpha = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_castsi256_si128(pixel.3))), + _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(pixel.3, zeros)), v_scale_colors, ); let high_alpha = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepu16_epi32(_mm256_extracti128_si256::<1>( - pixel.3, - ))), + _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(pixel.3, zeros)), v_scale_colors, ); @@ -368,17 +372,17 @@ unsafe fn avx_unpremultiply_alpha_rgba_u16_row(in_place: &mut [u16], bit_depth: let pixel = avx_deinterleave_rgba_epi16(lane0, lane1, lane2, lane3); - let is_zero_alpha_mask = _mm256_cmpeq_epi16(pixel.3, _mm256_setzero_si256()); + let zeros = _mm256_setzero_si256(); + + let is_zero_alpha_mask = _mm256_cmpeq_epi16(pixel.3, zeros); - let mut low_alpha = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32( - _mm256_castsi256_si128(pixel.3), - ))); + let mut low_alpha = + _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(pixel.3, zeros))); low_alpha = _mm256_mul_ps(low_alpha, v_scale_colors); - let mut high_alpha = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepu16_epi32( - _mm256_extracti128_si256::<1>(pixel.3), - ))); + let mut high_alpha = + _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(pixel.3, zeros))); high_alpha = _mm256_mul_ps(high_alpha, v_scale_colors); diff --git a/src/avx2/alpha_u8.rs b/src/avx2/alpha_u8.rs index 291c4b4..83200dc 100644 --- a/src/avx2/alpha_u8.rs +++ b/src/avx2/alpha_u8.rs @@ -29,8 +29,7 @@ use crate::alpha_handle_u8::{premultiply_alpha_rgba_row_impl, unpremultiply_alpha_rgba_row_impl}; use crate::avx2::utils::{ - _mm256_packus_four_epi32, _mm256_select_si256, avx2_deinterleave_rgba, avx2_div_by255, - avx2_interleave_rgba, avx2_pack_u16, + _mm256_select_si256, avx2_deinterleave_rgba, avx2_div_by255, avx2_interleave_rgba, }; use crate::sse::{ _mm_div_by_255_epi16, sse_deinterleave_rgba, sse_interleave_rgba, sse_unpremultiply_row, @@ -46,8 +45,8 @@ use std::arch::x86_64::*; #[inline(always)] unsafe fn avx2_unpremultiply_row(x: __m256i, a: __m256i) -> __m256i { let zeros = _mm256_setzero_si256(); - let lo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(x)); - let hi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(x)); + let lo = _mm256_unpacklo_epi8(x, zeros); + let hi = _mm256_unpackhi_epi8(x, zeros); let scale = _mm256_set1_epi16(255); @@ -57,35 +56,27 @@ unsafe fn avx2_unpremultiply_row(x: __m256i, a: __m256i) -> __m256i { let scale_ps = _mm256_set1_ps(255f32); let lo_lo = _mm256_mul_ps( - _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(lo))), + _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(lo, zeros)), scale_ps, ); let lo_hi = _mm256_mul_ps( - 
_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(lo))),
+        _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(lo, zeros)),
         scale_ps,
     );
     let hi_lo = _mm256_mul_ps(
-        _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_castsi256_si128(hi))),
+        _mm256_cvtepi32_ps(_mm256_unpacklo_epi16(hi, zeros)),
         scale_ps,
     );
     let hi_hi = _mm256_mul_ps(
-        _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(hi))),
+        _mm256_cvtepi32_ps(_mm256_unpackhi_epi16(hi, zeros)),
         scale_ps,
     );
 
-    let a_lo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a));
-    let a_hi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(a));
-    let a_lo_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(
-        _mm256_castsi256_si128(a_lo),
-    )));
-    let a_lo_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(
-        _mm256_extracti128_si256::<1>(a_lo),
-    )));
-    let a_hi_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(
-        _mm256_castsi256_si128(a_hi),
-    )));
-    let a_hi_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(
-        _mm256_extracti128_si256::<1>(a_hi),
-    )));
+    let a_lo = _mm256_unpacklo_epi8(a, zeros);
+    let a_hi = _mm256_unpackhi_epi8(a, zeros);
+    let a_lo_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_lo, zeros)));
+    let a_lo_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_lo, zeros)));
+    let a_hi_lo = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpacklo_epi16(a_hi, zeros)));
+    let a_hi_hi = _mm256_rcp_ps(_mm256_cvtepi32_ps(_mm256_unpackhi_epi16(a_hi, zeros)));
 
     let lo_lo = _mm256_cvtps_epi32(_mm256_mul_ps(lo_lo, a_lo_lo));
     let lo_hi = _mm256_cvtps_epi32(_mm256_mul_ps(lo_hi, a_lo_hi));
@@ -95,7 +86,10 @@ unsafe fn avx2_unpremultiply_row(x: __m256i, a: __m256i) -> __m256i {
     _mm256_select_si256(
         is_zero_mask,
         zeros,
-        _mm256_packus_four_epi32(lo_lo, lo_hi, hi_lo, hi_hi),
+        _mm256_packus_epi16(
+            _mm256_packus_epi32(lo_lo, lo_hi),
+            _mm256_packus_epi32(hi_lo, hi_hi),
+        ),
     )
 }
 
@@ -128,17 +122,19 @@ unsafe fn avx_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) {
             let rgba3 = _mm256_loadu_si256(src_ptr.add(96) as *const __m256i);
             let (rrr, ggg, bbb, aaa) = avx2_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3);
 
-            let mut rrr_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(rrr));
-            let mut rrr_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(rrr));
+            let zeros = _mm256_setzero_si256();
 
-            let mut ggg_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(ggg));
-            let mut ggg_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(ggg));
+            let mut rrr_low = _mm256_unpacklo_epi8(rrr, zeros);
+            let mut rrr_high = _mm256_unpackhi_epi8(rrr, zeros);
 
-            let mut bbb_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(bbb));
-            let mut bbb_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(bbb));
+            let mut ggg_low = _mm256_unpacklo_epi8(ggg, zeros);
+            let mut ggg_high = _mm256_unpackhi_epi8(ggg, zeros);
 
-            let aaa_low = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(aaa));
-            let aaa_high = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(aaa));
+            let mut bbb_low = _mm256_unpacklo_epi8(bbb, zeros);
+            let mut bbb_high = _mm256_unpackhi_epi8(bbb, zeros);
+
+            let aaa_low = _mm256_unpacklo_epi8(aaa, zeros);
+            let aaa_high = _mm256_unpackhi_epi8(aaa, zeros);
 
             rrr_low = avx2_div_by255(_mm256_mullo_epi16(rrr_low, aaa_low));
             rrr_high = avx2_div_by255(_mm256_mullo_epi16(rrr_high, aaa_high));
@@ -147,9 +143,9 @@ unsafe fn avx_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) {
             bbb_low = avx2_div_by255(_mm256_mullo_epi16(bbb_low, aaa_low));
             bbb_high =
avx2_div_by255(_mm256_mullo_epi16(bbb_high, aaa_high)); - let rrr = avx2_pack_u16(rrr_low, rrr_high); - let ggg = avx2_pack_u16(ggg_low, ggg_high); - let bbb = avx2_pack_u16(bbb_low, bbb_high); + let rrr = _mm256_packus_epi16(rrr_low, rrr_high); + let ggg = _mm256_packus_epi16(ggg_low, ggg_high); + let bbb = _mm256_packus_epi16(bbb_low, bbb_high); let (rgba0, rgba1, rgba2, rgba3) = avx2_interleave_rgba(rrr, ggg, bbb, aaa); let dst_ptr = dst.as_mut_ptr(); @@ -174,16 +170,16 @@ unsafe fn avx_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - let mut rrr_low = _mm_cvtepu8_epi16(rrr); + let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); - let mut ggg_low = _mm_cvtepu8_epi16(ggg); + let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); - let mut bbb_low = _mm_cvtepu8_epi16(bbb); + let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); - let aaa_low = _mm_cvtepu8_epi16(aaa); + let aaa_low = _mm_unpacklo_epi8(aaa, zeros); let aaa_high = _mm_unpackhi_epi8(aaa, zeros); rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); diff --git a/src/avx2/rgba_f16.rs b/src/avx2/rgba_f16.rs index 85207bb..f3f6b22 100644 --- a/src/avx2/rgba_f16.rs +++ b/src/avx2/rgba_f16.rs @@ -261,11 +261,7 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_impl( _mm256_castps256_ps128(store), _mm256_extractf128_ps::<1>(store), )); - std::ptr::copy_nonoverlapping( - &converted_f16 as *const _ as *const u8, - dest_ptr as *mut u8, - 8, - ); + _mm_storeu_si64(dest_ptr as *mut u8, converted_f16); filter_offset += filter_weights.aligned_size; } @@ -454,8 +450,8 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( while jx + 2 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); - let weight0 = _mm_set1_ps(ptr.read_unaligned()); - let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned()); + let weight0 = _mm_load1_ps(ptr); + let weight1 = _mm_load1_ps(ptr.add(1)); let weight = avx_combine_ps(weight0, weight1); let filter_start = jx + bounds.start; store_0 = convolve_horizontal_parts_2_rgba_f16::( @@ -522,44 +518,28 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( _mm256_castps256_ps128(store_0), _mm256_extractf128_ps::<1>(store_0), )); - std::ptr::copy_nonoverlapping( - &converted_f16_0 as *const _ as *const u8, - dest_ptr as *mut u8, - 8, - ); + _mm_storeu_si64(dest_ptr as *mut u8, converted_f16_0); let dest_ptr = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); let converted_f16_1 = _mm_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(_mm_add_ps( _mm256_castps256_ps128(store_1), _mm256_extractf128_ps::<1>(store_1), )); - std::ptr::copy_nonoverlapping( - &converted_f16_1 as *const _ as *const u8, - dest_ptr as *mut u8, - 8, - ); + _mm_storeu_si64(dest_ptr as *mut u8, converted_f16_1); let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); let converted_f16_2 = _mm_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(_mm_add_ps( _mm256_castps256_ps128(store_2), _mm256_extractf128_ps::<1>(store_2), )); - std::ptr::copy_nonoverlapping( - &converted_f16_2 as *const _ as *const u8, - dest_ptr as *mut u8, - 8, - ); + _mm_storeu_si64(dest_ptr as *mut u8, converted_f16_2); let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); let converted_f16_3 = 
_mm_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(_mm_add_ps( _mm256_castps256_ps128(store_3), _mm256_extractf128_ps::<1>(store_3), )); - std::ptr::copy_nonoverlapping( - &converted_f16_3 as *const _ as *const u8, - dest_ptr as *mut u8, - 8, - ); + _mm_storeu_si64(dest_ptr as *mut u8, converted_f16_3); filter_offset += filter_weights.aligned_size; } diff --git a/src/avx2/rgba_f32.rs b/src/avx2/rgba_f32.rs index 0aba5bd..370abb1 100644 --- a/src/avx2/rgba_f32.rs +++ b/src/avx2/rgba_f32.rs @@ -500,8 +500,8 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f32_impl( while jx + 2 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); - let weight0 = _mm_set1_ps(ptr.read_unaligned()); - let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned()); + let weight0 = _mm_load1_ps(ptr); + let weight1 = _mm_load1_ps(ptr.add(1)); let weight = avx_combine_ps(weight0, weight1); let filter_start = jx + bounds.start; store = convolve_horizontal_parts_2_rgba_f32::(filter_start, src, weight, store); diff --git a/src/avx2/utils.rs b/src/avx2/utils.rs index cd11c57..599468e 100644 --- a/src/avx2/utils.rs +++ b/src/avx2/utils.rs @@ -300,30 +300,6 @@ pub(crate) unsafe fn avx2_pack_u16(s_1: __m256i, s_2: __m256i) -> __m256i { _mm256_permute4x64_epi64::(packed) } -#[inline] -#[target_feature(enable = "avx2")] -pub(crate) unsafe fn _mm256_packus_four_epi32( - a: __m256i, - b: __m256i, - c: __m256i, - d: __m256i, -) -> __m256i { - let ab = _mm256_packs_epi32(a, b); - let cd = _mm256_packs_epi32(c, d); - - const MASK: i32 = shuffle(3, 1, 2, 0); - - let abcd = _mm256_permute4x64_epi64::(_mm256_packus_epi16(ab, cd)); - _mm256_shuffle_epi32::(abcd) -} - -#[inline(always)] -pub(crate) unsafe fn avx2_pack_u32(s_1: __m256i, s_2: __m256i) -> __m256i { - let packed = _mm256_packus_epi32(s_1, s_2); - const MASK: i32 = shuffle(3, 1, 2, 0); - _mm256_permute4x64_epi64::(packed) -} - #[inline(always)] #[allow(dead_code)] pub(crate) unsafe fn avx_combine_ps(lo: __m128, hi: __m128) -> __m256 { diff --git a/src/avx2/vertical_f16.rs b/src/avx2/vertical_f16.rs index bfe1436..8c915d4 100644 --- a/src/avx2/vertical_f16.rs +++ b/src/avx2/vertical_f16.rs @@ -102,7 +102,7 @@ unsafe fn convolve_vertical_part_avx_4_f16( let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); let acc = _mm256_cvtps_ph::(store_0); - std::ptr::copy_nonoverlapping(&acc as *const _ as *const u8, dst_ptr as *mut u8, 8); + _mm_storeu_si64(dst_ptr as *mut u8, acc); } #[inline(always)] diff --git a/src/avx2/vertical_f32.rs b/src/avx2/vertical_f32.rs index 6e88617..28248e8 100644 --- a/src/avx2/vertical_f32.rs +++ b/src/avx2/vertical_f32.rs @@ -161,7 +161,10 @@ pub(crate) unsafe fn convolve_vertical_part_avx_f32( } let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); - (dst_ptr as *mut i32).write_unaligned(_mm256_extract_epi32::<0>(_mm256_castps_si256(store_0))); + _mm_storeu_si32( + dst_ptr as *mut u8, + _mm256_castsi256_si128(_mm256_castps_si256(store_0)), + ); } #[inline] diff --git a/src/avx2/vertical_u8.rs b/src/avx2/vertical_u8.rs index b8e3ee8..e578206 100644 --- a/src/avx2/vertical_u8.rs +++ b/src/avx2/vertical_u8.rs @@ -83,176 +83,64 @@ unsafe fn convolve_vertical_part_avx_64( let bounds_size = bounds.size; - if bounds_size == 2 { - let py = start_y; - let weight = filter.get_unchecked(0..2); - let v_weight0 = _mm256_set1_epi32(weight[0] as i32); - let v_weight1 = _mm256_set1_epi32(weight[1] as i32); - - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); + let mut jj = 0usize; - let 
item_row_0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); - let item_row_1 = - _mm256_loadu_si256(src_ptr0.get_unchecked(32..).as_ptr() as *const __m256i); - let item_row_10 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); - let item_row_11 = - _mm256_loadu_si256(src_ptr1.get_unchecked(32..).as_ptr() as *const __m256i); + while jj < bounds_size.saturating_sub(2) { + let py = start_y + jj; + let f_ptr = filter.get_unchecked(jj..).as_ptr() as *const i32; + let v_weight_2 = _mm256_set1_epi32(f_ptr.read_unaligned()); + let src_ptr = src.get_unchecked((src_stride * py + px)..); + let s_ptr_next = src_ptr.get_unchecked(src_stride..); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight0); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight0); + let item_row_0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i); + let item_row_1 = _mm256_loadu_si256(s_ptr_next.as_ptr() as *const __m256i); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_10, v_weight1); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_11, v_weight1); - } else if bounds_size == 3 { - let py = start_y; - let weight = filter.get_unchecked(0..3); - let v_weight0 = _mm256_set1_epi32(weight[0] as i32); - let v_weight1 = _mm256_set1_epi32(weight[1] as i32); - let v_weight2 = _mm256_set1_epi32(weight[2] as i32); + let interleaved = _mm256_unpacklo_epi8(item_row_0, item_row_1); + let pix = _mm256_unpacklo_epi8(interleaved, zeros); + store_0 = _mm256_add_epi32(store_0, _mm256_madd_epi16(pix, v_weight_2)); + let pix = _mm256_unpackhi_epi8(interleaved, zeros); + store_1 = _mm256_add_epi32(store_1, _mm256_madd_epi16(pix, v_weight_2)); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); + let interleaved = _mm256_unpackhi_epi8(item_row_0, item_row_1); + let pix = _mm256_unpacklo_epi8(interleaved, zeros); + store_2 = _mm256_add_epi32(store_2, _mm256_madd_epi16(pix, v_weight_2)); + let pix = _mm256_unpackhi_epi8(interleaved, zeros); + store_3 = _mm256_add_epi32(store_3, _mm256_madd_epi16(pix, v_weight_2)); - let item_row_0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); + let item_row_0 = + _mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i); let item_row_1 = - _mm256_loadu_si256(src_ptr0.get_unchecked(32..).as_ptr() as *const __m256i); + _mm256_loadu_si256(s_ptr_next.get_unchecked(32..).as_ptr() as *const __m256i); - let item_row_10 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); - let item_row_11 = - _mm256_loadu_si256(src_ptr1.get_unchecked(32..).as_ptr() as *const __m256i); + let interleaved = _mm256_unpacklo_epi8(item_row_0, item_row_1); + let pix = _mm256_unpacklo_epi8(interleaved, zeros); + store_4 = _mm256_add_epi32(store_4, _mm256_madd_epi16(pix, v_weight_2)); + let pix = _mm256_unpackhi_epi8(interleaved, zeros); + store_5 = _mm256_add_epi32(store_5, _mm256_madd_epi16(pix, v_weight_2)); - let item_row_20 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i); - let item_row_21 = - _mm256_loadu_si256(src_ptr2.get_unchecked(32..).as_ptr() as *const __m256i); + let interleaved = _mm256_unpackhi_epi8(item_row_0, item_row_1); + let pix = _mm256_unpacklo_epi8(interleaved, zeros); + store_6 = _mm256_add_epi32(store_6, 
_mm256_madd_epi16(pix, v_weight_2)); + let pix = _mm256_unpackhi_epi8(interleaved, zeros); + store_7 = _mm256_add_epi32(store_7, _mm256_madd_epi16(pix, v_weight_2)); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight0); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight0); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_10, v_weight1); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_11, v_weight1); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_20, v_weight2); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_21, v_weight2); - } else if bounds_size == 4 { - let py = start_y; - let weight = filter.get_unchecked(0..4); - let v_weight0 = _mm256_set1_epi32(weight[0] as i32); - let v_weight1 = _mm256_set1_epi32(weight[1] as i32); - let v_weight2 = _mm256_set1_epi32(weight[2] as i32); - let v_weight3 = _mm256_set1_epi32(weight[3] as i32); + jj += 2; + } - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); + for j in jj..bounds_size { + let py = start_y + j; + let weight = *filter.get_unchecked(j); + let v_weight = _mm256_set1_epi32(weight as i32); + let src_ptr = src.get_unchecked((src_stride * py + px)..); - let item_row_0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); + let item_row_0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i); let item_row_1 = - _mm256_loadu_si256(src_ptr0.get_unchecked(32..).as_ptr() as *const __m256i); - - let item_row_10 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); - let item_row_11 = - _mm256_loadu_si256(src_ptr1.get_unchecked(32..).as_ptr() as *const __m256i); - - let item_row_20 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i); - let item_row_21 = - _mm256_loadu_si256(src_ptr2.get_unchecked(32..).as_ptr() as *const __m256i); - - let item_row_30 = _mm256_loadu_si256(src_ptr3.as_ptr() as *const __m256i); - let item_row_31 = - _mm256_loadu_si256(src_ptr3.get_unchecked(32..).as_ptr() as *const __m256i); + _mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i); (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight0); + dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight); (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight0); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_10, v_weight1); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_11, v_weight1); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_20, v_weight2); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_21, v_weight2); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_30, v_weight3); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_31, v_weight3); - } else { - let mut jj = 0usize; - - while jj < 
bounds_size.saturating_sub(2) { - let py = start_y + jj; - let f_ptr = filter.get_unchecked(jj..).as_ptr() as *const i32; - let v_weight_2 = _mm256_set1_epi32(f_ptr.read_unaligned()); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - let s_ptr_next = src_ptr.get_unchecked(src_stride..); - - let item_row_0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i); - let item_row_1 = _mm256_loadu_si256(s_ptr_next.as_ptr() as *const __m256i); - - let interleaved = _mm256_unpacklo_epi8(item_row_0, item_row_1); - let pix = _mm256_unpacklo_epi8(interleaved, zeros); - store_0 = _mm256_add_epi32(store_0, _mm256_madd_epi16(pix, v_weight_2)); - let pix = _mm256_unpackhi_epi8(interleaved, zeros); - store_1 = _mm256_add_epi32(store_1, _mm256_madd_epi16(pix, v_weight_2)); - - let interleaved = _mm256_unpackhi_epi8(item_row_0, item_row_1); - let pix = _mm256_unpacklo_epi8(interleaved, zeros); - store_2 = _mm256_add_epi32(store_2, _mm256_madd_epi16(pix, v_weight_2)); - let pix = _mm256_unpackhi_epi8(interleaved, zeros); - store_3 = _mm256_add_epi32(store_3, _mm256_madd_epi16(pix, v_weight_2)); - - let item_row_0 = - _mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i); - let item_row_1 = - _mm256_loadu_si256(s_ptr_next.get_unchecked(32..).as_ptr() as *const __m256i); - - let interleaved = _mm256_unpacklo_epi8(item_row_0, item_row_1); - let pix = _mm256_unpacklo_epi8(interleaved, zeros); - store_4 = _mm256_add_epi32(store_4, _mm256_madd_epi16(pix, v_weight_2)); - let pix = _mm256_unpackhi_epi8(interleaved, zeros); - store_5 = _mm256_add_epi32(store_5, _mm256_madd_epi16(pix, v_weight_2)); - - let interleaved = _mm256_unpackhi_epi8(item_row_0, item_row_1); - let pix = _mm256_unpacklo_epi8(interleaved, zeros); - store_6 = _mm256_add_epi32(store_6, _mm256_madd_epi16(pix, v_weight_2)); - let pix = _mm256_unpackhi_epi8(interleaved, zeros); - store_7 = _mm256_add_epi32(store_7, _mm256_madd_epi16(pix, v_weight_2)); - - jj += 2; - } - - for j in jj..bounds_size { - let py = start_y + j; - let weight = *filter.get_unchecked(j); - let v_weight = _mm256_set1_epi32(weight as i32); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - - let item_row_0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i); - let item_row_1 = - _mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight); - } + dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight); } store_0 = _mm256_srai_epi32::(store_0); @@ -299,78 +187,16 @@ unsafe fn convolve_vertical_part_avx_32( let bounds_size = bounds.size; - if bounds_size == 2 { - let py = start_y; - let weight = filter.get_unchecked(0..2); - let v_weight0 = _mm256_set1_epi32(weight[0] as i32); - let v_weight1 = _mm256_set1_epi32(weight[1] as i32); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - - let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); - let item_row1 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row0, v_weight0); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row1, v_weight1); - } else if bounds_size == 3 { - let py = start_y; - let 
weight = filter.get_unchecked(0..3); - let v_weight0 = _mm256_set1_epi32(weight[0] as i32); - let v_weight1 = _mm256_set1_epi32(weight[1] as i32); - let v_weight2 = _mm256_set1_epi32(weight[2] as i32); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - - let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); - let item_row1 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); - let item_row2 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row0, v_weight0); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row1, v_weight1); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row2, v_weight2); - } else if bounds_size == 4 { - let py = start_y; - let weight = filter.get_unchecked(0..4); - let v_weight0 = _mm256_set1_epi32(weight[0] as i32); - let v_weight1 = _mm256_set1_epi32(weight[1] as i32); - let v_weight2 = _mm256_set1_epi32(weight[2] as i32); - let v_weight3 = _mm256_set1_epi32(weight[3] as i32); - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); + for j in 0..bounds_size { + let py = start_y + j; + let weight = *filter.get_unchecked(j); + let v_weight = _mm256_set1_epi32(weight as i32); + let src_ptr = src.get_unchecked((src_stride * py + px)..); - let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); - let item_row1 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); - let item_row2 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i); - let item_row3 = _mm256_loadu_si256(src_ptr3.as_ptr() as *const __m256i); + let item_row = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i); (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row0, v_weight0); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row1, v_weight1); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row2, v_weight2); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row3, v_weight3); - } else { - for j in 0..bounds_size { - let py = start_y + j; - let weight = *filter.get_unchecked(j); - let v_weight = _mm256_set1_epi32(weight as i32); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - - let item_row = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row, v_weight); - } + dot_prod(store_0, store_1, store_2, store_3, item_row, v_weight); } store_0 = _mm256_srai_epi32::(store_0); @@ -410,13 +236,14 @@ unsafe fn convolve_vertical_part_8_avx( let weight = *filter.get_unchecked(j); let v_weight = _mm256_set1_epi32(weight as i32); let src_ptr = src.get_unchecked((src_stride * py + px)..); - let item_row = _mm256_cvtepu16_epi32(_mm_cvtepu8_epi16(_mm_loadu_si64(src_ptr.as_ptr()))); + let item_row = _mm256_cvtepu16_epi32(_mm_unpacklo_epi8( + _mm_loadu_si64(src_ptr.as_ptr()), + _mm_setzero_si128(), + )); store_0 = _mm256_add_epi32(store_0, 
_mm256_mullo_epi32(item_row, v_weight)); } - store_0 = _mm256_max_epi32(store_0, zeros); - const MASK: i32 = shuffle(3, 1, 2, 0); let low_16 = _mm256_permute4x64_epi64::(_mm256_packus_epi32( @@ -428,7 +255,7 @@ unsafe fn convolve_vertical_part_8_avx( let item_sse = _mm256_castsi256_si128(item); let dst_ptr = dst.get_unchecked_mut(px..); - std::ptr::copy_nonoverlapping(&item_sse as *const _ as *const u8, dst_ptr.as_mut_ptr(), 8); + _mm_storeu_si64(dst_ptr.as_mut_ptr(), item_sse); } #[inline(always)] @@ -510,8 +337,6 @@ unsafe fn convolve_vertical_part_avx( } } - store_0 = _mm256_max_epi32(store_0, zeros); - let low_16 = _mm256_packus_epi32(_mm256_srai_epi32::(store_0), zeros); let item = _mm256_packus_epi16(low_16, low_16); diff --git a/src/avx2/vertical_u8_lp.rs b/src/avx2/vertical_u8_lp.rs index af4ebc3..48ea85e 100644 --- a/src/avx2/vertical_u8_lp.rs +++ b/src/avx2/vertical_u8_lp.rs @@ -53,8 +53,9 @@ unsafe fn m256dot( row: __m256i, weight: __m256i, ) -> (__m256i, __m256i) { - let lo = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(row)); - let hi = _mm256_cvtepu8_epi16(_mm256_extracti128_si256::<1>(row)); + let zeros = _mm256_setzero_si256(); + let lo = _mm256_unpacklo_epi8(row, zeros); + let hi = _mm256_unpackhi_epi8(row, zeros); let store0 = _mm256_add_epi16( store0, @@ -216,8 +217,8 @@ unsafe fn convolve_vertical_avx2_row_impl( let rebased2 = _mm256_srai_epi16::(store2); let rebased3 = _mm256_srai_epi16::(store3); - let shrank0 = avx2_pack_u16(rebased0, rebased1); - let shrank1 = avx2_pack_u16(rebased2, rebased3); + let shrank0 = _mm256_packus_epi16(rebased0, rebased1); + let shrank1 = _mm256_packus_epi16(rebased2, rebased3); _mm256_storeu_si256(dst.as_mut_ptr() as *mut __m256i, shrank0); _mm256_storeu_si256( @@ -316,7 +317,7 @@ unsafe fn convolve_vertical_avx2_row_impl( let rebased0 = _mm256_srai_epi16::(store0); let rebased1 = _mm256_srai_epi16::(store1); - let shrank0 = avx2_pack_u16(rebased0, rebased1); + let shrank0 = _mm256_packus_epi16(rebased0, rebased1); _mm256_storeu_si256(dst.as_mut_ptr() as *mut __m256i, shrank0); cx += 32; @@ -469,7 +470,7 @@ unsafe fn convolve_vertical_avx2_row_impl( let rebased = _mm_srai_epi16::(store); let shrank = _mm_packus_epi16(rebased, rebased); - std::ptr::copy_nonoverlapping(&shrank as *const _ as *const u8, dst.as_mut_ptr(), 8); + _mm_storeu_si64(dst.as_mut_ptr(), shrank); cx += 8; } diff --git a/src/neon/rgb_f32.rs b/src/neon/rgb_f32.rs index f1a5325..d0f22a2 100644 --- a/src/neon/rgb_f32.rs +++ b/src/neon/rgb_f32.rs @@ -35,10 +35,8 @@ use crate::neon::utils::{prefer_vfmaq_f32, prefer_vfmaq_lane_f32}; macro_rules! 
write_rgb_f32 { ($store: expr, $dest_ptr: expr) => {{ - let l1 = vgetq_lane_u64::<0>(vreinterpretq_u64_f32($store)); - let l3 = vgetq_lane_f32::<2>($store); - ($dest_ptr as *mut u64).write_unaligned(l1); - $dest_ptr.add(2).write_unaligned(l3); + vst1_f32($dest_ptr, vget_low_f32($store)); + vst1q_lane_f32::<2>($dest_ptr.add(2), $store); }}; } diff --git a/src/neon/rgb_u8.rs b/src/neon/rgb_u8.rs index 44c0862..af240f5 100644 --- a/src/neon/rgb_u8.rs +++ b/src/neon/rgb_u8.rs @@ -102,11 +102,11 @@ unsafe fn conv_horiz_rgba_1_u8( unsafe fn write_accumulator_u8(store: int32x4_t, dst: &mut [u8]) { let store_16 = vqshrun_n_s32::(store); let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); - let bytes = pixel.to_le_bytes(); - let first_byte = u16::from_le_bytes([bytes[0], bytes[1]]); - (dst.as_mut_ptr() as *mut u16).write_unaligned(first_byte); - *dst.get_unchecked_mut(2) = bytes[2]; + vst1_lane_u16::<0>( + dst.as_mut_ptr() as *mut u16, + vreinterpret_u16_u8(store_16_8), + ); + vst1_lane_u8::<2>(dst.as_mut_ptr().add(2), store_16_8); } pub(crate) fn convolve_horizontal_rgb_neon_rows_4( diff --git a/src/neon/rgba_u8.rs b/src/neon/rgba_u8.rs index d4fa251..b2a4fb9 100644 --- a/src/neon/rgba_u8.rs +++ b/src/neon/rgba_u8.rs @@ -433,21 +433,22 @@ pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8( let store_16_8_2 = vqmovn_u16(vcombine_u16(store_16_2, store_16_2)); let store_16_8 = vqmovn_u16(vcombine_u16(store_16_3, store_16_3)); - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_0)); - let dest_ptr_32 = chunk0.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); - - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_1)); - let dest_ptr_32 = chunk1.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); - - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8_2)); - let dest_ptr_32 = chunk2.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); - - let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); - let dest_ptr_32 = chunk3.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(pixel); + vst1_lane_u32::<0>( + chunk0.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8_0), + ); + vst1_lane_u32::<0>( + chunk1.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8_1), + ); + vst1_lane_u32::<0>( + chunk2.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8_2), + ); + vst1_lane_u32::<0>( + chunk3.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8), + ); } } } @@ -585,8 +586,9 @@ unsafe fn convolve_horizontal_rgba_neon_row_i16_impl( let store_16 = vshr_n_s16::(store); let store_16_8 = vqmovun_s16(vcombine_s16(store_16, store_16)); - let value = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); - let dest_ptr_32 = dst.as_mut_ptr() as *mut u32; - dest_ptr_32.write_unaligned(value); + vst1_lane_u32::<0>( + dst.as_mut_ptr() as *mut u32, + vreinterpret_u32_u8(store_16_8), + ); } } diff --git a/src/neon/vertical_u8.rs b/src/neon/vertical_u8.rs index 1e01da3..2a0f44a 100644 --- a/src/neon/vertical_u8.rs +++ b/src/neon/vertical_u8.rs @@ -1216,9 +1216,7 @@ fn convolve_vertical_neon_row_full( let low_16 = vcombine_u16(shrinked_store, shrinked_store); let item = vqmovn_u16(low_16); - - let value = vget_lane_u8::<0>(item); - *dst = value; + vst1_lane_u8::<0>(dst, item); cx += 1; } } diff --git a/src/sse/alpha_u16.rs b/src/sse/alpha_u16.rs index 9cde8aa..38e79e4 100644 --- a/src/sse/alpha_u16.rs +++ b/src/sse/alpha_u16.rs @@ -46,7 +46,7 @@ 
unsafe fn sse_unpremultiply_row_u16( a_hi_f: __m128, ) -> __m128i { let zeros = _mm_setzero_si128(); - let lo = _mm_cvtepu16_epi32(x); + let lo = _mm_unpacklo_epi16(x, zeros); let hi = _mm_unpackhi_epi16(x, zeros); const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; @@ -119,7 +119,10 @@ unsafe fn unpremultiply_alpha_sse_rgba_u16_row_impl(in_place: &mut [u16], bit_de let is_zero_mask = _mm_cmpeq_epi16(aaaa, _mm_setzero_si128()); let a_lo_f = _mm_mul_ps( - _mm_rcp_ps(_mm_cvtepi32_ps(_mm_cvtepu16_epi32(aaaa))), + _mm_rcp_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16( + aaaa, + _mm_setzero_si128(), + ))), v_max_colors, ); let a_hi_f = _mm_mul_ps( @@ -183,7 +186,7 @@ unsafe fn sse_premultiply_row_u16( v_max_colors_scale: __m128, ) -> __m128i { let zeros = _mm_setzero_si128(); - let lo = _mm_cvtepu16_epi32(x); + let lo = _mm_unpacklo_epi16(x, zeros); let hi = _mm_unpackhi_epi16(x, zeros); let new_lo = _mm_cvtps_epi32(_mm_mul_ps( @@ -241,7 +244,7 @@ unsafe fn premultiply_alpha_sse_rgba_u16_row_impl(dst: &mut [u16], src: &[u16], let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - let a_lo_f = _mm_cvtepu16_epi32(aaaa); + let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); let new_rrrr = _mm_packus_epi32( @@ -276,7 +279,7 @@ unsafe fn premultiply_alpha_sse_rgba_u16_row_impl(dst: &mut [u16], src: &[u16], let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - let a_lo_f = _mm_cvtepu16_epi32(aaaa); + let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); let new_rrrr = _mm_packus_epi32( @@ -311,7 +314,7 @@ unsafe fn premultiply_alpha_sse_rgba_u16_row_impl(dst: &mut [u16], src: &[u16], let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - let a_lo_f = _mm_cvtepu16_epi32(aaaa); + let a_lo_f = _mm_unpacklo_epi16(aaaa, zeros); let a_hi_f = _mm_unpackhi_epi16(aaaa, zeros); let new_rrrr = _mm_packus_epi32( @@ -365,7 +368,7 @@ unsafe fn premultiply_alpha_sse_rgba_u16_row_impl(dst: &mut [u16], src: &[u16], let row3 = _mm_loadu_si128(src_ptr.add(24) as *const __m128i); let (rrrr, gggg, bbbb, aaaa) = sse_deinterleave_rgba_epi16(row0, row1, row2, row3); - let a_lo_f = _mm_cvtepi32_ps(_mm_cvtepu16_epi32(aaaa)); + let a_lo_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(aaaa, _mm_setzero_si128())); let a_hi_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(aaaa, _mm_setzero_si128())); let new_rrrr = sse_premultiply_row_u16(rrrr, a_lo_f, a_hi_f, v_max_colors_scale); diff --git a/src/sse/alpha_u8.rs b/src/sse/alpha_u8.rs index f194299..32f01cd 100644 --- a/src/sse/alpha_u8.rs +++ b/src/sse/alpha_u8.rs @@ -61,7 +61,7 @@ pub(crate) unsafe fn _mm_div_by_255_epi16(v: __m128i) -> __m128i { #[inline(always)] pub(crate) unsafe fn sse_unpremultiply_row(x: __m128i, a: __m128i) -> __m128i { let zeros = _mm_setzero_si128(); - let lo = _mm_cvtepu8_epi16(x); + let lo = _mm_unpacklo_epi8(x, zeros); let hi = _mm_unpackhi_epi8(x, zeros); let scale = _mm_set1_epi16(255); @@ -71,15 +71,15 @@ pub(crate) unsafe fn sse_unpremultiply_row(x: __m128i, a: __m128i) -> __m128i { let scale_ps = _mm_set1_ps(255f32); - let lo_lo = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(lo)), scale_ps); + let lo_lo = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(lo, zeros)), scale_ps); let lo_hi = 
_mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(lo, zeros)), scale_ps); - let hi_lo = _mm_mul_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(hi)), scale_ps); + let hi_lo = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(hi, zeros)), scale_ps); let hi_hi = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(hi, zeros)), scale_ps); - let a_lo = _mm_cvtepu8_epi16(a); + let a_lo = _mm_unpacklo_epi8(a, zeros); let a_hi = _mm_unpackhi_epi8(a, zeros); - let a_lo_lo = _mm_rcp_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_lo))); + let a_lo_lo = _mm_rcp_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(a_lo, zeros))); let a_lo_hi = _mm_rcp_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(a_lo, zeros))); - let a_hi_lo = _mm_rcp_ps(_mm_cvtepi32_ps(_mm_cvtepi16_epi32(a_hi))); + let a_hi_lo = _mm_rcp_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(a_hi, zeros))); let a_hi_hi = _mm_rcp_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(a_hi, zeros))); const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; @@ -124,16 +124,16 @@ unsafe fn sse_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { let rgba3 = _mm_loadu_si128(src_ptr.add(48) as *const __m128i); let (rrr, ggg, bbb, aaa) = sse_deinterleave_rgba(rgba0, rgba1, rgba2, rgba3); - let mut rrr_low = _mm_cvtepu8_epi16(rrr); + let mut rrr_low = _mm_unpacklo_epi8(rrr, zeros); let mut rrr_high = _mm_unpackhi_epi8(rrr, zeros); - let mut ggg_low = _mm_cvtepu8_epi16(ggg); + let mut ggg_low = _mm_unpacklo_epi8(ggg, zeros); let mut ggg_high = _mm_unpackhi_epi8(ggg, zeros); - let mut bbb_low = _mm_cvtepu8_epi16(bbb); + let mut bbb_low = _mm_unpacklo_epi8(bbb, zeros); let mut bbb_high = _mm_unpackhi_epi8(bbb, zeros); - let aaa_low = _mm_cvtepu8_epi16(aaa); + let aaa_low = _mm_unpacklo_epi8(aaa, zeros); let aaa_high = _mm_unpackhi_epi8(aaa, zeros); rrr_low = _mm_div_by_255_epi16(_mm_mullo_epi16(rrr_low, aaa_low)); diff --git a/src/sse/plane_u8.rs b/src/sse/plane_u8.rs index 6f275f2..039f653 100644 --- a/src/sse/plane_u8.rs +++ b/src/sse/plane_u8.rs @@ -39,7 +39,7 @@ use crate::support::{PRECISION, ROUNDING_CONST}; macro_rules! 
s_accumulate_8_horiz { ($store: expr, $ptr: expr, $weights: expr) => {{ let pixel_colors = _mm_loadu_si64($ptr); - let px_16 = _mm_cvtepu8_epi16(pixel_colors); + let px_16 = _mm_unpacklo_epi8(pixel_colors, _mm_setzero_si128()); let px_lo = _mm_unpacklo_epi16(px_16, _mm_setzero_si128()); let px_hi = _mm_unpackhi_epi16(px_16, _mm_setzero_si128()); diff --git a/src/sse/rgb_f32.rs b/src/sse/rgb_f32.rs index 26641f4..bca2a31 100644 --- a/src/sse/rgb_f32.rs +++ b/src/sse/rgb_f32.rs @@ -28,7 +28,7 @@ */ use crate::filter_weights::FilterWeights; -use crate::sse::{_mm_extract_epi64x, _mm_prefer_fma_ps, load_4_weights, shuffle}; +use crate::sse::{_mm_prefer_fma_ps, load_4_weights, shuffle}; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] @@ -231,7 +231,7 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f32_impl( let px = x * CHANNELS; let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); - (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store))); + _mm_storeu_si64(dest_ptr as *mut u8, _mm_castps_si128(store)); (dest_ptr as *mut i32) .add(2) .write_unaligned(_mm_extract_ps::<2>(store)); @@ -452,25 +452,25 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f32_impl( let px = x * CHANNELS; let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); - (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store_0))); + _mm_storeu_si64(dest_ptr as *mut u8, _mm_castps_si128(store_0)); (dest_ptr as *mut i32) .add(2) .write_unaligned(_mm_extract_ps::<2>(store_0)); let dest_ptr = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); - (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store_1))); + _mm_storeu_si64(dest_ptr as *mut u8, _mm_castps_si128(store_1)); (dest_ptr as *mut i32) .add(2) .write_unaligned(_mm_extract_ps::<2>(store_1)); let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); - (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store_2))); + _mm_storeu_si64(dest_ptr as *mut u8, _mm_castps_si128(store_2)); (dest_ptr as *mut i32) .add(2) .write_unaligned(_mm_extract_ps::<2>(store_2)); let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); - (dest_ptr as *mut i64).write_unaligned(_mm_extract_epi64x::<0>(_mm_castps_si128(store_3))); + _mm_storeu_si64(dest_ptr as *mut u8, _mm_castps_si128(store_3)); (dest_ptr as *mut i32) .add(2) .write_unaligned(_mm_extract_ps::<2>(store_3)); diff --git a/src/sse/rgba_f16.rs b/src/sse/rgba_f16.rs index f29b541..d032a21 100644 --- a/src/sse/rgba_f16.rs +++ b/src/sse/rgba_f16.rs @@ -265,11 +265,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f16_impl(store); - std::ptr::copy_nonoverlapping( - &converted_f16 as *const _ as *const u8, - dest_ptr as *mut u8, - 8, - ); + _mm_storeu_si64(dest_ptr as *mut u8, converted_f16); filter_offset += filter_weights.aligned_size; } @@ -536,29 +532,11 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_impl(store_1); let converted_f16_2 = _mm_cvtps_phx::(store_2); let converted_f16_3 = _mm_cvtps_phx::(store_3); - std::ptr::copy_nonoverlapping( - &converted_f16_0 as *const _ as *const u8, - dest_ptr0 as *mut u8, - 8, - ); - - std::ptr::copy_nonoverlapping( - &converted_f16_1 as *const _ as *const u8, - dest_ptr1 as *mut u8, - 8, - ); - - std::ptr::copy_nonoverlapping( - &converted_f16_2 as *const _ as *const u8, - dest_ptr2 as *mut u8, - 8, - ); - - std::ptr::copy_nonoverlapping( - &converted_f16_3 as *const _ as *const u8, - dest_ptr3 as *mut 
u8, - 8, - ); + + _mm_storeu_si64(dest_ptr0 as *mut u8, converted_f16_0); + _mm_storeu_si64(dest_ptr1 as *mut u8, converted_f16_1); + _mm_storeu_si64(dest_ptr2 as *mut u8, converted_f16_2); + _mm_storeu_si64(dest_ptr3 as *mut u8, converted_f16_3); filter_offset += filter_weights.aligned_size; } diff --git a/src/sse/rgba_u16.rs b/src/sse/rgba_u16.rs index 79ec664..969e85e 100644 --- a/src/sse/rgba_u16.rs +++ b/src/sse/rgba_u16.rs @@ -369,26 +369,10 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_u16_impl( let store_16_2 = _mm_packus_epi32(v_st2, v_st2); let store_16_3 = _mm_packus_epi32(v_st3, v_st3); - std::ptr::copy_nonoverlapping( - &store_16_0 as *const _ as *const u8, - chunk0.as_mut_ptr() as *mut u8, - 8, - ); - std::ptr::copy_nonoverlapping( - &store_16_1 as *const _ as *const u8, - chunk1.as_mut_ptr() as *mut u8, - 8, - ); - std::ptr::copy_nonoverlapping( - &store_16_2 as *const _ as *const u8, - chunk2.as_mut_ptr() as *mut u8, - 8, - ); - std::ptr::copy_nonoverlapping( - &store_16_3 as *const _ as *const u8, - chunk3.as_mut_ptr() as *mut u8, - 8, - ); + _mm_storeu_si64(chunk0.as_mut_ptr() as *mut u8, store_16_0); + _mm_storeu_si64(chunk1.as_mut_ptr() as *mut u8, store_16_1); + _mm_storeu_si64(chunk2.as_mut_ptr() as *mut u8, store_16_2); + _mm_storeu_si64(chunk3.as_mut_ptr() as *mut u8, store_16_3); } } @@ -502,11 +486,6 @@ unsafe fn convolve_horizontal_rgba_sse_u16_row_impl( ); let store_16_0 = _mm_packus_epi32(v_st, v_st); - - std::ptr::copy_nonoverlapping( - &store_16_0 as *const _ as *const u8, - dst.as_mut_ptr() as *mut u8, - 8, - ); + _mm_storeu_si64(dst.as_mut_ptr() as *mut u8, store_16_0); } } diff --git a/src/sse/rgba_u16_lb.rs b/src/sse/rgba_u16_lb.rs index f2d5974..d295c05 100644 --- a/src/sse/rgba_u16_lb.rs +++ b/src/sse/rgba_u16_lb.rs @@ -192,7 +192,6 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_lb_u8_impl( bit_depth: u32, ) { const CHANNELS: usize = 4; - let zeros = _mm_setzero_si128(); let init = _mm_set1_epi32(ROUNDING_CONST); let v_max_colors = _mm_set1_epi16((1 << bit_depth) - 1); @@ -287,36 +286,20 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_lb_u8_impl( jx += 1; } - let v_st0 = _mm_srai_epi32::(_mm_max_epi32(store_0, zeros)); - let v_st1 = _mm_srai_epi32::(_mm_max_epi32(store_1, zeros)); - let v_st2 = _mm_srai_epi32::(_mm_max_epi32(store_2, zeros)); - let v_st3 = _mm_srai_epi32::(_mm_max_epi32(store_3, zeros)); + let v_st0 = _mm_srai_epi32::(store_0); + let v_st1 = _mm_srai_epi32::(store_1); + let v_st2 = _mm_srai_epi32::(store_2); + let v_st3 = _mm_srai_epi32::(store_3); let store_16_0 = _mm_min_epi16(_mm_packus_epi32(v_st0, v_st0), v_max_colors); let store_16_1 = _mm_min_epi16(_mm_packus_epi32(v_st1, v_st1), v_max_colors); let store_16_2 = _mm_min_epi16(_mm_packus_epi32(v_st2, v_st2), v_max_colors); let store_16_3 = _mm_min_epi16(_mm_packus_epi32(v_st3, v_st3), v_max_colors); - std::ptr::copy_nonoverlapping( - &store_16_0 as *const _ as *const u8, - chunk0.as_mut_ptr() as *mut u8, - 8, - ); - std::ptr::copy_nonoverlapping( - &store_16_1 as *const _ as *const u8, - chunk1.as_mut_ptr() as *mut u8, - 8, - ); - std::ptr::copy_nonoverlapping( - &store_16_2 as *const _ as *const u8, - chunk2.as_mut_ptr() as *mut u8, - 8, - ); - std::ptr::copy_nonoverlapping( - &store_16_3 as *const _ as *const u8, - chunk3.as_mut_ptr() as *mut u8, - 8, - ); + _mm_storeu_si64(chunk0.as_mut_ptr() as *mut u8, store_16_0); + _mm_storeu_si64(chunk1.as_mut_ptr() as *mut u8, store_16_1); + _mm_storeu_si64(chunk2.as_mut_ptr() as *mut u8, store_16_2); + 
_mm_storeu_si64(chunk3.as_mut_ptr() as *mut u8, store_16_3); } } @@ -340,7 +323,6 @@ unsafe fn convolve_horizontal_rgba_sse_u16_lb_row_impl( ) { const CHANNELS: usize = 4; - let zeros = _mm_setzero_si128(); let v_max_colors = _mm_set1_epi16((1 << bit_depth) - 1); for ((dst, bounds), weights) in dst @@ -401,14 +383,9 @@ unsafe fn convolve_horizontal_rgba_sse_u16_lb_row_impl( jx += 1; } - let v_st = _mm_srai_epi32::(_mm_max_epi32(store, zeros)); + let v_st = _mm_srai_epi32::(store); let store_16_0 = _mm_min_epi16(_mm_packus_epi32(v_st, v_st), v_max_colors); - - std::ptr::copy_nonoverlapping( - &store_16_0 as *const _ as *const u8, - dst.as_mut_ptr() as *mut u8, - 8, - ); + _mm_storeu_si64(dst.as_mut_ptr() as *mut u8, store_16_0); } } diff --git a/src/sse/rgba_u8.rs b/src/sse/rgba_u8.rs index e41d35f..c746c33 100644 --- a/src/sse/rgba_u8.rs +++ b/src/sse/rgba_u8.rs @@ -47,7 +47,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_sse( let src_ptr_32 = src_ptr.as_ptr() as *const i32; let rgba_pixel = _mm_cvtsi32_si128(src_ptr_32.read_unaligned()); - let lo = _mm_cvtepu8_epi16(rgba_pixel); + let lo = _mm_unpacklo_epi8(rgba_pixel, _mm_setzero_si128()); _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(lo), weight0)) } @@ -224,22 +224,22 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( let store_16_8_2 = compress_i32(store_2); let store_16_8_3 = compress_i32(store_3); - let pixel_0 = _mm_extract_epi32::<0>(store_16_8_0); - let pixel_1 = _mm_extract_epi32::<0>(store_16_8_1); - let pixel_2 = _mm_extract_epi32::<0>(store_16_8_2); - let pixel_3 = _mm_extract_epi32::<0>(store_16_8_3); - - let dest_ptr = chunk0.as_mut_ptr() as *mut i32; - dest_ptr.write_unaligned(pixel_0); - - let dest_ptr = chunk1.as_mut_ptr() as *mut i32; - dest_ptr.write_unaligned(pixel_1); - - let dest_ptr = chunk2.as_mut_ptr() as *mut i32; - dest_ptr.write_unaligned(pixel_2); - - let dest_ptr = chunk3.as_mut_ptr() as *mut i32; - dest_ptr.write_unaligned(pixel_3); + _mm_storeu_si32( + chunk0.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_0, store_16_8_0), + ); + _mm_storeu_si32( + chunk1.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_1, store_16_8_1), + ); + _mm_storeu_si32( + chunk2.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_2, store_16_8_2), + ); + _mm_storeu_si32( + chunk3.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_3, store_16_8_3), + ); } } } @@ -342,9 +342,9 @@ unsafe fn convolve_horizontal_rgba_sse_rows_one_impl( } let store_16_8 = compress_i32(store); - let pixel = _mm_extract_epi32::<0>(store_16_8); - - let dest_ptr_32 = dst.as_mut_ptr() as *mut i32; - dest_ptr_32.write_unaligned(pixel); + _mm_storeu_si32( + dst.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8, store_16_8), + ); } } diff --git a/src/sse/rgba_u8_lb.rs b/src/sse/rgba_u8_lb.rs index 1cef21a..9f7ffe9 100644 --- a/src/sse/rgba_u8_lb.rs +++ b/src/sse/rgba_u8_lb.rs @@ -45,7 +45,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_sse( let src_ptr_32 = src_ptr.as_ptr() as *const i32; let rgba_pixel = _mm_cvtsi32_si128(src_ptr_32.read_unaligned()); - let lo = _mm_slli_epi16::(_mm_cvtepu8_epi16(rgba_pixel)); + let lo = _mm_slli_epi16::(_mm_unpacklo_epi8(rgba_pixel, _mm_setzero_si128())); _mm_add_epi16(store_0, _mm_mulhi_epi16(lo, weight0)) } @@ -128,8 +128,6 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( const ROUNDING: i16 = 1 << (SCALE - 1); const V_SHR: i32 = SCALE - 1; - let zeros = _mm_setzero_si128(); - let vld = _mm_set1_epi16(ROUNDING); let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 
3, 2, 3, 2, 3); @@ -360,27 +358,27 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( jx += 1; } - let store_16_8_0 = _mm_srai_epi16::(_mm_max_epi16(store_0, zeros)); - let store_16_8_1 = _mm_srai_epi16::(_mm_max_epi16(store_1, zeros)); - let store_16_8_2 = _mm_srai_epi16::(_mm_max_epi16(store_2, zeros)); - let store_16_8_3 = _mm_srai_epi16::(_mm_max_epi16(store_3, zeros)); - - let pixel_0 = _mm_extract_epi32::<0>(_mm_packus_epi16(store_16_8_0, store_16_8_0)); - let pixel_1 = _mm_extract_epi32::<0>(_mm_packus_epi16(store_16_8_1, store_16_8_1)); - let pixel_2 = _mm_extract_epi32::<0>(_mm_packus_epi16(store_16_8_2, store_16_8_2)); - let pixel_3 = _mm_extract_epi32::<0>(_mm_packus_epi16(store_16_8_3, store_16_8_3)); - - let dest_ptr = chunk0.as_mut_ptr() as *mut i32; - dest_ptr.write_unaligned(pixel_0); + let store_16_8_0 = _mm_srai_epi16::(store_0); + let store_16_8_1 = _mm_srai_epi16::(store_1); + let store_16_8_2 = _mm_srai_epi16::(store_2); + let store_16_8_3 = _mm_srai_epi16::(store_3); - let dest_ptr = chunk1.as_mut_ptr() as *mut i32; - dest_ptr.write_unaligned(pixel_1); - - let dest_ptr = chunk2.as_mut_ptr() as *mut i32; - dest_ptr.write_unaligned(pixel_2); - - let dest_ptr = chunk3.as_mut_ptr() as *mut i32; - dest_ptr.write_unaligned(pixel_3); + _mm_storeu_si32( + chunk0.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_0, store_16_8_0), + ); + _mm_storeu_si32( + chunk1.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_1, store_16_8_1), + ); + _mm_storeu_si32( + chunk2.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_2, store_16_8_2), + ); + _mm_storeu_si32( + chunk3.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8_3, store_16_8_3), + ); } } } @@ -405,8 +403,6 @@ unsafe fn convolve_horizontal_rgba_sse_rows_one_impl( let shuffle_weights = _mm_setr_epi8(0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3); - let zeros = _mm_setzero_si128(); - const SCALE: i32 = 6; const ROUNDING: i16 = 1 << (SCALE - 1); const V_SHR: i32 = SCALE - 1; @@ -520,10 +516,10 @@ unsafe fn convolve_horizontal_rgba_sse_rows_one_impl( jx += 1; } - let store_16_8 = _mm_srai_epi16::(_mm_max_epi16(store, zeros)); - let pixel = _mm_extract_epi32::<0>(_mm_packus_epi16(store_16_8, store_16_8)); - - let dest_ptr_32 = dst.as_mut_ptr() as *mut i32; - dest_ptr_32.write_unaligned(pixel); + let store_16_8 = _mm_srai_epi16::(store); + _mm_storeu_si32( + dst.as_mut_ptr() as *mut _, + _mm_packus_epi16(store_16_8, store_16_8), + ); } } diff --git a/src/sse/u8_utils.rs b/src/sse/u8_utils.rs index 3605d2a..7b6e5ec 100644 --- a/src/sse/u8_utils.rs +++ b/src/sse/u8_utils.rs @@ -36,7 +36,7 @@ use crate::support::PRECISION; #[inline(always)] pub(crate) fn compress_i32(x: __m128i) -> __m128i { - let store_32 = unsafe { _mm_srai_epi32::(_mm_max_epi32(x, _mm_setzero_si128())) }; + let store_32 = unsafe { _mm_srai_epi32::(x) }; let store_16 = unsafe { _mm_packus_epi32(store_32, store_32) }; unsafe { _mm_packus_epi16(store_16, store_16) } } @@ -57,6 +57,9 @@ pub(crate) unsafe fn convolve_horizontal_parts_one_sse_rgb( 0, ]); let m_vl = _mm_cvtsi32_si128(vl); - let lo = _mm_cvtepu8_epi16(m_vl); - _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(lo), weight0)) + let lo = _mm_unpacklo_epi8(m_vl, _mm_setzero_si128()); + _mm_add_epi32( + store_0, + _mm_madd_epi16(_mm_unpacklo_epi16(lo, _mm_setzero_si128()), weight0), + ) } diff --git a/src/sse/vertical_f16.rs b/src/sse/vertical_f16.rs index 6d7ca93..f6a616b 100644 --- a/src/sse/vertical_f16.rs +++ b/src/sse/vertical_f16.rs @@ -95,7 +95,7 @@ pub(crate) unsafe fn 
convolve_vertical_part_sse_4_f16(store_0); - std::ptr::copy_nonoverlapping(&acc as *const _ as *const u8, dst_ptr as *mut u8, 8); + _mm_storeu_si64(dst_ptr as *mut u8, acc); } #[inline(always)] diff --git a/src/sse/vertical_u16.rs b/src/sse/vertical_u16.rs index 2fde19b..731bbad 100644 --- a/src/sse/vertical_u16.rs +++ b/src/sse/vertical_u16.rs @@ -110,295 +110,35 @@ unsafe fn convolve_column_lb_u16_impl( let v_dx = v_px + x * 16; - if bounds_size == 2 { - let weights = weight.get_unchecked(0..2); - let weight0 = weights[0]; - let weight1 = weights[1]; - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - - let v_weight0 = _mm_set1_ps(weight0); - let v_weight1 = _mm_set1_ps(weight1); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr0.as_ptr().add(8) as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row0, zeros)), - v_weight0, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row0, zeros)), - v_weight0, - ); - store2 = _mm_prefer_fma_ps::( - store2, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row1, zeros)), - v_weight0, - ); - store3 = _mm_prefer_fma_ps::( - store3, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row1, zeros)), - v_weight0, - ); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = _mm_loadu_si128(src_ptr1.as_ptr().add(8) as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row10, zeros)), - v_weight1, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row10, zeros)), - v_weight1, - ); - store2 = _mm_prefer_fma_ps::( - store2, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row11, zeros)), - v_weight1, - ); - store3 = _mm_prefer_fma_ps::( - store3, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row11, zeros)), - v_weight1, - ); - } else if bounds_size == 3 { - let weights = weight.get_unchecked(0..3); - let weight0 = weights[0]; - let weight1 = weights[1]; - let weight2 = weights[2]; + for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { + let py = bounds.start + j; + let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_dx)..); - - let v_weight0 = _mm_set1_ps(weight0); - let v_weight1 = _mm_set1_ps(weight1); - let v_weight2 = _mm_set1_ps(weight2); + let v_weight = _mm_set1_ps(k_weight); - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr0.as_ptr().add(8) as *const __m128i); + let item_row0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); + let item_row1 = _mm_loadu_si128(src_ptr.as_ptr().add(8) as *const __m128i); store0 = _mm_prefer_fma_ps::( store0, _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row0, zeros)), - v_weight0, + v_weight, ); store1 = _mm_prefer_fma_ps::( store1, _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row0, zeros)), - v_weight0, + v_weight, ); store2 = _mm_prefer_fma_ps::( store2, _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row1, zeros)), - v_weight0, + v_weight, ); store3 = _mm_prefer_fma_ps::( store3, _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row1, zeros)), - v_weight0, - ); - - let 
item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = _mm_loadu_si128(src_ptr1.as_ptr().add(8) as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row10, zeros)), - v_weight1, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row10, zeros)), - v_weight1, - ); - store2 = _mm_prefer_fma_ps::( - store2, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row11, zeros)), - v_weight1, - ); - store3 = _mm_prefer_fma_ps::( - store3, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row11, zeros)), - v_weight1, - ); - - let item_row20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row21 = _mm_loadu_si128(src_ptr2.as_ptr().add(8) as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row20, zeros)), - v_weight2, + v_weight, ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row20, zeros)), - v_weight2, - ); - store2 = _mm_prefer_fma_ps::( - store2, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row21, zeros)), - v_weight2, - ); - store3 = _mm_prefer_fma_ps::( - store3, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row21, zeros)), - v_weight2, - ); - } else if bounds_size == 4 { - let weights = weight.get_unchecked(0..4); - let weight0 = weights[0]; - let weight1 = weights[1]; - let weight2 = weights[2]; - let weight3 = weights[3]; - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_dx)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + v_dx)..); - - let v_weight0 = _mm_set1_ps(weight0); - let v_weight1 = _mm_set1_ps(weight1); - let v_weight2 = _mm_set1_ps(weight2); - let v_weight3 = _mm_set1_ps(weight3); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr0.as_ptr().add(8) as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row0, zeros)), - v_weight0, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row0, zeros)), - v_weight0, - ); - store2 = _mm_prefer_fma_ps::( - store2, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row1, zeros)), - v_weight0, - ); - store3 = _mm_prefer_fma_ps::( - store3, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row1, zeros)), - v_weight0, - ); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = _mm_loadu_si128(src_ptr1.as_ptr().add(8) as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row10, zeros)), - v_weight1, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row10, zeros)), - v_weight1, - ); - store2 = _mm_prefer_fma_ps::( - store2, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row11, zeros)), - v_weight1, - ); - store3 = _mm_prefer_fma_ps::( - store3, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row11, zeros)), - v_weight1, - ); - - let item_row20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row21 = _mm_loadu_si128(src_ptr2.as_ptr().add(8) as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row20, zeros)), - v_weight2, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row20, zeros)), - v_weight2, - ); 
- store2 = _mm_prefer_fma_ps::( - store2, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row21, zeros)), - v_weight2, - ); - store3 = _mm_prefer_fma_ps::( - store3, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row21, zeros)), - v_weight2, - ); - - let item_row30 = _mm_loadu_si128(src_ptr3.as_ptr() as *const __m128i); - let item_row31 = _mm_loadu_si128(src_ptr3.as_ptr().add(8) as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row30, zeros)), - v_weight3, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row30, zeros)), - v_weight3, - ); - store2 = _mm_prefer_fma_ps::( - store2, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row31, zeros)), - v_weight3, - ); - store3 = _mm_prefer_fma_ps::( - store3, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row31, zeros)), - v_weight3, - ); - } else { - for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { - let py = bounds.start + j; - let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - - let v_weight = _mm_set1_ps(k_weight); - - let item_row0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr.as_ptr().add(8) as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row0, zeros)), - v_weight, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row0, zeros)), - v_weight, - ); - store2 = _mm_prefer_fma_ps::( - store2, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row1, zeros)), - v_weight, - ); - store3 = _mm_prefer_fma_ps::( - store3, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row1, zeros)), - v_weight, - ); - } } let v_st0 = _mm_min_epi32( @@ -438,179 +178,24 @@ unsafe fn convolve_column_lb_u16_impl( let v_dx = v_px + x * 8; - if bounds_size == 2 { - let weights = weight.get_unchecked(0..2); - let weight0 = weights[0]; - let weight1 = weights[1]; - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - - let v_weight0 = _mm_set1_ps(weight0); - let v_weight1 = _mm_set1_ps(weight1); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row0, zeros)), - v_weight0, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row0, zeros)), - v_weight0, - ); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row1, zeros)), - v_weight1, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row1, zeros)), - v_weight1, - ); - } else if bounds_size == 3 { - let weights = weight.get_unchecked(0..3); - let weight0 = weights[0]; - let weight1 = weights[1]; - let weight2 = weights[2]; - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_dx)..); - - let v_weight0 = _mm_set1_ps(weight0); - let v_weight1 = _mm_set1_ps(weight1); - let v_weight2 = _mm_set1_ps(weight2); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row2 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - - store0 = 
_mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row0, zeros)), - v_weight0, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row0, zeros)), - v_weight0, - ); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row1, zeros)), - v_weight1, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row1, zeros)), - v_weight1, - ); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row2, zeros)), - v_weight2, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row2, zeros)), - v_weight2, - ); - } else if bounds_size == 4 { - let weights = weight.get_unchecked(0..4); - let weight0 = weights[0]; - let weight1 = weights[1]; - let weight2 = weights[2]; - let weight3 = weights[3]; - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_dx)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + v_dx)..); + for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { + let py = bounds.start + j; + let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - let v_weight0 = _mm_set1_ps(weight0); - let v_weight1 = _mm_set1_ps(weight1); - let v_weight2 = _mm_set1_ps(weight2); - let v_weight3 = _mm_set1_ps(weight3); + let v_weight = _mm_set1_ps(k_weight); - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row2 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row3 = _mm_loadu_si128(src_ptr3.as_ptr() as *const __m128i); + let item_row = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); store0 = _mm_prefer_fma_ps::( store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row0, zeros)), - v_weight0, + _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row, zeros)), + v_weight, ); store1 = _mm_prefer_fma_ps::( store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row0, zeros)), - v_weight0, + _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row, zeros)), + v_weight, ); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row1, zeros)), - v_weight1, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row1, zeros)), - v_weight1, - ); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row2, zeros)), - v_weight2, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row2, zeros)), - v_weight2, - ); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row3, zeros)), - v_weight3, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row3, zeros)), - v_weight3, - ); - } else { - for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { - let py = bounds.start + j; - let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - - let v_weight = _mm_set1_ps(k_weight); - - let item_row = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - - store0 = _mm_prefer_fma_ps::( - store0, - _mm_cvtepi32_ps(_mm_unpacklo_epi16(item_row, zeros)), - v_weight, - ); - store1 = _mm_prefer_fma_ps::( - store1, - _mm_cvtepi32_ps(_mm_unpackhi_epi16(item_row, zeros)), - v_weight, - ); - } } let v_st0 = _mm_min_epi32( @@ -769,11 +354,7 @@ unsafe fn 
convolve_column_lb_u16_impl( ); let u_store0 = _mm_packus_epi32(v_st, v_st); - std::ptr::copy_nonoverlapping( - &u_store0 as *const _ as *const u8, - dst.as_mut_ptr() as *mut u8, - 8, - ); + _mm_storeu_si64(dst.as_mut_ptr() as *mut u8, u_store0); cx = v_dx; } diff --git a/src/sse/vertical_u16_lb.rs b/src/sse/vertical_u16_lb.rs index 9715cd7..bd7a053 100644 --- a/src/sse/vertical_u16_lb.rs +++ b/src/sse/vertical_u16_lb.rs @@ -80,252 +80,37 @@ unsafe fn convolve_column_lb_u16_impl( let v_dx = v_px + x * 16; - if bounds_size == 2 { - let weights = weight.get_unchecked(0..2); - - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr0.as_ptr().add(8) as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row0, zeros), v_weight0), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row0, zeros), v_weight0), - ); - store2 = _mm_add_epi32( - store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row1, zeros), v_weight0), - ); - store3 = _mm_add_epi32( - store3, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row1, zeros), v_weight0), - ); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = _mm_loadu_si128(src_ptr1.as_ptr().add(8) as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row10, zeros), v_weight1), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row10, zeros), v_weight1), - ); - store2 = _mm_add_epi32( - store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row11, zeros), v_weight1), - ); - store3 = _mm_add_epi32( - store3, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row11, zeros), v_weight1), - ); - } else if bounds_size == 3 { - let weights = weight.get_unchecked(0..3); - - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - let v_weight2 = _mm_set1_epi32(weights[2] as i32); - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_dx)..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr0.as_ptr().add(8) as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row0, zeros), v_weight0), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row0, zeros), v_weight0), - ); - store2 = _mm_add_epi32( - store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row1, zeros), v_weight0), - ); - store3 = _mm_add_epi32( - store3, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row1, zeros), v_weight0), - ); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = _mm_loadu_si128(src_ptr1.as_ptr().add(8) as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row10, zeros), v_weight1), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row10, zeros), v_weight1), - ); - store2 = _mm_add_epi32( - store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row11, zeros), 
v_weight1), - ); - store3 = _mm_add_epi32( - store3, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row11, zeros), v_weight1), - ); - - let item_row20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row21 = _mm_loadu_si128(src_ptr2.as_ptr().add(8) as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row20, zeros), v_weight2), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row20, zeros), v_weight2), - ); - store2 = _mm_add_epi32( - store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row21, zeros), v_weight2), - ); - store3 = _mm_add_epi32( - store3, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row21, zeros), v_weight2), - ); - } else if bounds_size == 4 { - let weights = weight.get_unchecked(0..4); - - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - let v_weight2 = _mm_set1_epi32(weights[2] as i32); - let v_weight3 = _mm_set1_epi32(weights[3] as i32); - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_dx)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + v_dx)..); + for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { + let py = bounds.start + j; + let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr0.as_ptr().add(8) as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row0, zeros), v_weight0), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row0, zeros), v_weight0), - ); - store2 = _mm_add_epi32( - store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row1, zeros), v_weight0), - ); - store3 = _mm_add_epi32( - store3, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row1, zeros), v_weight0), - ); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = _mm_loadu_si128(src_ptr1.as_ptr().add(8) as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row10, zeros), v_weight1), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row10, zeros), v_weight1), - ); - store2 = _mm_add_epi32( - store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row11, zeros), v_weight1), - ); - store3 = _mm_add_epi32( - store3, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row11, zeros), v_weight1), - ); - - let item_row20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row21 = _mm_loadu_si128(src_ptr2.as_ptr().add(8) as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row20, zeros), v_weight2), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row20, zeros), v_weight2), - ); - store2 = _mm_add_epi32( - store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row21, zeros), v_weight2), - ); - store3 = _mm_add_epi32( - store3, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row21, zeros), v_weight2), - ); + let v_weight = _mm_set1_epi32(k_weight as i32); - let item_row30 = _mm_loadu_si128(src_ptr3.as_ptr() as *const __m128i); - let item_row31 = _mm_loadu_si128(src_ptr3.as_ptr().add(8) as *const __m128i); + let item_row0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); + let item_row1 = 
_mm_loadu_si128(src_ptr.as_ptr().add(8) as *const __m128i); store0 = _mm_add_epi32( store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row30, zeros), v_weight3), + _mm_madd_epi16(_mm_unpacklo_epi16(item_row0, zeros), v_weight), ); store1 = _mm_add_epi32( store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row30, zeros), v_weight3), + _mm_madd_epi16(_mm_unpackhi_epi16(item_row0, zeros), v_weight), ); store2 = _mm_add_epi32( store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row31, zeros), v_weight3), + _mm_madd_epi16(_mm_unpacklo_epi16(item_row1, zeros), v_weight), ); store3 = _mm_add_epi32( store3, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row31, zeros), v_weight3), + _mm_madd_epi16(_mm_unpackhi_epi16(item_row1, zeros), v_weight), ); - } else { - for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { - let py = bounds.start + j; - let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - - let v_weight = _mm_set1_epi32(k_weight as i32); - - let item_row0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr.as_ptr().add(8) as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row0, zeros), v_weight), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row0, zeros), v_weight), - ); - store2 = _mm_add_epi32( - store2, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row1, zeros), v_weight), - ); - store3 = _mm_add_epi32( - store3, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row1, zeros), v_weight), - ); - } } - let v_st0 = _mm_srai_epi32::(_mm_max_epi32(store0, zeros)); - let v_st1 = _mm_srai_epi32::(_mm_max_epi32(store1, zeros)); - let v_st2 = _mm_srai_epi32::(_mm_max_epi32(store2, zeros)); - let v_st3 = _mm_srai_epi32::(_mm_max_epi32(store3, zeros)); + let v_st0 = _mm_srai_epi32::(store0); + let v_st1 = _mm_srai_epi32::(store1); + let v_st2 = _mm_srai_epi32::(store2); + let v_st3 = _mm_srai_epi32::(store3); let item0 = _mm_min_epi16(_mm_packus_epi32(v_st0, v_st1), v_max_colors); let item1 = _mm_min_epi16(_mm_packus_epi32(v_st2, v_st3), v_max_colors); @@ -347,160 +132,26 @@ unsafe fn convolve_column_lb_u16_impl( let v_dx = v_px + x * 8; - if bounds_size == 2 { - let weights = weight.get_unchecked(0..2); - - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row0, zeros), v_weight0), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row0, zeros), v_weight0), - ); - - let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row1, zeros), v_weight1), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row1, zeros), v_weight1), - ); - } else if bounds_size == 3 { - let weights = weight.get_unchecked(0..3); - - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - let v_weight2 = _mm_set1_epi32(weights[2] as i32); - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - let src_ptr2 = src.get_unchecked((src_stride 
* (py + 2) + v_dx)..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row0, zeros), v_weight0), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row0, zeros), v_weight0), - ); - - let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); + for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { + let py = bounds.start + j; + let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row1, zeros), v_weight1), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row1, zeros), v_weight1), - ); - - let item_row2 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row2, zeros), v_weight2), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row2, zeros), v_weight2), - ); - } else if bounds_size == 4 { - let weights = weight.get_unchecked(0..4); - - let v_weight0 = _mm_set1_epi32(weights[0] as i32); - let v_weight1 = _mm_set1_epi32(weights[1] as i32); - let v_weight2 = _mm_set1_epi32(weights[2] as i32); - let v_weight3 = _mm_set1_epi32(weights[3] as i32); - - let py = bounds.start; - let src_ptr0 = src.get_unchecked((src_stride * py + v_dx)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + v_dx)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + v_dx)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + v_dx)..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row0, zeros), v_weight0), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row0, zeros), v_weight0), - ); - - let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row1, zeros), v_weight1), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row1, zeros), v_weight1), - ); + let v_weight = _mm_set1_epi16(k_weight); - let item_row2 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); + let item_row = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); store0 = _mm_add_epi32( store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row2, zeros), v_weight2), + _mm_madd_epi16(_mm_unpacklo_epi16(item_row, zeros), v_weight), ); store1 = _mm_add_epi32( store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row2, zeros), v_weight2), + _mm_madd_epi16(_mm_unpackhi_epi16(item_row, zeros), v_weight), ); - - let item_row3 = _mm_loadu_si128(src_ptr3.as_ptr() as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row3, zeros), v_weight3), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row3, zeros), v_weight3), - ); - } else { - for (j, &k_weight) in weight.iter().take(bounds_size).enumerate() { - let py = bounds.start + j; - let src_ptr = src.get_unchecked((src_stride * py + v_dx)..); - - let v_weight = _mm_set1_epi16(k_weight); - - let item_row = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - - store0 = _mm_add_epi32( - store0, - _mm_madd_epi16(_mm_unpacklo_epi16(item_row, zeros), v_weight), - ); - store1 = _mm_add_epi32( - store1, - _mm_madd_epi16(_mm_unpackhi_epi16(item_row, zeros), v_weight), - ); - } } 
- let v_st0 = _mm_srai_epi32::(_mm_max_epi32(store0, zeros)); - let v_st1 = _mm_srai_epi32::(_mm_max_epi32(store1, zeros)); + let v_st0 = _mm_srai_epi32::(store0); + let v_st1 = _mm_srai_epi32::(store1); let item = _mm_min_epi16(_mm_packus_epi32(v_st0, v_st1), v_max_colors); _mm_storeu_si128(dst.as_mut_ptr() as *mut __m128i, item); @@ -621,14 +272,10 @@ unsafe fn convolve_column_lb_u16_impl( } } - let v_st = _mm_srai_epi32::(_mm_max_epi32(store0, zeros)); + let v_st = _mm_srai_epi32::(store0); let u_store0 = _mm_min_epi16(_mm_packus_epi32(v_st, v_st), v_max_colors); - std::ptr::copy_nonoverlapping( - &u_store0 as *const _ as *const u8, - dst.as_mut_ptr() as *mut u8, - 8, - ); + _mm_storeu_si64(dst.as_mut_ptr() as *mut u8, u_store0); cx = v_dx; } diff --git a/src/sse/vertical_u8.rs b/src/sse/vertical_u8.rs index 5b66bf4..ab567d7 100644 --- a/src/sse/vertical_u8.rs +++ b/src/sse/vertical_u8.rs @@ -84,172 +84,62 @@ pub(crate) unsafe fn convolve_vertical_part_sse_32( let bounds_size = bounds.size; - if bounds_size == 2 { - let py = start_y; - let weight = filter.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let item_row_0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row_1 = _mm_loadu_si128(src_ptr0.as_ptr().add(16) as *const __m128i); - - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let item_row_10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row_11 = _mm_loadu_si128(src_ptr1.as_ptr().add(16) as *const __m128i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight0); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight0); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_10, v_weight1); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_11, v_weight1); - } else if bounds_size == 3 { - let py = start_y; - let weight = filter.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - let v_weight2 = _mm_set1_epi32(weight[2] as i32); - - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let item_row_0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row_1 = _mm_loadu_si128(src_ptr0.as_ptr().add(16) as *const __m128i); - - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let item_row_10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row_11 = _mm_loadu_si128(src_ptr1.as_ptr().add(16) as *const __m128i); - - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let item_row_20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row_21 = _mm_loadu_si128(src_ptr2.as_ptr().add(16) as *const __m128i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight0); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight0); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_10, v_weight1); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_11, v_weight1); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, 
store_3, item_row_20, v_weight2); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_21, v_weight2); - } else if bounds_size == 4 { - let py = start_y; - let weight = filter.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - let v_weight2 = _mm_set1_epi32(weight[2] as i32); - let v_weight3 = _mm_set1_epi32(weight[3] as i32); - - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let item_row_0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row_1 = _mm_loadu_si128(src_ptr0.as_ptr().add(16) as *const __m128i); - - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let item_row_10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row_11 = _mm_loadu_si128(src_ptr1.as_ptr().add(16) as *const __m128i); - - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let item_row_20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row_21 = _mm_loadu_si128(src_ptr2.as_ptr().add(16) as *const __m128i); - - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - let item_row_30 = _mm_loadu_si128(src_ptr3.as_ptr() as *const __m128i); - let item_row_31 = _mm_loadu_si128(src_ptr3.as_ptr().add(16) as *const __m128i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight0); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight0); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_10, v_weight1); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_11, v_weight1); + let mut jj = 0usize; + + while jj < bounds_size.saturating_sub(2) { + let py = start_y + jj; + let f_ptr = filter.get_unchecked(jj..).as_ptr() as *const i32; + let v_weight_2 = _mm_set1_epi32(f_ptr.read_unaligned()); + let src_ptr = src.get_unchecked((src_stride * py + px)..); + let s_ptr_next = src_ptr.as_ptr().add(src_stride); + + let item_row_0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); + let item_row_1 = _mm_loadu_si128(s_ptr_next as *const __m128i); + + let interleaved = _mm_unpacklo_epi8(item_row_0, item_row_1); + let pix = _mm_unpacklo_epi8(interleaved, zeros); + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(pix, v_weight_2)); + let pix = _mm_unpackhi_epi8(interleaved, zeros); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(pix, v_weight_2)); + + let interleaved = _mm_unpackhi_epi8(item_row_0, item_row_1); + let pix = _mm_unpacklo_epi8(interleaved, zeros); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(pix, v_weight_2)); + let pix = _mm_unpackhi_epi8(interleaved, zeros); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(pix, v_weight_2)); + + let item_row_0 = _mm_loadu_si128(src_ptr.as_ptr().add(16) as *const __m128i); + let item_row_1 = _mm_loadu_si128(s_ptr_next.add(16) as *const __m128i); + + let interleaved = _mm_unpacklo_epi8(item_row_0, item_row_1); + let pix = _mm_unpacklo_epi8(interleaved, zeros); + store_4 = _mm_add_epi32(store_4, _mm_madd_epi16(pix, v_weight_2)); + let pix = _mm_unpackhi_epi8(interleaved, zeros); + store_5 = _mm_add_epi32(store_5, _mm_madd_epi16(pix, v_weight_2)); + + let interleaved = _mm_unpackhi_epi8(item_row_0, item_row_1); + let pix = _mm_unpacklo_epi8(interleaved, zeros); + store_6 = _mm_add_epi32(store_6, _mm_madd_epi16(pix, v_weight_2)); + let pix = 
_mm_unpackhi_epi8(interleaved, zeros); + store_7 = _mm_add_epi32(store_7, _mm_madd_epi16(pix, v_weight_2)); + + jj += 2; + } - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_20, v_weight2); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_21, v_weight2); + for j in jj..bounds_size { + let py = start_y + j; + let weight = *filter.get_unchecked(j); + let v_weight = _mm_set1_epi32(weight as i32); + let src_ptr = src.get_unchecked((src_stride * py + px)..); + let item_row_0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); + let item_row_1 = _mm_loadu_si128(src_ptr.as_ptr().add(16) as *const __m128i); (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_30, v_weight3); + dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight); (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_31, v_weight3); - } else { - let mut jj = 0usize; - - while jj < bounds_size.saturating_sub(2) { - let py = start_y + jj; - let f_ptr = filter.get_unchecked(jj..).as_ptr() as *const i32; - let v_weight_2 = _mm_set1_epi32(f_ptr.read_unaligned()); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - let s_ptr_next = src_ptr.as_ptr().add(src_stride); - - let item_row_0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - let item_row_1 = _mm_loadu_si128(s_ptr_next as *const __m128i); - - let interleaved = _mm_unpacklo_epi8(item_row_0, item_row_1); - let pix = _mm_unpacklo_epi8(interleaved, zeros); - store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(pix, v_weight_2)); - let pix = _mm_unpackhi_epi8(interleaved, zeros); - store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(pix, v_weight_2)); - - let interleaved = _mm_unpackhi_epi8(item_row_0, item_row_1); - let pix = _mm_unpacklo_epi8(interleaved, zeros); - store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(pix, v_weight_2)); - let pix = _mm_unpackhi_epi8(interleaved, zeros); - store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(pix, v_weight_2)); - - let item_row_0 = _mm_loadu_si128(src_ptr.as_ptr().add(16) as *const __m128i); - let item_row_1 = _mm_loadu_si128(s_ptr_next.add(16) as *const __m128i); - - let interleaved = _mm_unpacklo_epi8(item_row_0, item_row_1); - let pix = _mm_unpacklo_epi8(interleaved, zeros); - store_4 = _mm_add_epi32(store_4, _mm_madd_epi16(pix, v_weight_2)); - let pix = _mm_unpackhi_epi8(interleaved, zeros); - store_5 = _mm_add_epi32(store_5, _mm_madd_epi16(pix, v_weight_2)); - - let interleaved = _mm_unpackhi_epi8(item_row_0, item_row_1); - let pix = _mm_unpacklo_epi8(interleaved, zeros); - store_6 = _mm_add_epi32(store_6, _mm_madd_epi16(pix, v_weight_2)); - let pix = _mm_unpackhi_epi8(interleaved, zeros); - store_7 = _mm_add_epi32(store_7, _mm_madd_epi16(pix, v_weight_2)); - - jj += 2; - } - - for j in jj..bounds_size { - let py = start_y + j; - let weight = *filter.get_unchecked(j); - let v_weight = _mm_set1_epi32(weight as i32); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - let item_row_0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - let item_row_1 = _mm_loadu_si128(src_ptr.as_ptr().add(16) as *const __m128i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight); - (store_4, store_5, store_6, store_7) = - dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight); - } + dot_prod(store_4, store_5, store_6, store_7, item_row_1, v_weight); } - store_0 = 
_mm_srai_epi32::(store_0); - store_1 = _mm_srai_epi32::(store_1); - store_2 = _mm_srai_epi32::(store_2); - store_3 = _mm_srai_epi32::(store_3); - store_4 = _mm_srai_epi32::(store_4); - store_5 = _mm_srai_epi32::(store_5); - store_6 = _mm_srai_epi32::(store_6); - store_7 = _mm_srai_epi32::(store_7); - let rgb0 = _mm_packs_epi32(store_0, store_1); let rgb2 = _mm_packs_epi32(store_2, store_3); let rgb = _mm_packus_epi16(rgb0, rgb2); @@ -283,88 +173,19 @@ pub(crate) unsafe fn convolve_vertical_part_sse_16( let px = start_x; - let zeros = _mm_setzero_si128(); - let bounds_size = bounds.size; - if bounds_size == 2 { - let py = start_y; - let weight = filter.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row0, v_weight0); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row1, v_weight1); - } else if bounds_size == 3 { - let py = start_y; - let weight = filter.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - let v_weight2 = _mm_set1_epi32(weight[2] as i32); - - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row2 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); + for j in 0..bounds_size { + let py = start_y + j; + let weight = *filter.get_unchecked(j); + let v_weight = _mm_set1_epi32(weight as i32); + let src_ptr = src.get_unchecked((src_stride * py + px)..); + let item_row = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row0, v_weight0); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row1, v_weight1); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row2, v_weight2); - } else if bounds_size == 4 { - let py = start_y; - let weight = filter.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi32(weight[0] as i32); - let v_weight1 = _mm_set1_epi32(weight[1] as i32); - let v_weight2 = _mm_set1_epi32(weight[2] as i32); - let v_weight3 = _mm_set1_epi32(weight[3] as i32); - - let src_ptr0 = src.get_unchecked((src_stride * py + px)..); - let src_ptr1 = src.get_unchecked((src_stride * (py + 1) + px)..); - let src_ptr2 = src.get_unchecked((src_stride * (py + 2) + px)..); - let src_ptr3 = src.get_unchecked((src_stride * (py + 3) + px)..); - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row2 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row3 = _mm_loadu_si128(src_ptr3.as_ptr() as *const __m128i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row0, v_weight0); - (store_0, store_1, store_2, 
store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row1, v_weight1); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row2, v_weight2); - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row3, v_weight3); - } else { - for j in 0..bounds_size { - let py = start_y + j; - let weight = *filter.get_unchecked(j); - let v_weight = _mm_set1_epi32(weight as i32); - let src_ptr = src.get_unchecked((src_stride * py + px)..); - let item_row = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - - (store_0, store_1, store_2, store_3) = - dot_prod(store_0, store_1, store_2, store_3, item_row, v_weight); - } + dot_prod(store_0, store_1, store_2, store_3, item_row, v_weight); } - store_0 = _mm_max_epi32(store_0, zeros); - store_1 = _mm_max_epi32(store_1, zeros); - store_2 = _mm_max_epi32(store_2, zeros); - store_3 = _mm_max_epi32(store_3, zeros); - let low_16 = _mm_packs_epi32( _mm_srai_epi32::(store_0), _mm_srai_epi32::(store_1), @@ -410,14 +231,14 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( let item_row0 = _mm_loadu_si64(src_ptr0.as_ptr()); let item_row1 = _mm_loadu_si64(src_ptr1.as_ptr()); - let low0 = _mm_cvtepu8_epi16(item_row0); + let low0 = _mm_unpacklo_epi8(item_row0, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low0), v_weight0)); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low0, zeros), v_weight0), ); - let low1 = _mm_cvtepu8_epi16(item_row1); + let low1 = _mm_unpacklo_epi8(item_row1, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low1), v_weight1)); store_1 = _mm_add_epi32( store_1, @@ -436,21 +257,21 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( let item_row1 = _mm_loadu_si64(src_ptr1.as_ptr()); let item_row2 = _mm_loadu_si64(src_ptr2.as_ptr()); - let low0 = _mm_cvtepu8_epi16(item_row0); + let low0 = _mm_unpacklo_epi8(item_row0, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low0), v_weight0)); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low0, zeros), v_weight0), ); - let low1 = _mm_cvtepu8_epi16(item_row1); + let low1 = _mm_unpacklo_epi8(item_row1, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low1), v_weight1)); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low1, zeros), v_weight1), ); - let low2 = _mm_cvtepu8_epi16(item_row2); + let low2 = _mm_unpacklo_epi8(item_row2, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low2), v_weight2)); store_1 = _mm_add_epi32( store_1, @@ -472,28 +293,28 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( let item_row2 = _mm_loadu_si64(src_ptr2.as_ptr()); let item_row3 = _mm_loadu_si64(src_ptr3.as_ptr()); - let low0 = _mm_cvtepu8_epi16(item_row0); + let low0 = _mm_unpacklo_epi8(item_row0, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low0), v_weight0)); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low0, zeros), v_weight0), ); - let low1 = _mm_cvtepu8_epi16(item_row1); + let low1 = _mm_unpacklo_epi8(item_row1, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low1), v_weight1)); store_1 = _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low1, zeros), v_weight1), ); - let low2 = _mm_cvtepu8_epi16(item_row2); + let low2 = _mm_unpacklo_epi8(item_row2, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low2), v_weight2)); store_1 
= _mm_add_epi32( store_1, _mm_madd_epi16(_mm_unpackhi_epi16(low2, zeros), v_weight2), ); - let low3 = _mm_cvtepu8_epi16(item_row3); + let low3 = _mm_unpacklo_epi8(item_row3, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low3), v_weight3)); store_1 = _mm_add_epi32( store_1, @@ -507,7 +328,7 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( let src_ptr = src.get_unchecked((src_stride * py + px)..); let item_row = _mm_loadu_si64(src_ptr.as_ptr()); - let low = _mm_cvtepu8_epi16(item_row); + let low = _mm_unpacklo_epi8(item_row, zeros); store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(_mm_cvtepi16_epi32(low), v_weight)); store_1 = _mm_add_epi32( store_1, @@ -516,9 +337,6 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( } } - store_0 = _mm_max_epi32(store_0, zeros); - store_1 = _mm_max_epi32(store_1, zeros); - let low_16 = _mm_packus_epi32( _mm_srai_epi32::(store_0), _mm_srai_epi32::(store_1), @@ -527,7 +345,7 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8( let item = _mm_packus_epi16(low_16, low_16); let dst_ptr = dst.get_unchecked_mut(px..); - std::ptr::copy_nonoverlapping(&item as *const _ as *const u8, dst_ptr.as_mut_ptr(), 8); + _mm_storeu_si64(dst_ptr.as_mut_ptr(), item); } #[inline(always)] @@ -543,8 +361,6 @@ pub(crate) unsafe fn convolve_vertical_part_sse( let vld = _mm_set1_epi32(ROUNDING_CONST); let mut store = vld; - let zeros = _mm_setzero_si128(); - let px = start_x; let bounds_size = bounds.size; @@ -619,8 +435,6 @@ pub(crate) unsafe fn convolve_vertical_part_sse( } } - store = _mm_max_epi32(store, zeros); - let vegi = _mm_srai_epi32::(store); let low_16 = _mm_packus_epi32(vegi, vegi); diff --git a/src/sse/vertical_u8_lp.rs b/src/sse/vertical_u8_lp.rs index d507857..7b1a651 100644 --- a/src/sse/vertical_u8_lp.rs +++ b/src/sse/vertical_u8_lp.rs @@ -94,190 +94,23 @@ unsafe fn convolve_vertical_sse_row_impl( let px = cx; - if bounds_size == 2 { - let py = bounds.start; - let weights = weight.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = - _mm_loadu_si128(src_ptr0.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row2 = - _mm_loadu_si128(src_ptr0.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row3 = - _mm_loadu_si128(src_ptr0.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row0, v_weight0); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight0); - (store4, store5) = mdot::(store4, store5, item_row2, v_weight0); - (store6, store7) = mdot::(store6, store7, item_row3, v_weight0); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = - _mm_loadu_si128(src_ptr1.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row12 = - _mm_loadu_si128(src_ptr1.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row13 = - _mm_loadu_si128(src_ptr1.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row10, v_weight1); - (store2, store3) = mdot::(store2, store3, item_row11, v_weight1); - (store4, store5) = mdot::(store4, store5, item_row12, v_weight1); - (store6, store7) = mdot::(store6, store7, item_row13, v_weight1); - } else if bounds_size 
== 3 { - let py = bounds.start; - let weights = weight.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = - _mm_loadu_si128(src_ptr0.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row2 = - _mm_loadu_si128(src_ptr0.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row3 = - _mm_loadu_si128(src_ptr0.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row0, v_weight0); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight0); - (store4, store5) = mdot::(store4, store5, item_row2, v_weight0); - (store6, store7) = mdot::(store6, store7, item_row3, v_weight0); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = - _mm_loadu_si128(src_ptr1.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row12 = - _mm_loadu_si128(src_ptr1.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row13 = - _mm_loadu_si128(src_ptr1.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row10, v_weight1); - (store2, store3) = mdot::(store2, store3, item_row11, v_weight1); - (store4, store5) = mdot::(store4, store5, item_row12, v_weight1); - (store6, store7) = mdot::(store6, store7, item_row13, v_weight1); - - let item_row20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row21 = - _mm_loadu_si128(src_ptr2.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row22 = - _mm_loadu_si128(src_ptr2.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row23 = - _mm_loadu_si128(src_ptr2.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row20, v_weight2); - (store2, store3) = mdot::(store2, store3, item_row21, v_weight2); - (store4, store5) = mdot::(store4, store5, item_row22, v_weight2); - (store6, store7) = mdot::(store6, store7, item_row23, v_weight2); - } else if bounds_size == 4 { - let py = bounds.start; - let weights = weight.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_weight3 = _mm_set1_epi16(weights[3]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..); - let v_offset3 = src_stride * (py + 3) + px; - let src_ptr3 = src.get_unchecked(v_offset3..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = - _mm_loadu_si128(src_ptr0.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row2 = - _mm_loadu_si128(src_ptr0.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row3 = - _mm_loadu_si128(src_ptr0.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row0, v_weight0); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight0); - (store4, 
store5) = mdot::(store4, store5, item_row2, v_weight0); - (store6, store7) = mdot::(store6, store7, item_row3, v_weight0); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = - _mm_loadu_si128(src_ptr1.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row12 = - _mm_loadu_si128(src_ptr1.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row13 = - _mm_loadu_si128(src_ptr1.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row10, v_weight1); - (store2, store3) = mdot::(store2, store3, item_row11, v_weight1); - (store4, store5) = mdot::(store4, store5, item_row12, v_weight1); - (store6, store7) = mdot::(store6, store7, item_row13, v_weight1); - - let item_row20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row21 = - _mm_loadu_si128(src_ptr2.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row22 = - _mm_loadu_si128(src_ptr2.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row23 = - _mm_loadu_si128(src_ptr2.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row20, v_weight2); - (store2, store3) = mdot::(store2, store3, item_row21, v_weight2); - (store4, store5) = mdot::(store4, store5, item_row22, v_weight2); - (store6, store7) = mdot::(store6, store7, item_row23, v_weight2); - - let item_row30 = _mm_loadu_si128(src_ptr3.as_ptr() as *const __m128i); - let item_row31 = - _mm_loadu_si128(src_ptr3.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row32 = - _mm_loadu_si128(src_ptr3.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row33 = - _mm_loadu_si128(src_ptr3.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row30, v_weight3); - (store2, store3) = mdot::(store2, store3, item_row31, v_weight3); - (store4, store5) = mdot::(store4, store5, item_row32, v_weight3); - (store6, store7) = mdot::(store6, store7, item_row33, v_weight3); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..(j + 1)); - let v_weight = _mm_set1_epi16(weight[0]); - let v_offset = src_stride * py + px; - let src_ptr = src.get_unchecked(v_offset..); - let item_row0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - let item_row1 = - _mm_loadu_si128(src_ptr.get_unchecked(16..).as_ptr() as *const __m128i); - let item_row2 = - _mm_loadu_si128(src_ptr.get_unchecked(32..).as_ptr() as *const __m128i); - let item_row3 = - _mm_loadu_si128(src_ptr.get_unchecked(48..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row0, v_weight); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight); - (store4, store5) = mdot::(store4, store5, item_row2, v_weight); - (store6, store7) = mdot::(store6, store7, item_row3, v_weight); - } + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..(j + 1)); + let v_weight = _mm_set1_epi16(weight[0]); + let v_offset = src_stride * py + px; + let src_ptr = src.get_unchecked(v_offset..); + let item_row0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); + let item_row1 = _mm_loadu_si128(src_ptr.get_unchecked(16..).as_ptr() as *const __m128i); + let item_row2 = _mm_loadu_si128(src_ptr.get_unchecked(32..).as_ptr() as *const __m128i); + let item_row3 = _mm_loadu_si128(src_ptr.get_unchecked(48..).as_ptr() as *const __m128i); + + (store0, store1) = mdot::(store0, store1, item_row0, v_weight); + (store2, 
store3) = mdot::(store2, store3, item_row1, v_weight); + (store4, store5) = mdot::(store4, store5, item_row2, v_weight); + (store6, store7) = mdot::(store6, store7, item_row3, v_weight); } - store0 = _mm_max_epi16(store0, zeros); - store1 = _mm_max_epi16(store1, zeros); - store2 = _mm_max_epi16(store2, zeros); - store3 = _mm_max_epi16(store3, zeros); - store4 = _mm_max_epi16(store4, zeros); - store5 = _mm_max_epi16(store5, zeros); - store6 = _mm_max_epi16(store6, zeros); - store7 = _mm_max_epi16(store7, zeros); - let rebased0 = _mm_srli_epi16::(store0); let rebased1 = _mm_srli_epi16::(store1); let rebased2 = _mm_srli_epi16::(store2); @@ -318,126 +151,19 @@ unsafe fn convolve_vertical_sse_row_impl( let px = cx; - if bounds_size == 2 { - let py = bounds.start; - let weights = weight.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = - _mm_loadu_si128(src_ptr0.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row0, v_weight0); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight0); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = - _mm_loadu_si128(src_ptr1.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row10, v_weight1); - (store2, store3) = mdot::(store2, store3, item_row11, v_weight1); - } else if bounds_size == 3 { - let py = bounds.start; - let weights = weight.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = - _mm_loadu_si128(src_ptr0.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row0, v_weight0); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight0); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = - _mm_loadu_si128(src_ptr1.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row10, v_weight1); - (store2, store3) = mdot::(store2, store3, item_row11, v_weight1); - - let item_row20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row21 = - _mm_loadu_si128(src_ptr2.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row20, v_weight2); - (store2, store3) = mdot::(store2, store3, item_row21, v_weight2); - } else if bounds_size == 4 { - let py = bounds.start; - let weights = weight.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_weight3 = _mm_set1_epi16(weights[3]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + 
px; - let src_ptr1 = src.get_unchecked(v_offset1..); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..); - let v_offset3 = src_stride * (py + 3) + px; - let src_ptr3 = src.get_unchecked(v_offset3..); - - let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - let item_row1 = - _mm_loadu_si128(src_ptr0.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row0, v_weight0); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight0); - - let item_row10 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - let item_row11 = - _mm_loadu_si128(src_ptr1.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row10, v_weight1); - (store2, store3) = mdot::(store2, store3, item_row11, v_weight1); - - let item_row20 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - let item_row21 = - _mm_loadu_si128(src_ptr2.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row20, v_weight2); - (store2, store3) = mdot::(store2, store3, item_row21, v_weight2); - - let item_row30 = _mm_loadu_si128(src_ptr3.as_ptr() as *const __m128i); - let item_row31 = - _mm_loadu_si128(src_ptr3.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row30, v_weight3); - (store2, store3) = mdot::(store2, store3, item_row31, v_weight3); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..(j + 1)); - let v_weight = _mm_set1_epi16(weight[0]); - let v_offset = src_stride * py + px; - let src_ptr = src.get_unchecked(v_offset..); - let item_row0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - let item_row1 = - _mm_loadu_si128(src_ptr.get_unchecked(16..).as_ptr() as *const __m128i); - - (store0, store1) = mdot::(store0, store1, item_row0, v_weight); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight); - } + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..(j + 1)); + let v_weight = _mm_set1_epi16(weight[0]); + let v_offset = src_stride * py + px; + let src_ptr = src.get_unchecked(v_offset..); + let item_row0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); + let item_row1 = _mm_loadu_si128(src_ptr.get_unchecked(16..).as_ptr() as *const __m128i); + + (store0, store1) = mdot::(store0, store1, item_row0, v_weight); + (store2, store3) = mdot::(store2, store3, item_row1, v_weight); } - store0 = _mm_max_epi16(store0, zeros); - store1 = _mm_max_epi16(store1, zeros); - store2 = _mm_max_epi16(store2, zeros); - store3 = _mm_max_epi16(store3, zeros); - let rebased0 = _mm_srli_epi16::(store0); let rebased1 = _mm_srli_epi16::(store1); let rebased2 = _mm_srli_epi16::(store2); @@ -532,9 +258,6 @@ unsafe fn convolve_vertical_sse_row_impl( } } - store0 = _mm_max_epi16(store0, zeros); - store1 = _mm_max_epi16(store1, zeros); - let rebased0 = _mm_srli_epi16::(store0); let rebased1 = _mm_srli_epi16::(store1); let shrank = _mm_packus_epi16(rebased0, rebased1); @@ -655,11 +378,9 @@ unsafe fn convolve_vertical_sse_row_impl( } } - store = _mm_max_epi16(store, zeros); - let rebased = _mm_srli_epi16::(store); let shrank = _mm_packus_epi16(rebased, rebased); - std::ptr::copy_nonoverlapping(&shrank as *const _ as *const u8, dst.as_mut_ptr(), 8); + _mm_storeu_si64(dst.as_mut_ptr(), shrank); cx += 8; } @@ -781,8 +502,6 @@ unsafe fn convolve_vertical_sse_row_impl( } } - store = 
_mm_max_epi16(store, zeros); - let rebased = _mm_srli_epi16::(store); let value = _mm_extract_epi8::<0>(_mm_packus_epi16(rebased, rebased)); *dst = value as u8; From fa1f2ec6e0f4ab74a60824694d829459bd0c179a Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Sat, 28 Dec 2024 10:38:53 +0000 Subject: [PATCH 4/9] Refactor and improvements --- .github/workflows/build_push.yml | 1 + Cargo.lock | 2 +- Cargo.toml | 4 +- app/src/main.rs | 36 +- fuzz/Cargo.toml | 7 + fuzz/colorspaces/colorspaces.rs | 95 +++ src/avx2/vertical_u8.rs | 6 +- src/colors/jzazbz_scaler.rs | 117 ++-- src/colors/lab_scaler.rs | 119 ++-- src/colors/lch_scaler.rs | 116 +-- src/colors/linear_precise_scaler.rs | 159 +++-- src/colors/linear_scaler.rs | 170 +++-- src/colors/luv_scaler.rs | 163 +++-- src/colors/oklab_scaler.rs | 159 +++-- src/colors/sigmoidal_scaler.rs | 155 ++-- src/colors/xyz_scaler.rs | 165 +++-- src/convolution.rs | 6 +- src/dispatch_group_f16.rs | 21 +- src/dispatch_group_f32.rs | 21 +- src/dispatch_group_u16.rs | 15 +- src/dispatch_group_u8.rs | 11 +- src/f16.rs | 13 +- src/image_store.rs | 311 ++++++-- src/lib.rs | 2 +- src/neon/utils.rs | 14 + src/neon/vertical_f32.rs | 125 ++-- src/plane_f32.rs | 5 +- src/plane_u16.rs | 5 +- src/plane_u8.rs | 5 +- src/rgb_f32.rs | 6 +- src/rgb_u16.rs | 5 +- src/rgb_u8.rs | 6 +- src/rgba_f32.rs | 5 +- src/rgba_u16.rs | 5 +- src/rgba_u8.rs | 5 +- src/scaler.rs | 1014 +++++++++------------------ src/scaler_f16.rs | 260 +------ 37 files changed, 1626 insertions(+), 1708 deletions(-) create mode 100644 fuzz/colorspaces/colorspaces.rs diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml index e10a6e8..a72a34b 100644 --- a/.github/workflows/build_push.yml +++ b/.github/workflows/build_push.yml @@ -57,6 +57,7 @@ jobs: - run: cargo fuzz run resize_rgba -- -max_total_time=30 - run: cargo fuzz run resize_rgb -- -max_total_time=30 - run: cargo fuzz run resize_plane -- -max_total_time=30 + - run: cargo fuzz run colorspaces -- -max_total_time=10 fuzz_rgba_high_bit: name: Fuzzing High bit-depth diff --git a/Cargo.lock b/Cargo.lock index 65a8fe9..f1d0691 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -780,7 +780,7 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pic-scale" -version = "0.3.6" +version = "0.4.0" dependencies = [ "colorutils-rs", "half", diff --git a/Cargo.toml b/Cargo.toml index 68371ce..8dd615e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ workspace = { members = ["app", "wasm", "fuzz"] } [package] name = "pic-scale" -version = "0.3.6" +version = "0.4.0" edition = "2021" description = "High performance image scaling" readme = "README.md" @@ -14,7 +14,7 @@ categories = ["multimedia::images", "multimedia::video"] homepage = "https://github.com/awxkee/pic-scale" repository = "https://github.com/awxkee/pic-scale" exclude = ["*.jpg", "/assets", "*.png", "*.sh", "/assets/*"] -rust-version = "1.73.0" +rust-version = "1.82.0" [dependencies] colorutils-rs = {version = "0.7.0", optional = true} diff --git a/app/src/main.rs b/app/src/main.rs index cb713ce..8166b23 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -11,8 +11,8 @@ use fast_image_resize::{ }; use image::{EncodableLayout, GenericImageView, ImageReader}; use pic_scale::{ - Ar30ByteOrder, ImageSize, ImageStore, LinearApproxScaler, LinearScaler, ResamplingFunction, - Scaler, Scaling, ScalingU16, ThreadingPolicy, + Ar30ByteOrder, ImageSize, ImageStore, ImageStoreMut, LinearScaler, ResamplingFunction, Scaler, + Scaling, ScalingU16, 
ThreadingPolicy, }; fn resize_plane( @@ -37,15 +37,14 @@ fn resize_plane( let mut src_data = vec![15u8; src_width * src_height * 1]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut dst_store = ImageStoreMut::::alloc(src_width / 2, src_height / 2); let scaler = Scaler::new(sampler); - _ = scaler - .resize_plane(ImageSize::new(dst_width, dst_height), store) - .unwrap(); + _ = scaler.resize_plane(&store, &mut dst_store).unwrap(); } fn main() { // test_fast_image(); - let img = ImageReader::open("./assets/test_1.jpg") + let img = ImageReader::open("./assets/nasa-4928x3279-rgba.png") .unwrap() .decode() .unwrap(); @@ -53,7 +52,7 @@ fn main() { let transient = img.to_rgba8(); let mut bytes = Vec::from(transient.as_bytes()); - let mut scaler = Scaler::new(ResamplingFunction::Bilinear); + let mut scaler = LinearScaler::new(ResamplingFunction::Bilinear); scaler.set_threading_policy(ThreadingPolicy::Single); // resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear); @@ -77,13 +76,10 @@ fn main() { // ) // .unwrap(); - let resized = scaler - .resize_rgba( - ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), - store, - false, - ) - .unwrap(); + let mut dst_store = + ImageStoreMut::::alloc(dimensions.0 as usize / 2, dimensions.1 as usize / 2); + + scaler.resize_rgba(&store, &mut dst_store, false).unwrap(); let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds @@ -162,7 +158,7 @@ fn main() { // let dst: Vec = resized.as_bytes().iter().map(|&x| (x >> 2) as u8).collect(); // - let dst = resized.as_bytes(); + let dst = dst_store.as_bytes(); // let dst = resized; // image::save_buffer( // "converted.png", @@ -173,12 +169,12 @@ fn main() { // ) // .unwrap(); - if resized.channels == 4 { + if dst_store.channels == 4 { image::save_buffer( "converted.png", &dst, - resized.width as u32, - resized.height as u32, + dst_store.width as u32, + dst_store.height as u32, image::ColorType::Rgba8, ) .unwrap(); @@ -186,8 +182,8 @@ fn main() { image::save_buffer( "converted.png", &dst, - resized.width as u32, - resized.height as u32, + dst_store.width as u32, + dst_store.height as u32, image::ColorType::Rgb8, ) .unwrap(); diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 3db86e5..079bc34 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -73,3 +73,10 @@ path = "resize_plane_f32/resize_plane_f32.rs" test = false doc = false bench = false + +[[bin]] +name = "colorspaces" +path = "colorspaces/colorspaces.rs" +test = false +doc = false +bench = false diff --git a/fuzz/colorspaces/colorspaces.rs b/fuzz/colorspaces/colorspaces.rs new file mode 100644 index 0000000..2451bf0 --- /dev/null +++ b/fuzz/colorspaces/colorspaces.rs @@ -0,0 +1,95 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#![no_main] + +use libfuzzer_sys::fuzz_target; +use pic_scale::{ + ImageStore, ImageStoreMut, JzazbzScaler, LChScaler, LabScaler, LinearApproxScaler, + LinearScaler, LuvScaler, OklabScaler, ResamplingFunction, Scaling, SigmoidalScaler, + TransferFunction, XYZScaler, +}; + +fuzz_target!(|data: (u16, u16, u16, u16)| { + resize_plane( + data.0 as usize, + data.1 as usize, + data.2 as usize, + data.3 as usize, + ResamplingFunction::Bilinear, + ) +}); + +fn resize_plane( + src_width: usize, + src_height: usize, + dst_width: usize, + dst_height: usize, + sampler: ResamplingFunction, +) { + if src_width == 0 + || src_width > 2000 + || src_height == 0 + || src_height > 2000 + || dst_width == 0 + || dst_width > 512 + || dst_height == 0 + || dst_height > 512 + { + return; + } + + let scalers: Vec> = vec![ + Box::new(JzazbzScaler::new(sampler, 203f32, TransferFunction::Srgb)), + Box::new(LabScaler::new(sampler)), + Box::new(LChScaler::new(sampler)), + Box::new(LinearScaler::new(sampler)), + Box::new(LinearApproxScaler::new(sampler)), + Box::new(LuvScaler::new(sampler)), + Box::new(OklabScaler::new(sampler, TransferFunction::Srgb)), + Box::new(SigmoidalScaler::new(sampler)), + Box::new(XYZScaler::new(sampler)), + ]; + + for scaler in scalers { + let mut src_data_rgb = vec![15u8; src_width * src_height * 3]; + let store = + ImageStore::::from_slice(&mut src_data_rgb, src_width, src_height).unwrap(); + let mut target_store = ImageStoreMut::alloc(dst_width, dst_height); + scaler.resize_rgb(&store, &mut target_store).unwrap(); + + let mut src_data_rgba = vec![15u8; src_width * src_height * 4]; + let store_rgba = + ImageStore::::from_slice(&mut src_data_rgba, src_width, src_height).unwrap(); + let mut target_store_rgba = ImageStoreMut::alloc(dst_width, dst_height); + scaler + .resize_rgba(&store_rgba, &mut target_store_rgba, false) + .unwrap(); + } +} diff --git a/src/avx2/vertical_u8.rs b/src/avx2/vertical_u8.rs index e578206..0e86cf6 100644 --- a/src/avx2/vertical_u8.rs +++ b/src/avx2/vertical_u8.rs @@ -107,8 +107,7 @@ unsafe fn convolve_vertical_part_avx_64( let pix = _mm256_unpackhi_epi8(interleaved, zeros); store_3 = _mm256_add_epi32(store_3, _mm256_madd_epi16(pix, v_weight_2)); - let item_row_0 = - _mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i); + let item_row_0 = _mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i); let item_row_1 = _mm256_loadu_si256(s_ptr_next.get_unchecked(32..).as_ptr() as *const __m256i); @@ -134,8 +133,7 @@ unsafe fn convolve_vertical_part_avx_64( let src_ptr = src.get_unchecked((src_stride * py + px)..); let item_row_0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i); - let item_row_1 = - 
_mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i); + let item_row_1 = _mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i); (store_0, store_1, store_2, store_3) = dot_prod(store_0, store_1, store_2, store_3, item_row_0, v_weight); diff --git a/src/colors/jzazbz_scaler.rs b/src/colors/jzazbz_scaler.rs index f939632..845d321 100644 --- a/src/colors/jzazbz_scaler.rs +++ b/src/colors/jzazbz_scaler.rs @@ -30,11 +30,10 @@ use colorutils_rs::{ jzazbz_to_rgb, jzazbz_to_rgba, rgb_to_jzazbz, rgba_to_jzazbz, TransferFunction, }; -use crate::alpha_check::has_non_constant_cap_alpha_rgba8; use crate::pic_scale_error::PicScaleError; use crate::scaler::ScalingF32; use crate::support::check_image_size_overflow; -use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; +use crate::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; #[derive(Debug, Copy, Clone)] /// Converts image to *Jzazbz* components scales it and convert back @@ -60,35 +59,44 @@ impl JzazbzScaler { } } - fn rgba_to_laba<'a>(&self, store: ImageStore<'a, u8, 4>) -> ImageStore<'a, f32, 4> { - let mut new_store = ImageStore::::alloc(store.width, store.height); + fn rgba_to_laba<'a>(&self, store: &ImageStore<'a, u8, 4>) -> ImageStore<'a, f32, 4> { + let mut source_slice = vec![f32::default(); 4 * store.width * store.height]; let lab_stride = store.width as u32 * 4u32 * std::mem::size_of::() as u32; rgba_to_jzazbz( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * 4u32, - new_store.buffer.borrow_mut(), + &mut source_slice, lab_stride, store.width as u32, store.height as u32, self.display_luminance, self.transfer_function, ); + let new_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(source_slice), + channels: 4, + width: store.width, + height: store.height, + bit_depth: store.bit_depth, + }; new_store } - fn laba_to_srgba<'a>(&self, store: ImageStore<'a, f32, 4>) -> ImageStore<'a, u8, 4> { - let mut new_store = ImageStore::::alloc(store.width, store.height); + fn laba_to_srgba<'a>( + &self, + store: &ImageStoreMut<'a, f32, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, + ) { jzazbz_to_rgba( store.buffer.borrow(), store.width as u32 * 4u32 * std::mem::size_of::() as u32, - new_store.buffer.borrow_mut(), + into.buffer.borrow_mut(), store.width as u32 * 4u32, store.width as u32, store.height as u32, self.display_luminance, self.transfer_function, ); - new_store } } @@ -97,11 +105,12 @@ impl Scaling for JzazbzScaler { self.scaler.threading_policy = threading_policy; } - fn resize_rgb( - &self, - new_size: ImageSize, - store: ImageStore, - ) -> Result, PicScaleError> { + fn resize_rgb<'a>( + &'a self, + store: &ImageStore, + into: &mut ImageStoreMut, + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -115,15 +124,22 @@ impl Scaling for JzazbzScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } const COMPONENTS: usize = 3; - let mut lab_store = ImageStore::::alloc(store.width, store.height); + + let mut target = vec![f32::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = + ImageStoreMut::::from_slice(&mut target, store.width, store.height)?; + lab_store.bit_depth = into.bit_depth; + let lab_stride = lab_store.width as u32 * 
COMPONENTS as u32 * std::mem::size_of::() as u32; rgb_to_jzazbz( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * COMPONENTS as u32, lab_store.buffer.borrow_mut(), lab_stride, @@ -132,29 +148,42 @@ impl Scaling for JzazbzScaler { self.display_luminance, self.transfer_function, ); - let new_store = self.scaler.resize_rgb_f32(new_size, lab_store)?; - let mut new_u8_store = ImageStore::::alloc(new_size.width, new_size.height); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgb_f32(&new_immutable_store, &mut new_store)?; + let new_lab_stride = new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + jzazbz_to_rgb( new_store.buffer.borrow(), new_lab_stride, - new_u8_store.buffer.borrow_mut(), - new_u8_store.width as u32 * COMPONENTS as u32, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, new_store.width as u32, new_store.height as u32, self.display_luminance, self.transfer_function, ); - Ok(new_u8_store) + Ok(()) } fn resize_rgba<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -168,36 +197,16 @@ impl Scaling for JzazbzScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - let mut src_store = store; + let lab_store = self.rgba_to_laba(store); + let mut new_target_store = ImageStoreMut::alloc(new_size.width, new_size.height); - let pool = self - .scaler - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - let mut has_alpha_premultiplied = false; - - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } - let lab_store = self.rgba_to_laba(src_store); - let new_store = self - .scaler - .resize_rgba_f32_impl(new_size, lab_store, false, &pool)?; - let mut rgba_store = self.laba_to_srgba(new_store); - if premultiply_alpha && has_alpha_premultiplied { - rgba_store.unpremultiply_alpha(&pool); - } - Ok(rgba_store) + self.scaler + .resize_rgba_f32(&lab_store, &mut new_target_store, premultiply_alpha)?; + self.laba_to_srgba(&new_target_store, into); + Ok(()) } } diff --git a/src/colors/lab_scaler.rs b/src/colors/lab_scaler.rs index 0ea7d70..0878451 100644 --- a/src/colors/lab_scaler.rs +++ b/src/colors/lab_scaler.rs @@ -27,17 +27,15 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +use crate::pic_scale_error::PicScaleError; +use crate::scaler::{Scaling, ScalingF32}; +use crate::support::check_image_size_overflow; +use crate::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, ThreadingPolicy}; use colorutils_rs::{ lab_to_srgb, lab_with_alpha_to_rgba, rgb_to_lab, rgba_to_lab_with_alpha, TransferFunction, SRGB_TO_XYZ_D65, XYZ_TO_SRGB_D65, }; -use crate::alpha_check::has_non_constant_cap_alpha_rgba8; -use crate::pic_scale_error::PicScaleError; -use crate::scaler::{Scaling, ScalingF32}; -use crate::support::check_image_size_overflow; -use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, ThreadingPolicy}; - #[derive(Debug, Copy, Clone)] /// Converts image to *CIE LAB* components scales it and convert back pub struct LabScaler { @@ -51,35 +49,40 @@ impl LabScaler { } } - fn rgba_to_laba(store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); + fn rgba_to_laba<'a>(store: &ImageStore<'a, u8, 4>) -> ImageStore<'a, f32, 4> { + let mut source_slice = vec![f32::default(); 4 * store.width * store.height]; let lab_stride = store.width as u32 * 4u32 * std::mem::size_of::() as u32; rgba_to_lab_with_alpha( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * 4u32, - new_store.buffer.borrow_mut(), + &mut source_slice, lab_stride, store.width as u32, store.height as u32, &SRGB_TO_XYZ_D65, TransferFunction::Srgb, ); + let new_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(source_slice), + channels: 4, + width: store.width, + height: store.height, + bit_depth: store.bit_depth, + }; new_store } - fn laba_to_srgba(store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); + fn laba_to_srgba<'a>(store: &ImageStoreMut<'a, f32, 4>, into: &mut ImageStoreMut<'a, u8, 4>) { lab_with_alpha_to_rgba( store.buffer.borrow(), store.width as u32 * 4u32 * std::mem::size_of::() as u32, - new_store.buffer.borrow_mut(), + into.buffer.borrow_mut(), store.width as u32 * 4u32, store.width as u32, store.height as u32, &XYZ_TO_SRGB_D65, TransferFunction::Srgb, ); - new_store } } @@ -88,11 +91,12 @@ impl Scaling for LabScaler { self.scaler.threading_policy = threading_policy; } - fn resize_rgb( + fn resize_rgb<'a>( &self, - new_size: ImageSize, - store: ImageStore, - ) -> Result, PicScaleError> { + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -106,14 +110,23 @@ impl Scaling for LabScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } + const COMPONENTS: usize = 3; - let mut lab_store = ImageStore::::alloc(store.width, store.height); + + let mut target = vec![f32::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = + ImageStoreMut::::from_slice(&mut target, store.width, store.height)?; + lab_store.bit_depth = into.bit_depth; + let lab_stride = lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + rgb_to_lab( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * COMPONENTS as u32, lab_store.buffer.borrow_mut(), lab_stride, @@ -122,27 +135,39 @@ impl Scaling for LabScaler { &SRGB_TO_XYZ_D65, TransferFunction::Srgb, ); - let new_store = self.scaler.resize_rgb_f32(new_size, 
lab_store)?; - let mut new_u8_store = ImageStore::::alloc(new_size.width, new_size.height); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgb_f32(&new_immutable_store, &mut new_store)?; + let new_lab_stride = new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; lab_to_srgb( new_store.buffer.borrow(), new_lab_stride, - new_u8_store.buffer.borrow_mut(), - new_u8_store.width as u32 * COMPONENTS as u32, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, new_store.width as u32, new_store.height as u32, ); - Ok(new_u8_store) + Ok(()) } fn resize_rgba<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -156,36 +181,16 @@ impl Scaling for LabScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - let mut src_store = store; - - let pool = self - .scaler - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - let mut has_alpha_premultiplied = false; + let lab_store = Self::rgba_to_laba(store); + let mut new_target_store = ImageStoreMut::alloc(new_size.width, new_size.height); - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } - let lab_store = Self::rgba_to_laba(src_store); - let new_store = self - .scaler - .resize_rgba_f32_impl(new_size, lab_store, false, &pool)?; - let mut rgba_store = Self::laba_to_srgba(new_store); - if premultiply_alpha && has_alpha_premultiplied { - rgba_store.unpremultiply_alpha(&pool); - } - Ok(rgba_store) + self.scaler + .resize_rgba_f32(&lab_store, &mut new_target_store, premultiply_alpha)?; + Self::laba_to_srgba(&new_target_store, into); + Ok(()) } } diff --git a/src/colors/lch_scaler.rs b/src/colors/lch_scaler.rs index 96fc290..00d9812 100644 --- a/src/colors/lch_scaler.rs +++ b/src/colors/lch_scaler.rs @@ -32,11 +32,10 @@ use colorutils_rs::{ SRGB_TO_XYZ_D65, XYZ_TO_SRGB_D65, }; -use crate::alpha_check::has_non_constant_cap_alpha_rgba8; use crate::pic_scale_error::PicScaleError; use crate::scaler::ScalingF32; use crate::support::check_image_size_overflow; -use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; +use crate::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; #[derive(Debug, Copy, Clone)] /// Converts image to *CIE LCH(uv)* components scales it and convert back @@ -51,35 +50,40 @@ impl LChScaler { } } - fn rgba_to_lcha(store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); + fn rgba_to_lcha<'a>(store: &ImageStore<'a, u8, 4>) -> ImageStore<'a, 
f32, 4> { + let mut source_slice = vec![f32::default(); 4 * store.width * store.height]; let lab_stride = store.width as u32 * 4u32 * std::mem::size_of::() as u32; rgba_to_lch_with_alpha( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * 4u32, - new_store.buffer.borrow_mut(), + &mut source_slice, lab_stride, store.width as u32, store.height as u32, &SRGB_TO_XYZ_D65, TransferFunction::Srgb, ); + let new_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(source_slice), + channels: 4, + width: store.width, + height: store.height, + bit_depth: store.bit_depth, + }; new_store } - fn lcha_to_srgba(store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); + fn lcha_to_srgba<'a>(store: &ImageStoreMut<'a, f32, 4>, into: &mut ImageStoreMut<'a, u8, 4>) { lch_with_alpha_to_rgba( store.buffer.borrow(), store.width as u32 * 4u32 * std::mem::size_of::() as u32, - new_store.buffer.borrow_mut(), + into.buffer.borrow_mut(), store.width as u32 * 4u32, store.width as u32, store.height as u32, &XYZ_TO_SRGB_D65, TransferFunction::Srgb, ); - new_store } } @@ -88,11 +92,12 @@ impl Scaling for LChScaler { self.scaler.set_threading_policy(threading_policy) } - fn resize_rgb( - &self, - new_size: ImageSize, - store: ImageStore, - ) -> Result, PicScaleError> { + fn resize_rgb<'a>( + &'a self, + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -106,15 +111,25 @@ impl Scaling for LChScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } const COMPONENTS: usize = 3; - let mut lab_store = ImageStore::::alloc(store.width, store.height); + + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; + let lab_stride = lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; rgb_to_lch( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * COMPONENTS as u32, lab_store.buffer.borrow_mut(), lab_stride, @@ -123,29 +138,42 @@ impl Scaling for LChScaler { &SRGB_TO_XYZ_D65, TransferFunction::Srgb, ); - let new_store = self.scaler.resize_rgb_f32(new_size, lab_store)?; - let mut new_u8_store = ImageStore::::alloc(new_size.width, new_size.height); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgb_f32(&new_immutable_store, &mut new_store)?; + let new_lab_stride = new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + lch_to_rgb( new_store.buffer.borrow(), new_lab_stride, - new_u8_store.buffer.borrow_mut(), - new_u8_store.width as u32 * COMPONENTS as u32, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, new_store.width as u32, new_store.height as u32, &XYZ_TO_SRGB_D65, TransferFunction::Srgb, ); - Ok(new_u8_store) + Ok(()) } fn resize_rgba<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: 
&ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -159,36 +187,16 @@ impl Scaling for LChScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - let mut src_store = store; - - let pool = self - .scaler - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); + let lab_store = Self::rgba_to_lcha(store); + let mut new_target_store = ImageStoreMut::alloc(new_size.width, new_size.height); - let mut has_alpha_premultiplied = false; - - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } - let lab_store = Self::rgba_to_lcha(src_store); - let new_store = self - .scaler - .resize_rgba_f32_impl(new_size, lab_store, false, &pool)?; - let mut rgba_store = Self::lcha_to_srgba(new_store); - if premultiply_alpha && has_alpha_premultiplied { - rgba_store.unpremultiply_alpha(&pool); - } - Ok(rgba_store) + self.scaler + .resize_rgba_f32(&lab_store, &mut new_target_store, premultiply_alpha)?; + Self::lcha_to_srgba(&new_target_store, into); + Ok(()) } } diff --git a/src/colors/linear_precise_scaler.rs b/src/colors/linear_precise_scaler.rs index 2ab3256..03f11d8 100644 --- a/src/colors/linear_precise_scaler.rs +++ b/src/colors/linear_precise_scaler.rs @@ -27,11 +27,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -use crate::alpha_check::has_non_constant_cap_alpha_rgba8; use crate::pic_scale_error::PicScaleError; use crate::scaler::{Scaling, ScalingF32}; use crate::support::check_image_size_overflow; -use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, ThreadingPolicy}; +use crate::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, ThreadingPolicy}; use colorutils_rs::{ linear_to_rgb, linear_to_rgba, rgb_to_linear, rgba_to_linear, TransferFunction, }; @@ -63,35 +62,6 @@ impl LinearScaler { transfer_function, } } - - fn rgba_to_linear(&self, store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); - let lab_stride = store.width as u32 * 4u32 * std::mem::size_of::() as u32; - rgba_to_linear( - store.buffer.borrow(), - store.width as u32 * 4u32, - new_store.buffer.borrow_mut(), - lab_stride, - store.width as u32, - store.height as u32, - self.transfer_function, - ); - new_store - } - - fn linear_to_rgba(&self, store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); - linear_to_rgba( - store.buffer.borrow(), - store.width as u32 * 4u32 * std::mem::size_of::() as u32, - new_store.buffer.borrow_mut(), - store.width as u32 * 4u32, - store.width as u32, - store.height as u32, - self.transfer_function, - ); - new_store - } } impl Scaling for LinearScaler { @@ -99,11 +69,12 @@ impl Scaling for LinearScaler { self.scaler.threading_policy = threading_policy; } - fn resize_rgb( - &self, - new_size: ImageSize, - store: ImageStore, - ) -> Result, PicScaleError> { + fn resize_rgb<'a>( + &'a self, + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -117,15 +88,26 @@ impl Scaling for LinearScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } const COMPONENTS: usize = 3; - let mut lab_store = ImageStore::::alloc(store.width, store.height); + + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; + let lab_stride = lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + rgb_to_linear( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * COMPONENTS as u32, lab_store.buffer.borrow_mut(), lab_stride, @@ -133,28 +115,40 @@ impl Scaling for LinearScaler { lab_store.height as u32, self.transfer_function, ); - let new_store = self.scaler.resize_rgb_f32(new_size, lab_store)?; - let mut new_u8_store = ImageStore::::alloc(new_size.width, new_size.height); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + + self.scaler + .resize_rgb_f32(&new_immutable_store, &mut new_store)?; let new_lab_stride = new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; linear_to_rgb( new_store.buffer.borrow(), new_lab_stride, - new_u8_store.buffer.borrow_mut(), - new_u8_store.width as u32 * COMPONENTS as u32, + into.buffer.borrow_mut(), 
+ into.width as u32 * COMPONENTS as u32, new_store.width as u32, new_store.height as u32, self.transfer_function, ); - Ok(new_u8_store) + Ok(()) } fn resize_rgba<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -168,36 +162,55 @@ impl Scaling for LinearScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - let mut src_store = store; + const COMPONENTS: usize = 4; - let pool = self - .scaler - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); + let mut target = vec![f32::default(); store.width * store.height * COMPONENTS]; - let mut has_alpha_premultiplied = false; + let mut lab_store = + ImageStoreMut::::from_slice(&mut target, store.width, store.height)?; + lab_store.bit_depth = into.bit_depth; - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } - let lab_store = self.rgba_to_linear(src_store); - let new_store = self - .scaler - .resize_rgba_f32_impl(new_size, lab_store, false, &pool)?; - let mut rgba_store = self.linear_to_rgba(new_store); - if premultiply_alpha && has_alpha_premultiplied { - rgba_store.unpremultiply_alpha(&pool); - } - Ok(rgba_store) + let lab_stride = + lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + rgba_to_linear( + store.buffer.as_ref(), + store.width as u32 * COMPONENTS as u32, + lab_store.buffer.borrow_mut(), + lab_stride, + lab_store.width as u32, + lab_store.height as u32, + TransferFunction::Srgb, + ); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgba_f32(&new_immutable_store, &mut new_store, premultiply_alpha)?; + + let new_lab_stride = + new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + + linear_to_rgba( + new_store.buffer.borrow(), + new_lab_stride, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, + new_store.width as u32, + new_store.height as u32, + TransferFunction::Srgb, + ); + + Ok(()) } } diff --git a/src/colors/linear_scaler.rs b/src/colors/linear_scaler.rs index 230cda6..427cc38 100644 --- a/src/colors/linear_scaler.rs +++ b/src/colors/linear_scaler.rs @@ -31,11 +31,10 @@ use colorutils_rs::{ linear_u8_to_rgb, linear_u8_to_rgba, rgb_to_linear_u8, rgba_to_linear_u8, TransferFunction, }; -use crate::alpha_check::has_non_constant_cap_alpha_rgba8; use crate::pic_scale_error::PicScaleError; use crate::scaler::Scaling; use crate::support::check_image_size_overflow; -use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, ThreadingPolicy}; +use crate::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, 
ThreadingPolicy}; #[derive(Debug, Copy, Clone)] /// Linearize image into u8, scale and then convert it back. It's much faster than scale in f32, however involves some precision loss @@ -70,11 +69,12 @@ impl Scaling for LinearApproxScaler { self.scaler.threading_policy = threading_policy; } - fn resize_rgb( + fn resize_rgb<'a>( &self, - new_size: ImageSize, - store: ImageStore, - ) -> Result, PicScaleError> { + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -88,42 +88,67 @@ impl Scaling for LinearApproxScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - const CHANNELS: usize = 3; - let mut linear_store = ImageStore::::alloc(store.width, store.height); + const COMPONENTS: usize = 3; + + let mut target_vertical = vec![u8::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; + + let lab_stride = + lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + rgb_to_linear_u8( - store.buffer.borrow(), - store.width as u32 * CHANNELS as u32, - linear_store.buffer.borrow_mut(), - linear_store.width as u32 * CHANNELS as u32, - linear_store.width as u32, - linear_store.height as u32, + store.buffer.as_ref(), + store.width as u32 * COMPONENTS as u32, + lab_store.buffer.borrow_mut(), + lab_stride, + lab_store.width as u32, + lab_store.height as u32, self.transfer_function, ); - let new_store = self.scaler.resize_rgb(new_size, linear_store)?; - let mut gamma_store = ImageStore::::alloc(new_store.width, new_store.height); - let src = new_store.buffer.borrow(); - let gamma_buffer = gamma_store.buffer.borrow_mut(); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + + self.scaler + .resize_rgb(&new_immutable_store, &mut new_store)?; + let new_lab_stride = + new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; linear_u8_to_rgb( - src, - new_store.width as u32 * CHANNELS as u32, - gamma_buffer, - gamma_store.width as u32 * CHANNELS as u32, - gamma_store.width as u32, - gamma_store.height as u32, + new_store.buffer.borrow(), + new_lab_stride, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, + new_store.width as u32, + new_store.height as u32, self.transfer_function, ); - Ok(gamma_store) + Ok(()) } fn resize_rgba<'a>( &self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -137,58 +162,57 @@ impl Scaling for LinearApproxScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - const CHANNELS: usize = 4; - 
let mut src_store = store; - - let pool = self - .scaler - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - let mut has_alpha_premultiplied = false; - - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } + const COMPONENTS: usize = 4; + + let mut target_vertical = vec![u8::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; + + let lab_stride = + lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; - let mut linear_store = ImageStore::::alloc(src_store.width, src_store.height); rgba_to_linear_u8( - src_store.buffer.borrow(), - src_store.width as u32 * CHANNELS as u32, - linear_store.buffer.borrow_mut(), - linear_store.width as u32 * CHANNELS as u32, - linear_store.width as u32, - linear_store.height as u32, + store.buffer.as_ref(), + store.width as u32 * COMPONENTS as u32, + lab_store.buffer.borrow_mut(), + lab_stride, + lab_store.width as u32, + lab_store.height as u32, self.transfer_function, ); - let new_store = self - .scaler - .resize_rgba_impl(new_size, linear_store, false, &pool)?; - let mut gamma_store = ImageStore::::alloc(new_store.width, new_store.height); - let src = new_store.buffer.borrow(); - let gamma_buffer = gamma_store.buffer.borrow_mut(); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + + self.scaler + .resize_rgba(&new_immutable_store, &mut new_store, premultiply_alpha)?; + let new_lab_stride = + new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; linear_u8_to_rgba( - src, - new_store.width as u32 * CHANNELS as u32, - gamma_buffer, - gamma_store.width as u32 * CHANNELS as u32, - gamma_store.width as u32, - gamma_store.height as u32, + new_store.buffer.borrow(), + new_lab_stride, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, + new_store.width as u32, + new_store.height as u32, self.transfer_function, ); - if premultiply_alpha && has_alpha_premultiplied { - gamma_store.unpremultiply_alpha(&pool); - } - Ok(gamma_store) + Ok(()) } } diff --git a/src/colors/luv_scaler.rs b/src/colors/luv_scaler.rs index 3470c5c..4528a71 100644 --- a/src/colors/luv_scaler.rs +++ b/src/colors/luv_scaler.rs @@ -32,11 +32,10 @@ use colorutils_rs::{ SRGB_TO_XYZ_D65, XYZ_TO_SRGB_D65, }; -use crate::alpha_check::has_non_constant_cap_alpha_rgba8; use crate::pic_scale_error::PicScaleError; use crate::scaler::ScalingF32; use crate::support::check_image_size_overflow; -use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; +use crate::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; #[derive(Debug, Copy, Clone)] /// Converts image to *CIE LUV* components scales it and convert back @@ -50,37 +49,6 @@ impl LuvScaler { scaler: Scaler::new(filter), } } - - fn rgba_to_laba(store: ImageStore) -> ImageStore { - let mut new_store 
= ImageStore::::alloc(store.width, store.height); - let lab_stride = store.width as u32 * 4u32 * std::mem::size_of::() as u32; - rgba_to_luv_with_alpha( - store.buffer.borrow(), - store.width as u32 * 4u32, - new_store.buffer.borrow_mut(), - lab_stride, - store.width as u32, - store.height as u32, - &SRGB_TO_XYZ_D65, - TransferFunction::Srgb, - ); - new_store - } - - fn laba_to_srgba(store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); - luv_with_alpha_to_rgba( - store.buffer.borrow(), - store.width as u32 * 4u32 * std::mem::size_of::() as u32, - new_store.buffer.borrow_mut(), - store.width as u32 * 4u32, - store.width as u32, - store.height as u32, - &XYZ_TO_SRGB_D65, - TransferFunction::Srgb, - ); - new_store - } } impl Scaling for LuvScaler { @@ -88,11 +56,12 @@ impl Scaling for LuvScaler { self.scaler.set_threading_policy(threading_policy) } - fn resize_rgb( + fn resize_rgb<'a>( &self, - new_size: ImageSize, - store: ImageStore, - ) -> Result, PicScaleError> { + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -106,15 +75,25 @@ impl Scaling for LuvScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } const COMPONENTS: usize = 3; - let mut lab_store = ImageStore::::alloc(store.width, store.height); + + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; + let lab_stride = lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; rgb_to_luv( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * COMPONENTS as u32, lab_store.buffer.borrow_mut(), lab_stride, @@ -123,29 +102,42 @@ impl Scaling for LuvScaler { &SRGB_TO_XYZ_D65, TransferFunction::Srgb, ); - let new_store = self.scaler.resize_rgb_f32(new_size, lab_store)?; - let mut new_u8_store = ImageStore::::alloc(new_size.width, new_size.height); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgb_f32(&new_immutable_store, &mut new_store)?; + let new_lab_stride = new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + luv_to_rgb( new_store.buffer.borrow(), new_lab_stride, - new_u8_store.buffer.borrow_mut(), - new_u8_store.width as u32 * COMPONENTS as u32, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, new_store.width as u32, new_store.height as u32, &XYZ_TO_SRGB_D65, TransferFunction::Srgb, ); - Ok(new_u8_store) + Ok(()) } fn resize_rgba<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -159,36 
+151,59 @@ impl Scaling for LuvScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - let mut src_store = store; + const COMPONENTS: usize = 4; - let pool = self - .scaler - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; - let mut has_alpha_premultiplied = false; + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } - let lab_store = Self::rgba_to_laba(src_store); - let new_store = self - .scaler - .resize_rgba_f32_impl(new_size, lab_store, false, &pool)?; - let mut rgba_store = Self::laba_to_srgba(new_store); - if premultiply_alpha && has_alpha_premultiplied { - rgba_store.unpremultiply_alpha(&pool); - } - Ok(rgba_store) + let lab_stride = + lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + rgba_to_luv_with_alpha( + store.buffer.as_ref(), + store.width as u32 * COMPONENTS as u32, + lab_store.buffer.borrow_mut(), + lab_stride, + lab_store.width as u32, + lab_store.height as u32, + &SRGB_TO_XYZ_D65, + TransferFunction::Srgb, + ); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgba_f32(&new_immutable_store, &mut new_store, premultiply_alpha)?; + + let new_lab_stride = + new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + + luv_with_alpha_to_rgba( + new_store.buffer.borrow(), + new_lab_stride, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, + new_store.width as u32, + new_store.height as u32, + &XYZ_TO_SRGB_D65, + TransferFunction::Srgb, + ); + Ok(()) } } diff --git a/src/colors/oklab_scaler.rs b/src/colors/oklab_scaler.rs index d37bfdf..1a49697 100644 --- a/src/colors/oklab_scaler.rs +++ b/src/colors/oklab_scaler.rs @@ -28,11 +28,10 @@ */ use colorutils_rs::{oklab_to_rgb, oklab_to_rgba, rgb_to_oklab, rgba_to_oklab, TransferFunction}; -use crate::alpha_check::has_non_constant_cap_alpha_rgba8; use crate::pic_scale_error::PicScaleError; use crate::scaler::ScalingF32; use crate::support::check_image_size_overflow; -use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; +use crate::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; #[derive(Debug, Copy, Clone)] /// Converts image to *Oklab* components scales it and convert back @@ -50,35 +49,6 @@ impl OklabScaler { transfer_function, } } - - fn rgba_to_laba(&self, store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); - let lab_stride = store.width as u32 * 4u32 * std::mem::size_of::() as u32; - rgba_to_oklab( - store.buffer.borrow(), - store.width as u32 * 4u32, - new_store.buffer.borrow_mut(), - 
lab_stride, - store.width as u32, - store.height as u32, - self.transfer_function, - ); - new_store - } - - fn laba_to_srgba(&self, store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); - oklab_to_rgba( - store.buffer.borrow(), - store.width as u32 * 4u32 * std::mem::size_of::() as u32, - new_store.buffer.borrow_mut(), - store.width as u32 * 4u32, - store.width as u32, - store.height as u32, - self.transfer_function, - ); - new_store - } } impl Scaling for OklabScaler { @@ -86,11 +56,12 @@ impl Scaling for OklabScaler { self.scaler.threading_policy = threading_policy; } - fn resize_rgb( + fn resize_rgb<'a>( &self, - new_size: ImageSize, - store: ImageStore, - ) -> Result, PicScaleError> { + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -104,15 +75,25 @@ impl Scaling for OklabScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } const COMPONENTS: usize = 3; - let mut lab_store = ImageStore::::alloc(store.width, store.height); + + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; + let lab_stride = lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; rgb_to_oklab( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * COMPONENTS as u32, lab_store.buffer.borrow_mut(), lab_stride, @@ -120,28 +101,41 @@ impl Scaling for OklabScaler { lab_store.height as u32, self.transfer_function, ); - let new_store = self.scaler.resize_rgb_f32(new_size, lab_store)?; - let mut new_u8_store = ImageStore::::alloc(new_size.width, new_size.height); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgb_f32(&new_immutable_store, &mut new_store)?; + let new_lab_stride = new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + oklab_to_rgb( new_store.buffer.borrow(), new_lab_stride, - new_u8_store.buffer.borrow_mut(), - new_u8_store.width as u32 * COMPONENTS as u32, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, new_store.width as u32, new_store.height as u32, self.transfer_function, ); - Ok(new_u8_store) + Ok(()) } fn resize_rgba<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -155,36 +149,57 @@ impl Scaling for OklabScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - let mut src_store = store; + const COMPONENTS: usize = 4; - let pool = self - .scaler - 
.threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; - let mut has_alpha_premultiplied = false; + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } - let lab_store = self.rgba_to_laba(src_store); - let new_store = self - .scaler - .resize_rgba_f32_impl(new_size, lab_store, false, &pool)?; - let mut rgba_store = self.laba_to_srgba(new_store); - if premultiply_alpha && has_alpha_premultiplied { - rgba_store.unpremultiply_alpha(&pool); - } - Ok(rgba_store) + let lab_stride = + lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + rgba_to_oklab( + store.buffer.as_ref(), + store.width as u32 * COMPONENTS as u32, + lab_store.buffer.borrow_mut(), + lab_stride, + lab_store.width as u32, + lab_store.height as u32, + self.transfer_function, + ); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgba_f32(&new_immutable_store, &mut new_store, premultiply_alpha)?; + + let new_lab_stride = + new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + + oklab_to_rgba( + new_store.buffer.borrow(), + new_lab_stride, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, + new_store.width as u32, + new_store.height as u32, + self.transfer_function, + ); + Ok(()) } } diff --git a/src/colors/sigmoidal_scaler.rs b/src/colors/sigmoidal_scaler.rs index 69cae09..e1bf688 100644 --- a/src/colors/sigmoidal_scaler.rs +++ b/src/colors/sigmoidal_scaler.rs @@ -27,11 +27,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -use crate::alpha_check::has_non_constant_cap_alpha_rgba8; use crate::pic_scale_error::PicScaleError; use crate::scaler::ScalingF32; use crate::support::check_image_size_overflow; -use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; +use crate::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; use colorutils_rs::{rgb_to_sigmoidal, rgba_to_sigmoidal, sigmoidal_to_rgb, sigmoidal_to_rgba}; #[derive(Debug, Copy, Clone)] @@ -46,33 +45,6 @@ impl SigmoidalScaler { scaler: Scaler::new(filter), } } - - fn rgba_to_sigmoidal(store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); - let lab_stride = store.width as u32 * 4u32 * std::mem::size_of::() as u32; - rgba_to_sigmoidal( - store.buffer.borrow(), - store.width as u32 * 4u32, - new_store.buffer.borrow_mut(), - lab_stride, - store.width as u32, - store.height as u32, - ); - new_store - } - - fn sigmoidal_to_rgba(store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); - sigmoidal_to_rgba( - store.buffer.borrow(), - store.width as u32 * 4u32 * std::mem::size_of::() as u32, - new_store.buffer.borrow_mut(), - store.width as u32 * 4u32, - store.width as u32, - store.height as u32, - ); - new_store - } } impl Scaling for SigmoidalScaler { @@ -80,11 +52,12 @@ impl Scaling for SigmoidalScaler { self.scaler.set_threading_policy(threading_policy) } - fn resize_rgb( + fn resize_rgb<'a>( &self, - new_size: ImageSize, - store: ImageStore, - ) -> Result, PicScaleError> { + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -98,42 +71,65 @@ impl Scaling for SigmoidalScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } const COMPONENTS: usize = 3; - let mut lab_store = ImageStore::::alloc(store.width, store.height); + + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; + let lab_stride = lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; rgb_to_sigmoidal( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * COMPONENTS as u32, lab_store.buffer.borrow_mut(), lab_stride, lab_store.width as u32, lab_store.height as u32, ); - let new_store = self.scaler.resize_rgb_f32(new_size, lab_store)?; - let mut new_u8_store = ImageStore::::alloc(new_size.width, new_size.height); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgb_f32(&new_immutable_store, &mut new_store)?; + let new_lab_stride = new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + sigmoidal_to_rgb( new_store.buffer.borrow(), new_lab_stride, - new_u8_store.buffer.borrow_mut(), - new_u8_store.width as u32 * COMPONENTS as u32, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, new_store.width as u32, 
new_store.height as u32, ); - Ok(new_u8_store) + Ok(()) } fn resize_rgba<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -147,36 +143,55 @@ impl Scaling for SigmoidalScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - let mut src_store = store; + const COMPONENTS: usize = 4; - let pool = self - .scaler - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; - let mut has_alpha_premultiplied = false; + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } - let lab_store = Self::rgba_to_sigmoidal(src_store); - let new_store = self - .scaler - .resize_rgba_f32_impl(new_size, lab_store, false, &pool)?; - let mut rgba_store = Self::sigmoidal_to_rgba(new_store); - if premultiply_alpha && has_alpha_premultiplied { - rgba_store.unpremultiply_alpha(&pool); - } - Ok(rgba_store) + let lab_stride = + lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + rgba_to_sigmoidal( + store.buffer.as_ref(), + store.width as u32 * COMPONENTS as u32, + lab_store.buffer.borrow_mut(), + lab_stride, + lab_store.width as u32, + lab_store.height as u32, + ); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + self.scaler + .resize_rgba_f32(&new_immutable_store, &mut new_store, premultiply_alpha)?; + + let new_lab_stride = + new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + + sigmoidal_to_rgba( + new_store.buffer.borrow(), + new_lab_stride, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, + new_store.width as u32, + new_store.height as u32, + ); + Ok(()) } } diff --git a/src/colors/xyz_scaler.rs b/src/colors/xyz_scaler.rs index 75f5321..328eb48 100644 --- a/src/colors/xyz_scaler.rs +++ b/src/colors/xyz_scaler.rs @@ -32,11 +32,10 @@ use colorutils_rs::{ SRGB_TO_XYZ_D65, XYZ_TO_SRGB_D65, }; -use crate::alpha_check::has_non_constant_cap_alpha_rgba8; use crate::pic_scale_error::PicScaleError; use crate::scaler::{Scaling, ScalingF32}; use crate::support::check_image_size_overflow; -use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, ThreadingPolicy}; +use crate::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, ThreadingPolicy}; #[derive(Debug, Copy, Clone)] /// Converts image to CIE XYZ components scales it and convert back @@ -50,37 +49,6 @@ impl XYZScaler { 
scaler: Scaler::new(filter), } } - - fn rgba_to_xyz(store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); - let lab_stride = store.width as u32 * 4u32 * std::mem::size_of::() as u32; - rgba_to_xyz_with_alpha( - store.buffer.borrow(), - store.width as u32 * 4u32, - new_store.buffer.borrow_mut(), - lab_stride, - store.width as u32, - store.height as u32, - &SRGB_TO_XYZ_D65, - TransferFunction::Srgb, - ); - new_store - } - - fn xyz_to_srgba(store: ImageStore) -> ImageStore { - let mut new_store = ImageStore::::alloc(store.width, store.height); - xyz_with_alpha_to_rgba( - store.buffer.borrow(), - store.width as u32 * 4u32 * std::mem::size_of::() as u32, - new_store.buffer.borrow_mut(), - store.width as u32 * 4u32, - store.width as u32, - store.height as u32, - &XYZ_TO_SRGB_D65, - TransferFunction::Srgb, - ); - new_store - } } impl Scaling for XYZScaler { @@ -88,11 +56,12 @@ impl Scaling for XYZScaler { self.scaler.threading_policy = threading_policy; } - fn resize_rgb( - &self, - new_size: ImageSize, - store: ImageStore, - ) -> Result, PicScaleError> { + fn resize_rgb<'a>( + &'a self, + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -106,42 +75,65 @@ impl Scaling for XYZScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } const COMPONENTS: usize = 3; - let mut lab_store = ImageStore::::alloc(store.width, store.height); + + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; + + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; + let lab_stride = lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + srgb_to_xyz( - store.buffer.borrow(), + store.buffer.as_ref(), store.width as u32 * COMPONENTS as u32, lab_store.buffer.borrow_mut(), lab_stride, lab_store.width as u32, lab_store.height as u32, ); - let new_store = self.scaler.resize_rgb_f32(new_size, lab_store)?; - let mut new_u8_store = ImageStore::::alloc(new_size.width, new_size.height); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + + self.scaler + .resize_rgb_f32(&new_immutable_store, &mut new_store)?; let new_lab_stride = new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; xyz_to_srgb( new_store.buffer.borrow(), new_lab_stride, - new_u8_store.buffer.borrow_mut(), - new_u8_store.width as u32 * COMPONENTS as u32, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, new_store.width as u32, new_store.height as u32, ); - Ok(new_u8_store) + Ok(()) } fn resize_rgba<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { + ) -> Result<(), PicScaleError> { + let new_size = into.get_size(); if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return 
Err(PicScaleError::ZeroImageDimensions); } @@ -155,36 +147,59 @@ impl Scaling for XYZScaler { } if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } - let mut src_store = store; + const COMPONENTS: usize = 4; - let pool = self - .scaler - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); + let mut target_vertical = vec![f32::default(); store.width * store.height * COMPONENTS]; - let mut has_alpha_premultiplied = false; + let mut lab_store = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + store.height, + )?; + lab_store.bit_depth = into.bit_depth; - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } - let lab_store = Self::rgba_to_xyz(src_store); - let new_store = self - .scaler - .resize_rgba_f32_impl(new_size, lab_store, false, &pool)?; - let mut rgba_store = Self::xyz_to_srgba(new_store); - if premultiply_alpha && has_alpha_premultiplied { - rgba_store.unpremultiply_alpha(&pool); - } - Ok(rgba_store) + let lab_stride = + lab_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + + rgba_to_xyz_with_alpha( + store.buffer.as_ref(), + store.width as u32 * COMPONENTS as u32, + lab_store.buffer.borrow_mut(), + lab_stride, + lab_store.width as u32, + lab_store.height as u32, + &SRGB_TO_XYZ_D65, + TransferFunction::Srgb, + ); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: COMPONENTS, + width: store.width, + height: store.height, + bit_depth: into.bit_depth, + }; + + let mut new_store = ImageStoreMut::::alloc(into.width, into.height); + + self.scaler + .resize_rgba_f32(&new_immutable_store, &mut new_store, premultiply_alpha)?; + let new_lab_stride = + new_store.width as u32 * COMPONENTS as u32 * std::mem::size_of::() as u32; + xyz_with_alpha_to_rgba( + new_store.buffer.borrow(), + new_lab_stride, + into.buffer.borrow_mut(), + into.width as u32 * COMPONENTS as u32, + new_store.width as u32, + new_store.height as u32, + &XYZ_TO_SRGB_D65, + TransferFunction::Srgb, + ); + Ok(()) } } diff --git a/src/convolution.rs b/src/convolution.rs index 944b4ae..8c67463 100644 --- a/src/convolution.rs +++ b/src/convolution.rs @@ -32,7 +32,7 @@ use rayon::ThreadPool; use std::fmt::Debug; use crate::filter_weights::FilterWeights; -use crate::ImageStore; +use crate::image_store::ImageStoreMut; pub(crate) trait HorizontalConvolutionPass where @@ -41,7 +41,7 @@ where fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ); } @@ -53,7 +53,7 @@ where fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ); } diff --git a/src/dispatch_group_f16.rs b/src/dispatch_group_f16.rs index dd811b4..03bd2d2 100644 --- a/src/dispatch_group_f16.rs +++ b/src/dispatch_group_f16.rs @@ -28,6 +28,7 @@ */ use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::image_store::ImageStoreMut; use crate::ImageStore; use half::f16; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; @@ 
-37,7 +38,7 @@ use rayon::ThreadPool; pub(crate) fn convolve_vertical_dispatch_f16( image_store: &ImageStore, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]), ) { @@ -57,7 +58,7 @@ pub(crate) fn convolve_vertical_dispatch_f16( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &filter_weights.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); }); }); @@ -71,7 +72,7 @@ pub(crate) fn convolve_vertical_dispatch_f16( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &filter_weights.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); }); } @@ -80,7 +81,7 @@ pub(crate) fn convolve_vertical_dispatch_f16( pub(crate) fn convolve_horizontal_dispatch_f16( image_store: &ImageStore, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), @@ -99,7 +100,7 @@ pub(crate) fn convolve_horizontal_dispatch_f16( if let Some(dispatcher) = dispatcher_4_rows { image_store .buffer - .borrow() + .as_ref() .par_chunks_exact(src_stride * 4) .zip( destination @@ -124,11 +125,11 @@ pub(crate) fn convolve_horizontal_dispatch_f16( let left_src_rows = if processed_4 { image_store .buffer - .borrow() + .as_ref() .chunks_exact(src_stride * 4) .remainder() } else { - image_store.buffer.borrow() + image_store.buffer.as_ref() }; let left_dst_rows = if processed_4 { destination @@ -152,7 +153,7 @@ pub(crate) fn convolve_horizontal_dispatch_f16( if let Some(dispatcher) = dispatcher_4_rows { for (src, dst) in image_store .buffer - .borrow() + .as_ref() .chunks_exact(src_stride * 4) .zip( destination @@ -177,11 +178,11 @@ pub(crate) fn convolve_horizontal_dispatch_f16( let left_src_rows = if processed_4 { image_store .buffer - .borrow() + .as_ref() .chunks_exact(src_stride * 4) .remainder() } else { - image_store.buffer.borrow() + image_store.buffer.as_ref() }; let left_dst_rows = if processed_4 { destination diff --git a/src/dispatch_group_f32.rs b/src/dispatch_group_f32.rs index f4b87e9..418afba 100644 --- a/src/dispatch_group_f32.rs +++ b/src/dispatch_group_f32.rs @@ -28,6 +28,7 @@ */ use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::image_store::ImageStoreMut; use crate::ImageStore; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; use rayon::prelude::{ParallelSlice, ParallelSliceMut}; @@ -36,7 +37,7 @@ use rayon::ThreadPool; pub(crate) fn convolve_vertical_dispatch_f32( image_store: &ImageStore, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]), ) { @@ -56,7 +57,7 @@ pub(crate) fn convolve_vertical_dispatch_f32( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &filter_weights.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); 
dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); }); }); @@ -70,7 +71,7 @@ pub(crate) fn convolve_vertical_dispatch_f32( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &filter_weights.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); }); } @@ -80,7 +81,7 @@ pub(crate) fn convolve_vertical_dispatch_f32( pub(crate) fn convolve_horizontal_dispatch_f32( image_store: &ImageStore, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, &[f32], usize, &mut [f32], usize), @@ -99,7 +100,7 @@ pub(crate) fn convolve_horizontal_dispatch_f32( if let Some(dispatcher) = dispatcher_4_rows { image_store .buffer - .borrow() + .as_ref() .par_chunks_exact(src_stride * 4) .zip( destination @@ -124,11 +125,11 @@ pub(crate) fn convolve_horizontal_dispatch_f32( let left_src_rows = if processed_4 { image_store .buffer - .borrow() + .as_ref() .chunks_exact(src_stride * 4) .remainder() } else { - image_store.buffer.borrow() + image_store.buffer.as_ref() }; let left_dst_rows = if processed_4 { destination @@ -152,7 +153,7 @@ pub(crate) fn convolve_horizontal_dispatch_f32( if let Some(dispatcher) = dispatcher_4_rows { for (src, dst) in image_store .buffer - .borrow() + .as_ref() .chunks_exact(src_stride * 4) .zip( destination @@ -177,11 +178,11 @@ pub(crate) fn convolve_horizontal_dispatch_f32( let left_src_rows = if processed_4 { image_store .buffer - .borrow() + .as_ref() .chunks_exact(src_stride * 4) .remainder() } else { - image_store.buffer.borrow() + image_store.buffer.as_ref() }; let left_dst_rows = if processed_4 { destination diff --git a/src/dispatch_group_u16.rs b/src/dispatch_group_u16.rs index 0e16934..3ff1c37 100644 --- a/src/dispatch_group_u16.rs +++ b/src/dispatch_group_u16.rs @@ -32,6 +32,7 @@ use crate::handler_provider::{ ColumnHandlerFixedPoint, ColumnHandlerFloatingPoint, RowHandlerFixedPoint, RowHandlerFloatingPoint, }; +use crate::image_store::ImageStoreMut; use crate::support::PRECISION; use crate::ImageStore; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; @@ -42,10 +43,10 @@ use rayon::ThreadPool; pub(crate) fn convolve_horizontal_dispatch_u16( image_store: &ImageStore, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { - let src = image_store.buffer.borrow(); + let src = image_store.buffer.as_ref(); let dst = destination.buffer.borrow_mut(); let src_stride = image_store.width * image_store.channels; @@ -156,7 +157,7 @@ pub(crate) fn convolve_horizontal_dispatch_u16( pub(crate) fn convolve_vertical_dispatch_u16( image_store: &ImageStore, filter_weights: FilterWeights, - destination: &mut ImageStore<'_, u16, COMPONENTS>, + destination: &mut ImageStoreMut<'_, u16, COMPONENTS>, pool: &Option, ) { let src_stride = image_store.width * image_store.channels; @@ -176,7 +177,7 @@ pub(crate) fn convolve_vertical_dispatch_u16( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &filter_weights.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); u16::handle_floating_column( dst_width, &bounds, @@ -196,7 +197,7 @@ pub(crate) fn 
convolve_vertical_dispatch_u16( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &approx.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); u16::handle_fixed_column::( dst_width, &bounds, @@ -218,7 +219,7 @@ pub(crate) fn convolve_vertical_dispatch_u16( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &filter_weights.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); u16::handle_floating_column( dst_width, &bounds, @@ -239,7 +240,7 @@ pub(crate) fn convolve_vertical_dispatch_u16( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &approx.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); u16::handle_fixed_column::( dst_width, &bounds, diff --git a/src/dispatch_group_u8.rs b/src/dispatch_group_u8.rs index f78b6c4..89d9eaa 100644 --- a/src/dispatch_group_u8.rs +++ b/src/dispatch_group_u8.rs @@ -28,6 +28,7 @@ */ use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::image_store::ImageStoreMut; use crate::support::PRECISION; use crate::ImageStore; use rayon::iter::{IndexedParallelIterator, ParallelIterator}; @@ -39,14 +40,14 @@ use std::sync::Arc; pub(crate) fn convolve_horizontal_dispatch_u8( image_store: &ImageStore, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, dispatcher_4_rows: Option)>, dispatcher_1_row: fn(&[u8], &mut [u8], &FilterWeights), ) { let approx_weights = filter_weights.numerical_approximation_i16::(0); - let src = image_store.buffer.borrow(); + let src = image_store.buffer.as_ref(); let dst = destination.buffer.borrow_mut(); let src_stride = image_store.width * image_store.channels; @@ -100,7 +101,7 @@ pub(crate) fn convolve_horizontal_dispatch_u8( pub(crate) fn convolve_vertical_dispatch_u8<'a, const COMPONENTS: usize>( image_store: &ImageStore, filter_weights: FilterWeights, - destination: &mut ImageStore<'a, u8, COMPONENTS>, + destination: &mut ImageStoreMut<'a, u8, COMPONENTS>, pool: &Option, dispatcher: fn(usize, &FilterBounds, &[u8], &mut [u8], usize, &[i16]), ) { @@ -120,7 +121,7 @@ pub(crate) fn convolve_vertical_dispatch_u8<'a, const COMPONENTS: usize>( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &approx.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); }); }); @@ -134,7 +135,7 @@ pub(crate) fn convolve_vertical_dispatch_u8<'a, const COMPONENTS: usize>( let bounds = filter_weights.bounds[y]; let filter_offset = y * filter_weights.aligned_size; let weights = &approx.weights[filter_offset..]; - let source_buffer = image_store.buffer.borrow(); + let source_buffer = image_store.buffer.as_ref(); dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); }); } diff --git a/src/f16.rs b/src/f16.rs index 3e5c23c..a445a9e 100644 --- a/src/f16.rs +++ b/src/f16.rs @@ -43,6 +43,7 @@ use crate::floating_point_horizontal::{ convolve_row_handler_floating_point, convolve_row_handler_floating_point_4, }; use crate::floating_point_vertical::column_handler_floating_point; +use 
crate::image_store::ImageStoreMut; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::neon::{ convolve_horizontal_rgb_neon_row_one_f16, convolve_horizontal_rgb_neon_rows_4_f16, @@ -98,7 +99,7 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 4> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher_4_rows: Option< @@ -170,7 +171,7 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 4> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = @@ -214,7 +215,7 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 3> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher_4_rows: Option< @@ -266,7 +267,7 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 3> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = @@ -310,7 +311,7 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 1> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let _dispatcher_4_rows: Option< @@ -333,7 +334,7 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 1> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = diff --git a/src/image_store.rs b/src/image_store.rs index 16bd36f..2fc67c2 100644 --- a/src/image_store.rs +++ b/src/image_store.rs @@ -26,6 +26,10 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +use crate::alpha_check::{ + has_non_constant_cap_alpha_rgba16, has_non_constant_cap_alpha_rgba8, + has_non_constant_cap_alpha_rgba_f32, +}; #[cfg(feature = "half")] use crate::alpha_handle_f16::{premultiply_alpha_rgba_f16, unpremultiply_alpha_rgba_f16}; use crate::alpha_handle_f32::{premultiply_alpha_rgba_f32, unpremultiply_alpha_rgba_f32}; @@ -35,9 +39,10 @@ use crate::pic_scale_error::{PicScaleBufferMismatch, PicScaleError}; use crate::ImageSize; use num_traits::FromPrimitive; use rayon::ThreadPool; +use std::borrow::Cow; use std::fmt::Debug; -#[derive(Debug)] +#[derive(Debug, Clone)] /// Holds an image /// /// # Arguments @@ -51,7 +56,7 @@ pub struct ImageStore<'a, T, const N: usize> where T: FromPrimitive + Clone + Copy + Debug, { - pub(crate) buffer: BufferStore<'a, T>, + pub(crate) buffer: std::borrow::Cow<'a, [T]>, /// Channels in the image pub channels: usize, /// Image width @@ -62,6 +67,35 @@ where pub(crate) bit_depth: usize, } +#[derive(Debug)] +/// Holds an image +/// +/// # Arguments +/// `N` - count of channels +/// +/// # Examples +/// ImageStore - represents RGBA +/// ImageStore - represents RGB +/// ImageStore - represents RGB in f32 and etc +pub struct ImageStoreMut<'a, T, const N: usize> +where + T: FromPrimitive + Clone + Copy + Debug, +{ + pub(crate) buffer: BufferStore<'a, T>, + /// Channels in the image + pub channels: usize, + /// Image width + pub width: usize, + /// Image height + pub height: usize, + /// Required for `u16` images + pub bit_depth: usize, +} + +pub(crate) trait CheckStoreDensity { + fn should_have_bit_depth(&self) -> bool; +} + #[derive(Debug)] pub(crate) enum BufferStore<'a, T: Copy + Debug> { Borrowed(&'a mut [T]), @@ -84,7 +118,7 @@ impl BufferStore<'_, T> { } } -impl ImageStore<'static, T, N> +impl<'a, T, const N: usize> ImageStore<'a, T, N> where T: FromPrimitive + Clone + Copy + Debug + Default, { @@ -92,7 +126,7 @@ where slice_ref: Vec, width: usize, height: usize, - ) -> Result, PicScaleError> { + ) -> Result, PicScaleError> { let expected_size = width * height * N; if slice_ref.len() != width * height * N { return Err(PicScaleError::BufferMismatch(PicScaleBufferMismatch { @@ -104,7 +138,7 @@ where })); } Ok(ImageStore:: { - buffer: BufferStore::Owned(slice_ref), + buffer: std::borrow::Cow::Owned(slice_ref), channels: N, width, height, @@ -112,9 +146,93 @@ where }) } - pub fn alloc(width: usize, height: usize) -> ImageStore<'static, T, N> { + pub fn alloc(width: usize, height: usize) -> ImageStore<'a, T, N> { let vc = vec![T::from_u32(0).unwrap_or_default(); width * N * height]; ImageStore:: { + buffer: std::borrow::Cow::Owned(vc), + channels: N, + width, + height, + bit_depth: 0, + } + } +} + +impl CheckStoreDensity for ImageStoreMut<'_, u8, N> { + fn should_have_bit_depth(&self) -> bool { + false + } +} + +impl CheckStoreDensity for ImageStoreMut<'_, f32, N> { + fn should_have_bit_depth(&self) -> bool { + false + } +} + +#[cfg(feature = "half")] +impl CheckStoreDensity for ImageStoreMut<'_, half::f16, N> { + fn should_have_bit_depth(&self) -> bool { + false + } +} + +impl CheckStoreDensity for ImageStoreMut<'_, u16, N> { + fn should_have_bit_depth(&self) -> bool { + true + } +} + +impl ImageStoreMut<'_, T, N> +where + T: FromPrimitive + Clone + Copy + Debug + Default, +{ + pub(crate) fn validate(&self) -> Result<(), PicScaleError> { + let expected_size = self.width * self.height * N; + if self.buffer.borrow().len() != self.width * self.height * N { + return Err(PicScaleError::BufferMismatch(PicScaleBufferMismatch { + expected: 
expected_size, + width: self.width, + height: self.height, + channels: N, + slice_len: self.buffer.borrow().len(), + })); + } + Ok(()) + } +} + +impl<'a, T, const N: usize> ImageStoreMut<'a, T, N> +where + T: FromPrimitive + Clone + Copy + Debug + Default, +{ + pub fn new( + slice_ref: Vec, + width: usize, + height: usize, + ) -> Result, PicScaleError> { + let expected_size = width * height * N; + if slice_ref.len() != width * height * N { + return Err(PicScaleError::BufferMismatch(PicScaleBufferMismatch { + expected: expected_size, + width, + height, + channels: N, + slice_len: slice_ref.len(), + })); + } + Ok(ImageStoreMut:: { + buffer: BufferStore::Owned(slice_ref), + channels: N, + width, + height, + bit_depth: 0, + }) + } + + pub fn alloc(width: usize, height: usize) -> ImageStoreMut<'a, T, N> { + let vc = vec![T::from_u32(0).unwrap_or_default(); width * N * height]; + ImageStoreMut:: { buffer: BufferStore::Owned(vc), channels: N, width, @@ -122,6 +240,21 @@ where bit_depth: 0, } } + + pub fn alloc_with_depth( + width: usize, + height: usize, + bit_depth: usize, + ) -> ImageStoreMut<'a, T, N> { + let vc = vec![T::from_u32(0).unwrap_or_default(); width * N * height]; + ImageStoreMut:: { + buffer: BufferStore::Owned(vc), + channels: N, + width, + height, + bit_depth, + } + } } impl<'a, T, const N: usize> ImageStore<'a, T, N> @@ -134,13 +267,13 @@ where pub fn as_bytes(&self) -> &[T] { match &self.buffer { - BufferStore::Borrowed(p) => p, - BufferStore::Owned(v) => v, + Cow::Borrowed(br) => br.as_ref(), + Cow::Owned(v) => v.as_ref(), } } pub fn from_slice( - slice_ref: &'a mut [T], + slice_ref: &'a [T], width: usize, height: usize, ) -> Result, PicScaleError> { @@ -155,7 +288,7 @@ where })); } Ok(ImageStore:: { - buffer: BufferStore::Borrowed(slice_ref), + buffer: std::borrow::Cow::Borrowed(slice_ref), channels: N, width, height, @@ -165,68 +298,168 @@ where pub fn copied<'b>(&self) -> ImageStore<'b, T, N> { ImageStore:: { - buffer: BufferStore::Owned(self.buffer.borrow().to_vec()), + buffer: std::borrow::Cow::Owned(self.buffer.as_ref().to_vec()), channels: N, width: self.width, height: self.height, bit_depth: self.bit_depth, } } + + pub fn copied_to_mut<'b>(&self, into: &mut ImageStoreMut<'b, T, N>) { + for (&src, dst) in self.buffer.as_ref().iter().zip(into.buffer.borrow_mut()) { + *dst = src; + } + } } -impl ImageStore<'_, u8, 4> { - pub fn unpremultiply_alpha(&mut self, pool: &Option) { - let dst = self.buffer.borrow_mut(); - unpremultiply_alpha_rgba(dst, self.width, self.height, pool); +impl<'a, T, const N: usize> ImageStoreMut<'a, T, N> +where + T: FromPrimitive + Clone + Copy + Debug, +{ + pub fn get_size(&self) -> ImageSize { + ImageSize::new(self.width, self.height) + } + + pub fn as_bytes(&self) -> &[T] { + match &self.buffer { + BufferStore::Borrowed(p) => p, + BufferStore::Owned(v) => v, + } + } + + pub fn from_slice( + slice_ref: &'a mut [T], + width: usize, + height: usize, + ) -> Result, PicScaleError> { + let expected_size = width * height * N; + if slice_ref.len() != width * height * N { + return Err(PicScaleError::BufferMismatch(PicScaleBufferMismatch { + expected: expected_size, + width, + height, + channels: N, + slice_len: slice_ref.len(), + })); + } + Ok(ImageStoreMut:: { + buffer: BufferStore::Borrowed(slice_ref), + channels: N, + width, + height, + bit_depth: 0, + }) + } + + pub fn copied<'b>(&self) -> ImageStoreMut<'b, T, N> { + ImageStoreMut:: { + buffer: BufferStore::Owned(self.buffer.borrow().to_vec()), + channels: N, + width: self.width, + height: 
self.height, + bit_depth: self.bit_depth, + } } - pub fn premultiply_alpha(&self, into: &mut ImageStore<'_, u8, 4>, pool: &Option) { + pub fn to_immutable(&self) -> ImageStore<'_, T, N> { + ImageStore:: { + buffer: std::borrow::Cow::Owned(self.buffer.borrow().to_owned()), + channels: N, + width: self.width, + height: self.height, + bit_depth: self.bit_depth, + } + } +} + +pub(crate) trait AssociateAlpha { + fn premultiply_alpha(&self, into: &mut ImageStoreMut<'_, T, N>, pool: &Option); + fn is_alpha_premultiplication_needed(&self) -> bool; +} + +pub(crate) trait UnassociateAlpha { + fn unpremultiply_alpha(&mut self, pool: &Option); +} + +impl AssociateAlpha for ImageStore<'_, u8, 4> { + fn premultiply_alpha(&self, into: &mut ImageStoreMut<'_, u8, 4>, pool: &Option) { let dst = into.buffer.borrow_mut(); - let src = self.buffer.borrow(); + let src = self.buffer.as_ref(); premultiply_alpha_rgba(dst, src, self.width, self.height, pool); } + + fn is_alpha_premultiplication_needed(&self) -> bool { + has_non_constant_cap_alpha_rgba8(self.buffer.as_ref(), self.width) + } } -impl ImageStore<'_, u16, 4> { - pub fn unpremultiply_alpha(&mut self, pool: &Option) { - let in_place = self.buffer.borrow_mut(); - unpremultiply_alpha_rgba_u16(in_place, self.width, self.height, self.bit_depth, pool); +impl UnassociateAlpha for ImageStoreMut<'_, u8, 4> { + fn unpremultiply_alpha(&mut self, pool: &Option) { + let dst = self.buffer.borrow_mut(); + unpremultiply_alpha_rgba(dst, self.width, self.height, pool); } +} - pub fn premultiply_alpha(&self, into: &mut ImageStore<'_, u16, 4>, pool: &Option) { +impl AssociateAlpha for ImageStore<'_, u16, 4> { + fn premultiply_alpha(&self, into: &mut ImageStoreMut<'_, u16, 4>, pool: &Option) { let dst = into.buffer.borrow_mut(); - let src = self.buffer.borrow(); + let src = self.buffer.as_ref(); premultiply_alpha_rgba_u16(dst, src, self.width, self.height, self.bit_depth, pool); } -} -impl ImageStore<'_, f32, 4> { - pub fn unpremultiply_alpha(&mut self, pool: &Option) { - let dst = self.buffer.borrow_mut(); - unpremultiply_alpha_rgba_f32(dst, self.width, self.height, pool); + fn is_alpha_premultiplication_needed(&self) -> bool { + has_non_constant_cap_alpha_rgba16(self.buffer.as_ref(), self.width) } +} - pub fn premultiply_alpha(&self, into: &mut ImageStore<'_, f32, 4>, pool: &Option) { +impl AssociateAlpha for ImageStore<'_, f32, 4> { + fn premultiply_alpha(&self, into: &mut ImageStoreMut<'_, f32, 4>, pool: &Option) { let dst = into.buffer.borrow_mut(); - let src = self.buffer.borrow(); + let src = self.buffer.as_ref(); premultiply_alpha_rgba_f32(dst, src, self.width, self.height, pool); } -} -#[cfg(feature = "half")] -impl<'a> ImageStore<'a, half::f16, 4> { - pub fn unpremultiply_alpha(&mut self, pool: &Option) { - let dst = self.buffer.borrow_mut(); - unpremultiply_alpha_rgba_f16(dst, self.width, self.height, pool); + fn is_alpha_premultiplication_needed(&self) -> bool { + has_non_constant_cap_alpha_rgba_f32(self.buffer.as_ref(), self.width) } +} - pub fn premultiply_alpha( +#[cfg(feature = "half")] +impl AssociateAlpha for ImageStore<'_, half::f16, 4> { + fn premultiply_alpha( &self, - into: &mut ImageStore<'_, half::f16, 4>, + into: &mut ImageStoreMut<'_, half::f16, 4>, pool: &Option, ) { let dst = into.buffer.borrow_mut(); - let src = self.buffer.borrow(); + let src = self.buffer.as_ref(); premultiply_alpha_rgba_f16(dst, src, self.width, self.height, pool); } + + fn is_alpha_premultiplication_needed(&self) -> bool { + true + } +} + +impl UnassociateAlpha for 
ImageStoreMut<'_, u16, 4> { + fn unpremultiply_alpha(&mut self, pool: &Option) { + let in_place = self.buffer.borrow_mut(); + unpremultiply_alpha_rgba_u16(in_place, self.width, self.height, self.bit_depth, pool); + } +} + +impl UnassociateAlpha for ImageStoreMut<'_, f32, 4> { + fn unpremultiply_alpha(&mut self, pool: &Option) { + let dst = self.buffer.borrow_mut(); + unpremultiply_alpha_rgba_f32(dst, self.width, self.height, pool); + } +} + +#[cfg(feature = "half")] +impl UnassociateAlpha for ImageStoreMut<'_, half::f16, 4> { + fn unpremultiply_alpha(&mut self, pool: &Option) { + let dst = self.buffer.borrow_mut(); + unpremultiply_alpha_rgba_f16(dst, self.width, self.height, pool); + } } diff --git a/src/lib.rs b/src/lib.rs index 290b8d4..6591d36 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -97,7 +97,7 @@ pub use colors::*; #[cfg(feature = "colorspaces")] pub use colorutils_rs::TransferFunction; pub use image_size::ImageSize; -pub use image_store::ImageStore; +pub use image_store::{ImageStore, ImageStoreMut}; pub use math::*; pub use sampler::*; pub use scaler::Scaler; diff --git a/src/neon/utils.rs b/src/neon/utils.rs index a17f401..76c2381 100644 --- a/src/neon/utils.rs +++ b/src/neon/utils.rs @@ -104,6 +104,20 @@ pub(crate) unsafe fn prefer_vfmaq_f32( } } +#[inline(always)] +pub unsafe fn xvst1q_f32_x4(a: *mut f32, b: float32x4x4_t) { + vst1q_f32(a, b.0); + vst1q_f32(a.add(4), b.1); + vst1q_f32(a.add(8), b.2); + vst1q_f32(a.add(12), b.3); +} + +#[inline(always)] +pub unsafe fn xvst1q_f32_x2(a: *mut f32, b: float32x4x2_t) { + vst1q_f32(a, b.0); + vst1q_f32(a.add(4), b.1); +} + #[inline(always)] pub(crate) unsafe fn prefer_vfmaq_laneq_f32( a: float32x4_t, diff --git a/src/neon/vertical_f32.rs b/src/neon/vertical_f32.rs index 454892c..71226ef 100644 --- a/src/neon/vertical_f32.rs +++ b/src/neon/vertical_f32.rs @@ -27,8 +27,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ use crate::filter_weights::FilterBounds; -use crate::neon::utils::xvld1q_f32_x4; use crate::neon::utils::{prefer_vfmaq_f32, xvld1q_f32_x2}; +use crate::neon::utils::{xvld1q_f32_x4, xvst1q_f32_x2, xvst1q_f32_x4}; use std::arch::aarch64::*; macro_rules! conv_vertical_part_neon_16_f32 { @@ -56,7 +56,7 @@ macro_rules! conv_vertical_part_neon_16_f32 { let dst_ptr = $dst.get_unchecked_mut(px..).as_mut_ptr(); let f_set = float32x4x4_t(store_0, store_1, store_2, store_3); - vst1q_f32_x4(dst_ptr, f_set); + xvst1q_f32_x4(dst_ptr, f_set); } }}; } @@ -96,70 +96,77 @@ macro_rules! conv_vertical_part_neon_32_f32 { let dst_ptr = $dst.get_unchecked_mut(px..).as_mut_ptr(); let f_set = float32x4x4_t(store_0, store_1, store_2, store_3); - vst1q_f32_x4(dst_ptr, f_set); + xvst1q_f32_x4(dst_ptr, f_set); let f_set_1 = float32x4x4_t(store_4, store_5, store_6, store_7); - vst1q_f32_x4(dst_ptr.add(16), f_set_1); + xvst1q_f32_x4(dst_ptr.add(16), f_set_1); } }}; } -macro_rules! 
conv_vertical_part_neon_48_f32 { - ($start_y: expr, $start_x: expr, $src: expr, $src_stride: expr, $dst: expr, $filter: expr, $bounds: expr) => {{ - unsafe { - let mut store_0 = vdupq_n_f32(0.); - let mut store_1 = vdupq_n_f32(0.); - let mut store_2 = vdupq_n_f32(0.); - let mut store_3 = vdupq_n_f32(0.); - - let mut store_4 = vdupq_n_f32(0.); - let mut store_5 = vdupq_n_f32(0.); - let mut store_6 = vdupq_n_f32(0.); - let mut store_7 = vdupq_n_f32(0.); - - let mut store_8 = vdupq_n_f32(0.); - let mut store_9 = vdupq_n_f32(0.); - let mut store_10 = vdupq_n_f32(0.); - let mut store_11 = vdupq_n_f32(0.); - - let px = $start_x; - - for j in 0..$bounds.size { - let py = $start_y + j; - let v_weight = vld1q_dup_f32($filter.get_unchecked(j..).as_ptr()); - let src_ptr = $src.get_unchecked($src_stride * py + px..).as_ptr(); - - let item_row_0 = xvld1q_f32_x4(src_ptr); - let item_row_1 = xvld1q_f32_x4(src_ptr.add(16)); - let item_row_2 = xvld1q_f32_x4(src_ptr.add(32)); - - store_0 = prefer_vfmaq_f32(store_0, item_row_0.0, v_weight); - store_1 = prefer_vfmaq_f32(store_1, item_row_0.1, v_weight); - store_2 = prefer_vfmaq_f32(store_2, item_row_0.2, v_weight); - store_3 = prefer_vfmaq_f32(store_3, item_row_0.3, v_weight); - - store_4 = prefer_vfmaq_f32(store_4, item_row_1.0, v_weight); - store_5 = prefer_vfmaq_f32(store_5, item_row_1.1, v_weight); - store_6 = prefer_vfmaq_f32(store_6, item_row_1.2, v_weight); - store_7 = prefer_vfmaq_f32(store_7, item_row_1.3, v_weight); - - store_8 = prefer_vfmaq_f32(store_8, item_row_2.0, v_weight); - store_9 = prefer_vfmaq_f32(store_9, item_row_2.1, v_weight); - store_10 = prefer_vfmaq_f32(store_10, item_row_2.2, v_weight); - store_11 = prefer_vfmaq_f32(store_11, item_row_2.3, v_weight); - } +#[inline(always)] +fn conv_vertical_part_neon_48_f32( + start_y: usize, + start_x: usize, + src: &[f32], + src_stride: usize, + dst: &mut [f32], + filter: &[f32], + bounds: &FilterBounds, +) { + unsafe { + let mut store_0 = vdupq_n_f32(0.); + let mut store_1 = vdupq_n_f32(0.); + let mut store_2 = vdupq_n_f32(0.); + let mut store_3 = vdupq_n_f32(0.); + + let mut store_4 = vdupq_n_f32(0.); + let mut store_5 = vdupq_n_f32(0.); + let mut store_6 = vdupq_n_f32(0.); + let mut store_7 = vdupq_n_f32(0.); + + let mut store_8 = vdupq_n_f32(0.); + let mut store_9 = vdupq_n_f32(0.); + let mut store_10 = vdupq_n_f32(0.); + let mut store_11 = vdupq_n_f32(0.); + + let px = start_x; + + for j in 0..bounds.size { + let py = start_y + j; + let v_weight = vld1q_dup_f32(filter.get_unchecked(j..).as_ptr()); + let src_ptr = src.get_unchecked(src_stride * py + px..).as_ptr(); + + let item_row_0 = xvld1q_f32_x4(src_ptr); + let item_row_1 = xvld1q_f32_x4(src_ptr.add(16)); + let item_row_2 = xvld1q_f32_x4(src_ptr.add(32)); + + store_0 = prefer_vfmaq_f32(store_0, item_row_0.0, v_weight); + store_1 = prefer_vfmaq_f32(store_1, item_row_0.1, v_weight); + store_2 = prefer_vfmaq_f32(store_2, item_row_0.2, v_weight); + store_3 = prefer_vfmaq_f32(store_3, item_row_0.3, v_weight); + + store_4 = prefer_vfmaq_f32(store_4, item_row_1.0, v_weight); + store_5 = prefer_vfmaq_f32(store_5, item_row_1.1, v_weight); + store_6 = prefer_vfmaq_f32(store_6, item_row_1.2, v_weight); + store_7 = prefer_vfmaq_f32(store_7, item_row_1.3, v_weight); + + store_8 = prefer_vfmaq_f32(store_8, item_row_2.0, v_weight); + store_9 = prefer_vfmaq_f32(store_9, item_row_2.1, v_weight); + store_10 = prefer_vfmaq_f32(store_10, item_row_2.2, v_weight); + store_11 = prefer_vfmaq_f32(store_11, item_row_2.3, v_weight); + } - let dst_ptr = 
$dst.get_unchecked_mut(px..).as_mut_ptr(); - let f_set = float32x4x4_t(store_0, store_1, store_2, store_3); - vst1q_f32_x4(dst_ptr, f_set); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); + let f_set = float32x4x4_t(store_0, store_1, store_2, store_3); + xvst1q_f32_x4(dst_ptr, f_set); - let f_set_1 = float32x4x4_t(store_4, store_5, store_6, store_7); - vst1q_f32_x4(dst_ptr.add(16), f_set_1); + let f_set_1 = float32x4x4_t(store_4, store_5, store_6, store_7); + xvst1q_f32_x4(dst_ptr.add(16), f_set_1); - let f_set_2 = float32x4x4_t(store_8, store_9, store_10, store_11); - vst1q_f32_x4(dst_ptr.add(32), f_set_2); - } - }}; + let f_set_2 = float32x4x4_t(store_8, store_9, store_10, store_11); + xvst1q_f32_x4(dst_ptr.add(32), f_set_2); + } } #[inline(always)] @@ -191,7 +198,7 @@ unsafe fn convolve_vertical_part_neon_8_f32( let item = float32x4x2_t(store_0, store_1); let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); - vst1q_f32_x2(dst_ptr, item); + xvst1q_f32_x2(dst_ptr, item); } #[inline(always)] @@ -263,7 +270,7 @@ pub(crate) fn convolve_vertical_rgb_neon_row_f32( let dst_width = width * CHANNELS; while cx + 48 < dst_width { - conv_vertical_part_neon_48_f32!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); + conv_vertical_part_neon_48_f32(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 48; } diff --git a/src/plane_f32.rs b/src/plane_f32.rs index 61f533a..14df280 100644 --- a/src/plane_f32.rs +++ b/src/plane_f32.rs @@ -34,6 +34,7 @@ use crate::convolve_naive_f32::{ }; use crate::dispatch_group_f32::{convolve_horizontal_dispatch_f32, convolve_vertical_dispatch_f32}; use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::image_store::ImageStoreMut; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::neon::{ convolve_horizontal_plane_neon_row_one, convolve_horizontal_plane_neon_rows_4, @@ -54,7 +55,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 1> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher_4_rows: Option< @@ -93,7 +94,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 1> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = diff --git a/src/plane_u16.rs b/src/plane_u16.rs index 4d6fa7d..bb86038 100644 --- a/src/plane_u16.rs +++ b/src/plane_u16.rs @@ -30,6 +30,7 @@ use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u16::{convolve_horizontal_dispatch_u16, convolve_vertical_dispatch_u16}; use crate::filter_weights::FilterWeights; +use crate::image_store::ImageStoreMut; use crate::ImageStore; use rayon::ThreadPool; @@ -38,7 +39,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u16, 1> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, _pool: &Option, ) { convolve_horizontal_dispatch_u16(self, filter_weights, destination, _pool); @@ -49,7 +50,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u16, 1> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { convolve_vertical_dispatch_u16(self, filter_weights, destination, pool); diff --git a/src/plane_u8.rs b/src/plane_u8.rs 
index 0cd736e..b2d43b8 100644 --- a/src/plane_u8.rs +++ b/src/plane_u8.rs @@ -32,6 +32,7 @@ use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::handler_provider::{handle_fixed_column_u8, handle_fixed_row_u8}; +use crate::image_store::ImageStoreMut; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::neon::{convolve_horizontal_plane_neon_row, convolve_horizontal_plane_neon_rows_4_u8}; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] @@ -51,7 +52,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 1> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, _pool: &Option, ) { let mut _dispatcher_4_rows: Option< @@ -86,7 +87,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 1> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let _scale_factor = self.height as f32 / destination.height as f32; diff --git a/src/rgb_f32.rs b/src/rgb_f32.rs index 357d8ec..a3c896c 100644 --- a/src/rgb_f32.rs +++ b/src/rgb_f32.rs @@ -33,7 +33,7 @@ use crate::convolve_naive_f32::*; use crate::dispatch_group_f32::{convolve_horizontal_dispatch_f32, convolve_vertical_dispatch_f32}; use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::floating_point_vertical::column_handler_floating_point; -use crate::image_store::ImageStore; +use crate::image_store::{ImageStore, ImageStoreMut}; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::neon::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -56,7 +56,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 3> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher_4_rows: Option< @@ -95,7 +95,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 3> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = diff --git a/src/rgb_u16.rs b/src/rgb_u16.rs index ac33c17..d420454 100644 --- a/src/rgb_u16.rs +++ b/src/rgb_u16.rs @@ -30,6 +30,7 @@ use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u16::{convolve_horizontal_dispatch_u16, convolve_vertical_dispatch_u16}; use crate::filter_weights::FilterWeights; +use crate::image_store::ImageStoreMut; use crate::ImageStore; use rayon::ThreadPool; @@ -38,7 +39,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u16, 3> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, _pool: &Option, ) { convolve_horizontal_dispatch_u16(self, filter_weights, destination, _pool); @@ -49,7 +50,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u16, 3> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { convolve_vertical_dispatch_u16(self, filter_weights, destination, pool); diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs index 85ce1e5..8aff370 100644 --- a/src/rgb_u8.rs +++ b/src/rgb_u8.rs 
@@ -34,7 +34,7 @@ use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::handler_provider::{ handle_fixed_column_u8, handle_fixed_row_u8, handle_fixed_rows_4_u8, }; -use crate::image_store::ImageStore; +use crate::image_store::{ImageStore, ImageStoreMut}; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] use crate::neon::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -51,7 +51,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u8, 3> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher_4_rows: Option< @@ -86,7 +86,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 3> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let _scale_factor = self.height as f32 / destination.height as f32; diff --git a/src/rgba_f32.rs b/src/rgba_f32.rs index f2081a4..66c60cc 100644 --- a/src/rgba_f32.rs +++ b/src/rgba_f32.rs @@ -37,6 +37,7 @@ use crate::convolve_naive_f32::{ }; use crate::dispatch_group_f32::{convolve_horizontal_dispatch_f32, convolve_vertical_dispatch_f32}; use crate::filter_weights::*; +use crate::image_store::ImageStoreMut; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] use crate::neon::*; use crate::rgb_f32::convolve_vertical_rgb_native_row_f32; @@ -50,7 +51,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, f32, 4> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher_4_rows: Option< @@ -97,7 +98,7 @@ impl VerticalConvolutionPass for ImageStore<'_, f32, 4> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let mut _dispatcher: fn(usize, &FilterBounds, &[f32], &mut [f32], usize, &[f32]) = diff --git a/src/rgba_u16.rs b/src/rgba_u16.rs index 38532b3..613bc19 100644 --- a/src/rgba_u16.rs +++ b/src/rgba_u16.rs @@ -30,6 +30,7 @@ use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::dispatch_group_u16::{convolve_horizontal_dispatch_u16, convolve_vertical_dispatch_u16}; use crate::filter_weights::FilterWeights; +use crate::image_store::ImageStoreMut; use crate::ImageStore; use rayon::ThreadPool; @@ -38,7 +39,7 @@ impl HorizontalConvolutionPass for ImageStore<'_, u16, 4> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, _pool: &Option, ) { convolve_horizontal_dispatch_u16(self, filter_weights, destination, _pool); @@ -49,7 +50,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u16, 4> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { convolve_vertical_dispatch_u16(self, filter_weights, destination, pool); diff --git a/src/rgba_u8.rs b/src/rgba_u8.rs index 597ffe4..b4466ff 100644 --- a/src/rgba_u8.rs +++ b/src/rgba_u8.rs @@ -36,6 +36,7 @@ use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::handler_provider::{ handle_fixed_column_u8, handle_fixed_row_u8, handle_fixed_rows_4_u8, }; +use crate::image_store::ImageStoreMut; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::neon::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] @@ -54,7 +55,7 @@ impl 
HorizontalConvolutionPass for ImageStore<'_, u8, 4> { fn convolve_horizontal( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, _pool: &Option, ) { let _scale_factor = self.width as f32 / destination.width as f32; @@ -100,7 +101,7 @@ impl VerticalConvolutionPass for ImageStore<'_, u8, 4> { fn convolve_vertical( &self, filter_weights: FilterWeights, - destination: &mut ImageStore, + destination: &mut ImageStoreMut, pool: &Option, ) { let _scale_factor = self.height as f32 / destination.height as f32; diff --git a/src/scaler.rs b/src/scaler.rs index 513183a..70841d7 100644 --- a/src/scaler.rs +++ b/src/scaler.rs @@ -26,22 +26,20 @@ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -use crate::alpha_check::{ - has_non_constant_cap_alpha_rgba16, has_non_constant_cap_alpha_rgba8, - has_non_constant_cap_alpha_rgba_f32, -}; use crate::ar30::{Ar30ByteOrder, Rgb30}; use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::image_size::ImageSize; -use crate::image_store::ImageStore; +use crate::image_store::{ + AssociateAlpha, CheckStoreDensity, ImageStore, ImageStoreMut, UnassociateAlpha, +}; use crate::nearest_sampler::resize_nearest; use crate::pic_scale_error::PicScaleError; use crate::resize_ar30::resize_ar30_impl; use crate::support::check_image_size_overflow; use crate::threading_policy::ThreadingPolicy; use crate::{ConstPI, ConstSqrt2, Jinc, ResamplingFunction}; -use num_traits::{AsPrimitive, Float, Signed}; +use num_traits::{AsPrimitive, Float, FromPrimitive, Signed}; use rayon::ThreadPool; use std::fmt::Debug; use std::ops::{AddAssign, MulAssign, Neg}; @@ -59,34 +57,34 @@ pub trait Scaling { /// Performs rescaling for RGB, channel order does not matter fn resize_rgb<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 3>, - ) -> Result, PicScaleError>; + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError>; /// Performs rescaling for RGBA, for pre-multiplying alpha, converting to LUV or LAB alpha must be last channel fn resize_rgba<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError>; + ) -> Result<(), PicScaleError>; } pub trait ScalingF32 { /// Performs rescaling for RGB f32, channel order does not matter fn resize_rgb_f32<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, f32, 3>, - ) -> Result, PicScaleError>; + store: &ImageStore<'a, f32, 3>, + into: &mut ImageStoreMut<'a, f32, 3>, + ) -> Result<(), PicScaleError>; /// Performs rescaling for RGBA f32, alpha expected to be last fn resize_rgba_f32<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, f32, 4>, + store: &ImageStore<'a, f32, 4>, + into: &mut ImageStoreMut<'a, f32, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError>; + ) -> Result<(), PicScaleError>; } pub trait ScalingU16 { @@ -100,11 +98,10 @@ pub trait ScalingU16 { /// # Panics /// Panic if bit-depth < 1 or bit-depth > 16 fn resize_plane_u16<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, u16, 1>, - bit_depth: usize, - ) -> Result, PicScaleError>; + &'a self, + store: &ImageStore<'a, u16, 1>, + into: &mut ImageStoreMut<'a, u16, 1>, + ) -> Result<(), PicScaleError>; /// Performs 
rescaling for RGB, channel order does not matter /// @@ -116,11 +113,10 @@ pub trait ScalingU16 { /// # Panics /// Panic if bit-depth < 1 or bit-depth > 16 fn resize_rgb_u16<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, u16, 3>, - bit_depth: usize, - ) -> Result, PicScaleError>; + &'a self, + store: &ImageStore<'a, u16, 3>, + into: &mut ImageStoreMut<'a, u16, 3>, + ) -> Result<(), PicScaleError>; /// Performs rescaling for RGBA, for pre-multiplying alpha should be last /// @@ -133,12 +129,11 @@ pub trait ScalingU16 { /// # Panics /// Panic if bit-depth < 1 or bit-depth > 16 fn resize_rgba_u16<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, u16, 4>, - bit_depth: usize, + &'a self, + store: &ImageStore<'a, u16, 4>, + into: &mut ImageStoreMut<'a, u16, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError>; + ) -> Result<(), PicScaleError>; } impl Scaler { @@ -213,8 +208,8 @@ impl Scaler { let start: usize = (center_x - filter_radius).floor().max(0f32.as_()).as_(); let end: usize = (center_x + filter_radius) .ceil() - .min(in_size.as_()) .min(start.as_() + kernel_size.as_()) + .min(in_size.as_()) .as_(); let center = center_x - 0.5.as_(); @@ -325,8 +320,8 @@ impl Scaler { let start: usize = sx.floor().max(0f32.as_()).as_(); let end: usize = (sx + kernel_size.as_()) .ceil() - .min(in_size.as_()) .min(start.as_() + kernel_size.as_()) + .min(in_size.as_()) .as_(); let size = end - start; @@ -368,13 +363,21 @@ impl Scaler { } impl Scaler { - pub(crate) fn resize_rgba_impl<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, - premultiply_alpha: bool, - pool: &Option, - ) -> Result, PicScaleError> { + pub(crate) fn generic_resize< + 'a, + T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + const N: usize, + >( + &'a self, + store: &ImageStore<'a, T, N>, + into: &mut ImageStoreMut<'a, T, N>, + ) -> Result<(), PicScaleError> + where + ImageStore<'a, T, N>: VerticalConvolutionPass + HorizontalConvolutionPass, + ImageStoreMut<'a, T, N>: CheckStoreDensity, + { + let new_size = into.get_size(); + into.validate()?; if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -387,96 +390,15 @@ impl Scaler { return Err(PicScaleError::DestinationImageIsTooLarge); } - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); - } - - let should_do_horizontal = store.width != new_size.width; - let should_do_vertical = store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); - - let mut src_store = store; - - if self.function == ResamplingFunction::Nearest { - let mut new_image = ImageStore::::alloc(new_size.width, new_size.height); - resize_nearest::( - src_store.buffer.borrow(), - src_store.width, - src_store.height, - new_image.buffer.borrow_mut(), - new_size.width, - new_size.height, - pool, - ); - return Ok(new_image); - } - - let mut has_alpha_premultiplied = false; - - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba8(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, pool); - src_store = new_store; - has_alpha_premultiplied = true; + if into.should_have_bit_depth() { + if !(1..=16).contains(&into.bit_depth) { + return 
Err(PicScaleError::UnsupportedBitDepth(into.bit_depth)); } } - if should_do_vertical { - let mut new_image_vertical = - ImageStore::::alloc(src_store.width, new_size.height); - let vertical_filters = - self.generate_weights(src_store.height, new_image_vertical.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, pool); - src_store = new_image_vertical; - } - - assert_eq!(src_store.height, new_size.height); - - if should_do_horizontal { - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - let mut new_image_horizontal = - ImageStore::::alloc(new_size.width, new_size.height); - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, pool); - src_store = new_image_horizontal; - } - - assert_eq!(src_store.width, new_size.width); - - if premultiply_alpha && has_alpha_premultiplied { - src_store.unpremultiply_alpha(pool); - } - - Ok(src_store) - } -} - -impl Scaling for Scaler { - fn set_threading_policy(&mut self, threading_policy: ThreadingPolicy) { - self.threading_policy = threading_policy; - } - - fn resize_rgb<'a>( - &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 3>, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } - - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } - - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); - } - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } let pool = self @@ -484,164 +406,257 @@ impl Scaling for Scaler { .get_pool(ImageSize::new(new_size.width, new_size.height)); if self.function == ResamplingFunction::Nearest { - let mut allocated_store: Vec = vec![0u8; new_size.width * 3 * new_size.height]; - resize_nearest::( - store.buffer.borrow(), + resize_nearest::( + store.buffer.as_ref(), store.width, store.height, - &mut allocated_store, + into.buffer.borrow_mut(), new_size.width, new_size.height, &pool, ); - return ImageStore::::new(allocated_store, new_size.width, new_size.height); + return Ok(()); } let should_do_horizontal = store.width != new_size.width; let should_do_vertical = store.height != new_size.height; assert!(should_do_horizontal || should_do_vertical); - let mut src_store = store; - - if should_do_vertical { - let vertical_filters = self.generate_weights(src_store.height, new_size.height); - let mut new_image_vertical = - ImageStore::::alloc(src_store.width, new_size.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; - } - - assert_eq!(src_store.height, new_size.height); + if should_do_vertical && should_do_horizontal { + let mut target_vertical = vec![T::default(); store.width * new_size.height * N]; - if should_do_horizontal { - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - let mut new_image_horizontal = - ImageStore::::alloc(new_size.width, new_size.height); - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; + let mut new_image_vertical = ImageStoreMut::::from_slice( + &mut target_vertical, + store.width, + new_size.height, + )?; + new_image_vertical.bit_depth = into.bit_depth; + let 
vertical_filters = self.generate_weights(store.height, new_size.height); + store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); + + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: N, + width: store.width, + height: new_size.height, + bit_depth: into.bit_depth, + }; + let horizontal_filters = self.generate_weights(store.width, new_size.width); + new_immutable_store.convolve_horizontal(horizontal_filters, into, &pool); + Ok(()) + } else if should_do_vertical { + let vertical_filters = self.generate_weights(store.height, new_size.height); + store.convolve_vertical(vertical_filters, into, &pool); + Ok(()) + } else { + assert!(should_do_horizontal); + let horizontal_filters = self.generate_weights(store.width, new_size.width); + store.convolve_horizontal(horizontal_filters, into, &pool); + Ok(()) } - - assert_eq!(src_store.width, new_size.width); - - Ok(src_store) - } - - fn resize_rgba<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, u8, 4>, - premultiply_alpha: bool, - ) -> Result, PicScaleError> { - let pool = self - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - self.resize_rgba_impl(new_size, store, premultiply_alpha, &pool) } -} -impl Scaler { - pub(crate) fn resize_rgba_f32_impl<'a>( + fn forward_resize_with_alpha< + 'a, + T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + const N: usize, + >( &'a self, - new_size: ImageSize, - store: ImageStore<'a, f32, 4>, - premultiply_alpha: bool, + store: &ImageStore<'a, T, N>, + into: &mut ImageStoreMut<'a, T, N>, + premultiply_alpha_requested: bool, pool: &Option, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } + ) -> Result<(), PicScaleError> + where + ImageStore<'a, T, N>: + VerticalConvolutionPass + HorizontalConvolutionPass + AssociateAlpha, + ImageStoreMut<'a, T, N>: CheckStoreDensity + UnassociateAlpha, + { + let new_size = into.get_size(); + let mut src_store: std::borrow::Cow<'_, ImageStore<'_, T, N>> = + std::borrow::Cow::Borrowed(store); - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } + let mut has_alpha_premultiplied = true; - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); + if premultiply_alpha_requested { + let is_alpha_premultiplication_reasonable = + src_store.is_alpha_premultiplication_needed(); + if is_alpha_premultiplication_reasonable { + let mut target_premultiplied = + vec![T::default(); src_store.width * src_store.height * N]; + let mut new_store = ImageStoreMut::::from_slice( + &mut target_premultiplied, + src_store.width, + src_store.height, + )?; + new_store.bit_depth = into.bit_depth; + src_store.premultiply_alpha(&mut new_store, &pool); + src_store = std::borrow::Cow::Owned(ImageStore:: { + buffer: std::borrow::Cow::Owned(target_premultiplied), + channels: N, + width: src_store.width, + height: src_store.height, + bit_depth: into.bit_depth, + }); + has_alpha_premultiplied = true; + } } - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); - } + let mut target_vertical = vec![T::default(); src_store.width * new_size.height * N]; - let mut src_store = store; + let mut new_image_vertical = ImageStoreMut::::from_slice( + &mut 
target_vertical, + src_store.width, + new_size.height, + )?; + new_image_vertical.bit_depth = into.bit_depth; + let vertical_filters = self.generate_weights(src_store.height, new_size.height); + src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - if self.function == ResamplingFunction::Nearest { - let mut allocated_store: Vec = vec![0f32; new_size.width * 4 * new_size.height]; - resize_nearest::( - src_store.buffer.borrow(), - src_store.width, - src_store.height, - &mut allocated_store, - new_size.width, - new_size.height, - pool, - ); - let new_image = ImageStore::new(allocated_store, new_size.width, new_size.height)?; - return Ok(new_image); + let new_immutable_store = ImageStore:: { + buffer: std::borrow::Cow::Owned(target_vertical), + channels: N, + width: src_store.width, + height: new_size.height, + bit_depth: into.bit_depth, + }; + let horizontal_filters = self.generate_weights(src_store.width, new_size.width); + new_immutable_store.convolve_horizontal(horizontal_filters, into, &pool); + + if premultiply_alpha_requested && has_alpha_premultiplied { + into.unpremultiply_alpha(&pool); } - let should_do_horizontal = src_store.width != new_size.width; - let should_do_vertical = src_store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); + Ok(()) + } - let mut has_alpha_premultiplied = false; + fn forward_resize_vertical_with_alpha< + 'a, + T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + const N: usize, + >( + &'a self, + store: &ImageStore<'a, T, N>, + into: &mut ImageStoreMut<'a, T, N>, + premultiply_alpha_requested: bool, + pool: &Option, + ) -> Result<(), PicScaleError> + where + ImageStore<'a, T, N>: + VerticalConvolutionPass + HorizontalConvolutionPass + AssociateAlpha, + ImageStoreMut<'a, T, N>: CheckStoreDensity + UnassociateAlpha, + { + let new_size = into.get_size(); + let mut src_store = std::borrow::Cow::Borrowed(store); + + let mut has_alpha_premultiplied = true; - if premultiply_alpha { + if premultiply_alpha_requested { let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba_f32(src_store.buffer.borrow(), src_store.width); + src_store.is_alpha_premultiplication_needed(); if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, new_size.height); - src_store.premultiply_alpha(&mut new_store, pool); - src_store = new_store; + let mut target_premultiplied = + vec![T::default(); src_store.width * src_store.height * N]; + let mut new_store = ImageStoreMut::::from_slice( + &mut target_premultiplied, + src_store.width, + src_store.height, + )?; + new_store.bit_depth = into.bit_depth; + src_store.premultiply_alpha(&mut new_store, &pool); + src_store = std::borrow::Cow::Owned(ImageStore:: { + buffer: std::borrow::Cow::Owned(target_premultiplied), + channels: N, + width: src_store.width, + height: src_store.height, + bit_depth: into.bit_depth, + }); has_alpha_premultiplied = true; } } - if should_do_vertical { - let allocated_store_vertical: Vec = - vec![0f32; src_store.width * 4 * new_size.height]; - let mut new_image_vertical = ImageStore::::new( - allocated_store_vertical, - src_store.width, - new_size.height, - )?; - let vertical_filters = - self.generate_weights(src_store.height, new_image_vertical.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, pool); - src_store = new_image_vertical; + let vertical_filters = self.generate_weights(src_store.height, new_size.height); + 
src_store.convolve_vertical(vertical_filters, into, &pool); + + if premultiply_alpha_requested && has_alpha_premultiplied { + into.unpremultiply_alpha(&pool); } - assert_eq!(src_store.height, new_size.height); + Ok(()) + } - if should_do_horizontal { - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - let allocated_store_horizontal: Vec = - vec![0f32; new_size.width * 4 * new_size.height]; - let mut new_image_horizontal = ImageStore::::new( - allocated_store_horizontal, - new_size.width, - new_size.height, - )?; - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, pool); - src_store = new_image_horizontal; + fn forward_resize_horizontal_with_alpha< + 'a, + T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + const N: usize, + >( + &'a self, + store: &ImageStore<'a, T, N>, + into: &mut ImageStoreMut<'a, T, N>, + premultiply_alpha_requested: bool, + pool: &Option, + ) -> Result<(), PicScaleError> + where + ImageStore<'a, T, N>: + VerticalConvolutionPass + HorizontalConvolutionPass + AssociateAlpha, + ImageStoreMut<'a, T, N>: CheckStoreDensity + UnassociateAlpha, + { + let new_size = into.get_size(); + let mut src_store = std::borrow::Cow::Borrowed(store); + + let mut has_alpha_premultiplied = true; + + if premultiply_alpha_requested { + let is_alpha_premultiplication_reasonable = + src_store.is_alpha_premultiplication_needed(); + if is_alpha_premultiplication_reasonable { + let mut target_premultiplied = + vec![T::default(); src_store.width * src_store.height * N]; + let mut new_store = ImageStoreMut::::from_slice( + &mut target_premultiplied, + src_store.width, + src_store.height, + )?; + new_store.bit_depth = into.bit_depth; + src_store.premultiply_alpha(&mut new_store, &pool); + src_store = std::borrow::Cow::Owned(ImageStore:: { + buffer: std::borrow::Cow::Owned(target_premultiplied), + channels: N, + width: src_store.width, + height: src_store.height, + bit_depth: into.bit_depth, + }); + has_alpha_premultiplied = true; + } } - assert_eq!(src_store.width, new_size.width); + let horizontal_filters = self.generate_weights(src_store.width, new_size.width); + src_store.convolve_horizontal(horizontal_filters, into, &pool); - if premultiply_alpha && has_alpha_premultiplied { - src_store.unpremultiply_alpha(pool); + if premultiply_alpha_requested && has_alpha_premultiplied { + into.unpremultiply_alpha(&pool); } - Ok(src_store) + Ok(()) } -} -impl ScalingF32 for Scaler { - fn resize_rgb_f32<'a>( + pub(crate) fn generic_resize_with_alpha< + 'a, + T: FromPrimitive + Clone + Copy + Debug + Send + Sync + Default + 'static, + const N: usize, + >( &'a self, - new_size: ImageSize, - store: ImageStore<'a, f32, 3>, - ) -> Result, PicScaleError> { + store: &ImageStore<'a, T, N>, + into: &mut ImageStoreMut<'a, T, N>, + premultiply_alpha_requested: bool, + ) -> Result<(), PicScaleError> + where + ImageStore<'a, T, N>: + VerticalConvolutionPass + HorizontalConvolutionPass + AssociateAlpha, + ImageStoreMut<'a, T, N>: CheckStoreDensity + UnassociateAlpha, + { + let new_size = into.get_size(); + into.validate()?; if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { return Err(PicScaleError::ZeroImageDimensions); } @@ -654,8 +669,15 @@ impl ScalingF32 for Scaler { return Err(PicScaleError::DestinationImageIsTooLarge); } + if into.should_have_bit_depth() { + if !(1..=16).contains(&into.bit_depth) { + return Err(PicScaleError::UnsupportedBitDepth(into.bit_depth)); + } + } + if store.width == 
new_size.width && store.height == new_size.height { - return Ok(store.copied()); + store.copied_to_mut(into); + return Ok(()); } let pool = self @@ -663,153 +685,88 @@ impl ScalingF32 for Scaler { .get_pool(ImageSize::new(new_size.width, new_size.height)); if self.function == ResamplingFunction::Nearest { - let mut allocated_store: Vec = vec![0f32; new_size.width * 3 * new_size.height]; - resize_nearest::( - store.buffer.borrow(), + resize_nearest::( + store.buffer.as_ref(), store.width, store.height, - &mut allocated_store, + into.buffer.borrow_mut(), new_size.width, new_size.height, &pool, ); - let new_image = - ImageStore::::new(allocated_store, new_size.width, new_size.height); - return new_image; + return Ok(()); } - let mut src_store = store; - - let should_do_horizontal = src_store.width != new_size.width; - let should_do_vertical = src_store.height != new_size.height; + let should_do_horizontal = store.width != new_size.width; + let should_do_vertical = store.height != new_size.height; assert!(should_do_horizontal || should_do_vertical); - if should_do_vertical { - let allocated_store_vertical: Vec = - vec![0f32; src_store.width * 3 * new_size.height]; - let mut new_image_vertical = ImageStore::::new( - allocated_store_vertical, - src_store.width, - new_size.height, - )?; - let vertical_filters = - self.generate_weights(src_store.height, new_image_vertical.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; + if should_do_vertical && should_do_horizontal { + self.forward_resize_with_alpha(store, into, premultiply_alpha_requested, &pool) + } else if should_do_vertical { + self.forward_resize_vertical_with_alpha(store, into, premultiply_alpha_requested, &pool) + } else { + assert!(should_do_horizontal); + self.forward_resize_horizontal_with_alpha( + store, + into, + premultiply_alpha_requested, + &pool, + ) } + } +} - assert_eq!(src_store.height, new_size.height); +impl Scaling for Scaler { + fn set_threading_policy(&mut self, threading_policy: ThreadingPolicy) { + self.threading_policy = threading_policy; + } - if should_do_horizontal { - let allocated_store_horizontal: Vec = - vec![0f32; new_size.width * 3 * new_size.height]; - let mut new_image_horizontal = ImageStore::::new( - allocated_store_horizontal, - new_size.width, - new_size.height, - )?; - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; - } + fn resize_rgb<'a>( + &'a self, + store: &ImageStore<'a, u8, 3>, + into: &mut ImageStoreMut<'a, u8, 3>, + ) -> Result<(), PicScaleError> { + self.generic_resize(store, into) + } - assert_eq!(src_store.width, new_size.width); + fn resize_rgba<'a>( + &'a self, + store: &ImageStore<'a, u8, 4>, + into: &mut ImageStoreMut<'a, u8, 4>, + premultiply_alpha: bool, + ) -> Result<(), PicScaleError> { + self.generic_resize_with_alpha(store, into, premultiply_alpha) + } +} - Ok(src_store) +impl ScalingF32 for Scaler { + fn resize_rgb_f32<'a>( + &'a self, + store: &ImageStore<'a, f32, 3>, + into: &mut ImageStoreMut<'a, f32, 3>, + ) -> Result<(), PicScaleError> { + self.generic_resize(store, into) } fn resize_rgba_f32<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, f32, 4>, + store: &ImageStore<'a, f32, 4>, + into: &mut ImageStoreMut<'a, f32, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { - let pool = self - .threading_policy - 
.get_pool(ImageSize::new(new_size.width, new_size.height)); - self.resize_rgba_f32_impl(new_size, store, premultiply_alpha, &pool) + ) -> Result<(), PicScaleError> { + self.generic_resize_with_alpha(store, into, premultiply_alpha) } } impl Scaler { /// Performs rescaling for f32 plane pub fn resize_plane_f32<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, f32, 1>, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } - - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } - - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); - } - - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); - } - - let pool = self - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - if self.function == ResamplingFunction::Nearest { - let mut allocated_store: Vec = vec![0f32; new_size.width * new_size.height]; - resize_nearest::( - store.buffer.borrow(), - store.width, - store.height, - &mut allocated_store, - new_size.width, - new_size.height, - &pool, - ); - let new_image = - ImageStore::::new(allocated_store, new_size.width, new_size.height)?; - return Ok(new_image); - } - - let mut src_store = store; - - let should_do_horizontal = src_store.width != new_size.width; - let should_do_vertical = src_store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); - - if should_do_vertical { - let allocated_store_vertical: Vec = vec![0f32; src_store.width * new_size.height]; - let mut new_image_vertical = ImageStore::::new( - allocated_store_vertical, - src_store.width, - new_size.height, - )?; - let vertical_filters = - self.generate_weights(src_store.height, new_image_vertical.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; - } - - assert_eq!(src_store.height, new_size.height); - - if should_do_horizontal { - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - let allocated_store_horizontal: Vec = vec![0f32; new_size.width * new_size.height]; - let mut new_image_horizontal = ImageStore::::new( - allocated_store_horizontal, - new_size.width, - new_size.height, - )?; - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; - } - - assert_eq!(src_store.width, new_size.width); - - Ok(src_store) + &'a self, + store: &ImageStore<'a, f32, 1>, + into: &mut ImageStoreMut<'a, f32, 1>, + ) -> Result<(), PicScaleError> { + self.generic_resize(store, into) } } @@ -817,154 +774,20 @@ impl Scaler { /// Performs rescaling for u8 plane pub fn resize_plane<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, u8, 1>, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } - - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } - - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); - } - - if store.width == new_size.width && store.height == new_size.height { - return 
Ok(store.copied()); - } - - let pool = self - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - if self.function == ResamplingFunction::Nearest { - let mut allocated_store: Vec = vec![0u8; new_size.width * new_size.height]; - resize_nearest::( - store.buffer.borrow(), - store.width, - store.height, - &mut allocated_store, - new_size.width, - new_size.height, - &pool, - ); - let new_image = - ImageStore::::new(allocated_store, new_size.width, new_size.height)?; - return Ok(new_image); - } - - let should_do_horizontal = store.width != new_size.width; - let should_do_vertical = store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); - - let mut src_store = store; - - if should_do_vertical { - let vertical_filters = self.generate_weights(src_store.height, new_size.height); - let mut new_image_vertical = - ImageStore::::alloc(src_store.width, new_size.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; - } - - assert_eq!(src_store.height, new_size.height); - - if should_do_horizontal { - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - let mut new_image_horizontal = - ImageStore::::alloc(new_size.width, new_size.height); - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; - } - - assert_eq!(src_store.width, new_size.width); - - Ok(src_store) + store: &ImageStore<'a, u8, 1>, + into: &mut ImageStoreMut<'a, u8, 1>, + ) -> Result<(), PicScaleError> { + self.generic_resize(store, into) } } impl ScalingU16 for Scaler { fn resize_rgb_u16<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, u16, 3>, - bit_depth: usize, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } - - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } - - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); - } - - if !(1..=16).contains(&bit_depth) { - return Err(PicScaleError::UnsupportedBitDepth(bit_depth)); - } - - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); - } - - let should_do_horizontal = store.width != new_size.width; - let should_do_vertical = store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); - - let pool = self - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - if self.function == ResamplingFunction::Nearest { - let mut allocated_store: Vec = vec![0u16; new_size.width * 3 * new_size.height]; - resize_nearest::( - store.buffer.borrow(), - store.width, - store.height, - &mut allocated_store, - new_size.width, - new_size.height, - &pool, - ); - let mut new_image = - ImageStore::::new(allocated_store, new_size.width, new_size.height)?; - new_image.bit_depth = bit_depth; - return Ok(new_image); - } - - let mut src_store = store; - src_store.bit_depth = bit_depth; - - if should_do_vertical { - let vertical_filters = self.generate_weights(src_store.height, new_size.height); - let mut new_image_vertical = - ImageStore::::alloc(src_store.width, new_size.height); - new_image_vertical.bit_depth = bit_depth; - src_store.bit_depth = bit_depth; - 
src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; - } - - assert_eq!(src_store.height, new_size.height); - - if should_do_horizontal { - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - let mut new_image_horizontal = - ImageStore::::alloc(new_size.width, new_size.height); - new_image_horizontal.bit_depth = bit_depth; - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; - } - - assert_eq!(src_store.width, new_size.width); - - Ok(src_store) + &'a self, + store: &ImageStore<'a, u16, 3>, + into: &mut ImageStoreMut<'a, u16, 3>, + ) -> Result<(), PicScaleError> { + self.generic_resize(store, into) } /// Resizes u16 image @@ -978,180 +801,21 @@ impl ScalingU16 for Scaler { /// # Panics /// Panic if bit depth < 1 or bit depth > 16 fn resize_rgba_u16<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, u16, 4>, - bit_depth: usize, + &'a self, + store: &ImageStore<'a, u16, 4>, + into: &mut ImageStoreMut<'a, u16, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } - - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } - - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); - } - - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); - } - - let should_do_horizontal = store.width != new_size.width; - let should_do_vertical = store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); - - if !(1..=16).contains(&bit_depth) { - return Err(PicScaleError::UnsupportedBitDepth(bit_depth)); - } - - let mut src_store = store; - src_store.bit_depth = bit_depth; - - let pool = self - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - if self.function == ResamplingFunction::Nearest { - let mut new_image = ImageStore::::alloc(new_size.width, new_size.height); - resize_nearest::( - src_store.buffer.borrow(), - src_store.width, - src_store.height, - new_image.buffer.borrow_mut(), - new_size.width, - new_size.height, - &pool, - ); - new_image.bit_depth = bit_depth; - return Ok(new_image); - } - - let mut has_alpha_premultiplied = false; - - if premultiply_alpha { - let is_alpha_premultiplication_reasonable = - has_non_constant_cap_alpha_rgba16(src_store.buffer.borrow(), src_store.width); - if is_alpha_premultiplication_reasonable { - let mut new_store = ImageStore::::alloc(src_store.width, src_store.height); - new_store.bit_depth = src_store.bit_depth; - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - has_alpha_premultiplied = true; - } - } - - if should_do_vertical { - let mut new_image_vertical = - ImageStore::::alloc(src_store.width, new_size.height); - let vertical_filters = - self.generate_weights(src_store.height, new_image_vertical.height); - src_store.bit_depth = bit_depth; - new_image_vertical.bit_depth = bit_depth; - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; - } - - assert_eq!(src_store.height, new_size.height); - - if should_do_horizontal { - let horizontal_filters = self.generate_weights(src_store.width, 
new_size.width); - let mut new_image_horizontal = - ImageStore::::alloc(new_size.width, new_size.height); - new_image_horizontal.bit_depth = bit_depth; - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; - } - - assert_eq!(src_store.width, new_size.width); - - if premultiply_alpha && has_alpha_premultiplied { - src_store.unpremultiply_alpha(&pool); - return Ok(src_store); - } - Ok(src_store) + ) -> Result<(), PicScaleError> { + self.generic_resize_with_alpha(store, into, premultiply_alpha) } /// Performs rescaling for u16 plane fn resize_plane_u16<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, u16, 1>, - bit_depth: usize, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } - - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } - - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); - } - - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); - } - - if !(1..=16).contains(&bit_depth) { - return Err(PicScaleError::UnsupportedBitDepth(bit_depth)); - } - - let should_do_horizontal = store.width != new_size.width; - let should_do_vertical = store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); - - let pool = self - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - if self.function == ResamplingFunction::Nearest { - let mut allocated_store: Vec = vec![0u16; new_size.width * new_size.height]; - resize_nearest::( - store.buffer.borrow(), - store.width, - store.height, - &mut allocated_store, - new_size.width, - new_size.height, - &pool, - ); - let mut new_image = - ImageStore::::new(allocated_store, new_size.width, new_size.height)?; - new_image.bit_depth = bit_depth; - return Ok(new_image); - } - - let mut src_store = store; - src_store.bit_depth = bit_depth; - - if should_do_vertical { - let vertical_filters = self.generate_weights(src_store.height, new_size.height); - let mut new_image_vertical = - ImageStore::::alloc(src_store.width, new_size.height); - new_image_vertical.bit_depth = bit_depth; - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; - } - - assert_eq!(src_store.height, new_size.height); - - if should_do_horizontal { - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - let mut new_image_horizontal = - ImageStore::::alloc(new_size.width, new_size.height); - new_image_horizontal.bit_depth = bit_depth; - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; - } - assert_eq!(src_store.width, new_size.width); - - Ok(src_store) + &'a self, + store: &ImageStore<'a, u16, 1>, + into: &mut ImageStoreMut<'a, u16, 1>, + ) -> Result<(), PicScaleError> { + self.generic_resize(store, into) } } diff --git a/src/scaler_f16.rs b/src/scaler_f16.rs index d7c3d72..ba8096f 100644 --- a/src/scaler_f16.rs +++ b/src/scaler_f16.rs @@ -27,12 +27,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; -use crate::nearest_sampler::resize_nearest; +use crate::image_store::ImageStoreMut; use crate::pic_scale_error::PicScaleError; -use crate::support::check_image_size_overflow; -use crate::ResamplingFunction::Nearest; -use crate::{ImageSize, ImageStore, Scaler}; +use crate::{ImageStore, Scaler}; use half::f16; // f16 @@ -40,253 +37,28 @@ impl Scaler { /// Resize f16 RGBA image pub fn resize_rgba_f16<'a>( &'a self, - new_size: ImageSize, - store: ImageStore<'a, f16, 4>, + store: &ImageStore<'a, f16, 4>, + into: &mut ImageStoreMut<'a, f16, 4>, premultiply_alpha: bool, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } - - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } - - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); - } - - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); - } - - let mut src_store = store; - - let pool = self - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - if self.function == Nearest { - let mut allocated_store: Vec = - vec![f16::from_f32(0.); new_size.width * 4 * new_size.height]; - resize_nearest::( - &src_store.buffer.borrow(), - src_store.width, - src_store.height, - &mut allocated_store, - new_size.width, - new_size.height, - &pool, - ); - let new_image = - ImageStore::::new(allocated_store, new_size.width, new_size.height)?; - - return Ok(new_image); - } - - let should_do_horizontal = src_store.width != new_size.width; - let should_do_vertical = src_store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); - - if premultiply_alpha { - let mut new_store = ImageStore::alloc(src_store.width, src_store.height); - src_store.premultiply_alpha(&mut new_store, &pool); - src_store = new_store; - } - - if should_do_vertical { - let allocated_store_vertical: Vec = - vec![f16::from_f32(0.); src_store.width * 4 * new_size.height]; - let mut new_image_vertical = ImageStore::::new( - allocated_store_vertical, - src_store.width, - new_size.height, - )?; - let vertical_filters = - self.generate_weights(src_store.height, new_image_vertical.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; - } - - assert_eq!(src_store.height, new_size.height); - - if should_do_horizontal { - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - let allocated_store_horizontal: Vec = - vec![f16::from_f32(0.); new_size.width * 4 * new_size.height]; - let mut new_image_horizontal = ImageStore::::new( - allocated_store_horizontal, - new_size.width, - new_size.height, - )?; - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; - } - - assert_eq!(src_store.width, new_size.width); - - if premultiply_alpha { - src_store.unpremultiply_alpha(&pool); - } - - Ok(src_store) + ) -> Result<(), PicScaleError> { + self.generic_resize_with_alpha(store, into, premultiply_alpha) } /// Resize f16 RGB image pub fn resize_rgb_f16<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, f16, 3>, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height 
== 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } - - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } - - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); - } - - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); - } - - let pool = self - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - if self.function == Nearest { - let mut allocated_store: Vec = - vec![f16::from_f32(0.); new_size.width * 3 * new_size.height]; - resize_nearest::( - &store.buffer.borrow(), - store.width, - store.height, - &mut allocated_store, - new_size.width, - new_size.height, - &pool, - ); - let new_image = - ImageStore::::new(allocated_store, new_size.width, new_size.height)?; - return Ok(new_image); - } - - let mut src_store = store; - - let should_do_horizontal = src_store.width != new_size.width; - let should_do_vertical = src_store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); - - if should_do_vertical { - let allocated_store_vertical: Vec = - vec![f16::from_f32(0.); src_store.width * 3 * new_size.height]; - let mut new_image_vertical = ImageStore::::new( - allocated_store_vertical, - src_store.width, - new_size.height, - )?; - let vertical_filters = - self.generate_weights(src_store.height, new_image_vertical.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; - } - - if should_do_horizontal { - let allocated_store_horizontal: Vec = - vec![f16::from_f32(0.); new_size.width * 3 * new_size.height]; - let mut new_image_horizontal = ImageStore::::new( - allocated_store_horizontal, - new_size.width, - new_size.height, - )?; - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; - } - - Ok(src_store) + &'a self, + store: &ImageStore<'a, f16, 3>, + into: &mut ImageStoreMut<'a, f16, 3>, + ) -> Result<(), PicScaleError> { + self.generic_resize(store, into) } /// Resize f16 plane pub fn resize_plane_f16<'a>( - &self, - new_size: ImageSize, - store: ImageStore<'a, f16, 1>, - ) -> Result, PicScaleError> { - if store.width == 0 || store.height == 0 || new_size.width == 0 || new_size.height == 0 { - return Err(PicScaleError::ZeroImageDimensions); - } - - if check_image_size_overflow(store.width, store.height, store.channels) { - return Err(PicScaleError::SourceImageIsTooLarge); - } - - if check_image_size_overflow(new_size.width, new_size.height, store.channels) { - return Err(PicScaleError::DestinationImageIsTooLarge); - } - - if store.width == new_size.width && store.height == new_size.height { - return Ok(store.copied()); - } - - let pool = self - .threading_policy - .get_pool(ImageSize::new(new_size.width, new_size.height)); - - if self.function == Nearest { - let mut allocated_store: Vec = - vec![f16::from_f32(0.); new_size.width * new_size.height]; - resize_nearest::( - &store.buffer.borrow(), - store.width, - store.height, - &mut allocated_store, - new_size.width, - new_size.height, - &pool, - ); - let new_image = - ImageStore::::new(allocated_store, new_size.width, new_size.height)?; - return Ok(new_image); - } - - let mut src_store = store; - - let 
should_do_horizontal = src_store.width != new_size.width; - let should_do_vertical = src_store.height != new_size.height; - assert!(should_do_horizontal || should_do_vertical); - - if should_do_vertical { - let allocated_store_vertical: Vec = - vec![f16::from_f32(0.); src_store.width * new_size.height]; - let mut new_image_vertical = ImageStore::::new( - allocated_store_vertical, - src_store.width, - new_size.height, - )?; - let vertical_filters = - self.generate_weights(src_store.height, new_image_vertical.height); - src_store.convolve_vertical(vertical_filters, &mut new_image_vertical, &pool); - src_store = new_image_vertical; - } - - if should_do_horizontal { - let allocated_store_horizontal: Vec = - vec![f16::from_f32(0.); new_size.width * new_size.height]; - let mut new_image_horizontal = ImageStore::::new( - allocated_store_horizontal, - new_size.width, - new_size.height, - )?; - let horizontal_filters = self.generate_weights(src_store.width, new_size.width); - src_store.convolve_horizontal(horizontal_filters, &mut new_image_horizontal, &pool); - src_store = new_image_horizontal; - } - - Ok(src_store) + &'a self, + store: &ImageStore<'a, f16, 1>, + into: &mut ImageStoreMut<'a, f16, 1>, + ) -> Result<(), PicScaleError> { + self.generic_resize(store, into) } } From 1d38af30828531ca8b26fc744e4318e44174641d Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Sat, 28 Dec 2024 11:40:20 +0000 Subject: [PATCH 5/9] Updates for x86 --- app/benches/resize_rgb/main.rs | 16 ++--- app/benches/resize_rgba/main.rs | 32 ++++----- fuzz/resize_plane/resize_plane.rs | 7 +- fuzz/resize_plane_f32/resize_plane_f32.rs | 12 ++-- fuzz/resize_plane_u16/resize_plane_u16.rs | 13 ++-- fuzz/resize_rgb/resize_rgb.rs | 7 +- fuzz/resize_rgb_f32/resize_rgb_f32.rs | 12 ++-- fuzz/resize_rgb_u16/resize_rgb_u16.rs | 13 ++-- fuzz/resize_rgba/resize_rgba.rs | 11 ++- fuzz/resize_rgba_f32/resize_rgba_f32.rs | 12 ++-- fuzz/resize_rgba_u16/resize_rgba_u16.rs | 24 +++---- src/avx2/vertical_u8_lp.rs | 50 ++++++------- src/sse/rgba_u8_lb.rs | 20 +++--- src/sse/vertical_u8_lp.rs | 86 ++++++++++++----------- 14 files changed, 147 insertions(+), 168 deletions(-) diff --git a/app/benches/resize_rgb/main.rs b/app/benches/resize_rgb/main.rs index 329a933..aeb1de2 100644 --- a/app/benches/resize_rgb/main.rs +++ b/app/benches/resize_rgb/main.rs @@ -4,7 +4,7 @@ use fast_image_resize::FilterType::Lanczos3; use fast_image_resize::{CpuExtensions, PixelType, ResizeAlg, ResizeOptions, Resizer}; use image::{EncodableLayout, GenericImageView, ImageReader}; use pic_scale::{ - ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ScalingF32, ThreadingPolicy, + ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ScalingF32, ThreadingPolicy, }; pub fn criterion_benchmark(c: &mut Criterion) { @@ -27,10 +27,9 @@ pub fn criterion_benchmark(c: &mut Criterion) { dimensions.1 as usize, ) .unwrap(); - _ = scaler.resize_rgb( - ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4), - store, - ); + let mut target = + ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + scaler.resize_rgb(&store, &mut target).unwrap(); }) }); @@ -47,10 +46,9 @@ pub fn criterion_benchmark(c: &mut Criterion) { dimensions.1 as usize, ) .unwrap(); - _ = scaler.resize_rgb_f32( - ImageSize::new(dimensions.0 as usize / 4, dimensions.1 as usize / 4), - store, - ); + let mut target = + ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + scaler.resize_rgb_f32(&store, &mut target).unwrap(); }) }); diff 
--git a/app/benches/resize_rgba/main.rs b/app/benches/resize_rgba/main.rs index 5635f51..3ac926c 100644 --- a/app/benches/resize_rgba/main.rs +++ b/app/benches/resize_rgba/main.rs @@ -4,7 +4,7 @@ use fast_image_resize::FilterType::Lanczos3; use fast_image_resize::{CpuExtensions, PixelType, ResizeAlg, ResizeOptions, Resizer}; use image::{GenericImageView, ImageReader}; use pic_scale::{ - ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ScalingF32, ThreadingPolicy, + ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling, ScalingF32, ThreadingPolicy, }; pub fn criterion_benchmark(c: &mut Criterion) { @@ -25,11 +25,9 @@ pub fn criterion_benchmark(c: &mut Criterion) { dimensions.1 as usize, ) .unwrap(); - _ = scaler.resize_rgba( - ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), - store, - true, - ); + let mut target = + ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + _ = scaler.resize_rgba(&store, &mut target, true); }) }); @@ -46,17 +44,15 @@ pub fn criterion_benchmark(c: &mut Criterion) { dimensions.1 as usize, ) .unwrap(); - _ = scaler.resize_rgba_f32( - ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), - store, - false, - ); + let mut target = + ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + _ = scaler.resize_rgba_f32(&store, &mut target, false); }) }); c.bench_function("Fast image resize RGBA with alpha: Lanczos 3", |b| { + let mut vc = Vec::from(img.as_bytes()); b.iter(|| { - let mut vc = Vec::from(img.as_bytes()); let pixel_type: PixelType = PixelType::U8x4; let src_image = Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap(); @@ -84,27 +80,25 @@ pub fn criterion_benchmark(c: &mut Criterion) { }); c.bench_function("Pic scale RGBA without alpha: Lanczos 3", |b| { + let mut copied: Vec = Vec::from(src_bytes); b.iter(|| { let mut scaler = Scaler::new(ResamplingFunction::Lanczos3); scaler.set_threading_policy(ThreadingPolicy::Single); - let mut copied: Vec = Vec::from(src_bytes); let store = ImageStore::::from_slice( &mut copied, dimensions.0 as usize, dimensions.1 as usize, ) .unwrap(); - _ = scaler.resize_rgba( - ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), - store, - false, - ); + let mut target = + ImageStoreMut::alloc(dimensions.0 as usize / 4, dimensions.1 as usize / 4); + _ = scaler.resize_rgba(&store, &mut target, false); }) }); c.bench_function("Fast image resize RGBA without alpha: Lanczos 3", |b| { + let mut vc = Vec::from(img.as_bytes()); b.iter(|| { - let mut vc = Vec::from(img.as_bytes()); let pixel_type: PixelType = PixelType::U8x4; let src_image = Image::from_slice_u8(dimensions.0, dimensions.1, &mut vc, pixel_type).unwrap(); diff --git a/fuzz/resize_plane/resize_plane.rs b/fuzz/resize_plane/resize_plane.rs index 829cca4..7238732 100644 --- a/fuzz/resize_plane/resize_plane.rs +++ b/fuzz/resize_plane/resize_plane.rs @@ -30,7 +30,7 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler}; +use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler}; fuzz_target!(|data: (u16, u16, u16, u16)| { resize_plane( @@ -64,8 +64,7 @@ fn resize_plane( let mut src_data = vec![15u8; src_width * src_height]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut target = ImageStoreMut::alloc(dst_width, dst_height); let scaler = Scaler::new(sampler); - _ = scaler - .resize_plane(ImageSize::new(dst_width, 
dst_height), store) - .unwrap(); + scaler.resize_plane(&store, &mut target).unwrap(); } diff --git a/fuzz/resize_plane_f32/resize_plane_f32.rs b/fuzz/resize_plane_f32/resize_plane_f32.rs index bb128e0..74f2d31 100644 --- a/fuzz/resize_plane_f32/resize_plane_f32.rs +++ b/fuzz/resize_plane_f32/resize_plane_f32.rs @@ -30,7 +30,7 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler}; +use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler}; fuzz_target!(|data: (u16, u16, u16, u16)| { resize_plane( @@ -64,12 +64,8 @@ fn resize_plane( let mut src_data = vec![0f32; src_width * src_height]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut target = ImageStoreMut::alloc(dst_width, dst_height); + let scaler = Scaler::new(sampler); - _ = scaler - .resize_plane_f32(ImageSize::new(dst_width, dst_height), store) - .unwrap(); - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); - _ = scaler - .resize_plane_f32(ImageSize::new(dst_width, dst_height), store) - .unwrap(); + scaler.resize_plane_f32(&store, &mut target).unwrap(); } diff --git a/fuzz/resize_plane_u16/resize_plane_u16.rs b/fuzz/resize_plane_u16/resize_plane_u16.rs index 8a59c96..0b7f44f 100644 --- a/fuzz/resize_plane_u16/resize_plane_u16.rs +++ b/fuzz/resize_plane_u16/resize_plane_u16.rs @@ -30,7 +30,7 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingU16}; +use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, ScalingU16}; fuzz_target!(|data: (u16, u16, u16, u16)| { resize_rgb( @@ -64,13 +64,12 @@ fn resize_rgb( let mut src_data = vec![1u16; src_width * src_height]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 10); + let scaler = Scaler::new(sampler); - _ = scaler - .resize_plane_u16(ImageSize::new(dst_width, dst_height), store, 10) - .unwrap(); + scaler.resize_plane_u16(&store, &mut target).unwrap(); let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); - _ = scaler - .resize_plane_u16(ImageSize::new(dst_width, dst_height), store, 16) - .unwrap(); + let mut target16 = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 16); + scaler.resize_plane_u16(&store, &mut target16).unwrap(); } diff --git a/fuzz/resize_rgb/resize_rgb.rs b/fuzz/resize_rgb/resize_rgb.rs index ecc74d3..40ad6ae 100644 --- a/fuzz/resize_rgb/resize_rgb.rs +++ b/fuzz/resize_rgb/resize_rgb.rs @@ -30,7 +30,7 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling}; +use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling}; fuzz_target!(|data: (u16, u16, u16, u16)| { resize_rgb( @@ -64,8 +64,7 @@ fn resize_rgb( let mut src_data = vec![0u8; src_width * src_height * 3]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut target = ImageStoreMut::alloc(dst_width, dst_height); let scaler = Scaler::new(sampler); - _ = scaler - .resize_rgb(ImageSize::new(dst_width, dst_height), store) - .unwrap(); + scaler.resize_rgb(&store, &mut target).unwrap(); } diff --git a/fuzz/resize_rgb_f32/resize_rgb_f32.rs b/fuzz/resize_rgb_f32/resize_rgb_f32.rs index f2d4773..68228c9 100644 --- a/fuzz/resize_rgb_f32/resize_rgb_f32.rs +++ b/fuzz/resize_rgb_f32/resize_rgb_f32.rs @@ -30,7 +30,7 @@ 
#![no_main] use libfuzzer_sys::fuzz_target; -use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingF32}; +use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, ScalingF32}; fuzz_target!(|data: (u16, u16, u16, u16)| { resize_rgb( @@ -64,12 +64,8 @@ fn resize_rgb( let mut src_data = vec![0f32; src_width * src_height * 3]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut target = ImageStoreMut::alloc(dst_width, dst_height); + let scaler = Scaler::new(sampler); - _ = scaler - .resize_rgb_f32(ImageSize::new(dst_width, dst_height), store) - .unwrap(); - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); - _ = scaler - .resize_rgb_f32(ImageSize::new(dst_width, dst_height), store) - .unwrap(); + scaler.resize_rgb_f32(&store, &mut target).unwrap(); } diff --git a/fuzz/resize_rgb_u16/resize_rgb_u16.rs b/fuzz/resize_rgb_u16/resize_rgb_u16.rs index 47e48fd..7018c61 100644 --- a/fuzz/resize_rgb_u16/resize_rgb_u16.rs +++ b/fuzz/resize_rgb_u16/resize_rgb_u16.rs @@ -30,7 +30,7 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingU16}; +use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, ScalingU16}; fuzz_target!(|data: (u16, u16, u16, u16)| { resize_rgb( @@ -64,13 +64,12 @@ fn resize_rgb( let mut src_data = vec![1u16; src_width * src_height * 3]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 10); + let scaler = Scaler::new(sampler); - _ = scaler - .resize_rgb_u16(ImageSize::new(dst_width, dst_height), store, 10) - .unwrap(); + scaler.resize_rgb_u16(&store, &mut target).unwrap(); let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); - _ = scaler - .resize_rgb_u16(ImageSize::new(dst_width, dst_height), store, 16) - .unwrap(); + let mut target16 = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 16); + scaler.resize_rgb_u16(&store, &mut target16).unwrap(); } diff --git a/fuzz/resize_rgba/resize_rgba.rs b/fuzz/resize_rgba/resize_rgba.rs index dab34f5..ecf7055 100644 --- a/fuzz/resize_rgba/resize_rgba.rs +++ b/fuzz/resize_rgba/resize_rgba.rs @@ -30,7 +30,7 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling}; +use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, Scaling}; fuzz_target!(|data: (u16, u16, u16, u16)| { resize_rgba( @@ -64,12 +64,9 @@ fn resize_rgba( let mut src_data = vec![0u8; src_width * src_height * 4]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut target = ImageStoreMut::alloc(dst_width, dst_height); let scaler = Scaler::new(sampler); - _ = scaler - .resize_rgba(ImageSize::new(dst_width, dst_height), store, false) - .unwrap(); + scaler.resize_rgba(&store, &mut target, false).unwrap(); let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); - _ = scaler - .resize_rgba(ImageSize::new(dst_width, dst_height), store, true) - .unwrap(); + scaler.resize_rgba(&store, &mut target, true).unwrap(); } diff --git a/fuzz/resize_rgba_f32/resize_rgba_f32.rs b/fuzz/resize_rgba_f32/resize_rgba_f32.rs index 8c08146..67a2e47 100644 --- a/fuzz/resize_rgba_f32/resize_rgba_f32.rs +++ b/fuzz/resize_rgba_f32/resize_rgba_f32.rs @@ -30,7 +30,7 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use 
pic_scale::{ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingF32}; +use pic_scale::{ImageStore, ImageStoreMut, ResamplingFunction, Scaler, ScalingF32}; fuzz_target!(|data: (u16, u16, u16, u16)| { resize_rgba( @@ -64,12 +64,10 @@ fn resize_rgba( let mut src_data = vec![0f32; src_width * src_height * 4]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut target = ImageStoreMut::alloc(dst_width, dst_height); + let scaler = Scaler::new(sampler); - _ = scaler - .resize_rgba_f32(ImageSize::new(dst_width, dst_height), store, false) - .unwrap(); + scaler.resize_rgba_f32(&store, &mut target, false).unwrap(); let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); - _ = scaler - .resize_rgba_f32(ImageSize::new(dst_width, dst_height), store, true) - .unwrap(); + scaler.resize_rgba_f32(&store, &mut target, true).unwrap(); } diff --git a/fuzz/resize_rgba_u16/resize_rgba_u16.rs b/fuzz/resize_rgba_u16/resize_rgba_u16.rs index 494da64..57d8090 100644 --- a/fuzz/resize_rgba_u16/resize_rgba_u16.rs +++ b/fuzz/resize_rgba_u16/resize_rgba_u16.rs @@ -30,7 +30,9 @@ #![no_main] use libfuzzer_sys::fuzz_target; -use pic_scale::{Ar30ByteOrder, ImageSize, ImageStore, ResamplingFunction, Scaler, ScalingU16}; +use pic_scale::{ + Ar30ByteOrder, ImageSize, ImageStore, ImageStoreMut, ResamplingFunction, Scaler, ScalingU16, +}; fuzz_target!(|data: (u16, u16, u16, u16)| { resize_rgba( @@ -64,24 +66,20 @@ fn resize_rgba( let mut src_data = vec![1u16; src_width * src_height * 4]; let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 10); + let scaler = Scaler::new(sampler); - _ = scaler - .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 10, false) - .unwrap(); + scaler.resize_rgba_u16(&store, &mut target, false).unwrap(); let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); - _ = scaler - .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 10, true) - .unwrap(); + scaler.resize_rgba_u16(&store, &mut target, true).unwrap(); + + let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 16); let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); - _ = scaler - .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 16, false) - .unwrap(); + scaler.resize_rgba_u16(&store, &mut target, false).unwrap(); let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); - _ = scaler - .resize_rgba_u16(ImageSize::new(dst_width, dst_height), store, 16, true) - .unwrap(); + scaler.resize_rgba_u16(&store, &mut target, true).unwrap(); let src_data_ar30 = vec![1u32; src_width * src_height]; let mut dst_data_ar30 = vec![1u32; dst_width * dst_height]; diff --git a/src/avx2/vertical_u8_lp.rs b/src/avx2/vertical_u8_lp.rs index 48ea85e..c63a4ca 100644 --- a/src/avx2/vertical_u8_lp.rs +++ b/src/avx2/vertical_u8_lp.rs @@ -59,11 +59,11 @@ unsafe fn m256dot( let store0 = _mm256_add_epi16( store0, - _mm256_mulhi_epi16(_mm256_slli_epi16::(lo), weight), + _mm256_mulhrs_epi16(_mm256_slli_epi16::(lo), weight), ); let store1 = _mm256_add_epi16( store1, - _mm256_mulhi_epi16(_mm256_slli_epi16::(hi), weight), + _mm256_mulhrs_epi16(_mm256_slli_epi16::(hi), weight), ); (store0, store1) } @@ -81,8 +81,8 @@ unsafe fn convolve_vertical_avx2_row_impl( let bounds_size = bounds.size; const SCALE: i32 = 6; - const R_SHR_SCALE: i32 = SCALE - 1; - const 
ROUNDING: i16 = 1 << (SCALE - 1); + const R_SHR_SCALE: i32 = SCALE; + const ROUNDING: i16 = 1 << (R_SHR_SCALE - 1); let mut cx = 0usize; @@ -342,7 +342,7 @@ unsafe fn convolve_vertical_avx2_row_impl( store0 = _mm256_add_epi16( store0, - _mm256_mulhi_epi16(_mm256_slli_epi16::(item_row), v_weight), + _mm256_mulhrs_epi16(_mm256_slli_epi16::(item_row), v_weight), ); } @@ -377,13 +377,13 @@ unsafe fn convolve_vertical_avx2_row_impl( let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); } else if bounds_size == 3 { let py = bounds.start; @@ -401,19 +401,19 @@ unsafe fn convolve_vertical_avx2_row_impl( let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); let item_row2 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr2.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row2), v_weight2), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), ); } else if bounds_size == 4 { let py = bounds.start; @@ -434,25 +434,25 @@ unsafe fn convolve_vertical_avx2_row_impl( let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); let item_row2 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr2.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row2), v_weight2), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), ); let item_row3 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr3.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row3), v_weight3), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row3), v_weight3), ); } else { for j in 0..bounds_size { @@ -464,7 +464,7 @@ unsafe fn convolve_vertical_avx2_row_impl( let item_row = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr.as_ptr()), zeros); let low = _mm_slli_epi16::(item_row); - store = _mm_add_epi16(store, _mm_mulhi_epi16(low, v_weight)); + store = _mm_add_epi16(store, _mm_mulhrs_epi16(low, v_weight)); } } @@ -497,13 +497,13 @@ unsafe fn convolve_vertical_avx2_row_impl( store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); } else if bounds_size == 3 { let py = bounds.start; @@ -522,19 
+522,19 @@ unsafe fn convolve_vertical_avx2_row_impl( store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); let item_row2 = _mm_set1_epi16(src_ptr2[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row2), v_weight2), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), ); } else if bounds_size == 4 { let py = bounds.start; @@ -556,25 +556,25 @@ unsafe fn convolve_vertical_avx2_row_impl( store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); let item_row2 = _mm_set1_epi16(src_ptr2[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row2), v_weight2), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), ); let item_row3 = _mm_set1_epi16(src_ptr3[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row3), v_weight3), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row3), v_weight3), ); } else { for j in 0..bounds_size { @@ -587,7 +587,7 @@ unsafe fn convolve_vertical_avx2_row_impl( store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row), v_weight), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row), v_weight), ); } } diff --git a/src/sse/rgba_u8_lb.rs b/src/sse/rgba_u8_lb.rs index 9f7ffe9..1eded53 100644 --- a/src/sse/rgba_u8_lb.rs +++ b/src/sse/rgba_u8_lb.rs @@ -47,7 +47,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_sse( let rgba_pixel = _mm_cvtsi32_si128(src_ptr_32.read_unaligned()); let lo = _mm_slli_epi16::(_mm_unpacklo_epi8(rgba_pixel, _mm_setzero_si128())); - _mm_add_epi16(store_0, _mm_mulhi_epi16(lo, weight0)) + _mm_add_epi16(store_0, _mm_mulhrs_epi16(lo, weight0)) } pub(crate) fn convolve_horizontal_rgba_sse_rows_4_lb( @@ -77,10 +77,10 @@ unsafe fn hdot4( let hi0 = _mm_slli_epi16::(_mm_unpackhi_epi8(v0, zeros)); let lo1 = _mm_slli_epi16::(_mm_unpacklo_epi8(v1, zeros)); let hi1 = _mm_slli_epi16::(_mm_unpackhi_epi8(v1, zeros)); - let mut p = _mm_mulhi_epi16(lo0, w01); - p = _mm_add_epi16(p, _mm_mulhi_epi16(hi0, w23)); - p = _mm_add_epi16(p, _mm_mulhi_epi16(lo1, w45)); - p = _mm_add_epi16(p, _mm_mulhi_epi16(hi1, w67)); + let mut p = _mm_mulhrs_epi16(lo0, w01); + p = _mm_add_epi16(p, _mm_mulhrs_epi16(hi0, w23)); + p = _mm_add_epi16(p, _mm_mulhrs_epi16(lo1, w45)); + p = _mm_add_epi16(p, _mm_mulhrs_epi16(hi1, w67)); let hi_part = _mm_unpackhi_epi64(p, p); p = _mm_add_epi16(hi_part, p); _mm_add_epi16(store, p) @@ -96,8 +96,8 @@ unsafe fn hdot2( let zeros = _mm_setzero_si128(); let lo = _mm_slli_epi16::(_mm_unpacklo_epi8(v, zeros)); let hi = _mm_slli_epi16::(_mm_unpackhi_epi8(v, zeros)); - let mut p = _mm_mulhi_epi16(lo, w01); - p = _mm_add_epi16(p, _mm_mulhi_epi16(hi, w23)); + let mut p = _mm_mulhrs_epi16(lo, w01); + p = _mm_add_epi16(p, _mm_mulhrs_epi16(hi, w23)); let hi_part = _mm_unpackhi_epi64(p, p); p = _mm_add_epi16(hi_part, p); _mm_add_epi16(store, p) @@ -107,7 +107,7 @@ unsafe fn hdot2( unsafe fn hdot(store: __m128i, v: __m128i, w01: __m128i) -> __m128i { let zeros = 
_mm_setzero_si128(); let lo = _mm_slli_epi16::(_mm_unpacklo_epi8(v, zeros)); - let mut p = _mm_mulhi_epi16(lo, w01); + let mut p = _mm_mulhrs_epi16(lo, w01); let hi_part = _mm_unpackhi_epi64(p, p); p = _mm_add_epi16(hi_part, p); _mm_add_epi16(store, p) @@ -125,8 +125,8 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( const CHANNELS: usize = 4; const SCALE: i32 = 6; - const ROUNDING: i16 = 1 << (SCALE - 1); - const V_SHR: i32 = SCALE - 1; + const V_SHR: i32 = SCALE; + const ROUNDING: i16 = 1 << (V_SHR - 1); let vld = _mm_set1_epi16(ROUNDING); diff --git a/src/sse/vertical_u8_lp.rs b/src/sse/vertical_u8_lp.rs index 7b1a651..5deefdf 100644 --- a/src/sse/vertical_u8_lp.rs +++ b/src/sse/vertical_u8_lp.rs @@ -56,8 +56,14 @@ unsafe fn mdot( let lo = _mm_unpacklo_epi8(row, zeros); let hi = _mm_unpackhi_epi8(row, zeros); - let store0 = _mm_add_epi16(store0, _mm_mulhi_epi16(_mm_slli_epi16::(lo), weight)); - let store1 = _mm_add_epi16(store1, _mm_mulhi_epi16(_mm_slli_epi16::(hi), weight)); + let store0 = _mm_add_epi16( + store0, + _mm_mulhrs_epi16(_mm_slli_epi16::(lo), weight), + ); + let store1 = _mm_add_epi16( + store1, + _mm_mulhrs_epi16(_mm_slli_epi16::(hi), weight), + ); (store0, store1) } @@ -74,8 +80,8 @@ unsafe fn convolve_vertical_sse_row_impl( let bounds_size = bounds.size; const SCALE: i32 = 6; - const R_SHR_SCALE: i32 = SCALE - 1; - const ROUNDING: i16 = 1 << (SCALE - 1); + const R_SHR_SCALE: i32 = SCALE; + const ROUNDING: i16 = 1 << (R_SHR_SCALE - 1); let mut cx = 0usize; @@ -111,14 +117,14 @@ unsafe fn convolve_vertical_sse_row_impl( (store6, store7) = mdot::(store6, store7, item_row3, v_weight); } - let rebased0 = _mm_srli_epi16::(store0); - let rebased1 = _mm_srli_epi16::(store1); - let rebased2 = _mm_srli_epi16::(store2); - let rebased3 = _mm_srli_epi16::(store3); - let rebased4 = _mm_srli_epi16::(store4); - let rebased5 = _mm_srli_epi16::(store5); - let rebased6 = _mm_srli_epi16::(store6); - let rebased7 = _mm_srli_epi16::(store7); + let rebased0 = _mm_srai_epi16::(store0); + let rebased1 = _mm_srai_epi16::(store1); + let rebased2 = _mm_srai_epi16::(store2); + let rebased3 = _mm_srai_epi16::(store3); + let rebased4 = _mm_srai_epi16::(store4); + let rebased5 = _mm_srai_epi16::(store5); + let rebased6 = _mm_srai_epi16::(store6); + let rebased7 = _mm_srai_epi16::(store7); let shrank0 = _mm_packus_epi16(rebased0, rebased1); let shrank1 = _mm_packus_epi16(rebased2, rebased3); let shrank2 = _mm_packus_epi16(rebased4, rebased5); @@ -164,10 +170,10 @@ unsafe fn convolve_vertical_sse_row_impl( (store2, store3) = mdot::(store2, store3, item_row1, v_weight); } - let rebased0 = _mm_srli_epi16::(store0); - let rebased1 = _mm_srli_epi16::(store1); - let rebased2 = _mm_srli_epi16::(store2); - let rebased3 = _mm_srli_epi16::(store3); + let rebased0 = _mm_srai_epi16::(store0); + let rebased1 = _mm_srai_epi16::(store1); + let rebased2 = _mm_srai_epi16::(store2); + let rebased3 = _mm_srai_epi16::(store3); let shrank0 = _mm_packus_epi16(rebased0, rebased1); let shrank1 = _mm_packus_epi16(rebased2, rebased3); _mm_storeu_si128(dst.as_mut_ptr() as *mut __m128i, shrank0); @@ -258,8 +264,8 @@ unsafe fn convolve_vertical_sse_row_impl( } } - let rebased0 = _mm_srli_epi16::(store0); - let rebased1 = _mm_srli_epi16::(store1); + let rebased0 = _mm_srai_epi16::(store0); + let rebased1 = _mm_srai_epi16::(store1); let shrank = _mm_packus_epi16(rebased0, rebased1); _mm_storeu_si128(dst.as_mut_ptr() as *mut __m128i, shrank); @@ -287,13 +293,13 @@ unsafe fn convolve_vertical_sse_row_impl( let item_row0 = 
_mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); } else if bounds_size == 3 { let py = bounds.start; @@ -311,19 +317,19 @@ unsafe fn convolve_vertical_sse_row_impl( let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); let item_row2 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr2.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row2), v_weight2), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), ); } else if bounds_size == 4 { let py = bounds.start; @@ -344,25 +350,25 @@ unsafe fn convolve_vertical_sse_row_impl( let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); let item_row2 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr2.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row2), v_weight2), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), ); let item_row3 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr3.as_ptr()), zeros); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row3), v_weight3), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row3), v_weight3), ); } else { for j in 0..bounds_size { @@ -374,11 +380,11 @@ unsafe fn convolve_vertical_sse_row_impl( let item_row = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr.as_ptr()), zeros); let low = _mm_slli_epi16::(item_row); - store = _mm_add_epi16(store, _mm_mulhi_epi16(low, v_weight)); + store = _mm_add_epi16(store, _mm_mulhrs_epi16(low, v_weight)); } } - let rebased = _mm_srli_epi16::(store); + let rebased = _mm_srai_epi16::(store); let shrank = _mm_packus_epi16(rebased, rebased); _mm_storeu_si64(dst.as_mut_ptr(), shrank); @@ -407,13 +413,13 @@ unsafe fn convolve_vertical_sse_row_impl( store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); } else if bounds_size == 3 { let py = bounds.start; @@ -432,19 +438,19 @@ unsafe fn convolve_vertical_sse_row_impl( store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); store = 
_mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); let item_row2 = _mm_set1_epi16(src_ptr2[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row2), v_weight2), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), ); } else if bounds_size == 4 { let py = bounds.start; @@ -466,25 +472,25 @@ unsafe fn convolve_vertical_sse_row_impl( store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row0), v_weight0), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), ); let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row1), v_weight1), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), ); let item_row2 = _mm_set1_epi16(src_ptr2[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row2), v_weight2), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), ); let item_row3 = _mm_set1_epi16(src_ptr3[0] as i16); store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row3), v_weight3), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row3), v_weight3), ); } else { for j in 0..bounds_size { @@ -497,12 +503,12 @@ unsafe fn convolve_vertical_sse_row_impl( store = _mm_add_epi16( store, - _mm_mulhi_epi16(_mm_slli_epi16::(item_row), v_weight), + _mm_mulhrs_epi16(_mm_slli_epi16::(item_row), v_weight), ); } } - let rebased = _mm_srli_epi16::(store); + let rebased = _mm_srai_epi16::(store); let value = _mm_extract_epi8::<0>(_mm_packus_epi16(rebased, rebased)); *dst = value as u8; From b169d4535304d00c823a1620fe0296ae69e9ae3e Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Sat, 28 Dec 2024 11:46:59 +0000 Subject: [PATCH 6/9] Updates for x86 --- fuzz/resize_plane_u16/resize_plane_u16.rs | 6 ++---- fuzz/resize_rgb_u16/resize_rgb_u16.rs | 6 ++---- fuzz/resize_rgba_f32/resize_rgba_f32.rs | 6 ++---- fuzz/resize_rgba_u16/resize_rgba_u16.rs | 10 ++++------ 4 files changed, 10 insertions(+), 18 deletions(-) diff --git a/fuzz/resize_plane_u16/resize_plane_u16.rs b/fuzz/resize_plane_u16/resize_plane_u16.rs index 0b7f44f..c6e983d 100644 --- a/fuzz/resize_plane_u16/resize_plane_u16.rs +++ b/fuzz/resize_plane_u16/resize_plane_u16.rs @@ -61,15 +61,13 @@ fn resize_rgb( return; } - let mut src_data = vec![1u16; src_width * src_height]; - - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 10); let scaler = Scaler::new(sampler); scaler.resize_plane_u16(&store, &mut target).unwrap(); - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); let mut target16 = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 16); scaler.resize_plane_u16(&store, &mut target16).unwrap(); } diff --git a/fuzz/resize_rgb_u16/resize_rgb_u16.rs b/fuzz/resize_rgb_u16/resize_rgb_u16.rs index 7018c61..2cbc018 100644 --- a/fuzz/resize_rgb_u16/resize_rgb_u16.rs +++ b/fuzz/resize_rgb_u16/resize_rgb_u16.rs @@ -61,15 +61,13 @@ fn resize_rgb( return; } - let mut src_data = vec![1u16; src_width * src_height * 3]; - - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 10); let 
scaler = Scaler::new(sampler); scaler.resize_rgb_u16(&store, &mut target).unwrap(); - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); let mut target16 = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 16); scaler.resize_rgb_u16(&store, &mut target16).unwrap(); } diff --git a/fuzz/resize_rgba_f32/resize_rgba_f32.rs b/fuzz/resize_rgba_f32/resize_rgba_f32.rs index 67a2e47..628b885 100644 --- a/fuzz/resize_rgba_f32/resize_rgba_f32.rs +++ b/fuzz/resize_rgba_f32/resize_rgba_f32.rs @@ -61,13 +61,11 @@ fn resize_rgba( return; } - let mut src_data = vec![0f32; src_width * src_height * 4]; - - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); let mut target = ImageStoreMut::alloc(dst_width, dst_height); let scaler = Scaler::new(sampler); scaler.resize_rgba_f32(&store, &mut target, false).unwrap(); - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); scaler.resize_rgba_f32(&store, &mut target, true).unwrap(); } diff --git a/fuzz/resize_rgba_u16/resize_rgba_u16.rs b/fuzz/resize_rgba_u16/resize_rgba_u16.rs index 57d8090..571290d 100644 --- a/fuzz/resize_rgba_u16/resize_rgba_u16.rs +++ b/fuzz/resize_rgba_u16/resize_rgba_u16.rs @@ -63,22 +63,20 @@ fn resize_rgba( return; } - let mut src_data = vec![1u16; src_width * src_height * 4]; - - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 10); let scaler = Scaler::new(sampler); scaler.resize_rgba_u16(&store, &mut target, false).unwrap(); - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); scaler.resize_rgba_u16(&store, &mut target, true).unwrap(); let mut target = ImageStoreMut::alloc_with_depth(dst_width, dst_height, 16); - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); scaler.resize_rgba_u16(&store, &mut target, false).unwrap(); - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); scaler.resize_rgba_u16(&store, &mut target, true).unwrap(); let src_data_ar30 = vec![1u32; src_width * src_height]; From 1b7444a525ee7e03d772f165e17786c2dd754a6b Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Sat, 28 Dec 2024 12:21:18 +0000 Subject: [PATCH 7/9] Fuzzing fixes --- fuzz/resize_rgba/resize_rgba.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fuzz/resize_rgba/resize_rgba.rs b/fuzz/resize_rgba/resize_rgba.rs index ecf7055..2654ba7 100644 --- a/fuzz/resize_rgba/resize_rgba.rs +++ b/fuzz/resize_rgba/resize_rgba.rs @@ -61,12 +61,10 @@ fn resize_rgba( return; } - let mut src_data = vec![0u8; src_width * src_height * 4]; - - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = ImageStore::::alloc(src_width, src_height); let mut target = ImageStoreMut::alloc(dst_width, dst_height); let scaler = Scaler::new(sampler); scaler.resize_rgba(&store, &mut target, false).unwrap(); - let store = ImageStore::::from_slice(&mut src_data, src_width, src_height).unwrap(); + let store = 
ImageStore::::alloc(src_width, src_height); scaler.resize_rgba(&store, &mut target, true).unwrap(); } From 60200971613829276bdb47119a97cfc0854a31d8 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Sat, 28 Dec 2024 13:45:15 +0000 Subject: [PATCH 8/9] Improvements on x86 u8 --- Cargo.lock | 4 +- Cargo.toml | 2 +- app/src/main.rs | 5 +- src/avx2/vertical_u8_lp.rs | 245 ++++---------------------------- src/sse/rgba_u8_lb.rs | 97 +++++-------- src/sse/vertical_u8_lp.rs | 278 ++++++------------------------------- 6 files changed, 115 insertions(+), 516 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f1d0691..441a439 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -259,9 +259,9 @@ checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" [[package]] name = "colorutils-rs" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31ca2cc8ed986672b15bfd3e416014e40cada05196bdfaa51168985f3c2e81f1" +checksum = "c06bb7c7479a238be740a3312b5693d76e234eb49b73b3e61ae768132c79d06a" dependencies = [ "erydanos", "half", diff --git a/Cargo.toml b/Cargo.toml index 8dd615e..12abb08 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ exclude = ["*.jpg", "/assets", "*.png", "*.sh", "/assets/*"] rust-version = "1.82.0" [dependencies] -colorutils-rs = {version = "0.7.0", optional = true} +colorutils-rs = {version = "0.7.4", optional = true} half = { version = "2.4.1", optional = true, features = ["alloc", "std", "num-traits"] } num-traits = { version = "0.2.19", features = ["std"] } rayon = "1.10.0" diff --git a/app/src/main.rs b/app/src/main.rs index 8166b23..91e5cc3 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -11,8 +11,9 @@ use fast_image_resize::{ }; use image::{EncodableLayout, GenericImageView, ImageReader}; use pic_scale::{ - Ar30ByteOrder, ImageSize, ImageStore, ImageStoreMut, LinearScaler, ResamplingFunction, Scaler, - Scaling, ScalingU16, ThreadingPolicy, + Ar30ByteOrder, ImageSize, ImageStore, ImageStoreMut, JzazbzScaler, LChScaler, LabScaler, + LinearApproxScaler, LinearScaler, LuvScaler, OklabScaler, ResamplingFunction, Scaler, Scaling, + ScalingU16, SigmoidalScaler, ThreadingPolicy, TransferFunction, XYZScaler, }; fn resize_plane( diff --git a/src/avx2/vertical_u8_lp.rs b/src/avx2/vertical_u8_lp.rs index c63a4ca..1ccf827 100644 --- a/src/avx2/vertical_u8_lp.rs +++ b/src/avx2/vertical_u8_lp.rs @@ -53,17 +53,16 @@ unsafe fn m256dot( row: __m256i, weight: __m256i, ) -> (__m256i, __m256i) { - let zeros = _mm256_setzero_si256(); - let lo = _mm256_unpacklo_epi8(row, zeros); - let hi = _mm256_unpackhi_epi8(row, zeros); + let lo = _mm256_unpacklo_epi8(row, row); + let hi = _mm256_unpackhi_epi8(row, row); let store0 = _mm256_add_epi16( store0, - _mm256_mulhrs_epi16(_mm256_slli_epi16::(lo), weight), + _mm256_mulhrs_epi16(_mm256_srli_epi16::<2>(lo), weight), ); let store1 = _mm256_add_epi16( store1, - _mm256_mulhrs_epi16(_mm256_slli_epi16::(hi), weight), + _mm256_mulhrs_epi16(_mm256_srli_epi16::<2>(hi), weight), ); (store0, store1) } @@ -77,8 +76,6 @@ unsafe fn convolve_vertical_avx2_row_impl( src_stride: usize, weight: &[i16], ) { - let zeros = _mm_setzero_si128(); - let bounds_size = bounds.size; const SCALE: i32 = 6; const R_SHR_SCALE: i32 = SCALE; @@ -337,12 +334,13 @@ unsafe fn convolve_vertical_avx2_row_impl( let v_weight = _mm256_set1_epi16(weight[0]); let v_offset = src_stride * py + px; let src_ptr = src.get_unchecked(v_offset..); - let item_row = - 
_mm256_cvtepu8_epi16(_mm_loadu_si128(src_ptr.as_ptr() as *const __m128i)); - + let mut item_row = _mm256_permute4x64_epi64::<0x50>(_mm256_castsi128_si256( + _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i), + )); + item_row = _mm256_unpacklo_epi8(item_row, item_row); store0 = _mm256_add_epi16( store0, - _mm256_mulhrs_epi16(_mm256_slli_epi16::(item_row), v_weight), + _mm256_mulhrs_epi16(_mm256_srli_epi16::<2>(item_row), v_weight), ); } @@ -364,108 +362,17 @@ unsafe fn convolve_vertical_avx2_row_impl( let px = cx; - if bounds_size == 2 { - let py = bounds.start; - let weights = weight.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - - let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - } else if bounds_size == 3 { - let py = bounds.start; - let weights = weight.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..); - - let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - - let item_row2 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr2.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), - ); - } else if bounds_size == 4 { - let py = bounds.start; - let weights = weight.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_weight3 = _mm_set1_epi16(weights[3]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..); - let v_offset3 = src_stride * (py + 3) + px; - let src_ptr3 = src.get_unchecked(v_offset3..); - - let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - - let item_row2 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr2.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), - ); - - let item_row3 = 
_mm_unpacklo_epi8(_mm_loadu_si64(src_ptr3.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row3), v_weight3), - ); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..(j + 1)); - let v_weight = _mm_set1_epi16(weight[0]); - let v_offset = src_stride * py + px; - let src_ptr = src.get_unchecked(v_offset..); - let item_row = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr.as_ptr()), zeros); + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..(j + 1)); + let v_weight = _mm_set1_epi16(weight[0]); + let v_offset = src_stride * py + px; + let src_ptr = src.get_unchecked(v_offset..); + let mut item_row = _mm_loadu_si64(src_ptr.as_ptr()); + item_row = _mm_unpacklo_epi8(item_row, item_row); - let low = _mm_slli_epi16::(item_row); - store = _mm_add_epi16(store, _mm_mulhrs_epi16(low, v_weight)); - } + let low = _mm_srli_epi16::<2>(item_row); + store = _mm_add_epi16(store, _mm_mulhrs_epi16(low, v_weight)); } let rebased = _mm_srai_epi16::(store); @@ -483,113 +390,21 @@ unsafe fn convolve_vertical_avx2_row_impl( let px = cx; - if bounds_size == 2 { - let py = bounds.start; - let weights = weight.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..(v_offset0 + 1)); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..(v_offset1 + 1)); - - let item_row0 = _mm_set1_epi16(src_ptr0[0] as i16); - - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - } else if bounds_size == 3 { - let py = bounds.start; - let weights = weight.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..(v_offset0 + 1)); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..(v_offset1 + 1)); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..(v_offset2 + 1)); - - let item_row0 = _mm_set1_epi16(src_ptr0[0] as i16); - - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - - let item_row2 = _mm_set1_epi16(src_ptr2[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), - ); - } else if bounds_size == 4 { - let py = bounds.start; - let weights = weight.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_weight3 = _mm_set1_epi16(weights[3]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..(v_offset0 + 1)); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..(v_offset1 + 1)); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..(v_offset2 + 1)); - let v_offset3 = src_stride * (py + 3) + 
px; - let src_ptr3 = src.get_unchecked(v_offset3..(v_offset3 + 1)); - - let item_row0 = _mm_set1_epi16(src_ptr0[0] as i16); - - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - - let item_row2 = _mm_set1_epi16(src_ptr2[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), - ); + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..(j + 1)); + let v_weight = _mm_set1_epi16(weight[0]); + let v_offset = src_stride * py + px; + let src_ptr = src.get_unchecked(v_offset..(v_offset + 1)); + let item_row = _mm_set1_epi8(src_ptr[0] as i8); - let item_row3 = _mm_set1_epi16(src_ptr3[0] as i16); store = _mm_add_epi16( store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row3), v_weight3), + _mm_mulhrs_epi16( + _mm_srli_epi16::<2>(_mm_unpacklo_epi8(item_row, item_row)), + v_weight, + ), ); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..(j + 1)); - let v_weight = _mm_set1_epi16(weight[0]); - let v_offset = src_stride * py + px; - let src_ptr = src.get_unchecked(v_offset..(v_offset + 1)); - let item_row = _mm_set1_epi16(src_ptr[0] as i16); - - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row), v_weight), - ); - } } let rebased = _mm_srai_epi16::(store); diff --git a/src/sse/rgba_u8_lb.rs b/src/sse/rgba_u8_lb.rs index 1eded53..b72751b 100644 --- a/src/sse/rgba_u8_lb.rs +++ b/src/sse/rgba_u8_lb.rs @@ -34,7 +34,7 @@ use std::arch::x86::*; use std::arch::x86_64::*; #[inline] -unsafe fn convolve_horizontal_parts_one_rgba_sse( +unsafe fn convolve_horizontal_parts_one_rgba_sse( start_x: usize, src: &[u8], weight0: __m128i, @@ -45,7 +45,7 @@ unsafe fn convolve_horizontal_parts_one_rgba_sse( let src_ptr_32 = src_ptr.as_ptr() as *const i32; let rgba_pixel = _mm_cvtsi32_si128(src_ptr_32.read_unaligned()); - let lo = _mm_slli_epi16::(_mm_unpacklo_epi8(rgba_pixel, _mm_setzero_si128())); + let lo = _mm_srli_epi16::<2>(_mm_unpacklo_epi8(rgba_pixel, rgba_pixel)); _mm_add_epi16(store_0, _mm_mulhrs_epi16(lo, weight0)) } @@ -63,7 +63,7 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_lb( } #[inline(always)] -unsafe fn hdot4( +unsafe fn hdot4( store: __m128i, v0: __m128i, v1: __m128i, @@ -72,11 +72,10 @@ unsafe fn hdot4( w45: __m128i, w67: __m128i, ) -> __m128i { - let zeros = _mm_setzero_si128(); - let lo0 = _mm_slli_epi16::(_mm_unpacklo_epi8(v0, zeros)); - let hi0 = _mm_slli_epi16::(_mm_unpackhi_epi8(v0, zeros)); - let lo1 = _mm_slli_epi16::(_mm_unpacklo_epi8(v1, zeros)); - let hi1 = _mm_slli_epi16::(_mm_unpackhi_epi8(v1, zeros)); + let lo0 = _mm_srli_epi16::<2>(_mm_unpacklo_epi8(v0, v0)); + let hi0 = _mm_srli_epi16::<2>(_mm_unpackhi_epi8(v0, v0)); + let lo1 = _mm_srli_epi16::<2>(_mm_unpacklo_epi8(v1, v1)); + let hi1 = _mm_srli_epi16::<2>(_mm_unpackhi_epi8(v1, v1)); let mut p = _mm_mulhrs_epi16(lo0, w01); p = _mm_add_epi16(p, _mm_mulhrs_epi16(hi0, w23)); p = _mm_add_epi16(p, _mm_mulhrs_epi16(lo1, w45)); @@ -87,15 +86,9 @@ unsafe fn hdot4( } #[inline(always)] -unsafe fn hdot2( - store: __m128i, - v: __m128i, - w01: __m128i, - w23: __m128i, -) -> __m128i { - let zeros = _mm_setzero_si128(); - let lo = _mm_slli_epi16::(_mm_unpacklo_epi8(v, zeros)); - let hi = _mm_slli_epi16::(_mm_unpackhi_epi8(v, zeros)); +unsafe fn hdot2(store: __m128i, v: 
__m128i, w01: __m128i, w23: __m128i) -> __m128i { + let lo = _mm_srli_epi16::<2>(_mm_unpacklo_epi8(v, v)); + let hi = _mm_srli_epi16::<2>(_mm_unpackhi_epi8(v, v)); let mut p = _mm_mulhrs_epi16(lo, w01); p = _mm_add_epi16(p, _mm_mulhrs_epi16(hi, w23)); let hi_part = _mm_unpackhi_epi64(p, p); @@ -104,9 +97,8 @@ unsafe fn hdot2( } #[inline(always)] -unsafe fn hdot(store: __m128i, v: __m128i, w01: __m128i) -> __m128i { - let zeros = _mm_setzero_si128(); - let lo = _mm_slli_epi16::(_mm_unpacklo_epi8(v, zeros)); +unsafe fn hdot(store: __m128i, v: __m128i, w01: __m128i) -> __m128i { + let lo = _mm_srli_epi16::<2>(_mm_unpacklo_epi8(v, v)); let mut p = _mm_mulhrs_epi16(lo, w01); let hi_part = _mm_unpackhi_epi64(p, p); p = _mm_add_epi16(hi_part, p); @@ -222,7 +214,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( .as_ptr() as *const __m128i, ); - store_0 = hdot4::( + store_0 = hdot4( store_0, rgb_pixel_0, rgb_pixel_0_1, @@ -231,7 +223,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( weight45, weight67, ); - store_1 = hdot4::( + store_1 = hdot4( store_1, rgb_pixel_1, rgb_pixel_1_0, @@ -240,7 +232,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( weight45, weight67, ); - store_2 = hdot4::( + store_2 = hdot4( store_2, rgb_pixel_2, rgb_pixel_2_1, @@ -249,7 +241,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( weight45, weight67, ); - store_3 = hdot4::( + store_3 = hdot4( store_3, rgb_pixel_3, rgb_pixel_3_1, @@ -290,10 +282,10 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( src3.get_unchecked((start_bounds * CHANNELS)..).as_ptr() as *const __m128i, ); - store_0 = hdot2::(store_0, rgb_pixel_0, weight01, weight23); - store_1 = hdot2::(store_1, rgb_pixel_1, weight01, weight23); - store_2 = hdot2::(store_2, rgb_pixel_2, weight01, weight23); - store_3 = hdot2::(store_3, rgb_pixel_3, weight01, weight23); + store_0 = hdot2(store_0, rgb_pixel_0, weight01, weight23); + store_1 = hdot2(store_1, rgb_pixel_1, weight01, weight23); + store_2 = hdot2(store_2, rgb_pixel_2, weight01, weight23); + store_3 = hdot2(store_3, rgb_pixel_3, weight01, weight23); jx += 4; } @@ -316,10 +308,10 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( let rgb_pixel_3 = _mm_loadu_si64(src3.get_unchecked((bounds_start * CHANNELS)..).as_ptr()); - store_0 = hdot::(store_0, rgb_pixel_0, weight01); - store_1 = hdot::(store_1, rgb_pixel_1, weight01); - store_2 = hdot::(store_2, rgb_pixel_2, weight01); - store_3 = hdot::(store_3, rgb_pixel_3, weight01); + store_0 = hdot(store_0, rgb_pixel_0, weight01); + store_1 = hdot(store_1, rgb_pixel_1, weight01); + store_2 = hdot(store_2, rgb_pixel_2, weight01); + store_3 = hdot(store_3, rgb_pixel_3, weight01); jx += 2; } @@ -331,30 +323,14 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_impl( let start_bounds = bounds.start + jx; - store_0 = convolve_horizontal_parts_one_rgba_sse::( - start_bounds, - src0, - weight0, - store_0, - ); - store_1 = convolve_horizontal_parts_one_rgba_sse::( - start_bounds, - src1, - weight0, - store_1, - ); - store_2 = convolve_horizontal_parts_one_rgba_sse::( - start_bounds, - src2, - weight0, - store_2, - ); - store_3 = convolve_horizontal_parts_one_rgba_sse::( - start_bounds, - src3, - weight0, - store_3, - ); + store_0 = + convolve_horizontal_parts_one_rgba_sse(start_bounds, src0, weight0, store_0); + store_1 = + convolve_horizontal_parts_one_rgba_sse(start_bounds, src1, weight0, store_1); + store_2 = + convolve_horizontal_parts_one_rgba_sse(start_bounds, src2, weight0, store_2); + store_3 = + 
convolve_horizontal_parts_one_rgba_sse(start_bounds, src3, weight0, store_3); jx += 1; } @@ -452,7 +428,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_one_impl( src.get_unchecked((start_bounds * CHANNELS + 16)..).as_ptr() as *const __m128i, ); - store = hdot4::( + store = hdot4( store, rgb_pixel_0, rgb_pixel_0_1, @@ -482,7 +458,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_one_impl( let rgb_pixel = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - store = hdot2::(store, rgb_pixel, weight01, weight23); + store = hdot2(store, rgb_pixel, weight01, weight23); jx += 4; } @@ -500,7 +476,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_one_impl( let rgb_pixel = _mm_loadu_si64(src_ptr.as_ptr()); - store = hdot::(store, rgb_pixel, weight01); + store = hdot(store, rgb_pixel, weight01); jx += 2; } @@ -511,8 +487,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_one_impl( let start_bounds = bounds.start + jx; - store = - convolve_horizontal_parts_one_rgba_sse::(start_bounds, src, weight0, store); + store = convolve_horizontal_parts_one_rgba_sse(start_bounds, src, weight0, store); jx += 1; } diff --git a/src/sse/vertical_u8_lp.rs b/src/sse/vertical_u8_lp.rs index 5deefdf..4236b58 100644 --- a/src/sse/vertical_u8_lp.rs +++ b/src/sse/vertical_u8_lp.rs @@ -46,24 +46,17 @@ pub(crate) fn convolve_vertical_sse_row_lp( } #[inline(always)] -unsafe fn mdot( +unsafe fn mdot( store0: __m128i, store1: __m128i, row: __m128i, weight: __m128i, ) -> (__m128i, __m128i) { - let zeros = _mm_setzero_si128(); - let lo = _mm_unpacklo_epi8(row, zeros); - let hi = _mm_unpackhi_epi8(row, zeros); - - let store0 = _mm_add_epi16( - store0, - _mm_mulhrs_epi16(_mm_slli_epi16::(lo), weight), - ); - let store1 = _mm_add_epi16( - store1, - _mm_mulhrs_epi16(_mm_slli_epi16::(hi), weight), - ); + let lo = _mm_unpacklo_epi8(row, row); + let hi = _mm_unpackhi_epi8(row, row); + + let store0 = _mm_add_epi16(store0, _mm_mulhrs_epi16(_mm_srli_epi16::<2>(lo), weight)); + let store1 = _mm_add_epi16(store1, _mm_mulhrs_epi16(_mm_srli_epi16::<2>(hi), weight)); (store0, store1) } @@ -76,8 +69,6 @@ unsafe fn convolve_vertical_sse_row_impl( src_stride: usize, weight: &[i16], ) { - let zeros = _mm_setzero_si128(); - let bounds_size = bounds.size; const SCALE: i32 = 6; const R_SHR_SCALE: i32 = SCALE; @@ -111,10 +102,10 @@ unsafe fn convolve_vertical_sse_row_impl( let item_row2 = _mm_loadu_si128(src_ptr.get_unchecked(32..).as_ptr() as *const __m128i); let item_row3 = _mm_loadu_si128(src_ptr.get_unchecked(48..).as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row0, v_weight); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight); - (store4, store5) = mdot::(store4, store5, item_row2, v_weight); - (store6, store7) = mdot::(store6, store7, item_row3, v_weight); + (store0, store1) = mdot(store0, store1, item_row0, v_weight); + (store2, store3) = mdot(store2, store3, item_row1, v_weight); + (store4, store5) = mdot(store4, store5, item_row2, v_weight); + (store6, store7) = mdot(store6, store7, item_row3, v_weight); } let rebased0 = _mm_srai_epi16::(store0); @@ -125,10 +116,12 @@ unsafe fn convolve_vertical_sse_row_impl( let rebased5 = _mm_srai_epi16::(store5); let rebased6 = _mm_srai_epi16::(store6); let rebased7 = _mm_srai_epi16::(store7); + let shrank0 = _mm_packus_epi16(rebased0, rebased1); let shrank1 = _mm_packus_epi16(rebased2, rebased3); let shrank2 = _mm_packus_epi16(rebased4, rebased5); let shrank3 = _mm_packus_epi16(rebased6, rebased7); + _mm_storeu_si128(dst.as_mut_ptr() as *mut __m128i, shrank0); 
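// What the new fixed-point path above computes, in scalar form. Two changes run
// through this patch: `_mm_mulhi_epi16` ((a*b) >> 16) is replaced by
// `_mm_mulhrs_epi16` ((a*b + (1 << 14)) >> 15, i.e. one bit less shift plus
// rounding), which is matched by the final normalization shift growing from
// SCALE - 1 to SCALE and becoming the arithmetic `_mm_srai_epi16`; and the
// widening step `_mm_unpacklo_epi8(row, row)` duplicates each byte (x * 257 per
// 16-bit lane) so that the following `>> 2` approximates the old zero-extend +
// `_mm_slli_epi16::<SCALE>` (x << 6) without needing a zero register. A sketch
// of one lane, mirroring the intrinsics (not the exact weight quantization):
fn mulhrs(a: i16, b: i16) -> i16 {
    (((a as i32) * (b as i32) + (1 << 14)) >> 15) as i16
}
fn vertical_lane_model(pixels: &[u8], weights: &[i16]) -> u8 {
    const SCALE: i32 = 6;
    let mut acc: i16 = 1 << (SCALE - 1); // ROUNDING
    for (&px, &w) in pixels.iter().zip(weights.iter()) {
        let widened = ((px as u16 * 257) >> 2) as i16; // unpacklo(row, row), then srli::<2>
        acc = acc.wrapping_add(mulhrs(widened, w)); // _mm_add_epi16 of _mm_mulhrs_epi16
    }
    (acc >> SCALE).clamp(0, 255) as u8 // _mm_srai_epi16::<SCALE>, then _mm_packus_epi16
}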
_mm_storeu_si128( dst.get_unchecked_mut(16..).as_mut_ptr() as *mut __m128i, @@ -166,8 +159,8 @@ unsafe fn convolve_vertical_sse_row_impl( let item_row0 = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); let item_row1 = _mm_loadu_si128(src_ptr.get_unchecked(16..).as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row0, v_weight); - (store2, store3) = mdot::(store2, store3, item_row1, v_weight); + (store0, store1) = mdot(store0, store1, item_row0, v_weight); + (store2, store3) = mdot(store2, store3, item_row1, v_weight); } let rebased0 = _mm_srai_epi16::(store0); @@ -205,9 +198,9 @@ unsafe fn convolve_vertical_sse_row_impl( let src_ptr1 = src.get_unchecked(v_offset1..); let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row0, v_weight0); + (store0, store1) = mdot(store0, store1, item_row0, v_weight0); let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row1, v_weight1); + (store0, store1) = mdot(store0, store1, item_row1, v_weight1); } else if bounds_size == 3 { let py = bounds.start; let weights = weight.get_unchecked(0..3); @@ -222,11 +215,11 @@ unsafe fn convolve_vertical_sse_row_impl( let src_ptr2 = src.get_unchecked(v_offset2..); let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row0, v_weight0); + (store0, store1) = mdot(store0, store1, item_row0, v_weight0); let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row1, v_weight1); + (store0, store1) = mdot(store0, store1, item_row1, v_weight1); let item_row2 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row2, v_weight2); + (store0, store1) = mdot(store0, store1, item_row2, v_weight2); } else if bounds_size == 4 { let py = bounds.start; let weights = weight.get_unchecked(0..4); @@ -244,13 +237,13 @@ unsafe fn convolve_vertical_sse_row_impl( let src_ptr3 = src.get_unchecked(v_offset3..); let item_row0 = _mm_loadu_si128(src_ptr0.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row0, v_weight0); + (store0, store1) = mdot(store0, store1, item_row0, v_weight0); let item_row1 = _mm_loadu_si128(src_ptr1.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row1, v_weight1); + (store0, store1) = mdot(store0, store1, item_row1, v_weight1); let item_row2 = _mm_loadu_si128(src_ptr2.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row2, v_weight2); + (store0, store1) = mdot(store0, store1, item_row2, v_weight2); let item_row3 = _mm_loadu_si128(src_ptr3.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row3, v_weight3); + (store0, store1) = mdot(store0, store1, item_row3, v_weight3); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -260,7 +253,7 @@ unsafe fn convolve_vertical_sse_row_impl( let src_ptr = src.get_unchecked(v_offset..); let item_row = _mm_loadu_si128(src_ptr.as_ptr() as *const __m128i); - (store0, store1) = mdot::(store0, store1, item_row, v_weight); + (store0, store1) = mdot(store0, store1, item_row, v_weight); } } @@ -280,108 +273,17 @@ unsafe fn convolve_vertical_sse_row_impl( let px = cx; - if bounds_size == 2 { - let py = bounds.start; - let weights = weight.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = 
_mm_set1_epi16(weights[1]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - - let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - } else if bounds_size == 3 { - let py = bounds.start; - let weights = weight.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..); - - let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - - let item_row2 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr2.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), - ); - } else if bounds_size == 4 { - let py = bounds.start; - let weights = weight.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_weight3 = _mm_set1_epi16(weights[3]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..); - let v_offset3 = src_stride * (py + 3) + px; - let src_ptr3 = src.get_unchecked(v_offset3..); - - let item_row0 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr0.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr1.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - - let item_row2 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr2.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), - ); - - let item_row3 = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr3.as_ptr()), zeros); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row3), v_weight3), - ); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..(j + 1)); - let v_weight = _mm_set1_epi16(weight[0]); - let v_offset = src_stride * py + px; - let src_ptr = src.get_unchecked(v_offset..); - let item_row = _mm_unpacklo_epi8(_mm_loadu_si64(src_ptr.as_ptr()), zeros); + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..(j + 1)); + let v_weight = _mm_set1_epi16(weight[0]); + let v_offset = src_stride * py + px; + let src_ptr = 
src.get_unchecked(v_offset..); + let mut item_row = _mm_loadu_si64(src_ptr.as_ptr()); + item_row = _mm_unpacklo_epi8(item_row, item_row); - let low = _mm_slli_epi16::(item_row); - store = _mm_add_epi16(store, _mm_mulhrs_epi16(low, v_weight)); - } + let low = _mm_srli_epi16::<2>(item_row); + store = _mm_add_epi16(store, _mm_mulhrs_epi16(low, v_weight)); } let rebased = _mm_srai_epi16::(store); @@ -399,113 +301,19 @@ unsafe fn convolve_vertical_sse_row_impl( let px = cx; - if bounds_size == 2 { - let py = bounds.start; - let weights = weight.get_unchecked(0..2); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..(v_offset0 + 1)); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..(v_offset1 + 1)); - - let item_row0 = _mm_set1_epi16(src_ptr0[0] as i16); - - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - } else if bounds_size == 3 { - let py = bounds.start; - let weights = weight.get_unchecked(0..3); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..(v_offset0 + 1)); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..(v_offset1 + 1)); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..(v_offset2 + 1)); - - let item_row0 = _mm_set1_epi16(src_ptr0[0] as i16); - - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - - let item_row2 = _mm_set1_epi16(src_ptr2[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), - ); - } else if bounds_size == 4 { - let py = bounds.start; - let weights = weight.get_unchecked(0..4); - let v_weight0 = _mm_set1_epi16(weights[0]); - let v_weight1 = _mm_set1_epi16(weights[1]); - let v_weight2 = _mm_set1_epi16(weights[2]); - let v_weight3 = _mm_set1_epi16(weights[3]); - let v_offset0 = src_stride * py + px; - let src_ptr0 = src.get_unchecked(v_offset0..(v_offset0 + 1)); - let v_offset1 = src_stride * (py + 1) + px; - let src_ptr1 = src.get_unchecked(v_offset1..(v_offset1 + 1)); - let v_offset2 = src_stride * (py + 2) + px; - let src_ptr2 = src.get_unchecked(v_offset2..(v_offset2 + 1)); - let v_offset3 = src_stride * (py + 3) + px; - let src_ptr3 = src.get_unchecked(v_offset3..(v_offset3 + 1)); - - let item_row0 = _mm_set1_epi16(src_ptr0[0] as i16); - - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row0), v_weight0), - ); - - let item_row1 = _mm_set1_epi16(src_ptr1[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row1), v_weight1), - ); - - let item_row2 = _mm_set1_epi16(src_ptr2[0] as i16); - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row2), v_weight2), - ); + for j in 0..bounds_size { + let py = bounds.start + j; + let weight = weight.get_unchecked(j..(j + 1)); + let v_weight = _mm_set1_epi16(weight[0]); + let 
v_offset = src_stride * py + px; + let src_ptr = src.get_unchecked(v_offset..(v_offset + 1)); + let mut item_row = _mm_set1_epi8(src_ptr[0] as i8); + item_row = _mm_unpacklo_epi8(item_row, item_row); - let item_row3 = _mm_set1_epi16(src_ptr3[0] as i16); store = _mm_add_epi16( store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row3), v_weight3), + _mm_mulhrs_epi16(_mm_srli_epi16::<2>(item_row), v_weight), ); - } else { - for j in 0..bounds_size { - let py = bounds.start + j; - let weight = weight.get_unchecked(j..(j + 1)); - let v_weight = _mm_set1_epi16(weight[0]); - let v_offset = src_stride * py + px; - let src_ptr = src.get_unchecked(v_offset..(v_offset + 1)); - let item_row = _mm_set1_epi16(src_ptr[0] as i16); - - store = _mm_add_epi16( - store, - _mm_mulhrs_epi16(_mm_slli_epi16::(item_row), v_weight), - ); - } } let rebased = _mm_srai_epi16::(store); From 642f00dde8771682e03eccfdbd6c6f28a17b0835 Mon Sep 17 00:00:00 2001 From: Radzivon Bartoshyk Date: Sat, 28 Dec 2024 13:52:11 +0000 Subject: [PATCH 9/9] Fuzzing fixes --- src/avx2/vertical_u8_lp.rs | 62 +++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/src/avx2/vertical_u8_lp.rs b/src/avx2/vertical_u8_lp.rs index 1ccf827..d560c5f 100644 --- a/src/avx2/vertical_u8_lp.rs +++ b/src/avx2/vertical_u8_lp.rs @@ -47,7 +47,7 @@ pub(crate) fn convolve_vertical_avx_row_lp( } #[inline(always)] -unsafe fn m256dot( +unsafe fn m256dot( store0: __m256i, store1: __m256i, row: __m256i, @@ -108,15 +108,15 @@ unsafe fn convolve_vertical_avx2_row_impl( let item_row1 = _mm256_loadu_si256(src_ptr0.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row0, v_weight0); - (store2, store3) = m256dot::(store2, store3, item_row1, v_weight0); + (store0, store1) = m256dot(store0, store1, item_row0, v_weight0); + (store2, store3) = m256dot(store2, store3, item_row1, v_weight0); let item_row10 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); let item_row11 = _mm256_loadu_si256(src_ptr1.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row10, v_weight1); - (store2, store3) = m256dot::(store2, store3, item_row11, v_weight1); + (store0, store1) = m256dot(store0, store1, item_row10, v_weight1); + (store2, store3) = m256dot(store2, store3, item_row11, v_weight1); } else if bounds_size == 3 { let py = bounds.start; let weights = weight.get_unchecked(0..3); @@ -134,22 +134,22 @@ unsafe fn convolve_vertical_avx2_row_impl( let item_row1 = _mm256_loadu_si256(src_ptr0.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row0, v_weight0); - (store2, store3) = m256dot::(store2, store3, item_row1, v_weight0); + (store0, store1) = m256dot(store0, store1, item_row0, v_weight0); + (store2, store3) = m256dot(store2, store3, item_row1, v_weight0); let item_row10 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); let item_row11 = _mm256_loadu_si256(src_ptr1.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row10, v_weight1); - (store2, store3) = m256dot::(store2, store3, item_row11, v_weight1); + (store0, store1) = m256dot(store0, store1, item_row10, v_weight1); + (store2, store3) = m256dot(store2, store3, item_row11, v_weight1); let item_row20 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i); let item_row21 = _mm256_loadu_si256(src_ptr2.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = 
m256dot::(store0, store1, item_row20, v_weight2); - (store2, store3) = m256dot::(store2, store3, item_row21, v_weight2); + (store0, store1) = m256dot(store0, store1, item_row20, v_weight2); + (store2, store3) = m256dot(store2, store3, item_row21, v_weight2); } else if bounds_size == 4 { let py = bounds.start; let weights = weight.get_unchecked(0..4); @@ -170,29 +170,29 @@ unsafe fn convolve_vertical_avx2_row_impl( let item_row1 = _mm256_loadu_si256(src_ptr0.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row0, v_weight0); - (store2, store3) = m256dot::(store2, store3, item_row1, v_weight0); + (store0, store1) = m256dot(store0, store1, item_row0, v_weight0); + (store2, store3) = m256dot(store2, store3, item_row1, v_weight0); let item_row10 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); let item_row11 = _mm256_loadu_si256(src_ptr1.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row10, v_weight1); - (store2, store3) = m256dot::(store2, store3, item_row11, v_weight1); + (store0, store1) = m256dot(store0, store1, item_row10, v_weight1); + (store2, store3) = m256dot(store2, store3, item_row11, v_weight1); let item_row20 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i); let item_row21 = _mm256_loadu_si256(src_ptr2.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row20, v_weight2); - (store2, store3) = m256dot::(store2, store3, item_row21, v_weight2); + (store0, store1) = m256dot(store0, store1, item_row20, v_weight2); + (store2, store3) = m256dot(store2, store3, item_row21, v_weight2); let item_row30 = _mm256_loadu_si256(src_ptr3.as_ptr() as *const __m256i); let item_row31 = _mm256_loadu_si256(src_ptr3.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row30, v_weight3); - (store2, store3) = m256dot::(store2, store3, item_row31, v_weight3); + (store0, store1) = m256dot(store0, store1, item_row30, v_weight3); + (store2, store3) = m256dot(store2, store3, item_row31, v_weight3); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -204,8 +204,8 @@ unsafe fn convolve_vertical_avx2_row_impl( let item_row1 = _mm256_loadu_si256(src_ptr.get_unchecked(32..).as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row0, v_weight); - (store2, store3) = m256dot::(store2, store3, item_row1, v_weight); + (store0, store1) = m256dot(store0, store1, item_row0, v_weight); + (store2, store3) = m256dot(store2, store3, item_row1, v_weight); } } @@ -246,10 +246,10 @@ unsafe fn convolve_vertical_avx2_row_impl( let src_ptr1 = src.get_unchecked(v_offset1..); let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row0, v_weight0); + (store0, store1) = m256dot(store0, store1, item_row0, v_weight0); let item_row1 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row1, v_weight1); + (store0, store1) = m256dot(store0, store1, item_row1, v_weight1); } else if bounds_size == 3 { let py = bounds.start; let weights = weight.get_unchecked(0..3); @@ -264,13 +264,13 @@ unsafe fn convolve_vertical_avx2_row_impl( let src_ptr2 = src.get_unchecked(v_offset2..); let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row0, v_weight0); + (store0, store1) = m256dot(store0, store1, 
item_row0, v_weight0); let item_row1 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row1, v_weight1); + (store0, store1) = m256dot(store0, store1, item_row1, v_weight1); let item_row2 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row2, v_weight2); + (store0, store1) = m256dot(store0, store1, item_row2, v_weight2); } else if bounds_size == 4 { let py = bounds.start; let weights = weight.get_unchecked(0..4); @@ -288,16 +288,16 @@ unsafe fn convolve_vertical_avx2_row_impl( let src_ptr3 = src.get_unchecked(v_offset3..); let item_row0 = _mm256_loadu_si256(src_ptr0.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row0, v_weight0); + (store0, store1) = m256dot(store0, store1, item_row0, v_weight0); let item_row1 = _mm256_loadu_si256(src_ptr1.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row1, v_weight1); + (store0, store1) = m256dot(store0, store1, item_row1, v_weight1); let item_row2 = _mm256_loadu_si256(src_ptr2.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row2, v_weight2); + (store0, store1) = m256dot(store0, store1, item_row2, v_weight2); let item_row3 = _mm256_loadu_si256(src_ptr3.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row3, v_weight3); + (store0, store1) = m256dot(store0, store1, item_row3, v_weight3); } else { for j in 0..bounds_size { let py = bounds.start + j; @@ -307,7 +307,7 @@ unsafe fn convolve_vertical_avx2_row_impl( let src_ptr = src.get_unchecked(v_offset..); let item_row0 = _mm256_loadu_si256(src_ptr.as_ptr() as *const __m256i); - (store0, store1) = m256dot::(store0, store1, item_row0, v_weight); + (store0, store1) = m256dot(store0, store1, item_row0, v_weight); } }