diff --git a/.github/workflows/build_push.yml b/.github/workflows/build_push.yml index 3e8fb85..e10a6e8 100644 --- a/.github/workflows/build_push.yml +++ b/.github/workflows/build_push.yml @@ -8,7 +8,6 @@ on: push: branches: - 'master' - - 'dev' - '!ci_test_*' tags-ignore: - '*' diff --git a/app/src/main.rs b/app/src/main.rs index 3b4cb82..6d8dcec 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -11,11 +11,8 @@ use fast_image_resize::{ }; use image::{EncodableLayout, GenericImageView, ImageReader}; use pic_scale::{ - Ar30ByteOrder, ImageSize, ImageStore, LinearApproxScaler, ResamplingFunction, Scaler, Scaling, - ScalingU16, ThreadingPolicy, -}; -use yuvutils_rs::{ - ar30_to_rgba8, ra30_to_rgba8, rgb8_to_ar30, rgba8_to_ar30, rgba8_to_ra30, Rgb30ByteOrder, + Ar30ByteOrder, ImageSize, ImageStore, LinearApproxScaler, LinearScaler, ResamplingFunction, + Scaler, Scaling, ScalingU16, ThreadingPolicy, }; fn resize_plane( @@ -46,7 +43,6 @@ fn resize_plane( .unwrap(); } - fn main() { // test_fast_image(); let img = ImageReader::open("./assets/nasa-4928x3279-rgba.png") @@ -57,10 +53,10 @@ fn main() { let transient = img.to_rgba8(); let mut bytes = Vec::from(transient.as_bytes()); - let mut scaler = Scaler::new(ResamplingFunction::Bilinear); + let mut scaler = LinearScaler::new(ResamplingFunction::Bilinear); scaler.set_threading_policy(ThreadingPolicy::Single); - resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear); + // resize_plane(378, 257, 257, 257, ResamplingFunction::Bilinear); // let mut choke: Vec = bytes.iter().map(|&x| (x as u16) << 2).collect(); // diff --git a/src/avx2/rgba_f16.rs b/src/avx2/rgba_f16.rs index e9d645d..85207bb 100644 --- a/src/avx2/rgba_f16.rs +++ b/src/avx2/rgba_f16.rs @@ -120,8 +120,8 @@ pub(crate) fn convolve_horizontal_rgba_avx_row_one_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { unsafe { if FMA { @@ -129,16 +129,16 @@ pub(crate) fn convolve_horizontal_rgba_avx_row_one_f16( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } else { convolve_horizontal_rgba_avx_row_one_f16_regular( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } } @@ -150,15 +150,15 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { convolve_horizontal_rgba_avx_row_one_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -168,15 +168,15 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { convolve_horizontal_rgba_avx_row_one_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -185,8 +185,8 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { const CHANNELS: usize = 4; let mut filter_offset = 0usize; @@ -203,7 +203,7 @@ unsafe fn 
convolve_horizontal_rgba_avx_row_one_f16_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_8_rgba_f16::( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, weight2, @@ -219,7 +219,7 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_4_rgba_f16::( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, store, @@ -235,7 +235,7 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_2_rgba_f16::( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight, store, ); @@ -248,7 +248,7 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_impl( let filter_start = jx + bounds.start; store = convolve_horizontal_parts_one_rgba_f16::( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, store, ); @@ -256,7 +256,7 @@ unsafe fn convolve_horizontal_rgba_avx_row_one_f16_impl( } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); let converted_f16 = _mm_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(_mm_add_ps( _mm256_castps256_ps128(store), _mm256_extractf128_ps::<1>(store), @@ -275,9 +275,9 @@ pub(crate) fn convolve_horizontal_rgba_avx_rows_4_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { unsafe { @@ -286,9 +286,9 @@ pub(crate) fn convolve_horizontal_rgba_avx_rows_4_f16( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } else { @@ -296,9 +296,9 @@ pub(crate) fn convolve_horizontal_rgba_avx_rows_4_f16( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -311,18 +311,18 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { convolve_horizontal_rgba_avx_rows_4_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -333,18 +333,18 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { convolve_horizontal_rgba_avx_rows_4_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -354,9 +354,9 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { const CHANNELS: usize = 4; @@ -379,7 +379,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( store_0 = convolve_horizontal_parts_8_rgba_f16::( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, weight2, @@ -388,7 +388,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( ); 
store_1 = convolve_horizontal_parts_8_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..).as_ptr(), weight0, weight1, weight2, @@ -397,7 +397,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( ); store_2 = convolve_horizontal_parts_8_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight0, weight1, weight2, @@ -406,7 +406,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( ); store_3 = convolve_horizontal_parts_8_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight0, weight1, weight2, @@ -423,28 +423,28 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( store_0 = convolve_horizontal_parts_4_rgba_f16::( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, store_0, ); store_1 = convolve_horizontal_parts_4_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..).as_ptr(), weight0, weight1, store_1, ); store_2 = convolve_horizontal_parts_4_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight0, weight1, store_2, ); store_3 = convolve_horizontal_parts_4_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight0, weight1, store_3, @@ -460,25 +460,25 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( let filter_start = jx + bounds.start; store_0 = convolve_horizontal_parts_2_rgba_f16::( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight, store_0, ); store_1 = convolve_horizontal_parts_2_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..).as_ptr(), weight, store_1, ); store_2 = convolve_horizontal_parts_2_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight, store_2, ); store_3 = convolve_horizontal_parts_2_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight, store_3, ); @@ -491,25 +491,25 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( let weight0 = _mm256_set1_ps(ptr.read_unaligned()); store_0 = convolve_horizontal_parts_one_rgba_f16::( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, store_0, ); store_1 = convolve_horizontal_parts_one_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..).as_ptr(), weight0, store_1, ); store_2 = convolve_horizontal_parts_one_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight0, store_2, ); store_3 = convolve_horizontal_parts_one_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight0, store_3, ); @@ -517,7 +517,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); let converted_f16_0 = _mm_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(_mm_add_ps( _mm256_castps256_ps128(store_0), _mm256_extractf128_ps::<1>(store_0), @@ -528,7 +528,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( 8, ); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr = dst.get_unchecked_mut(px + 
dst_stride..).as_mut_ptr(); let converted_f16_1 = _mm_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(_mm_add_ps( _mm256_castps256_ps128(store_1), _mm256_extractf128_ps::<1>(store_1), @@ -539,7 +539,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( 8, ); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); let converted_f16_2 = _mm_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(_mm_add_ps( _mm256_castps256_ps128(store_2), _mm256_extractf128_ps::<1>(store_2), @@ -550,7 +550,7 @@ unsafe fn convolve_horizontal_rgba_avx_rows_4_f16_impl( 8, ); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); let converted_f16_3 = _mm_cvtps_ph::<_MM_FROUND_TO_NEAREST_INT>(_mm_add_ps( _mm256_castps256_ps128(store_3), _mm256_extractf128_ps::<1>(store_3), diff --git a/src/avx2/vertical_f16.rs b/src/avx2/vertical_f16.rs index 63b2871..bfe1436 100644 --- a/src/avx2/vertical_f16.rs +++ b/src/avx2/vertical_f16.rs @@ -37,9 +37,9 @@ use std::arch::x86_64::*; unsafe fn convolve_vertical_part_avx_f16( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -51,7 +51,7 @@ unsafe fn convolve_vertical_part_avx_f16( let py = start_y + j; let weight = *filter.get_unchecked(j); let v_weight = _mm256_set1_ps(weight); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py..).as_ptr(); let s_ptr = src_ptr.add(px); let item_row_0 = _mm256_set1_epi16(s_ptr.read_unaligned().to_bits() as i16); @@ -63,7 +63,7 @@ unsafe fn convolve_vertical_part_avx_f16( ); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT; @@ -76,9 +76,9 @@ unsafe fn convolve_vertical_part_avx_f16( unsafe fn convolve_vertical_part_avx_4_f16( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -90,7 +90,7 @@ unsafe fn convolve_vertical_part_avx_4_f16( let py = start_y + j; let weight = *filter.get_unchecked(j); let v_weight = _mm256_set1_ps(weight); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py..).as_ptr(); let s_ptr = src_ptr.add(px); let item_row_0 = _mm_loadu_si64(s_ptr as *const u8); @@ -100,7 +100,7 @@ unsafe fn convolve_vertical_part_avx_4_f16( const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT; - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); let acc = _mm256_cvtps_ph::(store_0); std::ptr::copy_nonoverlapping(&acc as *const _ as *const u8, dst_ptr as *mut u8, 8); } @@ -109,9 +109,9 @@ unsafe fn convolve_vertical_part_avx_4_f16( unsafe fn convolve_vertical_part_avx_32_f16( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -126,7 +126,7 @@ unsafe fn convolve_vertical_part_avx_32_f16( let py = start_y + j; let weight = *filter.get_unchecked(j); let v_weight = _mm256_set1_ps(weight); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py..).as_ptr(); let s_ptr = src_ptr.add(px); let item_row_0 = _mm256_loadu_si256(s_ptr as *const 
__m256i); @@ -143,7 +143,7 @@ unsafe fn convolve_vertical_part_avx_32_f16( store_3 = _mm256_fma_ps::(store_3, items3, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT; @@ -164,9 +164,9 @@ unsafe fn convolve_vertical_part_avx_32_f16( unsafe fn convolve_vertical_part_avx_16_f16( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -179,7 +179,7 @@ unsafe fn convolve_vertical_part_avx_16_f16( let py = start_y + j; let weight = *filter.get_unchecked(j); let v_weight = _mm256_set1_ps(weight); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py..).as_ptr(); let s_ptr = src_ptr.add(px); let item_row = _mm256_loadu_si256(s_ptr as *const __m256i); @@ -193,7 +193,7 @@ unsafe fn convolve_vertical_part_avx_16_f16( const ROUNDING_FLAGS: i32 = _MM_FROUND_TO_NEAREST_INT; - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); let acc0 = avx_combine_epi( _mm256_cvtps_ph::(store_0), _mm256_cvtps_ph::(store_1), @@ -204,29 +204,19 @@ unsafe fn convolve_vertical_part_avx_16_f16( pub(crate) fn convolve_vertical_avx_row_f16( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { unsafe { if FMA { convolve_vertical_avx_row_f16_fma::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } else { convolve_vertical_avx_row_f16_regular::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } } @@ -237,18 +227,13 @@ pub(crate) fn convolve_vertical_avx_row_f16( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { convolve_vertical_avx_row_f16_impl::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } @@ -257,18 +242,13 @@ unsafe fn convolve_vertical_avx_row_f16_regular( unsafe fn convolve_vertical_avx_row_f16_fma( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { convolve_vertical_avx_row_f16_impl::( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } @@ -276,8 +256,8 @@ unsafe fn convolve_vertical_avx_row_f16_fma( pub(crate) fn convolve_vertical_avx_row_f16_impl( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -289,9 +269,9 @@ pub(crate) fn convolve_vertical_avx_row_f16_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -305,9 +285,9 @@ pub(crate) fn convolve_vertical_avx_row_f16_impl( bounds.start, cx, - 
unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -321,9 +301,9 @@ pub(crate) fn convolve_vertical_avx_row_f16_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -337,9 +317,9 @@ pub(crate) fn convolve_vertical_avx_row_f16_impl( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); diff --git a/src/convolve_naive_u16.rs b/src/convolve_naive_u16.rs deleted file mode 100644 index 1a2f849..0000000 --- a/src/convolve_naive_u16.rs +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Copyright (c) Radzivon Bartoshyk. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, - * are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ -use crate::filter_weights::FilterBounds; -use crate::floating_point_vertical::{ - convolve_column_handler_floating_point, convolve_column_handler_floating_point_4, -}; - -#[allow(dead_code)] -pub(crate) fn convolve_vertical_rgb_native_row_u16( - dst_width: usize, - bounds: &FilterBounds, - src: &[u16], - dst: &mut [u16], - src_stride: usize, - weight: &[f32], - bit_depth: usize, -) { - let mut cx = 0usize; - - while cx + 4 < dst_width { - convolve_column_handler_floating_point_4::( - src, - src_stride, - dst, - weight, - bounds, - bit_depth as u32, - cx, - ); - - cx += 4; - } - - while cx < dst_width { - convolve_column_handler_floating_point::( - src, - src_stride, - dst, - weight, - bounds, - bit_depth as u32, - cx, - ); - - cx += 1; - } -} diff --git a/src/dispatch_group_f16.rs b/src/dispatch_group_f16.rs index 8c162c0..dd811b4 100644 --- a/src/dispatch_group_f16.rs +++ b/src/dispatch_group_f16.rs @@ -28,71 +28,52 @@ */ use crate::filter_weights::{FilterBounds, FilterWeights}; -use crate::unsafe_slice::UnsafeSlice; use crate::ImageStore; use half::f16; +use rayon::iter::{IndexedParallelIterator, ParallelIterator}; +use rayon::prelude::{ParallelSlice, ParallelSliceMut}; use rayon::ThreadPool; -use std::sync::Arc; pub(crate) fn convolve_vertical_dispatch_f16( image_store: &ImageStore, filter_weights: FilterWeights, destination: &mut ImageStore, pool: &Option, - dispatcher: fn(usize, &FilterBounds, *const f16, *mut f16, usize, &[f32]), + dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]), ) { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let src_stride = image_store.width * image_store.channels; - - let mut filter_offset = 0usize; - let dst_stride = destination.width * image_store.channels; + let dst_width = destination.width; if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let bounds = unsafe { weights.bounds.get_unchecked(y) }; - let weight_ptr = - unsafe { weights.weights.get_unchecked((weights.aligned_size * y)..) }; - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - dispatcher( - dst_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); + pool.install(|| { + destination + .buffer + .borrow_mut() + .par_chunks_exact_mut(dst_stride) + .enumerate() + .for_each(|(y, row)| { + let bounds = filter_weights.bounds[y]; + let filter_offset = y * filter_weights.aligned_size; + let weights = &filter_weights.weights[filter_offset..]; + let source_buffer = image_store.buffer.borrow(); + dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); }); - } }); } else { - for y in 0..destination.height { - let bounds = unsafe { filter_weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { filter_weights.weights.get_unchecked(filter_offset..) 
}; - - dispatcher( - dst_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - - filter_offset += filter_weights.aligned_size; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } + destination + .buffer + .borrow_mut() + .chunks_exact_mut(dst_stride) + .enumerate() + .for_each(|(y, row)| { + let bounds = filter_weights.bounds[y]; + let filter_offset = y * filter_weights.aligned_size; + let weights = &filter_weights.weights[filter_offset..]; + let source_buffer = image_store.buffer.borrow(); + dispatcher(dst_width, &bounds, source_buffer, row, src_stride, weights); + }); } } @@ -102,95 +83,120 @@ pub(crate) fn convolve_horizontal_dispatch_f16( destination: &mut ImageStore, pool: &Option, dispatcher_4_rows: Option< - fn(usize, usize, &FilterWeights, *const f16, usize, *mut f16, usize), + fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), >, - dispatcher_row: fn(usize, usize, &FilterWeights, *const f16, *mut f16), + dispatcher_row: fn(usize, usize, &FilterWeights, &[f16], &mut [f16]), ) { - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let src_stride = image_store.width * image_store.channels; let dst_stride = destination.width * image_store.channels; let dst_width = destination.width; let src_width = image_store.width; if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - let mut yy = 0usize; + pool.install(|| { + let mut processed_4 = false; + if let Some(dispatcher) = dispatcher_4_rows { - for y in (0..destination.height.saturating_sub(4)).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; + image_store + .buffer + .borrow() + .par_chunks_exact(src_stride * 4) + .zip( + destination + .buffer + .borrow_mut() + .par_chunks_exact_mut(dst_stride * 4), + ) + .for_each(|(src, dst)| { dispatcher( dst_width, src_width, - &weights, - unsafe_source_ptr_0, + &filter_weights, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); }); - yy = y; - } + processed_4 = true; } - for y in yy..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - dispatcher_row( - dst_width, - src_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); + + let left_src_rows = if processed_4 { + image_store + .buffer + .borrow() + .chunks_exact(src_stride * 4) + .remainder() + } else { + image_store.buffer.borrow() + }; + let left_dst_rows = if processed_4 { + destination + .buffer + .borrow_mut() + .chunks_exact_mut(dst_stride * 4) + .into_remainder() + } else { + destination.buffer.borrow_mut() + }; + + left_src_rows + .par_chunks_exact(src_stride) + .zip(left_dst_rows.par_chunks_exact_mut(dst_stride)) + .for_each(|(src, dst)| { + dispatcher_row(dst_width, src_width, &filter_weights, src, dst); }); - } }); } else { - let mut yy = 0usize; - + let mut processed_4 = false; if let 
Some(dispatcher) = dispatcher_4_rows { - while yy + 4 < destination.height { + for (src, dst) in image_store + .buffer + .borrow() + .chunks_exact(src_stride * 4) + .zip( + destination + .buffer + .borrow_mut() + .chunks_exact_mut(dst_stride * 4), + ) + { dispatcher( dst_width, src_width, &filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; - - yy += 4; } + processed_4 = true; } - for _ in yy..destination.height { - dispatcher_row( - dst_width, - src_width, - &filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; + let left_src_rows = if processed_4 { + image_store + .buffer + .borrow() + .chunks_exact(src_stride * 4) + .remainder() + } else { + image_store.buffer.borrow() + }; + let left_dst_rows = if processed_4 { + destination + .buffer + .borrow_mut() + .chunks_exact_mut(dst_stride * 4) + .into_remainder() + } else { + destination.buffer.borrow_mut() + }; + for (src, dst) in left_src_rows + .chunks_exact(src_stride) + .zip(left_dst_rows.chunks_exact_mut(dst_stride)) + { + dispatcher_row(dst_width, src_width, &filter_weights, src, dst); } } } diff --git a/src/f16.rs b/src/f16.rs index 5d74f8b..3e5c23c 100644 --- a/src/f16.rs +++ b/src/f16.rs @@ -35,13 +35,14 @@ use crate::avx2::{ convolve_vertical_avx_row_f16, }; use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; -use crate::convolve_naive_f32::{ - convolve_horizontal_rgb_native_row, convolve_horizontal_rgba_4_row_f32, -}; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::cpu_features::{is_aarch_f16_supported, is_aarch_f16c_supported}; use crate::dispatch_group_f16::{convolve_horizontal_dispatch_f16, convolve_vertical_dispatch_f16}; use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::floating_point_horizontal::{ + convolve_row_handler_floating_point, convolve_row_handler_floating_point_4, +}; +use crate::floating_point_vertical::column_handler_floating_point; #[cfg(all(target_arch = "aarch64", target_feature = "neon",))] use crate::neon::{ convolve_horizontal_rgb_neon_row_one_f16, convolve_horizontal_rgb_neon_rows_4_f16, @@ -54,7 +55,6 @@ use crate::neon::{ xconvolve_horizontal_rgba_neon_row_one_f16, xconvolve_horizontal_rgba_neon_rows_4_f16, xconvolve_vertical_rgb_neon_row_f16, }; -use crate::rgb_f32::convolve_vertical_rgb_native_row_f32; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] use crate::sse::{ convolve_horizontal_rgb_sse_row_one_f16, convolve_horizontal_rgb_sse_rows_4_f16, @@ -65,6 +65,35 @@ use crate::ImageStore; use half::f16; use rayon::ThreadPool; +fn convolve_horizontal_rgba_4_row_f16( + _: usize, + _: usize, + filter_weights: &FilterWeights, + src: &[f16], + src_stride: usize, + dst: &mut [f16], + dst_stride: usize, +) { + convolve_row_handler_floating_point_4::( + src, + src_stride, + dst, + dst_stride, + filter_weights, + 8, + ) +} + +fn convolve_horizontal_rgb_native_row_f16( + _: usize, + _: usize, + filter_weights: &FilterWeights, + src: &[f16], + dst: &mut [f16], +) { + convolve_row_handler_floating_point::(src, dst, filter_weights, 8) +} + impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 4> { fn convolve_horizontal( &self, @@ -73,10 +102,10 
@@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 4> { pool: &Option, ) { let mut _dispatcher_4_rows: Option< - fn(usize, usize, &FilterWeights, *const f16, usize, *mut f16, usize), - > = Some(convolve_horizontal_rgba_4_row_f32::); - let mut _dispatcher_row: fn(usize, usize, &FilterWeights, *const f16, *mut f16) = - convolve_horizontal_rgb_native_row::; + fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), + > = Some(convolve_horizontal_rgba_4_row_f16::<4>); + let mut _dispatcher_row: fn(usize, usize, &FilterWeights, &[f16], &mut [f16]) = + convolve_horizontal_rgb_native_row_f16::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { if is_aarch_f16c_supported() { @@ -126,6 +155,17 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 4> { } } +fn convolve_vertical_rgb_native_row_f16( + _: usize, + bounds: &FilterBounds, + src: &[f16], + dst: &mut [f16], + src_stride: usize, + weight: &[f32], +) { + column_handler_floating_point::(bounds, src, dst, src_stride, weight, 8); +} + impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 4> { fn convolve_vertical( &self, @@ -133,8 +173,8 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 4> { destination: &mut ImageStore, pool: &Option, ) { - let mut _dispatcher: fn(usize, &FilterBounds, *const f16, *mut f16, usize, &[f32]) = - convolve_vertical_rgb_native_row_f32::; + let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = + convolve_vertical_rgb_native_row_f16::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { if is_aarch_f16c_supported() { @@ -178,10 +218,10 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 3> { pool: &Option, ) { let mut _dispatcher_4_rows: Option< - fn(usize, usize, &FilterWeights, *const f16, usize, *mut f16, usize), - > = Some(convolve_horizontal_rgba_4_row_f32::); - let mut _dispatcher_row: fn(usize, usize, &FilterWeights, *const f16, *mut f16) = - convolve_horizontal_rgb_native_row::; + fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), + > = Some(convolve_horizontal_rgba_4_row_f16::<3>); + let mut _dispatcher_row: fn(usize, usize, &FilterWeights, &[f16], &mut [f16]) = + convolve_horizontal_rgb_native_row_f16::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { if is_aarch_f16c_supported() { @@ -229,8 +269,8 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f16, 3> { destination: &mut ImageStore, pool: &Option, ) { - let mut _dispatcher: fn(usize, &FilterBounds, *const f16, *mut f16, usize, &[f32]) = - convolve_vertical_rgb_native_row_f32::; + let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) = + convolve_vertical_rgb_native_row_f16::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { if is_aarch_f16c_supported() { @@ -274,10 +314,10 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f16, 1> { pool: &Option, ) { let _dispatcher_4_rows: Option< - fn(usize, usize, &FilterWeights, *const f16, usize, *mut f16, usize), - > = Some(convolve_horizontal_rgba_4_row_f32::); - let _dispatcher_row: fn(usize, usize, &FilterWeights, *const f16, *mut f16) = - convolve_horizontal_rgb_native_row::; + fn(usize, usize, &FilterWeights, &[f16], usize, &mut [f16], usize), + > = Some(convolve_horizontal_rgba_4_row_f16::<1>); + let _dispatcher_row: fn(usize, usize, &FilterWeights, &[f16], &mut [f16]) = + convolve_horizontal_rgb_native_row_f16::<1>; convolve_horizontal_dispatch_f16( self, filter_weights, @@ -296,8 +336,8 @@ impl<'a> 
VerticalConvolutionPass for ImageStore<'a, f16, 1> {
         destination: &mut ImageStore,
         pool: &Option<ThreadPool>,
     ) {
-        let mut _dispatcher: fn(usize, &FilterBounds, *const f16, *mut f16, usize, &[f32]) =
-            convolve_vertical_rgb_native_row_f32::<f16, 1>;
+        let mut _dispatcher: fn(usize, &FilterBounds, &[f16], &mut [f16], usize, &[f32]) =
+            convolve_vertical_rgb_native_row_f16::<1>;
         #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
         {
             if is_aarch_f16c_supported() {
diff --git a/src/lib.rs b/src/lib.rs
index 929b771..de47fb5 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -43,7 +43,6 @@ mod color_group;
 mod colors;
 mod convolution;
 mod convolve_naive_f32;
-mod convolve_naive_u16;
 mod cpu_features;
 mod dispatch_group_ar30;
 #[cfg(feature = "half")]
diff --git a/src/mixed_storage.rs b/src/mixed_storage.rs
index 594b8c4..1cc63ba 100644
--- a/src/mixed_storage.rs
+++ b/src/mixed_storage.rs
@@ -55,6 +55,15 @@ impl MixedStorage for f32 {
     }
 }
 
+#[cfg(feature = "half")]
+impl MixedStorage<half::f16> for f32 {
+    #[inline(always)]
+    #[allow(clippy::manual_clamp)]
+    fn to_mixed(self, _: u32) -> half::f16 {
+        half::f16::from_f32(self)
+    }
+}
+
 impl MixedStorage for f64 {
     #[inline(always)]
     #[allow(clippy::manual_clamp)]
diff --git a/src/neon/alpha_f16.rs b/src/neon/alpha_f16.rs
index 20a320c..71bd739 100644
--- a/src/neon/alpha_f16.rs
+++ b/src/neon/alpha_f16.rs
@@ -90,7 +90,7 @@ unsafe fn neon_premultiply_alpha_rgba_row_f16(dst: &mut [half::f16], src: &[half
     premultiply_pixel_f16_row(rem, src_rem);
 }
 
-pub fn neon_premultiply_alpha_rgba_f16(
+pub(crate) fn neon_premultiply_alpha_rgba_f16(
     dst: &mut [half::f16],
     src: &[half::f16],
     width: usize,
@@ -197,7 +197,7 @@ unsafe fn neon_unpremultiply_alpha_rgba_row_f16(in_place: &mut [half::f16]) {
     unpremultiply_pixel_f16_row(rem);
 }
 
-pub fn neon_unpremultiply_alpha_rgba_f16(
+pub(crate) fn neon_unpremultiply_alpha_rgba_f16(
     in_place: &mut [half::f16],
     width: usize,
     _: usize,
diff --git a/src/neon/alpha_f16_full.rs b/src/neon/alpha_f16_full.rs
index c3c16ec..0a2f1c4 100644
--- a/src/neon/alpha_f16_full.rs
+++ b/src/neon/alpha_f16_full.rs
@@ -67,7 +67,7 @@ unsafe fn neon_premultiply_alpha_rgba_row_f16_full(dst: &mut [half::f16], src: &
     premultiply_pixel_f16_row(rem, src_rem);
 }
 
-pub fn neon_premultiply_alpha_rgba_f16_full(
+pub(crate) fn neon_premultiply_alpha_rgba_f16_full(
     dst: &mut [half::f16],
     src: &[half::f16],
     width: usize,
@@ -135,7 +135,7 @@ unsafe fn neon_unpremultiply_alpha_rgba_f16_row_full(in_place: &mut [half::f16])
     unpremultiply_pixel_f16_row(rem);
 }
 
-pub fn neon_unpremultiply_alpha_rgba_f16_full(
+pub(crate) fn neon_unpremultiply_alpha_rgba_f16_full(
     in_place: &mut [half::f16],
     width: usize,
     _: usize,
diff --git a/src/neon/alpha_f32.rs b/src/neon/alpha_f32.rs
index d280c19..dbcfe7d 100644
--- a/src/neon/alpha_f32.rs
+++ b/src/neon/alpha_f32.rs
@@ -64,7 +64,7 @@ unsafe fn neon_premultiply_alpha_rgba_row_f32(dst: &mut [f32], src: &[f32]) {
     premultiply_pixel_f32_row(rem, src_rem);
 }
 
-pub fn neon_premultiply_alpha_rgba_f32(
+pub(crate) fn neon_premultiply_alpha_rgba_f32(
     dst: &mut [f32],
     src: &[f32],
     width: usize,
@@ -108,7 +108,7 @@ unsafe fn neon_unpremultiply_alpha_rgba_f32_row(in_place: &mut [f32]) {
     unpremultiply_pixel_f32_row(rem);
 }
 
-pub fn neon_unpremultiply_alpha_rgba_f32(
+pub(crate) fn neon_unpremultiply_alpha_rgba_f32(
     in_place: &mut [f32],
     width: usize,
     _: usize,
diff --git a/src/neon/alpha_u16.rs b/src/neon/alpha_u16.rs
index dbdee0e..bc7048f 100644
--- a/src/neon/alpha_u16.rs
+++ b/src/neon/alpha_u16.rs
@@ -161,7 +161,7 @@ pub fn 
neon_premultiply_alpha_rgba_row_u16(dst: &mut [u16], src: &[u16], bit_dep premultiply_alpha_rgba_row(rem, src_rem, max_colors); } -pub fn neon_premultiply_alpha_rgba_u16( +pub(crate) fn neon_premultiply_alpha_rgba_u16( dst: &mut [u16], src: &[u16], width: usize, @@ -252,7 +252,7 @@ fn neon_unpremultiply_alpha_rgba_row_u16(in_place: &mut [u16], bit_depth: usize) unpremultiply_alpha_rgba_row(rem, max_colors); } -pub fn neon_unpremultiply_alpha_rgba_u16( +pub(crate) fn neon_unpremultiply_alpha_rgba_u16( in_place: &mut [u16], width: usize, _: usize, diff --git a/src/neon/alpha_u8.rs b/src/neon/alpha_u8.rs index c021518..c296a63 100644 --- a/src/neon/alpha_u8.rs +++ b/src/neon/alpha_u8.rs @@ -134,7 +134,7 @@ unsafe fn neon_premultiply_alpha_rgba_impl_row(dst: &mut [u8], src: &[u8]) { premultiply_alpha_rgba_row_impl(rem, src_rem); } -pub fn neon_premultiply_alpha_rgba( +pub(crate) fn neon_premultiply_alpha_rgba( dst: &mut [u8], src: &[u8], width: usize, @@ -178,7 +178,7 @@ unsafe fn neon_unpremultiply_alpha_rgba_impl_row(in_place: &mut [u8]) { unpremultiply_alpha_rgba_row_impl(rem); } -pub fn neon_unpremultiply_alpha_rgba( +pub(crate) fn neon_unpremultiply_alpha_rgba( in_place: &mut [u8], width: usize, _: usize, diff --git a/src/neon/convolve_f16.rs b/src/neon/convolve_f16.rs index e7276ee..8d0ada8 100644 --- a/src/neon/convolve_f16.rs +++ b/src/neon/convolve_f16.rs @@ -36,9 +36,9 @@ use crate::neon::*; pub(crate) unsafe fn convolve_vertical_part_neon_8_f16( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, blend_length: usize, @@ -52,7 +52,7 @@ pub(crate) unsafe fn convolve_vertical_part_neon_8_f16 let py = start_y + j; let weight = filter.get_unchecked(j..); let v_weight = vld1q_dup_f32(weight.as_ptr()); - let src_ptr = src.add(src_stride * py); + let src_ptr = src.get_unchecked(src_stride * py..).as_ptr(); let s_ptr = src_ptr.add(px); let item_row = if USE_BLENDING { @@ -72,7 +72,7 @@ pub(crate) unsafe fn convolve_vertical_part_neon_8_f16 let item = xcombine_f16(xvcvt_f16_f32(store_0), xvcvt_f16_f32(store_1)); - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); if USE_BLENDING { let mut transient: [half::f16; 8] = [half::f16::from_f32(0.); 8]; xvstq_f16(transient.as_mut_ptr(), item); diff --git a/src/neon/f16_utils.rs b/src/neon/f16_utils.rs index 6a23a70..0a94d05 100644 --- a/src/neon/f16_utils.rs +++ b/src/neon/f16_utils.rs @@ -33,23 +33,56 @@ use std::arch::asm; /// Provides basic support for f16 +#[allow(unused)] +macro_rules! static_assert { + ($e:expr) => { + const { + assert!($e); + } + }; + ($e:expr, $msg:expr) => { + const { + assert!($e, $msg); + } + }; +} + +#[allow(unused_macros)] +macro_rules! 
static_assert_uimm_bits { + ($imm:ident, $bits:expr) => { + // `0 <= $imm` produces a warning if the immediate has an unsigned type + #[allow(unused_comparisons)] + { + static_assert!( + 0 <= $imm && $imm < (1 << $bits), + concat!( + stringify!($imm), + " doesn't fit in ", + stringify!($bits), + " bits", + ) + ) + } + }; +} + #[derive(Debug, Clone, Copy)] #[allow(non_camel_case_types)] #[allow(dead_code)] -pub struct x_float16x4_t(pub(crate) uint16x4_t); +pub(crate) struct x_float16x4_t(pub(crate) uint16x4_t); #[derive(Debug, Clone, Copy)] #[allow(non_camel_case_types)] #[allow(dead_code)] -pub struct x_float16x8_t(pub(crate) uint16x8_t); +pub(crate) struct x_float16x8_t(pub(crate) uint16x8_t); #[derive(Debug, Clone, Copy)] #[allow(non_camel_case_types)] -pub struct x_float16x8x2_t(pub(crate) x_float16x8_t, pub(crate) x_float16x8_t); +pub(crate) struct x_float16x8x2_t(pub(crate) x_float16x8_t, pub(crate) x_float16x8_t); #[derive(Debug, Clone, Copy)] #[allow(non_camel_case_types)] -pub struct x_float16x8x4_t( +pub(crate) struct x_float16x8x4_t( pub(crate) x_float16x8_t, pub(crate) x_float16x8_t, pub(crate) x_float16x8_t, @@ -57,41 +90,49 @@ pub struct x_float16x8x4_t( ); #[inline] -pub unsafe fn xvld_f16(ptr: *const half::f16) -> x_float16x4_t { +pub(crate) unsafe fn xvld_f16(ptr: *const half::f16) -> x_float16x4_t { let store: uint16x4_t = vld1_u16(std::mem::transmute(ptr)); std::mem::transmute(store) } #[inline] -pub unsafe fn xvldq_f16(ptr: *const half::f16) -> x_float16x8_t { +pub(crate) unsafe fn xvldq_f16(ptr: *const half::f16) -> x_float16x8_t { let store: uint16x8_t = vld1q_u16(std::mem::transmute(ptr)); std::mem::transmute(store) } #[inline] -pub unsafe fn xvldq_f16_x2(ptr: *const half::f16) -> x_float16x8x2_t { - let store = vld1q_u16_x2(std::mem::transmute(ptr)); - std::mem::transmute(store) +pub(crate) unsafe fn xvldq_f16_x2(ptr: *const half::f16) -> x_float16x8x2_t { + let ptr_u16 = ptr as *const u16; + x_float16x8x2_t( + xreinterpretq_f16_u16(vld1q_u16(ptr_u16)), + xreinterpretq_f16_u16(vld1q_u16(ptr_u16.add(8))), + ) } #[inline] -pub unsafe fn xvldq_f16_x4(ptr: *const half::f16) -> x_float16x8x4_t { - let store = vld1q_u16_x4(std::mem::transmute(ptr)); - std::mem::transmute(store) +pub(crate) unsafe fn xvldq_f16_x4(ptr: *const half::f16) -> x_float16x8x4_t { + let ptr_u16 = ptr as *const u16; + x_float16x8x4_t( + xreinterpretq_f16_u16(vld1q_u16(ptr_u16)), + xreinterpretq_f16_u16(vld1q_u16(ptr_u16.add(8))), + xreinterpretq_f16_u16(vld1q_u16(ptr_u16.add(16))), + xreinterpretq_f16_u16(vld1q_u16(ptr_u16.add(24))), + ) } #[inline] -pub unsafe fn xvget_low_f16(x: x_float16x8_t) -> x_float16x4_t { +pub(crate) unsafe fn xvget_low_f16(x: x_float16x8_t) -> x_float16x4_t { std::mem::transmute(vget_low_u16(std::mem::transmute(x))) } #[inline] -pub unsafe fn xvget_high_f16(x: x_float16x8_t) -> x_float16x4_t { +pub(crate) unsafe fn xvget_high_f16(x: x_float16x8_t) -> x_float16x4_t { std::mem::transmute(vget_high_u16(std::mem::transmute(x))) } #[inline] -pub unsafe fn xcombine_f16(low: x_float16x4_t, high: x_float16x4_t) -> x_float16x8_t { +pub(crate) unsafe fn xcombine_f16(low: x_float16x4_t, high: x_float16x4_t) -> x_float16x8_t { std::mem::transmute(vcombine_u16( std::mem::transmute(low), std::mem::transmute(high), @@ -99,22 +140,22 @@ pub unsafe fn xcombine_f16(low: x_float16x4_t, high: x_float16x4_t) -> x_float16 } #[inline] -pub unsafe fn xreinterpret_u16_f16(x: x_float16x4_t) -> uint16x4_t { +pub(crate) unsafe fn xreinterpret_u16_f16(x: x_float16x4_t) -> uint16x4_t { 
std::mem::transmute(x) } #[inline] -pub unsafe fn xreinterpretq_u16_f16(x: x_float16x8_t) -> uint16x8_t { +pub(crate) unsafe fn xreinterpretq_u16_f16(x: x_float16x8_t) -> uint16x8_t { std::mem::transmute(x) } #[inline] -pub unsafe fn xreinterpret_f16_u16(x: uint16x4_t) -> x_float16x4_t { +pub(crate) unsafe fn xreinterpret_f16_u16(x: uint16x4_t) -> x_float16x4_t { std::mem::transmute(x) } #[inline] -pub unsafe fn xreinterpretq_f16_u16(x: uint16x8_t) -> x_float16x8_t { +pub(crate) unsafe fn xreinterpretq_f16_u16(x: uint16x8_t) -> x_float16x8_t { std::mem::transmute(x) } @@ -129,7 +170,7 @@ pub(super) unsafe fn xvzeros_f16() -> x_float16x4_t { } #[inline] -pub unsafe fn xvcvt_f32_f16(x: x_float16x4_t) -> float32x4_t { +pub(crate) unsafe fn xvcvt_f32_f16(x: x_float16x4_t) -> float32x4_t { let src: uint16x4_t = xreinterpret_u16_f16(x); let dst: float32x4_t; asm!( @@ -216,6 +257,130 @@ pub(super) unsafe fn xvfmla_f16( xreinterpret_f16_u16(result) } +#[target_feature(enable = "fp16")] +#[inline] +pub(super) unsafe fn xvfmla_laneq_f16( + a: x_float16x4_t, + b: x_float16x4_t, + c: x_float16x8_t, +) -> x_float16x4_t { + static_assert_uimm_bits!(LANE, 3); + let mut result: uint16x4_t = xreinterpret_u16_f16(a); + + if LANE == 0 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[0]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 1 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[1]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 2 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[2]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 3 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[3]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 4 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[4]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 5 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[5]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 6 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[6]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 7 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[7]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpretq_u16_f16(c), + options(pure, nomem, nostack) + ); + } + xreinterpret_f16_u16(result) +} + +#[target_feature(enable = "fp16")] +#[inline] +pub(super) unsafe fn xvfmla_lane_f16( + a: x_float16x4_t, + b: x_float16x4_t, + c: x_float16x4_t, +) -> x_float16x4_t { + static_assert_uimm_bits!(LANE, 3); + let mut result: uint16x4_t = xreinterpret_u16_f16(a); + + if LANE == 0 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[0]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpret_u16_f16(c), + options(pure, nomem, nostack) + ); + } else if LANE == 1 { + asm!( + "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[1]", + inout(vreg) result, + in(vreg) xreinterpret_u16_f16(b), + in(vreg) xreinterpret_u16_f16(c), + options(pure, nomem, nostack) + 
);
+    } else if LANE == 2 {
+        asm!(
+            "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[2]",
+            inout(vreg) result,
+            in(vreg) xreinterpret_u16_f16(b),
+            in(vreg) xreinterpret_u16_f16(c),
+            options(pure, nomem, nostack)
+        );
+    } else if LANE == 3 {
+        asm!(
+            "fmla {0:v}.4h, {1:v}.4h, {2:v}.h[3]",
+            inout(vreg) result,
+            in(vreg) xreinterpret_u16_f16(b),
+            in(vreg) xreinterpret_u16_f16(c),
+            options(pure, nomem, nostack)
+        );
+    }
+    xreinterpret_f16_u16(result)
+}
+
 #[target_feature(enable = "fp16")]
 #[inline]
 pub(super) unsafe fn xvfmlaq_f16(
@@ -309,38 +474,66 @@ pub(super) unsafe fn xvbslq_f16(
     )
 }
 
 #[inline]
-pub unsafe fn xvst_f16(ptr: *const half::f16, x: x_float16x4_t) {
+pub(crate) unsafe fn xvst_f16(ptr: *mut half::f16, x: x_float16x4_t) {
     vst1_u16(std::mem::transmute(ptr), xreinterpret_u16_f16(x))
 }
 
 #[inline]
-pub unsafe fn xvstq_f16(ptr: *const half::f16, x: x_float16x8_t) {
+pub(crate) unsafe fn xvstq_f16(ptr: *mut half::f16, x: x_float16x8_t) {
     vst1q_u16(std::mem::transmute(ptr), xreinterpretq_u16_f16(x))
 }
 
 #[inline]
-pub unsafe fn xvstq_f16_x2(ptr: *const half::f16, x: x_float16x8x2_t) {
-    vst1q_u16_x2(std::mem::transmute(ptr), std::mem::transmute(x))
+pub(crate) unsafe fn xvstq_f16_x2(ptr: *mut half::f16, x: x_float16x8x2_t) {
+    let ptr_u16 = ptr as *mut u16;
+    vst1q_u16(ptr_u16, xreinterpretq_u16_f16(x.0));
+    vst1q_u16(ptr_u16.add(8), xreinterpretq_u16_f16(x.1));
 }
 
 #[inline]
-pub unsafe fn xvstq_f16_x4(ptr: *const half::f16, x: x_float16x8x4_t) {
-    vst1q_u16_x4(std::mem::transmute(ptr), std::mem::transmute(x))
+pub(crate) unsafe fn xvstq_f16_x4(ptr: *mut half::f16, x: x_float16x8x4_t) {
+    let ptr_u16 = ptr as *mut u16;
+    vst1q_u16(ptr_u16, xreinterpretq_u16_f16(x.0));
+    vst1q_u16(ptr_u16.add(8), xreinterpretq_u16_f16(x.1));
+    vst1q_u16(ptr_u16.add(16), xreinterpretq_u16_f16(x.2));
+    vst1q_u16(ptr_u16.add(24), xreinterpretq_u16_f16(x.3));
 }
 
 #[inline]
-pub unsafe fn xvdup_lane_f16<const N: i32>(a: x_float16x4_t) -> x_float16x4_t {
+pub(crate) unsafe fn xvdup_lane_f16<const N: i32>(a: x_float16x4_t) -> x_float16x4_t {
     xreinterpret_f16_u16(vdup_lane_u16::<N>(xreinterpret_u16_f16(a)))
 }
 
 #[inline]
-pub unsafe fn xvdup_laneq_f16<const N: i32>(a: x_float16x8_t) -> x_float16x4_t {
+pub(crate) unsafe fn xvdup_laneq_f16<const N: i32>(a: x_float16x8_t) -> x_float16x4_t {
     xreinterpret_f16_u16(vdup_laneq_u16::<N>(xreinterpretq_u16_f16(a)))
 }
 
+#[inline]
+pub(crate) unsafe fn xvld1q_lane_f16<const LANE: i32>(
+    ptr: *const half::f16,
+    src: x_float16x8_t,
+) -> x_float16x8_t {
+    xreinterpretq_f16_u16(vld1q_lane_u16::<LANE>(
+        ptr as *const u16,
+        xreinterpretq_u16_f16(src),
+    ))
+}
+
+#[inline]
+pub(crate) unsafe fn xvsetq_lane_f16<const LANE: i32>(
+    v: half::f16,
+    r: x_float16x8_t,
+) -> x_float16x8_t {
+    xreinterpretq_f16_u16(vsetq_lane_u16::<LANE>(
+        v.to_bits(),
+        xreinterpretq_u16_f16(r),
+    ))
+}
+
 #[target_feature(enable = "fp16")]
 #[inline]
-pub unsafe fn vceqzq_f16(a: x_float16x8_t) -> uint16x8_t {
+pub(crate) unsafe fn vceqzq_f16(a: x_float16x8_t) -> uint16x8_t {
     let mut result: uint16x8_t;
     asm!(
         "fcmeq {0:v}.8h, {1:v}.8h, #0",
diff --git a/src/neon/mod.rs b/src/neon/mod.rs
index f937e95..00c0c9d 100644
--- a/src/neon/mod.rs
+++ b/src/neon/mod.rs
@@ -66,51 +66,62 @@ mod vertical_u16_lb;
 mod vertical_u8;
 
 #[cfg(feature = "half")]
-pub use alpha_f16::{neon_premultiply_alpha_rgba_f16, neon_unpremultiply_alpha_rgba_f16};
+pub(crate) use alpha_f16::{neon_premultiply_alpha_rgba_f16, neon_unpremultiply_alpha_rgba_f16};
 #[cfg(feature = "half")]
-pub use alpha_f16_full::{
+pub(crate) use alpha_f16_full::{
     neon_premultiply_alpha_rgba_f16_full, neon_unpremultiply_alpha_rgba_f16_full,
 };
-pub use 
alpha_f32::neon_premultiply_alpha_rgba_f32; -pub use alpha_f32::neon_unpremultiply_alpha_rgba_f32; -pub use alpha_u16::{neon_premultiply_alpha_rgba_u16, neon_unpremultiply_alpha_rgba_u16}; -pub use alpha_u8::neon_premultiply_alpha_rgba; -pub use alpha_u8::neon_unpremultiply_alpha_rgba; +pub(crate) use alpha_f32::neon_premultiply_alpha_rgba_f32; +pub(crate) use alpha_f32::neon_unpremultiply_alpha_rgba_f32; +pub(crate) use alpha_u16::{neon_premultiply_alpha_rgba_u16, neon_unpremultiply_alpha_rgba_u16}; +pub(crate) use alpha_u8::neon_premultiply_alpha_rgba; +pub(crate) use alpha_u8::neon_unpremultiply_alpha_rgba; #[cfg(feature = "half")] -pub use f16_utils::*; +pub(crate) use f16_utils::*; pub(crate) use horizontal_ar30::neon_convolve_horizontal_rgba_rows_4_ar30; -pub use plane_f32::convolve_horizontal_plane_neon_row_one; -pub use plane_f32::convolve_horizontal_plane_neon_rows_4; +pub(crate) use plane_f32::convolve_horizontal_plane_neon_row_one; +pub(crate) use plane_f32::convolve_horizontal_plane_neon_rows_4; pub use plane_u8::{convolve_horizontal_plane_neon_row, convolve_horizontal_plane_neon_rows_4_u8}; #[cfg(feature = "half")] -pub use rgb_f16::{ +pub(crate) use rgb_f16::{ convolve_horizontal_rgb_neon_row_one_f16, convolve_horizontal_rgb_neon_rows_4_f16, }; #[cfg(feature = "half")] -pub use rgb_f16_full::{ +pub(crate) use rgb_f16_full::{ xconvolve_horizontal_rgb_neon_row_one_f16, xconvolve_horizontal_rgb_neon_rows_4_f16, }; -pub use rgb_f32::*; -pub use rgb_u8::*; +pub(crate) use rgb_f32::{ + convolve_horizontal_rgb_neon_row_one_f32, convolve_horizontal_rgb_neon_rows_4_f32, +}; +pub(crate) use rgb_u8::{ + convolve_horizontal_rgb_neon_row_one, convolve_horizontal_rgb_neon_rows_4, +}; #[cfg(feature = "half")] -pub use rgba_f16::convolve_horizontal_rgba_neon_row_one_f16; +pub(crate) use rgba_f16::convolve_horizontal_rgba_neon_row_one_f16; #[cfg(feature = "half")] -pub use rgba_f16::convolve_horizontal_rgba_neon_rows_4_f16; +pub(crate) use rgba_f16::convolve_horizontal_rgba_neon_rows_4_f16; #[cfg(feature = "half")] -pub use rgba_f16_full::{ +pub(crate) use rgba_f16_full::{ xconvolve_horizontal_rgba_neon_row_one_f16, xconvolve_horizontal_rgba_neon_rows_4_f16, }; -pub use rgba_f32::*; -pub use rgba_u16_lb::{ +pub(crate) use rgba_f32::{ + convolve_horizontal_rgba_neon_row_one, convolve_horizontal_rgba_neon_rows_4, +}; +pub(crate) use rgba_u16_lb::{ convolve_horizontal_rgba_neon_rows_4_lb_u16, convolve_horizontal_rgba_neon_u16_lb_row, }; -pub use rgba_u8::*; +pub(crate) use rgba_u8::{ + convolve_horizontal_rgba_neon_row, convolve_horizontal_rgba_neon_row_i16, + convolve_horizontal_rgba_neon_rows_4_u8, convolve_horizontal_rgba_neon_rows_4_u8_i16, +}; pub(crate) use vertical_ar30::neon_column_handler_fixed_point_ar30; #[cfg(feature = "half")] -pub use vertical_f16::convolve_vertical_rgb_neon_row_f16; +pub(crate) use vertical_f16::convolve_vertical_rgb_neon_row_f16; #[cfg(feature = "half")] -pub use vertical_f16_full::xconvolve_vertical_rgb_neon_row_f16; -pub use vertical_f32::convolve_vertical_rgb_neon_row_f32; -pub use vertical_u16::convolve_column_u16; -pub use vertical_u16_lb::convolve_column_lb_u16; -pub use vertical_u8::{convolve_vertical_neon_i16_precision, convolve_vertical_neon_i32_precision}; +pub(crate) use vertical_f16_full::xconvolve_vertical_rgb_neon_row_f16; +pub(crate) use vertical_f32::convolve_vertical_rgb_neon_row_f32; +pub(crate) use vertical_u16::convolve_column_u16; +pub(crate) use vertical_u16_lb::convolve_column_lb_u16; +pub(crate) use vertical_u8::{ + 
convolve_vertical_neon_i16_precision, convolve_vertical_neon_i32_precision, +}; diff --git a/src/neon/plane_f32.rs b/src/neon/plane_f32.rs index e13e5b3..3db9629 100644 --- a/src/neon/plane_f32.rs +++ b/src/neon/plane_f32.rs @@ -28,7 +28,7 @@ */ use crate::filter_weights::FilterWeights; -use crate::neon::utils::{prefer_vfmaq_f32, xvld1q_f32_x4}; +use crate::neon::utils::{prefer_vfmaq_f32, xvld1q_f32_x2, xvld1q_f32_x4}; use std::arch::aarch64::*; macro_rules! conv_horiz_plane_16_f32 { @@ -49,7 +49,7 @@ macro_rules! conv_horiz_plane_8_f32 { ($start_x: expr, $src: expr, $set1: expr, $set2: expr, $store: expr) => {{ let src_ptr = $src.add($start_x); - let rgb_pixel = vld1q_f32_x2(src_ptr); + let rgb_pixel = xvld1q_f32_x2(src_ptr); let mut acc = prefer_vfmaq_f32($store, rgb_pixel.0, $set1); acc = prefer_vfmaq_f32(acc, rgb_pixel.1, $set2); @@ -87,7 +87,7 @@ macro_rules! conv_horiz_plane_1_f32 { }}; } -pub fn convolve_horizontal_plane_neon_row_one( +pub(crate) fn convolve_horizontal_plane_neon_row_one( dst_width: usize, _: usize, filter_weights: &FilterWeights, @@ -119,7 +119,7 @@ pub fn convolve_horizontal_plane_neon_row_one( while jx + 8 < bounds.size { let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = vld1q_f32_x2(ptr); + let read_weights = xvld1q_f32_x2(ptr); store = conv_horiz_plane_8_f32!( bounds_start, unsafe_source_ptr_0, @@ -165,7 +165,7 @@ pub fn convolve_horizontal_plane_neon_row_one( } } -pub fn convolve_horizontal_plane_neon_rows_4( +pub(crate) fn convolve_horizontal_plane_neon_rows_4( dst_width: usize, _: usize, filter_weights: &FilterWeights, @@ -208,7 +208,7 @@ pub fn convolve_horizontal_plane_neon_rows_4( while jx + 8 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = vld1q_f32_x2(ptr); + let read_weights = xvld1q_f32_x2(ptr); let bounds_start = bounds.start + jx; store_0 = conv_horiz_plane_8_f32!( bounds_start, diff --git a/src/neon/rgb_f16.rs b/src/neon/rgb_f16.rs index ea47563..b2f9737 100644 --- a/src/neon/rgb_f16.rs +++ b/src/neon/rgb_f16.rs @@ -30,7 +30,7 @@ use std::arch::aarch64::*; use crate::filter_weights::FilterWeights; -use crate::neon::utils::prefer_vfmaq_f32; +use crate::neon::utils::{prefer_vfmaq_f32, prefer_vfmaq_lane_f32, prefer_vfmaq_laneq_f32}; use crate::neon::*; macro_rules! write_rgb_f16 { @@ -43,45 +43,8 @@ macro_rules! write_rgb_f16 { }}; } -macro_rules! 
conv_horiz_5_rgb_f16 { - ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ - const COMPONENTS: usize = 3; - let src_ptr = $src.add($start_x * COMPONENTS); - - let rgb_pixel_s = xvldq_f16_x2(src_ptr); - let rgb_first_u = vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)); - let rgb_first = xreinterpret_f16_u16(rgb_first_u); - let rgb_second_u = vext_u16::<3>( - vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)), - vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)), - ); - let rgb_second = xreinterpret_f16_u16(rgb_second_u); - - let rgb_third_u = vext_u16::<2>( - vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)), - vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.1)), - ); - let rgb_third = xreinterpret_f16_u16(rgb_third_u); - - let rgb_fourth_u = vext_u16::<1>( - vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.1)), - vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s.1)), - ); - let rgb_fourth = xreinterpret_f16_u16(rgb_fourth_u); - - let rgb_fifth = xvget_high_f16(rgb_pixel_s.1); - - let mut acc = prefer_vfmaq_f32($store, xvcvt_f32_f16(rgb_first), $set.0); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(rgb_second), $set.1); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(rgb_third), $set.2); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(rgb_fourth), $set.3); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(rgb_fifth), $set.4); - acc - }}; -} - macro_rules! conv_horiz_4_rgb_f16 { - ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ + ($start_x: expr, $src: expr, $weights: expr, $store: expr) => {{ const COMPONENTS: usize = 3; let src_ptr = $src.add($start_x * COMPONENTS); @@ -106,10 +69,10 @@ macro_rules! conv_horiz_4_rgb_f16 { ); let rgb_fourth = xreinterpret_f16_u16(rgb_fourth_u); - let acc = prefer_vfmaq_f32($store, xvcvt_f32_f16(rgb_first), $set.0); - let acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(rgb_second), $set.1); - let acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(rgb_third), $set.2); - let acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(rgb_fourth), $set.3); + let acc = prefer_vfmaq_laneq_f32::<0>($store, xvcvt_f32_f16(rgb_first), $weights); + let acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(rgb_second), $weights); + let acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(rgb_third), $weights); + let acc = prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(rgb_fourth), $weights); acc }}; } @@ -138,8 +101,8 @@ macro_rules! conv_horiz_2_rgb_f16 { ); let rgb_second = xreinterpret_f16_u16(rgb_second_u); - let acc = prefer_vfmaq_f32($store, xvcvt_f32_f16(rgb_first), $set.0); - let acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(rgb_second), $set.1); + let acc = prefer_vfmaq_lane_f32::<0>($store, xvcvt_f32_f16(rgb_first), $set); + let acc = prefer_vfmaq_lane_f32::<1>(acc, xvcvt_f32_f16(rgb_second), $set); acc }}; } @@ -164,13 +127,13 @@ macro_rules! 
conv_horiz_1_rgb_f16 { }}; } -pub fn convolve_horizontal_rgb_neon_rows_4_f16( +pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, + src: &[half::f16], src_stride: usize, - unsafe_destination_ptr_0: *mut half::f16, + dst: &mut [half::f16], dst_stride: usize, ) { unsafe { @@ -189,43 +152,17 @@ pub fn convolve_horizontal_rgb_neon_rows_4_f16( let mut store_2 = zeros; let mut store_3 = zeros; - while jx + 5 < bounds.size && bounds.start + jx + 6 < src_width { - let bounds_start = bounds.start + jx; - let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let w4 = vld1q_dup_f32(ptr.add(4)); - let set = (w0, w1, w2, w3, w4); - let b_start = bounds_start; - store_0 = conv_horiz_5_rgb_f16!(b_start, unsafe_source_ptr_0, set, store_0); - let s_ptr1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_5_rgb_f16!(b_start, s_ptr1, set, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_5_rgb_f16!(b_start, s_ptr2, set, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_5_rgb_f16!(b_start, s_ptr3, set, store_3); - jx += 5; - } - while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let set = (w0, w1, w2, w3); - store_0 = conv_horiz_4_rgb_f16!(bounds_start, unsafe_source_ptr_0, set, store_0); - let s_ptr1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr1, set, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr2, set, store_2); - let s_ptr = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr, set, store_3); + store_0 = conv_horiz_4_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr, read_weights, store_3); jx += 4; } @@ -233,16 +170,13 @@ pub fn convolve_horizontal_rgb_neon_rows_4_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - let w0 = vdupq_lane_f32::<0>(read_weights); - let w1 = vdupq_lane_f32::<1>(read_weights); - let set = (w0, w1); - store_0 = conv_horiz_2_rgb_f16!(bounds_start, unsafe_source_ptr_0, set, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr_1, set, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr2, set, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = 
conv_horiz_2_rgb_f16!(bounds_start, s_ptr3, set, store_3); + store_0 = conv_horiz_2_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr3, read_weights, store_3); jx += 2; } @@ -250,28 +184,27 @@ pub fn convolve_horizontal_rgb_neon_rows_4_f16( let ptr = weights_ptr.add(jx + filter_offset); let bounds_start = bounds.start + jx; let weight0 = vld1q_dup_f32(ptr); - store_0 = - conv_horiz_1_rgb_f16!(bounds_start, unsafe_source_ptr_0, weight0, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_1_rgb_f16!(bounds_start, src.as_ptr(), weight0, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_1_rgb_f16!(bounds_start, s_ptr_1, weight0, store_1); - let s_ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_1_rgb_f16!(bounds_start, s_ptr_2, weight0, store_2); - let s_ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_1_rgb_f16!(bounds_start, s_ptr_3, weight0, store_3); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); write_rgb_f16!(store_0, dest_ptr); - let dest_ptr_1 = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr_1 = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); write_rgb_f16!(store_1, dest_ptr_1); - let dest_ptr_2 = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr_2 = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); write_rgb_f16!(store_2, dest_ptr_2); - let dest_ptr_3 = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr_3 = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); write_rgb_f16!(store_3, dest_ptr_3); filter_offset += filter_weights.aligned_size; @@ -279,12 +212,12 @@ pub fn convolve_horizontal_rgb_neon_rows_4_f16( } } -pub fn convolve_horizontal_rgb_neon_row_one_f16( +pub(crate) fn convolve_horizontal_rgb_neon_row_one_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], ) { unsafe { const CHANNELS: usize = 3; @@ -300,13 +233,7 @@ pub fn convolve_horizontal_rgb_neon_row_one_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let set = (w0, w1, w2, w3); - - store = conv_horiz_4_rgb_f16!(bounds_start, unsafe_source_ptr_0, set, store); + store = conv_horiz_4_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store); jx += 4; } @@ -314,10 +241,7 @@ pub fn convolve_horizontal_rgb_neon_row_one_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - let w0 = vdupq_lane_f32::<0>(read_weights); - let w1 = 
vdupq_lane_f32::<1>(read_weights); - let set = (w0, w1); - store = conv_horiz_2_rgb_f16!(bounds_start, unsafe_source_ptr_0, set, store); + store = conv_horiz_2_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store); jx += 2; } @@ -325,12 +249,12 @@ pub fn convolve_horizontal_rgb_neon_row_one_f16( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); let bounds_start = bounds.start + jx; - store = conv_horiz_1_rgb_f16!(bounds_start, unsafe_source_ptr_0, weight0, store); + store = conv_horiz_1_rgb_f16!(bounds_start, src.as_ptr(), weight0, store); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); write_rgb_f16!(store, dest_ptr); filter_offset += filter_weights.aligned_size; diff --git a/src/neon/rgb_f16_full.rs b/src/neon/rgb_f16_full.rs index da8e18a..9a5e407 100644 --- a/src/neon/rgb_f16_full.rs +++ b/src/neon/rgb_f16_full.rs @@ -44,43 +44,6 @@ macro_rules! write_rgb_f16 { }}; } -macro_rules! conv_horiz_5_rgb_f16 { - ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ - const COMPONENTS: usize = 3; - let src_ptr = $src.add($start_x * COMPONENTS); - - let rgb_pixel_s = xvldq_f16_x2(src_ptr); - let rgb_first_u = vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)); - let rgb_first = xreinterpret_f16_u16(rgb_first_u); - let rgb_second_u = vext_u16::<3>( - vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)), - vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)), - ); - let rgb_second = xreinterpret_f16_u16(rgb_second_u); - - let rgb_third_u = vext_u16::<2>( - vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s.0)), - vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.1)), - ); - let rgb_third = xreinterpret_f16_u16(rgb_third_u); - - let rgb_fourth_u = vext_u16::<1>( - vget_low_u16(xreinterpretq_u16_f16(rgb_pixel_s.1)), - vget_high_u16(xreinterpretq_u16_f16(rgb_pixel_s.1)), - ); - let rgb_fourth = xreinterpret_f16_u16(rgb_fourth_u); - - let rgb_fifth = xvget_high_f16(rgb_pixel_s.1); - - let mut acc = xvfmla_f16($store, rgb_first, $set.0); - acc = xvfmla_f16(acc, rgb_second, $set.1); - acc = xvfmla_f16(acc, rgb_third, $set.2); - acc = xvfmla_f16(acc, rgb_fourth, $set.3); - acc = xvfmla_f16(acc, rgb_fifth, $set.4); - acc - }}; -} - macro_rules! conv_horiz_4_rgb_f16 { ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ const COMPONENTS: usize = 3; @@ -107,10 +70,10 @@ macro_rules! conv_horiz_4_rgb_f16 { ); let rgb_fourth = xreinterpret_f16_u16(rgb_fourth_u); - let acc = xvfmla_f16($store, rgb_first, $set.0); - let acc = xvfmla_f16(acc, rgb_second, $set.1); - let acc = xvfmla_f16(acc, rgb_third, $set.2); - let acc = xvfmla_f16(acc, rgb_fourth, $set.3); + let acc = xvfmla_lane_f16::<0>($store, rgb_first, $set); + let acc = xvfmla_lane_f16::<1>(acc, rgb_second, $set); + let acc = xvfmla_lane_f16::<2>(acc, rgb_third, $set); + let acc = xvfmla_lane_f16::<3>(acc, rgb_fourth, $set); acc }}; } @@ -141,8 +104,8 @@ macro_rules! conv_horiz_2_rgb_f16 { rgb_second_u = vset_lane_u16::<3>(0, rgb_second_u); let rgb_second = xreinterpret_f16_u16(rgb_second_u); - let acc = xvfmla_f16($store, rgb_first, $set.0); - let acc = xvfmla_f16(acc, rgb_second, $set.1); + let acc = xvfmla_lane_f16::<0>($store, rgb_first, $set); + let acc = xvfmla_lane_f16::<1>(acc, rgb_second, $set); acc }}; } @@ -167,13 +130,13 @@ macro_rules! 
conv_horiz_1_rgb_f16 { }}; } -pub fn xconvolve_horizontal_rgb_neon_rows_4_f16( +pub(crate) fn xconvolve_horizontal_rgb_neon_rows_4_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { unsafe { @@ -181,9 +144,9 @@ pub fn xconvolve_horizontal_rgb_neon_rows_4_f16( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -194,9 +157,9 @@ unsafe fn xconvolve_horizontal_rgb_neon_rows_4_f16_impl( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { unsafe { @@ -213,43 +176,17 @@ unsafe fn xconvolve_horizontal_rgb_neon_rows_4_f16_impl( let mut store_2 = xvzeros_f16(); let mut store_3 = xvzeros_f16(); - while jx + 5 < bounds.size && bounds.start + jx + 6 < src_width { - let bounds_start = bounds.start + jx; - let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = xvcvt_f16_f32(vld1q_f32(ptr)); - let w0 = xvdup_lane_f16::<0>(read_weights); - let w1 = xvdup_lane_f16::<1>(read_weights); - let w2 = xvdup_lane_f16::<2>(read_weights); - let w3 = xvdup_lane_f16::<3>(read_weights); - let w4 = xvcvt_f16_f32(vld1q_dup_f32(ptr.add(4))); - let set = (w0, w1, w2, w3, w4); - let b_start = bounds_start; - store_0 = conv_horiz_5_rgb_f16!(b_start, unsafe_source_ptr_0, set, store_0); - let s_ptr1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_5_rgb_f16!(b_start, s_ptr1, set, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_5_rgb_f16!(b_start, s_ptr2, set, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_5_rgb_f16!(b_start, s_ptr3, set, store_3); - jx += 5; - } - while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = xvcvt_f16_f32(vld1q_f32(ptr)); - let w0 = xvdup_lane_f16::<0>(read_weights); - let w1 = xvdup_lane_f16::<1>(read_weights); - let w2 = xvdup_lane_f16::<2>(read_weights); - let w3 = xvdup_lane_f16::<3>(read_weights); - let set = (w0, w1, w2, w3); - store_0 = conv_horiz_4_rgb_f16!(bounds_start, unsafe_source_ptr_0, set, store_0); - let s_ptr1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr1, set, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr2, set, store_2); - let s_ptr = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr, set, store_3); + store_0 = conv_horiz_4_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_4_rgb_f16!(bounds_start, s_ptr, read_weights, store_3); jx += 4; } @@ -258,16 +195,13 @@ unsafe fn xconvolve_horizontal_rgb_neon_rows_4_f16_impl( let ptr = weights_ptr.add(jx + filter_offset); let read_weights_h = vld1_f32(ptr); let read_weights = 
xvcvt_f16_f32(vcombine_f32(read_weights_h, read_weights_h)); - let w0 = xvdup_lane_f16::<0>(read_weights); - let w1 = xvdup_lane_f16::<1>(read_weights); - let set = (w0, w1); - store_0 = conv_horiz_2_rgb_f16!(bounds_start, unsafe_source_ptr_0, set, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr_1, set, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr2, set, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr3, set, store_3); + store_0 = conv_horiz_2_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_2_rgb_f16!(bounds_start, s_ptr3, read_weights, store_3); jx += 2; } @@ -275,28 +209,27 @@ unsafe fn xconvolve_horizontal_rgb_neon_rows_4_f16_impl( let ptr = weights_ptr.add(jx + filter_offset); let bounds_start = bounds.start + jx; let weight0 = xvcvt_f16_f32(vld1q_dup_f32(ptr)); - store_0 = - conv_horiz_1_rgb_f16!(bounds_start, unsafe_source_ptr_0, weight0, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_1_rgb_f16!(bounds_start, src.as_ptr(), weight0, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_1_rgb_f16!(bounds_start, s_ptr_1, weight0, store_1); - let s_ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let s_ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_1_rgb_f16!(bounds_start, s_ptr_2, weight0, store_2); - let s_ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let s_ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_1_rgb_f16!(bounds_start, s_ptr_3, weight0, store_3); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); write_rgb_f16!(store_0, dest_ptr); - let dest_ptr_1 = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr_1 = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); write_rgb_f16!(store_1, dest_ptr_1); - let dest_ptr_2 = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr_2 = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); write_rgb_f16!(store_2, dest_ptr_2); - let dest_ptr_3 = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr_3 = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); write_rgb_f16!(store_3, dest_ptr_3); filter_offset += filter_weights.aligned_size; @@ -304,20 +237,20 @@ unsafe fn xconvolve_horizontal_rgb_neon_rows_4_f16_impl( } } -pub fn xconvolve_horizontal_rgb_neon_row_one_f16( +pub(crate) fn xconvolve_horizontal_rgb_neon_row_one_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[f16], + dst: &mut [f16], ) { unsafe { xconvolve_horizontal_rgb_neon_row_one_f16_impl( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } } @@ -327,8 +260,8 @@ unsafe fn xconvolve_horizontal_rgb_neon_row_one_f16_impl( dst_width: usize, src_width: usize, 
filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[f16], + dst: &mut [f16], ) { const CHANNELS: usize = 3; let weights_ptr = filter_weights.weights.as_ptr(); @@ -343,13 +276,7 @@ unsafe fn xconvolve_horizontal_rgb_neon_row_one_f16_impl( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = xvcvt_f16_f32(vld1q_f32(ptr)); - let w0 = xvdup_lane_f16::<0>(read_weights); - let w1 = xvdup_lane_f16::<1>(read_weights); - let w2 = xvdup_lane_f16::<2>(read_weights); - let w3 = xvdup_lane_f16::<3>(read_weights); - let set = (w0, w1, w2, w3); - - store = conv_horiz_4_rgb_f16!(bounds_start, unsafe_source_ptr_0, set, store); + store = conv_horiz_4_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store); jx += 4; } @@ -358,10 +285,7 @@ unsafe fn xconvolve_horizontal_rgb_neon_row_one_f16_impl( let ptr = weights_ptr.add(jx + filter_offset); let read_weights_h = vld1_f32(ptr); let read_weights = xvcvt_f16_f32(vcombine_f32(read_weights_h, read_weights_h)); - let w0 = xvdup_lane_f16::<0>(read_weights); - let w1 = xvdup_lane_f16::<1>(read_weights); - let set = (w0, w1); - store = conv_horiz_2_rgb_f16!(bounds_start, unsafe_source_ptr_0, set, store); + store = conv_horiz_2_rgb_f16!(bounds_start, src.as_ptr(), read_weights, store); jx += 2; } @@ -369,12 +293,12 @@ unsafe fn xconvolve_horizontal_rgb_neon_row_one_f16_impl( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = xvcvt_f16_f32(vld1q_dup_f32(ptr)); let bounds_start = bounds.start + jx; - store = conv_horiz_1_rgb_f16!(bounds_start, unsafe_source_ptr_0, weight0, store); + store = conv_horiz_1_rgb_f16!(bounds_start, src.as_ptr(), weight0, store); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); write_rgb_f16!(store, dest_ptr); filter_offset += filter_weights.aligned_size; diff --git a/src/neon/rgb_f32.rs b/src/neon/rgb_f32.rs index 6934bbb..473f745 100644 --- a/src/neon/rgb_f32.rs +++ b/src/neon/rgb_f32.rs @@ -30,8 +30,8 @@ use std::arch::aarch64::*; use crate::filter_weights::FilterWeights; -use crate::neon::utils::xvld1q_f32_x4; -use crate::neon::utils::{prefer_vfmaq_f32, vsplit_rgb_5}; +use crate::neon::utils::prefer_vfmaq_laneq_f32; +use crate::neon::utils::{prefer_vfmaq_f32, prefer_vfmaq_lane_f32}; macro_rules! write_rgb_f32 { ($store: expr, $dest_ptr: expr) => {{ @@ -42,25 +42,8 @@ macro_rules! write_rgb_f32 { }}; } -macro_rules! conv_horiz_5_rgb_f32 { - ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ - const COMPONENTS: usize = 3; - let src_ptr = $src.add($start_x * COMPONENTS); - - let full_pixel = xvld1q_f32_x4(src_ptr); - let splat = vsplit_rgb_5(full_pixel); - - let mut acc = prefer_vfmaq_f32($store, splat.0, $set.0); - acc = prefer_vfmaq_f32(acc, splat.1, $set.1); - acc = prefer_vfmaq_f32(acc, splat.2, $set.2); - acc = prefer_vfmaq_f32(acc, splat.3, $set.3); - acc = prefer_vfmaq_f32(acc, splat.4, $set.4); - acc - }}; -} - macro_rules! conv_horiz_4_rgb_f32 { - ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ + ($start_x: expr, $src: expr, $weights: expr, $store: expr) => {{ const COMPONENTS: usize = 3; let src_ptr = $src.add($start_x * COMPONENTS); @@ -77,10 +60,10 @@ macro_rules! 
conv_horiz_4_rgb_f32 { .as_ptr(), ); - let acc = prefer_vfmaq_f32($store, rgb_pixel_0, $set.0); - let acc = prefer_vfmaq_f32(acc, rgb_pixel_1, $set.1); - let acc = prefer_vfmaq_f32(acc, rgb_pixel_2, $set.2); - let acc = prefer_vfmaq_f32(acc, rgb_pixel_3, $set.3); + let acc = prefer_vfmaq_laneq_f32::<0>($store, rgb_pixel_0, $weights); + let acc = prefer_vfmaq_laneq_f32::<1>(acc, rgb_pixel_1, $weights); + let acc = prefer_vfmaq_laneq_f32::<2>(acc, rgb_pixel_2, $weights); + let acc = prefer_vfmaq_laneq_f32::<3>(acc, rgb_pixel_3, $weights); acc }}; } @@ -102,8 +85,8 @@ macro_rules! conv_horiz_2_rgb_f32 { .as_ptr(), ); - let acc = prefer_vfmaq_f32($store, rgb_pixel_0, $set.0); - let acc = prefer_vfmaq_f32(acc, rgb_pixel_1, $set.1); + let acc = prefer_vfmaq_lane_f32::<0>($store, rgb_pixel_0, $set); + let acc = prefer_vfmaq_lane_f32::<1>(acc, rgb_pixel_1, $set); acc }}; } @@ -126,7 +109,7 @@ macro_rules! conv_horiz_1_rgb_f32 { }}; } -pub fn convolve_horizontal_rgb_neon_rows_4_f32( +pub(crate) fn convolve_horizontal_rgb_neon_rows_4_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, @@ -151,43 +134,18 @@ pub fn convolve_horizontal_rgb_neon_rows_4_f32( let mut store_2 = zeros; let mut store_3 = zeros; - while jx + 5 < bounds.size && bounds.start + jx + 6 < src_width { - let bounds_start = bounds.start + jx; - let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let w4 = vld1q_dup_f32(ptr.add(4)); - let set = (w0, w1, w2, w3, w4); - let b_start = bounds_start; - store_0 = conv_horiz_5_rgb_f32!(b_start, unsafe_source_ptr_0, set, store_0); - let s_ptr1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_5_rgb_f32!(b_start, s_ptr1, set, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_5_rgb_f32!(b_start, s_ptr2, set, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_5_rgb_f32!(b_start, s_ptr3, set, store_3); - jx += 5; - } - while jx + 4 < bounds.size && bounds.start + jx + 5 < src_width { let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let set = (w0, w1, w2, w3); - store_0 = conv_horiz_4_rgb_f32!(bounds_start, unsafe_source_ptr_0, set, store_0); + store_0 = + conv_horiz_4_rgb_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store_0); let s_ptr1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_4_rgb_f32!(bounds_start, s_ptr1, set, store_1); + store_1 = conv_horiz_4_rgb_f32!(bounds_start, s_ptr1, read_weights, store_1); let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_4_rgb_f32!(bounds_start, s_ptr2, set, store_2); + store_2 = conv_horiz_4_rgb_f32!(bounds_start, s_ptr2, read_weights, store_2); let s_ptr = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_4_rgb_f32!(bounds_start, s_ptr, set, store_3); + store_3 = conv_horiz_4_rgb_f32!(bounds_start, s_ptr, read_weights, store_3); jx += 4; } @@ -195,16 +153,14 @@ pub fn convolve_horizontal_rgb_neon_rows_4_f32( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = 
vld1_f32(ptr); - let w0 = vdupq_lane_f32::<0>(read_weights); - let w1 = vdupq_lane_f32::<1>(read_weights); - let set = (w0, w1); - store_0 = conv_horiz_2_rgb_f32!(bounds_start, unsafe_source_ptr_0, set, store_0); + store_0 = + conv_horiz_2_rgb_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store_0); let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_2_rgb_f32!(bounds_start, s_ptr_1, set, store_1); + store_1 = conv_horiz_2_rgb_f32!(bounds_start, s_ptr_1, read_weights, store_1); let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_2_rgb_f32!(bounds_start, s_ptr2, set, store_2); + store_2 = conv_horiz_2_rgb_f32!(bounds_start, s_ptr2, read_weights, store_2); let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_2_rgb_f32!(bounds_start, s_ptr3, set, store_3); + store_3 = conv_horiz_2_rgb_f32!(bounds_start, s_ptr3, read_weights, store_3); jx += 2; } @@ -241,7 +197,7 @@ pub fn convolve_horizontal_rgb_neon_rows_4_f32( } } -pub fn convolve_horizontal_rgb_neon_row_one_f32( +pub(crate) fn convolve_horizontal_rgb_neon_row_one_f32( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, @@ -262,13 +218,8 @@ pub fn convolve_horizontal_rgb_neon_row_one_f32( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let set = (w0, w1, w2, w3); - - store = conv_horiz_4_rgb_f32!(bounds_start, unsafe_source_ptr_0, set, store); + store = + conv_horiz_4_rgb_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store); jx += 4; } @@ -276,10 +227,8 @@ pub fn convolve_horizontal_rgb_neon_row_one_f32( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - let w0 = vdupq_lane_f32::<0>(read_weights); - let w1 = vdupq_lane_f32::<1>(read_weights); - let set = (w0, w1); - store = conv_horiz_2_rgb_f32!(bounds_start, unsafe_source_ptr_0, set, store); + store = + conv_horiz_2_rgb_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store); jx += 2; } diff --git a/src/neon/rgb_u8.rs b/src/neon/rgb_u8.rs index 8192ec2..44c0862 100644 --- a/src/neon/rgb_u8.rs +++ b/src/neon/rgb_u8.rs @@ -109,7 +109,7 @@ unsafe fn write_accumulator_u8(store: int32x4_t, dst: &mut [u8]) { *dst.get_unchecked_mut(2) = bytes[2]; } -pub fn convolve_horizontal_rgb_neon_rows_4( +pub(crate) fn convolve_horizontal_rgb_neon_rows_4( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -200,7 +200,7 @@ pub fn convolve_horizontal_rgb_neon_rows_4( } } -pub fn convolve_horizontal_rgb_neon_row_one( +pub(crate) fn convolve_horizontal_rgb_neon_row_one( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, diff --git a/src/neon/rgba_f16.rs b/src/neon/rgba_f16.rs index ac1ea1e..3926a16 100644 --- a/src/neon/rgba_f16.rs +++ b/src/neon/rgba_f16.rs @@ -29,7 +29,9 @@ use crate::filter_weights::FilterWeights; use crate::neon::f16_utils::xvcvt_f16_f32; -use crate::neon::utils::prefer_vfmaq_f32; +use crate::neon::utils::{ + prefer_vfmaq_f32, prefer_vfmaq_lane_f32, prefer_vfmaq_laneq_f32, xvld1q_f32_x2, +}; use crate::neon::{ xvcvt_f32_f16, xvget_high_f16, xvget_low_f16, xvld_f16, xvldq_f16, xvldq_f16_x2, xvldq_f16_x4, xvst_f16, @@ -43,14 +45,15 @@ macro_rules! 
conv_horiz_rgba_8_f16 { let rgb_pixel = xvldq_f16_x4(src_ptr); - let mut acc = prefer_vfmaq_f32($store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.0)), $set1.0); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), $set1.1); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.1)), $set1.2); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.1)), $set1.3); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.2)), $set2.0); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.2)), $set2.1); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.3)), $set2.2); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.3)), $set2.3); + let mut acc = + prefer_vfmaq_laneq_f32::<0>($store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.0)), $set1); + acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), $set1); + acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.1)), $set1); + acc = prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.1)), $set1); + acc = prefer_vfmaq_laneq_f32::<0>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.2)), $set2); + acc = prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.2)), $set2); + acc = prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.3)), $set2); + acc = prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.3)), $set2); acc }}; } @@ -62,10 +65,14 @@ macro_rules! conv_horiz_rgba_4_f16 { let rgb_pixel = xvldq_f16_x2(src_ptr); - let acc = prefer_vfmaq_f32($store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.0)), $set1.0); - let acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), $set1.1); - let acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.1)), $set1.2); - let acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), $set1.3); + let acc = + prefer_vfmaq_laneq_f32::<0>($store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.0)), $set1); + let acc = + prefer_vfmaq_laneq_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.0)), $set1); + let acc = + prefer_vfmaq_laneq_f32::<2>(acc, xvcvt_f32_f16(xvget_low_f16(rgb_pixel.1)), $set1); + let acc = + prefer_vfmaq_laneq_f32::<3>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel.1)), $set1); acc }}; } @@ -77,8 +84,9 @@ macro_rules! conv_horiz_rgba_2_f32 { let rgb_pixel = xvldq_f16(src_ptr); - let mut acc = prefer_vfmaq_f32($store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel)), $set.0); - acc = prefer_vfmaq_f32(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel)), $set.1); + let mut acc = + prefer_vfmaq_lane_f32::<0>($store, xvcvt_f32_f16(xvget_low_f16(rgb_pixel)), $set); + acc = prefer_vfmaq_lane_f32::<1>(acc, xvcvt_f32_f16(xvget_high_f16(rgb_pixel)), $set); acc }}; } @@ -93,12 +101,12 @@ macro_rules! 
conv_horiz_rgba_1_f16 { }}; } -pub fn convolve_horizontal_rgba_neon_row_one_f16( +pub(crate) fn convolve_horizontal_rgba_neon_row_one_f16( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], ) { unsafe { const CHANNELS: usize = 4; @@ -114,12 +122,7 @@ pub fn convolve_horizontal_rgba_neon_row_one_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let set1 = (w0, w1, w2, w3); - store = conv_horiz_rgba_4_f16!(bounds_start, unsafe_source_ptr_0, set1, store); + store = conv_horiz_rgba_4_f16!(bounds_start, src.as_ptr(), read_weights, store); jx += 4; } @@ -127,10 +130,7 @@ pub fn convolve_horizontal_rgba_neon_row_one_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - let w0 = vdupq_lane_f32::<0>(read_weights); - let w1 = vdupq_lane_f32::<1>(read_weights); - let set = (w0, w1); - store = conv_horiz_rgba_2_f32!(bounds_start, unsafe_source_ptr_0, set, store); + store = conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 2; } @@ -138,12 +138,12 @@ pub fn convolve_horizontal_rgba_neon_row_one_f16( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); - store = conv_horiz_rgba_1_f16!(bounds_start, unsafe_source_ptr_0, weight0, store); + store = conv_horiz_rgba_1_f16!(bounds_start, src.as_ptr(), weight0, store); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); xvst_f16(dest_ptr, xvcvt_f16_f32(store)); filter_offset += filter_weights.aligned_size; @@ -151,13 +151,13 @@ pub fn convolve_horizontal_rgba_neon_row_one_f16( } } -pub fn convolve_horizontal_rgba_neon_rows_4_f16( +pub(crate) fn convolve_horizontal_rgba_neon_rows_4_f16( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, + src: &[half::f16], src_stride: usize, - unsafe_destination_ptr_0: *mut half::f16, + dst: &mut [half::f16], dst_stride: usize, ) { unsafe { @@ -176,62 +176,67 @@ pub fn convolve_horizontal_rgba_neon_rows_4_f16( while jx + 8 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = vld1q_f32_x2(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights.0); - let w1 = vdupq_laneq_f32::<1>(read_weights.0); - let w2 = vdupq_laneq_f32::<2>(read_weights.0); - let w3 = vdupq_laneq_f32::<3>(read_weights.0); - let w4 = vdupq_laneq_f32::<0>(read_weights.1); - let w5 = vdupq_laneq_f32::<1>(read_weights.1); - let w6 = vdupq_laneq_f32::<2>(read_weights.1); - let w7 = vdupq_laneq_f32::<3>(read_weights.1); - let set1 = (w0, w1, w2, w3); - let set2 = (w4, w5, w6, w7); + let read_weights = xvld1q_f32_x2(ptr); let bounds_start = bounds.start + jx; - store_0 = - conv_horiz_rgba_8_f16!(bounds_start, unsafe_source_ptr_0, set1, set2, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_rgba_8_f16!(bounds_start, s_ptr_1, set1, set2, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_rgba_8_f16!(bounds_start, s_ptr2, set1, set2, store_2); - let s_ptr3 = 
unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_rgba_8_f16!(bounds_start, s_ptr3, set1, set2, store_3); + store_0 = conv_horiz_rgba_8_f16!( + bounds_start, + src.as_ptr(), + read_weights.0, + read_weights.1, + store_0 + ); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_rgba_8_f16!( + bounds_start, + s_ptr_1, + read_weights.0, + read_weights.1, + store_1 + ); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_rgba_8_f16!( + bounds_start, + s_ptr2, + read_weights.0, + read_weights.1, + store_2 + ); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_rgba_8_f16!( + bounds_start, + s_ptr3, + read_weights.0, + read_weights.1, + store_3 + ); jx += 8; } while jx + 4 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let set1 = (w0, w1, w2, w3); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_4_f16!(bounds_start, unsafe_source_ptr_0, set1, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr_1, set1, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr2, set1, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr3, set1, store_3); + store_0 = conv_horiz_rgba_4_f16!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr3, read_weights, store_3); jx += 4; } while jx + 2 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - let w0 = vdupq_lane_f32::<0>(read_weights); - let w1 = vdupq_lane_f32::<1>(read_weights); - let set = (w0, w1); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_2_f32!(bounds_start, unsafe_source_ptr_0, set, store_0); - let ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_rgba_2_f32!(bounds_start, ptr_1, set, store_1); - let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_rgba_2_f32!(bounds_start, ptr_2, set, store_2); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_rgba_2_f32!(bounds_start, ptr_3, set, store_3); + store_0 = conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store_0); + let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_rgba_2_f32!(bounds_start, ptr_1, read_weights, store_1); + let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_rgba_2_f32!(bounds_start, ptr_2, read_weights, store_2); + let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_rgba_2_f32!(bounds_start, ptr_3, read_weights, store_3); jx += 2; } @@ -239,28 +244,27 @@ pub fn convolve_horizontal_rgba_neon_rows_4_f16( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vld1q_dup_f32(ptr); let bounds_start = bounds.start + 
jx; - store_0 = - conv_horiz_rgba_1_f16!(bounds_start, unsafe_source_ptr_0, weight0, store_0); - let ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_rgba_1_f16!(bounds_start, src.as_ptr(), weight0, store_0); + let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_rgba_1_f16!(bounds_start, ptr_1, weight0, store_1); - let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_rgba_1_f16!(bounds_start, ptr_2, weight0, store_2); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_rgba_1_f16!(bounds_start, ptr_3, weight0, store_3); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); xvst_f16(dest_ptr, xvcvt_f16_f32(store_0)); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); xvst_f16(dest_ptr, xvcvt_f16_f32(store_1)); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); xvst_f16(dest_ptr, xvcvt_f16_f32(store_2)); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); xvst_f16(dest_ptr, xvcvt_f16_f32(store_3)); filter_offset += filter_weights.aligned_size; diff --git a/src/neon/rgba_f16_full.rs b/src/neon/rgba_f16_full.rs index 758293a..9b582ff 100644 --- a/src/neon/rgba_f16_full.rs +++ b/src/neon/rgba_f16_full.rs @@ -30,55 +30,57 @@ use std::arch::aarch64::*; use crate::filter_weights::FilterWeights; -use crate::neon::f16_utils::{xvcombine_f16, xvcvt_f16_f32, xvfmla_f16, xvzeros_f16}; +use crate::neon::f16_utils::{ + xvcombine_f16, xvcvt_f16_f32, xvfmla_f16, xvfmla_lane_f16, xvfmla_laneq_f16, xvzeros_f16, +}; +use crate::neon::utils::xvld1q_f32_x2; use crate::neon::{ - xvdup_lane_f16, xvdup_laneq_f16, xvget_high_f16, xvget_low_f16, xvld_f16, xvldq_f16, - xvldq_f16_x2, xvldq_f16_x4, xvst_f16, + xvget_high_f16, xvget_low_f16, xvld_f16, xvldq_f16, xvldq_f16_x2, xvldq_f16_x4, xvst_f16, }; macro_rules! 
conv_horiz_rgba_8_f16 { - ($start_x: expr, $src: expr, $set1: expr, $set2: expr, $store: expr) => {{ + ($start_x: expr, $src: expr, $weights: expr, $store: expr) => {{ const COMPONENTS: usize = 4; let src_ptr = $src.add($start_x * COMPONENTS); let rgb_pixel = xvldq_f16_x4(src_ptr); - let mut acc = xvfmla_f16($store, xvget_low_f16(rgb_pixel.0), $set1.0); - acc = xvfmla_f16(acc, xvget_high_f16(rgb_pixel.0), $set1.1); - acc = xvfmla_f16(acc, xvget_low_f16(rgb_pixel.1), $set1.2); - acc = xvfmla_f16(acc, xvget_high_f16(rgb_pixel.1), $set1.3); - acc = xvfmla_f16(acc, xvget_low_f16(rgb_pixel.2), $set2.0); - acc = xvfmla_f16(acc, xvget_high_f16(rgb_pixel.2), $set2.1); - acc = xvfmla_f16(acc, xvget_low_f16(rgb_pixel.3), $set2.2); - acc = xvfmla_f16(acc, xvget_high_f16(rgb_pixel.3), $set2.3); + let mut acc = xvfmla_laneq_f16::<0>($store, xvget_low_f16(rgb_pixel.0), $weights); + acc = xvfmla_laneq_f16::<1>(acc, xvget_high_f16(rgb_pixel.0), $weights); + acc = xvfmla_laneq_f16::<2>(acc, xvget_low_f16(rgb_pixel.1), $weights); + acc = xvfmla_laneq_f16::<3>(acc, xvget_high_f16(rgb_pixel.1), $weights); + acc = xvfmla_laneq_f16::<4>(acc, xvget_low_f16(rgb_pixel.2), $weights); + acc = xvfmla_laneq_f16::<5>(acc, xvget_high_f16(rgb_pixel.2), $weights); + acc = xvfmla_laneq_f16::<6>(acc, xvget_low_f16(rgb_pixel.3), $weights); + acc = xvfmla_laneq_f16::<7>(acc, xvget_high_f16(rgb_pixel.3), $weights); acc }}; } macro_rules! conv_horiz_rgba_4_f16 { - ($start_x: expr, $src: expr, $set1: expr, $store: expr) => {{ + ($start_x: expr, $src: expr, $weights: expr, $store: expr) => {{ const COMPONENTS: usize = 4; let src_ptr = $src.add($start_x * COMPONENTS); let rgb_pixel = xvldq_f16_x2(src_ptr); - let acc = xvfmla_f16($store, xvget_low_f16(rgb_pixel.0), $set1.0); - let acc = xvfmla_f16(acc, xvget_high_f16(rgb_pixel.0), $set1.1); - let acc = xvfmla_f16(acc, xvget_low_f16(rgb_pixel.1), $set1.2); - let acc = xvfmla_f16(acc, xvget_high_f16(rgb_pixel.0), $set1.3); + let acc = xvfmla_lane_f16::<0>($store, xvget_low_f16(rgb_pixel.0), $weights); + let acc = xvfmla_lane_f16::<1>(acc, xvget_high_f16(rgb_pixel.0), $weights); + let acc = xvfmla_lane_f16::<2>(acc, xvget_low_f16(rgb_pixel.1), $weights); + let acc = xvfmla_lane_f16::<3>(acc, xvget_high_f16(rgb_pixel.1), $weights); acc }}; } macro_rules! conv_horiz_rgba_2_f32 { - ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ + ($start_x: expr, $src: expr, $weights: expr, $store: expr) => {{ const COMPONENTS: usize = 4; let src_ptr = $src.add($start_x * COMPONENTS); let rgb_pixel = xvldq_f16(src_ptr); - let mut acc = xvfmla_f16($store, xvget_low_f16(rgb_pixel), $set.0); - acc = xvfmla_f16(acc, xvget_high_f16(rgb_pixel), $set.1); + let mut acc = xvfmla_lane_f16::<0>($store, xvget_low_f16(rgb_pixel), $weights); + acc = xvfmla_lane_f16::<1>(acc, xvget_high_f16(rgb_pixel), $weights); acc }}; } @@ -93,20 +95,20 @@ macro_rules! 
conv_horiz_rgba_1_f16 { }}; } -pub fn xconvolve_horizontal_rgba_neon_row_one_f16( +pub(crate) fn xconvolve_horizontal_rgba_neon_row_one_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], ) { unsafe { xconvolve_horizontal_rgba_neon_row_one_f16_impl( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } } @@ -116,8 +118,8 @@ unsafe fn xconvolve_horizontal_rgba_neon_row_one_f16_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], ) { unsafe { const CHANNELS: usize = 4; @@ -133,12 +135,7 @@ unsafe fn xconvolve_horizontal_rgba_neon_row_one_f16_impl( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = xvcvt_f16_f32(vld1q_f32(ptr)); - let w0 = xvdup_lane_f16::<0>(read_weights); - let w1 = xvdup_lane_f16::<1>(read_weights); - let w2 = xvdup_lane_f16::<2>(read_weights); - let w3 = xvdup_lane_f16::<3>(read_weights); - let set1 = (w0, w1, w2, w3); - store = conv_horiz_rgba_4_f16!(bounds_start, unsafe_source_ptr_0, set1, store); + store = conv_horiz_rgba_4_f16!(bounds_start, src.as_ptr(), read_weights, store); jx += 4; } @@ -147,10 +144,7 @@ unsafe fn xconvolve_horizontal_rgba_neon_row_one_f16_impl( let ptr = weights_ptr.add(jx + filter_offset); let read_weights_h = vld1_f32(ptr); let read_weights = xvcvt_f16_f32(vcombine_f32(read_weights_h, read_weights_h)); - let w0 = xvdup_lane_f16::<0>(read_weights); - let w1 = xvdup_lane_f16::<1>(read_weights); - let set = (w0, w1); - store = conv_horiz_rgba_2_f32!(bounds_start, unsafe_source_ptr_0, set, store); + store = conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store); jx += 2; } @@ -158,12 +152,12 @@ unsafe fn xconvolve_horizontal_rgba_neon_row_one_f16_impl( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let weight0 = xvcvt_f16_f32(vld1q_dup_f32(ptr)); - store = conv_horiz_rgba_1_f16!(bounds_start, unsafe_source_ptr_0, weight0, store); + store = conv_horiz_rgba_1_f16!(bounds_start, src.as_ptr(), weight0, store); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); xvst_f16(dest_ptr, store); filter_offset += filter_weights.aligned_size; @@ -171,13 +165,13 @@ unsafe fn xconvolve_horizontal_rgba_neon_row_one_f16_impl( } } -pub fn xconvolve_horizontal_rgba_neon_rows_4_f16( +pub(crate) fn xconvolve_horizontal_rgba_neon_rows_4_f16( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, + unsafe_source_ptr_0: &[half::f16], src_stride: usize, - unsafe_destination_ptr_0: *mut half::f16, + unsafe_destination_ptr_0: &mut [half::f16], dst_stride: usize, ) { unsafe { @@ -198,9 +192,9 @@ unsafe fn xconvolve_horizontal_rgba_neon_rows_4_f16_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const half::f16, + src: &[half::f16], src_stride: usize, - unsafe_destination_ptr_0: *mut half::f16, + dst: &mut [half::f16], dst_stride: usize, ) { const CHANNELS: usize = 4; @@ -218,49 +212,33 @@ unsafe fn xconvolve_horizontal_rgba_neon_rows_4_f16_impl( while jx + 8 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); - let read_weights_h = 
vld1q_f32_x2(ptr); + let read_weights_h = xvld1q_f32_x2(ptr); let read_weights = xvcombine_f16( xvcvt_f16_f32(read_weights_h.0), xvcvt_f16_f32(read_weights_h.1), ); - let w0 = xvdup_laneq_f16::<0>(read_weights); - let w1 = xvdup_laneq_f16::<1>(read_weights); - let w2 = xvdup_laneq_f16::<2>(read_weights); - let w3 = xvdup_laneq_f16::<3>(read_weights); - let w4 = xvdup_laneq_f16::<4>(read_weights); - let w5 = xvdup_laneq_f16::<5>(read_weights); - let w6 = xvdup_laneq_f16::<6>(read_weights); - let w7 = xvdup_laneq_f16::<7>(read_weights); - let set1 = (w0, w1, w2, w3); - let set2 = (w4, w5, w6, w7); let bounds_start = bounds.start + jx; - store_0 = - conv_horiz_rgba_8_f16!(bounds_start, unsafe_source_ptr_0, set1, set2, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_rgba_8_f16!(bounds_start, s_ptr_1, set1, set2, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_rgba_8_f16!(bounds_start, s_ptr2, set1, set2, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_rgba_8_f16!(bounds_start, s_ptr3, set1, set2, store_3); + store_0 = conv_horiz_rgba_8_f16!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_rgba_8_f16!(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_rgba_8_f16!(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_rgba_8_f16!(bounds_start, s_ptr3, read_weights, store_3); jx += 8; } while jx + 4 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); let read_weights = xvcvt_f16_f32(vld1q_f32(ptr)); - let w0 = xvdup_lane_f16::<0>(read_weights); - let w1 = xvdup_lane_f16::<1>(read_weights); - let w2 = xvdup_lane_f16::<2>(read_weights); - let w3 = xvdup_lane_f16::<3>(read_weights); - let set1 = (w0, w1, w2, w3); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_4_f16!(bounds_start, unsafe_source_ptr_0, set1, store_0); - let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr_1, set1, store_1); - let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr2, set1, store_2); - let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr3, set1, store_3); + store_0 = conv_horiz_rgba_4_f16!(bounds_start, src.as_ptr(), read_weights, store_0); + let s_ptr_1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_rgba_4_f16!(bounds_start, s_ptr3, read_weights, store_3); jx += 4; } @@ -268,17 +246,14 @@ unsafe fn xconvolve_horizontal_rgba_neon_rows_4_f16_impl( let ptr = weights_ptr.add(jx + filter_offset); let read_weights_h = vld1_f32(ptr); let read_weights = xvcvt_f16_f32(vcombine_f32(read_weights_h, read_weights_h)); - let w0 = xvdup_lane_f16::<0>(read_weights); - let w1 = xvdup_lane_f16::<1>(read_weights); - let set = (w0, w1); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_2_f32!(bounds_start, unsafe_source_ptr_0, set, store_0); - let ptr_1 = 
unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_rgba_2_f32!(bounds_start, ptr_1, set, store_1); - let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_rgba_2_f32!(bounds_start, ptr_2, set, store_2); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_rgba_2_f32!(bounds_start, ptr_3, set, store_3); + store_0 = conv_horiz_rgba_2_f32!(bounds_start, src.as_ptr(), read_weights, store_0); + let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); + store_1 = conv_horiz_rgba_2_f32!(bounds_start, ptr_1, read_weights, store_1); + let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); + store_2 = conv_horiz_rgba_2_f32!(bounds_start, ptr_2, read_weights, store_2); + let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); + store_3 = conv_horiz_rgba_2_f32!(bounds_start, ptr_3, read_weights, store_3); jx += 2; } @@ -286,27 +261,27 @@ unsafe fn xconvolve_horizontal_rgba_neon_rows_4_f16_impl( let ptr = weights_ptr.add(jx + filter_offset); let weight0 = xvcvt_f16_f32(vld1q_dup_f32(ptr)); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_1_f16!(bounds_start, unsafe_source_ptr_0, weight0, store_0); - let ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_0 = conv_horiz_rgba_1_f16!(bounds_start, src.as_ptr(), weight0, store_0); + let ptr_1 = src.get_unchecked(src_stride..).as_ptr(); store_1 = conv_horiz_rgba_1_f16!(bounds_start, ptr_1, weight0, store_1); - let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + let ptr_2 = src.get_unchecked(src_stride * 2..).as_ptr(); store_2 = conv_horiz_rgba_1_f16!(bounds_start, ptr_2, weight0, store_2); - let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + let ptr_3 = src.get_unchecked(src_stride * 3..).as_ptr(); store_3 = conv_horiz_rgba_1_f16!(bounds_start, ptr_3, weight0, store_3); jx += 1; } let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); xvst_f16(dest_ptr, store_0); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); xvst_f16(dest_ptr, store_1); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); xvst_f16(dest_ptr, store_2); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); xvst_f16(dest_ptr, store_3); filter_offset += filter_weights.aligned_size; diff --git a/src/neon/rgba_f32.rs b/src/neon/rgba_f32.rs index 0f60df9..3db6b0e 100644 --- a/src/neon/rgba_f32.rs +++ b/src/neon/rgba_f32.rs @@ -28,41 +28,41 @@ */ use crate::filter_weights::FilterWeights; -use crate::neon::utils::prefer_vfmaq_f32; -use crate::neon::utils::xvld1q_f32_x4; +use crate::neon::utils::{prefer_vfmaq_f32, prefer_vfmaq_laneq_f32, xvld1q_f32_x2}; +use crate::neon::utils::{prefer_vfmaq_lane_f32, xvld1q_f32_x4}; use std::arch::aarch64::*; macro_rules! 
conv_horiz_rgba_8_f32 { - ($start_x: expr, $src: expr, $set1: expr, $set2: expr, $store: expr) => {{ + ($start_x: expr, $src: expr, $weights1: expr, $weights2: expr, $store: expr) => {{ const COMPONENTS: usize = 4; let src_ptr = $src.add($start_x * COMPONENTS); let rgb_pixel0 = xvld1q_f32_x4(src_ptr); let rgb_pixel1 = xvld1q_f32_x4(src_ptr.add(16)); - let mut acc = prefer_vfmaq_f32($store, rgb_pixel0.0, $set1.0); - acc = prefer_vfmaq_f32(acc, rgb_pixel0.1, $set1.1); - acc = prefer_vfmaq_f32(acc, rgb_pixel0.2, $set1.2); - acc = prefer_vfmaq_f32(acc, rgb_pixel0.3, $set1.3); - acc = prefer_vfmaq_f32(acc, rgb_pixel1.0, $set2.0); - acc = prefer_vfmaq_f32(acc, rgb_pixel1.1, $set2.1); - acc = prefer_vfmaq_f32(acc, rgb_pixel1.2, $set2.2); - acc = prefer_vfmaq_f32(acc, rgb_pixel1.3, $set2.3); + let mut acc = prefer_vfmaq_laneq_f32::<0>($store, rgb_pixel0.0, $weights1); + acc = prefer_vfmaq_laneq_f32::<1>(acc, rgb_pixel0.1, $weights1); + acc = prefer_vfmaq_laneq_f32::<2>(acc, rgb_pixel0.2, $weights1); + acc = prefer_vfmaq_laneq_f32::<3>(acc, rgb_pixel0.3, $weights1); + acc = prefer_vfmaq_laneq_f32::<0>(acc, rgb_pixel1.0, $weights2); + acc = prefer_vfmaq_laneq_f32::<1>(acc, rgb_pixel1.1, $weights2); + acc = prefer_vfmaq_laneq_f32::<2>(acc, rgb_pixel1.2, $weights2); + acc = prefer_vfmaq_laneq_f32::<3>(acc, rgb_pixel1.3, $weights2); acc }}; } macro_rules! conv_horiz_rgba_4_f32 { - ($start_x: expr, $src: expr, $set1: expr, $store: expr) => {{ + ($start_x: expr, $src: expr, $weights: expr, $store: expr) => {{ const COMPONENTS: usize = 4; let src_ptr = $src.add($start_x * COMPONENTS); let rgb_pixel = xvld1q_f32_x4(src_ptr); - let acc = prefer_vfmaq_f32($store, rgb_pixel.0, $set1.0); - let acc = prefer_vfmaq_f32(acc, rgb_pixel.1, $set1.1); - let acc = prefer_vfmaq_f32(acc, rgb_pixel.2, $set1.2); - let acc = prefer_vfmaq_f32(acc, rgb_pixel.3, $set1.3); + let acc = prefer_vfmaq_laneq_f32::<0>($store, rgb_pixel.0, $weights); + let acc = prefer_vfmaq_laneq_f32::<1>(acc, rgb_pixel.1, $weights); + let acc = prefer_vfmaq_laneq_f32::<2>(acc, rgb_pixel.2, $weights); + let acc = prefer_vfmaq_laneq_f32::<3>(acc, rgb_pixel.3, $weights); acc }}; } @@ -72,10 +72,10 @@ macro_rules! conv_horiz_rgba_2_f32 { const COMPONENTS: usize = 4; let src_ptr = $src.add($start_x * COMPONENTS); - let rgb_pixel = vld1q_f32_x2(src_ptr); + let rgb_pixel = xvld1q_f32_x2(src_ptr); - let mut acc = prefer_vfmaq_f32($store, rgb_pixel.0, $set.0); - acc = prefer_vfmaq_f32(acc, rgb_pixel.1, $set.1); + let mut acc = prefer_vfmaq_lane_f32::<0>($store, rgb_pixel.0, $set); + acc = prefer_vfmaq_lane_f32::<1>(acc, rgb_pixel.1, $set); acc }}; } @@ -90,7 +90,7 @@ macro_rules! 
conv_horiz_rgba_1_f32 { }}; } -pub fn convolve_horizontal_rgba_neon_row_one( +pub(crate) fn convolve_horizontal_rgba_neon_row_one( dst_width: usize, _: usize, filter_weights: &FilterWeights, @@ -111,12 +111,8 @@ pub fn convolve_horizontal_rgba_neon_row_one( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let set1 = (w0, w1, w2, w3); - store = conv_horiz_rgba_4_f32!(bounds_start, unsafe_source_ptr_0, set1, store); + store = + conv_horiz_rgba_4_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store); jx += 4; } @@ -124,10 +120,8 @@ pub fn convolve_horizontal_rgba_neon_row_one( let bounds_start = bounds.start + jx; let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - let w0 = vdupq_lane_f32::<0>(read_weights); - let w1 = vdupq_lane_f32::<1>(read_weights); - let set = (w0, w1); - store = conv_horiz_rgba_2_f32!(bounds_start, unsafe_source_ptr_0, set, store); + store = + conv_horiz_rgba_2_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store); jx += 2; } @@ -148,7 +142,7 @@ pub fn convolve_horizontal_rgba_neon_row_one( } } -pub fn convolve_horizontal_rgba_neon_rows_4( +pub(crate) fn convolve_horizontal_rgba_neon_rows_4( dst_width: usize, _: usize, filter_weights: &FilterWeights, @@ -173,62 +167,77 @@ pub fn convolve_horizontal_rgba_neon_rows_4( while jx + 8 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); - let read_weights = vld1q_f32_x2(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights.0); - let w1 = vdupq_laneq_f32::<1>(read_weights.0); - let w2 = vdupq_laneq_f32::<2>(read_weights.0); - let w3 = vdupq_laneq_f32::<3>(read_weights.0); - let w4 = vdupq_laneq_f32::<0>(read_weights.1); - let w5 = vdupq_laneq_f32::<1>(read_weights.1); - let w6 = vdupq_laneq_f32::<2>(read_weights.1); - let w7 = vdupq_laneq_f32::<3>(read_weights.1); - let set1 = (w0, w1, w2, w3); - let set2 = (w4, w5, w6, w7); + let read_weights = xvld1q_f32_x2(ptr); let bounds_start = bounds.start + jx; - store_0 = - conv_horiz_rgba_8_f32!(bounds_start, unsafe_source_ptr_0, set1, set2, store_0); + store_0 = conv_horiz_rgba_8_f32!( + bounds_start, + unsafe_source_ptr_0, + read_weights.0, + read_weights.1, + store_0 + ); let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_rgba_8_f32!(bounds_start, s_ptr_1, set1, set2, store_1); + store_1 = conv_horiz_rgba_8_f32!( + bounds_start, + s_ptr_1, + read_weights.0, + read_weights.1, + store_1 + ); let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_rgba_8_f32!(bounds_start, s_ptr2, set1, set2, store_2); + store_2 = conv_horiz_rgba_8_f32!( + bounds_start, + s_ptr2, + read_weights.0, + read_weights.1, + store_2 + ); let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_rgba_8_f32!(bounds_start, s_ptr3, set1, set2, store_3); + store_3 = conv_horiz_rgba_8_f32!( + bounds_start, + s_ptr3, + read_weights.0, + read_weights.1, + store_3 + ); jx += 8; } while jx + 4 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1q_f32(ptr); - let w0 = vdupq_laneq_f32::<0>(read_weights); - let w1 = vdupq_laneq_f32::<1>(read_weights); - let w2 = vdupq_laneq_f32::<2>(read_weights); - let w3 = vdupq_laneq_f32::<3>(read_weights); - let set1 = (w0, w1, w2, w3); let bounds_start = bounds.start + jx; - 
store_0 = conv_horiz_rgba_4_f32!(bounds_start, unsafe_source_ptr_0, set1, store_0); + store_0 = conv_horiz_rgba_4_f32!( + bounds_start, + unsafe_source_ptr_0, + read_weights, + store_0 + ); let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_rgba_4_f32!(bounds_start, s_ptr_1, set1, store_1); + store_1 = conv_horiz_rgba_4_f32!(bounds_start, s_ptr_1, read_weights, store_1); let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_rgba_4_f32!(bounds_start, s_ptr2, set1, store_2); + store_2 = conv_horiz_rgba_4_f32!(bounds_start, s_ptr2, read_weights, store_2); let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_rgba_4_f32!(bounds_start, s_ptr3, set1, store_3); + store_3 = conv_horiz_rgba_4_f32!(bounds_start, s_ptr3, read_weights, store_3); jx += 4; } while jx + 2 < bounds.size { let ptr = weights_ptr.add(jx + filter_offset); let read_weights = vld1_f32(ptr); - let w0 = vdupq_lane_f32::<0>(read_weights); - let w1 = vdupq_lane_f32::<1>(read_weights); - let set = (w0, w1); let bounds_start = bounds.start + jx; - store_0 = conv_horiz_rgba_2_f32!(bounds_start, unsafe_source_ptr_0, set, store_0); + store_0 = conv_horiz_rgba_2_f32!( + bounds_start, + unsafe_source_ptr_0, + read_weights, + store_0 + ); let ptr_1 = unsafe_source_ptr_0.add(src_stride); - store_1 = conv_horiz_rgba_2_f32!(bounds_start, ptr_1, set, store_1); + store_1 = conv_horiz_rgba_2_f32!(bounds_start, ptr_1, read_weights, store_1); let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); - store_2 = conv_horiz_rgba_2_f32!(bounds_start, ptr_2, set, store_2); + store_2 = conv_horiz_rgba_2_f32!(bounds_start, ptr_2, read_weights, store_2); let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); - store_3 = conv_horiz_rgba_2_f32!(bounds_start, ptr_3, set, store_3); + store_3 = conv_horiz_rgba_2_f32!(bounds_start, ptr_3, read_weights, store_3); jx += 2; } diff --git a/src/neon/rgba_u16_lb.rs b/src/neon/rgba_u16_lb.rs index 36f2d91..8570fe1 100644 --- a/src/neon/rgba_u16_lb.rs +++ b/src/neon/rgba_u16_lb.rs @@ -113,7 +113,7 @@ unsafe fn conv_horiz_rgba_8_u16( acc } -pub fn convolve_horizontal_rgba_neon_rows_4_lb_u16( +pub(crate) fn convolve_horizontal_rgba_neon_rows_4_lb_u16( src: &[u16], src_stride: usize, dst: &mut [u16], @@ -218,7 +218,7 @@ pub fn convolve_horizontal_rgba_neon_rows_4_lb_u16( } } -pub fn convolve_horizontal_rgba_neon_u16_lb_row( +pub(crate) fn convolve_horizontal_rgba_neon_u16_lb_row( src: &[u16], dst: &mut [u16], filter_weights: &FilterWeights, diff --git a/src/neon/rgba_u8.rs b/src/neon/rgba_u8.rs index 62373f0..d4fa251 100644 --- a/src/neon/rgba_u8.rs +++ b/src/neon/rgba_u8.rs @@ -195,7 +195,7 @@ unsafe fn conv_horiz_rgba_1_u8_i16( vqrdmlah_s16(store, lo, w0) } -pub fn convolve_horizontal_rgba_neon_rows_4_u8_i16( +pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8_i16( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -334,7 +334,7 @@ unsafe fn convolve_horizontal_rgba_neon_rows_4_u8_i16_impl( } } -pub fn convolve_horizontal_rgba_neon_rows_4_u8( +pub(crate) fn convolve_horizontal_rgba_neon_rows_4_u8( src: &[u8], src_stride: usize, dst: &mut [u8], @@ -452,7 +452,7 @@ pub fn convolve_horizontal_rgba_neon_rows_4_u8( } } -pub fn convolve_horizontal_rgba_neon_row( +pub(crate) fn convolve_horizontal_rgba_neon_row( src: &[u8], dst: &mut [u8], filter_weights: &FilterWeights, @@ -516,7 +516,7 @@ pub fn convolve_horizontal_rgba_neon_row( } } -pub fn convolve_horizontal_rgba_neon_row_i16( +pub(crate) fn convolve_horizontal_rgba_neon_row_i16( src: &[u8], dst: &mut [u8], 
filter_weights: &FilterWeights<i16>, diff --git a/src/neon/utils.rs b/src/neon/utils.rs index e21a1d8..a17f401 100644 --- a/src/neon/utils.rs +++ b/src/neon/utils.rs @@ -69,6 +69,11 @@ pub(crate) unsafe fn xvld1q_f32_x4(a: *const f32) -> float32x4x4_t { ) } +#[inline(always)] +pub(crate) unsafe fn xvld1q_f32_x2(a: *const f32) -> float32x4x2_t { + float32x4x2_t(vld1q_f32(a), vld1q_f32(a.add(4))) +} + #[inline(always)] pub(crate) unsafe fn xvst1q_u8_x2(ptr: *mut u8, b: uint8x16x2_t) { vst1q_u8(ptr, b.0); @@ -99,6 +104,24 @@ pub(crate) unsafe fn prefer_vfmaq_f32( } } +#[inline(always)] +pub(crate) unsafe fn prefer_vfmaq_laneq_f32<const LANE: i32>( + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, +) -> float32x4_t { + vfmaq_laneq_f32::<LANE>(a, b, c) +} + +#[inline(always)] +pub(crate) unsafe fn prefer_vfmaq_lane_f32<const LANE: i32>( + a: float32x4_t, + b: float32x4_t, + c: float32x2_t, +) -> float32x4_t { + vfmaq_lane_f32::<LANE>(a, b, c) +} + #[inline(always)] pub(crate) unsafe fn load_3b_as_u16x4(src_ptr: *const u8) -> uint16x4_t { let v_new_value1 = u16::from_le_bytes([src_ptr.read_unaligned(), 0]); @@ -117,20 +140,3 @@ pub(crate) unsafe fn load_4b_as_u16x4(src_ptr: *const u8) -> uint16x4_t { let arr = [v_new_value1, v_new_value2, v_new_value3, v_new_value4]; vld1_u16(arr.as_ptr()) } - -#[inline(always)] -pub(crate) unsafe fn vsplit_rgb_5(px: float32x4x4_t) -> Float32x5T { - let first_pixel = px.0; - let second_pixel = vextq_f32::<3>(px.0, px.1); - let third_pixel = vextq_f32::<2>(px.1, px.2); - let four_pixel = vextq_f32::<1>(px.2, px.3); - Float32x5T(first_pixel, second_pixel, third_pixel, four_pixel, px.3) -} - -pub(crate) struct Float32x5T( - pub(crate) float32x4_t, - pub(crate) float32x4_t, - pub(crate) float32x4_t, - pub(crate) float32x4_t, - pub(crate) float32x4_t, -); diff --git a/src/neon/vertical_f16.rs b/src/neon/vertical_f16.rs index 1a7a6ef..49e8315 100644 --- a/src/neon/vertical_f16.rs +++ b/src/neon/vertical_f16.rs @@ -46,7 +46,7 @@ macro_rules! conv_vertical_part_neon_16_f16 { for j in 0..$bounds.size { let py = $start_y + j; let v_weight = vld1q_dup_f32($filter.get_unchecked(j..).as_ptr()); - let src_ptr = $src.add($src_stride * py); + let src_ptr = $src.get_unchecked($src_stride * py..).as_ptr(); let s_ptr = src_ptr.add(px); let item_row = xvldq_f16_x2(s_ptr); @@ -61,7 +61,7 @@ macro_rules! conv_vertical_part_neon_16_f16 { prefer_vfmaq_f32(store_3, xvcvt_f32_f16(xvget_high_f16(item_row.1)), v_weight); } - let dst_ptr = $dst.add(px); + let dst_ptr = $dst.get_unchecked_mut(px..).as_mut_ptr(); let f_set = x_float16x8x2_t( xcombine_f16(xvcvt_f16_f32(store_0), xvcvt_f16_f32(store_1)), xcombine_f16(xvcvt_f16_f32(store_2), xvcvt_f16_f32(store_3)), @@ -88,7 +88,7 @@ macro_rules! conv_vertical_part_neon_32_f16 { for j in 0..$bounds.size { let py = $start_y + j; let v_weight = vld1q_dup_f32($filter.get_unchecked(j..).as_ptr()); - let src_ptr = $src.add($src_stride * py); + let src_ptr = $src.get_unchecked($src_stride * py..).as_ptr(); let s_ptr = src_ptr.add(px); let item_row = xvldq_f16_x4(s_ptr); @@ -112,7 +112,7 @@ macro_rules! conv_vertical_part_neon_32_f16 { prefer_vfmaq_f32(store_7, xvcvt_f32_f16(xvget_high_f16(item_row.3)), v_weight); } - let dst_ptr = $dst.add(px); + let dst_ptr = $dst.get_unchecked_mut(px..).as_mut_ptr(); let f_set = x_float16x8x4_t( xcombine_f16(xvcvt_f16_f32(store_0), xvcvt_f16_f32(store_1)), xcombine_f16(xvcvt_f16_f32(store_2), xvcvt_f16_f32(store_3)), @@ -147,7 +147,7 @@ macro_rules! 
conv_vertical_part_neon_48_f16 { for j in 0..$bounds.size { let py = $start_y + j; let v_weight = vld1q_dup_f32($filter.get_unchecked(j..).as_ptr()); - let src_ptr = $src.add($src_stride * py); + let src_ptr = $src.get_unchecked($src_stride * py..).as_ptr(); let s_ptr = src_ptr.add(px); let item_row_0 = xvldq_f16_x4(s_ptr); @@ -217,7 +217,7 @@ macro_rules! conv_vertical_part_neon_48_f16 { ); } - let dst_ptr = $dst.add(px); + let dst_ptr = $dst.get_unchecked_mut(px..).as_mut_ptr(); let f_set = x_float16x8x4_t( xcombine_f16(xvcvt_f16_f32(store_0), xvcvt_f16_f32(store_1)), xcombine_f16(xvcvt_f16_f32(store_2), xvcvt_f16_f32(store_3)), @@ -236,11 +236,11 @@ macro_rules! conv_vertical_part_neon_48_f16 { }}; } -pub fn convolve_vertical_rgb_neon_row_f16<const CHANNELS: usize>( +pub(crate) fn convolve_vertical_rgb_neon_row_f16<const CHANNELS: usize>( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -248,43 +248,19 @@ pub fn convolve_vertical_rgb_neon_row_f16<const CHANNELS: usize>( let dst_width = width * CHANNELS; while cx + 48 < dst_width { - conv_vertical_part_neon_48_f16!( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds - ); + conv_vertical_part_neon_48_f16!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 48; } while cx + 32 < dst_width { - conv_vertical_part_neon_32_f16!( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds - ); + conv_vertical_part_neon_32_f16!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 32; } while cx + 16 < dst_width { - conv_vertical_part_neon_16_f16!( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds - ); + conv_vertical_part_neon_16_f16!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 16; } @@ -294,9 +270,9 @@ pub fn convolve_vertical_rgb_neon_row_f16<const CHANNELS: usize>( convolve_vertical_part_neon_8_f16::<false>( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, 8, @@ -313,9 +289,9 @@ pub fn convolve_vertical_rgb_neon_row_f16<const CHANNELS: usize>( convolve_vertical_part_neon_8_f16::<true>( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, left, diff --git a/src/neon/vertical_f16_full.rs b/src/neon/vertical_f16_full.rs index d6ae278..2630b25 100644 --- a/src/neon/vertical_f16_full.rs +++ b/src/neon/vertical_f16_full.rs @@ -35,9 +35,9 @@ use crate::neon::*; pub(crate) unsafe fn xconvolve_vertical_part_neon_8_f16<const USE_BLENDING: bool>( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, blend_length: usize, ) { @@ -51,7 +51,7 @@ pub(crate) unsafe fn xconvolve_vertical_part_neon_8_f16< -pub fn xconvolve_vertical_rgb_neon_row_f16<const CHANNELS: usize>( +pub(crate) fn xconvolve_vertical_rgb_neon_row_f16<const CHANNELS: usize>( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { unsafe { xconvolve_vertical_rgb_neon_row_f16_impl::<CHANNELS>( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } } @@ -202,8 +197,8 @@ pub(crate) fn xconvolve_vertical_rgb_neon_row_f16<const CHANNELS: usize>( pub unsafe fn 
xconvolve_vertical_rgb_neon_row_f16_impl<const CHANNELS: usize>( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -211,43 +206,19 @@ pub unsafe fn xconvolve_vertical_rgb_neon_row_f16_impl<const CHANNELS: usize>( let dst_width = width * CHANNELS; while cx + 48 < dst_width { - conv_vertical_part_neon_48_f16!( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds - ); + conv_vertical_part_neon_48_f16!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 48; } while cx + 32 < dst_width { - conv_vertical_part_neon_32_f16!( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds - ); + conv_vertical_part_neon_32_f16!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 32; } while cx + 16 < dst_width { - conv_vertical_part_neon_16_f16!( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds - ); + conv_vertical_part_neon_16_f16!(bounds.start, cx, src, src_stride, dst, weight_ptr, bounds); cx += 16; } @@ -257,9 +228,9 @@ pub unsafe fn xconvolve_vertical_rgb_neon_row_f16_impl<const CHANNELS: usize>( xconvolve_vertical_part_neon_8_f16::<false>( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, 8, @@ -276,9 +247,9 @@ pub unsafe fn xconvolve_vertical_rgb_neon_row_f16_impl<const CHANNELS: usize>( xconvolve_vertical_part_neon_8_f16::<true>( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, left, diff --git a/src/neon/vertical_f32.rs b/src/neon/vertical_f32.rs index d1a241f..dbb1b6d 100644 --- a/src/neon/vertical_f32.rs +++ b/src/neon/vertical_f32.rs @@ -27,8 +27,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ use crate::filter_weights::FilterBounds; -use crate::neon::utils::prefer_vfmaq_f32; use crate::neon::utils::xvld1q_f32_x4; +use crate::neon::utils::{prefer_vfmaq_f32, xvld1q_f32_x2}; use std::arch::aarch64::*; macro_rules! 
conv_vertical_part_neon_16_f32 { @@ -187,7 +187,7 @@ unsafe fn convolve_vertical_part_neon_8_f32( let src_ptr = src.add(src_stride * py); let s_ptr = src_ptr.add(px); - let item_row = vld1q_f32_x2(s_ptr); + let item_row = xvld1q_f32_x2(s_ptr); store_0 = prefer_vfmaq_f32(store_0, item_row.0, v_weight); store_1 = prefer_vfmaq_f32(store_1, item_row.1, v_weight); @@ -220,7 +220,7 @@ unsafe fn convolve_vertical_part_neon_4_f32( let src_ptr = src.add(src_stride * py); let s_ptr = src_ptr.add(px); - let item_row = vld1q_f32_x2(s_ptr); + let item_row = xvld1q_f32_x2(s_ptr); store_0 = prefer_vfmaq_f32(store_0, item_row.0, v_weight); } @@ -259,7 +259,7 @@ unsafe fn convolve_vertical_part_neon_1_f32( dst_ptr.write_unaligned(vgetq_lane_f32::<0>(store_0)); } -pub fn convolve_vertical_rgb_neon_row_f32( +pub(crate) fn convolve_vertical_rgb_neon_row_f32( width: usize, bounds: &FilterBounds, unsafe_source_ptr_0: *const f32, diff --git a/src/neon/vertical_u16.rs b/src/neon/vertical_u16.rs index a268f4c..bf34f50 100644 --- a/src/neon/vertical_u16.rs +++ b/src/neon/vertical_u16.rs @@ -32,7 +32,7 @@ use crate::neon::utils::prefer_vfmaq_f32; use std::arch::aarch64::*; #[inline(always)] -pub fn convolve_column_u16( +pub(crate) fn convolve_column_u16( _: usize, bounds: &FilterBounds, src: &[u16], diff --git a/src/neon/vertical_u16_lb.rs b/src/neon/vertical_u16_lb.rs index 7dc925d..f76a525 100644 --- a/src/neon/vertical_u16_lb.rs +++ b/src/neon/vertical_u16_lb.rs @@ -31,7 +31,7 @@ use crate::support::{PRECISION, ROUNDING_CONST}; use std::arch::aarch64::*; #[inline(always)] -pub fn convolve_column_lb_u16( +pub(crate) fn convolve_column_lb_u16( _: usize, bounds: &FilterBounds, src: &[u16], diff --git a/src/neon/vertical_u8.rs b/src/neon/vertical_u8.rs index 667fb39..1e01da3 100644 --- a/src/neon/vertical_u8.rs +++ b/src/neon/vertical_u8.rs @@ -69,7 +69,7 @@ macro_rules! 
accumulate_4_into_lane { }}; } -pub fn convolve_vertical_neon_i16_precision( +pub(crate) fn convolve_vertical_neon_i16_precision( width: usize, bounds: &FilterBounds, src: &[u8], @@ -82,7 +82,7 @@ pub fn convolve_vertical_neon_i16_precision( } } -pub fn convolve_vertical_neon_i32_precision( +pub(crate) fn convolve_vertical_neon_i32_precision( width: usize, bounds: &FilterBounds, src: &[u8], diff --git a/src/sse/rgb_f16.rs b/src/sse/rgb_f16.rs index 18ba209..4f145c5 100644 --- a/src/sse/rgb_f16.rs +++ b/src/sse/rgb_f16.rs @@ -150,8 +150,8 @@ pub(crate) fn convolve_horizontal_rgb_sse_row_one_f16, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { unsafe { if F16C { @@ -160,16 +160,16 @@ pub(crate) fn convolve_horizontal_rgb_sse_row_one_f16, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { convolve_horizontal_rgb_sse_row_one_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -208,15 +208,15 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f16c( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { convolve_horizontal_rgb_sse_row_one_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -226,15 +226,15 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f16c_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { convolve_horizontal_rgb_sse_row_one_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -243,8 +243,8 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f16_impl, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { const CHANNELS: usize = 3; let mut filter_offset = 0usize; @@ -261,7 +261,7 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, weight2, @@ -283,7 +283,7 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, store, @@ -297,7 +297,7 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, store, ); @@ -307,7 +307,7 @@ unsafe fn convolve_horizontal_rgb_sse_row_one_f16_impl(store); let px = x * CHANNELS; - let dest_ptr = unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); (dest_ptr as *mut i32).write_unaligned(_mm_extract_epi32::<0>(store_ph)); (dest_ptr as *mut i16) .add(2) @@ -321,9 +321,9 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f16, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { unsafe { @@ -333,9 +333,9 @@ pub(crate) fn convolve_horizontal_rgb_sse_rows_4_f16, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { convolve_horizontal_rgb_sse_rows_4_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); 
} @@ -391,18 +391,18 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f16c( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { convolve_horizontal_rgb_sse_rows_4_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -413,30 +413,30 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f16c_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { convolve_horizontal_rgb_sse_rows_4_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } -#[inline] +#[inline(always)] unsafe fn convolve_horizontal_rgb_sse_rows_4_f16_impl( dst_width: usize, _: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { const CHANNELS: usize = 3; @@ -458,7 +458,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, weight2, @@ -467,7 +467,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..).as_ptr(), weight0, weight1, weight2, @@ -476,7 +476,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight0, weight1, weight2, @@ -485,7 +485,7 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight0, weight1, weight2, @@ -507,28 +507,28 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, store_0, ); store_1 = convolve_horizontal_parts_2_rgb_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..).as_ptr(), weight0, weight1, store_1, ); store_2 = convolve_horizontal_parts_2_rgb_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight0, weight1, store_2, ); store_3 = convolve_horizontal_parts_2_rgb_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight0, weight1, store_3, @@ -542,25 +542,25 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, store_0, ); store_1 = convolve_horizontal_parts_one_rgb_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..).as_ptr(), weight0, store_1, ); store_2 = convolve_horizontal_parts_one_rgb_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight0, store_2, ); store_3 = convolve_horizontal_parts_one_rgb_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight0, store_3, ); @@ -573,25 +573,25 @@ unsafe fn convolve_horizontal_rgb_sse_rows_4_f16_impl(store_3); let px = x * CHANNELS; - let dest_ptr = 
unsafe_destination_ptr_0.add(px); + let dest_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); (dest_ptr as *mut i32).write_unaligned(_mm_extract_epi32::<0>(store_ph_0)); (dest_ptr as *mut i16) .add(2) .write_unaligned(_mm_extract_epi16::<2>(store_ph_0) as i16); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride..).as_mut_ptr(); (dest_ptr as *mut i32).write_unaligned(_mm_extract_epi32::<0>(store_ph_1)); (dest_ptr as *mut i16) .add(2) .write_unaligned(_mm_extract_epi16::<2>(store_ph_1) as i16); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 2..).as_mut_ptr(); (dest_ptr as *mut i32).write_unaligned(_mm_extract_epi32::<0>(store_ph_2)); (dest_ptr as *mut i16) .add(2) .write_unaligned(_mm_extract_epi16::<2>(store_ph_2) as i16); - let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let dest_ptr = dst.get_unchecked_mut(px + dst_stride * 3..).as_mut_ptr(); (dest_ptr as *mut i32).write_unaligned(_mm_extract_epi32::<0>(store_ph_3)); (dest_ptr as *mut i16) .add(2) diff --git a/src/sse/rgba_f16.rs b/src/sse/rgba_f16.rs index 6b7b1e7..f29b541 100644 --- a/src/sse/rgba_f16.rs +++ b/src/sse/rgba_f16.rs @@ -107,8 +107,8 @@ pub(crate) fn convolve_horizontal_rgba_sse_row_one_f16, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { unsafe { if F16C { @@ -117,16 +117,16 @@ pub(crate) fn convolve_horizontal_rgba_sse_row_one_f16, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { convolve_horizontal_rgba_sse_row_one_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -165,15 +165,15 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f16c( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { convolve_horizontal_rgba_sse_row_one_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -183,15 +183,15 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f16c_fma( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { convolve_horizontal_rgba_sse_row_one_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, + src, + dst, ); } @@ -200,8 +200,8 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f16_impl, - unsafe_source_ptr_0: *const f16, - unsafe_destination_ptr_0: *mut f16, + src: &[f16], + dst: &mut [f16], ) { unsafe { const CHANNELS: usize = 4; @@ -219,7 +219,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, weight2, @@ -241,7 +241,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, store, @@ -255,7 +255,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, store, ); @@ -263,7 +263,7 @@ unsafe fn convolve_horizontal_rgba_sse_row_one_f16_impl(store); std::ptr::copy_nonoverlapping( &converted_f16 as *const _ as *const u8, @@ -280,9 +280,9 @@ pub(crate) fn 
convolve_horizontal_rgba_sse_rows_4_f16, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { unsafe { @@ -292,9 +292,9 @@ pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f16, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { convolve_horizontal_rgba_sse_rows_4_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -350,18 +350,18 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16c( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { convolve_horizontal_rgba_sse_rows_4_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -372,18 +372,18 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_regular( dst_width: usize, src_width: usize, filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { convolve_horizontal_rgba_sse_rows_4_f16_impl::( dst_width, src_width, filter_weights, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, dst_stride, ); } @@ -393,9 +393,9 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_impl, - unsafe_source_ptr_0: *const f16, + src: &[f16], src_stride: usize, - unsafe_destination_ptr_0: *mut f16, + dst: &mut [f16], dst_stride: usize, ) { const CHANNELS: usize = 4; @@ -417,7 +417,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, weight2, @@ -426,7 +426,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..).as_ptr(), weight0, weight1, weight2, @@ -435,7 +435,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight0, weight1, weight2, @@ -444,7 +444,7 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight0, weight1, weight2, @@ -466,28 +466,28 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, weight1, store_0, ); store_1 = convolve_horizontal_parts_2_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride), + src.get_unchecked(src_stride..).as_ptr(), weight0, weight1, store_1, ); store_2 = convolve_horizontal_parts_2_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight0, weight1, store_2, ); store_3 = convolve_horizontal_parts_2_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight0, weight1, store_3, @@ -501,25 +501,25 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_impl( filter_start, - unsafe_source_ptr_0, + src.as_ptr(), weight0, store_0, ); store_1 = convolve_horizontal_parts_one_rgba_f16::( filter_start, - unsafe_source_ptr_0.add(src_stride), + 
src.get_unchecked(src_stride..).as_ptr(), weight0, store_1, ); store_2 = convolve_horizontal_parts_one_rgba_f16::<F16C>( filter_start, - unsafe_source_ptr_0.add(src_stride * 2), + src.get_unchecked(src_stride * 2..).as_ptr(), weight0, store_2, ); store_3 = convolve_horizontal_parts_one_rgba_f16::<F16C>( filter_start, - unsafe_source_ptr_0.add(src_stride * 3), + src.get_unchecked(src_stride * 3..).as_ptr(), weight0, store_3, ); @@ -527,10 +527,10 @@ unsafe fn convolve_horizontal_rgba_sse_rows_4_f16_impl< let converted_f16_0 = _mm_cvtps_phx::<F16C>(store_0); let converted_f16_1 = _mm_cvtps_phx::<F16C>(store_1); diff --git a/src/sse/vertical_f16.rs b/src/sse/vertical_f16.rs index 50e0ede..6d7ca93 100644 --- a/src/sse/vertical_f16.rs +++ b/src/sse/vertical_f16.rs @@ -39,9 +39,9 @@ use crate::sse::f16_utils::{_mm_cvtph_psx, _mm_cvtps_phx}; pub(crate) unsafe fn convolve_vertical_part_sse_f16<const F16C: bool, const FMA: bool>( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -53,7 +53,7 @@ pub(crate) unsafe fn convolve_vertical_part_sse_f16< (store_0, _mm_cvtph_psx::<F16C>(item_row_0), v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); let converted = _mm_cvtps_phx::<F16C>(store_0); let first_item = _mm_extract_epi16::<0>(converted) as u16; (dst_ptr as *mut u16).write_unaligned(first_item); @@ -71,9 +71,9 @@ pub(crate) unsafe fn convolve_vertical_part_sse_f16< pub(crate) unsafe fn convolve_vertical_part_sse_4_f16<const F16C: bool, const FMA: bool>( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -85,7 +85,7 @@ pub(crate) unsafe fn convolve_vertical_part_sse_4_f16< (store_0, _mm_cvtph_psx::<F16C>(item_row_0), v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); let acc = _mm_cvtps_phx::<F16C>(store_0); std::ptr::copy_nonoverlapping(&acc as *const _ as *const u8, dst_ptr as *mut u8, 8); } @@ -102,9 +102,9 @@ pub(crate) unsafe fn convolve_vertical_part_sse_4_f16< pub(crate) unsafe fn convolve_vertical_part_sse_16_16<const F16C: bool, const FMA: bool>( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -119,7 +119,7 @@ pub(crate) unsafe fn convolve_vertical_part_sse_16_16< (store_3, items3, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); let acc0 = _mm_unpacklo_epi64( _mm_cvtps_phx::<F16C>(store_0), @@ -155,9 +155,9 @@ pub(crate) unsafe fn convolve_vertical_part_sse_16_16< pub(crate) unsafe fn convolve_vertical_part_sse_8_f16<const F16C: bool, const FMA: bool>( start_y: usize, start_x: usize, - src: *const half::f16, + src: &[half::f16], src_stride: usize, - dst: *mut half::f16, + dst: &mut [half::f16], filter: &[f32], bounds: &FilterBounds, ) { @@ -170,7 +170,7 @@ pub(crate) unsafe fn convolve_vertical_part_sse_8_f16< (store_1, items1, v_weight); } - let dst_ptr = dst.add(px); + let dst_ptr = dst.get_unchecked_mut(px..).as_mut_ptr(); let acc0 = _mm_unpacklo_epi64( _mm_cvtps_phx::<F16C>(store_0), _mm_cvtps_phx::<F16C>(store_1), @@ -196,8 +196,8 @@ pub(crate) fn convolve_vertical_sse_row_f16< const CHANNELS: usize, const F16C: bool, const FMA: bool, >( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -205,31 +205,16 @@ pub(crate) fn convolve_vertical_sse_row_f16< if F16C { if FMA { convolve_vertical_sse_row_f16c_fma::<CHANNELS>( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, 
src_stride, weight_ptr, ); } else { convolve_vertical_sse_row_f16c::<CHANNELS>( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } } else { convolve_vertical_sse_row_f16_regular::<CHANNELS>( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } } @@ -240,18 +225,13 @@ pub(crate) fn convolve_vertical_sse_row_f16< unsafe fn convolve_vertical_sse_row_f16_regular<const CHANNELS: usize>( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { convolve_vertical_sse_row_f16_impl::<CHANNELS, false, false>( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } @@ -260,18 +240,13 @@ unsafe fn convolve_vertical_sse_row_f16_regular<const CHANNELS: usize>( unsafe fn convolve_vertical_sse_row_f16c_fma<const CHANNELS: usize>( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { convolve_vertical_sse_row_f16_impl::<CHANNELS, true, true>( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } @@ -280,18 +255,13 @@ unsafe fn convolve_vertical_sse_row_f16c_fma<const CHANNELS: usize>( unsafe fn convolve_vertical_sse_row_f16c<const CHANNELS: usize>( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { convolve_vertical_sse_row_f16_impl::<CHANNELS, true, false>( - width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, + width, bounds, src, dst, src_stride, weight_ptr, ); } @@ -303,8 +273,8 @@ unsafe fn convolve_vertical_sse_row_f16_impl< const CHANNELS: usize, const F16C: bool, const FMA: bool, >( width: usize, bounds: &FilterBounds, - unsafe_source_ptr_0: *const half::f16, - unsafe_destination_ptr_0: *mut half::f16, + src: &[half::f16], + dst: &mut [half::f16], src_stride: usize, weight_ptr: &[f32], ) { @@ -316,9 +286,9 @@ unsafe fn convolve_vertical_sse_row_f16_impl< convolve_vertical_part_sse_16_16::<F16C, FMA>( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -332,9 +302,9 @@ unsafe fn convolve_vertical_sse_row_f16_impl< convolve_vertical_part_sse_8_f16::<F16C, FMA>( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -348,9 +318,9 @@ unsafe fn convolve_vertical_sse_row_f16_impl< convolve_vertical_part_sse_4_f16::<F16C, FMA>( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, ); @@ -364,9 +334,9 @@ unsafe fn convolve_vertical_sse_row_f16_impl< convolve_vertical_part_sse_f16::<F16C, FMA>( bounds.start, cx, - unsafe_source_ptr_0, + src, src_stride, - unsafe_destination_ptr_0, + dst, weight_ptr, bounds, );
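
Note on the two patterns this patch applies throughout: the NEON macros stop splatting each filter tap into its own register (vdupq_laneq_f32 / vdupq_lane_f32 followed by prefer_vfmaq_f32) and instead keep all taps in one vector, selecting the tap at multiply time with the lane-indexed FMA intrinsic. The sketch below is a minimal, self-contained illustration of that pattern only; dot4_rgba is a hypothetical helper written for this note, not code from pic_scale.

#[cfg(target_arch = "aarch64")]
unsafe fn dot4_rgba(pixels: &[f32; 16], taps: &[f32; 4]) -> [f32; 4] {
    use std::arch::aarch64::*;
    // All four filter taps stay in a single register; no per-tap broadcast.
    let w = vld1q_f32(taps.as_ptr());
    let p0 = vld1q_f32(pixels.as_ptr()); // RGBA pixel 0
    let p1 = vld1q_f32(pixels.as_ptr().add(4)); // RGBA pixel 1
    let p2 = vld1q_f32(pixels.as_ptr().add(8)); // RGBA pixel 2
    let p3 = vld1q_f32(pixels.as_ptr().add(12)); // RGBA pixel 3
    let mut acc = vdupq_n_f32(0.0);
    // vfmaq_laneq_f32::<LANE> computes acc + p * w[LANE], folding the
    // broadcast into the fused multiply-add and saving one dup per tap.
    acc = vfmaq_laneq_f32::<0>(acc, p0, w);
    acc = vfmaq_laneq_f32::<1>(acc, p1, w);
    acc = vfmaq_laneq_f32::<2>(acc, p2, w);
    acc = vfmaq_laneq_f32::<3>(acc, p3, w);
    let mut out = [0.0f32; 4];
    vst1q_f32(out.as_mut_ptr(), acc);
    out
}

The second pattern, converting *const f16 / *mut f16 parameters to &[f16] / &mut [f16] and materializing pointers only at the last moment via get_unchecked(..).as_ptr(), keeps the hot-loop codegen unchanged while making the row dispatchers slice-based at the call boundary; the same shape was already applied to the u8 and u16 paths earlier in this patch.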