diff --git a/app/src/main.rs b/app/src/main.rs index f64de06..87e234f 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -8,26 +8,38 @@ use fast_image_resize::{ use image::io::Reader as ImageReader; use image::{EncodableLayout, GenericImageView}; -use pic_scale::{ImageSize, ImageStore, LabScaler, LChScaler, LinearApproxScaler, LinearScaler, LuvScaler, ResamplingFunction, Scaler, Scaling, SigmoidalScaler, ThreadingPolicy, TransferFunction, XYZScaler}; +use pic_scale::{ + ImageSize, ImageStore, LinearScaler, ResamplingFunction, Scaler, Scaling, ThreadingPolicy, +}; fn main() { // test_fast_image(); - let img = ImageReader::open("./assets/beach_horizon.jpg") + let img = ImageReader::open("./assets/asset.jpg") .unwrap() .decode() .unwrap(); let dimensions = img.dimensions(); let mut bytes = Vec::from(img.as_bytes()); + let mut scaler = LinearScaler::new(ResamplingFunction::Lagrange3); + scaler.set_threading_policy(ThreadingPolicy::Single); + // let store = + // ImageStore::::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize); + // let resized = scaler.resize_rgba( + // ImageSize::new(dimensions.0 as usize / 3, dimensions.1 as usize / 3), + // store, + // false, + // ); + + let mut f_store: Vec = bytes.iter().map(|&x| x as f32 * (1f32 / 255f32)).collect(); + let start_time = Instant::now(); - let mut scaler = LChScaler::new(ResamplingFunction::Lanczos3); - scaler.set_threading_policy(ThreadingPolicy::Single); let store = - ImageStore::::from_slice(&mut bytes, dimensions.0 as usize, dimensions.1 as usize); - let resized = scaler.resize_rgb( - ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), + ImageStore::::from_slice(&mut f_store, dimensions.0 as usize, dimensions.1 as usize); + let resized = scaler.resize_rgb_f32( + ImageSize::new(dimensions.0 as usize / 3, dimensions.1 as usize / 3), store, ); @@ -35,10 +47,13 @@ fn main() { // Print the elapsed time in milliseconds println!("Scaler: {:.2?}", elapsed_time); + let j_store: Vec = resized.as_bytes().iter().map(|&x| (x * 255f32) as u8).collect(); + let dst = j_store; + if resized.channels == 4 { image::save_buffer( "converted.png", - resized.as_bytes(), + &dst, resized.width as u32, resized.height as u32, image::ExtendedColorType::Rgba8, @@ -46,8 +61,8 @@ fn main() { .unwrap(); } else { image::save_buffer( - "converted_lch.jpg", - resized.as_bytes(), + "converted.jpg", + &dst, resized.width as u32, resized.height as u32, image::ExtendedColorType::Rgb8, diff --git a/src/lab_scaler.rs b/src/colors/lab_scaler.rs similarity index 100% rename from src/lab_scaler.rs rename to src/colors/lab_scaler.rs diff --git a/src/lch_scaler.rs b/src/colors/lch_scaler.rs similarity index 97% rename from src/lch_scaler.rs rename to src/colors/lch_scaler.rs index 73ef3b1..200c948 100644 --- a/src/lch_scaler.rs +++ b/src/colors/lch_scaler.rs @@ -5,10 +5,7 @@ * // license that can be found in the LICENSE file. 
*/ -use colorutils_rs::{ - lch_to_rgb, lch_with_alpha_to_rgba, - rgb_to_lch, rgba_to_lch_with_alpha, -}; +use colorutils_rs::{lch_to_rgb, lch_with_alpha_to_rgba, rgb_to_lch, rgba_to_lch_with_alpha}; use crate::{ImageSize, ImageStore, ResamplingFunction, Scaler, Scaling, ThreadingPolicy}; diff --git a/src/linear_precise_scaler.rs b/src/colors/linear_precise_scaler.rs similarity index 100% rename from src/linear_precise_scaler.rs rename to src/colors/linear_precise_scaler.rs diff --git a/src/linear_scaler.rs b/src/colors/linear_scaler.rs similarity index 100% rename from src/linear_scaler.rs rename to src/colors/linear_scaler.rs diff --git a/src/luv_scaler.rs b/src/colors/luv_scaler.rs similarity index 100% rename from src/luv_scaler.rs rename to src/colors/luv_scaler.rs diff --git a/src/colors/mod.rs b/src/colors/mod.rs new file mode 100644 index 0000000..01172d2 --- /dev/null +++ b/src/colors/mod.rs @@ -0,0 +1,15 @@ +mod lab_scaler; +mod lch_scaler; +mod linear_precise_scaler; +mod linear_scaler; +mod luv_scaler; +mod sigmoidal_scaler; +mod xyz_scaler; + +pub use lab_scaler::*; +pub use lch_scaler::*; +pub use linear_precise_scaler::*; +pub use linear_scaler::*; +pub use luv_scaler::*; +pub use sigmoidal_scaler::*; +pub use xyz_scaler::*; diff --git a/src/sigmoidal_scaler.rs b/src/colors/sigmoidal_scaler.rs similarity index 100% rename from src/sigmoidal_scaler.rs rename to src/colors/sigmoidal_scaler.rs diff --git a/src/xyz_scaler.rs b/src/colors/xyz_scaler.rs similarity index 100% rename from src/xyz_scaler.rs rename to src/colors/xyz_scaler.rs diff --git a/src/convolve_naive_f32.rs b/src/convolve_naive_f32.rs new file mode 100644 index 0000000..187a3f5 --- /dev/null +++ b/src/convolve_naive_f32.rs @@ -0,0 +1,253 @@ +/* + * // Copyright (c) the Radzivon Bartoshyk. All rights reserved. + * // + * // Use of this source code is governed by a BSD-style + * // license that can be found in the LICENSE file. 
+ */ + +use crate::filter_weights::{FilterBounds, FilterWeights}; + +#[inline(always)] +pub(crate) unsafe fn convolve_vertical_part_f32( + start_y: usize, + start_x: usize, + src: *const f32, + src_stride: usize, + dst: *mut f32, + filter: *const f32, + bounds: &FilterBounds, +) { + let mut store: [[f32; CHANNELS]; PART] = [[0f32; CHANNELS]; PART]; + + for j in 0..bounds.size { + let py = start_y + j; + let weight = unsafe { filter.add(j).read_unaligned() }; + let src_ptr = src.add(src_stride * py); + for x in 0..PART { + let px = (start_x + x) * CHANNELS; + let s_ptr = src_ptr.add(px); + for c in 0..CHANNELS { + let store_p = store.get_unchecked_mut(x); + let store_v = store_p.get_unchecked_mut(c); + *store_v += unsafe { s_ptr.add(c).read_unaligned() } * weight; + } + } + } + + for x in 0..PART { + let px = (start_x + x) * CHANNELS; + let dst_ptr = dst.add(px); + for c in 0..CHANNELS { + let vl = *(*store.get_unchecked_mut(x)).get_unchecked_mut(c); + dst_ptr.add(c).write_unaligned(vl); + } + } +} + +#[inline(always)] +pub(crate) fn convolve_horizontal_rgb_native_row( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const f32, + unsafe_destination_ptr_0: *mut f32, +) { + unsafe { + let weights_ptr = filter_weights.weights.as_ptr(); + let mut filter_offset = 0usize; + + for x in 0..dst_width { + let mut _sum_r = 0f32; + let mut _sum_g = 0f32; + let mut _sum_b = 0f32; + let mut _sum_a = 0f32; + + let bounds = filter_weights.bounds.get_unchecked(x); + let start_x = bounds.start; + for j in 0..bounds.size { + let px = (start_x + j) * CHANNELS; + let weight = weights_ptr.add(j + filter_offset).read_unaligned(); + let src = unsafe_source_ptr_0.add(px); + _sum_r += src.read_unaligned() * weight; + if CHANNELS > 1 { + _sum_g += src.add(1).read_unaligned() * weight; + } + if CHANNELS > 2 { + _sum_b += src.add(2).read_unaligned() * weight; + } + if CHANNELS == 4 { + _sum_a += src.add(3).read_unaligned() * weight; + } + } + + let px = x * CHANNELS; + + let dest_ptr = unsafe_destination_ptr_0.add(px); + dest_ptr.write_unaligned(_sum_r); + if CHANNELS > 1 { + dest_ptr.add(1).write_unaligned(_sum_g); + } + if CHANNELS > 2 { + dest_ptr.add(2).write_unaligned(_sum_b); + } + if CHANNELS == 4 { + dest_ptr.add(3).write_unaligned(_sum_a); + } + + filter_offset += filter_weights.aligned_size; + } + } +} + +#[allow(unused)] +pub(crate) fn convolve_horizontal_rgba_4_row_f32( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const f32, + src_stride: usize, + unsafe_destination_ptr_0: *mut f32, + dst_stride: usize, +) { + unsafe { + let mut filter_offset = 0usize; + let weights_ptr = filter_weights.weights.as_ptr(); + + let src_row0 = unsafe_source_ptr_0; + let src_row1 = unsafe_source_ptr_0.add(src_stride); + let src_row2 = unsafe_source_ptr_0.add(src_stride * 2); + let src_row3 = unsafe_source_ptr_0.add(src_stride * 3); + + let dst_row0 = unsafe_destination_ptr_0; + let dst_row1 = unsafe_destination_ptr_0.add(dst_stride); + let dst_row2 = unsafe_destination_ptr_0.add(dst_stride * 2); + let dst_row3 = unsafe_destination_ptr_0.add(dst_stride * 3); + + for x in 0..dst_width { + let mut sum_r_0 = 0f32; + let mut sum_g_0 = 0f32; + let mut sum_b_0 = 0f32; + let mut sum_a_0 = 0f32; + let mut sum_r_1 = 0f32; + let mut sum_g_1 = 0f32; + let mut sum_b_1 = 0f32; + let mut sum_a_1 = 0f32; + let mut sum_r_2 = 0f32; + let mut sum_g_2 = 0f32; + let mut sum_b_2 = 0f32; + let mut sum_a_2 = 0f32; + let mut sum_r_3 = 0f32; + let mut sum_g_3 = 0f32; + 
let mut sum_b_3 = 0f32; + let mut sum_a_3 = 0f32; + + let bounds = filter_weights.bounds.get_unchecked(x); + let start_x = bounds.start; + for j in 0..bounds.size { + let px = (start_x + j) * CHANNELS; + let weight = weights_ptr.add(j + filter_offset).read_unaligned(); + + let src0 = src_row0.add(px); + sum_r_0 += src0.read_unaligned() * weight; + if CHANNELS > 1 { + sum_g_0 += src0.add(1).read_unaligned() * weight; + } + if CHANNELS > 2 { + sum_b_0 += src0.add(2).read_unaligned() * weight; + } + if CHANNELS == 4 { + sum_a_0 += src0.add(3).read_unaligned() * weight; + } + + let src1 = src_row1.add(px); + sum_r_1 += src1.read_unaligned() * weight; + if CHANNELS > 1 { + sum_g_1 += src1.add(1).read_unaligned() * weight; + } + if CHANNELS > 2 { + sum_b_1 += src1.add(2).read_unaligned() * weight; + } + if CHANNELS == 4 { + sum_a_1 += src1.add(3).read_unaligned() * weight; + } + + let src2 = src_row2.add(px); + sum_r_2 += src2.read_unaligned() * weight; + if CHANNELS > 1 { + sum_g_2 += src2.add(1).read_unaligned() * weight; + } + if CHANNELS > 2 { + sum_b_2 += src2.add(2).read_unaligned() * weight; + } + if CHANNELS == 4 { + sum_a_2 += src2.add(3).read_unaligned() * weight; + } + + let src3 = src_row3.add(px); + sum_r_3 += src3.read_unaligned() * weight; + if CHANNELS > 1 { + sum_g_3 += src3.add(1).read_unaligned() * weight; + } + if CHANNELS > 2 { + sum_b_3 += src3.add(2).read_unaligned() * weight; + } + if CHANNELS == 4 { + sum_a_3 += src3.add(3).read_unaligned() * weight; + } + } + + let px = x * CHANNELS; + + let dest_ptr_0 = dst_row0.add(px); + let dest_ptr_1 = dst_row1.add(px); + let dest_ptr_2 = dst_row2.add(px); + let dest_ptr_3 = dst_row3.add(px); + + dest_ptr_0.write_unaligned(sum_r_0); + if CHANNELS > 1 { + dest_ptr_0.add(1).write_unaligned(sum_g_0); + } + if CHANNELS > 2 { + dest_ptr_0.add(2).write_unaligned(sum_b_0); + } + if CHANNELS == 4 { + dest_ptr_0.add(3).write_unaligned(sum_a_0); + } + + dest_ptr_1.write_unaligned(sum_r_1); + if CHANNELS > 1 { + dest_ptr_1.add(1).write_unaligned(sum_g_1); + } + if CHANNELS > 2 { + dest_ptr_1.add(2).write_unaligned(sum_b_1); + } + if CHANNELS == 4 { + dest_ptr_1.add(3).write_unaligned(sum_a_1); + } + + dest_ptr_2.write_unaligned(sum_r_2); + if CHANNELS > 1 { + dest_ptr_2.add(1).write_unaligned(sum_g_2); + } + if CHANNELS > 2 { + dest_ptr_2.add(2).write_unaligned(sum_b_2); + } + if CHANNELS == 4 { + dest_ptr_2.add(3).write_unaligned(sum_a_2); + } + + dest_ptr_3.write_unaligned(sum_r_3); + if CHANNELS > 1 { + dest_ptr_3.add(1).write_unaligned(sum_g_3); + } + if CHANNELS > 2 { + dest_ptr_3.add(2).write_unaligned(sum_b_3); + } + if CHANNELS == 4 { + dest_ptr_3.add(3).write_unaligned(sum_a_3); + } + + filter_offset += filter_weights.aligned_size; + } + } +} diff --git a/src/convolve_naive_u8.rs b/src/convolve_naive_u8.rs new file mode 100644 index 0000000..d0144aa --- /dev/null +++ b/src/convolve_naive_u8.rs @@ -0,0 +1,285 @@ +/* + * // Copyright (c) the Radzivon Bartoshyk. All rights reserved. + * // + * // Use of this source code is governed by a BSD-style + * // license that can be found in the LICENSE file. 
+ */ + +use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::support::{PRECISION, ROUNDING_APPROX}; + +#[inline(always)] +pub(crate) unsafe fn convolve_vertical_part( + start_y: usize, + start_x: usize, + src: *const u8, + src_stride: usize, + dst: *mut u8, + filter: *const i16, + bounds: &FilterBounds, +) { + let mut store: [[i32; CHANNELS]; PART] = [[ROUNDING_APPROX; CHANNELS]; PART]; + + for j in 0..bounds.size { + let py = start_y + j; + let weight = unsafe { filter.add(j).read_unaligned() } as i32; + let src_ptr = src.add(src_stride * py); + for x in 0..PART { + let px = (start_x + x) * CHANNELS; + let s_ptr = src_ptr.add(px); + for c in 0..CHANNELS { + let store_p = store.get_unchecked_mut(x); + let store_v = store_p.get_unchecked_mut(c); + *store_v += unsafe { s_ptr.add(c).read_unaligned() } as i32 * weight; + } + } + } + + for x in 0..PART { + let px = (start_x + x) * CHANNELS; + let dst_ptr = dst.add(px); + for c in 0..CHANNELS { + let vl = *(*store.get_unchecked_mut(x)).get_unchecked_mut(c); + let ck = vl >> PRECISION; + dst_ptr.add(c).write_unaligned(ck.max(0).min(255) as u8); + } + } +} + +pub(crate) fn convolve_horizontal_rgba_native_row( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const u8, + unsafe_destination_ptr_0: *mut u8, +) { + let mut filter_offset = 0usize; + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let mut sum_r = ROUNDING_APPROX; + let mut sum_g = ROUNDING_APPROX; + let mut sum_b = ROUNDING_APPROX; + let mut sum_a = ROUNDING_APPROX; + + let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; + let start_x = bounds.start; + for j in 0..bounds.size { + let px = (start_x + j) * CHANNELS; + let weight = unsafe { weights_ptr.add(j + filter_offset).read_unaligned() } as i32; + let src = unsafe { unsafe_source_ptr_0.add(px) }; + sum_r += unsafe { src.read_unaligned() } as i32 * weight; + if CHANNELS > 1 { + sum_g += unsafe { src.add(1).read_unaligned() } as i32 * weight; + } + if CHANNELS > 2 { + sum_b += unsafe { src.add(2).read_unaligned() } as i32 * weight; + } + if CHANNELS == 4 { + sum_a += unsafe { src.add(3).read_unaligned() } as i32 * weight; + } + } + + let px = x * CHANNELS; + + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; + + unsafe { + dest_ptr.write_unaligned((sum_r >> PRECISION).min(255).max(0) as u8); + if CHANNELS > 1 { + dest_ptr + .add(1) + .write_unaligned((sum_g >> PRECISION).min(255).max(0) as u8); + } + if CHANNELS > 2 { + dest_ptr + .add(2) + .write_unaligned((sum_b >> PRECISION).min(255).max(0) as u8); + } + if CHANNELS == 4 { + dest_ptr + .add(3) + .write_unaligned((sum_a >> PRECISION).min(255).max(0) as u8); + } + } + + filter_offset += filter_weights.aligned_size; + } +} + +#[allow(unused)] +pub(crate) fn convolve_horizontal_rgba_native_4_row( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const u8, + src_stride: usize, + unsafe_destination_ptr_0: *mut u8, + dst_stride: usize, +) { + unsafe { + let mut filter_offset = 0usize; + let weights_ptr = filter_weights.weights.as_ptr(); + + let src_row0 = unsafe_source_ptr_0; + let src_row1 = unsafe_source_ptr_0.add(src_stride); + let src_row2 = unsafe_source_ptr_0.add(src_stride * 2); + let src_row3 = unsafe_source_ptr_0.add(src_stride * 3); + + let dst_row0 = unsafe_destination_ptr_0; + let dst_row1 = unsafe_destination_ptr_0.add(dst_stride); + let dst_row2 = unsafe_destination_ptr_0.add(dst_stride * 2); + let dst_row3 = 
unsafe_destination_ptr_0.add(dst_stride * 3); + + for x in 0..dst_width { + let mut sum_r_0 = ROUNDING_APPROX; + let mut sum_g_0 = ROUNDING_APPROX; + let mut sum_b_0 = ROUNDING_APPROX; + let mut sum_a_0 = ROUNDING_APPROX; + let mut sum_r_1 = ROUNDING_APPROX; + let mut sum_g_1 = ROUNDING_APPROX; + let mut sum_b_1 = ROUNDING_APPROX; + let mut sum_a_1 = ROUNDING_APPROX; + let mut sum_r_2 = ROUNDING_APPROX; + let mut sum_g_2 = ROUNDING_APPROX; + let mut sum_b_2 = ROUNDING_APPROX; + let mut sum_a_2 = ROUNDING_APPROX; + let mut sum_r_3 = ROUNDING_APPROX; + let mut sum_g_3 = ROUNDING_APPROX; + let mut sum_b_3 = ROUNDING_APPROX; + let mut sum_a_3 = ROUNDING_APPROX; + + let bounds = filter_weights.bounds.get_unchecked(x); + let start_x = bounds.start; + for j in 0..bounds.size { + let px = (start_x + j) * CHANNELS; + let weight = weights_ptr.add(j + filter_offset).read_unaligned() as i32; + + let src0 = src_row0.add(px); + sum_r_0 += src0.read_unaligned() as i32 * weight; + if CHANNELS > 1 { + sum_g_0 += src0.add(1).read_unaligned() as i32 * weight; + } + if CHANNELS > 2 { + sum_b_0 += src0.add(2).read_unaligned() as i32 * weight; + } + if CHANNELS == 4 { + sum_a_0 += src0.add(3).read_unaligned() as i32 * weight; + } + + let src1 = src_row1.add(px); + sum_r_1 += src1.read_unaligned() as i32 * weight; + if CHANNELS > 1 { + sum_g_1 += src1.add(1).read_unaligned() as i32 * weight; + } + if CHANNELS > 2 { + sum_b_1 += src1.add(2).read_unaligned() as i32 * weight; + } + if CHANNELS == 4 { + sum_a_1 += src1.add(3).read_unaligned() as i32 * weight; + } + + let src2 = src_row2.add(px); + sum_r_2 += src2.read_unaligned() as i32 * weight; + if CHANNELS > 1 { + sum_g_2 += src2.add(1).read_unaligned() as i32 * weight; + } + if CHANNELS > 2 { + sum_b_2 += src2.add(2).read_unaligned() as i32 * weight; + } + if CHANNELS == 4 { + sum_a_2 += src2.add(3).read_unaligned() as i32 * weight; + } + + let src3 = src_row3.add(px); + sum_r_3 += src3.read_unaligned() as i32 * weight; + if CHANNELS > 1 { + sum_g_3 += src3.add(1).read_unaligned() as i32 * weight; + } + if CHANNELS > 2 { + sum_b_3 += src3.add(2).read_unaligned() as i32 * weight; + } + if CHANNELS == 4 { + sum_a_3 += src3.add(3).read_unaligned() as i32 * weight; + } + } + + let px = x * CHANNELS; + + let dest_ptr_0 = dst_row0.add(px); + let dest_ptr_1 = dst_row1.add(px); + let dest_ptr_2 = dst_row2.add(px); + let dest_ptr_3 = dst_row3.add(px); + + dest_ptr_0.write_unaligned((sum_r_0 >> PRECISION).min(255).max(0) as u8); + if CHANNELS > 1 { + dest_ptr_0 + .add(1) + .write_unaligned((sum_g_0 >> PRECISION).min(255).max(0) as u8); + } + if CHANNELS > 2 { + dest_ptr_0 + .add(2) + .write_unaligned((sum_b_0 >> PRECISION).min(255).max(0) as u8); + } + if CHANNELS == 4 { + dest_ptr_0 + .add(3) + .write_unaligned((sum_a_0 >> PRECISION).min(255).max(0) as u8); + } + + dest_ptr_1.write_unaligned((sum_r_1 >> PRECISION).min(255).max(0) as u8); + if CHANNELS > 1 { + dest_ptr_1 + .add(1) + .write_unaligned((sum_g_1 >> PRECISION).min(255).max(0) as u8); + } + if CHANNELS > 2 { + dest_ptr_1 + .add(2) + .write_unaligned((sum_b_1 >> PRECISION).min(255).max(0) as u8); + } + if CHANNELS == 4 { + dest_ptr_1 + .add(3) + .write_unaligned((sum_a_1 >> PRECISION).min(255).max(0) as u8); + } + + dest_ptr_2.write_unaligned((sum_r_2 >> PRECISION).min(255).max(0) as u8); + if CHANNELS > 1 { + dest_ptr_2 + .add(1) + .write_unaligned((sum_g_2 >> PRECISION).min(255).max(0) as u8); + } + if CHANNELS > 2 { + dest_ptr_2 + .add(2) + .write_unaligned((sum_b_2 >> PRECISION).min(255).max(0) as u8); + } 
+ if CHANNELS == 4 { + dest_ptr_2 + .add(3) + .write_unaligned((sum_a_2 >> PRECISION).min(255).max(0) as u8); + } + + dest_ptr_3.write_unaligned((sum_r_3 >> PRECISION).min(255).max(0) as u8); + if CHANNELS > 1 { + dest_ptr_3 + .add(1) + .write_unaligned((sum_g_3 >> PRECISION).min(255).max(0) as u8); + } + if CHANNELS > 2 { + dest_ptr_3 + .add(2) + .write_unaligned((sum_b_3 >> PRECISION).min(255).max(0) as u8); + } + if CHANNELS == 4 { + dest_ptr_3 + .add(3) + .write_unaligned((sum_a_3 >> PRECISION).min(255).max(0) as u8); + } + + filter_offset += filter_weights.aligned_size; + } + } +} diff --git a/src/convolve_u8.rs b/src/convolve_u8.rs deleted file mode 100644 index 32533d7..0000000 --- a/src/convolve_u8.rs +++ /dev/null @@ -1,48 +0,0 @@ -/* - * // Copyright (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. - */ - -use crate::filter_weights::FilterBounds; -use crate::support::{PRECISION, ROUNDING_APPROX}; - -#[inline(always)] -#[allow(unused)] -pub(crate) unsafe fn convolve_vertical_part( - start_y: usize, - start_x: usize, - src: *const u8, - src_stride: usize, - dst: *mut u8, - filter: *const i16, - bounds: &FilterBounds, -) { - let mut store: [[i32; CHANNELS]; PART] = [[ROUNDING_APPROX; CHANNELS]; PART]; - - for j in 0..bounds.size { - let py = start_y + j; - let weight = unsafe { filter.add(j).read_unaligned() } as i32; - let src_ptr = src.add(src_stride * py); - for x in 0..PART { - let px = (start_x + x) * CHANNELS; - let s_ptr = src_ptr.add(px); - for c in 0..CHANNELS { - let store_p = store.get_unchecked_mut(x); - let store_v = store_p.get_unchecked_mut(c); - *store_v += unsafe { s_ptr.add(c).read_unaligned() } as i32 * weight; - } - } - } - - for x in 0..PART { - let px = (start_x + x) * CHANNELS; - let dst_ptr = dst.add(px); - for c in 0..CHANNELS { - let vl = *(*store.get_unchecked_mut(x)).get_unchecked_mut(c); - let ck = vl >> PRECISION; - dst_ptr.add(c).write_unaligned(ck.max(0).min(255) as u8); - } - } -} diff --git a/src/dispatch_group_f32.rs b/src/dispatch_group_f32.rs new file mode 100644 index 0000000..f4b7ee4 --- /dev/null +++ b/src/dispatch_group_f32.rs @@ -0,0 +1,165 @@ +use crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::unsafe_slice::UnsafeSlice; +use crate::ImageStore; +use rayon::ThreadPool; +use std::sync::Arc; + +pub(crate) fn convolve_vertical_dispatch_f32( + image_store: &ImageStore, + filter_weights: FilterWeights, + destination: &mut ImageStore, + pool: &Option, + dispatcher: fn(usize, &FilterBounds, *const f32, *mut f32, usize, *const f32), +) { + let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); + let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); + + let src_stride = image_store.width * image_store.channels; + + let mut filter_offset = 0usize; + + let dst_stride = destination.width * image_store.channels; + let dst_width = destination.width; + + if let Some(pool) = pool { + let arc_weights = Arc::new(filter_weights); + let borrowed = destination.buffer.borrow_mut(); + let unsafe_slice = UnsafeSlice::new(borrowed); + pool.scope(|scope| { + for y in 0..destination.height { + let weights = arc_weights.clone(); + scope.spawn(move |_| { + let bounds = unsafe { weights.bounds.get_unchecked(y) }; + let weight_ptr = + unsafe { weights.weights.as_ptr().add(weights.aligned_size * y) }; + let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); + let dst_ptr = 
unsafe_slice.mut_ptr(); + let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; + dispatcher( + dst_width, + bounds, + unsafe_source_ptr_0, + unsafe_destination_ptr_0, + src_stride, + weight_ptr, + ); + }); + } + }); + } else { + for y in 0..destination.height { + let bounds = unsafe { filter_weights.bounds.get_unchecked(y) }; + let weight_ptr = unsafe { filter_weights.weights.as_ptr().add(filter_offset) }; + + dispatcher( + dst_width, + bounds, + unsafe_source_ptr_0, + unsafe_destination_ptr_0, + src_stride, + weight_ptr, + ); + + filter_offset += filter_weights.aligned_size; + unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; + } + } +} + +#[inline(always)] +pub(crate) fn convolve_horizontal_dispatch_f32( + image_store: &ImageStore, + filter_weights: FilterWeights, + destination: &mut ImageStore, + pool: &Option, + dispatcher_4_rows: Option, *const f32, usize, *mut f32, usize)>, + dispatcher_row: fn(usize, usize, &FilterWeights, *const f32, *mut f32), +) { + let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); + let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); + + let src_stride = image_store.width * image_store.channels; + let dst_stride = destination.width * image_store.channels; + let dst_width = destination.width; + let src_width = image_store.width; + + if let Some(pool) = pool { + let arc_weights = Arc::new(filter_weights); + let borrowed = destination.buffer.borrow_mut(); + let unsafe_slice = UnsafeSlice::new(borrowed); + pool.scope(|scope| { + let mut yy = 0usize; + if let Some(dispatcher) = dispatcher_4_rows { + for y in (0..destination.height.saturating_sub(4)).step_by(4) { + let weights = arc_weights.clone(); + scope.spawn(move |_| { + let unsafe_source_ptr_0 = + unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; + let dst_ptr = unsafe_slice.mut_ptr(); + let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; + dispatcher( + dst_width, + src_width, + &weights, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + dst_stride, + ); + }); + yy = y; + } + } + for y in (yy..destination.height).step_by(4) { + let weights = arc_weights.clone(); + scope.spawn(move |_| { + let unsafe_source_ptr_0 = + unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; + let dst_ptr = unsafe_slice.mut_ptr(); + let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; + dispatcher_row( + dst_width, + src_width, + &weights, + unsafe_source_ptr_0, + unsafe_destination_ptr_0, + ); + }); + } + }); + } else { + let mut yy = 0usize; + + if let Some(dispatcher) = dispatcher_4_rows { + while yy + 4 < destination.height { + dispatcher( + dst_width, + src_width, + &filter_weights, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + dst_stride, + ); + + unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; + unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; + + yy += 4; + } + } + + for _ in yy..destination.height { + dispatcher_row( + dst_width, + src_width, + &filter_weights, + unsafe_source_ptr_0, + unsafe_destination_ptr_0, + ); + + unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; + unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; + } + } +} diff --git a/src/dispatch_group_u8.rs b/src/dispatch_group_u8.rs new file mode 100644 index 0000000..1204dd7 --- /dev/null +++ b/src/dispatch_group_u8.rs @@ -0,0 +1,164 @@ +use 
crate::filter_weights::{FilterBounds, FilterWeights}; +use crate::support::PRECISION; +use crate::unsafe_slice::UnsafeSlice; +use crate::ImageStore; +use rayon::ThreadPool; +use std::sync::Arc; + +pub(crate) fn convolve_horizontal_dispatch_u8( + image_store: &ImageStore, + filter_weights: FilterWeights, + destination: &mut ImageStore, + pool: &Option, + dispatcher_4_rows: Option< + fn(usize, usize, &FilterWeights, *const u8, usize, *mut u8, usize), + >, + dispatcher_1_row: fn(usize, usize, &FilterWeights, *const u8, *mut u8), +) { + let approx_weights = filter_weights.numerical_approximation_i16::(0); + + let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); + let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); + + let src_stride = image_store.width * image_store.channels; + let dst_stride = destination.width * image_store.channels; + let dst_width = destination.width; + let src_width = image_store.width; + + if let Some(pool) = pool { + let arc_weights = Arc::new(approx_weights); + let borrowed = destination.buffer.borrow_mut(); + let unsafe_slice = UnsafeSlice::new(borrowed); + pool.scope(|scope| { + let mut yy = 0usize; + if let Some(dispatcher) = dispatcher_4_rows { + for y in (0..destination.height.saturating_sub(4)).step_by(4) { + let weights = arc_weights.clone(); + scope.spawn(move |_| { + let unsafe_source_ptr_0 = + unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; + let dst_ptr = unsafe_slice.mut_ptr(); + let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; + dispatcher( + dst_width, + src_width, + &weights, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + dst_stride, + ); + }); + yy = y; + } + } + for y in (yy..destination.height).step_by(4) { + let weights = arc_weights.clone(); + scope.spawn(move |_| { + let unsafe_source_ptr_0 = + unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; + let dst_ptr = unsafe_slice.mut_ptr(); + let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; + dispatcher_1_row( + dst_width, + src_width, + &weights, + unsafe_source_ptr_0, + unsafe_destination_ptr_0, + ); + }); + } + }); + } else { + let mut yy = 0usize; + if let Some(dispatcher) = dispatcher_4_rows { + while yy + 4 < destination.height { + dispatcher( + dst_width, + src_width, + &approx_weights, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + dst_stride, + ); + unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; + unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; + yy += 4; + } + } + + for _ in yy..destination.height { + dispatcher_1_row( + dst_width, + src_width, + &approx_weights, + unsafe_source_ptr_0, + unsafe_destination_ptr_0, + ); + unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; + unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; + } + } +} + +pub(crate) fn convolve_vertical_dispatch_u8<'a, const COMPONENTS: usize>( + image_store: &ImageStore, + filter_weights: FilterWeights, + destination: &mut ImageStore<'a, u8, COMPONENTS>, + pool: &Option, + dispatcher: fn(usize, &FilterBounds, *const u8, *mut u8, usize, *const i16), +) { + let approx_weights = filter_weights.numerical_approximation_i16::(0); + + let src_stride = image_store.width * image_store.channels; + let dst_stride = destination.width * image_store.channels; + + let dst_width = destination.width; + + if let Some(pool) = pool { + let arc_weights = 
Arc::new(approx_weights); + let borrowed = destination.buffer.borrow_mut(); + let unsafe_slice = UnsafeSlice::new(borrowed); + pool.scope(|scope| { + for y in 0..destination.height { + let weights = arc_weights.clone(); + scope.spawn(move |_| { + let bounds = unsafe { weights.bounds.get_unchecked(y) }; + let weight_ptr = + unsafe { weights.weights.as_ptr().add(weights.aligned_size * y) }; + let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); + let dst_ptr = unsafe_slice.mut_ptr(); + let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; + dispatcher( + dst_width, + bounds, + unsafe_source_ptr_0, + unsafe_destination_ptr_0, + src_stride, + weight_ptr, + ); + }); + } + }); + } else { + let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); + let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); + let mut filter_offset = 0usize; + for y in 0..destination.height { + let bounds = unsafe { approx_weights.bounds.get_unchecked(y) }; + let weight_ptr = unsafe { approx_weights.weights.as_ptr().add(filter_offset) }; + dispatcher( + dst_width, + bounds, + unsafe_source_ptr_0, + unsafe_destination_ptr_0, + src_stride, + weight_ptr, + ); + + filter_offset += approx_weights.aligned_size; + unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; + } + } +} diff --git a/src/filter_weights.rs b/src/filter_weights.rs index 0c1c5ab..2839e3d 100644 --- a/src/filter_weights.rs +++ b/src/filter_weights.rs @@ -7,6 +7,7 @@ use crate::chunking::chunked; +#[derive(Debug, Clone)] pub struct FilterWeights { pub weights: Vec, pub bounds: Vec, @@ -16,7 +17,7 @@ pub struct FilterWeights { pub coeffs_size: i32, } -#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] pub struct FilterBounds { pub start: usize, pub size: usize, diff --git a/src/image_size.rs b/src/image_size.rs index 1ada831..f2e3ec5 100644 --- a/src/image_size.rs +++ b/src/image_size.rs @@ -5,7 +5,7 @@ * // license that can be found in the LICENSE file. 
*/ -#[derive(Copy, Clone)] +#[derive(Debug, Copy, Clone, Ord, PartialOrd, Eq, PartialEq)] pub struct ImageSize { pub width: usize, pub height: usize, diff --git a/src/lib.rs b/src/lib.rs index 050d4fe..2bcb529 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -9,50 +9,43 @@ mod acceleration_feature; mod alpha_handle; mod avx2_utils; mod chunking; +mod colors; mod convolution; -mod convolve_f32; -mod convolve_u8; +mod convolve_naive_f32; +mod convolve_naive_u8; +mod dispatch_group_f32; +mod dispatch_group_u8; mod filter_weights; mod image_size; mod image_store; -mod lab_scaler; -mod linear_precise_scaler; -mod linear_scaler; -mod luv_scaler; mod math; mod nearest_sampler; -mod neon_rgb_f32; -mod neon_rgb_u8; -mod neon_simd_u8; +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +mod neon; mod rgb_f32; mod rgb_u8; mod rgba_f32; mod rgba_u8; mod sampler; mod scaler; -mod sigmoidal_scaler; -mod sse_rgb_f32; -mod sse_rgb_u8; -mod sse_simd_u8; -mod sse_utils; +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +mod sse; mod support; mod threading_policy; mod unsafe_slice; -mod xyz_scaler; -mod lch_scaler; +pub use colors::LChScaler; +pub use colors::LabScaler; +pub use colors::LinearApproxScaler; +pub use colors::LinearScaler; +pub use colors::SigmoidalScaler; +pub use colors::XYZScaler; +pub use colors::*; pub use colorutils_rs::TransferFunction; pub use image_size::ImageSize; pub use image_store::ImageStore; -pub use lab_scaler::LabScaler; -pub use linear_precise_scaler::LinearScaler; -pub use linear_scaler::LinearApproxScaler; -pub use luv_scaler::*; pub use math::*; pub use sampler::*; pub use scaler::Scaler; pub use scaler::Scaling; -pub use sigmoidal_scaler::SigmoidalScaler; pub use threading_policy::*; -pub use xyz_scaler::XYZScaler; -pub use lch_scaler::LChScaler; diff --git a/src/convolve_f32.rs b/src/neon/convolve_f32.rs similarity index 70% rename from src/convolve_f32.rs rename to src/neon/convolve_f32.rs index 42d423b..0f92778 100644 --- a/src/convolve_f32.rs +++ b/src/neon/convolve_f32.rs @@ -1,73 +1,8 @@ -/* - * // Copyright (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. 
- */ - use crate::filter_weights::FilterBounds; -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +use crate::neon::utils::prefer_vfmaq_f32; use std::arch::aarch64::*; -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -#[inline(always)] -#[allow(dead_code)] -pub(crate) unsafe fn prefer_vfmaq_f32( - a: float32x4_t, - b: float32x4_t, - c: float32x4_t, -) -> float32x4_t { - #[cfg(target_arch = "aarch64")] - { - return vfmaq_f32(a, b, c); - } - #[cfg(target_arch = "arm")] - { - return vmlaq_f32(a, b, c); - } -} - -#[inline(always)] -#[allow(unused)] -pub(crate) unsafe fn convolve_vertical_part_f32( - start_y: usize, - start_x: usize, - src: *const f32, - src_stride: usize, - dst: *mut f32, - filter: *const f32, - bounds: &FilterBounds, -) { - let mut store: [[f32; CHANNELS]; PART] = [[0f32; CHANNELS]; PART]; - - for j in 0..bounds.size { - let py = start_y + j; - let weight = *unsafe { filter.add(j) }; - let src_ptr = src.add(src_stride * py); - for x in 0..PART { - let px = (start_x + x) * CHANNELS; - let s_ptr = src_ptr.add(px); - for c in 0..CHANNELS { - let store_p = store.get_unchecked_mut(x); - let store_v = store_p.get_unchecked_mut(c); - *store_v += unsafe { s_ptr.add(c).read_unaligned() } * weight; - } - } - } - - for x in 0..PART { - let px = (start_x + x) * CHANNELS; - let dst_ptr = dst.add(px); - for c in 0..CHANNELS { - let vl = *(*store.get_unchecked_mut(x)).get_unchecked_mut(c); - dst_ptr.add(c).write_unaligned(vl); - } - } -} - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] #[allow(unused)] -#[inline(always)] pub(crate) unsafe fn convolve_vertical_part_neon_16_f32( start_y: usize, start_x: usize, @@ -86,7 +21,7 @@ pub(crate) unsafe fn convolve_vertical_part_neon_16_f32( for j in 0..bounds.size { let py = start_y + j; - let weight = *unsafe { filter.add(j) }; + let weight = unsafe { filter.add(j).read_unaligned() }; let v_weight = vdupq_n_f32(weight); let src_ptr = src.add(src_stride * py); @@ -104,7 +39,6 @@ pub(crate) unsafe fn convolve_vertical_part_neon_16_f32( vst1q_f32_x4(dst_ptr, f_set); } -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] #[inline(always)] pub(crate) unsafe fn convolve_vertical_part_neon_8_f32( start_y: usize, @@ -152,7 +86,6 @@ pub(crate) unsafe fn convolve_vertical_part_neon_8_f32 } } -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] #[inline(always)] pub unsafe fn vtransposeq_f32(matrix: float32x4x4_t) -> float32x4x4_t { let row0 = matrix.0; @@ -172,7 +105,6 @@ pub unsafe fn vtransposeq_f32(matrix: float32x4x4_t) -> float32x4x4_t { return r; } -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] #[inline(always)] pub(crate) unsafe fn convolve_horizontal_parts_4_rgb_f32( start_x: usize, @@ -203,7 +135,6 @@ pub(crate) unsafe fn convolve_horizontal_parts_4_rgb_f32( acc } -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] #[inline(always)] pub(crate) unsafe fn convolve_horizontal_parts_one_rgb_f32( start_x: usize, @@ -226,7 +157,6 @@ pub(crate) unsafe fn convolve_horizontal_parts_one_rgb_f32( acc } -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] #[inline(always)] pub(crate) unsafe fn convolve_horizontal_parts_4_rgba_f32( start_x: usize, @@ -249,7 +179,6 @@ pub(crate) unsafe fn convolve_horizontal_parts_4_rgba_f32( acc } -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] #[inline(always)] pub(crate) unsafe fn convolve_horizontal_parts_one_rgba_f32( start_x: usize, diff --git a/src/neon/mod.rs b/src/neon/mod.rs new file mode 100644 index 
0000000..450bf00 --- /dev/null +++ b/src/neon/mod.rs @@ -0,0 +1,10 @@ +mod convolve_f32; +mod rgb_f32; +mod rgb_u8; +mod rgba_u8; +mod utils; + +pub use convolve_f32::*; +pub use rgb_f32::neon_convolve_floats::*; +pub use rgb_u8::neon_rgb::*; +pub use rgba_u8::*; diff --git a/src/neon/rgb_f32.rs b/src/neon/rgb_f32.rs new file mode 100644 index 0000000..d878c8b --- /dev/null +++ b/src/neon/rgb_f32.rs @@ -0,0 +1,448 @@ +/* + * // Copyright (c) the Radzivon Bartoshyk. All rights reserved. + * // + * // Use of this source code is governed by a BSD-style + * // license that can be found in the LICENSE file. + */ + +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +pub mod neon_convolve_floats { + use crate::filter_weights::{FilterBounds, FilterWeights}; + use crate::neon::*; + use std::arch::aarch64::*; + + pub fn convolve_horizontal_rgba_neon_row_one( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const f32, + unsafe_destination_ptr_0: *mut f32, + ) { + unsafe { + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store = vdupq_n_f32(0f32); + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = ptr.read_unaligned(); + let weight1 = ptr.add(1).read_unaligned(); + let weight2 = ptr.add(2).read_unaligned(); + let weight3 = ptr.add(3).read_unaligned(); + store = convolve_horizontal_parts_4_rgba_f32( + bounds.start, + unsafe_source_ptr_0, + weight0, + weight1, + weight2, + weight3, + store, + ); + jx += 4; + } + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = ptr.read_unaligned(); + store = convolve_horizontal_parts_one_rgba_f32( + bounds.start, + unsafe_source_ptr_0, + weight0, + store, + ); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); + vst1q_f32(dest_ptr, store); + + filter_offset += filter_weights.aligned_size; + } + } + } + + pub fn convolve_horizontal_rgba_neon_rows_4( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const f32, + src_stride: usize, + unsafe_destination_ptr_0: *mut f32, + dst_stride: usize, + ) { + unsafe { + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + let zeros = vdupq_n_f32(0f32); + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = zeros; + let mut store_1 = zeros; + let mut store_2 = zeros; + let mut store_3 = zeros; + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = ptr.read_unaligned(); + let weight1 = ptr.add(1).read_unaligned(); + let weight2 = ptr.add(2).read_unaligned(); + let weight3 = ptr.add(3).read_unaligned(); + store_0 = convolve_horizontal_parts_4_rgba_f32( + bounds.start, + unsafe_source_ptr_0, + weight0, + weight1, + weight2, + weight3, + store_0, + ); + store_1 = convolve_horizontal_parts_4_rgba_f32( + bounds.start, + unsafe_source_ptr_0.add(src_stride), + weight0, + weight1, + weight2, + weight3, + store_1, + ); + store_2 = convolve_horizontal_parts_4_rgba_f32( + bounds.start, + unsafe_source_ptr_0.add(src_stride * 2), + weight0, + weight1, + weight2, + weight3, + store_2, + ); + store_3 = convolve_horizontal_parts_4_rgba_f32( + bounds.start, + 
unsafe_source_ptr_0.add(src_stride * 3), + weight0, + weight1, + weight2, + weight3, + store_3, + ); + jx += 4; + } + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = ptr.read_unaligned(); + store_0 = convolve_horizontal_parts_one_rgba_f32( + bounds.start, + unsafe_source_ptr_0, + weight0, + store_0, + ); + store_1 = convolve_horizontal_parts_one_rgba_f32( + bounds.start, + unsafe_source_ptr_0.add(src_stride), + weight0, + store_1, + ); + store_2 = convolve_horizontal_parts_one_rgba_f32( + bounds.start, + unsafe_source_ptr_0.add(src_stride * 2), + weight0, + store_2, + ); + store_3 = convolve_horizontal_parts_one_rgba_f32( + bounds.start, + unsafe_source_ptr_0.add(src_stride * 3), + weight0, + store_3, + ); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); + vst1q_f32(dest_ptr, store_0); + + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + vst1q_f32(dest_ptr, store_1); + + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + vst1q_f32(dest_ptr, store_2); + + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + vst1q_f32(dest_ptr, store_3); + + filter_offset += filter_weights.aligned_size; + } + } + } + + pub fn convolve_horizontal_rgb_neon_rows_4_f32( + dst_width: usize, + src_width: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const f32, + src_stride: usize, + unsafe_destination_ptr_0: *mut f32, + dst_stride: usize, + ) { + unsafe { + const CHANNELS: usize = 3; + let mut filter_offset = 0usize; + + let zeros = vdupq_n_f32(0f32); + + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = zeros; + let mut store_1 = zeros; + let mut store_2 = zeros; + let mut store_3 = zeros; + + while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = vdupq_n_f32(ptr.read_unaligned()); + let weight1 = vdupq_n_f32(ptr.add(1).read_unaligned()); + let weight2 = vdupq_n_f32(ptr.add(2).read_unaligned()); + let weight3 = vdupq_n_f32(ptr.add(3).read_unaligned()); + store_0 = convolve_horizontal_parts_4_rgb_f32( + bounds_start, + unsafe_source_ptr_0, + weight0, + weight1, + weight2, + weight3, + store_0, + ); + store_1 = convolve_horizontal_parts_4_rgb_f32( + bounds_start, + unsafe_source_ptr_0.add(src_stride), + weight0, + weight1, + weight2, + weight3, + store_1, + ); + store_2 = convolve_horizontal_parts_4_rgb_f32( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 2), + weight0, + weight1, + weight2, + weight3, + store_2, + ); + store_3 = convolve_horizontal_parts_4_rgb_f32( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 3), + weight0, + weight1, + weight2, + weight3, + store_3, + ); + jx += 4; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let bounds_start = bounds.start + jx; + let weight0 = vdupq_n_f32(ptr.read_unaligned()); + store_0 = convolve_horizontal_parts_one_rgb_f32( + bounds_start, + unsafe_source_ptr_0, + weight0, + store_0, + ); + store_1 = convolve_horizontal_parts_one_rgb_f32( + bounds_start, + unsafe_source_ptr_0.add(src_stride), + weight0, + store_1, + ); + store_2 = convolve_horizontal_parts_one_rgb_f32( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 2), + weight0, + store_2, + ); + store_3 = convolve_horizontal_parts_one_rgb_f32( + bounds_start, + 
unsafe_source_ptr_0.add(src_stride * 3), + weight0, + store_3, + ); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); + let l1 = vgetq_lane_f32::<0>(store_0); + let l2 = vgetq_lane_f32::<1>(store_0); + let l3 = vgetq_lane_f32::<2>(store_0); + dest_ptr.write_unaligned(l1); + dest_ptr.add(1).write_unaligned(l2); + dest_ptr.add(2).write_unaligned(l3); + + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let l1 = vgetq_lane_f32::<0>(store_1); + let l2 = vgetq_lane_f32::<1>(store_1); + let l3 = vgetq_lane_f32::<2>(store_1); + dest_ptr.write_unaligned(l1); + dest_ptr.add(1).write_unaligned(l2); + dest_ptr.add(2).write_unaligned(l3); + + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let l1 = vgetq_lane_f32::<0>(store_2); + let l2 = vgetq_lane_f32::<1>(store_2); + let l3 = vgetq_lane_f32::<2>(store_2); + dest_ptr.write_unaligned(l1); + dest_ptr.add(1).write_unaligned(l2); + dest_ptr.add(2).write_unaligned(l3); + + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let l1 = vgetq_lane_f32::<0>(store_3); + let l2 = vgetq_lane_f32::<1>(store_3); + let l3 = vgetq_lane_f32::<2>(store_3); + dest_ptr.write_unaligned(l1); + dest_ptr.add(1).write_unaligned(l2); + dest_ptr.add(2).write_unaligned(l3); + + filter_offset += filter_weights.aligned_size; + } + } + } + + pub fn convolve_horizontal_rgb_neon_row_one_f32( + dst_width: usize, + src_width: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const f32, + unsafe_destination_ptr_0: *mut f32, + ) { + unsafe { + const CHANNELS: usize = 3; + let weights_ptr = filter_weights.weights.as_ptr(); + let mut filter_offset = 0usize; + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store = vdupq_n_f32(0f32); + + while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = vdupq_n_f32(ptr.read_unaligned()); + let weight1 = vdupq_n_f32(ptr.add(1).read_unaligned()); + let weight2 = vdupq_n_f32(ptr.add(2).read_unaligned()); + let weight3 = vdupq_n_f32(ptr.add(3).read_unaligned()); + store = convolve_horizontal_parts_4_rgb_f32( + bounds_start, + unsafe_source_ptr_0, + weight0, + weight1, + weight2, + weight3, + store, + ); + jx += 4; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = vdupq_n_f32(ptr.read_unaligned()); + store = convolve_horizontal_parts_one_rgb_f32( + bounds.start + jx, + unsafe_source_ptr_0, + weight0, + store, + ); + jx += 1; + } + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); + let l1 = vgetq_lane_f32::<0>(store); + let l2 = vgetq_lane_f32::<1>(store); + let l3 = vgetq_lane_f32::<2>(store); + dest_ptr.write_unaligned(l1); + dest_ptr.add(1).write_unaligned(l2); + dest_ptr.add(2).write_unaligned(l3); + + filter_offset += filter_weights.aligned_size; + } + } + } + + #[inline(always)] + pub(crate) fn convolve_vertical_rgb_neon_row_f32( + width: usize, + bounds: &FilterBounds, + unsafe_source_ptr_0: *const f32, + unsafe_destination_ptr_0: *mut f32, + src_stride: usize, + weight_ptr: *const f32, + ) { + let mut cx = 0usize; + let dst_width = width * CHANNELS; + + while cx + 16 < dst_width { + unsafe { + convolve_vertical_part_neon_16_f32( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + ); + } + + cx += 16; + } + + while cx + 8 < 
dst_width { + unsafe { + convolve_vertical_part_neon_8_f32::( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + 8, + ); + } + + cx += 8; + } + + let left = dst_width - cx; + + if left > 0 { + unsafe { + convolve_vertical_part_neon_8_f32::( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + left, + ); + } + } + } +} diff --git a/src/neon_rgb_u8.rs b/src/neon/rgb_u8.rs similarity index 68% rename from src/neon_rgb_u8.rs rename to src/neon/rgb_u8.rs index 571ecd9..1f913ed 100644 --- a/src/neon_rgb_u8.rs +++ b/src/neon/rgb_u8.rs @@ -8,11 +8,11 @@ #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] pub mod neon_rgb { use crate::filter_weights::{FilterBounds, FilterWeights}; - use crate::neon_simd_u8::neon_convolve_u8; + use crate::neon::utils::neon_convolve_u8; use crate::support::ROUNDING_APPROX; use std::arch::aarch64::*; - pub unsafe fn convolve_horizontal_rgb_neon_rows_4( + pub fn convolve_horizontal_rgb_neon_rows_4( dst_width: usize, src_width: usize, approx_weights: &FilterWeights, @@ -21,29 +21,29 @@ pub mod neon_rgb { unsafe_destination_ptr_0: *mut u8, dst_stride: usize, ) { - let shuf_table_1: [u8; 8] = [0, 1, 2, 255, 3, 4, 5, 255]; - let shuffle_1 = vld1_u8(shuf_table_1.as_ptr()); - let shuf_table_2: [u8; 8] = [6, 7, 8, 255, 9, 10, 11, 255]; - let shuffle_2 = vld1_u8(shuf_table_2.as_ptr()); - let shuffle = vcombine_u8(shuffle_1, shuffle_2); - - let mut filter_offset = 0usize; - let weights_ptr = approx_weights.weights.as_ptr(); - const CHANNELS: usize = 3; - let zeros = vdupq_n_s32(0i32); - let init = vdupq_n_s32(ROUNDING_APPROX); - for x in 0..dst_width { - let bounds = unsafe { approx_weights.bounds.get_unchecked(x) }; - let mut jx = 0usize; - let mut store_0 = init; - let mut store_1 = init; - let mut store_2 = init; - let mut store_3 = init; - - while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { - let bounds_start = bounds.start + jx; - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - unsafe { + unsafe { + let shuf_table_1: [u8; 8] = [0, 1, 2, 255, 3, 4, 5, 255]; + let shuffle_1 = vld1_u8(shuf_table_1.as_ptr()); + let shuf_table_2: [u8; 8] = [6, 7, 8, 255, 9, 10, 11, 255]; + let shuffle_2 = vld1_u8(shuf_table_2.as_ptr()); + let shuffle = vcombine_u8(shuffle_1, shuffle_2); + + let mut filter_offset = 0usize; + let weights_ptr = approx_weights.weights.as_ptr(); + const CHANNELS: usize = 3; + let zeros = vdupq_n_s32(0i32); + let init = vdupq_n_s32(ROUNDING_APPROX); + for x in 0..dst_width { + let bounds = approx_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = init; + let mut store_1 = init; + let mut store_2 = init; + let mut store_3 = init; + + while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vdup_n_s16(ptr.read_unaligned()); let weight1 = vdupq_n_s16(ptr.add(1).read_unaligned()); let weight2 = vdup_n_s16(ptr.add(2).read_unaligned()); @@ -88,14 +88,12 @@ pub mod neon_rgb { store_3, shuffle, ); + jx += 4; } - jx += 4; - } - while jx + 2 < bounds.size && bounds.start + jx + 3 < src_width { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let bounds_start = bounds.start + jx; - unsafe { + while jx + 2 < bounds.size && bounds.start + jx + 3 < src_width { + let ptr = weights_ptr.add(jx + filter_offset); + let bounds_start = bounds.start + jx; let weight0 = 
vdup_n_s16(ptr.read_unaligned()); let weight1 = vdupq_n_s16(ptr.add(1).read_unaligned()); store_0 = neon_convolve_u8::convolve_horizontal_parts_2_rgb( @@ -130,14 +128,12 @@ pub mod neon_rgb { store_3, shuffle_1, ); + jx += 2; } - jx += 2; - } - while jx < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let bounds_start = bounds.start + jx; - unsafe { + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let bounds_start = bounds.start + jx; let weight0 = vdup_n_s16(ptr.read_unaligned()); store_0 = neon_convolve_u8::convolve_horizontal_parts_one_rgb( bounds_start, @@ -163,93 +159,86 @@ pub mod neon_rgb { weight0, store_3, ); + jx += 1; } - jx += 1; - } - let store_16 = unsafe { vqshrun_n_s32::<12>(vmaxq_s32(store_0, zeros)) }; - let store_16_8 = unsafe { vqmovn_u16(vcombine_u16(store_16, store_16)) }; + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store_0, zeros)); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - unsafe { + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); let bytes = pixel.to_le_bytes(); dest_ptr.write_unaligned(bytes[0]); dest_ptr.add(1).write_unaligned(bytes[1]); dest_ptr.add(2).write_unaligned(bytes[2]); - } - let store_16 = unsafe { vqshrun_n_s32::<12>(vmaxq_s32(store_1, zeros)) }; - let store_16_8 = unsafe { vqmovn_u16(vcombine_u16(store_16, store_16)) }; + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store_1, zeros)); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride) }; - unsafe { let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); let bytes = pixel.to_le_bytes(); dest_ptr.write_unaligned(bytes[0]); dest_ptr.add(1).write_unaligned(bytes[1]); dest_ptr.add(2).write_unaligned(bytes[2]); - } - let store_16 = unsafe { vqshrun_n_s32::<12>(vmaxq_s32(store_2, zeros)) }; - let store_16_8 = unsafe { vqmovn_u16(vcombine_u16(store_16, store_16)) }; + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store_2, zeros)); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 2) }; - unsafe { + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); let bytes = pixel.to_le_bytes(); dest_ptr.write_unaligned(bytes[0]); dest_ptr.add(1).write_unaligned(bytes[1]); dest_ptr.add(2).write_unaligned(bytes[2]); - } - let store_16 = unsafe { vqshrun_n_s32::<12>(vmaxq_s32(store_3, zeros)) }; - let store_16_8 = unsafe { vqmovn_u16(vcombine_u16(store_16, store_16)) }; + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store_3, zeros)); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 3) }; - unsafe { + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); let bytes = pixel.to_le_bytes(); dest_ptr.write_unaligned(bytes[0]); dest_ptr.add(1).write_unaligned(bytes[1]); dest_ptr.add(2).write_unaligned(bytes[2]); - } - filter_offset += 
approx_weights.aligned_size; + filter_offset += approx_weights.aligned_size; + } } } - pub unsafe fn convolve_horizontal_rgb_neon_row_one( + pub fn convolve_horizontal_rgb_neon_row_one( dst_width: usize, src_width: usize, approx_weights: &FilterWeights, unsafe_source_ptr_0: *const u8, unsafe_destination_ptr_0: *mut u8, ) { - const CHANNELS: usize = 3; - let mut filter_offset = 0usize; - let zeros = vdupq_n_s32(0i32); - let weights_ptr = approx_weights.weights.as_ptr(); - - let shuf_table_1: [u8; 8] = [0, 1, 2, 255, 3, 4, 5, 255]; - let shuffle_1 = vld1_u8(shuf_table_1.as_ptr()); - let shuf_table_2: [u8; 8] = [6, 7, 8, 255, 9, 10, 11, 255]; - let shuffle_2 = vld1_u8(shuf_table_2.as_ptr()); - let shuffle = vcombine_u8(shuffle_1, shuffle_2); - - for x in 0..dst_width { - let bounds = unsafe { approx_weights.bounds.get_unchecked(x) }; - let mut jx = 0usize; - let mut store = vdupq_n_s32(ROUNDING_APPROX); - - while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { - let bounds_start = bounds.start + jx; - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - unsafe { + unsafe { + const CHANNELS: usize = 3; + let mut filter_offset = 0usize; + let zeros = vdupq_n_s32(0i32); + let weights_ptr = approx_weights.weights.as_ptr(); + + let shuf_table_1: [u8; 8] = [0, 1, 2, 255, 3, 4, 5, 255]; + let shuffle_1 = vld1_u8(shuf_table_1.as_ptr()); + let shuf_table_2: [u8; 8] = [6, 7, 8, 255, 9, 10, 11, 255]; + let shuffle_2 = vld1_u8(shuf_table_2.as_ptr()); + let shuffle = vcombine_u8(shuffle_1, shuffle_2); + + for x in 0..dst_width { + let bounds = approx_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store = vdupq_n_s32(ROUNDING_APPROX); + + while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vdup_n_s16(ptr.read_unaligned()); let weight1 = vdupq_n_s16(ptr.add(1).read_unaligned()); let weight2 = vdup_n_s16(ptr.add(2).read_unaligned()); @@ -264,14 +253,12 @@ pub mod neon_rgb { store, shuffle, ); + jx += 4; } - jx += 4; - } - while jx + 2 < bounds.size && bounds.start + jx + 3 < src_width { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let bounds_start = bounds.start + jx; - unsafe { + while jx + 2 < bounds.size && bounds.start + jx + 3 < src_width { + let ptr = weights_ptr.add(jx + filter_offset); + let bounds_start = bounds.start + jx; let weight0 = vdup_n_s16(ptr.read_unaligned()); let weight1 = vdupq_n_s16(ptr.add(1).read_unaligned()); store = neon_convolve_u8::convolve_horizontal_parts_2_rgb( @@ -282,13 +269,11 @@ pub mod neon_rgb { store, shuffle_1, ); + jx += 2; } - jx += 2; - } - while jx < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - unsafe { + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); let weight0 = vdup_n_s16(ptr.read_unaligned()); store = neon_convolve_u8::convolve_horizontal_parts_one_rgb( bounds.start + jx, @@ -296,30 +281,28 @@ pub mod neon_rgb { weight0, store, ); + jx += 1; } - jx += 1; - } - let store_16 = unsafe { vqshrun_n_s32::<12>(vmaxq_s32(store, zeros)) }; - let store_16_8 = unsafe { vqmovn_u16(vcombine_u16(store_16, store_16)) }; + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store, zeros)); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - unsafe { + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); let pixel = 
vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); let bytes = pixel.to_le_bytes(); dest_ptr.write_unaligned(bytes[0]); dest_ptr.add(1).write_unaligned(bytes[1]); dest_ptr.add(2).write_unaligned(bytes[2]); - } - filter_offset += approx_weights.aligned_size; + filter_offset += approx_weights.aligned_size; + } } } #[inline(always)] - pub fn convolve_vertical_rgb_neon_row( - dst_width: usize, + pub fn convolve_vertical_rgb_neon_row( + width: usize, bounds: &FilterBounds, unsafe_source_ptr_0: *const u8, unsafe_destination_ptr_0: *mut u8, @@ -327,6 +310,7 @@ pub mod neon_rgb { weight_ptr: *const i16, ) { let mut cx = 0usize; + let dst_width = width * CHANNELS; while cx + 32 < dst_width { unsafe { neon_convolve_u8::convolve_vertical_part_neon_32( diff --git a/src/neon/rgba_u8.rs b/src/neon/rgba_u8.rs new file mode 100644 index 0000000..5abdc31 --- /dev/null +++ b/src/neon/rgba_u8.rs @@ -0,0 +1,261 @@ +use crate::filter_weights::FilterWeights; +use crate::neon::utils::neon_convolve_u8::{ + convolve_horizontal_parts_2_rgba, convolve_horizontal_parts_4_rgba, + convolve_horizontal_parts_one_rgba, +}; +use crate::support::ROUNDING_APPROX; +use std::arch::aarch64::*; + +pub fn convolve_horizontal_rgba_neon_rows_4_u8( + dst_width: usize, + _: usize, + approx_weights: &FilterWeights, + unsafe_source_ptr_0: *const u8, + src_stride: usize, + unsafe_destination_ptr_0: *mut u8, + dst_stride: usize, +) { + unsafe { + let mut filter_offset = 0usize; + let weights_ptr = approx_weights.weights.as_ptr(); + const CHANNELS: usize = 4; + let zeros = vdupq_n_s32(0i32); + let init = vdupq_n_s32(ROUNDING_APPROX); + for x in 0..dst_width { + let bounds = approx_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = init; + let mut store_1 = init; + let mut store_2 = init; + let mut store_3 = init; + + while jx + 4 < bounds.size { + let bounds_start = bounds.start + jx; + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = vdup_n_s16(ptr.read_unaligned()); + let weight1 = vdupq_n_s16(ptr.add(1).read_unaligned()); + let weight2 = vdup_n_s16(ptr.add(2).read_unaligned()); + let weight3 = vdupq_n_s16(ptr.add(3).read_unaligned()); + store_0 = convolve_horizontal_parts_4_rgba( + bounds_start, + unsafe_source_ptr_0, + weight0, + weight1, + weight2, + weight3, + store_0, + ); + store_1 = convolve_horizontal_parts_4_rgba( + bounds_start, + unsafe_source_ptr_0.add(src_stride), + weight0, + weight1, + weight2, + weight3, + store_1, + ); + store_2 = convolve_horizontal_parts_4_rgba( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 2), + weight0, + weight1, + weight2, + weight3, + store_2, + ); + store_3 = convolve_horizontal_parts_4_rgba( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 3), + weight0, + weight1, + weight2, + weight3, + store_3, + ); + jx += 4; + } + + while jx + 2 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let bounds_start = bounds.start + jx; + let weight0 = vdup_n_s16(ptr.read_unaligned()); + let weight1 = vdupq_n_s16(ptr.add(1).read_unaligned()); + store_0 = convolve_horizontal_parts_2_rgba( + bounds_start, + unsafe_source_ptr_0, + weight0, + weight1, + store_0, + ); + store_1 = convolve_horizontal_parts_2_rgba( + bounds_start, + unsafe_source_ptr_0.add(src_stride), + weight0, + weight1, + store_1, + ); + store_2 = convolve_horizontal_parts_2_rgba( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 2), + weight0, + weight1, + store_2, + ); + store_3 = convolve_horizontal_parts_2_rgba( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 
3), + weight0, + weight1, + store_3, + ); + jx += 2; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let bounds_start = bounds.start + jx; + let weight0 = vdup_n_s16(ptr.read_unaligned()); + store_0 = convolve_horizontal_parts_one_rgba( + bounds_start, + unsafe_source_ptr_0, + weight0, + store_0, + ); + store_1 = convolve_horizontal_parts_one_rgba( + bounds_start, + unsafe_source_ptr_0.add(src_stride), + weight0, + store_1, + ); + store_2 = convolve_horizontal_parts_one_rgba( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 2), + weight0, + store_2, + ); + store_3 = convolve_horizontal_parts_one_rgba( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 3), + weight0, + store_3, + ); + jx += 1; + } + + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store_0, zeros)); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); + let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); + let dest_ptr_32 = dest_ptr as *mut u32; + dest_ptr_32.write_unaligned(pixel); + + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store_1, zeros)); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); + let dest_ptr_32 = dest_ptr as *mut u32; + dest_ptr_32.write_unaligned(pixel); + + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store_2, zeros)); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); + let dest_ptr_32 = dest_ptr as *mut u32; + dest_ptr_32.write_unaligned(pixel); + + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store_3, zeros)); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + let pixel = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); + let dest_ptr_32 = dest_ptr as *mut u32; + dest_ptr_32.write_unaligned(pixel); + + filter_offset += approx_weights.aligned_size; + } + } +} + +pub fn convolve_horizontal_rgba_neon_row( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const u8, + unsafe_destination_ptr_0: *mut u8, +) { + unsafe { + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store = vdupq_n_s32(ROUNDING_APPROX); + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = vdup_n_s16(ptr.read_unaligned()); + let weight1 = vdupq_n_s16(ptr.add(1).read_unaligned()); + let weight2 = vdup_n_s16(ptr.add(2).read_unaligned()); + let weight3 = vdupq_n_s16(ptr.add(3).read_unaligned()); + store = convolve_horizontal_parts_4_rgba( + bounds.start + jx, + unsafe_source_ptr_0, + weight0, + weight1, + weight2, + weight3, + store, + ); + jx += 4; + } + + while jx + 2 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let bounds_start = bounds.start + jx; + let weight0 = vdup_n_s16(ptr.read_unaligned()); + let weight1 = vdupq_n_s16(ptr.add(1).read_unaligned()); + store = convolve_horizontal_parts_2_rgba( + bounds_start, + unsafe_source_ptr_0, + weight0, + 
weight1, + store, + ); + jx += 2; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = vdup_n_s16(ptr.read_unaligned()); + store = convolve_horizontal_parts_one_rgba( + bounds.start + jx, + unsafe_source_ptr_0, + weight0, + store, + ); + jx += 1; + } + + let store_16 = vqshrun_n_s32::<12>(vmaxq_s32(store, vdupq_n_s32(0i32))); + let store_16_8 = vqmovn_u16(vcombine_u16(store_16, store_16)); + + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); + let value = vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)); + let dest_ptr_32 = dest_ptr as *mut u32; + dest_ptr_32.write_unaligned(value); + + filter_offset += filter_weights.aligned_size; + } + } +} diff --git a/src/neon_simd_u8.rs b/src/neon/utils.rs similarity index 92% rename from src/neon_simd_u8.rs rename to src/neon/utils.rs index 1289d72..14781c6 100644 --- a/src/neon_simd_u8.rs +++ b/src/neon/utils.rs @@ -5,6 +5,8 @@ * // license that can be found in the LICENSE file. */ +use std::arch::aarch64::{float32x4_t, vfmaq_f32}; + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] pub mod neon_convolve_u8 { use crate::filter_weights::FilterBounds; @@ -109,6 +111,25 @@ pub mod neon_convolve_u8 { acc } + #[inline(always)] + pub(crate) unsafe fn convolve_horizontal_parts_2_rgba( + start_x: usize, + src: *const u8, + weight0: int16x4_t, + weight1: int16x8_t, + store_0: int32x4_t, + ) -> int32x4_t { + const COMPONENTS: usize = 4; + let src_ptr = src.add(start_x * COMPONENTS); + + let rgb_pixel = vld1_u8(src_ptr); + let wide = vreinterpretq_s16_u16(vmovl_u8(rgb_pixel)); + + let acc = vmlal_high_s16(store_0, wide, weight1); + let acc = vmlal_s16(acc, vget_low_s16(wide), weight0); + acc + } + #[inline(always)] pub(crate) unsafe fn convolve_vertical_part_neon_32( start_y: usize, @@ -319,3 +340,19 @@ pub mod neon_convolve_u8 { acc } } + +#[inline(always)] +pub(crate) unsafe fn prefer_vfmaq_f32( + a: float32x4_t, + b: float32x4_t, + c: float32x4_t, +) -> float32x4_t { + #[cfg(target_arch = "aarch64")] + { + return vfmaq_f32(a, b, c); + } + #[cfg(target_arch = "arm")] + { + return vmlaq_f32(a, b, c); + } +} diff --git a/src/neon_rgb_f32.rs b/src/neon_rgb_f32.rs deleted file mode 100644 index 6f36586..0000000 --- a/src/neon_rgb_f32.rs +++ /dev/null @@ -1,413 +0,0 @@ -/* - * // Copyright (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. 
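The integer NEON kernels above (including the new src/neon/rgba_u8.rs) all follow the same Q12 fixed-point scheme: weights are i16 values from numerical_approximation_i16::<12>, pixels are widened from u8 to i16, sums are accumulated in i32 starting from a rounding bias, and vmaxq_s32 plus vqshrun_n_s32::<12> clamp, shift and saturate the result back to u8. A minimal scalar sketch of one horizontal RGBA output pixel, assuming ROUNDING_APPROX is the half bias 1 << 11 (its value is not shown in this diff):

// Scalar reference for the Q12 pipeline used by the NEON kernels above.
// The ROUNDING constant is an assumption standing in for ROUNDING_APPROX.
fn horizontal_px_rgba_q12(src: &[u8], weights: &[i16], start_x: usize) -> [u8; 4] {
    const PRECISION: i32 = 12;
    const ROUNDING: i32 = 1 << (PRECISION - 1); // assumed value of ROUNDING_APPROX
    let mut out = [0u8; 4];
    for c in 0..4 {
        let mut acc = ROUNDING;
        for (j, &w) in weights.iter().enumerate() {
            acc += src[(start_x + j) * 4 + c] as i32 * w as i32;
        }
        // vmaxq_s32 + vqshrun_n_s32::<12> == clamp at zero, shift right, saturate to u8
        out[c] = (acc >> PRECISION).clamp(0, 255) as u8;
    }
    out
}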
- */ - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -pub mod neon_convolve_floats { - use crate::convolve_f32::{ - convolve_horizontal_parts_4_rgb_f32, convolve_horizontal_parts_4_rgba_f32, - convolve_horizontal_parts_one_rgb_f32, convolve_horizontal_parts_one_rgba_f32, - }; - use crate::filter_weights::FilterWeights; - use std::arch::aarch64::*; - - pub unsafe fn convolve_horizontal_rgba_neon_row_one( - dst_width: usize, - filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, - ) { - const CHANNELS: usize = 4; - let mut filter_offset = 0usize; - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let mut jx = 0usize; - let mut store = unsafe { vdupq_n_f32(0f32) }; - - while jx + 4 < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let weight0 = unsafe { ptr.read_unaligned() }; - let weight1 = unsafe { ptr.add(1).read_unaligned() }; - let weight2 = unsafe { ptr.add(2).read_unaligned() }; - let weight3 = unsafe { ptr.add(3).read_unaligned() }; - unsafe { - store = convolve_horizontal_parts_4_rgba_f32( - bounds.start, - unsafe_source_ptr_0, - weight0, - weight1, - weight2, - weight3, - store, - ); - } - jx += 4; - } - while jx < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let weight0 = unsafe { ptr.read_unaligned() }; - unsafe { - store = convolve_horizontal_parts_one_rgba_f32( - bounds.start, - unsafe_source_ptr_0, - weight0, - store, - ); - } - jx += 1; - } - - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - unsafe { - vst1q_f32(dest_ptr, store); - } - - filter_offset += filter_weights.aligned_size; - } - } - - pub unsafe fn convolve_horizontal_rgba_neon_rows_4( - dst_width: usize, - filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - src_stride: usize, - unsafe_destination_ptr_0: *mut f32, - dst_stride: usize, - ) { - const CHANNELS: usize = 4; - let mut filter_offset = 0usize; - let zeros = unsafe { vdupq_n_f32(0f32) }; - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let mut jx = 0usize; - let mut store_0 = zeros; - let mut store_1 = zeros; - let mut store_2 = zeros; - let mut store_3 = zeros; - - while jx + 4 < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let weight0 = unsafe { ptr.read_unaligned() }; - let weight1 = unsafe { ptr.add(1).read_unaligned() }; - let weight2 = unsafe { ptr.add(2).read_unaligned() }; - let weight3 = unsafe { ptr.add(3).read_unaligned() }; - unsafe { - store_0 = convolve_horizontal_parts_4_rgba_f32( - bounds.start, - unsafe_source_ptr_0, - weight0, - weight1, - weight2, - weight3, - store_0, - ); - store_1 = convolve_horizontal_parts_4_rgba_f32( - bounds.start, - unsafe_source_ptr_0.add(src_stride), - weight0, - weight1, - weight2, - weight3, - store_1, - ); - store_2 = convolve_horizontal_parts_4_rgba_f32( - bounds.start, - unsafe_source_ptr_0.add(src_stride * 2), - weight0, - weight1, - weight2, - weight3, - store_2, - ); - store_3 = convolve_horizontal_parts_4_rgba_f32( - bounds.start, - unsafe_source_ptr_0.add(src_stride * 3), - weight0, - weight1, - weight2, - weight3, - store_3, - ); - } - jx += 4; - } - while jx < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let weight0 = unsafe { ptr.read_unaligned() }; - unsafe { - 
store_0 = convolve_horizontal_parts_one_rgba_f32( - bounds.start, - unsafe_source_ptr_0, - weight0, - store_0, - ); - store_1 = convolve_horizontal_parts_one_rgba_f32( - bounds.start, - unsafe_source_ptr_0.add(src_stride), - weight0, - store_1, - ); - store_2 = convolve_horizontal_parts_one_rgba_f32( - bounds.start, - unsafe_source_ptr_0.add(src_stride * 2), - weight0, - store_2, - ); - store_3 = convolve_horizontal_parts_one_rgba_f32( - bounds.start, - unsafe_source_ptr_0.add(src_stride * 3), - weight0, - store_3, - ); - } - jx += 1; - } - - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - unsafe { - vst1q_f32(dest_ptr, store_0); - } - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride) }; - unsafe { - vst1q_f32(dest_ptr, store_1); - } - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 2) }; - unsafe { - vst1q_f32(dest_ptr, store_2); - } - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 3) }; - unsafe { - vst1q_f32(dest_ptr, store_3); - } - - filter_offset += filter_weights.aligned_size; - } - } - - pub unsafe fn convolve_horizontal_rgb_neon_rows_4( - dst_width: usize, - src_width: usize, - filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - src_stride: usize, - unsafe_destination_ptr_0: *mut f32, - dst_stride: usize, - ) { - const CHANNELS: usize = 3; - let mut filter_offset = 0usize; - - let zeros = unsafe { vdupq_n_f32(0f32) }; - - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let mut jx = 0usize; - let mut store_0 = zeros; - let mut store_1 = zeros; - let mut store_2 = zeros; - let mut store_3 = zeros; - - while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { - let bounds_start = bounds.start + jx; - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - unsafe { - let weight0 = vdupq_n_f32(ptr.read_unaligned()); - let weight1 = vdupq_n_f32(ptr.add(1).read_unaligned()); - let weight2 = vdupq_n_f32(ptr.add(2).read_unaligned()); - let weight3 = vdupq_n_f32(ptr.add(3).read_unaligned()); - store_0 = convolve_horizontal_parts_4_rgb_f32( - bounds_start, - unsafe_source_ptr_0, - weight0, - weight1, - weight2, - weight3, - store_0, - ); - store_1 = convolve_horizontal_parts_4_rgb_f32( - bounds_start, - unsafe_source_ptr_0.add(src_stride), - weight0, - weight1, - weight2, - weight3, - store_1, - ); - store_2 = convolve_horizontal_parts_4_rgb_f32( - bounds_start, - unsafe_source_ptr_0.add(src_stride * 2), - weight0, - weight1, - weight2, - weight3, - store_2, - ); - store_3 = convolve_horizontal_parts_4_rgb_f32( - bounds_start, - unsafe_source_ptr_0.add(src_stride * 3), - weight0, - weight1, - weight2, - weight3, - store_3, - ); - } - jx += 4; - } - - while jx < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - unsafe { - let bounds_start = bounds.start + jx; - let weight0 = vdupq_n_f32(ptr.read_unaligned()); - store_0 = convolve_horizontal_parts_one_rgb_f32( - bounds_start, - unsafe_source_ptr_0, - weight0, - store_0, - ); - store_1 = convolve_horizontal_parts_one_rgb_f32( - bounds_start, - unsafe_source_ptr_0.add(src_stride), - weight0, - store_1, - ); - store_2 = convolve_horizontal_parts_one_rgb_f32( - bounds_start, - unsafe_source_ptr_0.add(src_stride * 2), - weight0, - store_2, - ); - store_3 = convolve_horizontal_parts_one_rgb_f32( - bounds_start, - unsafe_source_ptr_0.add(src_stride * 3), - weight0, - store_3, - ); - } - jx 
+= 1; - } - - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - unsafe { - let l1 = vgetq_lane_f32::<0>(store_0); - let l2 = vgetq_lane_f32::<1>(store_0); - let l3 = vgetq_lane_f32::<2>(store_0); - dest_ptr.write_unaligned(l1); - dest_ptr.add(1).write_unaligned(l2); - dest_ptr.add(2).write_unaligned(l3); - } - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride) }; - unsafe { - let l1 = vgetq_lane_f32::<0>(store_1); - let l2 = vgetq_lane_f32::<1>(store_1); - let l3 = vgetq_lane_f32::<2>(store_1); - dest_ptr.write_unaligned(l1); - dest_ptr.add(1).write_unaligned(l2); - dest_ptr.add(2).write_unaligned(l3); - } - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 2) }; - unsafe { - let l1 = vgetq_lane_f32::<0>(store_2); - let l2 = vgetq_lane_f32::<1>(store_2); - let l3 = vgetq_lane_f32::<2>(store_2); - dest_ptr.write_unaligned(l1); - dest_ptr.add(1).write_unaligned(l2); - dest_ptr.add(2).write_unaligned(l3); - } - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 3) }; - unsafe { - let l1 = vgetq_lane_f32::<0>(store_3); - let l2 = vgetq_lane_f32::<1>(store_3); - let l3 = vgetq_lane_f32::<2>(store_3); - dest_ptr.write_unaligned(l1); - dest_ptr.add(1).write_unaligned(l2); - dest_ptr.add(2).write_unaligned(l3); - } - - filter_offset += filter_weights.aligned_size; - } - } - - pub unsafe fn convolve_horizontal_rgb_neon_row_one( - dst_width: usize, - filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, - ) { - const CHANNELS: usize = 3; - let weights_ptr = filter_weights.weights.as_ptr(); - let mut filter_offset = 0usize; - - for x in 0..dst_width { - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let mut jx = 0usize; - let mut store = unsafe { vdupq_n_f32(0f32) }; - - while jx + 4 < bounds.size && bounds.start + jx + 6 < dst_width { - let bounds_start = bounds.start + jx; - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - unsafe { - let weight0 = vdupq_n_f32(ptr.read_unaligned()); - let weight1 = vdupq_n_f32(ptr.add(1).read_unaligned()); - let weight2 = vdupq_n_f32(ptr.add(2).read_unaligned()); - let weight3 = vdupq_n_f32(ptr.add(3).read_unaligned()); - store = convolve_horizontal_parts_4_rgb_f32( - bounds_start, - unsafe_source_ptr_0, - weight0, - weight1, - weight2, - weight3, - store, - ); - } - jx += 4; - } - - while jx < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - unsafe { - let weight0 = vdupq_n_f32(ptr.read_unaligned()); - store = convolve_horizontal_parts_one_rgb_f32( - bounds.start + jx, - unsafe_source_ptr_0, - weight0, - store, - ); - } - jx += 1; - } - - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - unsafe { - let l1 = vgetq_lane_f32::<0>(store); - let l2 = vgetq_lane_f32::<1>(store); - let l3 = vgetq_lane_f32::<2>(store); - dest_ptr.write_unaligned(l1); - dest_ptr.add(1).write_unaligned(l2); - dest_ptr.add(2).write_unaligned(l3); - } - - filter_offset += filter_weights.aligned_size; - } - } -} diff --git a/src/rgb_f32.rs b/src/rgb_f32.rs index e81bd47..0a0668f 100644 --- a/src/rgb_f32.rs +++ b/src/rgb_f32.rs @@ -5,461 +5,20 @@ * // license that can be found in the LICENSE file. 
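The f32 kernels that take over from the deleted neon_rgb_f32 module accumulate with fused multiply-add, which is what the prefer_vfmaq_f32 helper added to src/neon/utils.rs above is for. A minimal sketch of that accumulation step for one weighted RGBA f32 pixel; the function name and signature here are illustrative, not the crate's API:

// Illustrative only: one tap of an f32 horizontal convolution, RGBA layout.
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
unsafe fn mac_rgba_f32(
    src: *const f32,                           // base of the source row
    px: usize,                                 // pixel index of this tap
    weight: f32,                               // filter weight for this tap
    acc: std::arch::aarch64::float32x4_t,
) -> std::arch::aarch64::float32x4_t {
    use std::arch::aarch64::*;
    let pixel = vld1q_f32(src.add(px * 4));    // load R, G, B, A
    vfmaq_f32(acc, pixel, vdupq_n_f32(weight)) // acc + pixel * weight, fused
}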
*/ -use crate::acceleration_feature::AccelerationFeature; +use rayon::ThreadPool; + use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; -use crate::convolve_f32::*; +use crate::convolve_naive_f32::*; +use crate::dispatch_group_f32::{convolve_horizontal_dispatch_f32, convolve_vertical_dispatch_f32}; use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::image_store::ImageStore; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon_rgb_f32::neon_convolve_floats; -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::sse_rgb_f32::sse_convolve_f32::*; -use crate::unsafe_slice::UnsafeSlice; -use rayon::ThreadPool; -use std::sync::Arc; - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -#[inline(always)] -fn convolve_horizontal_neon( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - let dst_width = destination.width; - let src_width = image_store.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - let mut yy = 0usize; - for y in (0..destination.height.saturating_sub(4)).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - neon_convolve_floats::convolve_horizontal_rgb_neon_rows_4( - dst_width, - src_width, - &weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - }); - yy = y; - } - for y in yy..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - neon_convolve_floats::convolve_horizontal_rgb_neon_row_one( - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - }); - } - }); - } else { - let mut yy = 0usize; - while yy + 4 < destination.height { - unsafe { - neon_convolve_floats::convolve_horizontal_rgb_neon_rows_4( - dst_width, - src_width, - &filter_weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; - yy += 4; - } - - for _ in yy..destination.height { - unsafe { - neon_convolve_floats::convolve_horizontal_rgb_neon_row_one( - dst_width, - &filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -#[inline(always)] -fn convolve_horizontal_rgb_native_row( - dst_width: usize, - filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - 
unsafe_destination_ptr_0: *mut f32, -) { - let mut filter_offset = 0usize; - let weights_ptr = filter_weights.weights.as_ptr(); - - const CHANNELS: usize = 3; - for x in 0..dst_width { - let mut sum_r = 0f32; - let mut sum_g = 0f32; - let mut sum_b = 0f32; - - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let start_x = bounds.start; - for j in 0..bounds.size { - let px = (start_x + j) * CHANNELS; - let weight = unsafe { weights_ptr.add(j + filter_offset).read_unaligned() }; - let src = unsafe { unsafe_source_ptr_0.add(px) }; - sum_r += unsafe { src.read_unaligned() } * weight; - sum_g += unsafe { src.add(1).read_unaligned() } * weight; - sum_b += unsafe { src.add(2).read_unaligned() } * weight; - } - - let px = x * CHANNELS; - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - - unsafe { - dest_ptr.write_unaligned(sum_r); - dest_ptr.add(1).write_unaligned(sum_g); - dest_ptr.add(2).write_unaligned(sum_b); - } - - filter_offset += filter_weights.aligned_size; - } -} - -fn convolve_horizontal_native( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - - let dst_width = destination.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - convolve_horizontal_rgb_native_row( - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - }); - } - }); - } else { - for _ in 0..destination.height { - convolve_horizontal_rgb_native_row( - dst_width, - &filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -#[inline(always)] -fn convolve_vertical_rgb_native_row( - total_width: usize, - src_stride: usize, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, - weight_ptr: *const f32, - bounds: &FilterBounds, -) { - let mut cx = 0usize; - - while cx + 16 < total_width { - unsafe { - convolve_vertical_part_neon_16_f32( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - ); - } - - cx += 16; - } - while cx + 8 < total_width { - unsafe { - convolve_vertical_part_neon_8_f32::( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - 8, - ); - } - - cx += 8; - } - - let left = total_width - cx; - - if left > 0 { - unsafe { - convolve_vertical_part_neon_8_f32::( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - left, - ); - } - } -} - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -fn convolve_vertical_neon( - 
image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let src_stride = image_store.width * image_store.channels; - let mut filter_offset = 0usize; - let dst_stride = destination.width * image_store.channels; - let total_width = destination.width * image_store.channels; - - if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - let bounds = unsafe { weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { weights.weights.as_ptr().add(filter_offset) }; - convolve_vertical_rgb_native_row( - total_width, - src_stride, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - ); - }); - } - }); - } else { - for y in 0..destination.height { - let bounds = unsafe { filter_weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { filter_weights.weights.as_ptr().add(filter_offset) }; - - convolve_vertical_rgb_native_row( - total_width, - src_stride, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - ); - - filter_offset += filter_weights.aligned_size; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - +use crate::neon::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -#[inline(always)] -fn convolve_horizontal_rgb_sse_row( - total_width: usize, - src_stride: usize, - bounds: &FilterBounds, - weight_ptr: *const f32, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, -) { - let mut cx = 0usize; - - while cx + 16 < total_width { - unsafe { - convolve_vertical_part_sse_16_f32( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - ); - } - - cx += 16; - } - - while cx + 8 < total_width { - unsafe { - convolve_vertical_part_sse_8_f32( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - ); - } - - cx += 8; - } - - while cx + 4 < total_width { - unsafe { - convolve_vertical_part_sse_4_f32( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - ); - } - - cx += 4; - } - - while cx < total_width { - unsafe { - convolve_vertical_part_sse_f32( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - ); - } - cx += 1; - } -} - -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -pub(crate) fn convolve_vertical_sse_rgb_f32( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - let total_width = destination.width * image_store.channels; - - if let Some(pool) = pool { - let arc_weights = 
Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - let filter_offset = y * weights.aligned_size; - let bounds = unsafe { weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { weights.weights.as_ptr().add(filter_offset) }; - convolve_horizontal_rgb_sse_row( - total_width, - src_stride, - &bounds, - weight_ptr, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - }); - } - }); - } else { - let mut filter_offset = 0usize; - for y in 0..destination.height { - let bounds = unsafe { filter_weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { filter_weights.weights.as_ptr().add(filter_offset) }; - - convolve_horizontal_rgb_sse_row( - total_width, - src_stride, - &bounds, - weight_ptr, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - - filter_offset += filter_weights.aligned_size; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} +use crate::sse::sse_convolve_f32::convolve_vertical_rgb_sse_row_f32; #[inline(always)] -fn convolve_vertical_rgb_native_row_f32( +pub(crate) fn convolve_vertical_rgb_native_row_f32( dst_width: usize, bounds: &FilterBounds, unsafe_source_ptr_0: *const f32, @@ -517,67 +76,6 @@ fn convolve_vertical_rgb_native_row_f32( } } -pub(crate) fn convolve_vertical_native_f32( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - - let mut filter_offset = 0usize; - - let dst_stride = destination.width * image_store.channels; - let dst_width = destination.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let bounds = unsafe { weights.bounds.get_unchecked(y) }; - let weight_ptr = - unsafe { weights.weights.as_ptr().add(weights.aligned_size * y) }; - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - convolve_vertical_rgb_native_row_f32::( - dst_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - }); - } - }); - } else { - for y in 0..destination.height { - let bounds = unsafe { filter_weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { filter_weights.weights.as_ptr().add(filter_offset) }; - - convolve_vertical_rgb_native_row_f32::( - dst_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - - filter_offset += filter_weights.aligned_size; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - impl<'a> HorizontalConvolutionPass for ImageStore<'a, f32, 3> { #[inline(always)] fn convolve_horizontal( @@ -586,22 +84,34 @@ impl<'a> HorizontalConvolutionPass 
for ImageStore<'a, f32, 3> { destination: &mut ImageStore, pool: &Option, ) { - let mut _using_feature = AccelerationFeature::Native; + let mut _dispatcher_4_rows: Option< + fn(usize, usize, &FilterWeights, *const f32, usize, *mut f32, usize), + > = Some(convolve_horizontal_rgba_4_row_f32::<3>); + let mut _dispatcher_row: fn(usize, usize, &FilterWeights, *const f32, *mut f32) = + convolve_horizontal_rgb_native_row::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - _using_feature = AccelerationFeature::Neon; - } - match _using_feature { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - AccelerationFeature::Neon => { - convolve_horizontal_neon(self, filter_weights, destination, pool); - } - AccelerationFeature::Native => { - convolve_horizontal_native(self, filter_weights, destination, pool); - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - AccelerationFeature::Sse => {} - } + _dispatcher_4_rows = Some(convolve_horizontal_rgb_neon_rows_4_f32); + _dispatcher_row = convolve_horizontal_rgb_neon_row_one_f32; + } + // #[cfg(all( + // any(target_arch = "x86_64", target_arch = "x86"), + // target_feature = "sse4.1" + // ))] + // { + // if is_x86_feature_detected!("sse4.1") { + // _dispatcher_4_rows = Some(convolve_horizontal_rgba_sse_rows_4_f32); + // _dispatcher_row = convolve_horizontal_rgb_sse_row_f32; + // } + // } + convolve_horizontal_dispatch_f32( + self, + filter_weights, + destination, + pool, + _dispatcher_4_rows, + _dispatcher_row, + ); } } @@ -612,12 +122,11 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f32, 3> { destination: &mut ImageStore, pool: &Option, ) { - #[allow(unused_assignments)] - #[allow(unused_mut)] - let mut using_feature = AccelerationFeature::Native; + let mut _dispatcher: fn(usize, &FilterBounds, *const f32, *mut f32, usize, *const f32) = + convolve_vertical_rgb_native_row_f32::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - using_feature = AccelerationFeature::Neon; + _dispatcher = convolve_vertical_rgb_neon_row_f32::<3>; } #[cfg(all( any(target_arch = "x86_64", target_arch = "x86"), @@ -625,21 +134,9 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f32, 3> { ))] { if is_x86_feature_detected!("sse4.1") { - using_feature = AccelerationFeature::Sse; - } - } - match using_feature { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - AccelerationFeature::Neon => { - convolve_vertical_neon(self, filter_weights, destination, pool); - } - AccelerationFeature::Native => { - convolve_vertical_native_f32(self, filter_weights, destination, pool); - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - AccelerationFeature::Sse => { - convolve_vertical_sse_rgb_f32(self, filter_weights, destination, pool); + _dispatcher = convolve_vertical_rgb_sse_row_f32; } } + convolve_vertical_dispatch_f32(self, filter_weights, destination, pool, _dispatcher); } } diff --git a/src/rgb_u8.rs b/src/rgb_u8.rs index b1c9dc2..d680d0a 100644 --- a/src/rgb_u8.rs +++ b/src/rgb_u8.rs @@ -5,432 +5,24 @@ * // license that can be found in the LICENSE file. 
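The convolution passes above (and the u8 ones below) now share one shape: pick a portable row kernel as the default, let cfg blocks and is_x86_feature_detected! swap in the NEON or SSE variant, and hand the chosen function pointers to a common convolve_*_dispatch_* routine instead of matching on an AccelerationFeature enum. A minimal sketch of the pattern with made-up kernel names:

// Sketch only: the kernel signature is simplified and the override names are illustrative.
type RowKernel = fn(dst: &mut [f32], src: &[f32], weights: &[f32]);

fn native_row(dst: &mut [f32], src: &[f32], weights: &[f32]) {
    // Portable fallback: each output is a weighted sum over a window of the source row.
    for (x, d) in dst.iter_mut().enumerate() {
        *d = weights
            .iter()
            .enumerate()
            .map(|(j, &w)| src.get(x + j).copied().unwrap_or(0.0) * w)
            .sum();
    }
}

fn resolve_row_kernel() -> RowKernel {
    let mut kernel: RowKernel = native_row;
    #[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
    {
        // kernel = neon_row; // compile-time override when NEON is available
    }
    #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), target_feature = "sse4.1"))]
    {
        if is_x86_feature_detected!("sse4.1") {
            // kernel = sse_row; // runtime-detected override
        }
    }
    kernel
}

In the actual diff each pass carries two such pointers: an Option-wrapped kernel that handles four rows per call plus a single-row kernel for the remainder, so the dispatcher can batch rows without duplicating the per-architecture selection logic.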
*/ -use rayon::ThreadPool; -use std::sync::Arc; - -use crate::acceleration_feature::AccelerationFeature; use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; -use crate::convolve_u8::*; +use crate::convolve_naive_u8::*; +use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; use crate::filter_weights::{FilterBounds, FilterWeights}; use crate::image_store::ImageStore; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon_rgb_u8::neon_rgb::*; +use crate::neon::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::sse_rgb_u8::sse_rgb::*; -use crate::support::{PRECISION, ROUNDING_APPROX}; -use crate::unsafe_slice::UnsafeSlice; - +use crate::sse::sse_rgb::{ + convolve_horizontal_rgb_sse_row_one, convolve_horizontal_rgb_sse_rows_4, + convolve_vertical_rgb_sse_row, +}; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -fn convolve_horizontal_rgb_sse( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let approx_weights = filter_weights.numerical_approximation_i16::<12>(0); - - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - - if let Some(pool) = pool { - let arc_weights = Arc::new(approx_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - let destination_height = destination.height; - let dst_width = destination.width; - pool.scope(|scope| { - let mut yy = 0usize; - while yy + 4 < destination_height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * yy) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * yy) }; - unsafe { - convolve_horizontal_rgb_sse_rows_4( - image_store.width, - dst_width, - &weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - }); - yy += 4; - } - for y in (yy..destination.height).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - convolve_horizontal_rgb_sse_row_one( - image_store.width, - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - }); - } - }); - } else { - let mut yy = 0usize; - - while yy + 4 < destination.height { - unsafe { - convolve_horizontal_rgb_sse_rows_4( - image_store.width, - destination.width, - &approx_weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; - yy += 4; - } - - for _ in yy..destination.height { - unsafe { - convolve_horizontal_rgb_sse_row_one( - image_store.width, - destination.width, - &approx_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { 
unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -fn convolve_horizontal_rgb_neon( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let approx_weights = filter_weights.numerical_approximation_i16::<12>(0); - - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - let dst_width = destination.width; - let src_width = image_store.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(approx_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - let mut yy = 0usize; - for y in (0..destination.height.saturating_sub(4)).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - convolve_horizontal_rgb_neon_rows_4( - dst_width, - src_width, - &weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - }); - yy = y; - } - for y in (yy..destination.height).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - convolve_horizontal_rgb_neon_row_one( - dst_width, - src_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - }); - } - }); - } else { - let mut yy = 0usize; - while yy + 4 < destination.height { - unsafe { - convolve_horizontal_rgb_neon_rows_4( - dst_width, - src_width, - &approx_weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; - - yy += 4; - } - - for _ in yy..destination.height { - unsafe { - convolve_horizontal_rgb_neon_row_one( - dst_width, - src_width, - &approx_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -fn convolve_horizontal_rgb_native_row( - dst_width: usize, - filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const u8, - unsafe_destination_ptr_0: *mut u8, -) { - const CHANNELS: usize = 3; - let mut filter_offset = 0usize; - let weights_ptr = filter_weights.weights.as_ptr(); - for x in 0..dst_width { - let mut sum_r = ROUNDING_APPROX; - let mut sum_g = ROUNDING_APPROX; - let mut sum_b = ROUNDING_APPROX; - - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let start_x = bounds.start; - for j in 0..bounds.size { - let px = (start_x + j) * CHANNELS; - let weight = unsafe { weights_ptr.add(j + filter_offset).read_unaligned() } as i32; - let src = unsafe { unsafe_source_ptr_0.add(px) }; - sum_r += unsafe { src.read_unaligned() } as i32 * weight; - sum_g += unsafe { 
src.add(1).read_unaligned() } as i32 * weight; - sum_b += unsafe { src.add(2).read_unaligned() } as i32 * weight; - } - - let px = x * CHANNELS; - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - - unsafe { - dest_ptr.write_unaligned((sum_r >> PRECISION).min(255).max(0) as u8); - dest_ptr - .add(1) - .write_unaligned((sum_g >> PRECISION).min(255).max(0) as u8); - dest_ptr - .add(2) - .write_unaligned((sum_b >> PRECISION).min(255).max(0) as u8); - } - - filter_offset += filter_weights.aligned_size; - } -} - -fn convolve_horizontal_rgb_native( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let approx_weights = filter_weights.numerical_approximation_i16::<12>(0); - - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - let dst_width = destination.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(approx_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - convolve_horizontal_rgb_native_row( - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - }); - } - }); - } else { - for _ in 0..destination.height { - convolve_horizontal_rgb_native_row( - destination.width, - &approx_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -pub(crate) fn convolve_vertical_rgb_sse_8( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let approx_weights = filter_weights.numerical_approximation_i16::<12>(0); - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let src_stride = image_store.width * image_store.channels; - let mut filter_offset = 0usize; - let dst_stride = destination.width * image_store.channels; - let total_width = destination.width * image_store.channels; - - if let Some(pool) = pool { - let arc_weights = Arc::new(approx_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let bounds = unsafe { weights.bounds.get_unchecked(y) }; - let weight_ptr = - unsafe { weights.weights.as_ptr().add(weights.aligned_size * y) }; - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - convolve_vertical_rgb_sse_row( - total_width, - &bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - }); - } - }); - } else { - for y in 0..destination.height { - let 
bounds = unsafe { approx_weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { approx_weights.weights.as_ptr().add(filter_offset) }; - convolve_vertical_rgb_sse_row( - total_width, - &bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - filter_offset += approx_weights.aligned_size; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -#[inline(always)] -pub(crate) fn convolve_vertical_rgb_neon( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let approx_weights = filter_weights.numerical_approximation_i16::<12>(0); - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let src_stride = image_store.width * image_store.channels; - let mut filter_offset = 0usize; - let dst_stride = destination.width * image_store.channels; - let total_width = destination.width * image_store.channels; - - if let Some(pool) = pool { - let arc_weights = Arc::new(approx_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let bounds = unsafe { weights.bounds.get_unchecked(y) }; - let weight_ptr = - unsafe { weights.weights.as_ptr().add(weights.aligned_size * y) }; - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - convolve_vertical_rgb_neon_row( - total_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - }); - } - }); - } else { - for y in 0..destination.height { - let bounds = unsafe { approx_weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { approx_weights.weights.as_ptr().add(filter_offset) }; - convolve_vertical_rgb_neon_row( - total_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - filter_offset += approx_weights.aligned_size; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} +use crate::sse::*; +use rayon::ThreadPool; #[inline(always)] -pub(crate) fn convolve_vertical_rgb_native_row( +pub(crate) fn convolve_vertical_rgb_native_row_u8( dst_width: usize, bounds: &FilterBounds, unsafe_source_ptr_0: *const u8, @@ -441,7 +33,7 @@ pub(crate) fn convolve_vertical_rgb_native_row( let mut cx = 0usize; while cx + 12 < dst_width { unsafe { - convolve_vertical_part::<12, 3>( + convolve_vertical_part::<12, COMPONENTS>( bounds.start, cx, unsafe_source_ptr_0, @@ -457,7 +49,7 @@ pub(crate) fn convolve_vertical_rgb_native_row( while cx + 8 < dst_width { unsafe { - convolve_vertical_part::<8, 3>( + convolve_vertical_part::<8, COMPONENTS>( bounds.start, cx, unsafe_source_ptr_0, @@ -473,7 +65,7 @@ pub(crate) fn convolve_vertical_rgb_native_row( while cx < dst_width { unsafe { - convolve_vertical_part::<1, 3>( + convolve_vertical_part::<1, COMPONENTS>( bounds.start, cx, unsafe_source_ptr_0, @@ -488,67 +80,6 @@ pub(crate) fn convolve_vertical_rgb_native_row( } } -#[inline(always)] -pub(crate) fn convolve_vertical_rgb_native_8<'a, const COMPONENTS: usize>( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore<'a, u8, 
COMPONENTS>, - pool: &Option, -) { - let approx_weights = filter_weights.numerical_approximation_i16::<12>(0); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - - let dst_width = destination.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(approx_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let bounds = unsafe { weights.bounds.get_unchecked(y) }; - let weight_ptr = - unsafe { weights.weights.as_ptr().add(weights.aligned_size * y) }; - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - convolve_vertical_rgb_native_row( - dst_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - }); - } - }); - } else { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let mut filter_offset = 0usize; - for y in 0..destination.height { - let bounds = unsafe { approx_weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { approx_weights.weights.as_ptr().add(filter_offset) }; - convolve_vertical_rgb_native_row( - dst_width, - bounds, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - src_stride, - weight_ptr, - ); - - filter_offset += approx_weights.aligned_size; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - impl<'a> HorizontalConvolutionPass for ImageStore<'a, u8, 3> { fn convolve_horizontal( &self, @@ -556,33 +87,35 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, u8, 3> { destination: &mut ImageStore, pool: &Option, ) { - let mut _using_feature = AccelerationFeature::Native; + let mut _dispatcher_4_rows: Option< + fn(usize, usize, &FilterWeights, *const u8, usize, *mut u8, usize), + > = None; + let mut _dispatcher_1_row: fn(usize, usize, &FilterWeights, *const u8, *mut u8) = + convolve_horizontal_rgba_native_row::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - _using_feature = AccelerationFeature::Neon; + _dispatcher_4_rows = Some(convolve_horizontal_rgb_neon_rows_4); + _dispatcher_1_row = convolve_horizontal_rgb_neon_row_one; } #[cfg(all( any(target_arch = "x86_64", target_arch = "x86"), target_feature = "sse4.1" ))] { + _dispatcher_4_rows = Some(convolve_horizontal_rgba_native_4_row::<3>); if is_x86_feature_detected!("sse4.1") { - _using_feature = AccelerationFeature::Sse; - } - } - match _using_feature { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - AccelerationFeature::Neon => { - convolve_horizontal_rgb_neon(self, filter_weights, destination, pool); - } - AccelerationFeature::Native => { - convolve_horizontal_rgb_native(self, filter_weights, destination, pool); - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - AccelerationFeature::Sse => { - convolve_horizontal_rgb_sse(self, filter_weights, destination, pool); - } - } + _dispatcher_4_rows = Some(convolve_horizontal_rgb_sse_rows_4); + _dispatcher_1_row = convolve_horizontal_rgb_sse_row_one; + } + } + convolve_horizontal_dispatch_u8( + self, + filter_weights, + destination, + pool, + _dispatcher_4_rows, + _dispatcher_1_row, + ); } } @@ -593,10 +126,17 @@ impl<'a> VerticalConvolutionPass 
for ImageStore<'a, u8, 3> { destination: &mut ImageStore, pool: &Option, ) { - let mut _using_feature = AccelerationFeature::Native; + let mut _dispatcher: fn( + dst_width: usize, + bounds: &FilterBounds, + unsafe_source_ptr_0: *const u8, + unsafe_destination_ptr_0: *mut u8, + src_stride: usize, + weight_ptr: *const i16, + ) = convolve_vertical_rgb_native_row_u8::<3>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - _using_feature = AccelerationFeature::Neon; + _dispatcher = convolve_vertical_rgb_neon_row::<3>; } #[cfg(all( any(target_arch = "x86_64", target_arch = "x86"), @@ -604,21 +144,9 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, u8, 3> { ))] { if is_x86_feature_detected!("sse4.1") { - _using_feature = AccelerationFeature::Sse; - } - } - match _using_feature { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - AccelerationFeature::Neon => { - convolve_vertical_rgb_neon(self, filter_weights, destination, pool); - } - AccelerationFeature::Native => { - convolve_vertical_rgb_native_8(self, filter_weights, destination, pool); - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - AccelerationFeature::Sse => { - convolve_vertical_rgb_sse_8(self, filter_weights, destination, pool); + _dispatcher = convolve_vertical_rgb_sse_row; } } + convolve_vertical_dispatch_u8(self, filter_weights, destination, pool, _dispatcher); } } diff --git a/src/rgba_f32.rs b/src/rgba_f32.rs index ac3779d..c92f2ba 100644 --- a/src/rgba_f32.rs +++ b/src/rgba_f32.rs @@ -5,426 +5,21 @@ * // license that can be found in the LICENSE file. */ -use std::sync::Arc; - use rayon::ThreadPool; -use crate::acceleration_feature::AccelerationFeature; use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::convolve_f32::*; +use crate::convolve_naive_f32::{convolve_horizontal_rgb_native_row, convolve_horizontal_rgba_4_row_f32}; +use crate::dispatch_group_f32::{convolve_horizontal_dispatch_f32, convolve_vertical_dispatch_f32}; use crate::filter_weights::*; -use crate::rgb_f32::convolve_vertical_native_f32; -use crate::unsafe_slice::UnsafeSlice; use crate::ImageStore; - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -#[inline(always)] -fn convolve_horizontal_rgba_f32_neon( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - let dst_width = destination.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - let mut yy = 0usize; - for y in (0..destination.height.saturating_sub(4)).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - crate::neon_rgb_f32::neon_convolve_floats::convolve_horizontal_rgba_neon_rows_4( - dst_width, - &weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - }); - yy = y; - } - for 
y in (yy..destination.height).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - crate::neon_rgb_f32::neon_convolve_floats::convolve_horizontal_rgba_neon_row_one( - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - }); - } - }); - } else { - let mut yy = 0usize; - - while yy + 4 < destination.height { - unsafe { - crate::neon_rgb_f32::neon_convolve_floats::convolve_horizontal_rgba_neon_rows_4( - dst_width, - &filter_weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; - - yy += 4; - } - - for _ in yy..destination.height { - unsafe { - crate::neon_rgb_f32::neon_convolve_floats::convolve_horizontal_rgba_neon_row_one( - dst_width, - &filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - +use crate::neon::*; +use crate::rgb_f32::convolve_vertical_rgb_native_row_f32; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -#[inline(always)] -fn convolve_horizontal_rgba_f32_sse( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - let dst_width = destination.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - let mut yy = 0usize; - for y in (0..destination.height.saturating_sub(4)).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - crate::sse_rgb_f32::sse_convolve_f32::convolve_horizontal_rgba_sse_rows_4( - dst_width, - &weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - }); - yy = y; - } - for y in (yy..destination.height).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - crate::sse_rgb_f32::sse_convolve_f32::convolve_horizontal_rgba_sse_row_one( - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - }); - } - }); - } else { - let mut yy = 0usize; - - while yy + 4 < destination.height { - unsafe { - crate::sse_rgb_f32::sse_convolve_f32::convolve_horizontal_rgba_sse_rows_4( - dst_width, - &filter_weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, 
- dst_stride, - ); - } - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; - - yy += 4; - } - - for _ in yy..destination.height { - unsafe { - crate::sse_rgb_f32::sse_convolve_f32::convolve_horizontal_rgba_sse_row_one( - dst_width, - &filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -#[inline(always)] -fn convolve_horizontal_rgb_native_row( - dst_width: usize, - filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const f32, - unsafe_destination_ptr_0: *mut f32, -) { - const CHANNELS: usize = 4; - let weights_ptr = filter_weights.weights.as_ptr(); - let mut filter_offset = 0usize; - - for x in 0..dst_width { - let mut sum_r = 0f32; - let mut sum_g = 0f32; - let mut sum_b = 0f32; - let mut sum_a = 0f32; - - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let start_x = bounds.start; - for j in 0..bounds.size { - let px = (start_x + j) * CHANNELS; - let weight = unsafe { weights_ptr.add(j + filter_offset).read_unaligned() }; - let src = unsafe { unsafe_source_ptr_0.add(px) }; - sum_r += unsafe { src.read_unaligned() } * weight; - sum_g += unsafe { src.add(1).read_unaligned() } * weight; - sum_b += unsafe { src.add(2).read_unaligned() } * weight; - sum_a += unsafe { src.add(3).read_unaligned() } * weight; - } - - let px = x * CHANNELS; - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - - unsafe { - dest_ptr.write_unaligned(sum_r); - dest_ptr.add(1).write_unaligned(sum_g); - dest_ptr.add(2).write_unaligned(sum_b); - dest_ptr.add(3).write_unaligned(sum_a); - } - - filter_offset += filter_weights.aligned_size; - } -} - -fn convolve_horizontal_rgba_f32_native( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - - let dst_width = destination.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - convolve_horizontal_rgb_native_row( - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - }); - } - }); - } else { - for _ in 0..destination.height { - convolve_horizontal_rgb_native_row( - dst_width, - &filter_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -#[inline(always)] -fn convolve_vertical_rgb_native_row( - total_width: usize, - src_stride: usize, - unsafe_source_ptr_0: *const f32, - 
unsafe_destination_ptr_0: *mut f32, - weight_ptr: *const f32, - bounds: &FilterBounds, -) { - let mut cx = 0usize; - - while cx + 16 < total_width { - unsafe { - convolve_vertical_part_neon_16_f32( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - ); - } - - cx += 16; - } - while cx + 8 < total_width { - unsafe { - convolve_vertical_part_neon_8_f32::( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - 8, - ); - } - - cx += 8; - } - - let left = total_width - cx; - - if left > 0 { - unsafe { - convolve_vertical_part_neon_8_f32::( - bounds.start, - cx, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - weight_ptr, - bounds, - left, - ); - } - } -} - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -fn convolve_vertical_rgba_f32_neon( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - let total_width = destination.width * image_store.channels; - - if let Some(pool) = pool { - let arc_weights = Arc::new(filter_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - let filter_offset = y * weights.aligned_size; - let bounds = unsafe { weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { weights.weights.as_ptr().add(filter_offset) }; - - convolve_vertical_rgb_native_row( - total_width, - src_stride, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - weight_ptr, - &bounds, - ); - }); - } - }); - } else { - let mut filter_offset = 0usize; - for y in 0..destination.height { - let bounds = unsafe { filter_weights.bounds.get_unchecked(y) }; - let weight_ptr = unsafe { filter_weights.weights.as_ptr().add(filter_offset) }; - - convolve_vertical_rgb_native_row( - total_width, - src_stride, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - weight_ptr, - &bounds, - ); - - filter_offset += filter_weights.aligned_size; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} +use crate::sse::sse_convolve_f32::{ + convolve_vertical_rgb_sse_row_f32, convolve_horizontal_rgba_sse_row_one_f32, + convolve_horizontal_rgba_sse_rows_4_f32, +}; impl<'a> HorizontalConvolutionPass for ImageStore<'a, f32, 4> { #[inline(always)] @@ -434,12 +29,15 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f32, 4> { destination: &mut ImageStore, pool: &Option, ) { - #[allow(unused_assignments)] - #[allow(unused_mut)] - let mut using_feature = AccelerationFeature::Native; + let mut _dispatcher_4_rows: Option< + fn(usize, usize, &FilterWeights, *const f32, usize, *mut f32, usize), + > = Some(convolve_horizontal_rgba_4_row_f32::<4>); + let mut _dispatcher_row: fn(usize, usize, &FilterWeights, *const f32, *mut f32) = + convolve_horizontal_rgb_native_row::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - using_feature = 
AccelerationFeature::Neon; + _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4); + _dispatcher_row = convolve_horizontal_rgba_neon_row_one; } #[cfg(all( any(target_arch = "x86_64", target_arch = "x86"), @@ -447,22 +45,18 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f32, 4> { ))] { if is_x86_feature_detected!("sse4.1") { - using_feature = AccelerationFeature::Sse; - } - } - match using_feature { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - AccelerationFeature::Neon => { - convolve_horizontal_rgba_f32_neon(self, filter_weights, destination, pool); - } - AccelerationFeature::Native => { - convolve_horizontal_rgba_f32_native(self, filter_weights, destination, pool); - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - AccelerationFeature::Sse => { - convolve_horizontal_rgba_f32_sse(self, filter_weights, destination, pool); - } - } + _dispatcher_4_rows = Some(convolve_horizontal_rgba_sse_rows_4_f32); + _dispatcher_row = convolve_horizontal_rgba_sse_row_one_f32; + } + } + convolve_horizontal_dispatch_f32( + self, + filter_weights, + destination, + pool, + _dispatcher_4_rows, + _dispatcher_row, + ); } } @@ -473,12 +67,11 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f32, 4> { destination: &mut ImageStore, pool: &Option, ) { - #[allow(unused_assignments)] - #[allow(unused_mut)] - let mut using_feature = AccelerationFeature::Native; + let mut _dispatcher: fn(usize, &FilterBounds, *const f32, *mut f32, usize, *const f32) = + convolve_vertical_rgb_native_row_f32::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - using_feature = AccelerationFeature::Neon; + _dispatcher = convolve_vertical_rgb_neon_row_f32::<4>; } #[cfg(all( any(target_arch = "x86_64", target_arch = "x86"), @@ -486,26 +79,9 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f32, 4> { ))] { if is_x86_feature_detected!("sse4.1") { - using_feature = AccelerationFeature::Sse; - } - } - match using_feature { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - AccelerationFeature::Neon => { - convolve_vertical_rgba_f32_neon(self, filter_weights, destination, pool); - } - AccelerationFeature::Native => { - convolve_vertical_native_f32(self, filter_weights, destination, pool); - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - AccelerationFeature::Sse => { - crate::rgb_f32::convolve_vertical_sse_rgb_f32( - self, - filter_weights, - destination, - pool, - ); + _dispatcher = convolve_vertical_rgb_sse_row_f32; } } + convolve_vertical_dispatch_f32(self, filter_weights, destination, pool, _dispatcher); } } diff --git a/src/rgba_u8.rs b/src/rgba_u8.rs index 855b505..47bde4d 100644 --- a/src/rgba_u8.rs +++ b/src/rgba_u8.rs @@ -5,346 +5,22 @@ * // license that can be found in the LICENSE file. 
*/ -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use std::arch::aarch64::*; -use std::sync::Arc; - use rayon::ThreadPool; -use crate::acceleration_feature::AccelerationFeature; use crate::convolution::{HorizontalConvolutionPass, VerticalConvolutionPass}; -use crate::filter_weights::FilterWeights; +use crate::convolve_naive_u8::convolve_horizontal_rgba_native_row; +use crate::dispatch_group_u8::{convolve_horizontal_dispatch_u8, convolve_vertical_dispatch_u8}; +use crate::filter_weights::{FilterBounds, FilterWeights}; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -use crate::neon_simd_u8::*; +use crate::neon::*; use crate::rgb_u8::*; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::sse_rgb_u8::sse_rgb::*; -use crate::support::{PRECISION, ROUNDING_APPROX}; -use crate::unsafe_slice::UnsafeSlice; +use crate::sse::sse_rgb::{ + convolve_horizontal_rgba_sse_rows_4, convolve_horizontal_rgba_sse_rows_one, + convolve_vertical_rgb_sse_row, +}; use crate::ImageStore; -#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -fn convolve_horizontal_rgba_sse( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let approx_weights = filter_weights.numerical_approximation_i16::<12>(0); - - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - let dst_width = destination.width; - - let mut yy = 0usize; - - if let Some(pool) = pool { - let arc_weights = Arc::new(approx_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - let destination_height = destination.height; - let dst_width = destination.width; - pool.scope(|scope| { - let mut yy = 0usize; - for y in (0..destination_height.saturating_sub(4)).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - convolve_horizontal_rgba_sse_rows_4( - dst_width, - &weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - }); - yy = y; - } - for y in (yy..destination.height).step_by(4) { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - convolve_horizontal_rgba_sse_rows_one( - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - }); - } - }); - } else { - while yy < destination.height.saturating_sub(4) { - unsafe { - convolve_horizontal_rgba_sse_rows_4( - dst_width, - &approx_weights, - unsafe_source_ptr_0, - src_stride, - unsafe_destination_ptr_0, - dst_stride, - ); - } - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride * 4) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride * 4) }; - - yy += 4; - } - - for _ in yy..destination.height { - unsafe { - convolve_horizontal_rgba_sse_rows_one( - dst_width, - &approx_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - - 
unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -pub unsafe fn convolve_horizontal_rgba_neon_row( - dst_width: usize, - approx_weights: &FilterWeights, - unsafe_source_ptr_0: *const u8, - unsafe_destination_ptr_0: *mut u8, -) { - const CHANNELS: usize = 4; - let mut filter_offset = 0usize; - - let weights_ptr = approx_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = unsafe { approx_weights.bounds.get_unchecked(x) }; - let mut jx = 0usize; - let mut store = unsafe { vdupq_n_s32(ROUNDING_APPROX) }; - - while jx + 4 < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - unsafe { - let weight0 = vdup_n_s16(ptr.read_unaligned()); - let weight1 = vdupq_n_s16(ptr.add(1).read_unaligned()); - let weight2 = vdup_n_s16(ptr.add(2).read_unaligned()); - let weight3 = vdupq_n_s16(ptr.add(3).read_unaligned()); - store = neon_convolve_u8::convolve_horizontal_parts_4_rgba( - bounds.start + jx, - unsafe_source_ptr_0, - weight0, - weight1, - weight2, - weight3, - store, - ); - } - jx += 4; - } - - while jx < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - unsafe { - let weight0 = vdup_n_s16(ptr.read_unaligned()); - store = neon_convolve_u8::convolve_horizontal_parts_one_rgba( - bounds.start + jx, - unsafe_source_ptr_0, - weight0, - store, - ); - } - jx += 1; - } - - let store_16 = unsafe { vqshrun_n_s32::<12>(vmaxq_s32(store, vdupq_n_s32(0i32))) }; - let store_16_8 = unsafe { vqmovn_u16(vcombine_u16(store_16, store_16)) }; - - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - let value = unsafe { vget_lane_u32::<0>(vreinterpret_u32_u8(store_16_8)) }; - let dest_ptr_32 = dest_ptr as *mut u32; - unsafe { - dest_ptr_32.write_unaligned(value); - } - - filter_offset += approx_weights.aligned_size; - } -} - -#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] -fn convolve_horizontal_rgba_neon( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - pool: &Option, -) { - let approx_weights = filter_weights.numerical_approximation_i16::<12>(0); - - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - - let dst_width = destination.width; - - if let Some(pool) = pool { - let arc_weights = Arc::new(approx_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - let destination_height = destination.height; - let dst_width = destination.width; - pool.scope(|_| { - let weights = arc_weights.clone(); - for y in 0..destination_height { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - unsafe { - convolve_horizontal_rgba_neon_row( - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - } - }); - } else { - for _ in 0..destination.height { - unsafe { - convolve_horizontal_rgba_neon_row( - dst_width, - &approx_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - } - - unsafe_source_ptr_0 = unsafe { 
unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - -fn convolve_horizontal_rgba_native_row( - dst_width: usize, - filter_weights: &FilterWeights, - unsafe_source_ptr_0: *const u8, - unsafe_destination_ptr_0: *mut u8, -) { - const CHANNELS: usize = 4; - let mut filter_offset = 0usize; - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let mut sum_r = ROUNDING_APPROX; - let mut sum_g = ROUNDING_APPROX; - let mut sum_b = ROUNDING_APPROX; - let mut sum_a = ROUNDING_APPROX; - - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let start_x = bounds.start; - for j in 0..bounds.size { - let px = (start_x + j) * CHANNELS; - let weight = unsafe { weights_ptr.add(j + filter_offset).read_unaligned() } as i32; - let src = unsafe { unsafe_source_ptr_0.add(px) }; - sum_r += unsafe { src.read_unaligned() } as i32 * weight; - sum_g += unsafe { src.add(1).read_unaligned() } as i32 * weight; - sum_b += unsafe { src.add(2).read_unaligned() } as i32 * weight; - sum_a += unsafe { src.add(3).read_unaligned() } as i32 * weight; - } - - let px = x * CHANNELS; - - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - - unsafe { - dest_ptr.write_unaligned((sum_r >> PRECISION).min(255).max(0) as u8); - dest_ptr - .add(1) - .write_unaligned((sum_g >> PRECISION).min(255).max(0) as u8); - dest_ptr - .add(2) - .write_unaligned((sum_b >> PRECISION).min(255).max(0) as u8); - dest_ptr - .add(3) - .write_unaligned((sum_a >> PRECISION).min(255).max(0) as u8); - } - - filter_offset += filter_weights.aligned_size; - } -} - -fn convolve_horizontal_rgba_native( - image_store: &ImageStore, - filter_weights: FilterWeights, - destination: &mut ImageStore, - _pool: &Option, -) { - let approx_weights = filter_weights.numerical_approximation_i16::<12>(0); - - let mut unsafe_source_ptr_0 = image_store.buffer.borrow().as_ptr(); - let mut unsafe_destination_ptr_0 = destination.buffer.borrow_mut().as_mut_ptr(); - - let dst_width = destination.width; - - let src_stride = image_store.width * image_store.channels; - let dst_stride = destination.width * image_store.channels; - - if let Some(pool) = _pool { - let arc_weights = Arc::new(approx_weights); - let borrowed = destination.buffer.borrow_mut(); - let unsafe_slice = UnsafeSlice::new(borrowed); - pool.scope(|scope| { - for y in 0..destination.height { - let weights = arc_weights.clone(); - scope.spawn(move |_| { - let unsafe_source_ptr_0 = - unsafe { image_store.buffer.borrow().as_ptr().add(src_stride * y) }; - let dst_ptr = unsafe_slice.mut_ptr(); - let unsafe_destination_ptr_0 = unsafe { dst_ptr.add(dst_stride * y) }; - convolve_horizontal_rgba_native_row( - dst_width, - &weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - }); - } - }); - } else { - for _ in 0..destination.height { - convolve_horizontal_rgba_native_row( - dst_width, - &approx_weights, - unsafe_source_ptr_0, - unsafe_destination_ptr_0, - ); - - unsafe_source_ptr_0 = unsafe { unsafe_source_ptr_0.add(src_stride) }; - unsafe_destination_ptr_0 = unsafe { unsafe_destination_ptr_0.add(dst_stride) }; - } - } -} - impl<'a> HorizontalConvolutionPass for ImageStore<'a, u8, 4> { fn convolve_horizontal( &self, @@ -352,33 +28,35 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, u8, 4> { destination: &mut ImageStore, _pool: &Option, ) { - let mut _using_feature = AccelerationFeature::Native; + let mut _dispatcher_4_rows: Option< + fn(usize, usize, &FilterWeights, *const u8, 
usize, *mut u8, usize), + > = None; + let mut _dispatcher_1_row: fn(usize, usize, &FilterWeights, *const u8, *mut u8) = + convolve_horizontal_rgba_native_row::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - _using_feature = AccelerationFeature::Neon; + _dispatcher_4_rows = Some(convolve_horizontal_rgba_neon_rows_4_u8); + _dispatcher_1_row = convolve_horizontal_rgba_neon_row; } #[cfg(all( any(target_arch = "x86_64", target_arch = "x86"), target_feature = "sse4.1" ))] { + _dispatcher_4_rows = Some(convolve_horizontal_rgba_native_4_row::<4>); if is_x86_feature_detected!("sse4.1") { - _using_feature = AccelerationFeature::Sse; - } - } - match _using_feature { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - AccelerationFeature::Neon => { - convolve_horizontal_rgba_neon(self, filter_weights, destination, _pool); - } - AccelerationFeature::Native => { - convolve_horizontal_rgba_native(self, filter_weights, destination, _pool); - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - AccelerationFeature::Sse => { - convolve_horizontal_rgba_sse(self, filter_weights, destination, _pool); + _dispatcher_4_rows = Some(convolve_horizontal_rgba_sse_rows_4); + _dispatcher_1_row = convolve_horizontal_rgba_sse_rows_one; } } + convolve_horizontal_dispatch_u8( + self, + filter_weights, + destination, + _pool, + _dispatcher_4_rows, + _dispatcher_1_row, + ); } } @@ -389,10 +67,17 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, u8, 4> { fn convolve_vertical( &self, filter_weights: FilterWeights, destination: &mut ImageStore, pool: &Option, ) { - let mut _using_feature = AccelerationFeature::Native; + let mut _dispatcher: fn( + dst_width: usize, + bounds: &FilterBounds, + unsafe_source_ptr_0: *const u8, + unsafe_destination_ptr_0: *mut u8, + src_stride: usize, + weight_ptr: *const i16, + ) = convolve_vertical_rgb_native_row_u8::<4>; #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { - _using_feature = AccelerationFeature::Neon; + _dispatcher = convolve_vertical_rgb_neon_row::<4>; } #[cfg(all( any(target_arch = "x86_64", target_arch = "x86"), @@ -400,21 +85,9 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, u8, 4> { ))] { if is_x86_feature_detected!("sse4.1") { - _using_feature = AccelerationFeature::Sse; - } - } - match _using_feature { - #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] - AccelerationFeature::Neon => { - convolve_vertical_rgb_neon(self, filter_weights, destination, pool); - } - AccelerationFeature::Native => { - convolve_vertical_rgb_native_8(self, filter_weights, destination, pool); - } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] - AccelerationFeature::Sse => { - convolve_vertical_rgb_sse_8(self, filter_weights, destination, pool); + _dispatcher = convolve_vertical_rgb_sse_row; } } + convolve_vertical_dispatch_u8(self, filter_weights, destination, pool, _dispatcher); } } diff --git a/src/scaler.rs b/src/scaler.rs index 9b838ea..75ea329 100644 --- a/src/scaler.rs +++ b/src/scaler.rs @@ -28,7 +28,7 @@ pub trait Scaling { fn resize_rgb(&self, new_size: ImageSize, store: ImageStore) -> ImageStore; /// Performs rescaling for RGB f32, channel order does not matter fn resize_rgb_f32(&self, new_size: ImageSize, store: ImageStore) -> ImageStore; - /// Performs rescaling for RGBA, for pre-multiplying alpha, converting to LUV, LAB alpha must be last channel + /// Performs rescaling for RGBA; when pre-multiplying alpha or converting to LUV or LAB, the alpha channel must be last fn resize_rgba( &self, new_size: ImageSize, diff --git a/src/sse/mod.rs b/src/sse/mod.rs new 
file mode 100644 index 0000000..0c4fc30 --- /dev/null +++ b/src/sse/mod.rs @@ -0,0 +1,9 @@ +mod rgb_f32; +mod rgb_u8; +mod simd_u8; +mod utils; + +pub use rgb_f32::*; +pub use rgb_u8::*; +pub use simd_u8::*; +pub use utils::*; diff --git a/src/sse_rgb_f32.rs b/src/sse/rgb_f32.rs similarity index 69% rename from src/sse_rgb_f32.rs rename to src/sse/rgb_f32.rs index dd1a3bc..eda46b8 100644 --- a/src/sse_rgb_f32.rs +++ b/src/sse/rgb_f32.rs @@ -52,34 +52,35 @@ pub mod sse_convolve_f32 { acc } - pub unsafe fn convolve_horizontal_rgba_sse_rows_4( + pub(crate) fn convolve_horizontal_rgba_sse_rows_4_f32( dst_width: usize, + _: usize, filter_weights: &FilterWeights, unsafe_source_ptr_0: *const f32, src_stride: usize, unsafe_destination_ptr_0: *mut f32, dst_stride: usize, ) { - const CHANNELS: usize = 4; - let mut filter_offset = 0usize; - let zeros = unsafe { _mm_setzero_ps() }; - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let mut jx = 0usize; - let mut store_0 = zeros; - let mut store_1 = zeros; - let mut store_2 = zeros; - let mut store_3 = zeros; - - while jx + 4 < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let weight0 = unsafe { _mm_set1_ps(ptr.read_unaligned()) }; - let weight1 = unsafe { _mm_set1_ps(ptr.add(1).read_unaligned()) }; - let weight2 = unsafe { _mm_set1_ps(ptr.add(2).read_unaligned()) }; - let weight3 = unsafe { _mm_set1_ps(ptr.add(3).read_unaligned()) }; - unsafe { + unsafe { + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + let zeros = _mm_setzero_ps(); + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = zeros; + let mut store_1 = zeros; + let mut store_2 = zeros; + let mut store_3 = zeros; + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = _mm_set1_ps(ptr.read_unaligned()); + let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned()); + let weight2 = _mm_set1_ps(ptr.add(2).read_unaligned()); + let weight3 = _mm_set1_ps(ptr.add(3).read_unaligned()); store_0 = convolve_horizontal_parts_4_rgba_f32( bounds.start, unsafe_source_ptr_0, @@ -116,13 +117,11 @@ pub mod sse_convolve_f32 { weight3, store_3, ); + jx += 4; } - jx += 4; - } - while jx < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let weight0 = unsafe { _mm_set1_ps(ptr.read_unaligned()) }; - unsafe { + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = _mm_set1_ps(ptr.read_unaligned()); store_0 = convolve_horizontal_parts_one_rgba_f32( bounds.start, unsafe_source_ptr_0, @@ -147,57 +146,50 @@ pub mod sse_convolve_f32 { weight0, store_3, ); + jx += 1; } - jx += 1; - } - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - unsafe { + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); _mm_storeu_ps(dest_ptr, store_0); - } - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride) }; - unsafe { + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); _mm_storeu_ps(dest_ptr, store_1); - } - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 2) }; - unsafe { + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); _mm_storeu_ps(dest_ptr, store_2); - } - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 3) }; - unsafe { + 
let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); _mm_storeu_ps(dest_ptr, store_3); - } - filter_offset += filter_weights.aligned_size; + filter_offset += filter_weights.aligned_size; + } } } - pub unsafe fn convolve_horizontal_rgba_sse_row_one( + pub fn convolve_horizontal_rgba_sse_row_one_f32( dst_width: usize, + _: usize, filter_weights: &FilterWeights, unsafe_source_ptr_0: *const f32, unsafe_destination_ptr_0: *mut f32, ) { - const CHANNELS: usize = 4; - let mut filter_offset = 0usize; - let weights_ptr = filter_weights.weights.as_ptr(); - - for x in 0..dst_width { - let bounds = unsafe { filter_weights.bounds.get_unchecked(x) }; - let mut jx = 0usize; - let mut store = unsafe { _mm_setzero_ps() }; - - while jx + 4 < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let weight0 = unsafe { _mm_set1_ps(ptr.read_unaligned()) }; - let weight1 = unsafe { _mm_set1_ps(ptr.add(1).read_unaligned()) }; - let weight2 = unsafe { _mm_set1_ps(ptr.add(2).read_unaligned()) }; - let weight3 = unsafe { _mm_set1_ps(ptr.add(3).read_unaligned()) }; - unsafe { + unsafe { + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store = _mm_setzero_ps(); + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = _mm_set1_ps(ptr.read_unaligned()); + let weight1 = _mm_set1_ps(ptr.add(1).read_unaligned()); + let weight2 = _mm_set1_ps(ptr.add(2).read_unaligned()); + let weight3 = _mm_set1_ps(ptr.add(3).read_unaligned()); store = convolve_horizontal_parts_4_rgba_f32( bounds.start, unsafe_source_ptr_0, @@ -207,30 +199,26 @@ pub mod sse_convolve_f32 { weight3, store, ); + jx += 4; } - jx += 4; - } - while jx < bounds.size { - let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; - let weight0 = unsafe { _mm_set1_ps(ptr.read_unaligned()) }; - unsafe { + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = _mm_set1_ps(ptr.read_unaligned()); store = convolve_horizontal_parts_one_rgba_f32( bounds.start, unsafe_source_ptr_0, weight0, store, ); + jx += 1; } - jx += 1; - } - let px = x * CHANNELS; - let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; - unsafe { + let px = x * CHANNELS; + let dest_ptr = unsafe_destination_ptr_0.add(px); _mm_storeu_ps(dest_ptr, store); - } - filter_offset += filter_weights.aligned_size; + filter_offset += filter_weights.aligned_size; + } } } @@ -374,7 +362,81 @@ pub mod sse_convolve_f32 { dst_ptr.write_unaligned(f32::from_bits(_mm_extract_ps::<0>(store_0) as u32)); } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] + #[inline(always)] + pub(crate) fn convolve_vertical_rgb_sse_row_f32( + dst_width: usize, + bounds: &FilterBounds, + unsafe_source_ptr_0: *const f32, + unsafe_destination_ptr_0: *mut f32, + src_stride: usize, + weight_ptr: *const f32, + ) { + let mut cx = 0usize; + + while cx + 16 < dst_width { + unsafe { + convolve_vertical_part_sse_16_f32( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + ); + } + + cx += 16; + } + + while cx + 8 < dst_width { + unsafe { + convolve_vertical_part_sse_8_f32( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + ); + } + + cx += 8; + } + + while cx + 4 < dst_width { + unsafe { + convolve_vertical_part_sse_4_f32( + 
bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + ); + } + + cx += 4; + } + + while cx < dst_width { + unsafe { + convolve_vertical_part_sse_f32( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + ); + } + cx += 1; + } + } + #[cfg(not(target_feature = "fma"))] #[inline] #[allow(dead_code)] @@ -382,7 +444,6 @@ pub mod sse_convolve_f32 { return _mm_add_ps(_mm_mul_ps(b, c), a); } - #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] #[cfg(target_feature = "fma")] #[inline] #[allow(dead_code)] diff --git a/src/sse/rgb_u8.rs b/src/sse/rgb_u8.rs new file mode 100644 index 0000000..515e882 --- /dev/null +++ b/src/sse/rgb_u8.rs @@ -0,0 +1,678 @@ +/* + * // Copyright (c) the Radzivon Bartoshyk. All rights reserved. + * // + * // Use of this source code is governed by a BSD-style + * // license that can be found in the LICENSE file. + */ + +#[cfg(any(target_arch = "x86_64", target_arch = "x86"))] +pub mod sse_rgb { + use crate::filter_weights::{FilterBounds, FilterWeights}; + use crate::sse::sse_convolve_u8; + use crate::support::ROUNDING_APPROX; + #[cfg(target_arch = "x86")] + use std::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use std::arch::x86_64::*; + + pub(crate) fn convolve_horizontal_rgba_sse_rows_4( + dst_width: usize, + _: usize, + approx_weights: &FilterWeights, + unsafe_source_ptr_0: *const u8, + src_stride: usize, + unsafe_destination_ptr_0: *mut u8, + dst_stride: usize, + ) { + unsafe { + const CHANNELS: usize = 4; + let mut filter_offset = 0usize; + let weights_ptr = approx_weights.weights.as_ptr(); + + #[rustfmt::skip] + let shuffle_lo =_mm_setr_epi8(0, -1, + 4, -1, + 1, -1, + 5, -1, + 2, -1 , + 6,-1, + 3, -1, + 7, -1); + + #[rustfmt::skip] + let shuffle_hi =_mm_setr_epi8(8, -1, + 12, -1, + 9, -1, + 13, -1 , + 10,-1, + 14, -1, + 11, -1, + 15, -1); + + let vld = _mm_set1_epi32(ROUNDING_APPROX); + + for x in 0..dst_width { + let bounds = unsafe { approx_weights.bounds.get_unchecked(x) }; + let mut jx = 0usize; + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned()); + let weight23 = _mm_set1_epi32((ptr.add(2) as *const i32).read_unaligned()); + let start_bounds = bounds.start + jx; + + let src_ptr = unsafe_source_ptr_0.add(start_bounds * CHANNELS); + let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i); + + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo, weight01)); + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi, weight23)); + + let rgb_pixel = _mm_loadu_si128(src_ptr.add(src_stride) as *const __m128i); + + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo, weight01)); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi, weight23)); + + let rgb_pixel = _mm_loadu_si128(src_ptr.add(src_stride * 2) as *const __m128i); + + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi, weight23)); + + let rgb_pixel = _mm_loadu_si128(src_ptr.add(src_stride * 3) as *const __m128i); + + 
let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi, weight23)); + jx += 4; + } + + while jx + 2 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let bounds_start = bounds.start + jx; + + let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned()); + let src_ptr = unsafe_source_ptr_0.add(bounds_start * CHANNELS); + let rgb_pixel = _mm_loadu_si64(src_ptr); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo, weight01)); + + let rgb_pixel = _mm_loadu_si64(src_ptr.add(src_stride)); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo, weight01)); + + let rgb_pixel = _mm_loadu_si64(src_ptr.add(src_stride * 2)); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo, weight01)); + + let rgb_pixel = _mm_loadu_si64(src_ptr.add(src_stride * 3)); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo, weight01)); + jx += 2; + } + + while jx < bounds.size { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + let weight0 = _mm_set1_epi32(ptr.read_unaligned() as i32); + let start_bounds = bounds.start + jx; + store_0 = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse( + start_bounds, + unsafe_source_ptr_0, + weight0, + store_0, + ); + store_1 = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse( + start_bounds, + unsafe_source_ptr_0.add(src_stride), + weight0, + store_1, + ); + store_2 = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse( + start_bounds, + unsafe_source_ptr_0.add(src_stride * 2), + weight0, + store_2, + ); + store_3 = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse( + start_bounds, + unsafe_source_ptr_0.add(src_stride * 3), + weight0, + store_3, + ); + jx += 1; + } + let store_16_8 = sse_convolve_u8::compress_i32(store_0); + let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) }; + + let px = x * CHANNELS; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; + let dest_ptr_32 = dest_ptr as *mut i32; + dest_ptr_32.write_unaligned(pixel); + + let store_16_8 = sse_convolve_u8::compress_i32(store_1); + let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) }; + + let px = x * CHANNELS; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride) }; + let dest_ptr_32 = dest_ptr as *mut i32; + dest_ptr_32.write_unaligned(pixel); + + let store_16_8 = sse_convolve_u8::compress_i32(store_2); + let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) }; + + let px = x * CHANNELS; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 2) }; + let dest_ptr_32 = dest_ptr as *mut i32; + dest_ptr_32.write_unaligned(pixel); + + let store_16_8 = sse_convolve_u8::compress_i32(store_3); + let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) }; + + let px = x * CHANNELS; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 3) }; + let dest_ptr_32 = dest_ptr as *mut i32; + dest_ptr_32.write_unaligned(pixel); + + filter_offset += approx_weights.aligned_size; + } + } + } + + pub(crate) fn convolve_horizontal_rgba_sse_rows_one( + dst_width: usize, + _: usize, + approx_weights: &FilterWeights, + unsafe_source_ptr_0: *const u8, + unsafe_destination_ptr_0: *mut u8, + ) { + unsafe { + const CHANNELS: usize = 4; + let mut 
filter_offset = 0usize; + let weights_ptr = approx_weights.weights.as_ptr(); + + #[rustfmt::skip] + let shuffle_lo =_mm_setr_epi8(0, -1, + 4, -1, + 1, -1, + 5, -1, + 2, -1 , + 6,-1, + 3, -1, + 7, -1); + + #[rustfmt::skip] + let shuffle_hi =_mm_setr_epi8(8, -1, + 12, -1, + 9, -1, + 13, -1 , + 10,-1, + 14, -1, + 11, -1, + 15, -1); + + let vld = _mm_set1_epi32(ROUNDING_APPROX); + + for x in 0..dst_width { + let bounds = unsafe { approx_weights.bounds.get_unchecked(x) }; + let mut jx = 0usize; + let mut store = vld; + + while jx + 4 < bounds.size { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + let bounds_start = bounds.start + jx; + let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned()); + let weight23 = _mm_set1_epi32((ptr.add(2) as *const i32).read_unaligned()); + + let src_ptr = unsafe_source_ptr_0.add(bounds_start * CHANNELS); + let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i); + + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01)); + store = _mm_add_epi32(store, _mm_madd_epi16(hi, weight23)); + jx += 4; + } + + while jx + 2 < bounds.size { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + let bounds_start = bounds.start + jx; + let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned()); + let src_ptr = unsafe_source_ptr_0.add(bounds_start * CHANNELS); + let rgb_pixel = _mm_loadu_si64(src_ptr); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01)); + jx += 2; + } + + while jx < bounds.size { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + let weight0 = _mm_set1_epi32(ptr.read_unaligned() as i32); + store = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse( + bounds.start + jx, + unsafe_source_ptr_0, + weight0, + store, + ); + jx += 1; + } + + let store_16_8 = sse_convolve_u8::compress_i32(store); + let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) }; + + let px = x * CHANNELS; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; + let dest_ptr_32 = dest_ptr as *mut i32; + dest_ptr_32.write_unaligned(pixel); + + filter_offset += approx_weights.aligned_size; + } + } + } + + pub fn convolve_horizontal_rgb_sse_rows_4( + src_width: usize, + dst_width: usize, + approx_weights: &FilterWeights, + unsafe_source_ptr_0: *const u8, + src_stride: usize, + unsafe_destination_ptr_0: *mut u8, + dst_stride: usize, + ) { + unsafe { + const CHANNES: usize = 3; + let mut filter_offset = 0usize; + let weights_ptr = approx_weights.weights.as_ptr(); + + #[rustfmt::skip] + let shuffle_lo = unsafe { _mm_setr_epi8(0, -1, + 3, -1, + 1, -1, + 4, -1, + 2, -1 , + 5,-1, + -1, -1, + -1, -1) }; + + #[rustfmt::skip] + let shuffle_hi = unsafe { _mm_setr_epi8(6, -1, + 9, -1, + 7, -1, + 10, -1 , + 8,-1, + 11, -1, + -1, -1, + -1, -1) }; + + let vld = unsafe { _mm_set1_epi32(ROUNDING_APPROX) }; + + for x in 0..dst_width { + let bounds = unsafe { approx_weights.bounds.get_unchecked(x) }; + let mut jx = 0usize; + let mut store_0 = vld; + let mut store_1 = vld; + let mut store_2 = vld; + let mut store_3 = vld; + + // Will make step in 4 items however since it is RGB it is necessary to make a safe offset + while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + unsafe { + let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned()); + let weight23 = _mm_set1_epi32((ptr.add(2) as *const 
i32).read_unaligned()); + let bounds_start = bounds.start + jx; + + let src_ptr_0 = unsafe_source_ptr_0.add(bounds_start * CHANNES); + + let rgb_pixel = _mm_loadu_si128(src_ptr_0 as *const __m128i); + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo, weight01)); + store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi, weight23)); + + let src_ptr = src_ptr_0.add(src_stride); + let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i); + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo, weight01)); + store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi, weight23)); + + let src_ptr = src_ptr_0.add(src_stride * 2); + let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i); + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo, weight01)); + store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi, weight23)); + + let src_ptr = src_ptr_0.add(src_stride * 3); + let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i); + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo, weight01)); + store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi, weight23)); + } + jx += 4; + } + + while jx + 2 < bounds.size && bounds.start + jx + 3 < src_width { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + unsafe { + let bounds_start = bounds.start + jx; + let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned()); + store_0 = sse_convolve_u8::convolve_horizontal_parts_two_sse_rgb( + bounds_start, + unsafe_source_ptr_0, + weight01, + store_0, + shuffle_lo, + ); + store_1 = sse_convolve_u8::convolve_horizontal_parts_two_sse_rgb( + bounds_start, + unsafe_source_ptr_0.add(src_stride), + weight01, + store_1, + shuffle_lo, + ); + store_2 = sse_convolve_u8::convolve_horizontal_parts_two_sse_rgb( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 2), + weight01, + store_2, + shuffle_lo, + ); + store_3 = sse_convolve_u8::convolve_horizontal_parts_two_sse_rgb( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 3), + weight01, + store_3, + shuffle_lo, + ); + } + jx += 2; + } + + while jx < bounds.size { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + let bounds_start = bounds.start + jx; + unsafe { + let weight0 = _mm_set1_epi32(ptr.read_unaligned() as i32); + store_0 = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb( + bounds_start, + unsafe_source_ptr_0, + weight0, + store_0, + ); + store_1 = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb( + bounds_start, + unsafe_source_ptr_0.add(src_stride), + weight0, + store_1, + ); + store_2 = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 2), + weight0, + store_2, + ); + store_3 = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb( + bounds_start, + unsafe_source_ptr_0.add(src_stride * 3), + weight0, + store_3, + ); + } + jx += 1; + } + let store_0_8 = sse_convolve_u8::compress_i32(store_0); + + let px = x * CHANNES; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; + + let element = unsafe { _mm_extract_epi32::<0>(store_0_8) }; + let bytes = element.to_le_bytes(); + unsafe { + dest_ptr.write_unaligned(bytes[0]); + 
dest_ptr.add(1).write_unaligned(bytes[1]); + dest_ptr.add(2).write_unaligned(bytes[2]); + } + + let store_1_8 = sse_convolve_u8::compress_i32(store_1); + + let px = x * CHANNES; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride) }; + + let element = unsafe { _mm_extract_epi32::<0>(store_1_8) }; + let bytes = element.to_le_bytes(); + unsafe { + dest_ptr.write_unaligned(bytes[0]); + dest_ptr.add(1).write_unaligned(bytes[1]); + dest_ptr.add(2).write_unaligned(bytes[2]); + } + + let store_2_8 = sse_convolve_u8::compress_i32(store_2); + + let px = x * CHANNES; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 2) }; + + let element = unsafe { _mm_extract_epi32::<0>(store_2_8) }; + let bytes = element.to_le_bytes(); + unsafe { + dest_ptr.write_unaligned(bytes[0]); + dest_ptr.add(1).write_unaligned(bytes[1]); + dest_ptr.add(2).write_unaligned(bytes[2]); + } + + let store_3_8 = sse_convolve_u8::compress_i32(store_3); + + let px = x * CHANNES; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 3) }; + + let element = unsafe { _mm_extract_epi32::<0>(store_3_8) }; + let bytes = element.to_le_bytes(); + unsafe { + dest_ptr.write_unaligned(bytes[0]); + dest_ptr.add(1).write_unaligned(bytes[1]); + dest_ptr.add(2).write_unaligned(bytes[2]); + } + + filter_offset += approx_weights.aligned_size; + } + } + } + + pub fn convolve_horizontal_rgb_sse_row_one( + src_width: usize, + dst_width: usize, + approx_weights: &FilterWeights, + unsafe_source_ptr_0: *const u8, + unsafe_destination_ptr_0: *mut u8, + ) { + const CHANNELS: usize = 3; + let mut filter_offset = 0usize; + let weights_ptr = approx_weights.weights.as_ptr(); + + #[rustfmt::skip] + let shuffle_lo = unsafe { _mm_setr_epi8(0, -1, + 3, -1, + 1, -1, + 4, -1, + 2, -1 , + 5,-1, + -1, -1, + -1, -1) }; + + #[rustfmt::skip] + let shuffle_hi = unsafe { _mm_setr_epi8(6, -1, + 9, -1, + 7, -1, + 10, -1 , + 8,-1, + 11, -1, + -1, -1, + -1, -1) }; + + for x in 0..dst_width { + let bounds = unsafe { approx_weights.bounds.get_unchecked(x) }; + let mut jx = 0usize; + let mut store = unsafe { _mm_setzero_si128() }; + + while jx + 4 < bounds.size && x + 6 < src_width { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + unsafe { + let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned()); + let weight23 = _mm_set1_epi32((ptr.add(2) as *const i32).read_unaligned()); + let bounds_start = bounds.start + jx; + let src_ptr_0 = unsafe_source_ptr_0.add(bounds_start * CHANNELS); + + let rgb_pixel = _mm_loadu_si128(src_ptr_0 as *const __m128i); + let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + + store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01)); + store = _mm_add_epi32(store, _mm_madd_epi16(hi, weight23)); + } + jx += 4; + } + + while jx + 2 < bounds.size && x + 3 < src_width { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + unsafe { + let weight0 = _mm_set1_epi32((ptr as *const i32).read_unaligned()); + let src_ptr = unsafe_source_ptr_0.add((bounds.start + jx) * 3); + let rgb_pixel = _mm_loadu_si64(src_ptr); + let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo); + store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight0)); + } + jx += 2; + } + + while jx < bounds.size { + let ptr = unsafe { weights_ptr.add(jx + filter_offset) }; + unsafe { + let weight0 = _mm_set1_epi32(ptr.read_unaligned() as i32); + store = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb( + bounds.start + jx, + unsafe_source_ptr_0, + weight0, + 
store, + ); + } + jx += 1; + } + + let store_16_8 = sse_convolve_u8::compress_i32(store); + + let px = x * CHANNELS; + let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) }; + + let element = unsafe { _mm_extract_epi32::<0>(store_16_8) }; + let bytes = element.to_le_bytes(); + unsafe { + dest_ptr.write_unaligned(bytes[0]); + dest_ptr.add(1).write_unaligned(bytes[1]); + dest_ptr.add(2).write_unaligned(bytes[2]); + } + unsafe { + dest_ptr.write_unaligned(bytes[0]); + dest_ptr.add(1).write_unaligned(bytes[1]); + dest_ptr.add(2).write_unaligned(bytes[2]); + } + + filter_offset += approx_weights.aligned_size; + } + } + + #[inline] + pub(crate) fn convolve_vertical_rgb_sse_row( + total_width: usize, + bounds: &FilterBounds, + unsafe_source_ptr_0: *const u8, + unsafe_destination_ptr_0: *mut u8, + src_stride: usize, + weight_ptr: *const i16, + ) { + let mut cx = 0usize; + + while cx + 32 < total_width { + unsafe { + sse_convolve_u8::convolve_vertical_part_sse_32( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + ); + } + + cx += 32; + } + + while cx + 16 < total_width { + unsafe { + sse_convolve_u8::convolve_vertical_part_sse_16( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + ); + } + + cx += 16; + } + + while cx + 8 < total_width { + unsafe { + sse_convolve_u8::convolve_vertical_part_sse_8::( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + 8, + ); + } + + cx += 8; + } + + let left = total_width - cx; + if left > 0 { + unsafe { + sse_convolve_u8::convolve_vertical_part_sse_8::( + bounds.start, + cx, + unsafe_source_ptr_0, + src_stride, + unsafe_destination_ptr_0, + weight_ptr, + bounds, + left, + ); + } + } + } +} diff --git a/src/sse_simd_u8.rs b/src/sse/simd_u8.rs similarity index 100% rename from src/sse_simd_u8.rs rename to src/sse/simd_u8.rs diff --git a/src/sse_utils.rs b/src/sse/utils.rs similarity index 100% rename from src/sse_utils.rs rename to src/sse/utils.rs diff --git a/src/sse_rgb_u8.rs b/src/sse_rgb_u8.rs deleted file mode 100644 index c59bb50..0000000 --- a/src/sse_rgb_u8.rs +++ /dev/null @@ -1,692 +0,0 @@ -/* - * // Copyright (c) the Radzivon Bartoshyk. All rights reserved. - * // - * // Use of this source code is governed by a BSD-style - * // license that can be found in the LICENSE file. 
- */
-
-#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
-pub mod sse_rgb {
-    use crate::filter_weights::{FilterBounds, FilterWeights};
-    use crate::sse_simd_u8::sse_convolve_u8;
-    use crate::support::ROUNDING_APPROX;
-    #[cfg(target_arch = "x86")]
-    use std::arch::x86::*;
-    #[cfg(target_arch = "x86_64")]
-    use std::arch::x86_64::*;
-
-    pub(crate) unsafe fn convolve_horizontal_rgba_sse_rows_4(
-        dst_width: usize,
-        approx_weights: &FilterWeights,
-        unsafe_source_ptr_0: *const u8,
-        src_stride: usize,
-        unsafe_destination_ptr_0: *mut u8,
-        dst_stride: usize,
-    ) {
-        const CHANNELS: usize = 4;
-        let mut filter_offset = 0usize;
-        let weights_ptr = approx_weights.weights.as_ptr();
-
-        #[rustfmt::skip]
-        let shuffle_lo =_mm_setr_epi8(0, -1,
-            4, -1,
-            1, -1,
-            5, -1,
-            2, -1 ,
-            6,-1,
-            3, -1,
-            7, -1);
-
-        #[rustfmt::skip]
-        let shuffle_hi =_mm_setr_epi8(8, -1,
-            12, -1,
-            9, -1,
-            13, -1 ,
-            10,-1,
-            14, -1,
-            11, -1,
-            15, -1);
-
-        let vld = unsafe { _mm_set1_epi32(ROUNDING_APPROX) };
-
-        for x in 0..dst_width {
-            let bounds = unsafe { approx_weights.bounds.get_unchecked(x) };
-            let mut jx = 0usize;
-            let mut store_0 = vld;
-            let mut store_1 = vld;
-            let mut store_2 = vld;
-            let mut store_3 = vld;
-
-            while jx + 4 < bounds.size {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned());
-                    let weight23 = _mm_set1_epi32((ptr.add(2) as *const i32).read_unaligned());
-                    let start_bounds = bounds.start + jx;
-
-                    let src_ptr = unsafe_source_ptr_0.add(start_bounds * CHANNELS);
-                    let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i);
-
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo, weight01));
-                    store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi, weight23));
-
-                    let rgb_pixel = _mm_loadu_si128(src_ptr.add(src_stride) as *const __m128i);
-
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo, weight01));
-                    store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi, weight23));
-
-                    let rgb_pixel = _mm_loadu_si128(src_ptr.add(src_stride * 2) as *const __m128i);
-
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo, weight01));
-                    store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi, weight23));
-
-                    let rgb_pixel = _mm_loadu_si128(src_ptr.add(src_stride * 3) as *const __m128i);
-
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo, weight01));
-                    store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi, weight23));
-                }
-                jx += 4;
-            }
-
-            while jx + 2 < bounds.size {
-                unsafe {
-                    let ptr = weights_ptr.add(jx + filter_offset);
-                    let bounds_start = bounds.start + jx;
-
-                    let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned());
-                    let src_ptr = unsafe_source_ptr_0.add(bounds_start * CHANNELS);
-                    let rgb_pixel = _mm_loadu_si64(src_ptr);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-                    store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo, weight01));
-
-                    let rgb_pixel = _mm_loadu_si64(src_ptr.add(src_stride));
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-                    store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo, weight01));
-
-                    let rgb_pixel = _mm_loadu_si64(src_ptr.add(src_stride * 2));
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-                    store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo, weight01));
-
-                    let rgb_pixel = _mm_loadu_si64(src_ptr.add(src_stride * 3));
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-                    store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo, weight01));
-                }
-                jx += 2;
-            }
-
-            while jx < bounds.size {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let weight0 = _mm_set1_epi32(ptr.read_unaligned() as i32);
-                    let start_bounds = bounds.start + jx;
-                    store_0 = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse(
-                        start_bounds,
-                        unsafe_source_ptr_0,
-                        weight0,
-                        store_0,
-                    );
-                    store_1 = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse(
-                        start_bounds,
-                        unsafe_source_ptr_0.add(src_stride),
-                        weight0,
-                        store_1,
-                    );
-                    store_2 = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse(
-                        start_bounds,
-                        unsafe_source_ptr_0.add(src_stride * 2),
-                        weight0,
-                        store_2,
-                    );
-                    store_3 = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse(
-                        start_bounds,
-                        unsafe_source_ptr_0.add(src_stride * 3),
-                        weight0,
-                        store_3,
-                    );
-                }
-                jx += 1;
-            }
-            let store_16_8 = sse_convolve_u8::compress_i32(store_0);
-            let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) };
-
-            let px = x * CHANNELS;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) };
-            let dest_ptr_32 = dest_ptr as *mut i32;
-            unsafe {
-                dest_ptr_32.write_unaligned(pixel);
-            }
-
-            let store_16_8 = sse_convolve_u8::compress_i32(store_1);
-            let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) };
-
-            let px = x * CHANNELS;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride) };
-            let dest_ptr_32 = dest_ptr as *mut i32;
-            unsafe {
-                dest_ptr_32.write_unaligned(pixel);
-            }
-
-            let store_16_8 = sse_convolve_u8::compress_i32(store_2);
-            let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) };
-
-            let px = x * CHANNELS;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 2) };
-            let dest_ptr_32 = dest_ptr as *mut i32;
-            unsafe {
-                dest_ptr_32.write_unaligned(pixel);
-            }
-
-            let store_16_8 = sse_convolve_u8::compress_i32(store_3);
-            let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) };
-
-            let px = x * CHANNELS;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 3) };
-            let dest_ptr_32 = dest_ptr as *mut i32;
-            unsafe {
-                dest_ptr_32.write_unaligned(pixel);
-            }
-
-            filter_offset += approx_weights.aligned_size;
-        }
-    }
-
-    pub(crate) unsafe fn convolve_horizontal_rgba_sse_rows_one(
-        dst_width: usize,
-        approx_weights: &FilterWeights,
-        unsafe_source_ptr_0: *const u8,
-        unsafe_destination_ptr_0: *mut u8,
-    ) {
-        const CHANNELS: usize = 4;
-        let mut filter_offset = 0usize;
-        let weights_ptr = approx_weights.weights.as_ptr();
-
-        #[rustfmt::skip]
-        let shuffle_lo =_mm_setr_epi8(0, -1,
-            4, -1,
-            1, -1,
-            5, -1,
-            2, -1 ,
-            6,-1,
-            3, -1,
-            7, -1);
-
-        #[rustfmt::skip]
-        let shuffle_hi =_mm_setr_epi8(8, -1,
-            12, -1,
-            9, -1,
-            13, -1 ,
-            10,-1,
-            14, -1,
-            11, -1,
-            15, -1);
-
-        let vld = unsafe { _mm_set1_epi32(ROUNDING_APPROX) };
-
-        for x in 0..dst_width {
-            let bounds = unsafe { approx_weights.bounds.get_unchecked(x) };
-            let mut jx = 0usize;
-            let mut store = vld;
-
-            while jx + 4 < bounds.size {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let bounds_start = bounds.start + jx;
-                    let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned());
-                    let weight23 = _mm_set1_epi32((ptr.add(2) as *const i32).read_unaligned());
-
-                    let src_ptr = unsafe_source_ptr_0.add(bounds_start * CHANNELS);
-                    let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i);
-
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01));
-                    store = _mm_add_epi32(store, _mm_madd_epi16(hi, weight23));
-                }
-                jx += 4;
-            }
-
-            while jx + 2 < bounds.size {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let bounds_start = bounds.start + jx;
-                    let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned());
-                    let src_ptr = unsafe_source_ptr_0.add(bounds_start * CHANNELS);
-                    let rgb_pixel = _mm_loadu_si64(src_ptr);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-                    store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01));
-                }
-                jx += 2;
-            }
-
-            while jx < bounds.size {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let weight0 = _mm_set1_epi32(ptr.read_unaligned() as i32);
-                    store = sse_convolve_u8::convolve_horizontal_parts_one_rgba_sse(
-                        bounds.start + jx,
-                        unsafe_source_ptr_0,
-                        weight0,
-                        store,
-                    );
-                }
-                jx += 1;
-            }
-
-            let store_16_8 = sse_convolve_u8::compress_i32(store);
-            let pixel = unsafe { _mm_extract_epi32::<0>(store_16_8) };
-
-            let px = x * CHANNELS;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) };
-            let dest_ptr_32 = dest_ptr as *mut i32;
-            unsafe {
-                dest_ptr_32.write_unaligned(pixel);
-            }
-
-            filter_offset += approx_weights.aligned_size;
-        }
-    }
-
-    pub unsafe fn convolve_horizontal_rgb_sse_rows_4(
-        src_width: usize,
-        dst_width: usize,
-        approx_weights: &FilterWeights,
-        unsafe_source_ptr_0: *const u8,
-        src_stride: usize,
-        unsafe_destination_ptr_0: *mut u8,
-        dst_stride: usize,
-    ) {
-        const CHANNES: usize = 3;
-        let mut filter_offset = 0usize;
-        let weights_ptr = approx_weights.weights.as_ptr();
-
-        #[rustfmt::skip]
-        let shuffle_lo = unsafe { _mm_setr_epi8(0, -1,
-            3, -1,
-            1, -1,
-            4, -1,
-            2, -1 ,
-            5,-1,
-            -1, -1,
-            -1, -1) };
-
-        #[rustfmt::skip]
-        let shuffle_hi = unsafe { _mm_setr_epi8(6, -1,
-            9, -1,
-            7, -1,
-            10, -1 ,
-            8,-1,
-            11, -1,
-            -1, -1,
-            -1, -1) };
-
-        let vld = unsafe { _mm_set1_epi32(ROUNDING_APPROX) };
-
-        for x in 0..dst_width {
-            let bounds = unsafe { approx_weights.bounds.get_unchecked(x) };
-            let mut jx = 0usize;
-            let mut store_0 = vld;
-            let mut store_1 = vld;
-            let mut store_2 = vld;
-            let mut store_3 = vld;
-
-            // Will make step in 4 items however since it is RGB it is necessary to make a safe offset
-            while jx + 4 < bounds.size && bounds.start + jx + 6 < src_width {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned());
-                    let weight23 = _mm_set1_epi32((ptr.add(2) as *const i32).read_unaligned());
-                    let bounds_start = bounds.start + jx;
-
-                    let src_ptr_0 = unsafe_source_ptr_0.add(bounds_start * CHANNES);
-
-                    let rgb_pixel = _mm_loadu_si128(src_ptr_0 as *const __m128i);
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(lo, weight01));
-                    store_0 = _mm_add_epi32(store_0, _mm_madd_epi16(hi, weight23));
-
-                    let src_ptr = src_ptr_0.add(src_stride);
-                    let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i);
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(lo, weight01));
-                    store_1 = _mm_add_epi32(store_1, _mm_madd_epi16(hi, weight23));
-
-                    let src_ptr = src_ptr_0.add(src_stride * 2);
-                    let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i);
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(lo, weight01));
-                    store_2 = _mm_add_epi32(store_2, _mm_madd_epi16(hi, weight23));
-
-                    let src_ptr = src_ptr_0.add(src_stride * 3);
-                    let rgb_pixel = _mm_loadu_si128(src_ptr as *const __m128i);
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(lo, weight01));
-                    store_3 = _mm_add_epi32(store_3, _mm_madd_epi16(hi, weight23));
-                }
-                jx += 4;
-            }
-
-            while jx + 2 < bounds.size && bounds.start + jx + 3 < src_width {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let bounds_start = bounds.start + jx;
-                    let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned());
-                    store_0 = sse_convolve_u8::convolve_horizontal_parts_two_sse_rgb(
-                        bounds_start,
-                        unsafe_source_ptr_0,
-                        weight01,
-                        store_0,
-                        shuffle_lo,
-                    );
-                    store_1 = sse_convolve_u8::convolve_horizontal_parts_two_sse_rgb(
-                        bounds_start,
-                        unsafe_source_ptr_0.add(src_stride),
-                        weight01,
-                        store_1,
-                        shuffle_lo,
-                    );
-                    store_2 = sse_convolve_u8::convolve_horizontal_parts_two_sse_rgb(
-                        bounds_start,
-                        unsafe_source_ptr_0.add(src_stride * 2),
-                        weight01,
-                        store_2,
-                        shuffle_lo,
-                    );
-                    store_3 = sse_convolve_u8::convolve_horizontal_parts_two_sse_rgb(
-                        bounds_start,
-                        unsafe_source_ptr_0.add(src_stride * 3),
-                        weight01,
-                        store_3,
-                        shuffle_lo,
-                    );
-                }
-                jx += 2;
-            }
-
-            while jx < bounds.size {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                let bounds_start = bounds.start + jx;
-                unsafe {
-                    let weight0 = _mm_set1_epi32(ptr.read_unaligned() as i32);
-                    store_0 = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb(
-                        bounds_start,
-                        unsafe_source_ptr_0,
-                        weight0,
-                        store_0,
-                    );
-                    store_1 = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb(
-                        bounds_start,
-                        unsafe_source_ptr_0.add(src_stride),
-                        weight0,
-                        store_1,
-                    );
-                    store_2 = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb(
-                        bounds_start,
-                        unsafe_source_ptr_0.add(src_stride * 2),
-                        weight0,
-                        store_2,
-                    );
-                    store_3 = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb(
-                        bounds_start,
-                        unsafe_source_ptr_0.add(src_stride * 3),
-                        weight0,
-                        store_3,
-                    );
-                }
-                jx += 1;
-            }
-            let store_0_8 = sse_convolve_u8::compress_i32(store_0);
-
-            let px = x * CHANNES;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) };
-
-            let element = unsafe { _mm_extract_epi32::<0>(store_0_8) };
-            let bytes = element.to_le_bytes();
-            unsafe {
-                dest_ptr.write_unaligned(bytes[0]);
-                dest_ptr.add(1).write_unaligned(bytes[1]);
-                dest_ptr.add(2).write_unaligned(bytes[2]);
-            }
-
-            let store_1_8 = sse_convolve_u8::compress_i32(store_1);
-
-            let px = x * CHANNES;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride) };
-
-            let element = unsafe { _mm_extract_epi32::<0>(store_1_8) };
-            let bytes = element.to_le_bytes();
-            unsafe {
-                dest_ptr.write_unaligned(bytes[0]);
-                dest_ptr.add(1).write_unaligned(bytes[1]);
-                dest_ptr.add(2).write_unaligned(bytes[2]);
-            }
-
-            let store_2_8 = sse_convolve_u8::compress_i32(store_2);
-
-            let px = x * CHANNES;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 2) };
-
-            let element = unsafe { _mm_extract_epi32::<0>(store_2_8) };
-            let bytes = element.to_le_bytes();
-            unsafe {
-                dest_ptr.write_unaligned(bytes[0]);
-                dest_ptr.add(1).write_unaligned(bytes[1]);
-                dest_ptr.add(2).write_unaligned(bytes[2]);
-            }
-
-            let store_3_8 = sse_convolve_u8::compress_i32(store_3);
-
-            let px = x * CHANNES;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px + dst_stride * 3) };
-
-            let element = unsafe { _mm_extract_epi32::<0>(store_3_8) };
-            let bytes = element.to_le_bytes();
-            unsafe {
-                dest_ptr.write_unaligned(bytes[0]);
-                dest_ptr.add(1).write_unaligned(bytes[1]);
-                dest_ptr.add(2).write_unaligned(bytes[2]);
-            }
-
-            filter_offset += approx_weights.aligned_size;
-        }
-    }
-
-    pub unsafe fn convolve_horizontal_rgb_sse_row_one(
-        src_width: usize,
-        dst_width: usize,
-        approx_weights: &FilterWeights,
-        unsafe_source_ptr_0: *const u8,
-        unsafe_destination_ptr_0: *mut u8,
-    ) {
-        const CHANNELS: usize = 3;
-        let mut filter_offset = 0usize;
-        let weights_ptr = approx_weights.weights.as_ptr();
-
-        #[rustfmt::skip]
-        let shuffle_lo = unsafe { _mm_setr_epi8(0, -1,
-            3, -1,
-            1, -1,
-            4, -1,
-            2, -1 ,
-            5,-1,
-            -1, -1,
-            -1, -1) };
-
-        #[rustfmt::skip]
-        let shuffle_hi = unsafe { _mm_setr_epi8(6, -1,
-            9, -1,
-            7, -1,
-            10, -1 ,
-            8,-1,
-            11, -1,
-            -1, -1,
-            -1, -1) };
-
-        for x in 0..dst_width {
-            let bounds = unsafe { approx_weights.bounds.get_unchecked(x) };
-            let mut jx = 0usize;
-            let mut store = unsafe { _mm_setzero_si128() };
-
-            while jx + 4 < bounds.size && x + 6 < src_width {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let weight01 = _mm_set1_epi32((ptr as *const i32).read_unaligned());
-                    let weight23 = _mm_set1_epi32((ptr.add(2) as *const i32).read_unaligned());
-                    let bounds_start = bounds.start + jx;
-                    let src_ptr_0 = unsafe_source_ptr_0.add(bounds_start * CHANNELS);
-
-                    let rgb_pixel = _mm_loadu_si128(src_ptr_0 as *const __m128i);
-                    let hi = _mm_shuffle_epi8(rgb_pixel, shuffle_hi);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-
-                    store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight01));
-                    store = _mm_add_epi32(store, _mm_madd_epi16(hi, weight23));
-                }
-                jx += 4;
-            }
-
-            while jx + 2 < bounds.size && x + 3 < src_width {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let weight0 = _mm_set1_epi32((ptr as *const i32).read_unaligned());
-                    let src_ptr = unsafe_source_ptr_0.add((bounds.start + jx) * 3);
-                    let rgb_pixel = _mm_loadu_si64(src_ptr);
-                    let lo = _mm_shuffle_epi8(rgb_pixel, shuffle_lo);
-                    store = _mm_add_epi32(store, _mm_madd_epi16(lo, weight0));
-                }
-                jx += 2;
-            }
-
-            while jx < bounds.size {
-                let ptr = unsafe { weights_ptr.add(jx + filter_offset) };
-                unsafe {
-                    let weight0 = _mm_set1_epi32(ptr.read_unaligned() as i32);
-                    store = sse_convolve_u8::convolve_horizontal_parts_one_sse_rgb(
-                        bounds.start + jx,
-                        unsafe_source_ptr_0,
-                        weight0,
-                        store,
-                    );
-                }
-                jx += 1;
-            }
-
-            let store_16_8 = sse_convolve_u8::compress_i32(store);
-
-            let px = x * CHANNELS;
-            let dest_ptr = unsafe { unsafe_destination_ptr_0.add(px) };
-
-            let element = unsafe { _mm_extract_epi32::<0>(store_16_8) };
-            let bytes = element.to_le_bytes();
-            unsafe {
-                dest_ptr.write_unaligned(bytes[0]);
-                dest_ptr.add(1).write_unaligned(bytes[1]);
-                dest_ptr.add(2).write_unaligned(bytes[2]);
-            }
-            unsafe {
-                dest_ptr.write_unaligned(bytes[0]);
-                dest_ptr.add(1).write_unaligned(bytes[1]);
-                dest_ptr.add(2).write_unaligned(bytes[2]);
-            }
-
-            filter_offset += approx_weights.aligned_size;
-        }
-    }
-
-    #[inline]
-    pub(crate) fn convolve_vertical_rgb_sse_row(
-        total_width: usize,
-        bounds: &FilterBounds,
-        unsafe_source_ptr_0: *const u8,
-        unsafe_destination_ptr_0: *mut u8,
-        src_stride: usize,
-        weight_ptr: *const i16,
-    ) {
-        let mut cx = 0usize;
-
-        while cx + 32 < total_width {
-            unsafe {
-                sse_convolve_u8::convolve_vertical_part_sse_32(
-                    bounds.start,
-                    cx,
-                    unsafe_source_ptr_0,
-                    src_stride,
-                    unsafe_destination_ptr_0,
-                    weight_ptr,
-                    bounds,
-                );
-            }
-
-            cx += 32;
-        }
-
-        while cx + 16 < total_width {
-            unsafe {
-                sse_convolve_u8::convolve_vertical_part_sse_16(
-                    bounds.start,
-                    cx,
-                    unsafe_source_ptr_0,
-                    src_stride,
-                    unsafe_destination_ptr_0,
-                    weight_ptr,
-                    bounds,
-                );
-            }
-
-            cx += 16;
-        }
-
-        while cx + 8 < total_width {
-            unsafe {
-                sse_convolve_u8::convolve_vertical_part_sse_8::(
-                    bounds.start,
-                    cx,
-                    unsafe_source_ptr_0,
-                    src_stride,
-                    unsafe_destination_ptr_0,
-                    weight_ptr,
-                    bounds,
-                    8,
-                );
-            }
-
-            cx += 8;
-        }
-
-        let left = total_width - cx;
-        if left > 0 {
-            unsafe {
-                sse_convolve_u8::convolve_vertical_part_sse_8::(
-                    bounds.start,
-                    cx,
-                    unsafe_source_ptr_0,
-                    src_stride,
-                    unsafe_destination_ptr_0,
-                    weight_ptr,
-                    bounds,
-                    left,
-                );
-            }
-        }
-    }
-}