diff --git a/Cargo.lock b/Cargo.lock index 8576e9b..b33039f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -768,7 +768,7 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pic-scale" -version = "0.1.24" +version = "0.1.25" dependencies = [ "colorutils-rs", "half", diff --git a/Cargo.toml b/Cargo.toml index b277761..f83778c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ workspace = { members = ["app"] } [package] name = "pic-scale" -version = "0.1.24" +version = "0.1.25" edition = "2021" description = "High performance image scaling" readme = "README.md" diff --git a/app/src/main.rs b/app/src/main.rs index b9ba759..ff02fd1 100644 --- a/app/src/main.rs +++ b/app/src/main.rs @@ -1,3 +1,6 @@ +mod merge; +mod split; + use std::time::Instant; use fast_image_resize::images::Image; @@ -9,6 +12,8 @@ use half::f16; use image::io::Reader as ImageReader; use image::{EncodableLayout, GenericImageView}; +use crate::merge::merge_channels_3; +use crate::split::split_channels_3; use pic_scale::{ ImageSize, ImageStore, JzazbzScaler, OklabScaler, ResamplingFunction, Scaler, Scaling, ThreadingPolicy, TransferFunction, @@ -39,26 +44,61 @@ fn main() { let start_time = Instant::now(); - let store = ImageStore::::from_slice( - &mut f16_bytes, + let mut rec1 = vec![0f32; dimensions.0 as usize * dimensions.1 as usize]; + let mut rec2 = vec![0f32; dimensions.0 as usize * dimensions.1 as usize]; + let mut rec3 = vec![0f32; dimensions.0 as usize * dimensions.1 as usize]; + + split_channels_3( + &f16_bytes, dimensions.0 as usize, dimensions.1 as usize, + &mut rec1, + &mut rec2, + &mut rec3, + ); + + let store = + ImageStore::::from_slice(&mut rec1, dimensions.0 as usize, dimensions.1 as usize); + + let resized = scaler.resize_plane_f32( + ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), + store, ); + rec1 = Vec::from(resized.as_bytes()); - let resized = scaler.resize_rgb_f32( + let store = + 
ImageStore::::from_slice(&mut rec2, dimensions.0 as usize, dimensions.1 as usize); + + let resized = scaler.resize_plane_f32( ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), store, ); + rec2 = Vec::from(resized.as_bytes()); + + let store = + ImageStore::::from_slice(&mut rec3, dimensions.0 as usize, dimensions.1 as usize); + + let resized = scaler.resize_plane_f32( + ImageSize::new(dimensions.0 as usize / 2, dimensions.1 as usize / 2), + store, + ); + rec3 = Vec::from(resized.as_bytes()); + + let mut resized_data: Vec = vec![0f32; resized.width * resized.height * 3]; + merge_channels_3( + &mut resized_data, + resized.width, + resized.height, + &rec1, + &rec2, + &rec3, + ); let elapsed_time = start_time.elapsed(); // Print the elapsed time in milliseconds println!("Scaler: {:.2?}", elapsed_time); - let dst: Vec = resized - .as_bytes() - .iter() - .map(|&x| (x * 255f32) as u8) - .collect(); + let dst: Vec = resized_data.iter().map(|&x| (x * 255f32) as u8).collect(); // let dst = resized.as_bytes(); if resized.channels == 4 { diff --git a/app/src/merge.rs b/app/src/merge.rs new file mode 100644 index 0000000..a4ef42d --- /dev/null +++ b/app/src/merge.rs @@ -0,0 +1,25 @@ +pub(crate) fn merge_channels_3( + image: &mut [T], + width: usize, + height: usize, + first: &[T], + second: &[T], + third: &[T], +) { + let mut shift = 0usize; + let mut shift_plane = 0usize; + for _ in 0..height { + let shifted_image = &mut image[shift..]; + let shifted_first_plane = &first[shift_plane..]; + let shifted_second_plane = &second[shift_plane..]; + let shifted_third_plane = &third[shift_plane..]; + for x in 0..width { + let px = x * 3; + shifted_image[px] = shifted_first_plane[x]; + shifted_image[px + 1] = shifted_second_plane[x]; + shifted_image[px + 2] = shifted_third_plane[x]; + } + shift += width * 3; + shift_plane += width; + } +} diff --git a/app/src/split.rs b/app/src/split.rs new file mode 100644 index 0000000..2bcbab3 --- /dev/null +++ b/app/src/split.rs @@ 
-0,0 +1,25 @@ +pub(crate) fn split_channels_3( + image: &[T], + width: usize, + height: usize, + first: &mut [T], + second: &mut [T], + third: &mut [T], +) { + let mut shift = 0usize; + let mut shift_plane = 0usize; + for _ in 0..height { + let shifted_image = &image[shift..]; + let shifted_first_plane = &mut first[shift_plane..]; + let shifted_second_plane = &mut second[shift_plane..]; + let shifted_third_plane = &mut third[shift_plane..]; + for x in 0..width { + let px = x * 3; + shifted_first_plane[x] = shifted_image[px]; + shifted_second_plane[x] = shifted_image[px + 1]; + shifted_third_plane[x] = shifted_image[px + 2]; + } + shift += width * 3; + shift_plane += width; + } +} diff --git a/src/neon/mod.rs b/src/neon/mod.rs index fc07377..6e3e1fd 100644 --- a/src/neon/mod.rs +++ b/src/neon/mod.rs @@ -33,6 +33,7 @@ mod convolve_f16; mod convolve_f32; #[cfg(all(feature = "half"))] mod f16_utils; +mod plane_f32; #[cfg(all(feature = "half"))] mod rgb_f16; mod rgb_f32; @@ -51,6 +52,8 @@ pub use alpha::neon_premultiply_alpha_rgba; pub use alpha::neon_unpremultiply_alpha_rgba; #[cfg(all(feature = "half"))] pub use f16_utils::*; +pub use plane_f32::convolve_horizontal_plane_neon_row_one; +pub use plane_f32::convolve_horizontal_plane_neon_rows_4; #[cfg(all(feature = "half"))] pub use rgb_f16::{ convolve_horizontal_rgb_neon_row_one_f16, convolve_horizontal_rgb_neon_rows_4_f16, diff --git a/src/neon/plane_f32.rs b/src/neon/plane_f32.rs new file mode 100644 index 0000000..ad5001c --- /dev/null +++ b/src/neon/plane_f32.rs @@ -0,0 +1,320 @@ +/* + * Copyright (c) Radzivon Bartoshyk. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +use crate::filter_weights::FilterWeights; +use crate::neon::utils::prefer_vfmaq_f32; +use std::arch::aarch64::*; + +macro_rules! conv_horiz_plane_16_f32 { + ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ + let src_ptr = $src.add($start_x); + + let rgb_pixel = vld1q_f32_x4(src_ptr); + + let mut acc = prefer_vfmaq_f32($store, rgb_pixel.0, $set.0); + acc = prefer_vfmaq_f32(acc, rgb_pixel.1, $set.1); + acc = prefer_vfmaq_f32(acc, rgb_pixel.2, $set.2); + acc = prefer_vfmaq_f32(acc, rgb_pixel.3, $set.3); + acc + }}; +} + +macro_rules! 
conv_horiz_plane_8_f32 { + ($start_x: expr, $src: expr, $set1: expr, $set2: expr, $store: expr) => {{ + let src_ptr = $src.add($start_x); + + let rgb_pixel = vld1q_f32_x2(src_ptr); + + let mut acc = prefer_vfmaq_f32($store, rgb_pixel.0, $set1); + acc = prefer_vfmaq_f32(acc, rgb_pixel.1, $set2); + acc + }}; +} + +macro_rules! conv_horiz_plane_4_f32 { + ($start_x: expr, $src: expr, $set1: expr, $store: expr) => {{ + let src_ptr = $src.add($start_x); + + let rgb_pixel = vld1q_f32(src_ptr); + + prefer_vfmaq_f32($store, rgb_pixel, $set1) + }}; +} + +macro_rules! conv_horiz_plane_2_f32 { + ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ + let src_ptr = $src.add($start_x); + + let rgb_pixel_0 = vld1_f32(src_ptr); + let rgb_pixel = vcombine_f32(rgb_pixel_0, vdup_n_f32(0.)); + + prefer_vfmaq_f32($store, rgb_pixel, $set) + }}; +} + +macro_rules! conv_horiz_plane_1_f32 { + ($start_x: expr, $src: expr, $set: expr, $store: expr) => {{ + let src_ptr = $src.add($start_x); + let mut rgb_pixel = vdupq_n_f32(0.); + rgb_pixel = vsetq_lane_f32::<0>(src_ptr.read_unaligned(), rgb_pixel); + prefer_vfmaq_f32($store, rgb_pixel, $set) + }}; +} + +macro_rules! 
vfullq_sum_f32 {
+    ($reg: expr) => {{
+        let acc = vadd_f32(vget_low_f32($reg), vget_high_f32($reg));
+        vpadds_f32(acc)
+    }};
+}
+
+/// Convolves one horizontal row of a single-channel (plane) f32 image with NEON.
+/// `dst_width` is the number of output pixels; weights/bounds come from
+/// `filter_weights`. Reads `bounds.size` taps per output pixel and writes one
+/// horizontally-reduced f32 per destination pixel.
+pub fn convolve_horizontal_plane_neon_row_one(
+    dst_width: usize,
+    _: usize,
+    filter_weights: &FilterWeights<f32>,
+    unsafe_source_ptr_0: *const f32,
+    unsafe_destination_ptr_0: *mut f32,
+) {
+    unsafe {
+        let mut filter_offset = 0usize;
+        let weights_ptr = filter_weights.weights.as_ptr();
+
+        for x in 0..dst_width {
+            let bounds = filter_weights.bounds.get_unchecked(x);
+            let mut jx = 0usize;
+            let mut store = vdupq_n_f32(0f32);
+
+            while jx + 16 < bounds.size {
+                let bounds_start = bounds.start + jx;
+                let ptr = weights_ptr.add(jx + filter_offset);
+                let read_weights = vld1q_f32_x4(ptr);
+                store = conv_horiz_plane_16_f32!(
+                    bounds_start,
+                    unsafe_source_ptr_0,
+                    read_weights,
+                    store
+                );
+                jx += 16; // fixed: was `jx += 8`, which re-accumulated taps 8..16 with stale weights
+            }
+
+            while jx + 8 < bounds.size {
+                let bounds_start = bounds.start + jx;
+                let ptr = weights_ptr.add(jx + filter_offset);
+                let read_weights = vld1q_f32_x2(ptr);
+                store = conv_horiz_plane_8_f32!(
+                    bounds_start,
+                    unsafe_source_ptr_0,
+                    read_weights.0,
+                    read_weights.1,
+                    store
+                );
+                jx += 8;
+            }
+
+            while jx + 4 < bounds.size {
+                let bounds_start = bounds.start + jx;
+                let ptr = weights_ptr.add(jx + filter_offset);
+                let read_weights = vld1q_f32(ptr);
+                store =
+                    conv_horiz_plane_4_f32!(bounds_start, unsafe_source_ptr_0, read_weights, store);
+                jx += 4;
+            }
+
+            while jx + 2 < bounds.size {
+                let bounds_start = bounds.start + jx;
+                let ptr = weights_ptr.add(jx + filter_offset);
+                let weights0 = vld1_f32(ptr);
+                let weights = vcombine_f32(weights0, vdup_n_f32(0.));
+                store = conv_horiz_plane_2_f32!(bounds_start, unsafe_source_ptr_0, weights, store);
+                jx += 2;
+            }
+
+            while jx < bounds.size {
+                let bounds_start = bounds.start + jx;
+                let ptr = weights_ptr.add(jx + filter_offset);
+                let weight0 = vdupq_n_f32(ptr.read_unaligned());
+                store = conv_horiz_plane_1_f32!(bounds_start, unsafe_source_ptr_0, weight0, store);
+                jx += 1;
+            }
+
+            
let px = x; + let dest_ptr = unsafe_destination_ptr_0.add(px); + dest_ptr.write_unaligned(vfullq_sum_f32!(store)); + + filter_offset += filter_weights.aligned_size; + } + } +} + +pub fn convolve_horizontal_plane_neon_rows_4( + dst_width: usize, + _: usize, + filter_weights: &FilterWeights, + unsafe_source_ptr_0: *const f32, + src_stride: usize, + unsafe_destination_ptr_0: *mut f32, + dst_stride: usize, +) { + unsafe { + let mut filter_offset = 0usize; + let zeros = vdupq_n_f32(0f32); + let weights_ptr = filter_weights.weights.as_ptr(); + + for x in 0..dst_width { + let bounds = filter_weights.bounds.get_unchecked(x); + let mut jx = 0usize; + let mut store_0 = zeros; + let mut store_1 = zeros; + let mut store_2 = zeros; + let mut store_3 = zeros; + + while jx + 16 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = vld1q_f32_x4(ptr); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_16_f32!( + bounds_start, + unsafe_source_ptr_0, + read_weights, + store_0 + ); + let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_1 = conv_horiz_plane_16_f32!(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + store_2 = conv_horiz_plane_16_f32!(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + store_3 = conv_horiz_plane_16_f32!(bounds_start, s_ptr3, read_weights, store_3); + jx += 16; + } + + while jx + 8 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = vld1q_f32_x2(ptr); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_8_f32!( + bounds_start, + unsafe_source_ptr_0, + read_weights.0, + read_weights.1, + store_0 + ); + let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_1 = conv_horiz_plane_8_f32!( + bounds_start, + s_ptr_1, + read_weights.0, + read_weights.1, + store_1 + ); + let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + store_2 = 
conv_horiz_plane_8_f32!( + bounds_start, + s_ptr2, + read_weights.0, + read_weights.1, + store_2 + ); + let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + store_3 = conv_horiz_plane_8_f32!( + bounds_start, + s_ptr3, + read_weights.0, + read_weights.1, + store_3 + ); + jx += 8; + } + + while jx + 4 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let read_weights = vld1q_f32(ptr); + let bounds_start = bounds.start + jx; + store_0 = conv_horiz_plane_4_f32!( + bounds_start, + unsafe_source_ptr_0, + read_weights, + store_0 + ); + let s_ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_1 = conv_horiz_plane_4_f32!(bounds_start, s_ptr_1, read_weights, store_1); + let s_ptr2 = unsafe_source_ptr_0.add(src_stride * 2); + store_2 = conv_horiz_plane_4_f32!(bounds_start, s_ptr2, read_weights, store_2); + let s_ptr3 = unsafe_source_ptr_0.add(src_stride * 3); + store_3 = conv_horiz_plane_4_f32!(bounds_start, s_ptr3, read_weights, store_3); + jx += 4; + } + + while jx + 2 < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weights0 = vld1_f32(ptr); + let weights = vcombine_f32(weights0, vdup_n_f32(0.)); + let bounds_start = bounds.start + jx; + store_0 = + conv_horiz_plane_2_f32!(bounds_start, unsafe_source_ptr_0, weights, store_0); + let ptr_1 = unsafe_source_ptr_0.add(src_stride); + store_1 = conv_horiz_plane_2_f32!(bounds_start, ptr_1, weights, store_1); + let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + store_2 = conv_horiz_plane_2_f32!(bounds_start, ptr_2, weights, store_2); + let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + store_3 = conv_horiz_plane_2_f32!(bounds_start, ptr_3, weights, store_3); + jx += 2; + } + + while jx < bounds.size { + let ptr = weights_ptr.add(jx + filter_offset); + let weight0 = vdupq_n_f32(ptr.read_unaligned()); + let bounds_start = bounds.start + jx; + store_0 = + conv_horiz_plane_1_f32!(bounds_start, unsafe_source_ptr_0, weight0, store_0); + let ptr_1 = unsafe_source_ptr_0.add(src_stride); + 
store_1 = conv_horiz_plane_1_f32!(bounds_start, ptr_1, weight0, store_1); + let ptr_2 = unsafe_source_ptr_0.add(src_stride * 2); + store_2 = conv_horiz_plane_1_f32!(bounds_start, ptr_2, weight0, store_2); + let ptr_3 = unsafe_source_ptr_0.add(src_stride * 3); + store_3 = conv_horiz_plane_1_f32!(bounds_start, ptr_3, weight0, store_3); + jx += 1; + } + + let px = x; + let dest_ptr = unsafe_destination_ptr_0.add(px); + dest_ptr.write_unaligned(vfullq_sum_f32!(store_0)); + + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride); + dest_ptr.write_unaligned(vfullq_sum_f32!(store_1)); + + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 2); + dest_ptr.write_unaligned(vfullq_sum_f32!(store_2)); + + let dest_ptr = unsafe_destination_ptr_0.add(px + dst_stride * 3); + dest_ptr.write_unaligned(vfullq_sum_f32!(store_3)); + + filter_offset += filter_weights.aligned_size; + } + } +} diff --git a/src/plane_f32.rs b/src/plane_f32.rs index 412e1c2..77a8359 100644 --- a/src/plane_f32.rs +++ b/src/plane_f32.rs @@ -33,7 +33,17 @@ use crate::convolve_naive_f32::{ }; use crate::dispatch_group_f32::{convolve_horizontal_dispatch_f32, convolve_vertical_dispatch_f32}; use crate::filter_weights::{FilterBounds, FilterWeights}; +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +use crate::neon::{ + convolve_horizontal_plane_neon_row_one, convolve_horizontal_plane_neon_rows_4, + convolve_vertical_rgb_neon_row_f32, +}; use crate::rgb_f32::convolve_vertical_rgb_native_row_f32; +#[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" +))] +use crate::sse::convolve_vertical_rgb_sse_row_f32; use crate::ImageStore; use rayon::ThreadPool; @@ -45,11 +55,16 @@ impl<'a> HorizontalConvolutionPass for ImageStore<'a, f32, 1> { destination: &mut ImageStore, pool: &Option, ) { - let _dispatcher_4_rows: Option< + let mut _dispatcher_4_rows: Option< fn(usize, usize, &FilterWeights, *const f32, usize, *mut f32, usize), > = 
Some(convolve_horizontal_rgba_4_row_f32::); - let _dispatcher_row: fn(usize, usize, &FilterWeights, *const f32, *mut f32) = + let mut _dispatcher_row: fn(usize, usize, &FilterWeights, *const f32, *mut f32) = convolve_horizontal_rgb_native_row::; + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + { + _dispatcher_4_rows = Some(convolve_horizontal_plane_neon_rows_4); + _dispatcher_row = convolve_horizontal_plane_neon_row_one; + } convolve_horizontal_dispatch_f32( self, filter_weights, @@ -68,8 +83,21 @@ impl<'a> VerticalConvolutionPass for ImageStore<'a, f32, 1> { destination: &mut ImageStore, pool: &Option, ) { - let _dispatcher: fn(usize, &FilterBounds, *const f32, *mut f32, usize, *const f32) = + let mut _dispatcher: fn(usize, &FilterBounds, *const f32, *mut f32, usize, *const f32) = convolve_vertical_rgb_native_row_f32::; + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] + { + _dispatcher = convolve_vertical_rgb_neon_row_f32::<1>; + } + #[cfg(all( + any(target_arch = "x86_64", target_arch = "x86"), + target_feature = "sse4.1" + ))] + { + if is_x86_feature_detected!("sse4.1") { + _dispatcher = convolve_vertical_rgb_sse_row_f32::<1>; + } + } convolve_vertical_dispatch_f32(self, filter_weights, destination, pool, _dispatcher); } }